diff --git a/Makefile b/Makefile index 9c0c83f30..5b2376eb1 100644 --- a/Makefile +++ b/Makefile @@ -139,7 +139,7 @@ endif ifdef LLAMA_CUBLAS CUBLAS_FLAGS = -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include CUBLASLD_FLAGS = -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -Lconda/envs/linux/lib -Lconda/envs/linux/lib/stubs -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib - CUBLAS_OBJS = ggml-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o + CUBLAS_OBJS = ggml-cuda.o ggml_v3-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o NVCC = nvcc NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math @@ -193,6 +193,8 @@ ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h $(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@ ggml_v2-cuda-legacy.o: otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h $(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@ +ggml_v3-cuda.o: otherarch/ggml_v3-cuda.cu otherarch/ggml_v3-cuda.h + $(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@ endif # LLAMA_CUBLAS ifdef LLAMA_HIPBLAS @@ -205,7 +207,7 @@ ifdef LLAMA_HIPBLAS LLAMA_CUDA_KQUANTS_ITER ?= 2 HIPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) HIPLDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 -lrocblas - HIP_OBJS += ggml-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o + HIP_OBJS += ggml-cuda.o ggml_v3-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o ggml-cuda.o: HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \ -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) \ -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) \ @@ -218,12 +220,18 @@ ggml_v2-cuda-legacy.o: HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \ -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) \ -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) \ -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) +ggml_v3-cuda.o: HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \ + -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) \ + -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) \ + -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) ggml-cuda.o: ggml-cuda.cu ggml-cuda.h $(HCXX) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h $(HCXX) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< ggml_v2-cuda-legacy.o: otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h $(HCXX) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< +ggml_v3-cuda.o: otherarch/ggml_v3-cuda.cu otherarch/ggml_v3-cuda.h + $(HCXX) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< endif # LLAMA_HIPBLAS @@ -371,6 +379,22 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h $(CC) $(CFLAGS) -c $< -o $@ +#version 3 libs +ggml_v3.o: otherarch/ggml_v3.c otherarch/ggml_v3.h + $(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@ +ggml_v3_openblas.o: otherarch/ggml_v3.c otherarch/ggml_v3.h + $(CC) $(CFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@ +ggml_v3_failsafe.o: otherarch/ggml_v3.c otherarch/ggml_v3.h + $(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@ +ggml_v3_noavx2.o: otherarch/ggml_v3.c otherarch/ggml_v3.h + $(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@ +ggml_v3_clblast.o: otherarch/ggml_v3.c otherarch/ggml_v3.h + $(CC) $(CFLAGS) 
$(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ +ggml_v3_cublas.o: otherarch/ggml_v3.c otherarch/ggml_v3.h + $(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@ +ggml_v3_clblast_noavx2.o: otherarch/ggml_v3.c otherarch/ggml_v3.h + $(CC) $(CFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ + #version 2 libs ggml_v2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h $(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@ @@ -400,6 +424,8 @@ ggml_v2-opencl.o: otherarch/ggml_v2-opencl.cpp otherarch/ggml_v2-opencl.h $(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ ggml_v2-opencl-legacy.o: otherarch/ggml_v2-opencl-legacy.c otherarch/ggml_v2-opencl-legacy.h $(CC) $(CFLAGS) -c $< -o $@ +ggml_v3-opencl.o: otherarch/ggml_v3-opencl.cpp otherarch/ggml_v3-opencl.h + $(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ # intermediate objects llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h otherarch/llama-util.h @@ -440,11 +466,11 @@ gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS) #generated libraries -koboldcpp_default: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o ggml-quants.o ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS) +koboldcpp_default: ggml.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o ggml-quants.o ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS) $(DEFAULT_BUILD) ifdef OPENBLAS_BUILD -koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o ggml-quants.o ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS) +koboldcpp_openblas: ggml_openblas.o ggml_v3_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o ggml-quants.o ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS) $(OPENBLAS_BUILD) else koboldcpp_openblas: @@ -452,7 +478,7 @@ koboldcpp_openblas: endif ifdef FAILSAFE_BUILD -koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o ggml-quants_failsafe.o ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS) +koboldcpp_failsafe: ggml_failsafe.o ggml_v3_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o ggml-quants_failsafe.o ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS) $(FAILSAFE_BUILD) else koboldcpp_failsafe: @@ -460,7 +486,7 @@ koboldcpp_failsafe: endif ifdef NOAVX2_BUILD -koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o ggml-quants_noavx2.o ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS) +koboldcpp_noavx2: ggml_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o ggml-quants_noavx2.o ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS) $(NOAVX2_BUILD) else koboldcpp_noavx2: @@ -468,10 +494,10 @@ koboldcpp_noavx2: endif ifdef CLBLAST_BUILD -koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o ggml-quants.o ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS) +koboldcpp_clblast: ggml_clblast.o ggml_v3_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o ggml-quants.o ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS) $(CLBLAST_BUILD) ifdef NOAVX2_BUILD -koboldcpp_clblast_noavx2: ggml_clblast_noavx2.o ggml_v2_clblast_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_clblast_noavx2.o 
ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o ggml-quants_noavx2.o ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS) +koboldcpp_clblast_noavx2: ggml_clblast_noavx2.o ggml_v3_clblast_noavx2.o ggml_v2_clblast_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_clblast_noavx2.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o ggml-quants_noavx2.o ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS) $(CLBLAST_BUILD) else koboldcpp_clblast_noavx2: @@ -485,7 +511,7 @@ koboldcpp_clblast_noavx2: endif ifdef CUBLAS_BUILD -koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o ggml-quants.o ggml-alloc.o ggml-backend.o grammar-parser.o $(CUBLAS_OBJS) $(OBJS) +koboldcpp_cublas: ggml_cublas.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o ggml-quants.o ggml-alloc.o ggml-backend.o grammar-parser.o $(CUBLAS_OBJS) $(OBJS) $(CUBLAS_BUILD) else koboldcpp_cublas: @@ -493,7 +519,7 @@ koboldcpp_cublas: endif ifdef HIPBLAS_BUILD -koboldcpp_hipblas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o ggml-quants.o ggml-alloc.o ggml-backend.o grammar-parser.o $(HIP_OBJS) $(OBJS) +koboldcpp_hipblas: ggml_cublas.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o ggml-quants.o ggml-alloc.o ggml-backend.o grammar-parser.o $(HIP_OBJS) $(OBJS) $(HIPBLAS_BUILD) else koboldcpp_hipblas: diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index b6660c240..e9809eade 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -774,17 +774,29 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in } //this is used for the mem_per_token eval, openblas needs more RAM - bool use_scratch = ggml_cpu_has_gpublas(); + bool v3_use_scratch = ggml_v3_cpu_has_gpublas(); int cu_parseinfo_maindevice = inputs.cublas_info<=0?0:inputs.cublas_info; printf("System Info: %s\n", llama_print_system_info()); #if defined(GGML_USE_CUBLAS) - if(ggml_cpu_has_gpublas() && cu_parseinfo_maindevice>0) + if(file_format==FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON) { - printf("CUBLAS: Set main device to %d\n",cu_parseinfo_maindevice); - ggml_cuda_set_main_device(cu_parseinfo_maindevice); + if(ggml_cpu_has_gpublas() && cu_parseinfo_maindevice>0) + { + printf("CUBLAS: Set main device to %d\n",cu_parseinfo_maindevice); + ggml_cuda_set_main_device(cu_parseinfo_maindevice); + } } + else + { + if(ggml_v3_cpu_has_gpublas() && cu_parseinfo_maindevice>0) + { + printf("CUBLAS v3: Set main device to %d\n",cu_parseinfo_maindevice); + ggml_v3_cuda_set_main_device(cu_parseinfo_maindevice); + } + } + #endif SetQuantsUnshuffled(false); if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2) @@ -1187,7 +1199,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in n_vocab = gpt2_ctx_v3.hparams.n_vocab; // determine the required inference memory per token: - gpt2_eval(gpt2_ctx_v3, kcpp_params->n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, use_scratch); + gpt2_eval(gpt2_ctx_v3, kcpp_params->n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, v3_use_scratch); return ModelLoadResult::SUCCESS; } else @@ -1262,19 +1274,19 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in n_vocab = gptj_ctx_v3.hparams.n_vocab; // determine the required inference memory per token: - 
gptj_eval(gptj_ctx_v3, kcpp_params->n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, use_scratch); + gptj_eval(gptj_ctx_v3, kcpp_params->n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, v3_use_scratch); //if the logits are NAN or duplicated, it means the model is incompatible std::vector oldlogits(logits); //this is another hack because they change the library - we run the eval through the model //twice and compare logits. if they give the same logits for different inputs, model is broken - gptj_eval(gptj_ctx_v3, kcpp_params->n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token, use_scratch); + gptj_eval(gptj_ctx_v3, kcpp_params->n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token, v3_use_scratch); if(logits.size()>0 && (IsNanCheck(logits[0]) || LogitsDuplicated(oldlogits,logits))) { printf("\nBad Logits detected! Retrying GPT-J model loading..."); - ggml_free(gptj_ctx_v3.ctx); + ggml_v3_free(gptj_ctx_v3.ctx); return ModelLoadResult::RETRY_LOAD; } @@ -1338,7 +1350,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in n_vocab = neox_ctx_v3.hparams.n_vocab; // determine the required inference memory per token: - gpt_neox_eval(neox_ctx_v3, kcpp_params->n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, use_scratch); + gpt_neox_eval(neox_ctx_v3, kcpp_params->n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, v3_use_scratch); return ModelLoadResult::SUCCESS; } @@ -1399,7 +1411,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in n_vocab = mpt_ctx_v3.hparams.n_vocab; // determine the required inference memory per token: - mpt_eval(mpt_ctx_v3, kcpp_params->n_threads, 0, { 0, 1, 2, 3 }, logits, false, mem_per_token, use_scratch); + mpt_eval(mpt_ctx_v3, kcpp_params->n_threads, 0, { 0, 1, 2, 3 }, logits, false, mem_per_token, v3_use_scratch); return ModelLoadResult::SUCCESS; } else @@ -1709,7 +1721,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o } bool startedsampling = false; - bool use_scratch = true; //for normal inference always use scratch + bool v3_use_scratch = true; //for normal inference always use scratch timer_start(); double time1 = 0, time2 = 0; @@ -1849,7 +1861,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o } else if(file_format==FileFormat::GPT2_4) { - evalres = gpt2_eval(gpt2_ctx_v3, kcpp_params->n_threads, n_past, embd, logits, mem_per_token, use_scratch); + evalres = gpt2_eval(gpt2_ctx_v3, kcpp_params->n_threads, n_past, embd, logits, mem_per_token, v3_use_scratch); } else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5) { @@ -1857,7 +1869,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o } else if(file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7) { - evalres = gpt_neox_eval(neox_ctx_v3, kcpp_params->n_threads, n_past, embd, logits, mem_per_token, use_scratch); + evalres = gpt_neox_eval(neox_ctx_v3, kcpp_params->n_threads, n_past, embd, logits, mem_per_token, v3_use_scratch); } else if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2) { @@ -1869,11 +1881,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o } else if(file_format==FileFormat::GPTJ_5) { - evalres = gptj_eval(gptj_ctx_v3, kcpp_params->n_threads, n_past, embd, logits, mem_per_token, use_scratch); + evalres = gptj_eval(gptj_ctx_v3, 
kcpp_params->n_threads, n_past, embd, logits, mem_per_token, v3_use_scratch); } else if(file_format==FileFormat::MPT_1) { - evalres = mpt_eval(mpt_ctx_v3, kcpp_params->n_threads, n_past, embd, logits, false, mem_per_token, use_scratch); + evalres = mpt_eval(mpt_ctx_v3, kcpp_params->n_threads, n_past, embd, logits, false, mem_per_token, v3_use_scratch); } else { diff --git a/otherarch/ggml_v3-cuda.cu b/otherarch/ggml_v3-cuda.cu new file mode 100644 index 000000000..0447499f3 --- /dev/null +++ b/otherarch/ggml_v3-cuda.cu @@ -0,0 +1,10325 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#if defined(GGML_USE_HIPBLAS) +#include +#include +#include +#ifdef __HIP_PLATFORM_AMD__ +// for rocblas_initialize() +#include "rocblas/rocblas.h" +#endif // __HIP_PLATFORM_AMD__ +#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_TF32_TENSOR_OP_MATH 0 +#define CUDA_R_16F HIPBLAS_R_16F +#define CUDA_R_32F HIPBLAS_R_32F +#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) +#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6 +#define cublasCreate hipblasCreate +#define cublasGemmEx hipblasGemmEx +#define cublasGemmBatchedEx hipblasGemmBatchedEx +#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx +#define cublasHandle_t hipblasHandle_t +#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS +#define cublasSetStream hipblasSetStream +#define cublasSgemm hipblasSgemm +#define cublasStatus_t hipblasStatus_t +#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6 +#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer +#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess +#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess +#define cudaDeviceProp hipDeviceProp_t +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaError_t hipError_t +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventRecord hipEventRecord +#define cudaEvent_t hipEvent_t +#define cudaEventDestroy hipEventDestroy +#define cudaFree hipFree +#define cudaFreeHost hipHostFree +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#ifdef GGML_V3_HIP_UMA +#define cudaMalloc hipMallocManaged +#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size) +#else +#define cudaMalloc hipMalloc +#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) +#endif +#define cudaMemcpy hipMemcpy +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyPeerAsync hipMemcpyPeerAsync +#define cudaMemcpy2DAsync hipMemcpy2DAsync +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyKind hipMemcpyKind +#define cudaMemset hipMemset +#define cudaMemsetAsync hipMemsetAsync +#define cudaOccupancyMaxPotentialBlockSize 
hipOccupancyMaxPotentialBlockSize +#define cudaSetDevice hipSetDevice +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags +#define cudaStreamFireAndForget hipStreamFireAndForget +#define cudaStreamNonBlocking hipStreamNonBlocking +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags) +#define cudaStream_t hipStream_t +#define cudaSuccess hipSuccess +#define __trap abort +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED +#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED +#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE +#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH +#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR +#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED +#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR +#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED +#else +#include +#include +#include +#include + +#if CUDART_VERSION < 11020 +#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED +#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH +#define CUBLAS_COMPUTE_16F CUDA_R_16F +#define CUBLAS_COMPUTE_32F CUDA_R_32F +#define cublasComputeType_t cudaDataType_t +#endif // CUDART_VERSION < 11020 + +#endif // defined(GGML_USE_HIPBLAS) + +#include "ggml_v3-cuda.h" +#include "ggml_v3.h" + +#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed) + +#define CC_PASCAL 600 +#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products +#define CC_VOLTA 700 +#define CC_OFFSET_AMD 1000000 +#define CC_RDNA1 (CC_OFFSET_AMD + 1010) +#define CC_RDNA2 (CC_OFFSET_AMD + 1030) +#define CC_RDNA3 (CC_OFFSET_AMD + 1100) + +#define GGML_V3_CUDA_MAX_NODES 8192 + +// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication +// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant +// for large computational tasks. 
the drawback is that this requires some extra amount of VRAM: +// - 7B quantum model: +100-200 MB +// - 13B quantum model: +200-400 MB +// +#define GGML_V3_CUDA_FORCE_MMQ + +// TODO: improve this to be correct for more hardware +// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores +#if !defined(GGML_V3_CUDA_FORCE_MMQ) +#define CUDA_USE_TENSOR_CORES +#endif + +// max batch size to use MMQ kernels when tensor cores are available +#define MMQ_MAX_BATCH_SIZE 32 + +#if defined(GGML_USE_HIPBLAS) +#define __CUDA_ARCH__ 1300 + +#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \ + defined(__gfx1150__) || defined(__gfx1151__) +#define RDNA3 +#endif + +#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \ + defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__) +#define RDNA2 +#endif + +#ifndef __has_builtin + #define __has_builtin(x) 0 +#endif + +typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); +static __device__ __forceinline__ int __vsubss4(const int a, const int b) { + const int8x4_t va = reinterpret_cast(a); + const int8x4_t vb = reinterpret_cast(b); +#if __has_builtin(__builtin_elementwise_sub_sat) + const int8x4_t c = __builtin_elementwise_sub_sat(va, vb); + return reinterpret_cast(c); +#else + int8x4_t c; + int16_t tmp; +#pragma unroll + for (int i = 0; i < 4; i++) { + tmp = va[i] - vb[i]; + if(tmp > std::numeric_limits::max()) tmp = std::numeric_limits::max(); + if(tmp < std::numeric_limits::min()) tmp = std::numeric_limits::min(); + c[i] = tmp; + } + return reinterpret_cast(c); +#endif // __has_builtin(__builtin_elementwise_sub_sat) +} + +static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { +#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) + c = __builtin_amdgcn_sdot4(a, b, c, false); +#elif defined(RDNA3) + c = __builtin_amdgcn_sudot4( true, a, true, b, c, false); +#elif defined(__gfx1010__) || defined(__gfx900__) + int tmp1; + int tmp2; + asm("\n \ + v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \ + v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \ + v_add3_u32 %0, %1, %2, %0 \n \ + v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \ + v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \ + v_add3_u32 %0, %1, %2, %0 \n \ + " + : "+v"(c), "=&v"(tmp1), "=&v"(tmp2) + : "v"(a), "v"(b) + ); +#else + const int8x4_t va = reinterpret_cast(a); + const int8x4_t vb = reinterpret_cast(b); + c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3]; +#endif + return c; +} +#endif // defined(GGML_USE_HIPBLAS) + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +static_assert(sizeof(half) == sizeof(ggml_v3_fp16_t), "wrong fp16 size"); + +[[noreturn]] +static void ggml_v3_cuda_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) { + int id = -1; // in case cudaGetDevice fails + cudaGetDevice(&id); + + fprintf(stderr, "CUDA error: %s\n", msg); + fprintf(stderr, " current device: %d, in function %s at %s:%d\n", id, func, file, line); + fprintf(stderr, " %s\n", stmt); + // abort with GGML_V3_ASSERT to 
get a stack trace + GGML_V3_ASSERT(!"CUDA error"); +} + +#define CUDA_CHECK_GEN(err, success, error_fn) \ + do { \ + auto err_ = (err); \ + if (err_ != (success)) { \ + ggml_v3_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_)); \ + } \ + } while (0) + +#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString) + +#if CUDART_VERSION >= 12000 + static const char * cublas_get_error_str(const cublasStatus_t err) { + return cublasGetStatusString(err); + } +#else + static const char * cublas_get_error_str(const cublasStatus_t err) { + switch (err) { + case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; + case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED"; + default: return "unknown error"; + } + } +#endif // CUDART_VERSION >= 12000 + +#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str) + +#if !defined(GGML_USE_HIPBLAS) +static const char * cu_get_error_str(CUresult err) { + const char * err_str; + cuGetErrorString(err, &err_str); + return err_str; +} +#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str) +#endif + +#if CUDART_VERSION >= 11100 +#define GGML_V3_CUDA_ASSUME(x) __builtin_assume(x) +#else +#define GGML_V3_CUDA_ASSUME(x) +#endif // CUDART_VERSION >= 11100 + +#ifdef GGML_V3_CUDA_F16 +typedef half dfloat; // dequantize float +typedef half2 dfloat2; +#else +typedef float dfloat; // dequantize float +typedef float2 dfloat2; +#endif //GGML_V3_CUDA_F16 + +static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) { + const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment + + int x32 = 0; + x32 |= x16[0] << 0; + x32 |= x16[1] << 16; + + return x32; +} + +static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) { + const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment + + int x32 = 0; + x32 |= x16[0] << 0; + x32 |= x16[1] << 16; + + return x32; +} + +static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) { + return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment +} + +static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) { + return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment +} + +template +using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int k, cudaStream_t stream); +typedef to_t_cuda_t to_fp32_cuda_t; +typedef to_t_cuda_t to_fp16_cuda_t; + +typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v); +typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v); +typedef void (*cpy_kernel_t)(const char * cx, char * cdst); +typedef void (*ggml_v3_cuda_func_t)(const ggml_v3_tensor * src0, const 
ggml_v3_tensor * src1, ggml_v3_tensor * dst); +typedef void (*ggml_v3_cuda_op_mul_mat_t)( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, cudaStream_t stream); +typedef void (*ggml_v3_cuda_op_flatten_t)( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream); + +// QK = number of values after dequantization +// QR = QK / number of values before dequantization +// QI = number of 32 bit integers before dequantization + +#define QK4_0 32 +#define QR4_0 2 +#define QI4_0 (QK4_0 / (4 * QR4_0)) +typedef struct { + half d; // delta + uint8_t qs[QK4_0 / 2]; // nibbles / quants +} block_q4_0; +static_assert(sizeof(block_q4_0) == sizeof(ggml_v3_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding"); + +#define QK4_1 32 +#define QR4_1 2 +#define QI4_1 (QK4_1 / (4 * QR4_1)) +typedef struct { + half2 dm; // dm.x = delta, dm.y = min + uint8_t qs[QK4_1 / 2]; // nibbles / quants +} block_q4_1; +static_assert(sizeof(block_q4_1) == sizeof(ggml_v3_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding"); + +#define QK5_0 32 +#define QR5_0 2 +#define QI5_0 (QK5_0 / (4 * QR5_0)) +typedef struct { + half d; // delta + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_0 / 2]; // nibbles / quants +} block_q5_0; +static_assert(sizeof(block_q5_0) == sizeof(ggml_v3_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding"); + +#define QK5_1 32 +#define QR5_1 2 +#define QI5_1 (QK5_1 / (4 * QR5_1)) +typedef struct { + half2 dm; // dm.x = delta, dm.y = min + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_1 / 2]; // nibbles / quants +} block_q5_1; +static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_v3_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding"); + +#define QK8_0 32 +#define QR8_0 1 +#define QI8_0 (QK8_0 / (4 * QR8_0)) +typedef struct { + half d; // delta + int8_t qs[QK8_0]; // quants +} block_q8_0; +static_assert(sizeof(block_q8_0) == sizeof(ggml_v3_fp16_t) + QK8_0, "wrong q8_0 block size/padding"); + +#define QK8_1 32 +#define QR8_1 1 +#define QI8_1 (QK8_1 / (4 * QR8_1)) +typedef struct { + half2 ds; // ds.x = delta, ds.y = sum + int8_t qs[QK8_0]; // quants +} block_q8_1; +static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_v3_fp16_t) + QK8_0, "wrong q8_1 block size/padding"); + +typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs); +typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc); +typedef void (*load_tiles_cuda_t)( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row); +typedef float (*vec_dot_q_mul_mat_cuda_t)( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k); + +//================================= k-quants + +#ifdef GGML_V3_QKK_64 +#define QK_K 64 +#define K_SCALE_SIZE 4 +#else +#define QK_K 256 +#define K_SCALE_SIZE 12 +#endif + +#define QR2_K 4 +#define 
QI2_K (QK_K / (4*QR2_K)) +typedef struct { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + half2 dm; // super-block scale for quantized scales/mins +} block_q2_K; +static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_v3_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); + +#define QR3_K 4 +#define QI3_K (QK_K / (4*QR3_K)) +typedef struct { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#ifdef GGML_V3_QKK_64 + uint8_t scales[2]; // scales, quantized with 8 bits +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + half d; // super-block scale +} block_q3_K; +//static_assert(sizeof(block_q3_K) == sizeof(ggml_v3_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding"); + +#define QR4_K 2 +#define QI4_K (QK_K / (4*QR4_K)) +#ifdef GGML_V3_QKK_64 +typedef struct { + half dm[2]; // super-block scales/mins + uint8_t scales[2]; // 4-bit block scales/mins + uint8_t qs[QK_K/2]; // 4--bit quants +} block_q4_K; +static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding"); +#else +typedef struct { + half2 dm; // super-block scale for quantized scales/mins + uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +} block_q4_K; +static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_v3_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding"); +#endif + +#define QR5_K 2 +#define QI5_K (QK_K / (4*QR5_K)) +#ifdef GGML_V3_QKK_64 +typedef struct { + half d; // super-block scale + int8_t scales[QK_K/16]; // block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +} block_q5_K; +static_assert(sizeof(block_q5_K) == sizeof(ggml_v3_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding"); +#else +typedef struct { + half2 dm; // super-block scale for quantized scales/mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +} block_q5_K; +static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_v3_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); +#endif + +#define QR6_K 2 +#define QI6_K (QK_K / (4*QR6_K)) +typedef struct { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales + half d; // delta +} block_q6_K; +static_assert(sizeof(block_q6_K) == sizeof(ggml_v3_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding"); + +#define QR2_XXS 8 +#define QI2_XXS (QK_K / (4*QR2_XXS)) +typedef struct { + half d; + uint16_t qs[QK_K/8]; +} block_iq2_xxs; +static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_v3_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding"); + +#define QR2_XS 8 +#define QI2_XS (QK_K / (4*QR2_XS)) +typedef struct { + half d; + uint16_t qs[QK_K/8]; + uint8_t scales[QK_K/32]; +} block_iq2_xs; +static_assert(sizeof(block_iq2_xs) == sizeof(ggml_v3_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding"); + +#define WARP_SIZE 32 +#define MATRIX_ROW_PADDING 512 // last row of quant. 
matrices is a multiple of this to avoid out-of-bounds memory accesses
+
+#define CUDA_GELU_BLOCK_SIZE 256
+#define CUDA_SILU_BLOCK_SIZE 256
+#define CUDA_TANH_BLOCK_SIZE 256
+#define CUDA_RELU_BLOCK_SIZE 256
+#define CUDA_SQR_BLOCK_SIZE 256
+#define CUDA_CPY_BLOCK_SIZE 32
+#define CUDA_SCALE_BLOCK_SIZE 256
+#define CUDA_CLAMP_BLOCK_SIZE 256
+#define CUDA_ROPE_BLOCK_SIZE 256
+#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
+#define CUDA_ALIBI_BLOCK_SIZE 32
+#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
+#define CUDA_QUANTIZE_BLOCK_SIZE 256
+#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
+#define CUDA_GET_ROWS_BLOCK_SIZE 256
+#define CUDA_UPSCALE_BLOCK_SIZE 256
+#define CUDA_CONCAT_BLOCK_SIZE 256
+#define CUDA_PAD_BLOCK_SIZE 256
+#define CUDA_ACC_BLOCK_SIZE 256
+#define CUDA_IM2COL_BLOCK_SIZE 256
+
+// dmmv = dequantize_mul_mat_vec
+#ifndef GGML_V3_CUDA_DMMV_X
+#define GGML_V3_CUDA_DMMV_X 32
+#endif
+#ifndef GGML_V3_CUDA_MMV_Y
+#define GGML_V3_CUDA_MMV_Y 1
+#endif
+
+#ifndef K_QUANTS_PER_ITERATION
+#define K_QUANTS_PER_ITERATION 2
+#else
+static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
+#endif
+
+#ifndef GGML_V3_CUDA_PEER_MAX_BATCH_SIZE
+#define GGML_V3_CUDA_PEER_MAX_BATCH_SIZE 128
+#endif // GGML_V3_CUDA_PEER_MAX_BATCH_SIZE
+
+#define MUL_MAT_SRC1_COL_STRIDE 128
+
+#define MAX_STREAMS 8
+static cudaStream_t g_cudaStreams[GGML_V3_CUDA_MAX_DEVICES][MAX_STREAMS] = { { nullptr } };
+
+struct ggml_v3_tensor_extra_gpu {
+    void * data_device[GGML_V3_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
+    cudaEvent_t events[GGML_V3_CUDA_MAX_DEVICES][MAX_STREAMS]; // events for synchronizing multiple GPUs
+};
+
+// this is faster on Windows
+// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
+static void ggml_v3_cuda_set_device(const int device) {
+    int current_device;
+    CUDA_CHECK(cudaGetDevice(&current_device));
+
+    if (device == current_device) {
+        return;
+    }
+
+    CUDA_CHECK(cudaSetDevice(device));
+}
+
+static int g_device_count = -1;
+static int g_main_device = 0;
+static float g_tensor_split[GGML_V3_CUDA_MAX_DEVICES] = {0};
+static bool g_mul_mat_q = false;
+
+struct cuda_device_capabilities {
+    int cc; // compute capability
+    size_t smpb; // max.
shared memory per block + bool vmm; // virtual memory support + size_t vmm_granularity; // granularity of virtual memory +}; + +static cuda_device_capabilities g_device_caps[GGML_V3_CUDA_MAX_DEVICES] = { {0, 0, false, 0} }; + +static void * g_scratch_buffer = nullptr; +static size_t g_scratch_size = 0; // disabled by default +static size_t g_scratch_offset = 0; + +static cublasHandle_t g_cublas_handles[GGML_V3_CUDA_MAX_DEVICES] = {nullptr}; + +[[noreturn]] +static __device__ void bad_arch() { + printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n"); + __trap(); + + (void) bad_arch; // suppress unused function warning +} + +static __device__ __forceinline__ float warp_reduce_sum(float x) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + x += __shfl_xor_sync(0xffffffff, x, mask, 32); + } + return x; +} + +static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32); + a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32); + } + return a; +} + +static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { +#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32)); + } + return a; +#else + (void) a; + bad_arch(); +#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL +} + +static __device__ __forceinline__ float warp_reduce_max(float x) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32)); + } + return x; +} + +static __device__ __forceinline__ half2 warp_reduce_max(half2 x) { +#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32)); + } + return x; +#else + (void) x; + bad_arch(); +#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX +} + +static __device__ __forceinline__ float op_repeat(const float a, const float b) { + return b; + GGML_V3_UNUSED(a); +} + +static __device__ __forceinline__ float op_add(const float a, const float b) { + return a + b; +} + +static __device__ __forceinline__ float op_mul(const float a, const float b) { + return a * b; +} + +static __device__ __forceinline__ float op_div(const float a, const float b) { + return a / b; +} + +template +static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst, + int ne0, int ne1, int ne2, int ne3, + int ne10, int ne11, int ne12, int ne13, + /*int s0, */ int s1, int s2, int s3, + /*int s10,*/ int s11, int s12, int s13) { + const int i0s = blockDim.x*blockIdx.x + threadIdx.x; + const int i1 = (blockDim.y*blockIdx.y + threadIdx.y); + const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3; + const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3; + + if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { + return; + } + + const int i11 = i1 % ne11; + const int i12 = i2 % ne12; + const int i13 = i3 % ne13; + + const size_t i_src0 = i3*s3 + i2*s2 + i1*s1; + const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; + const size_t i_dst = i_src0; + + const src0_t * src0_row = src0 + i_src0; 
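// ---------------------------------------------------------------------------
// Illustrative aside, not part of the patch: the `i11 = i1 % ne11`,
// `i12 = i2 % ne12`, `i13 = i3 % ne13` indexing in k_bin_bcast above is
// numpy-style broadcasting -- any src1 dimension of size 1 wraps back to
// index 0, so a row, column or scalar can be combined element-wise with a
// full tensor. A minimal host-side sketch of the same index math follows;
// the function name, shapes and contiguous layout are hypothetical, chosen
// only for illustration.
#include <cstdio>

static void broadcast_add(const float * src0, const float * src1, float * dst,
                          int ne0, int ne1, int ne10, int ne11) {
    for (int i1 = 0; i1 < ne1; ++i1) {
        const int i11 = i1 % ne11;                      // size-1 dim of src1 repeats
        for (int i0 = 0; i0 < ne0; ++i0) {
            const int i10 = i0 % ne10;
            dst[i1*ne0 + i0] = src0[i1*ne0 + i0] + src1[i11*ne10 + i10];
        }
    }
}

int main() {
    const int ne0 = 4, ne1 = 3;                         // dst/src0 shape: 4 x 3
    float src0[ne1*ne0], dst[ne1*ne0];
    for (int i = 0; i < ne1*ne0; ++i) src0[i] = 1.0f;
    const float src1[ne0] = {10.f, 20.f, 30.f, 40.f};   // src1 shape: 4 x 1, broadcast over ne1
    broadcast_add(src0, src1, dst, ne0, ne1, /*ne10=*/ne0, /*ne11=*/1);
    printf("dst[2*4 + 1] = %.1f\n", dst[2*4 + 1]);      // expect 21.0
    return 0;
}
// ---------------------------------------------------------------------------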
+ const src1_t * src1_row = src1 + i_src1; + dst_t * dst_row = dst + i_dst; + + for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) { + const int i10 = i0 % ne10; + dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]); + } +} + +template +static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst, + int ne0, int ne1, int ne2, int ne3, + int ne10, int ne11, int ne12, int ne13, + /*int s0, */ int s1, int s2, int s3, + /*int s10,*/ int s11, int s12, int s13) { + + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + const int i3 = i/(ne2*ne1*ne0); + const int i2 = (i/(ne1*ne0)) % ne2; + const int i1 = (i/ne0) % ne1; + const int i0 = i % ne0; + + if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { + return; + } + + const int i11 = i1 % ne11; + const int i12 = i2 % ne12; + const int i13 = i3 % ne13; + + const size_t i_src0 = i3*s3 + i2*s2 + i1*s1; + const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; + const size_t i_dst = i_src0; + + const src0_t * src0_row = src0 + i_src0; + const src1_t * src1_row = src1 + i_src1; + dst_t * dst_row = dst + i_dst; + + const int i10 = i0 % ne10; + dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]); +} + +static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne, + const int ne10, const int ne11, const int ne12, + const int nb1, const int nb2, int offset) { + const int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i >= ne) { + return; + } + int src1_idx = i - offset; + int oz = src1_idx / nb2; + int oy = (src1_idx - (oz * nb2)) / nb1; + int ox = src1_idx % nb1; + if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) { + dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11]; + } else { + dst[i] = x[i]; + } +} + +static __global__ void gelu_f32(const float * x, float * dst, const int k) { + const float GELU_COEF_A = 0.044715f; + const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + + float xi = x[i]; + dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi))); +} + +static __global__ void silu_f32(const float * x, float * dst, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + dst[i] = x[i] / (1.0f + expf(-x[i])); +} + +static __global__ void gelu_quick_f32(const float * x, float * dst, int k) { + const float GELU_QUICK_COEF = -1.702f; + const int i = blockDim.x*blockIdx.x + threadIdx.x; + if (i >= k) { + return; + } + dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i]))); +} + +static __global__ void tanh_f32(const float * x, float * dst, int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + if (i >= k) { + return; + } + dst[i] = tanhf(x[i]); +} + +static __global__ void relu_f32(const float * x, float * dst, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + dst[i] = fmaxf(x[i], 0); +} + +static __global__ void leaky_relu_f32(const float * x, float * dst, const int k, const float negative_slope) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + if (i >= k) { + return; + } + dst[i] = fmaxf(x[i], 0) + fminf(x[i], 0.0f) * negative_slope; +} + +static __global__ void sqr_f32(const float * x, float * dst, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + dst[i] = x[i] * x[i]; +} + +template +static __global__ void 
norm_f32(const float * x, float * dst, const int ncols, const float eps) { + const int row = blockIdx.x*blockDim.y + threadIdx.y; + const int tid = threadIdx.x; + + float2 mean_var = make_float2(0.f, 0.f); + + for (int col = tid; col < ncols; col += block_size) { + const float xi = x[row*ncols + col]; + mean_var.x += xi; + mean_var.y += xi * xi; + } + + // sum up partial sums + mean_var = warp_reduce_sum(mean_var); + if (block_size > WARP_SIZE) { + __shared__ float2 s_sum[32]; + int warp_id = threadIdx.x / WARP_SIZE; + int lane_id = threadIdx.x % WARP_SIZE; + if (lane_id == 0) { + s_sum[warp_id] = mean_var; + } + __syncthreads(); + mean_var = s_sum[lane_id]; + mean_var = warp_reduce_sum(mean_var); + } + + const float mean = mean_var.x / ncols; + const float var = mean_var.y / ncols - mean * mean; + const float inv_std = rsqrtf(var + eps); + + for (int col = tid; col < ncols; col += block_size) { + dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std; + } +} + +static __global__ void concat_f32(const float * x,const float * y, float * dst, const int ne0, const int ne02) { + int nidx = threadIdx.x + blockIdx.x * blockDim.x; + if (nidx >= ne0) { + return; + } + // operation + int offset_dst = + nidx + + blockIdx.y * ne0 + + blockIdx.z * ne0 * gridDim.y; + if (blockIdx.z < ne02) { // src0 + int offset_src = + nidx + + blockIdx.y * ne0 + + blockIdx.z * ne0 * gridDim.y; + dst[offset_dst] = x[offset_src]; + } else { + int offset_src = + nidx + + blockIdx.y * ne0 + + (blockIdx.z - ne02) * ne0 * gridDim.y; + dst[offset_dst] = y[offset_src]; + } +} + +static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int nb02, const int scale_factor) { + int ne0 = ne00 * scale_factor; + int nidx = threadIdx.x + blockIdx.x * blockDim.x; + if (nidx >= ne0) { + return; + } + // operation + int i00 = nidx / scale_factor; + int i01 = blockIdx.y / scale_factor; + int offset_src = + i00 + + i01 * ne00 + + blockIdx.z * nb02; + int offset_dst = + nidx + + blockIdx.y * ne0 + + blockIdx.z * ne0 * gridDim.y; + dst[offset_dst] = x[offset_src]; +} + +static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02) { + int nidx = threadIdx.x + blockIdx.x * blockDim.x; + if (nidx >= ne0) { + return; + } + + // operation + int offset_dst = + nidx + + blockIdx.y * ne0 + + blockIdx.z * ne0 * gridDim.y; + if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02) { + int offset_src = + nidx + + blockIdx.y * ne00 + + blockIdx.z * ne00 * ne01; + dst[offset_dst] = x[offset_src]; + } else { + dst[offset_dst] = 0.0f; + } +} + +template +static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) { + int start = blockIdx.x * group_size; + int end = start + group_size; + + start += threadIdx.x; + + if (end >= ne_elements) { + end = ne_elements; + } + + float tmp = 0.0f; // partial sum for thread in warp + + for (int j = start; j < end; j += block_size) { + tmp += x[j]; + } + + tmp = warp_reduce_sum(tmp); + if (block_size > WARP_SIZE) { + __shared__ float s_sum[32]; + int warp_id = threadIdx.x / WARP_SIZE; + int lane_id = threadIdx.x % WARP_SIZE; + if (lane_id == 0) { + s_sum[warp_id] = tmp; + } + __syncthreads(); + tmp = s_sum[lane_id]; + tmp = warp_reduce_sum(tmp); + } + + float mean = tmp / group_size; + tmp = 0.0f; + + for (int j = start; j < end; j += block_size) { + float xi = x[j] - mean; + dst[j] = xi; + tmp += xi * xi; + } + + tmp = warp_reduce_sum(tmp); + if 
(block_size > WARP_SIZE) { + __shared__ float s_sum[32]; + int warp_id = threadIdx.x / WARP_SIZE; + int lane_id = threadIdx.x % WARP_SIZE; + if (lane_id == 0) { + s_sum[warp_id] = tmp; + } + __syncthreads(); + tmp = s_sum[lane_id]; + tmp = warp_reduce_sum(tmp); + } + + float variance = tmp / group_size; + float scale = rsqrtf(variance + eps); + for (int j = start; j < end; j += block_size) { + dst[j] *= scale; + } +} + +template +static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) { + const int row = blockIdx.x*blockDim.y + threadIdx.y; + const int tid = threadIdx.x; + + float tmp = 0.0f; // partial sum for thread in warp + + for (int col = tid; col < ncols; col += block_size) { + const float xi = x[row*ncols + col]; + tmp += xi * xi; + } + + // sum up partial sums + tmp = warp_reduce_sum(tmp); + if (block_size > WARP_SIZE) { + __shared__ float s_sum[32]; + int warp_id = threadIdx.x / WARP_SIZE; + int lane_id = threadIdx.x % WARP_SIZE; + if (lane_id == 0) { + s_sum[warp_id] = tmp; + } + __syncthreads(); + tmp = s_sum[lane_id]; + tmp = warp_reduce_sum(tmp); + } + + const float mean = tmp / ncols; + const float scale = rsqrtf(mean + eps); + + for (int col = tid; col < ncols; col += block_size) { + dst[row*ncols + col] = scale * x[row*ncols + col]; + } +} + +static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){ + const block_q4_0 * x = (const block_q4_0 *) vx; + + const dfloat d = x[ib].d; + + const int vui = x[ib].qs[iqs]; + + v.x = vui & 0xF; + v.y = vui >> 4; + +#ifdef GGML_V3_CUDA_F16 + v = __hsub2(v, {8.0f, 8.0f}); + v = __hmul2(v, {d, d}); +#else + v.x = (v.x - 8.0f) * d; + v.y = (v.y - 8.0f) * d; +#endif // GGML_V3_CUDA_F16 +} + +static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){ + const block_q4_1 * x = (const block_q4_1 *) vx; + + const dfloat d = __low2half(x[ib].dm); + const dfloat m = __high2half(x[ib].dm); + + const int vui = x[ib].qs[iqs]; + + v.x = vui & 0xF; + v.y = vui >> 4; + +#ifdef GGML_V3_CUDA_F16 + v = __hmul2(v, {d, d}); + v = __hadd2(v, {m, m}); +#else + v.x = (v.x * d) + m; + v.y = (v.y * d) + m; +#endif // GGML_V3_CUDA_F16 +} + +static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){ + const block_q5_0 * x = (const block_q5_0 *) vx; + + const dfloat d = x[ib].d; + + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + v.x = ((x[ib].qs[iqs] & 0xf) | xh_0); + v.y = ((x[ib].qs[iqs] >> 4) | xh_1); + +#ifdef GGML_V3_CUDA_F16 + v = __hsub2(v, {16.0f, 16.0f}); + v = __hmul2(v, {d, d}); +#else + v.x = (v.x - 16.0f) * d; + v.y = (v.y - 16.0f) * d; +#endif // GGML_V3_CUDA_F16 +} + +static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){ + const block_q5_1 * x = (const block_q5_1 *) vx; + + const dfloat d = __low2half(x[ib].dm); + const dfloat m = __high2half(x[ib].dm); + + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + v.x = ((x[ib].qs[iqs] & 0xf) | xh_0); + v.y = ((x[ib].qs[iqs] >> 4) | xh_1); + +#ifdef GGML_V3_CUDA_F16 + v = __hmul2(v, {d, d}); + v = __hadd2(v, {m, m}); +#else + v.x = (v.x * d) + m; + v.y = (v.y * d) + m; +#endif // GGML_V3_CUDA_F16 +} + +static __device__ 
__forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){ + const block_q8_0 * x = (const block_q8_0 *) vx; + + const dfloat d = x[ib].d; + + v.x = x[ib].qs[iqs + 0]; + v.y = x[ib].qs[iqs + 1]; + +#ifdef GGML_V3_CUDA_F16 + v = __hmul2(v, {d, d}); +#else + v.x *= d; + v.y *= d; +#endif // GGML_V3_CUDA_F16 +} + +//================================== k-quants + +template +static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { + + const int i = blockIdx.x; + const block_q2_K * x = (const block_q2_K *) vx; + + const int tid = threadIdx.x; +#if QK_K == 256 + const int n = tid/32; + const int l = tid - 32*n; + const int is = 8*n + l/16; + + const uint8_t q = x[i].qs[32*n + l]; + dst_t * y = yy + i*QK_K + 128*n; + + float dall = __low2half(x[i].dm); + float dmin = __high2half(x[i].dm); + y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); + y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4); + y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4); + y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4); +#else + const int is = tid/16; // 0 or 1 + const int il = tid%16; // 0...15 + const uint8_t q = x[i].qs[il] >> (2*is); + dst_t * y = yy + i*QK_K + 16*is + il; + float dall = __low2half(x[i].dm); + float dmin = __high2half(x[i].dm); + y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); + y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4); +#endif + +} + +template +static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { + + const int i = blockIdx.x; + const block_q3_K * x = (const block_q3_K *) vx; + +#if QK_K == 256 + const int r = threadIdx.x/4; + const int tid = r/2; + const int is0 = r%2; + const int l0 = 16*is0 + 4*(threadIdx.x%4); + const int n = tid / 4; + const int j = tid - 4*n; + + uint8_t m = 1 << (4*n + j); + int is = 8*n + 2*j + is0; + int shift = 2*j; + + int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) : + is < 8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) : + is < 12 ? (x[i].scales[is-8] >> 4) | (((x[i].scales[is+0] >> 4) & 3) << 4) : + (x[i].scales[is-8] >> 4) | (((x[i].scales[is-4] >> 6) & 3) << 4); + float d_all = x[i].d; + float dl = d_all * (us - 32); + + dst_t * y = yy + i*QK_K + 128*n + 32*j; + const uint8_t * q = x[i].qs + 32*n; + const uint8_t * hm = x[i].hmask; + + for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)); +#else + const int tid = threadIdx.x; + const int is = tid/16; // 0 or 1 + const int il = tid%16; // 0...15 + const int im = il/8; // 0...1 + const int in = il%8; // 0...7 + + dst_t * y = yy + i*QK_K + 16*is + il; + + const uint8_t q = x[i].qs[il] >> (2*is); + const uint8_t h = x[i].hmask[in] >> (2*is + im); + const float d = (float)x[i].d; + + if (is == 0) { + y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); + y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); + } else { + y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); + y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 
0 : 4)); + } +#endif + +} + +#if QK_K == 256 +static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) { + if (j < 4) { + d = q[j] & 63; m = q[j + 4] & 63; + } else { + d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); + m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); + } +} +#endif + +template +static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { + const block_q4_K * x = (const block_q4_K *) vx; + + const int i = blockIdx.x; + +#if QK_K == 256 + // assume 32 threads + const int tid = threadIdx.x; + const int il = tid/8; + const int ir = tid%8; + const int is = 2*il; + const int n = 4; + + dst_t * y = yy + i*QK_K + 64*il + n*ir; + + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); + + const uint8_t * q = x[i].qs + 32*il + n*ir; + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[i].scales, sc, m); + const float d1 = dall * sc; const float m1 = dmin * m; + get_scale_min_k4(is + 1, x[i].scales, sc, m); + const float d2 = dall * sc; const float m2 = dmin * m; + for (int l = 0; l < n; ++l) { + y[l + 0] = d1 * (q[l] & 0xF) - m1; + y[l +32] = d2 * (q[l] >> 4) - m2; + } +#else + const int tid = threadIdx.x; + const uint8_t * q = x[i].qs; + dst_t * y = yy + i*QK_K; + const float d = (float)x[i].dm[0]; + const float m = (float)x[i].dm[1]; + y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4); + y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4); +#endif +} + +template +static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { + const block_q5_K * x = (const block_q5_K *) vx; + + const int i = blockIdx.x; + +#if QK_K == 256 + // assume 64 threads - this is very slightly better than the one below + const int tid = threadIdx.x; + const int il = tid/16; // il is in 0...3 + const int ir = tid%16; // ir is in 0...15 + const int is = 2*il; // is is in 0...6 + + dst_t * y = yy + i*QK_K + 64*il + 2*ir; + + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); + + const uint8_t * ql = x[i].qs + 32*il + 2*ir; + const uint8_t * qh = x[i].qh + 2*ir; + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[i].scales, sc, m); + const float d1 = dall * sc; const float m1 = dmin * m; + get_scale_min_k4(is + 1, x[i].scales, sc, m); + const float d2 = dall * sc; const float m2 = dmin * m; + + uint8_t hm = 1 << (2*il); + y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1; + y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1; + hm <<= 1; + y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2; + y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2; +#else + const int tid = threadIdx.x; + const uint8_t q = x[i].qs[tid]; + const int im = tid/8; // 0...3 + const int in = tid%8; // 0...7 + const int is = tid/16; // 0 or 1 + const uint8_t h = x[i].qh[in] >> im; + const float d = x[i].d; + dst_t * y = yy + i*QK_K + tid; + y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16)); + y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 
0 : 16)); +#endif +} + +template +static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { + const block_q6_K * x = (const block_q6_K *) vx; + + const int i = blockIdx.x; +#if QK_K == 256 + + // assume 64 threads - this is very slightly better than the one below + const int tid = threadIdx.x; + const int ip = tid/32; // ip is 0 or 1 + const int il = tid - 32*ip; // 0...32 + const int is = 8*ip + il/16; + + dst_t * y = yy + i*QK_K + 128*ip + il; + + const float d = x[i].d; + + const uint8_t * ql = x[i].ql + 64*ip + il; + const uint8_t qh = x[i].qh[32*ip + il]; + const int8_t * sc = x[i].scales + is; + + y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32); + y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); + y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32); + y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); +#else + + // assume 32 threads + const int tid = threadIdx.x; + const int ip = tid/16; // 0 or 1 + const int il = tid - 16*ip; // 0...15 + + dst_t * y = yy + i*QK_K + 16*ip + il; + + const float d = x[i].d; + + const uint8_t ql = x[i].ql[16*ip + il]; + const uint8_t qh = x[i].qh[il] >> (2*ip); + const int8_t * sc = x[i].scales; + + y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32); + y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32); +#endif +} + +static const __device__ uint64_t iq2xxs_grid[256] = { + 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, + 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808, + 0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819, + 0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819, + 0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b, + 0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808, + 0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08, + 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b, + 0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819, + 0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08, + 0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, + 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08, + 0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808, + 0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808, + 0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919, + 0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819, + 0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08, + 0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908, + 0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819, + 0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808, + 0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808, + 0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908, + 0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808, + 0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08, + 0x08191919192b082b, 0x0819192b08080808, 
0x0819192b0819192b, 0x08192b0808080819, + 0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819, + 0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819, + 0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908, + 0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19, + 0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819, + 0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b, + 0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808, + 0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908, + 0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08, + 0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08, + 0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908, + 0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819, + 0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808, + 0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808, + 0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19, + 0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819, + 0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, + 0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b, + 0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08, + 0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808, + 0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908, + 0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b, + 0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819, + 0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08, + 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08, + 0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808, + 0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b, + 0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b, + 0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908, + 0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819, + 0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808, + 0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908, + 0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b, + 0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808, + 0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b, + 0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b, + 0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808, + 0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19, + 0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908, +}; + +static const __device__ uint64_t iq2xs_grid[512] = { + 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, + 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b, + 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919, + 
0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b, + 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919, + 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808, + 0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819, + 0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819, + 0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, + 0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b, + 0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b, + 0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908, + 0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908, + 0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919, + 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808, + 0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919, + 0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908, + 0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, + 0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, + 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08, + 0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808, + 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808, + 0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819, + 0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908, + 0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819, + 0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808, + 0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b, + 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819, + 0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819, + 0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808, + 0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908, + 0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19, + 0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b, + 0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b, + 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919, + 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808, + 0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819, + 0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819, + 0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b, + 0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908, + 0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808, + 0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819, + 0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808, + 0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, + 0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808, + 0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808, + 0x08191908192b1908, 
0x081919082b080808, 0x0819191908080819, 0x0819191908081908, + 0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908, + 0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808, + 0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b, + 0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819, + 0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, + 0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908, + 0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808, + 0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908, + 0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919, + 0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08, + 0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19, + 0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b, + 0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b, + 0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808, + 0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08, + 0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b, + 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908, + 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b, + 0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908, + 0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, + 0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808, + 0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808, + 0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08, + 0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819, + 0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919, + 0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808, + 0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808, + 0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819, + 0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819, + 0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908, + 0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908, + 0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b, + 0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908, + 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908, + 0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908, + 0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808, + 0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, + 0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819, + 0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819, + 0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808, + 0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b, + 0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819, + 0x1919190819080808, 0x19191908192b0808, 
0x191919082b080819, 0x191919082b2b0819, + 0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08, + 0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808, + 0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19, + 0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919, + 0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, + 0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19, + 0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b, + 0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808, + 0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b, + 0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b, + 0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, + 0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b, + 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808, + 0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819, + 0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808, + 0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808, + 0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08, + 0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b, + 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19, + 0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08, + 0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919, + 0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08, + 0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08, + 0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908, + 0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908, + 0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b, + 0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908, + 0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808, + 0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b, + 0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808, + 0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808, + 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19, + 0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08, + 0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808, + 0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b, + 0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808, + 0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b, + 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b, +}; + +static const __device__ uint8_t ksigns_iq2xs[128] = { + 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15, + 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159, + 160, 33, 34, 163, 36, 165, 166, 39, 40, 169, 170, 43, 172, 45, 46, 175, + 48, 177, 178, 51, 180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63, + 192, 65, 66, 195, 68, 197, 198, 71, 72, 201, 
202, 75, 204, 77, 78, 207, + 80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90, 219, 92, 221, 222, 95, + 96, 225, 226, 99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111, + 240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255, +}; + +static const __device__ uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128}; + +inline bool ggml_v3_cuda_supports_mmq(enum ggml_v3_type type) { + switch (type) { + case GGML_V3_TYPE_Q4_0: + case GGML_V3_TYPE_Q4_1: + case GGML_V3_TYPE_Q5_0: + case GGML_V3_TYPE_Q5_1: + case GGML_V3_TYPE_Q8_0: + case GGML_V3_TYPE_Q2_K: + case GGML_V3_TYPE_Q3_K: + case GGML_V3_TYPE_Q4_K: + case GGML_V3_TYPE_Q5_K: + case GGML_V3_TYPE_Q6_K: + return true; + default: + return false; + } +} + +template +static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) { + + const int i = blockIdx.x; + const block_iq2_xxs * x = (const block_iq2_xxs *) vx; + + const int tid = threadIdx.x; +#if QK_K == 256 + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint16_t * q2 = x[i].qs + 4*ib; + const uint8_t * aux8 = (const uint8_t *)q2; + const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[il]); + const uint32_t aux32 = q2[2] | (q2[3] << 16); + const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f; + const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127]; + for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); +#else + assert(false); +#endif + +} + +template +static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) { + + const int i = blockIdx.x; + const block_iq2_xs * x = (const block_iq2_xs *) vx; + + const int tid = threadIdx.x; +#if QK_K == 256 + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint16_t * q2 = x[i].qs + 4*ib; + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511)); + const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; + const uint8_t signs = ksigns_iq2xs[q2[il] >> 9]; + for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); +#else + assert(false); +#endif + +} + +static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { + + static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); + + const int row = blockIdx.x*blockDim.y + threadIdx.y; + if (row > nrows) return; + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q2_K * x = (const block_q2_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 + const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15 + const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int step = 16/K_QUANTS_PER_ITERATION; + + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
+ const int in = tid - step*im; // 0...15 or 0...7 + + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2 + const int q_offset = 32*im + l0; + const int s_offset = 8*im; + const int y_offset = 128*im + l0; + + uint32_t aux[4]; + const uint8_t * d = (const uint8_t *)aux; + const uint8_t * m = (const uint8_t *)(aux + 2); + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * q = x[i].qs + q_offset; + + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); + + const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset); + aux[0] = a[0] & 0x0f0f0f0f; + aux[1] = a[1] & 0x0f0f0f0f; + aux[2] = (a[0] >> 4) & 0x0f0f0f0f; + aux[3] = (a[1] >> 4) & 0x0f0f0f0f; + + float sum1 = 0, sum2 = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3) + + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3) + + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3) + + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3) + + y[l+16] * d[1] * ((q[l+16] >> 0) & 3) + + y[l+48] * d[3] * ((q[l+16] >> 2) & 3) + + y[l+80] * d[5] * ((q[l+16] >> 4) & 3) + +y[l+112] * d[7] * ((q[l+16] >> 6) & 3); + sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6] + + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7]; + + } + tmp += dall * sum1 - dmin * sum2; + + } +#else + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3 + const int offset = tid * K_QUANTS_PER_ITERATION; + + uint32_t uaux[2]; + const uint8_t * d = (const uint8_t *)uaux; + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + offset; + const uint8_t * q = x[i].qs + offset; + const uint32_t * s = (const uint32_t *)x[i].scales; + + uaux[0] = s[0] & 0x0f0f0f0f; + uaux[1] = (s[0] >> 4) & 0x0f0f0f0f; + + const float2 dall = __half22float2(x[i].dm); + + float sum1 = 0, sum2 = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + const uint8_t ql = q[l]; + sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3) + + y[l+16] * d[1] * ((ql >> 2) & 3) + + y[l+32] * d[2] * ((ql >> 4) & 3) + + y[l+48] * d[3] * ((ql >> 6) & 3); + sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7]; + } + tmp += dall.x * sum1 - dall.y * sum2; + } +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (threadIdx.x == 0) { + dst[row] = tmp; + } +} + +static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { + + const int row = blockIdx.x*blockDim.y + threadIdx.y; + if (row > nrows) return; + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q3_K * x = (const block_q3_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 + + const uint16_t kmask1 = 0x0303; + const uint16_t kmask2 = 0x0f0f; + + const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop + const int step = 16/K_QUANTS_PER_ITERATION; + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
+ const int in = tid - step*im; // 0....15 or 0...7 + + const uint8_t m = 1 << (4*im); + + const int l0 = n*in; // 0...15 or 0...14 in steps of 2 + const int q_offset = 32*im + l0; + const int y_offset = 128*im + l0; + + uint16_t utmp[4]; + const int8_t * s = (const int8_t *)utmp; + + const uint16_t s_shift = 4*im; + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * q = x[i].qs + q_offset; + const uint8_t * h = x[i].hmask + l0; + + const uint16_t * a = (const uint16_t *)x[i].scales; + utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4); + utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4); + utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4); + utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4); + + const float d = x[i].d; + + float sum = 0; + for (int l = 0; l < n; ++l) { + sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4)) + + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4)) + + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4)) + + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4)); + sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4)) + + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4)) + + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4)) + + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4)); + } + tmp += d * sum; + + } +#else + + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3 + const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14 + const int in = offset/8; // 0 or 1 + const int im = offset%8; // 0...7 + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + offset; + const uint8_t * q = x[i].qs + offset; + const uint8_t * s = x[i].scales; + + const float dall = (float)x[i].d; + + float sum = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + const uint8_t hl = x[i].hmask[im+l] >> in; + const uint8_t ql = q[l]; + sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4)) + + y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4)) + + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4)) + + y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 
0 : 4)); + } + tmp += sum; + } +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (threadIdx.x == 0) { + dst[row] = tmp; + } +} + +static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { + + const int row = blockIdx.x*blockDim.y + threadIdx.y; + if (row > nrows) return; + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q4_K * x = (const block_q4_K *)vx + ib0; + +#if QK_K == 256 + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4 + + const int il = tid/step; // 0...3 + const int ir = tid - step*il; // 0...7 or 0...3 + const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4 + + const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 + const int in = il%2; + + const int l0 = n*(2*ir + in); + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + +#if K_QUANTS_PER_ITERATION == 2 + uint32_t q32[4]; + const uint8_t * q4 = (const uint8_t *)q32; +#else + uint16_t q16[4]; + const uint8_t * q4 = (const uint8_t *)q16; +#endif + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y1 = yy + i*QK_K + y_offset; + const float * y2 = y1 + 128; + + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); + + const uint16_t * a = (const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + +#if K_QUANTS_PER_ITERATION == 2 + const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset); + const uint32_t * q2 = q1 + 16; + + q32[0] = q1[0] & 0x0f0f0f0f; + q32[1] = q1[0] & 0xf0f0f0f0; + q32[2] = q2[0] & 0x0f0f0f0f; + q32[3] = q2[0] & 0xf0f0f0f0; + + float4 s = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + for (int l = 0; l < 4; ++l) { + s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4]; + s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12]; + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin; +#else + const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset); + const uint16_t * q2 = q1 + 32; + + q16[0] = q1[0] & 0x0f0f; + q16[1] = q1[0] & 0xf0f0; + q16[2] = q2[0] & 0x0f0f; + q16[3] = q2[0] & 0xf0f0; + + float4 s = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + for (int l = 0; l < 2; ++l) { + s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2]; + s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6]; + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin; +#endif + + } +#else + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); + + const int step = tid * 
K_QUANTS_PER_ITERATION; + + uint16_t aux16[2]; + const uint8_t * s = (const uint8_t *)aux16; + + float tmp = 0; + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + const uint8_t * q = x[i].qs + step; + const float * y = yy + i*QK_K + step; + const uint16_t * a = (const uint16_t *)x[i].scales; + aux16[0] = a[0] & 0x0f0f; + aux16[1] = (a[0] >> 4) & 0x0f0f; + const float d = (float)x[i].dm[0]; + const float m = (float)x[i].dm[1]; + float sum = 0.f; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2]) + + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2]) + + y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3]) + + y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]); + } + tmp += sum; + } + +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (tid == 0) { + dst[row] = tmp; + } +} + +static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) { + + const int row = blockIdx.x; + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q5_K * x = (const block_q5_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int tid = threadIdx.x/2; // 0...15 + const int ix = threadIdx.x%2; + + const int il = tid/4; // 0...3 + const int ir = tid - 4*il;// 0...3 + const int n = 2; + + const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 + const int in = il%2; + + const int l0 = n*(2*ir + in); + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + const uint8_t hm1 = 1 << (2*im); + const uint8_t hm2 = hm1 << 4; + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + + uint16_t q16[8]; + const uint8_t * q4 = (const uint8_t *)q16; + + for (int i = ix; i < num_blocks_per_row; i += 2) { + + const uint8_t * ql1 = x[i].qs + q_offset; + const uint8_t * qh = x[i].qh + l0; + const float * y1 = yy + i*QK_K + y_offset; + const float * y2 = y1 + 128; + + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); + + const uint16_t * a = (const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + + float4 sum = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + const uint16_t * q1 = (const uint16_t *)ql1; + const uint16_t * q2 = q1 + 32; + q16[0] = q1[0] & 0x0f0f; + q16[1] = q1[8] & 0x0f0f; + q16[2] = (q1[0] >> 4) & 0x0f0f; + q16[3] = (q1[8] >> 4) & 0x0f0f; + q16[4] = q2[0] & 0x0f0f; + q16[5] = q2[8] & 0x0f0f; + q16[6] = (q2[0] >> 4) & 0x0f0f; + q16[7] = (q2[8] >> 4) & 0x0f0f; + for (int l = 0; l < n; ++l) { + sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0)) + + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0)); + sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0)) + + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0)); + sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0)) + + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0)); + sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 
16 : 0)) + + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0)); + smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3] + + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7]; + } + tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin; + } + +#else + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); + const int step = tid * K_QUANTS_PER_ITERATION; + const int im = step/8; + const int in = step%8; + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + const uint8_t * q = x[i].qs + step; + const int8_t * s = x[i].scales; + const float * y = yy + i*QK_K + step; + const float d = x[i].d; + float sum = 0.f; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + const uint8_t h = x[i].qh[in+j] >> im; + sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16)) + + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16)) + + y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16)) + + y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16)); + } + tmp += sum; + } +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (threadIdx.x == 0) { + dst[row] = tmp; + } +} + +static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { + + static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); + + const int row = blockIdx.x*blockDim.y + threadIdx.y; + if (row > nrows) return; + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q6_K * x = (const block_q6_K *)vx + ib0; + +#if QK_K == 256 + + const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1 + + const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 + + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
+ const int in = tid - step*im; // 0...15 or 0...7 + +#if K_QUANTS_PER_ITERATION == 1 + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 + const int is = 0; +#else + const int l0 = 4 * in; // 0, 4, 8, ..., 28 + const int is = in / 4; +#endif + const int ql_offset = 64*im + l0; + const int qh_offset = 32*im + l0; + const int s_offset = 8*im + is; + const int y_offset = 128*im + l0; + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * ql = x[i].ql + ql_offset; + const uint8_t * qh = x[i].qh + qh_offset; + const int8_t * s = x[i].scales + s_offset; + + const float d = x[i].d; + +#if K_QUANTS_PER_ITERATION == 1 + float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) + + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) + + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) + + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32) + + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32) + + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32) + + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) + +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); + tmp += sum; +#else + float sum = 0; + for (int l = 0; l < 4; ++l) { + sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) + + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32) + + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32) + + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); + } + tmp += sum; +#endif + + } + +#else + + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...7 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0...3 + + const int step = tid * K_QUANTS_PER_ITERATION; + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + step; + const uint8_t * ql = x[i].ql + step; + const uint8_t * qh = x[i].qh + step; + const int8_t * s = x[i].scales; + + const float d = x[i+0].d; + + float sum = 0; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32) + + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32) + + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32) + + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32); + } + tmp += sum; + + } + +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (tid == 0) { + dst[row] = tmp; + } +} + +static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){ + const half * x = (const half *) vx; + + // automatic half -> float type cast if dfloat == float + v.x = x[ib + iqs + 0]; + v.y = x[ib + iqs + 1]; +} + +static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) { + const int ix = blockDim.x*blockIdx.x + threadIdx.x; + + if (ix >= kx_padded) { + return; + } + + const int iy = blockDim.y*blockIdx.y + 
threadIdx.y; + + const int i_padded = iy*kx_padded + ix; + + block_q8_1 * y = (block_q8_1 *) vy; + + const int ib = i_padded / QK8_1; // block index + const int iqs = i_padded % QK8_1; // quant index + + const float xi = ix < kx ? x[iy*kx + ix] : 0.0f; + float amax = fabsf(xi); + float sum = xi; + +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32)); + sum += __shfl_xor_sync(0xffffffff, sum, mask, 32); + } + + const float d = amax / 127; + const int8_t q = amax == 0.0f ? 0 : roundf(xi / d); + + y[ib].qs[iqs] = q; + + if (iqs > 0) { + return; + } + + reinterpret_cast<half&>(y[ib].ds.x) = d; + reinterpret_cast<half&>(y[ib].ds.y) = sum; +} + +template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t> +static __global__ void k_get_rows( + const void * src0, const int32_t * src1, dst_t * dst, + int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ + /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ + /*size_t s0,*/ size_t s1, size_t s2, size_t s3, + /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, + size_t s10, size_t s11, size_t s12/*, size_t s13*/) { + + const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2; + const int i10 = blockDim.y*blockIdx.y + threadIdx.y; + const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12; + const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12; + + if (i00 >= ne00) { + return; + } + + const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; + + dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; + const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03; + + const int ib = i00/qk; // block index + const int iqs = (i00%qk)/qr; // quant index + const int iybs = i00 - i00%qk; // dst block start index + const int y_offset = qr == 1 ? 1 : qk/2; + + // dequantize + dfloat2 v; + dequantize_kernel(src0_row, ib, iqs, v); + + dst_row[iybs + iqs + 0] = v.x; + dst_row[iybs + iqs + y_offset] = v.y; +} + +template <typename src0_t, typename dst_t> +static __global__ void k_get_rows_float( + const src0_t * src0, const int32_t * src1, dst_t * dst, + int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ + /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ + /*size_t s0,*/ size_t s1, size_t s2, size_t s3, + /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, + size_t s10, size_t s11, size_t s12/*, size_t s13*/) { + + const int i00 = blockIdx.x*blockDim.x + threadIdx.x; + const int i10 = blockDim.y*blockIdx.y + threadIdx.y; + const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12; + const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12; + + if (i00 >= ne00) { + return; + } + + const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; + + dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; + const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03); + + dst_row[i00] = src0_row[i00]; +} + +template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t> +static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) { + const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x); + + if (i >= k) { + return; + } + + const int ib = i/qk; // block index + const int iqs = (i%qk)/qr; // quant index + const int iybs = i - i%qk; // y block start index + const int y_offset = qr == 1 ?
1 : qk/2; + + // dequantize + dfloat2 v; + dequantize_kernel(vx, ib, iqs, v); + + y[iybs + iqs + 0] = v.x; + y[iybs + iqs + y_offset] = v.y; +} + +template <typename src_t, typename dst_t> +static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + + const src_t * x = (src_t *) vx; + + y[i] = x[i]; +} + +// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called +// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q + +#define VDR_Q4_0_Q8_1_MMVQ 2 +#define VDR_Q4_0_Q8_1_MMQ 4 + +template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl( + const int * v, const int * u, const float & d4, const half2 & ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + const int vi0 = (v[i] >> 0) & 0x0F0F0F0F; + const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; + + // SIMD dot product of quantized values + sumi = __dp4a(vi0, u[2*i+0], sumi); + sumi = __dp4a(vi1, u[2*i+1], sumi); + } + + const float2 ds8f = __half22float2(ds8); + + // second part effectively subtracts 8 from each quant value + return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y); +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q4_1_Q8_1_MMVQ 2 +#define VDR_Q4_1_Q8_1_MMQ 4 + +template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl( + const int * v, const int * u, const half2 & dm4, const half2 & ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + const int vi0 = (v[i] >> 0) & 0x0F0F0F0F; + const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; + + // SIMD dot product of quantized values + sumi = __dp4a(vi0, u[2*i+0], sumi); + sumi = __dp4a(vi1, u[2*i+1], sumi); + } + +#ifdef GGML_V3_CUDA_F16 + const float2 tmp = __half22float2(__hmul2(dm4, ds8)); + const float d4d8 = tmp.x; + const float m4s8 = tmp.y; +#else + const float2 dm4f = __half22float2(dm4); + const float2 ds8f = __half22float2(ds8); + const float d4d8 = dm4f.x * ds8f.x; + const float m4s8 = dm4f.y * ds8f.y; +#endif // GGML_V3_CUDA_F16 + + // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it + return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1)); +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q5_0_Q8_1_MMVQ 2 +#define VDR_Q5_0_Q8_1_MMQ 4 + +template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl( + const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits + vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4 + vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12 + vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20 + vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28 + sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values + + int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits + vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4 + vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12 + vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20 + vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28 + sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD
dot product of quantized values + } + + const float2 ds8f = __half22float2(ds8); + + // second part effectively subtracts 16 from each quant value + return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y); +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q5_1_Q8_1_MMVQ 2 +#define VDR_Q5_1_Q8_1_MMQ 4 + +template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl( + const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits + vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4 + vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12 + vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20 + vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28 + sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values + + int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits + vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4 + vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12 + vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20 + vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28 + sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values + } + +#ifdef GGML_V3_CUDA_F16 + const float2 tmp = __half22float2(__hmul2(dm5, ds8)); + const float d5d8 = tmp.x; + const float m5s8 = tmp.y; +#else + const float2 dm5f = __half22float2(dm5); + const float2 ds8f = __half22float2(ds8); + const float d5d8 = dm5f.x * ds8f.x; + const float m5s8 = dm5f.y * ds8f.y; +#endif // GGML_V3_CUDA_F16 + + // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it + return sumi*d5d8 + m5s8 / (QI5_1 / vdr); + +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q8_0_Q8_1_MMVQ 2 +#define VDR_Q8_0_Q8_1_MMQ 8 + +template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl( + const int * v, const int * u, const float & d8_0, const float & d8_1) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + // SIMD dot product of quantized values + sumi = __dp4a(v[i], u[i], sumi); + } + + return d8_0*d8_1 * sumi; +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl( + const int * v, const int * u, const half2 & dm8, const half2 & ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + // SIMD dot product of quantized values + sumi = __dp4a(v[i], u[i], sumi); + } + +#ifdef GGML_V3_CUDA_F16 + const float2 tmp = __half22float2(__hmul2(dm8, ds8)); + const float d8d8 = tmp.x; + const float m8s8 = tmp.y; +#else + const float2 dm8f = __half22float2(dm8); + const float2 ds8f = __half22float2(ds8); + const float d8d8 = dm8f.x * ds8f.x; + const float m8s8 = dm8f.y * ds8f.y; +#endif // GGML_V3_CUDA_F16 + + // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it + return sumi*d8d8 + m8s8 / (QI8_1 / vdr); +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q2_K_Q8_1_MMVQ 1 +#define VDR_Q2_K_Q8_1_MMQ 2 + +// contiguous v/x values +static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq( + const int & v, const int *
__restrict__ u, const uint8_t * __restrict__ scales, + const half2 & dm2, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR2_K; ++i) { + const int sc = scales[2*i]; + + const int vi = (v >> (2*i)) & 0x03030303; + + sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product + + // fill int with 4x m + int m = sc >> 4; + m |= m << 8; + m |= m << 16; + sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values + } + + const float2 dm2f = __half22float2(dm2); + + return dm2f.x*sumf_d - dm2f.y*sumf_m; +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales, + const half2 & dm2, const float & d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi_d = 0; + int sumi_m = 0; + +#pragma unroll + for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) { + int sumi_d_sc = 0; + + const int sc = scales[i0 / (QI8_1/2)]; + + // fill int with 4x m + int m = sc >> 4; + m |= m << 8; + m |= m << 16; + +#pragma unroll + for (int i = i0; i < i0 + QI8_1/2; ++i) { + sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product + sumi_m = __dp4a(m, u[i], sumi_m); // multiply sum of q8_1 values with m + } + + sumi_d += sumi_d_sc * (sc & 0xF); + } + + const float2 dm2f = __half22float2(dm2); + + return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m); +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q3_K_Q8_1_MMVQ 1 +#define VDR_Q3_K_Q8_1_MMQ 2 + +// contiguous v/x values +static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq( + const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales, + const int & scale_offset, const float & d3, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf = 0.0f; + +#pragma unroll + for (int i = 0; i < QR3_K; ++i) { + const int isc = scale_offset + 2*i; + + const int isc_low = isc % (QK_K/32); + const int sc_shift_low = 4 * (isc / (QK_K/32)); + const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF; + + const int isc_high = isc % (QK_K/64); + const int sc_shift_high = 2 * (isc / (QK_K/64)); + const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4; + + const int sc = (sc_low | sc_high) - 32; + + const int vil = (vl >> (2*i)) & 0x03030303; + + const int vih = ((vh >> i) << 2) & 0x04040404; + + const int vi = __vsubss4(vil, vih); + + sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product + } + + return d3 * sumf; +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales, + const float & d3, const float & d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) { + int sumi_sc = 0; + + for (int i = i0; i < i0 + QI8_1/2; ++i) { + sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product + } + + sumi += sumi_sc * scales[i0 / (QI8_1/2)]; + } + + 
return d3*d8 * sumi; +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q4_K_Q8_1_MMVQ 2 +#define VDR_Q4_K_Q8_1_MMQ 8 + +// contiguous v/x values +static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq( + const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, + const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR4_K; ++i) { + const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F; + const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F; + + const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product + const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u + + sumf_d += d8[i] * (dot1 * sc[i]); + sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values + } + + const float2 dm4f = __half22float2(dm4); + + return dm4f.x*sumf_d - dm4f.y*sumf_m; + +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, + const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) { + int sumi_d = 0; + +#pragma unroll + for (int j = 0; j < QI8_1; ++j) { + sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product + } + + const float2 ds8f = __half22float2(ds8[i]); + + sumf_d += ds8f.x * (sc[i] * sumi_d); + sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val + } + + const float2 dm4f = __half22float2(dm4); + + return dm4f.x*sumf_d - dm4f.y*sumf_m; + +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q5_K_Q8_1_MMVQ 2 +#define VDR_Q5_K_Q8_1_MMQ 8 + +// contiguous v/x values +static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq( + const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc, + const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR5_K; ++i) { + const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F; + const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F; + + const int vh0i = ((vh[0] >> i) << 4) & 0x10101010; + const int vh1i = ((vh[1] >> i) << 4) & 0x10101010; + + const int v0i = vl0i | vh0i; + const int v1i = vl1i | vh1i; + + const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product + const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u + + sumf_d += d8[i] * (dot1 * sc[i]); + sumf_m += d8[i] * (dot2 * m[i]); + + } + + const float2 dm5f = __half22float2(dm5); + + return dm5f.x*sumf_d - dm5f.y*sumf_m; + +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const uint8_t * 
__restrict__ sc, + const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) { + int sumi_d = 0; + +#pragma unroll + for (int j = 0; j < QI8_1; ++j) { + sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product + } + + const float2 ds8f = __half22float2(ds8[i]); + + sumf_d += ds8f.x * (sc[i] * sumi_d); + sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val + } + + const float2 dm4f = __half22float2(dm4); + + return dm4f.x*sumf_d - dm4f.y*sumf_m; + +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q6_K_Q8_1_MMVQ 1 +#define VDR_Q6_K_Q8_1_MMQ 8 + +// contiguous v/x values +static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq( + const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales, + const float & d, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf = 0.0f; + +#pragma unroll + for (int i = 0; i < QR6_K; ++i) { + const int sc = scales[4*i]; + + const int vil = (vl >> (4*i)) & 0x0F0F0F0F; + + const int vih = ((vh >> (4*i)) << 4) & 0x30303030; + + const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32 + + sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product + } + + return d*sumf; +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc, + const float & d6, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + +#pragma unroll + for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) { + int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale + +#pragma unroll + for (int i = i0; i < i0 + 2; ++i) { + sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product + sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product + + sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product + sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product + } + + sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y); + } + + return d6 * sumf_d; + +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +static __device__ __forceinline__ float vec_dot_q4_0_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq; + + int v[VDR_Q4_0_Q8_1_MMVQ]; + int u[2*VDR_Q4_0_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) { + v[i] = get_int_from_uint8(bq4_0->qs, iqs + i); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0); + } + + return vec_dot_q4_0_q8_1_impl(v, u, bq4_0->d, bq8_1->ds); +} + +template static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + (void)x_qh; (void)x_sc; + + __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0]; + + *x_ql = tile_x_qs; + *x_dm = (half2 *) tile_x_d; +} + +template static 
__device__ __forceinline__ void load_tiles_q4_0( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + (void)x_qh; (void)x_sc; + GGML_V3_CUDA_ASSUME(i_offset >= 0); + GGML_V3_CUDA_ASSUME(i_offset < nwarps); + GGML_V3_CUDA_ASSUME(k >= 0); + GGML_V3_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI4_0; + const int kqsx = k % QI4_0; + + const block_q4_0 * bx0 = (const block_q4_0 *) vx; + + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); + // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI4_0; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) { + int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d; + } +} + +static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + (void)x_qh; (void)x_sc; + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + const float * x_dmf = (const float *) x_dm; + + int u[2*VDR_Q4_0_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE]; + } + + return vec_dot_q4_0_q8_1_impl + (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0], + y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); +} + +static __device__ __forceinline__ float vec_dot_q4_1_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq; + + int v[VDR_Q4_1_Q8_1_MMVQ]; + int u[2*VDR_Q4_1_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) { + v[i] = get_int_from_uint8_aligned(bq4_1->qs, iqs + i); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1); + } + + return vec_dot_q4_1_q8_1_impl(v, u, bq4_1->dm, bq8_1->ds); +} + +template static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + (void)x_qh; (void)x_sc; + + __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1]; + + *x_ql = tile_x_qs; + *x_dm = tile_x_dm; +} + +template static __device__ __forceinline__ void load_tiles_q4_1( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + (void)x_qh; (void)x_sc; + + GGML_V3_CUDA_ASSUME(i_offset >= 0); + GGML_V3_CUDA_ASSUME(i_offset < nwarps); + GGML_V3_CUDA_ASSUME(k 
>= 0); + GGML_V3_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI4_1; + const int kqsx = k % QI4_1; + + const block_q4_1 * bx0 = (const block_q4_1 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI4_1; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) { + int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm; + } +} + +static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + (void)x_qh; (void)x_sc; + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + + int u[2*VDR_Q4_1_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE]; + } + + return vec_dot_q4_1_q8_1_impl + (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1], + y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); +} + +static __device__ __forceinline__ float vec_dot_q5_0_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq; + + int vl[VDR_Q5_0_Q8_1_MMVQ]; + int vh[VDR_Q5_0_Q8_1_MMVQ]; + int u[2*VDR_Q5_0_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) { + vl[i] = get_int_from_uint8(bq5_0->qs, iqs + i); + vh[i] = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i)); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0); + } + + return vec_dot_q5_0_q8_1_impl(vl, vh, u, bq5_0->d, bq8_1->ds); +} + +template static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + (void)x_qh; (void)x_sc; + + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0]; + + *x_ql = tile_x_ql; + *x_dm = (half2 *) tile_x_d; +} + +template static __device__ __forceinline__ void load_tiles_q5_0( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + (void)x_qh; (void)x_sc; + + GGML_V3_CUDA_ASSUME(i_offset >= 0); + GGML_V3_CUDA_ASSUME(i_offset < nwarps); + GGML_V3_CUDA_ASSUME(k >= 0); + GGML_V3_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI5_0; + const int kqsx = k % QI5_0; + + const block_q5_0 * bx0 = (const block_q5_0 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx; + + const int ql = get_int_from_uint8(bxi->qs, kqsx); + const int qh = 
get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0)); + + int qs0 = (ql >> 0) & 0x0F0F0F0F; + qs0 |= (qh << 4) & 0x00000010; // 0 -> 4 + qs0 |= (qh << 11) & 0x00001000; // 1 -> 12 + qs0 |= (qh << 18) & 0x00100000; // 2 -> 20 + qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 + qs0 = __vsubss4(qs0, 0x10101010); // subtract 16 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0; + + int qs1 = (ql >> 4) & 0x0F0F0F0F; + qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 + qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12 + qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 + qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 + qs1 = __vsubss4(qs1, 0x10101010); // subtract 16 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI5_0; + const int kbxd = k % blocks_per_tile_x_row; + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) { + int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d; + } +} + +static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + (void)x_qh; (void)x_sc; + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0; + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + int u[2*VDR_Q5_0_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE]; + } + + return vec_dot_q8_0_q8_1_impl + (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); +} + +static __device__ __forceinline__ float vec_dot_q5_1_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq; + + int vl[VDR_Q5_1_Q8_1_MMVQ]; + int vh[VDR_Q5_1_Q8_1_MMVQ]; + int u[2*VDR_Q5_1_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) { + vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i); + vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i)); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1); + } + + return vec_dot_q5_1_q8_1_impl(vl, vh, u, bq5_1->dm, bq8_1->ds); +} + +template static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + (void)x_qh; (void)x_sc; + + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1]; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; +} + +template static __device__ __forceinline__ void load_tiles_q5_1( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + (void)x_qh; (void)x_sc; + + GGML_V3_CUDA_ASSUME(i_offset >= 0); + GGML_V3_CUDA_ASSUME(i_offset < nwarps); + GGML_V3_CUDA_ASSUME(k >= 0); + 
GGML_V3_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI5_1; + const int kqsx = k % QI5_1; + + const block_q5_1 * bx0 = (const block_q5_1 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx; + + const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx); + const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1)); + + int qs0 = (ql >> 0) & 0x0F0F0F0F; + qs0 |= (qh << 4) & 0x00000010; // 0 -> 4 + qs0 |= (qh << 11) & 0x00001000; // 1 -> 12 + qs0 |= (qh << 18) & 0x00100000; // 2 -> 20 + qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0; + + int qs1 = (ql >> 4) & 0x0F0F0F0F; + qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 + qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12 + qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 + qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI5_1; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) { + int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm; + } +} + +static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + (void)x_qh; (void)x_sc; + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1; + + int u[2*VDR_Q5_1_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE]; + } + + return vec_dot_q8_1_q8_1_impl + (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); +} + +static __device__ __forceinline__ float vec_dot_q8_0_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq; + + int v[VDR_Q8_0_Q8_1_MMVQ]; + int u[VDR_Q8_0_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) { + v[i] = get_int_from_int8(bq8_0->qs, iqs + i); + u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + } + + return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, __low2half(bq8_1->ds)); +} + +template static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + (void)x_qh; (void)x_sc; + + __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0]; + + *x_ql = tile_x_qs; + *x_dm = (half2 *) tile_x_d; +} + +template static __device__ __forceinline__ void load_tiles_q8_0( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + (void)x_qh; (void)x_sc; + + GGML_V3_CUDA_ASSUME(i_offset >= 0); + GGML_V3_CUDA_ASSUME(i_offset < nwarps); + 
GGML_V3_CUDA_ASSUME(k >= 0); + GGML_V3_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI8_0; + const int kqsx = k % QI8_0; + float * x_dmf = (float *) x_dm; + + const block_q8_0 * bx0 = (const block_q8_0 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI8_0; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) { + int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row; + + if (need_check) { + i = min(i, i_max); + } + + const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d; + } +} + +static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + (void)x_qh; (void)x_sc; + + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + return vec_dot_q8_0_q8_1_impl + (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0], + y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]); +} + +static __device__ __forceinline__ float vec_dot_q2_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q2_K * bq2_K = (const block_q2_K *) vbq; + + const int bq8_offset = QR2_K * (iqs / QI8_1); + const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); + + const uint8_t * scales = bq2_K->scales + scale_offset; + + const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs); + int u[QR2_K]; + float d8[QR2_K]; + +#pragma unroll + for (int i = 0; i < QR2_K; ++ i) { + u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); + d8[i] = __low2half(bq8_1[bq8_offset + i].ds); + } + + return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8); +} + +template static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + (void)x_qh; + + __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4]; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_sc = tile_x_sc; +} + +template static __device__ __forceinline__ void load_tiles_q2_K( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + (void)x_qh; + + GGML_V3_CUDA_ASSUME(i_offset >= 0); + GGML_V3_CUDA_ASSUME(i_offset < nwarps); + GGML_V3_CUDA_ASSUME(k >= 0); + GGML_V3_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI2_K; + const int kqsx = k % QI2_K; + + const block_q2_K * bx0 = (const block_q2_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = 
WARP_SIZE / QI2_K; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) { + int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { + int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); + + if (need_check) { + i = min(i, i_max); + } + + const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4); + + x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4)); + } +} + +static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + (void)x_qh; + + const int kbx = k / QI2_K; + const int ky = (k % QI2_K) * QR2_K; + const float * y_df = (const float *) y_ds; + + int v[QR2_K*VDR_Q2_K_Q8_1_MMQ]; + + const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2); + const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2)); + +#pragma unroll + for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) { + v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303; + } + + const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4; + + const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE; + return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]); +} + +static __device__ __forceinline__ float vec_dot_q3_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q3_K * bq3_K = (const block_q3_K *) vbq; + + const int bq8_offset = QR3_K * (iqs / (QI3_K/2)); + const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); + + const float d = bq3_K->d; + + const int vl = get_int_from_uint8(bq3_K->qs, iqs); + + // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted + const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset; + + int u[QR3_K]; + float d8[QR3_K]; + +#pragma unroll + for (int i = 0; i < QR3_K; ++i) { + u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); + d8[i] = __low2half(bq8_1[bq8_offset + i].ds); + } + + return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8); +} + +template static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + + __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K]; + __shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4]; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_qh = tile_x_qh; + *x_sc = tile_x_sc; +} + +template static __device__ __forceinline__ void load_tiles_q3_K( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + + GGML_V3_CUDA_ASSUME(i_offset >= 0); + GGML_V3_CUDA_ASSUME(i_offset < nwarps); + 
GGML_V3_CUDA_ASSUME(k >= 0); + GGML_V3_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI3_K; + const int kqsx = k % QI3_K; + + const block_q3_K * bx0 = (const block_q3_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI3_K; + const int kbxd = k % blocks_per_tile_x_row; + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) { + int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) { + int i = i0 + i_offset * 2 + k / (WARP_SIZE/2); + + if (need_check) { + i = min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2); + + // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted + x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2)); + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { + int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); + + if (need_check) { + i = min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4); + + const int ksc = k % (QI3_K/4); + + const int ksc_low = ksc % (QI3_K/8); + const int shift_low = 4 * (ksc / (QI3_K/8)); + const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F; + + const int ksc_high = QI3_K/8; + const int shift_high = 2 * ksc; + const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030; + + const int sc = __vsubss4(sc_low | sc_high, 0x20202020); + + x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc; + } +} + +static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + + const int kbx = k / QI3_K; + const int ky = (k % QI3_K) * QR3_K; + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4; + + int v[QR3_K*VDR_Q3_K_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) { + const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2); + const int shift = 2 * ((ky % 32) / 8); + const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303; + + const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8); + const int vlh = (vh << 2) & 0x04040404; + + v[l] = __vsubss4(vll, vlh); + } + + const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE; + return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]); +} + +static __device__ __forceinline__ float vec_dot_q4_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + +#ifndef GGML_V3_QKK_64 + const block_q4_K * bq4_K 
= (const block_q4_K *) vbq; + + int v[2]; + int u[2*QR4_K]; + float d8[QR4_K]; + + // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6 + const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2)); + + // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12 + // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44 + // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76 + // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108 + + const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4)); + v[0] = q4[0]; + v[1] = q4[4]; + + const uint16_t * scales = (const uint16_t *)bq4_K->scales; + uint16_t aux[2]; + const int j = bq8_offset/2; + if (j < 2) { + aux[0] = scales[j+0] & 0x3f3f; + aux[1] = scales[j+2] & 0x3f3f; + } else { + aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2); + aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2); + } + const uint8_t * sc = (const uint8_t *)aux; + const uint8_t * m = sc + 2; + + for (int i = 0; i < QR4_K; ++i) { + const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; + d8[i] = __low2half(bq8i->ds); + + const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); + u[2*i+0] = q8[0]; + u[2*i+1] = q8[4]; + } + + return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8); + +#else + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q4_K * bq4_K = (const block_q4_K *) vbq; + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + + uint16_t aux16[2]; + const uint8_t * s = (const uint8_t *)aux16; + + const uint16_t * a = (const uint16_t *)bq4_K->scales; + aux16[0] = a[0] & 0x0f0f; + aux16[1] = (a[0] >> 4) & 0x0f0f; + + const float dall = bq4_K->dm[0]; + const float dmin = bq4_K->dm[1]; + + const float d8_1 = __low2float(bq8_1[0].ds); + const float d8_2 = __low2float(bq8_1[1].ds); + + const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); + const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); + const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2)); + const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4); + + const int * q4 = (const int *)bq4_K->qs + (iqs/2); + const int v1 = q4[0]; + const int v2 = q4[4]; + + const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0)); + const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0)); + const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0)); + const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0)); + + sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]); + sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]); + + return dall * sumf_d - dmin * sumf_m; + +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A + +#endif +} + +template static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + (void)x_qh; + + __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_sc = tile_x_sc; +} + +template static __device__ __forceinline__ void load_tiles_q4_K( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + (void)x_qh; + + GGML_V3_CUDA_ASSUME(i_offset >= 0); + GGML_V3_CUDA_ASSUME(i_offset 
< nwarps); + GGML_V3_CUDA_ASSUME(k >= 0); + GGML_V3_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI4_K; // == 0 if QK_K == 256 + const int kqsx = k % QI4_K; // == k if QK_K == 256 + + const block_q4_K * bx0 = (const block_q4_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256 + const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) { + int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd; + +#if QK_K == 256 + x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm; +#else + x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]}; +#endif + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8); + + const int * scales = (const int *) bxi->scales; + + const int ksc = k % (WARP_SIZE/8); + + // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 + int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits + scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits + + x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8; + } +} + +static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + (void)x_qh; + + const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8); + + const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE; + return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8, + x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]); +} + +static __device__ __forceinline__ float vec_dot_q5_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + +#ifndef GGML_V3_QKK_64 + const block_q5_K * bq5_K = (const block_q5_K *) vbq; + + int vl[2]; + int vh[2]; + int u[2*QR5_K]; + float d8[QR5_K]; + + const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2)); + const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4)); + const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4)); + + vl[0] = ql[0]; + vl[1] = ql[4]; + + vh[0] = qh[0] >> bq8_offset; + vh[1] = qh[4] >> bq8_offset; + + const uint16_t * scales = (const uint16_t *)bq5_K->scales; + uint16_t aux[2]; + const int j = bq8_offset/2; + if (j < 2) { + aux[0] = scales[j+0] & 0x3f3f; + aux[1] = scales[j+2] & 0x3f3f; + } else { + aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2); + aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2); + } + const uint8_t * sc = (const uint8_t *)aux; + const uint8_t * m = sc + 2; + +#pragma unroll + for (int i = 0; i < QR5_K; ++i) { + const block_q8_1 * bq8i = 
bq8_1 + bq8_offset + i; + d8[i] = __low2float(bq8i->ds); + + const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); + u[2*i+0] = q8[0]; + u[2*i+1] = q8[4]; + } + + return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8); + +#else + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q5_K * bq5_K = (const block_q5_K *) vbq; + + const int8_t * s = bq5_K->scales; + + const float d = bq5_K->d; + + const float d8_1 = __low2half(bq8_1[0].ds); + const float d8_2 = __low2half(bq8_1[1].ds); + + const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); + const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); + const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2)); + const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4); + + const int * ql = (const int *)bq5_K->qs + (iqs/2); + const int vl1 = ql[0]; + const int vl2 = ql[4]; + + const int step = 4 * (iqs/2); // 0, 4, 8, 12 + const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6 + const int in = step%8; // 0, 4, 0, 4 + const int vh = (*((const int *)(bq5_K->qh + in))) >> im; + + const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f); + const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f); + const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f); + const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f); + + const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1]) + + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]); + + return d * sumf_d; + +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A + +#endif +} + +template static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + (void)x_qh; + + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_sc = tile_x_sc; +} + +template static __device__ __forceinline__ void load_tiles_q5_K( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + (void)x_qh; + + GGML_V3_CUDA_ASSUME(i_offset >= 0); + GGML_V3_CUDA_ASSUME(i_offset < nwarps); + GGML_V3_CUDA_ASSUME(k >= 0); + GGML_V3_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI5_K; // == 0 if QK_K == 256 + const int kqsx = k % QI5_K; // == k if QK_K == 256 + + const block_q5_K * bx0 = (const block_q5_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx; + const int ky = QR5_K*kqsx; + + const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx); + const int ql0 = (ql >> 0) & 0x0F0F0F0F; + const int ql1 = (ql >> 4) & 0x0F0F0F0F; + + const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4)); + const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010; + const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010; + + const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0; + const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4); + + x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0; + x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1; + } + + 
const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256 + const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) { + int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd; + +#if QK_K == 256 + x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm; +#endif + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8); + + const int * scales = (const int *) bxi->scales; + + const int ksc = k % (WARP_SIZE/8); + + // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 + int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits + scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits + + x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8; + } +} + +static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + (void)x_qh; + + const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8); + + const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k; + const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE; + return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, + x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]); +} + +static __device__ __forceinline__ float vec_dot_q6_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q6_K * bq6_K = (const block_q6_K *) vbq; + + const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4); + const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8); + const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4)); + + const int vl = get_int_from_uint8(bq6_K->ql, iqs); + const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift; + + const int8_t * scales = bq6_K->scales + scale_offset; + + int u[QR6_K]; + float d8[QR6_K]; + +#pragma unroll + for (int i = 0; i < QR6_K; ++i) { + u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1); + d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds); + } + + return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8); +} + +template static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + (void)x_qh; + + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_sc = tile_x_sc; +} + +template static __device__ __forceinline__ void load_tiles_q6_K( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + (void)x_qh; 
+ + GGML_V3_CUDA_ASSUME(i_offset >= 0); + GGML_V3_CUDA_ASSUME(i_offset < nwarps); + GGML_V3_CUDA_ASSUME(k >= 0); + GGML_V3_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI6_K; // == 0 if QK_K == 256 + const int kqsx = k % QI6_K; // == k if QK_K == 256 + + const block_q6_K * bx0 = (const block_q6_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx; + const int ky = QR6_K*kqsx; + + const int ql = get_int_from_uint8(bxi->ql, kqsx); + const int ql0 = (ql >> 0) & 0x0F0F0F0F; + const int ql1 = (ql >> 4) & 0x0F0F0F0F; + + const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4)); + const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030; + const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030; + + const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0; + const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2); + + x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020); + x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256 + const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) { + int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4; + + x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8)); + } +} + +static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + (void)x_qh; + + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]); + + const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k; + const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE; + return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]); +} + +static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { +#if QK_K == 256 + const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq; + +#if QR2_XXS == 8 + const int ib32 = iqs; + const uint16_t * q2 = bq2->qs + 4*ib32; + const uint8_t * aux8 = (const uint8_t *)q2; + const int8_t * q8 = bq8_1[ib32].qs; + uint32_t aux32 = q2[2] | (q2[3] << 16); + int sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); + const uint8_t signs = ksigns_iq2xs[aux32 & 127]; + for (int j = 0; j < 8; ++j) { + sumi += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? 
-1 : 1); + } + q8 += 8; + aux32 >>= 7; + } + const float d = (float)bq2->d * (0.5f + aux32) * (float)bq8_1[ib32].ds.x * 0.25f; + return d * sumi; +#else + // iqs is 0...15 + const int ib32 = iqs/2; + const int il = iqs%2; + const uint16_t * q2 = bq2->qs + 4*ib32; + const uint8_t * aux8 = (const uint8_t *)q2; + const uint8_t * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]); + const uint8_t * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]); + const uint32_t aux32 = q2[2] | (q2[3] << 16); + const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * (float)bq8_1[ib32].ds.x * 0.25f; + const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127]; + const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127]; + const int8_t * q8 = bq8_1[ib32].qs + 16*il; + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < 8; ++j) { + sumi1 += q8[j+0] * grid1[j] * (signs1 & kmask_iq2xs[j] ? -1 : 1); + sumi2 += q8[j+8] * grid2[j] * (signs2 & kmask_iq2xs[j] ? -1 : 1); + } + return d * (sumi1 + sumi2); +#endif +#else + assert(false); + return 0.f; +#endif +} + +static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { +#if QK_K == 256 + const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq; + + const int ib32 = iqs; + const uint16_t * q2 = bq2->qs + 4*ib32; + const int8_t * q8 = bq8_1[ib32].qs; + const uint8_t ls1 = bq2->scales[ib32] & 0xf; + const uint8_t ls2 = bq2->scales[ib32] >> 4; + int sumi1 = 0; + for (int l = 0; l < 2; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi1 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + int sumi2 = 0; + for (int l = 2; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi2 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? 
-1 : 1); + } + q8 += 8; + } + const float d = (float)bq2->d * (float)bq8_1[ib32].ds.x * 0.25f; + return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); +#else + assert(false); + return 0.f; +#endif +} + +template +static __device__ __forceinline__ void mul_mat_q( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + const int blocks_per_row_x = ncols_x / qk; + const int blocks_per_col_y = nrows_y / QK8_1; + const int blocks_per_warp = WARP_SIZE / qi; + + const int & ncols_dst = ncols_y; + + const int row_dst_0 = blockIdx.x*mmq_y; + const int & row_x_0 = row_dst_0; + + const int col_dst_0 = blockIdx.y*mmq_x; + const int & col_y_0 = col_dst_0; + + int * tile_x_ql = nullptr; + half2 * tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + + allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); + + __shared__ int tile_y_qs[mmq_x * WARP_SIZE]; + __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1]; + + float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}}; + + for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) { + + load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, + threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x); + +#pragma unroll + for (int ir = 0; ir < qr; ++ir) { + const int kqs = ir*WARP_SIZE + threadIdx.x; + const int kbxd = kqs / QI8_1; + +#pragma unroll + for (int i = 0; i < mmq_x; i += nwarps) { + const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses + + const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd]; + + const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE; + tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1); + } + +#pragma unroll + for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) { + const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x; + const int kby = threadIdx.x % (WARP_SIZE/QI8_1); + const int col_y_eff = min(col_y_0 + ids, ncols_y-1); + + // if the sum is not needed it's faster to transform the scale to f32 ahead of time + const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds; + half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby]; + if (need_sum) { + *dsi_dst = *dsi_src; + } else { + float * dfi_dst = (float *) dsi_dst; + *dfi_dst = __low2half(*dsi_src); + } + } + + __syncthreads(); + +// #pragma unroll // unrolling this loop causes too much register pressure + for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) { +#pragma unroll + for (int j = 0; j < mmq_x; j += nwarps) { +#pragma unroll + for (int i = 0; i < mmq_y; i += WARP_SIZE) { + sum[i/WARP_SIZE][j/nwarps] += vec_dot( + tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds, + threadIdx.x + i, threadIdx.y + j, k); + } + } + } + + __syncthreads(); + } + } + +#pragma unroll + for (int j = 0; j < mmq_x; j += nwarps) { + const int col_dst = col_dst_0 + j + threadIdx.y; + + if (col_dst >= ncols_dst) { + return; + } + +#pragma unroll + for (int i = 0; i < mmq_y; i += WARP_SIZE) { + const int row_dst = row_dst_0 + threadIdx.x + i; + + if (row_dst >= nrows_dst) { + continue; + } + + dst[col_dst*nrows_dst + row_dst] = 
sum[i/WARP_SIZE][j/nwarps]; + } + } +} + +#define MMQ_X_Q4_0_RDNA2 64 +#define MMQ_Y_Q4_0_RDNA2 128 +#define NWARPS_Q4_0_RDNA2 8 +#define MMQ_X_Q4_0_RDNA1 64 +#define MMQ_Y_Q4_0_RDNA1 64 +#define NWARPS_Q4_0_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q4_0_AMPERE 4 +#define MMQ_Y_Q4_0_AMPERE 32 +#define NWARPS_Q4_0_AMPERE 4 +#else +#define MMQ_X_Q4_0_AMPERE 64 +#define MMQ_Y_Q4_0_AMPERE 128 +#define NWARPS_Q4_0_AMPERE 4 +#endif +#define MMQ_X_Q4_0_PASCAL 64 +#define MMQ_Y_Q4_0_PASCAL 64 +#define NWARPS_Q4_0_PASCAL 8 + +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + mul_mat_q4_0( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q4_0_RDNA2; + const int mmq_y = MMQ_Y_Q4_0_RDNA2; + const int nwarps = NWARPS_Q4_0_RDNA2; +#else + const int mmq_x = MMQ_X_Q4_0_RDNA1; + const int mmq_y = MMQ_Y_Q4_0_RDNA1; + const int nwarps = NWARPS_Q4_0_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_VOLTA + const int mmq_x = MMQ_X_Q4_0_AMPERE; + const int mmq_y = MMQ_Y_Q4_0_AMPERE; + const int nwarps = NWARPS_Q4_0_AMPERE; + + mul_mat_q, + load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q4_0_PASCAL; + const int mmq_y = MMQ_Y_Q4_0_PASCAL; + const int nwarps = NWARPS_Q4_0_PASCAL; + + mul_mat_q, + load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q4_0_q8_1_mul_mat; + bad_arch(); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q4_1_RDNA2 64 +#define MMQ_Y_Q4_1_RDNA2 128 +#define NWARPS_Q4_1_RDNA2 8 +#define MMQ_X_Q4_1_RDNA1 64 +#define MMQ_Y_Q4_1_RDNA1 64 +#define NWARPS_Q4_1_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q4_1_AMPERE 4 +#define MMQ_Y_Q4_1_AMPERE 32 +#define NWARPS_Q4_1_AMPERE 4 +#else +#define MMQ_X_Q4_1_AMPERE 64 +#define MMQ_Y_Q4_1_AMPERE 128 +#define NWARPS_Q4_1_AMPERE 4 +#endif +#define MMQ_X_Q4_1_PASCAL 64 +#define MMQ_Y_Q4_1_PASCAL 64 +#define NWARPS_Q4_1_PASCAL 8 + +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#elif __CUDA_ARCH__ < CC_VOLTA + __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2) +#endif // __CUDA_ARCH__ < CC_VOLTA + mul_mat_q4_1( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q4_1_RDNA2; + const int mmq_y = MMQ_Y_Q4_1_RDNA2; + const int nwarps = NWARPS_Q4_1_RDNA2; 
+#else + const int mmq_x = MMQ_X_Q4_1_RDNA1; + const int mmq_y = MMQ_Y_Q4_1_RDNA1; + const int nwarps = NWARPS_Q4_1_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_VOLTA + const int mmq_x = MMQ_X_Q4_1_AMPERE; + const int mmq_y = MMQ_Y_Q4_1_AMPERE; + const int nwarps = NWARPS_Q4_1_AMPERE; + + mul_mat_q, + load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q4_1_PASCAL; + const int mmq_y = MMQ_Y_Q4_1_PASCAL; + const int nwarps = NWARPS_Q4_1_PASCAL; + + mul_mat_q, + load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q4_1_q8_1_mul_mat; + bad_arch(); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q5_0_RDNA2 64 +#define MMQ_Y_Q5_0_RDNA2 128 +#define NWARPS_Q5_0_RDNA2 8 +#define MMQ_X_Q5_0_RDNA1 64 +#define MMQ_Y_Q5_0_RDNA1 64 +#define NWARPS_Q5_0_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q5_0_AMPERE 4 +#define MMQ_Y_Q5_0_AMPERE 32 +#define NWARPS_Q5_0_AMPERE 4 +#else +#define MMQ_X_Q5_0_AMPERE 128 +#define MMQ_Y_Q5_0_AMPERE 64 +#define NWARPS_Q5_0_AMPERE 4 +#endif +#define MMQ_X_Q5_0_PASCAL 64 +#define MMQ_Y_Q5_0_PASCAL 64 +#define NWARPS_Q5_0_PASCAL 8 + +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + mul_mat_q5_0( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q5_0_RDNA2; + const int mmq_y = MMQ_Y_Q5_0_RDNA2; + const int nwarps = NWARPS_Q5_0_RDNA2; +#else + const int mmq_x = MMQ_X_Q5_0_RDNA1; + const int mmq_y = MMQ_Y_Q5_0_RDNA1; + const int nwarps = NWARPS_Q5_0_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_VOLTA + const int mmq_x = MMQ_X_Q5_0_AMPERE; + const int mmq_y = MMQ_Y_Q5_0_AMPERE; + const int nwarps = NWARPS_Q5_0_AMPERE; + + mul_mat_q, + load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q5_0_PASCAL; + const int mmq_y = MMQ_Y_Q5_0_PASCAL; + const int nwarps = NWARPS_Q5_0_PASCAL; + + mul_mat_q, + load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q5_0_q8_1_mul_mat; + bad_arch(); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q5_1_RDNA2 64 +#define MMQ_Y_Q5_1_RDNA2 128 +#define NWARPS_Q5_1_RDNA2 8 +#define MMQ_X_Q5_1_RDNA1 64 +#define MMQ_Y_Q5_1_RDNA1 64 +#define NWARPS_Q5_1_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q5_1_AMPERE 4 +#define MMQ_Y_Q5_1_AMPERE 32 +#define NWARPS_Q5_1_AMPERE 4 +#else 
+#define MMQ_X_Q5_1_AMPERE 128 +#define MMQ_Y_Q5_1_AMPERE 64 +#define NWARPS_Q5_1_AMPERE 4 +#endif +#define MMQ_X_Q5_1_PASCAL 64 +#define MMQ_Y_Q5_1_PASCAL 64 +#define NWARPS_Q5_1_PASCAL 8 + +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +mul_mat_q5_1( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q5_1_RDNA2; + const int mmq_y = MMQ_Y_Q5_1_RDNA2; + const int nwarps = NWARPS_Q5_1_RDNA2; +#else + const int mmq_x = MMQ_X_Q5_1_RDNA1; + const int mmq_y = MMQ_Y_Q5_1_RDNA1; + const int nwarps = NWARPS_Q5_1_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_VOLTA + const int mmq_x = MMQ_X_Q5_1_AMPERE; + const int mmq_y = MMQ_Y_Q5_1_AMPERE; + const int nwarps = NWARPS_Q5_1_AMPERE; + + mul_mat_q, + load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q5_1_PASCAL; + const int mmq_y = MMQ_Y_Q5_1_PASCAL; + const int nwarps = NWARPS_Q5_1_PASCAL; + + mul_mat_q, + load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q5_1_q8_1_mul_mat; + bad_arch(); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q8_0_RDNA2 64 +#define MMQ_Y_Q8_0_RDNA2 128 +#define NWARPS_Q8_0_RDNA2 8 +#define MMQ_X_Q8_0_RDNA1 64 +#define MMQ_Y_Q8_0_RDNA1 64 +#define NWARPS_Q8_0_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q8_0_AMPERE 4 +#define MMQ_Y_Q8_0_AMPERE 32 +#define NWARPS_Q8_0_AMPERE 4 +#else +#define MMQ_X_Q8_0_AMPERE 128 +#define MMQ_Y_Q8_0_AMPERE 64 +#define NWARPS_Q8_0_AMPERE 4 +#endif +#define MMQ_X_Q8_0_PASCAL 64 +#define MMQ_Y_Q8_0_PASCAL 64 +#define NWARPS_Q8_0_PASCAL 8 + +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + mul_mat_q8_0( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q8_0_RDNA2; + const int mmq_y = MMQ_Y_Q8_0_RDNA2; + const int nwarps = NWARPS_Q8_0_RDNA2; +#else + const int mmq_x = MMQ_X_Q8_0_RDNA1; + const int mmq_y = MMQ_Y_Q8_0_RDNA1; + const int nwarps = NWARPS_Q8_0_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_VOLTA + const int mmq_x = MMQ_X_Q8_0_AMPERE; + const int mmq_y = 
MMQ_Y_Q8_0_AMPERE; + const int nwarps = NWARPS_Q8_0_AMPERE; + + mul_mat_q, + load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q8_0_PASCAL; + const int mmq_y = MMQ_Y_Q8_0_PASCAL; + const int nwarps = NWARPS_Q8_0_PASCAL; + + mul_mat_q, + load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q8_0_q8_1_mul_mat; + bad_arch(); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q2_K_RDNA2 64 +#define MMQ_Y_Q2_K_RDNA2 128 +#define NWARPS_Q2_K_RDNA2 8 +#define MMQ_X_Q2_K_RDNA1 128 +#define MMQ_Y_Q2_K_RDNA1 32 +#define NWARPS_Q2_K_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q2_K_AMPERE 4 +#define MMQ_Y_Q2_K_AMPERE 32 +#define NWARPS_Q2_K_AMPERE 4 +#else +#define MMQ_X_Q2_K_AMPERE 64 +#define MMQ_Y_Q2_K_AMPERE 128 +#define NWARPS_Q2_K_AMPERE 4 +#endif +#define MMQ_X_Q2_K_PASCAL 64 +#define MMQ_Y_Q2_K_PASCAL 64 +#define NWARPS_Q2_K_PASCAL 8 + +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +mul_mat_q2_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q2_K_RDNA2; + const int mmq_y = MMQ_Y_Q2_K_RDNA2; + const int nwarps = NWARPS_Q2_K_RDNA2; +#else + const int mmq_x = MMQ_X_Q2_K_RDNA1; + const int mmq_y = MMQ_Y_Q2_K_RDNA1; + const int nwarps = NWARPS_Q2_K_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_VOLTA + const int mmq_x = MMQ_X_Q2_K_AMPERE; + const int mmq_y = MMQ_Y_Q2_K_AMPERE; + const int nwarps = NWARPS_Q2_K_AMPERE; + + mul_mat_q, + load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q2_K_PASCAL; + const int mmq_y = MMQ_Y_Q2_K_PASCAL; + const int nwarps = NWARPS_Q2_K_PASCAL; + + mul_mat_q, + load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q2_K_q8_1_mul_mat; + bad_arch(); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q3_K_RDNA2 128 +#define MMQ_Y_Q3_K_RDNA2 64 +#define NWARPS_Q3_K_RDNA2 8 +#define MMQ_X_Q3_K_RDNA1 32 +#define MMQ_Y_Q3_K_RDNA1 128 +#define NWARPS_Q3_K_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q3_K_AMPERE 4 +#define MMQ_Y_Q3_K_AMPERE 32 +#define NWARPS_Q3_K_AMPERE 4 +#else +#define MMQ_X_Q3_K_AMPERE 128 +#define MMQ_Y_Q3_K_AMPERE 128 +#define NWARPS_Q3_K_AMPERE 4 +#endif +#define MMQ_X_Q3_K_PASCAL 64 +#define MMQ_Y_Q3_K_PASCAL 64 +#define NWARPS_Q3_K_PASCAL 8 + +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2) +#endif // defined(RDNA3) || 
defined(RDNA2) +#elif __CUDA_ARCH__ < CC_VOLTA + __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2) +#endif // __CUDA_ARCH__ < CC_VOLTA + mul_mat_q3_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q3_K_RDNA2; + const int mmq_y = MMQ_Y_Q3_K_RDNA2; + const int nwarps = NWARPS_Q3_K_RDNA2; +#else + const int mmq_x = MMQ_X_Q3_K_RDNA1; + const int mmq_y = MMQ_Y_Q3_K_RDNA1; + const int nwarps = NWARPS_Q3_K_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_VOLTA + const int mmq_x = MMQ_X_Q3_K_AMPERE; + const int mmq_y = MMQ_Y_Q3_K_AMPERE; + const int nwarps = NWARPS_Q3_K_AMPERE; + + mul_mat_q, + load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q3_K_PASCAL; + const int mmq_y = MMQ_Y_Q3_K_PASCAL; + const int nwarps = NWARPS_Q3_K_PASCAL; + + mul_mat_q, + load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q3_K_q8_1_mul_mat; + bad_arch(); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q4_K_RDNA2 64 +#define MMQ_Y_Q4_K_RDNA2 128 +#define NWARPS_Q4_K_RDNA2 8 +#define MMQ_X_Q4_K_RDNA1 32 +#define MMQ_Y_Q4_K_RDNA1 64 +#define NWARPS_Q4_K_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q4_K_AMPERE 4 +#define MMQ_Y_Q4_K_AMPERE 32 +#define NWARPS_Q4_K_AMPERE 4 +#else +#define MMQ_X_Q4_K_AMPERE 64 +#define MMQ_Y_Q4_K_AMPERE 128 +#define NWARPS_Q4_K_AMPERE 4 +#endif +#define MMQ_X_Q4_K_PASCAL 64 +#define MMQ_Y_Q4_K_PASCAL 64 +#define NWARPS_Q4_K_PASCAL 8 + +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#elif __CUDA_ARCH__ < CC_VOLTA + __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2) +#endif // __CUDA_ARCH__ < CC_VOLTA + mul_mat_q4_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q4_K_RDNA2; + const int mmq_y = MMQ_Y_Q4_K_RDNA2; + const int nwarps = NWARPS_Q4_K_RDNA2; +#else + const int mmq_x = MMQ_X_Q4_K_RDNA1; + const int mmq_y = MMQ_Y_Q4_K_RDNA1; + const int nwarps = NWARPS_Q4_K_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_VOLTA + const int mmq_x = MMQ_X_Q4_K_AMPERE; + const int mmq_y = MMQ_Y_Q4_K_AMPERE; + const int nwarps = NWARPS_Q4_K_AMPERE; + + mul_mat_q, + load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q4_K_PASCAL; + const int mmq_y = 
MMQ_Y_Q4_K_PASCAL; + const int nwarps = NWARPS_Q4_K_PASCAL; + + mul_mat_q, + load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q4_K_q8_1_mul_mat; + bad_arch(); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q5_K_RDNA2 64 +#define MMQ_Y_Q5_K_RDNA2 128 +#define NWARPS_Q5_K_RDNA2 8 +#define MMQ_X_Q5_K_RDNA1 32 +#define MMQ_Y_Q5_K_RDNA1 64 +#define NWARPS_Q5_K_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q5_K_AMPERE 4 +#define MMQ_Y_Q5_K_AMPERE 32 +#define NWARPS_Q5_K_AMPERE 4 +#else +#define MMQ_X_Q5_K_AMPERE 64 +#define MMQ_Y_Q5_K_AMPERE 128 +#define NWARPS_Q5_K_AMPERE 4 +#endif +#define MMQ_X_Q5_K_PASCAL 64 +#define MMQ_Y_Q5_K_PASCAL 64 +#define NWARPS_Q5_K_PASCAL 8 + +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +mul_mat_q5_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q5_K_RDNA2; + const int mmq_y = MMQ_Y_Q5_K_RDNA2; + const int nwarps = NWARPS_Q5_K_RDNA2; +#else + const int mmq_x = MMQ_X_Q5_K_RDNA1; + const int mmq_y = MMQ_Y_Q5_K_RDNA1; + const int nwarps = NWARPS_Q5_K_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_VOLTA + const int mmq_x = MMQ_X_Q5_K_AMPERE; + const int mmq_y = MMQ_Y_Q5_K_AMPERE; + const int nwarps = NWARPS_Q5_K_AMPERE; + + mul_mat_q, + load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q5_K_PASCAL; + const int mmq_y = MMQ_Y_Q5_K_PASCAL; + const int nwarps = NWARPS_Q5_K_PASCAL; + + mul_mat_q, + load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q5_K_q8_1_mul_mat; + bad_arch(); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q6_K_RDNA2 64 +#define MMQ_Y_Q6_K_RDNA2 128 +#define NWARPS_Q6_K_RDNA2 8 +#define MMQ_X_Q6_K_RDNA1 32 +#define MMQ_Y_Q6_K_RDNA1 64 +#define NWARPS_Q6_K_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q6_K_AMPERE 4 +#define MMQ_Y_Q6_K_AMPERE 32 +#define NWARPS_Q6_K_AMPERE 4 +#else +#define MMQ_X_Q6_K_AMPERE 64 +#define MMQ_Y_Q6_K_AMPERE 64 +#define NWARPS_Q6_K_AMPERE 4 +#endif +#define MMQ_X_Q6_K_PASCAL 64 +#define MMQ_Y_Q6_K_PASCAL 64 +#define NWARPS_Q6_K_PASCAL 8 + +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#elif __CUDA_ARCH__ < CC_VOLTA + __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2) +#endif // __CUDA_ARCH__ < CC_VOLTA + mul_mat_q6_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const 
int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q6_K_RDNA2; + const int mmq_y = MMQ_Y_Q6_K_RDNA2; + const int nwarps = NWARPS_Q6_K_RDNA2; +#else + const int mmq_x = MMQ_X_Q6_K_RDNA1; + const int mmq_y = MMQ_Y_Q6_K_RDNA1; + const int nwarps = NWARPS_Q6_K_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_VOLTA + const int mmq_x = MMQ_X_Q6_K_AMPERE; + const int mmq_y = MMQ_Y_Q6_K_AMPERE; + const int nwarps = NWARPS_Q6_K_AMPERE; + + mul_mat_q, + load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q6_K_PASCAL; + const int mmq_y = MMQ_Y_Q6_K_PASCAL; + const int nwarps = NWARPS_Q6_K_PASCAL; + + mul_mat_q, + load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q6_K_q8_1_mul_mat; + bad_arch(); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +template +static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) { + const int row = blockIdx.x*blockDim.y + threadIdx.y; + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = 0; i < blocks_per_row; i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i + threadIdx.x / (qi/vdr); // x block index + + const int iby = (i + threadIdx.x / (qi/vdr)) * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (threadIdx.x == 0) { + dst[row] = tmp; + } +} + +template +static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) { + // qk = quantized weights per x block + // qr = number of quantized weights per data value in x block + const int row = blockIdx.x*blockDim.y + threadIdx.y; + + if (row >= nrows) { + return; + } + + const int tid = threadIdx.x; + + const int iter_stride = 2*GGML_V3_CUDA_DMMV_X; + const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter + const int y_offset = qr == 1 ? 
1 : qk/2; + +// partial sum for each thread +#ifdef GGML_V3_CUDA_F16 + half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics +#else + float tmp = 0.0f; +#endif // GGML_V3_CUDA_F16 + + for (int i = 0; i < ncols; i += iter_stride) { + const int col = i + vals_per_iter*tid; + const int ib = (row*ncols + col)/qk; // x block index + const int iqs = (col%qk)/qr; // x quant index + const int iybs = col - col%qk; // y block start index + +// processing >2 values per i iter is faster for fast GPUs +#pragma unroll + for (int j = 0; j < vals_per_iter; j += 2) { + // process 2 vals per j iter + + // dequantize + // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val + dfloat2 v; + dequantize_kernel(vx, ib, iqs + j/qr, v); + + // matrix multiplication + // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 +#ifdef GGML_V3_CUDA_F16 + tmp += __hmul2(v, { + y[iybs + iqs + j/qr + 0], + y[iybs + iqs + j/qr + y_offset] + }); +#else + tmp += v.x * y[iybs + iqs + j/qr + 0]; + tmp += v.y * y[iybs + iqs + j/qr + y_offset]; +#endif // GGML_V3_CUDA_F16 + } + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (tid == 0) { +#ifdef GGML_V3_CUDA_F16 + dst[row] = tmp.x + tmp.y; +#else + dst[row] = tmp; +#endif // GGML_V3_CUDA_F16 + } +} + +static __global__ void mul_mat_p021_f16_f32( + const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) { + + const half * x = (const half *) vx; + + const int row_x = blockDim.y*blockIdx.y + threadIdx.y; + const int channel = blockDim.z*blockIdx.z + threadIdx.z; + const int channel_x = channel / (nchannels_y / nchannels_x); + + const int nrows_y = ncols_x; + const int nrows_dst = nrows_x; + const int row_dst = row_x; + + float tmp = 0.0f; + + for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) { + const int col_x = col_x0 + threadIdx.x; + + if (col_x >= ncols_x) { + break; + } + + // x is transposed and permuted + const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x; + const float xi = __half2float(x[ix]); + + const int row_y = col_x; + + // y is not transposed but permuted + const int iy = channel*nrows_y + row_y; + + tmp += xi * y[iy]; + } + + // dst is not transposed and not permuted + const int idst = channel*nrows_dst + row_dst; + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (threadIdx.x == 0) { + dst[idst] = tmp; + } +} + +static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous + const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, + const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) { + + const half * x = (const half *) vx; + + const int row_x = blockDim.y*blockIdx.y + threadIdx.y; + const int channel = blockDim.z*blockIdx.z + threadIdx.z; + const int channel_x = channel / channel_x_divisor; + + const int nrows_y = ncols_x; + const int nrows_dst = nrows_x; + const int row_dst = row_x; + + const int idst = channel*nrows_dst + row_dst; + + float tmp = 0.0f; + + for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) { + const int col_x = col_x0 + threadIdx.x; + + if (col_x >= 
ncols_x) {
+            break;
+        }
+
+        const int row_y = col_x;
+
+        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
+        const int iy = channel*nrows_y + row_y;
+
+        const float xi = __half2float(x[ix]);
+
+        tmp += xi * y[iy];
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    }
+
+    if (threadIdx.x == 0) {
+        dst[idst] = tmp;
+    }
+}
+
+static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    float * dsti = (float *) cdsti;
+
+    *dsti = *xi;
+}
+
+static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    half * dsti = (half *) cdsti;
+
+    *dsti = __float2half(*xi);
+}
+
+static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
+    const half * xi = (const half *) cxi;
+    half * dsti = (half *) cdsti;
+
+    *dsti = *xi;
+}
+
+template <cpy_kernel_t cpy_1>
+static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
+                                   const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+                                   const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= ne) {
+        return;
+    }
+
+    // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
+    // then combine those indices with the corresponding byte offsets to get the total offsets
+    const int i02 = i / (ne00*ne01);
+    const int i01 = (i - i02*ne01*ne00) / ne00;
+    const int i00 = i - i02*ne01*ne00 - i01*ne00;
+    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
+
+    const int i12 = i / (ne10*ne11);
+    const int i11 = (i - i12*ne10*ne11) / ne10;
+    const int i10 = i - i12*ne10*ne11 - i11*ne10;
+    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
+
+    cpy_1(cx + x_offset, cdst + dst_offset);
+}
+
+static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q8_0 * dsti = (block_q8_0 *) cdsti;
+
+    float amax = 0.0f; // absolute max
+
+    for (int j = 0; j < QK8_0; j++) {
+        const float v = xi[j];
+        amax = fmaxf(amax, fabsf(v));
+    }
+
+    const float d = amax / ((1 << 7) - 1);
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->d = d;
+
+    for (int j = 0; j < QK8_0; ++j) {
+        const float x0 = xi[j]*id;
+
+        dsti->qs[j] = roundf(x0);
+    }
+}
+
+static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q4_0 * dsti = (block_q4_0 *) cdsti;
+
+    float amax = 0.0f;
+    float vmax = 0.0f;
+
+    for (int j = 0; j < QK4_0; ++j) {
+        const float v = xi[j];
+        if (amax < fabsf(v)) {
+            amax = fabsf(v);
+            vmax = v;
+        }
+    }
+
+    const float d = vmax / -8;
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->d = d;
+
+    for (int j = 0; j < QK4_0/2; ++j) {
+        const float x0 = xi[0 + j]*id;
+        const float x1 = xi[QK4_0/2 + j]*id;
+
+        const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f));
+        const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f));
+
+        dsti->qs[j] = xi0;
+        dsti->qs[j] |= xi1 << 4;
+    }
+}
+
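+// Added illustrative note on the q4_0 rounding above: d = vmax / -8 scales the block so that the
+// value with the largest magnitude lands at -8 before the +8 bias (stored as nibble 0), and the
+// "+ 8.5f" turns the int8_t truncation into round-to-nearest. For example, with vmax = -1.0f the
+// scale is d = 0.125f, so an input of 0.25f becomes (int8_t)(0.25f/0.125f + 8.5f) = 10, and
+// min(15, ...) clamps any overshoot to the unsigned 4-bit range [0, 15].
+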
+static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q4_1 * dsti = (block_q4_1 *) cdsti;
+
+    float vmin = FLT_MAX;
+    float vmax = -FLT_MAX;
+
+    for (int j = 0; j < QK4_1; ++j) {
+        const float v = xi[j];
+
+        if (v < vmin) vmin = v;
+        if (v > vmax) vmax = v;
+    }
+
+    const float d = (vmax - vmin) / ((1 << 4) - 1);
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->dm.x = d;
+    dsti->dm.y = vmin;
+
+    for (int j = 0; j < QK4_1/2; ++j) {
+        const float x0 = (xi[0 + j] - vmin)*id;
+        const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
+
+        const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f));
+        const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));
+
+        dsti->qs[j] = xi0;
+        dsti->qs[j] |= xi1 << 4;
+    }
+}
+
+template <cpy_kernel_t cpy_blck, int qk>
+static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
+                                 const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+                                 const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
+    const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
+
+    if (i >= ne) {
+        return;
+    }
+
+    const int i02 = i / (ne00*ne01);
+    const int i01 = (i - i02*ne01*ne00) / ne00;
+    const int i00 = (i - i02*ne01*ne00 - i01*ne00);
+    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
+
+    const int i12 = i / (ne10*ne11);
+    const int i11 = (i - i12*ne10*ne11) / ne10;
+    const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk;
+    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
+
+    cpy_blck(cx + x_offset, cdst + dst_offset);
+}
+
+static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
+    const float y = (i0 / 2 - low) / max(0.001f, high - low);
+    return 1.0f - min(1.0f, max(0.0f, y));
+}
+
+struct rope_corr_dims {
+    float v[4];
+};
+
+// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
+// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
+static __device__ void rope_yarn(
+    float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
+    float * cos_theta, float * sin_theta
+) {
+    // Get n-d rotational scaling corrected for extrapolation
+    float theta_interp = freq_scale * theta_extrap;
+    float theta = theta_interp;
+    if (ext_factor != 0.0f) {
+        float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
+        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+
+        // Get n-d magnitude scaling corrected for interpolation
+        mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
+    }
+    *cos_theta = cosf(theta) * mscale;
+    *sin_theta = sinf(theta) * mscale;
+}
+
+// rope == RoPE == rotary positional embedding
+template <typename T, bool has_pos>
+static __global__ void rope(
+    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims
+) {
+    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+    const int i = row*ncols + col;
+    const int i2 = row/p_delta_rows;
+
+    const int p = has_pos ?
pos[i2] : 0; + const float theta_base = p*powf(freq_base, -float(col)/ncols); + + float cos_theta, sin_theta; + rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta); + + const float x0 = x[i + 0]; + const float x1 = x[i + 1]; + + dst[i + 0] = x0*cos_theta - x1*sin_theta; + dst[i + 1] = x0*sin_theta + x1*cos_theta; +} + +template +static __global__ void rope_neox( + const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, + float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims +) { + const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); + + if (col >= ncols) { + return; + } + + const int row = blockDim.x*blockIdx.x + threadIdx.x; + const int ib = col / n_dims; + const int ic = col % n_dims; + + if (ib > 0) { + const int i = row*ncols + ib*n_dims + ic; + + dst[i + 0] = x[i + 0]; + dst[i + 1] = x[i + 1]; + + return; + } + + const int i = row*ncols + ib*n_dims + ic/2; + const int i2 = row/p_delta_rows; + + float cur_rot = inv_ndims * ic - ib; + + const int p = has_pos ? pos[i2] : 0; + const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f); + + float cos_theta, sin_theta; + rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta); + + const float x0 = x[i + 0]; + const float x1 = x[i + n_dims/2]; + + dst[i + 0] = x0*cos_theta - x1*sin_theta; + dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta; +} + +static __global__ void rope_glm_f32( + const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base, + int n_ctx +) { + const int col = blockDim.x*blockIdx.x + threadIdx.x; + const int half_n_dims = ncols/4; + + if (col >= half_n_dims) { + return; + } + + const int row = blockDim.y*blockIdx.y + threadIdx.y; + const int i = row*ncols + col; + const int i2 = row/p_delta_rows; + + const float col_theta_scale = powf(freq_base, -2.0f*col/ncols); + // FIXME: this is likely wrong + const int p = pos != nullptr ? 
pos[i2] : 0; + + const float theta = min(p, n_ctx - 2)*freq_scale*col_theta_scale; + const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta); + + const float x0 = x[i + 0]; + const float x1 = x[i + half_n_dims]; + + dst[i + 0] = x0*cos_theta - x1*sin_theta; + dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta; + + const float block_theta = ((float)max(p - n_ctx - 2, 0))*col_theta_scale; + const float sin_block_theta = sinf(block_theta); + const float cos_block_theta = cosf(block_theta); + + const float x2 = x[i + half_n_dims * 2]; + const float x3 = x[i + half_n_dims * 3]; + + dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta; + dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta; +} + +static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows, + const int n_heads_log2_floor, const float m0, const float m1) { + const int col = blockDim.x*blockIdx.x + threadIdx.x; + + if (col >= ncols) { + return; + } + + const int row = blockDim.y*blockIdx.y + threadIdx.y; + const int i = row*ncols + col; + + const int k = row/k_rows; + + float m_k; + if (k < n_heads_log2_floor) { + m_k = powf(m0, k + 1); + } else { + m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); + } + + dst[i] = col * m_k + x[i]; +} + +static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) { + const int row = blockIdx.y; + const int col = threadIdx.x; + + float sum = 0.0f; + for (int i = col; i < ncols; i += blockDim.x) { + sum += x[row * ncols + i]; + } + + sum = warp_reduce_sum(sum); + + if (col == 0) { + dst[row] = sum; + } +} + +template +static inline __device__ void swap(T & a, T & b) { + T tmp = a; + a = b; + b = tmp; +} + +template +static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols) { + // bitonic sort + int col = threadIdx.x; + int row = blockIdx.y; + + if (col >= ncols) return; + + const float * x_row = x + row * ncols; + int * dst_row = dst + row * ncols; + + // initialize indices + if (col < ncols) { + dst_row[col] = col; + } + __syncthreads(); + + for (int k = 2; k <= ncols; k *= 2) { + for (int j = k / 2; j > 0; j /= 2) { + int ixj = col ^ j; + if (ixj > col) { + if ((col & k) == 0) { + if (order == GGML_V3_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) { + swap(dst_row[col], dst_row[ixj]); + } + } else { + if (order == GGML_V3_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) { + swap(dst_row[col], dst_row[ixj]); + } + } + } + __syncthreads(); + } + } +} + +static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) { + const int col = blockDim.y*blockIdx.y + threadIdx.y; + const int row = blockDim.x*blockIdx.x + threadIdx.x; + + if (col >= ncols) { + return; + } + + const int i = row*ncols + col; + //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i]; + //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU + dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX; +} + +template +static __global__ void soft_max_f16(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) { +#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX + const int ncols_data = ncols_template == 0 ? 
ncols_par : ncols_template; + const int ncols_smem = GGML_V3_PAD(ncols_data, 2*WARP_SIZE)/2; + + const int tid = threadIdx.x; + const int rowx = blockIdx.x; + const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension + + const int block_size = block_size_template == 0 ? blockDim.x : block_size_template; + + const int warp_id = threadIdx.x / WARP_SIZE; + const int lane_id = threadIdx.x % WARP_SIZE; + + extern __shared__ half data_soft_max_f16[]; + half * buf_iw = data_soft_max_f16 + 0; // shared memory buffer for inter-warp communication + // (shared memory) buffer to cache values between iterations: + half2 * vals = vals_smem ? (half2 *) (buf_iw + WARP_SIZE) : (half2 *) (dst + rowx*ncols_data); + // if the buffer is larger than max. shared memory per block, use dst as temp. buffer instead + // in that case col_smem == col_data must be enforced to avoid race conditions + + half2 max_val = make_half2(-INFINITY, -INFINITY); + +#pragma unroll + for (int col0 = 0; col0 < ncols_smem; col0 += block_size) { + const int col_data = 2*col0 + 2*WARP_SIZE*warp_id + lane_id; + const int col_smem = vals_smem ? col0 + tid : col_data; + + const int ix = rowx*ncols_data + col_data; + const int iy = rowy*ncols_data + col_data; + + half2 val; + if (need_check && col_data + 0 >= ncols_data) { + val.x = -INFINITY; + } else { + val.x = x[ix + 0]*scale + (y ? y[iy + 0] : 0.0f); + } + if (need_check && col_data + WARP_SIZE >= ncols_data) { + val.y = -INFINITY; + } else { + val.y = x[ix + WARP_SIZE]*scale + (y ? y[iy + WARP_SIZE] : 0.0f); + } + if (!need_check || col_smem < (vals_smem ? ncols_smem : ncols_data)) { + vals[col_smem] = val; + } + max_val = __hmax2(max_val, val); + } + + // find the max value in the block + max_val = warp_reduce_max(max_val); + if (block_size > WARP_SIZE) { + if (warp_id == 0) { + buf_iw[lane_id] = -INFINITY; + } + __syncthreads(); + + if (lane_id == 0) { + buf_iw[warp_id] = __hmax(max_val.x, max_val.y); + } + __syncthreads(); + + max_val = __half2half2(buf_iw[lane_id]); + max_val = warp_reduce_max(max_val); + } else { + max_val = __half2half2(__hmax(max_val.x, max_val.y)); + } + + half2 tmp = make_half2(0.0f, 0.0f); // partial sums + +#pragma unroll + for (int col0 = 0; col0 < ncols_smem; col0 += block_size) { + const int col_smem = vals_smem ? col0 + tid : 2*col0 + 2*warp_id*WARP_SIZE + lane_id; + + if (ncols_template == 0 && col_smem >= (vals_smem ? ncols_smem : ncols_data)) { + break; + } + + const half2 val = h2exp(vals[col_smem] - max_val); + + tmp += val; + vals[col_smem] = val; + } + + // find the sum of exps in the block + tmp = warp_reduce_sum(tmp); + if (block_size > WARP_SIZE) { + if (warp_id == 0) { + buf_iw[lane_id] = 0.0f; + } + __syncthreads(); + + if (lane_id == 0) { + buf_iw[warp_id] = tmp.x + tmp.y; + } + __syncthreads(); + + tmp = __half2half2(buf_iw[lane_id]); + tmp = warp_reduce_sum(tmp); + } else { + tmp = __half2half2(tmp.x + tmp.y); + } + + const half2 inv_sum = make_half2(1.0f, 1.0f) / tmp; + +#pragma unroll + for (int col0 = 0; col0 < ncols_smem; col0 += block_size) { + const int col_data = 2*col0 + 2*WARP_SIZE*warp_id + lane_id; + const int col_smem = vals_smem ? 
col0 + tid : col_data; + + const int idst = rowx*ncols_data + col_data; + const half2 result = vals[col_smem] * inv_sum; + + if (need_check && col_data + 0 >= ncols_data) { + return; + } + dst[idst] = result.x; + + if (need_check && col_data + WARP_SIZE >= ncols_data) { + return; + } + + dst[idst + WARP_SIZE] = result.y; + } +#else + (void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale; + bad_arch(); +#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX +} + +template +static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) { + const int ncols = ncols_template == 0 ? ncols_par : ncols_template; + + const int tid = threadIdx.x; + const int rowx = blockIdx.x; + const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension + + const int block_size = block_size_template == 0 ? blockDim.x : block_size_template; + + const int warp_id = threadIdx.x / WARP_SIZE; + const int lane_id = threadIdx.x % WARP_SIZE; + + extern __shared__ float data_soft_max_f32[]; + float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication + // shared memory buffer to cache values between iterations: + float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + rowx*ncols; + + float max_val = -INFINITY; + +#pragma unroll + for (int col0 = 0; col0 < ncols; col0 += block_size) { + const int col = col0 + tid; + + if (ncols_template == 0 && col >= ncols) { + break; + } + + const int ix = rowx*ncols + col; + const int iy = rowy*ncols + col; + + const float val = x[ix]*scale + (y ? y[iy] : 0.0f); + vals[col] = val; + max_val = max(max_val, val); + } + + // find the max value in the block + max_val = warp_reduce_max(max_val); + if (block_size > WARP_SIZE) { + if (warp_id == 0) { + buf_iw[lane_id] = -INFINITY; + } + __syncthreads(); + + if (lane_id == 0) { + buf_iw[warp_id] = max_val; + } + __syncthreads(); + + max_val = buf_iw[lane_id]; + max_val = warp_reduce_max(max_val); + } + + float tmp = 0.0f; // partial sum + +#pragma unroll + for (int col0 = 0; col0 < ncols; col0 += block_size) { + const int col = col0 + tid; + + if (ncols_template == 0 && col >= ncols) { + break; + } + + const float val = expf(vals[col] - max_val); + tmp += val; + vals[col] = val; + } + + // find the sum of exps in the block + tmp = warp_reduce_sum(tmp); + if (block_size > WARP_SIZE) { + if (warp_id == 0) { + buf_iw[lane_id] = 0.0f; + } + __syncthreads(); + + if (lane_id == 0) { + buf_iw[warp_id] = tmp; + } + __syncthreads(); + + tmp = buf_iw[lane_id]; + tmp = warp_reduce_sum(tmp); + } + + const float inv_sum = 1.0f / tmp; + +#pragma unroll + for (int col0 = 0; col0 < ncols; col0 += block_size) { + const int col = col0 + tid; + + if (ncols_template == 0 && col >= ncols) { + return; + } + + const int idst = rowx*ncols + col; + dst[idst] = vals[col] * inv_sum; + } +} + +static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + + dst[i] = scale * x[i]; +} + +static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + + dst[i] = x[i] < min ? min : (x[i] > max ? 
max : x[i]); +} + +static __global__ void im2col_f32_f16( + const float * x, half * dst, + int offset_delta, int IW, int IH, int OW, int KW, int KH, int pelements, int CHW, + int s0, int s1, int p0, int p1, int d0, int d1) { + const int i = threadIdx.x + blockIdx.x * blockDim.x; + if (i >= pelements) { + return; + } + + const int ksize = OW * (KH > 1 ? KW : 1); + const int kx = i / ksize; + const int kd = kx * ksize; + const int ky = (i - kd) / OW; + const int ix = i % OW; + + const int64_t iiw = ix * s0 + kx * d0 - p0; + const int64_t iih = blockIdx.y * s1 + ky * d1 - p1; + + const int64_t offset_dst = + (blockIdx.y * OW + ix) * CHW + + (blockIdx.z * (KW * KH) + ky * KW + kx); + + if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { + dst[offset_dst] = __float2half(0.0f); + } else { + const int64_t offset_src = blockIdx.z * offset_delta; + dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]); + } +} + +template +static void get_rows_cuda(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) { + + GGML_V3_TENSOR_BINARY_OP_LOCALS + + const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); + const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE); + const dim3 block_nums(block_num_x, ne10, ne11*ne12); + + // strides in elements + //const size_t s0 = nb0 / ggml_v3_element_size(dst); + const size_t s1 = nb1 / ggml_v3_element_size(dst); + const size_t s2 = nb2 / ggml_v3_element_size(dst); + const size_t s3 = nb3 / ggml_v3_element_size(dst); + + const size_t s10 = nb10 / ggml_v3_element_size(src1); + const size_t s11 = nb11 / ggml_v3_element_size(src1); + const size_t s12 = nb12 / ggml_v3_element_size(src1); + //const size_t s13 = nb13 / ggml_v3_element_size(src1); + + GGML_V3_ASSERT(ne00 % 2 == 0); + + k_get_rows<<>>( + src0_dd, src1_dd, dst_dd, + ne00, /*ne01, ne02, ne03,*/ + /*ne10, ne11,*/ ne12, /*ne13,*/ + /* s0,*/ s1, s2, s3, + /* nb00,*/ nb01, nb02, nb03, + s10, s11, s12/*, s13*/); + + (void) dst; +} + +template +static void get_rows_cuda_float(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) { + + GGML_V3_TENSOR_BINARY_OP_LOCALS + + const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); + const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE; + const dim3 block_nums(block_num_x, ne10, ne11*ne12); + + // strides in elements + //const size_t s0 = nb0 / ggml_v3_element_size(dst); + const size_t s1 = nb1 / ggml_v3_element_size(dst); + const size_t s2 = nb2 / ggml_v3_element_size(dst); + const size_t s3 = nb3 / ggml_v3_element_size(dst); + + const size_t s10 = nb10 / ggml_v3_element_size(src1); + const size_t s11 = nb11 / ggml_v3_element_size(src1); + const size_t s12 = nb12 / ggml_v3_element_size(src1); + //const size_t s13 = nb13 / ggml_v3_element_size(src1); + + k_get_rows_float<<>>( + src0_dd, src1_dd, dst_dd, + ne00, /*ne01, ne02, ne03,*/ + /*ne10, ne11,*/ ne12, /*ne13,*/ + /* s0,*/ s1, s2, s3, + /* nb00,*/ nb01, nb02, nb03, + s10, s11, s12/*, s13*/); + + (void) dst; +} + +template +struct bin_bcast_cuda { + template + void operator()(const struct ggml_v3_tensor * src0, const struct ggml_v3_tensor * src1, struct ggml_v3_tensor * dst, + const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd, + cudaStream_t stream) { + + GGML_V3_TENSOR_BINARY_OP_LOCALS + + int nr0 = 
ne10/ne0; + int nr1 = ne11/ne1; + int nr2 = ne12/ne2; + int nr3 = ne13/ne3; + + int nr[4] = { nr0, nr1, nr2, nr3 }; + + // collapse dimensions until first broadcast dimension + int64_t cne0[] = {ne0, ne1, ne2, ne3}; + int64_t cne1[] = {ne10, ne11, ne12, ne13}; + size_t cnb0[] = {nb0, nb1, nb2, nb3}; + size_t cnb1[] = {nb10, nb11, nb12, nb13}; + auto collapse = [](int64_t cne[]) { + cne[0] *= cne[1]; + cne[1] = cne[2]; + cne[2] = cne[3]; + cne[3] = 1; + }; + + auto collapse_nb = [](size_t cnb[], const int64_t cne[]) { + cnb[1] *= cne[1]; + cnb[2] *= cne[2]; + cnb[3] *= cne[3]; + }; + + for (int i = 0; i < 4; i++) { + if (nr[i] != 1) { + break; + } + if (i > 0) { + collapse_nb(cnb0, cne0); + collapse_nb(cnb1, cne1); + collapse(cne0); + collapse(cne1); + } + } + { + int64_t ne0 = cne0[0]; + int64_t ne1 = cne0[1]; + int64_t ne2 = cne0[2]; + int64_t ne3 = cne0[3]; + + int64_t ne10 = cne1[0]; + int64_t ne11 = cne1[1]; + int64_t ne12 = cne1[2]; + int64_t ne13 = cne1[3]; + + size_t nb0 = cnb0[0]; + size_t nb1 = cnb0[1]; + size_t nb2 = cnb0[2]; + size_t nb3 = cnb0[3]; + + size_t nb10 = cnb1[0]; + size_t nb11 = cnb1[1]; + size_t nb12 = cnb1[2]; + size_t nb13 = cnb1[3]; + + size_t s0 = nb0 / sizeof(dst_t); + size_t s1 = nb1 / sizeof(dst_t); + size_t s2 = nb2 / sizeof(dst_t); + size_t s3 = nb3 / sizeof(dst_t); + + size_t s10 = nb10 / sizeof(src1_t); + size_t s11 = nb11 / sizeof(src1_t); + size_t s12 = nb12 / sizeof(src1_t); + size_t s13 = nb13 / sizeof(src1_t); + + GGML_V3_ASSERT(s0 == 1); + GGML_V3_ASSERT(s10 == 1); + + const int block_size = 128; + + int64_t hne0 = std::max(ne0/2LL, 1LL); + + dim3 block_dims; + block_dims.x = std::min(hne0, block_size); + block_dims.y = std::min(ne1, block_size / block_dims.x); + block_dims.z = std::min(std::min(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U); + + dim3 block_nums( + (hne0 + block_dims.x - 1) / block_dims.x, + (ne1 + block_dims.y - 1) / block_dims.y, + (ne2*ne3 + block_dims.z - 1) / block_dims.z + ); + + if (block_nums.z > 65535) { + // this is the maximum number of blocks in z direction, fallback to 1D grid kernel + int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size; + k_bin_bcast_unravel<<>>( + src0_dd, src1_dd, dst_dd, + ne0, ne1, ne2, ne3, + ne10, ne11, ne12, ne13, + /* s0, */ s1, s2, s3, + /* s10, */ s11, s12, s13); + } else { + k_bin_bcast<<>>( + src0_dd, src1_dd, dst_dd, + ne0, ne1, ne2, ne3, + ne10, ne11, ne12, ne13, + /* s0, */ s1, s2, s3, + /* s10, */ s11, s12, s13); + } + } + } +}; + +static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements, + const int ne10, const int ne11, const int ne12, + const int nb1, const int nb2, const int offset, cudaStream_t stream) { + int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE; + acc_f32<<>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset); +} + +static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE; + gelu_f32<<>>(x, dst, k); +} + +static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE; + silu_f32<<>>(x, dst, k); +} + +static void gelu_quick_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE; + gelu_quick_f32<<>>(x, dst, k); +} + +static void tanh_f32_cuda(const float * x, float 
* dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
+    tanh_f32<<<num_blocks, CUDA_TANH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+    relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+    leaky_relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
+}
+
+static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
+    sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
+    GGML_V3_ASSERT(ncols % WARP_SIZE == 0);
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    }
+}
+
+static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) {
+    static const float eps = 1e-6f;
+    if (group_size < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        group_norm_f32<1024><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
+    }
+}
+
+static void concat_f32_cuda(const float * x, const float * y, float * dst, const int ne0, int ne1, int ne2, int ne02, cudaStream_t stream) {
+    int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, ne1, ne2);
+    concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
+}
+
+static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int scale_factor, cudaStream_t stream) {
+    int ne0 = (ne00 * scale_factor);
+    int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02);
+    upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
+}
+
+static void pad_f32_cuda(const float * x, float * dst,
+    const int ne00, const int ne01, const int ne02,
+    const int ne0, const int ne1, const int ne2, cudaStream_t stream) {
+    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, ne1, ne2);
+    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02);
+}
+
+static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
+    GGML_V3_ASSERT(ncols % WARP_SIZE == 0);
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    }
+}
+
+static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
+    const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+    const dim3 num_blocks(block_num_x, ky, 1);
+    const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1);
+    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
+}
+
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k,
cudaStream_t stream) { + const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE); + dequantize_block<<>>(vx, y, k); +} + +template +static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { + const int nb = k / QK_K; +#if QK_K == 256 + dequantize_block_q2_K<<>>(vx, y); +#else + dequantize_block_q2_K<<>>(vx, y); +#endif +} + +template +static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { + const int nb = k / QK_K; +#if QK_K == 256 + dequantize_block_q3_K<<>>(vx, y); +#else + dequantize_block_q3_K<<>>(vx, y); +#endif +} + +template +static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { + const int nb = k / QK_K; + dequantize_block_q4_K<<>>(vx, y); +} + +template +static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { + const int nb = k / QK_K; +#if QK_K == 256 + dequantize_block_q5_K<<>>(vx, y); +#else + dequantize_block_q5_K<<>>(vx, y); +#endif +} + +template +static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { + const int nb = k / QK_K; +#if QK_K == 256 + dequantize_block_q6_K<<>>(vx, y); +#else + dequantize_block_q6_K<<>>(vx, y); +#endif +} + +template +static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { + const int nb = k / QK_K; + dequantize_block_iq2_xxs<<>>(vx, y); +} + +template +static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { + const int nb = k / QK_K; + dequantize_block_iq2_xs<<>>(vx, y); +} + +template +static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; + convert_unary<<>>(vx, y, k); +} + +static to_fp16_cuda_t ggml_v3_get_to_fp16_cuda(ggml_v3_type type) { + switch (type) { + case GGML_V3_TYPE_Q4_0: + return dequantize_block_cuda; + case GGML_V3_TYPE_Q4_1: + return dequantize_block_cuda; + case GGML_V3_TYPE_Q5_0: + return dequantize_block_cuda; + case GGML_V3_TYPE_Q5_1: + return dequantize_block_cuda; + case GGML_V3_TYPE_Q8_0: + return dequantize_block_cuda; + case GGML_V3_TYPE_Q2_K: + return dequantize_row_q2_K_cuda; + case GGML_V3_TYPE_Q3_K: + return dequantize_row_q3_K_cuda; + case GGML_V3_TYPE_Q4_K: + return dequantize_row_q4_K_cuda; + case GGML_V3_TYPE_Q5_K: + return dequantize_row_q5_K_cuda; + case GGML_V3_TYPE_Q6_K: + return dequantize_row_q6_K_cuda; + case GGML_V3_TYPE_IQ2_XXS: + return dequantize_row_iq2_xxs_cuda; + case GGML_V3_TYPE_IQ2_XS: + return dequantize_row_iq2_xs_cuda; + case GGML_V3_TYPE_F32: + return convert_unary_cuda; + default: + return nullptr; + } +} + +static to_fp32_cuda_t ggml_v3_get_to_fp32_cuda(ggml_v3_type type) { + switch (type) { + case GGML_V3_TYPE_Q4_0: + return dequantize_block_cuda; + case GGML_V3_TYPE_Q4_1: + return dequantize_block_cuda; + case GGML_V3_TYPE_Q5_0: + return dequantize_block_cuda; + case GGML_V3_TYPE_Q5_1: + return dequantize_block_cuda; + case GGML_V3_TYPE_Q8_0: + return dequantize_block_cuda; + case GGML_V3_TYPE_Q2_K: + return dequantize_row_q2_K_cuda; + case GGML_V3_TYPE_Q3_K: + return dequantize_row_q3_K_cuda; + case GGML_V3_TYPE_Q4_K: + return dequantize_row_q4_K_cuda; + case GGML_V3_TYPE_Q5_K: + return dequantize_row_q5_K_cuda; + case GGML_V3_TYPE_Q6_K: + return dequantize_row_q6_K_cuda; + 
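+// Added note: the 2-bit i-quants (iq2_xxs / iq2_xs) are dequantized by the dedicated
+// dequantize_block_iq2_xxs / dequantize_block_iq2_xs kernels launched above, rather than
+// by the generic dequantize_block template used for the q4/q5/q8 types.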
case GGML_V3_TYPE_IQ2_XXS: + return dequantize_row_iq2_xxs_cuda; + case GGML_V3_TYPE_IQ2_XS: + return dequantize_row_iq2_xs_cuda; + case GGML_V3_TYPE_F16: + return convert_unary_cuda; + default: + return nullptr; + } +} + +static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_V3_ASSERT(ncols % GGML_V3_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_V3_CUDA_MMV_Y - 1) / GGML_V3_CUDA_MMV_Y; + // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(WARP_SIZE, GGML_V3_CUDA_MMV_Y, 1); + dequantize_mul_mat_vec + <<>>(vx, y, dst, ncols, nrows); +} + +static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_V3_ASSERT(ncols % GGML_V3_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_V3_CUDA_MMV_Y - 1) / GGML_V3_CUDA_MMV_Y; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(WARP_SIZE, GGML_V3_CUDA_MMV_Y, 1); + dequantize_mul_mat_vec + <<>>(vx, y, dst, ncols, nrows); +} + +static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_V3_ASSERT(ncols % GGML_V3_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_V3_CUDA_MMV_Y - 1) / GGML_V3_CUDA_MMV_Y; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(WARP_SIZE, GGML_V3_CUDA_MMV_Y, 1); + dequantize_mul_mat_vec + <<>>(vx, y, dst, ncols, nrows); +} + +static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_V3_ASSERT(ncols % GGML_V3_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_V3_CUDA_MMV_Y - 1) / GGML_V3_CUDA_MMV_Y; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(WARP_SIZE, GGML_V3_CUDA_MMV_Y, 1); + dequantize_mul_mat_vec + <<>>(vx, y, dst, ncols, nrows); +} + +static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_V3_ASSERT(ncols % GGML_V3_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_V3_CUDA_MMV_Y - 1) / GGML_V3_CUDA_MMV_Y; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(WARP_SIZE, GGML_V3_CUDA_MMV_Y, 1); + dequantize_mul_mat_vec + <<>>(vx, y, dst, ncols, nrows); +} + +static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_V3_ASSERT(ncols % QK_K == 0); + const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2 + const int block_num_y = (nrows + ny - 1) / ny; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(32, ny, 1); + dequantize_mul_mat_vec_q2_k<<>>(vx, y, dst, ncols, nrows); +} + +static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_V3_ASSERT(ncols % QK_K == 0); + const int ny = 2 / K_QUANTS_PER_ITERATION; + const int block_num_y = (nrows + ny - 1) / ny; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(32, ny, 1); + dequantize_mul_mat_vec_q3_k<<>>(vx, y, dst, ncols, nrows); +} + +static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const 
float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_V3_ASSERT(ncols % QK_K == 0); + const int ny = 2 / K_QUANTS_PER_ITERATION; + const int block_num_y = (nrows + ny - 1) / ny; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(32, ny, 1); + dequantize_mul_mat_vec_q4_k<<>>(vx, y, dst, ncols, nrows); +} + +static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_V3_ASSERT(ncols % QK_K == 0); + const dim3 block_dims(32, 1, 1); + dequantize_mul_mat_vec_q5_k<<>>(vx, y, dst, ncols); +} + +static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_V3_ASSERT(ncols % QK_K == 0); + const int ny = 2 / K_QUANTS_PER_ITERATION; + const int block_num_y = (nrows + ny - 1) / ny; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(32, ny, 1); + dequantize_mul_mat_vec_q6_k<<>>(vx, y, dst, ncols, nrows); +} + +static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_V3_ASSERT(ncols % GGML_V3_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_V3_CUDA_MMV_Y - 1) / GGML_V3_CUDA_MMV_Y; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(WARP_SIZE, GGML_V3_CUDA_MMV_Y, 1); + dequantize_mul_mat_vec<1, 1, convert_f16> + <<>>(vx, y, dst, ncols, nrows); +} + +static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_V3_ASSERT(ncols % QK4_0 == 0); + const int block_num_y = (nrows + GGML_V3_CUDA_MMV_Y - 1) / GGML_V3_CUDA_MMV_Y; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(WARP_SIZE, GGML_V3_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_V3_ASSERT(ncols % QK4_1 == 0); + const int block_num_y = (nrows + GGML_V3_CUDA_MMV_Y - 1) / GGML_V3_CUDA_MMV_Y; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(WARP_SIZE, GGML_V3_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_V3_ASSERT(ncols % QK5_0 == 0); + const int block_num_y = (nrows + GGML_V3_CUDA_MMV_Y - 1) / GGML_V3_CUDA_MMV_Y; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(WARP_SIZE, GGML_V3_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_V3_ASSERT(ncols % QK5_1 == 0); + const int block_num_y = (nrows + GGML_V3_CUDA_MMV_Y - 1) / GGML_V3_CUDA_MMV_Y; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(WARP_SIZE, GGML_V3_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_V3_ASSERT(ncols % QK8_0 == 0); + const int block_num_y = (nrows + GGML_V3_CUDA_MMV_Y - 1) / GGML_V3_CUDA_MMV_Y; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 
block_dims(WARP_SIZE, GGML_V3_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_V3_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_V3_CUDA_MMV_Y - 1) / GGML_V3_CUDA_MMV_Y; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(WARP_SIZE, GGML_V3_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_V3_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_V3_CUDA_MMV_Y - 1) / GGML_V3_CUDA_MMV_Y; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(WARP_SIZE, GGML_V3_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_V3_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_V3_CUDA_MMV_Y - 1) / GGML_V3_CUDA_MMV_Y; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(WARP_SIZE, GGML_V3_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_V3_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_V3_CUDA_MMV_Y - 1) / GGML_V3_CUDA_MMV_Y; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(WARP_SIZE, GGML_V3_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_V3_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_V3_CUDA_MMV_Y - 1) / GGML_V3_CUDA_MMV_Y; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(WARP_SIZE, GGML_V3_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_iq2_xxs_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_V3_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_V3_CUDA_MMV_Y - 1) / GGML_V3_CUDA_MMV_Y; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(WARP_SIZE, GGML_V3_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_iq2_xs_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_V3_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_V3_CUDA_MMV_Y - 1) / GGML_V3_CUDA_MMV_Y; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(WARP_SIZE, GGML_V3_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void ggml_v3_mul_mat_q4_0_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_device_caps[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q4_0_RDNA2; + mmq_y = MMQ_Y_Q4_0_RDNA2; + nwarps = NWARPS_Q4_0_RDNA2; + } else if 
(compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q4_0_RDNA1; + mmq_y = MMQ_Y_Q4_0_RDNA1; + nwarps = NWARPS_Q4_0_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q4_0_AMPERE; + mmq_y = MMQ_Y_Q4_0_AMPERE; + nwarps = NWARPS_Q4_0_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q4_0_PASCAL; + mmq_y = MMQ_Y_Q4_0_PASCAL; + nwarps = NWARPS_Q4_0_PASCAL; + } else { + GGML_V3_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q4_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q4_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_v3_mul_mat_q4_1_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_device_caps[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q4_1_RDNA2; + mmq_y = MMQ_Y_Q4_1_RDNA2; + nwarps = NWARPS_Q4_1_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q4_1_RDNA1; + mmq_y = MMQ_Y_Q4_1_RDNA1; + nwarps = NWARPS_Q4_1_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q4_1_AMPERE; + mmq_y = MMQ_Y_Q4_1_AMPERE; + nwarps = NWARPS_Q4_1_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q4_1_PASCAL; + mmq_y = MMQ_Y_Q4_1_PASCAL; + nwarps = NWARPS_Q4_1_PASCAL; + } else { + GGML_V3_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q4_1<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q4_1<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_v3_mul_mat_q5_0_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_device_caps[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q5_0_RDNA2; + mmq_y = MMQ_Y_Q5_0_RDNA2; + nwarps = NWARPS_Q5_0_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q5_0_RDNA1; + mmq_y = MMQ_Y_Q5_0_RDNA1; + nwarps = NWARPS_Q5_0_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q5_0_AMPERE; + mmq_y = MMQ_Y_Q5_0_AMPERE; + nwarps = NWARPS_Q5_0_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q5_0_PASCAL; + mmq_y = MMQ_Y_Q5_0_PASCAL; + nwarps = NWARPS_Q5_0_PASCAL; + } else { + GGML_V3_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool 
need_check = false; + mul_mat_q5_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q5_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_v3_mul_mat_q5_1_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_device_caps[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q5_1_RDNA2; + mmq_y = MMQ_Y_Q5_1_RDNA2; + nwarps = NWARPS_Q5_1_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q5_1_RDNA1; + mmq_y = MMQ_Y_Q5_1_RDNA1; + nwarps = NWARPS_Q5_1_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q5_1_AMPERE; + mmq_y = MMQ_Y_Q5_1_AMPERE; + nwarps = NWARPS_Q5_1_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q5_1_PASCAL; + mmq_y = MMQ_Y_Q5_1_PASCAL; + nwarps = NWARPS_Q5_1_PASCAL; + } else { + GGML_V3_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q5_1<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q5_1<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_v3_mul_mat_q8_0_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_device_caps[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q8_0_RDNA2; + mmq_y = MMQ_Y_Q8_0_RDNA2; + nwarps = NWARPS_Q8_0_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q8_0_RDNA1; + mmq_y = MMQ_Y_Q8_0_RDNA1; + nwarps = NWARPS_Q8_0_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q8_0_AMPERE; + mmq_y = MMQ_Y_Q8_0_AMPERE; + nwarps = NWARPS_Q8_0_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q8_0_PASCAL; + mmq_y = MMQ_Y_Q8_0_PASCAL; + nwarps = NWARPS_Q8_0_PASCAL; + } else { + GGML_V3_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q8_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q8_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_v3_mul_mat_q2_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_device_caps[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q2_K_RDNA2; + mmq_y = MMQ_Y_Q2_K_RDNA2; + nwarps = 
NWARPS_Q2_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q2_K_RDNA1; + mmq_y = MMQ_Y_Q2_K_RDNA1; + nwarps = NWARPS_Q2_K_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q2_K_AMPERE; + mmq_y = MMQ_Y_Q2_K_AMPERE; + nwarps = NWARPS_Q2_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q2_K_PASCAL; + mmq_y = MMQ_Y_Q2_K_PASCAL; + nwarps = NWARPS_Q2_K_PASCAL; + } else { + GGML_V3_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q2_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q2_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_v3_mul_mat_q3_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + +#if QK_K == 256 + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_device_caps[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q3_K_RDNA2; + mmq_y = MMQ_Y_Q3_K_RDNA2; + nwarps = NWARPS_Q3_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q3_K_RDNA1; + mmq_y = MMQ_Y_Q3_K_RDNA1; + nwarps = NWARPS_Q3_K_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q3_K_AMPERE; + mmq_y = MMQ_Y_Q3_K_AMPERE; + nwarps = NWARPS_Q3_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q3_K_PASCAL; + mmq_y = MMQ_Y_Q3_K_PASCAL; + nwarps = NWARPS_Q3_K_PASCAL; + } else { + GGML_V3_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q3_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q3_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +#endif +} + +static void ggml_v3_mul_mat_q4_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_device_caps[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q4_K_RDNA2; + mmq_y = MMQ_Y_Q4_K_RDNA2; + nwarps = NWARPS_Q4_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q4_K_RDNA1; + mmq_y = MMQ_Y_Q4_K_RDNA1; + nwarps = NWARPS_Q4_K_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q4_K_AMPERE; + mmq_y = MMQ_Y_Q4_K_AMPERE; + nwarps = NWARPS_Q4_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q4_K_PASCAL; + mmq_y = MMQ_Y_Q4_K_PASCAL; + nwarps = NWARPS_Q4_K_PASCAL; + } else { + GGML_V3_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 
block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q4_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q4_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_v3_mul_mat_q5_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_device_caps[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q5_K_RDNA2; + mmq_y = MMQ_Y_Q5_K_RDNA2; + nwarps = NWARPS_Q5_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q5_K_RDNA1; + mmq_y = MMQ_Y_Q5_K_RDNA1; + nwarps = NWARPS_Q5_K_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q5_K_AMPERE; + mmq_y = MMQ_Y_Q5_K_AMPERE; + nwarps = NWARPS_Q5_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q5_K_PASCAL; + mmq_y = MMQ_Y_Q5_K_PASCAL; + nwarps = NWARPS_Q5_K_PASCAL; + } else { + GGML_V3_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q5_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q5_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_v3_mul_mat_q6_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_device_caps[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q6_K_RDNA2; + mmq_y = MMQ_Y_Q6_K_RDNA2; + nwarps = NWARPS_Q6_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q6_K_RDNA1; + mmq_y = MMQ_Y_Q6_K_RDNA1; + nwarps = NWARPS_Q6_K_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q6_K_AMPERE; + mmq_y = MMQ_Y_Q6_K_AMPERE; + nwarps = NWARPS_Q6_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q6_K_PASCAL; + mmq_y = MMQ_Y_Q6_K_PASCAL; + nwarps = NWARPS_Q6_K_PASCAL; + } else { + GGML_V3_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q6_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q6_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_v3_mul_mat_p021_f16_f32_cuda( + const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, + const int nchannels_x, const int nchannels_y, cudaStream_t stream) { + + const dim3 block_nums(1, nrows_x, nchannels_y); + const dim3 block_dims(WARP_SIZE, 1, 1); + mul_mat_p021_f16_f32<<>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y); 
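The `ggml_v3_mul_mat_q*_q8_1_cuda` wrappers above all follow the same dispatch shape: pick tile sizes (`mmq_x`, `mmq_y`, `nwarps`) for the detected GPU generation, ceil-divide the matrix into tiles, and instantiate the bounds-checked kernel variant only when the row count is not a multiple of the tile height. A self-contained sketch of that control flow, with a placeholder kernel standing in for the real `mul_mat_q*` templates:

```cuda
// Sketch of the dispatch pattern shared by the mul_mat_q wrappers above
// (illustrative names; the real tile sizes come from the MMQ_X_*/MMQ_Y_*/NWARPS_* macros).
struct toy_mmq_tile { int mmq_x, mmq_y, nwarps; };

template <bool need_check>
static __global__ void toy_mul_mat_q(const void * vx, const void * vy, float * dst,
                                     int ncols_x, int nrows_x, int ncols_y, int nrows_y, int nrows_dst) {
    // placeholder body; the real kernels are the mul_mat_q* templates defined earlier in the file
    (void) vx; (void) vy; (void) dst;
    (void) ncols_x; (void) nrows_x; (void) ncols_y; (void) nrows_y; (void) nrows_dst;
}

static void toy_mul_mat_q_cuda(const void * vx, const void * vy, float * dst,
                               int ncols_x, int nrows_x, int ncols_y, int nrows_y, int nrows_dst,
                               toy_mmq_tile t, cudaStream_t stream) {
    const int block_num_x = (nrows_x + t.mmq_y - 1) / t.mmq_y;   // tiles along the rows of x
    const int block_num_y = (ncols_y + t.mmq_x - 1) / t.mmq_x;   // tiles along the cols of y
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(32, t.nwarps, 1);                      // 32 == warp size on NVIDIA

    if (nrows_x % t.mmq_y == 0) {
        // rows divide evenly into tiles: the cheaper, unchecked variant is enough
        toy_mul_mat_q<false><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        // partial last tile: use the bounds-checked variant
        toy_mul_mat_q<true><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}
```

Compiling both `need_check` instantiations lets the common, evenly-divisible case skip per-element bounds checks entirely.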
+} + +static void ggml_v3_mul_mat_vec_nc_f16_f32_cuda( + const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x, + const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) { + + const dim3 block_nums(1, nrows_x, nchannels_y); + const dim3 block_dims(WARP_SIZE, 1, 1); + mul_mat_vec_nc_f16_f32<<>> + (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x); +} + +static void ggml_v3_cpy_f32_f32_cuda( + const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, + const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) { + + const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; + cpy_f32_f16<<>> + (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12); +} + +static void ggml_v3_cpy_f32_f16_cuda( + const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, + const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) { + + const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; + cpy_f32_f16<<>> + (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12); +} + +static void ggml_v3_cpy_f32_q8_0_cuda( + const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, + const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) { + + GGML_V3_ASSERT(ne % QK8_0 == 0); + const int num_blocks = ne / QK8_0; + cpy_f32_q<<>> + (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12); +} + +static void ggml_v3_cpy_f32_q4_0_cuda( + const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, + const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) { + + GGML_V3_ASSERT(ne % QK4_0 == 0); + const int num_blocks = ne / QK4_0; + cpy_f32_q<<>> + (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12); +} + +static void ggml_v3_cpy_f32_q4_1_cuda( + const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, + const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) { + + GGML_V3_ASSERT(ne % QK4_1 == 0); + const int num_blocks = ne / QK4_1; + cpy_f32_q<<>> + (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12); +} + +static void ggml_v3_cpy_f16_f16_cuda( + const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, + const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) { + + const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; + cpy_f32_f16<<>> + (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12); +} + +static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE; + scale_f32<<>>(x, dst, scale, k); +} + +static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) { + const 
int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE; + clamp_f32<<>>(x, dst, min, max, k); +} + +template +static void rope_cuda( + const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, + float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream +) { + GGML_V3_ASSERT(ncols % 2 == 0); + const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); + const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); + const dim3 block_nums(nrows, num_blocks_x, 1); + if (pos == nullptr) { + rope<<>>( + x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims + ); + } else { + rope<<>>( + x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims + ); + } +} + +template +static void rope_neox_cuda( + const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, + float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream +) { + GGML_V3_ASSERT(ncols % 2 == 0); + const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); + const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); + const dim3 block_nums(nrows, num_blocks_x, 1); + + const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float inv_ndims = -1.0f / n_dims; + + if (pos == nullptr) { + rope_neox<<>>( + x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, + theta_scale, inv_ndims + ); + } else { + rope_neox<<>>( + x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, + theta_scale, inv_ndims + ); + } +} + +static void rope_glm_f32_cuda( + const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, + float freq_base, int n_ctx, cudaStream_t stream +) { + GGML_V3_ASSERT(ncols % 4 == 0); + const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1); + const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE; + const dim3 block_nums(num_blocks_x, nrows, 1); + rope_glm_f32<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx); +} + +static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, + const int k_rows, const int n_heads_log2_floor, const float m0, + const float m1, cudaStream_t stream) { + const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1); + const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE); + const dim3 block_nums(num_blocks_x, nrows, 1); + alibi_f32<<>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1); +} + +static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + const dim3 block_dims(WARP_SIZE, 1, 1); + const dim3 block_nums(1, nrows, 1); + k_sum_rows_f32<<>>(x, dst, ncols); +} + +static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_v3_sort_order order, cudaStream_t stream) { + // bitonic sort requires ncols to be power of 2 + GGML_V3_ASSERT((ncols & (ncols - 1)) == 0); + + const dim3 block_dims(ncols, 1, 1); + const dim3 block_nums(1, nrows, 1); + if (order == GGML_V3_SORT_ASC) { + k_argsort_f32_i32<<>>(x, dst, ncols); + } else if (order == GGML_V3_SORT_DESC) { + k_argsort_f32_i32<<>>(x, dst, ncols); + } else { + GGML_V3_ASSERT(false); + } +} + +static void diag_mask_inf_f32_cuda(const 
float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) { + const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1); + const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE; + const dim3 block_nums(nrows_x, block_num_x, 1); + diag_mask_inf_f32<<>>(x, dst, ncols_x, rows_per_channel, n_past); +} + +static void soft_max_f16_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) { + int nth = WARP_SIZE; + while (nth < ncols_x/2 && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2; + const dim3 block_dims(nth, 1, 1); + const dim3 block_nums(nrows_x, 1, 1); + const size_t shmem = (GGML_V3_PAD(ncols_x, 2*WARP_SIZE) + WARP_SIZE)*sizeof(half); + static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted."); + if (shmem <= g_device_caps[g_main_device].smpb) { + switch (ncols_x) { + case 32: + soft_max_f16<<>>(x, y, dst, ncols_x, nrows_y, scale); + break; + case 64: + soft_max_f16<<>>(x, y, dst, ncols_x, nrows_y, scale); + break; + case 128: + soft_max_f16<<>>(x, y, dst, ncols_x, nrows_y, scale); + break; + case 256: + soft_max_f16<<>>(x, y, dst, ncols_x, nrows_y, scale); + break; + case 512: + soft_max_f16<<>>(x, y, dst, ncols_x, nrows_y, scale); + break; + case 1024: + soft_max_f16<<>>(x, y, dst, ncols_x, nrows_y, scale); + break; + case 2048: + soft_max_f16<<>>(x, y, dst, ncols_x, nrows_y, scale); + break; + case 4096: + soft_max_f16<<>>(x, y, dst, ncols_x, nrows_y, scale); + break; + default: + soft_max_f16<<>>(x, y, dst, ncols_x, nrows_y, scale); + break; + } + } else { + const size_t shmem_low = WARP_SIZE*sizeof(half); + soft_max_f16<<>>(x, y, dst, ncols_x, nrows_y, scale); + } +} + +static void soft_max_f32_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) { + int nth = WARP_SIZE; + while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2; + const dim3 block_dims(nth, 1, 1); + const dim3 block_nums(nrows_x, 1, 1); + const size_t shmem = (GGML_V3_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float); + static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted."); + if (shmem < g_device_caps[g_main_device].smpb) { + switch (ncols_x) { + case 32: + soft_max_f32<<>>(x, y, dst, ncols_x, nrows_y, scale); + break; + case 64: + soft_max_f32<<>>(x, y, dst, ncols_x, nrows_y, scale); + break; + case 128: + soft_max_f32<<>>(x, y, dst, ncols_x, nrows_y, scale); + break; + case 256: + soft_max_f32<<>>(x, y, dst, ncols_x, nrows_y, scale); + break; + case 512: + soft_max_f32<<>>(x, y, dst, ncols_x, nrows_y, scale); + break; + case 1024: + soft_max_f32<<>>(x, y, dst, ncols_x, nrows_y, scale); + break; + case 2048: + soft_max_f32<<>>(x, y, dst, ncols_x, nrows_y, scale); + break; + case 4096: + soft_max_f32<<>>(x, y, dst, ncols_x, nrows_y, scale); + break; + default: + soft_max_f32<<>>(x, y, dst, ncols_x, nrows_y, scale); + break; + } + } else { + const size_t shmem_low = WARP_SIZE*sizeof(float); + soft_max_f32<<>>(x, y, dst, ncols_x, nrows_y, scale); + } +} + +static void im2col_f32_f16_cuda(const float* x, half* dst, + int IW, int IH, int OW, int OH, int KW, int KH, int IC, + int offset_delta, + int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) { + const int parallel_elements = OW * KW * KH; + const int num_blocks = (parallel_elements + 
CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE; + dim3 block_nums(num_blocks, OH, IC); + im2col_f32_f16<<>>(x, dst, offset_delta, IW, IH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1); +} + +// buffer pool for cuda +#define MAX_CUDA_BUFFERS 256 + +struct scoped_spin_lock { + std::atomic_flag& lock; + scoped_spin_lock(std::atomic_flag& lock) : lock(lock) { + while (lock.test_and_set(std::memory_order_acquire)) { + ; // spin + } + } + ~scoped_spin_lock() { + lock.clear(std::memory_order_release); + } + scoped_spin_lock(const scoped_spin_lock&) = delete; + scoped_spin_lock& operator=(const scoped_spin_lock&) = delete; +}; + +static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT; + +// #define DEBUG_CUDA_MALLOC +struct ggml_v3_cuda_buffer { + void * ptr = nullptr; + size_t size = 0; +}; + +static ggml_v3_cuda_buffer g_cuda_buffer_pool[GGML_V3_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS]; +static size_t g_cuda_pool_size[GGML_V3_CUDA_MAX_DEVICES] = {0}; + +static void * ggml_v3_cuda_pool_malloc_leg(int device, size_t size, size_t * actual_size) { + scoped_spin_lock lock(g_cuda_pool_lock); + + int best_i = -1; + size_t best_size = std::numeric_limits::max(); //smallest unused buffer that fits our needs + int worst_i = -1; + size_t worst_size = 0; //largest unused buffer seen so far + + for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { + ggml_v3_cuda_buffer& b = g_cuda_buffer_pool[device][i]; + if (b.size > 0 && b.size >= size && b.size < best_size) + { + best_i = i; + best_size = b.size; + } + if (b.size > 0 && b.size > worst_size) + { + worst_i = i; + worst_size = b.size; + } + } + if(best_i!=-1) //found the smallest buffer that fits our needs + { + ggml_v3_cuda_buffer& b = g_cuda_buffer_pool[device][best_i]; + void * ptr = b.ptr; + *actual_size = b.size; + b.ptr = nullptr; + b.size = 0; + return ptr; + } + if(worst_i!=-1 && !g_mul_mat_q) //no buffer that fits our needs, resize largest one to save memory (non mmq only) + { + ggml_v3_cuda_buffer& b = g_cuda_buffer_pool[device][worst_i]; + b.size = 0; + void * ptr = b.ptr; + ggml_v3_cuda_set_device(device); + cudaFree(ptr); + g_cuda_pool_size[device] -= size; + b.ptr = ptr = nullptr; + } + void * ptr; + + size_t look_ahead_size = (size_t) (1.05 * size); + look_ahead_size = 256 * ((look_ahead_size + 255)/256); + ggml_v3_cuda_set_device(device); + CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size)); + *actual_size = look_ahead_size; + g_cuda_pool_size[device] += look_ahead_size; + + return ptr; +} + +static void ggml_v3_cuda_pool_free_leg(int device, void * ptr, size_t size) { + scoped_spin_lock lock(g_cuda_pool_lock); + + for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { + ggml_v3_cuda_buffer& b = g_cuda_buffer_pool[device][i]; + if (b.ptr == nullptr) { + b.ptr = ptr; + b.size = size; + return; + } + } + fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n"); + ggml_v3_cuda_set_device(device); + CUDA_CHECK(cudaFree(ptr)); + g_cuda_pool_size[device] -= size; +} + +#if !defined(GGML_USE_HIPBLAS) +// pool with virtual memory +static CUdeviceptr g_cuda_pool_addr[GGML_V3_CUDA_MAX_DEVICES] = {0}; +static size_t g_cuda_pool_used[GGML_V3_CUDA_MAX_DEVICES] = {0}; +static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB + +static void * ggml_v3_cuda_pool_malloc_vmm(int device, size_t size, size_t * actual_size) { + scoped_spin_lock lock(g_cuda_pool_lock); + + // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types + const size_t alignment = 128; + 
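The legacy pool above grows each request by about 5% and rounds it up to a 256-byte multiple so that a slightly larger follow-up request can reuse the same buffer; the VMM pool applies the same round-up idiom to its 128-byte alignment. A standalone sketch of that arithmetic (names are illustrative, not from the patch):

```cuda
// Sketch of the round-up-to-multiple idiom used by both pool allocators.
#include <cstddef>
#include <cstdio>

static size_t toy_round_up(size_t n, size_t multiple) {
    return multiple * ((n + multiple - 1) / multiple);  // smallest multiple of `multiple` that is >= n
}

int main() {
    const size_t request    = 1000;
    const size_t look_ahead = toy_round_up((size_t)(1.05 * request), 256); // legacy pool: 1050 -> 1280
    const size_t aligned    = toy_round_up(request, 128);                  // VMM pool alignment: 1000 -> 1024
    printf("look_ahead=%zu aligned=%zu\n", look_ahead, aligned);
    return 0;
}
```

The `(n + m - 1) / m` form stays in integer arithmetic and never rounds below the requested size.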
size = alignment * ((size + alignment - 1) / alignment); + + size_t avail = g_cuda_pool_size[device] - g_cuda_pool_used[device]; + + if (size > avail) { + // round up to the next multiple of the granularity + size_t reserve_size = size - avail; + const size_t granularity = g_device_caps[device].vmm_granularity; + reserve_size = granularity * ((reserve_size + granularity - 1) / granularity); + + GGML_V3_ASSERT(g_cuda_pool_size[device] + reserve_size <= CUDA_POOL_VMM_MAX_SIZE); + + // allocate more physical memory + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = device; + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0)); + + // reserve virtual address space (if not already reserved) + if (g_cuda_pool_addr[device] == 0) { + CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[device], CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0)); + } + + // map at the end of the pool + CU_CHECK(cuMemMap(g_cuda_pool_addr[device] + g_cuda_pool_size[device], reserve_size, 0, handle, 0)); + + // the memory allocation handle is no longer needed after mapping + CU_CHECK(cuMemRelease(handle)); + + // set access + CUmemAccessDesc access = {}; + access.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + access.location.id = device; + access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + CU_CHECK(cuMemSetAccess(g_cuda_pool_addr[device] + g_cuda_pool_size[device], reserve_size, &access, 1)); + + // add to the pool + g_cuda_pool_size[device] += reserve_size; + + //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n", + // id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024), + // (unsigned long long) (reserve_size/1024/1024)); + } + + GGML_V3_ASSERT(g_cuda_pool_addr[device] != 0); + + void * ptr = (void *) (g_cuda_pool_addr[device] + g_cuda_pool_used[device]); + *actual_size = size; + g_cuda_pool_used[device] += size; + +#ifdef DEBUG_CUDA_MALLOC + printf("cuda pool[%d]: allocated %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr); +#endif + + return ptr; +} + +static void ggml_v3_cuda_pool_free_vmm(int device, void * ptr, size_t size) { + scoped_spin_lock lock(g_cuda_pool_lock); + +#ifdef DEBUG_CUDA_MALLOC + printf("cuda pool[%d]: freed %llu bytes at %llx\n", id, (unsigned long long) size, ptr); +#endif + + g_cuda_pool_used[device] -= size; + + // all deallocations must be in reverse order of the allocations + GGML_V3_ASSERT(ptr == (void *) (g_cuda_pool_addr[device] + g_cuda_pool_used[device])); +} + +static void * ggml_v3_cuda_pool_malloc(int device, size_t size, size_t * actual_size) { + if (g_device_caps[device].vmm) { + return ggml_v3_cuda_pool_malloc_vmm(device, size, actual_size); + } else { + return ggml_v3_cuda_pool_malloc_leg(device, size, actual_size); + } +} + +static void ggml_v3_cuda_pool_free(int device, void * ptr, size_t size) { + if (g_device_caps[device].vmm) { + ggml_v3_cuda_pool_free_vmm(device, ptr, size); + } else { + ggml_v3_cuda_pool_free_leg(device, ptr, size); + } +} +#else +#define ggml_v3_cuda_pool_malloc ggml_v3_cuda_pool_malloc_leg +#define ggml_v3_cuda_pool_free ggml_v3_cuda_pool_free_leg +#endif // !defined(GGML_USE_HIPBLAS) + +template +struct cuda_pool_alloc { + int device = -1; + T * ptr = nullptr; + size_t actual_size = 0; + + // size is in number of elements + T * alloc(size_t size) { + GGML_V3_ASSERT(ptr == nullptr); + CUDA_CHECK(cudaGetDevice(&device)); + ptr = (T *) ggml_v3_cuda_pool_malloc(device, size * sizeof(T), 
&this->actual_size); + return ptr; + } + + cuda_pool_alloc(size_t size) { + alloc(size); + } + + ~cuda_pool_alloc() { + if (ptr != nullptr) { + ggml_v3_cuda_pool_free(device, ptr, actual_size); + } + } + + T * get() { + return ptr; + } + + cuda_pool_alloc() = default; + cuda_pool_alloc(const cuda_pool_alloc &) = delete; + cuda_pool_alloc(cuda_pool_alloc &&) = delete; + cuda_pool_alloc& operator=(const cuda_pool_alloc &) = delete; + cuda_pool_alloc& operator=(cuda_pool_alloc &&) = delete; +}; + +static bool g_cublas_loaded = false; + +bool ggml_v3_cublas_loaded(void) { + return g_cublas_loaded; +} + +void ggml_v3_init_cublas() { + static bool initialized = false; + + if (!initialized) { + +#ifdef __HIP_PLATFORM_AMD__ + // Workaround for a rocBLAS bug when using multiple graphics cards: + // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346 + rocblas_initialize(); + CUDA_CHECK(cudaDeviceSynchronize()); +#endif + + if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) { + initialized = true; + g_cublas_loaded = false; + return; + } + + GGML_V3_ASSERT(g_device_count <= GGML_V3_CUDA_MAX_DEVICES); + int64_t total_vram = 0; + // fprintf(stderr, "%s: GGML_V3_CUDA_FORCE_MMQ: %s\n", __func__,"maybe"); + // fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: %s\n", __func__,"maybe"); + fprintf(stderr, "%s: found %d " GGML_V3_CUDA_NAME " devices:\n", __func__, g_device_count); + for (int id = 0; id < g_device_count; ++id) { + int device_vmm = 0; + +#if !defined(GGML_USE_HIPBLAS) + CUdevice device; + CU_CHECK(cuDeviceGet(&device, id)); + CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device)); + + if (device_vmm) { + CUmemAllocationProp alloc_prop = {}; + alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + alloc_prop.location.id = id; + CU_CHECK(cuMemGetAllocationGranularity(&g_device_caps[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); + } +#endif // !defined(GGML_USE_HIPBLAS) + g_device_caps[id].vmm = !!device_vmm; + + cudaDeviceProp prop; + CUDA_CHECK(cudaGetDeviceProperties(&prop, id)); + fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? 
"yes" : "no"); + + g_tensor_split[id] = total_vram; + total_vram += prop.totalGlobalMem; +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD; +#else + g_device_caps[id].cc = 100*prop.major + 10*prop.minor; +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + g_device_caps[id].smpb = prop.sharedMemPerBlock; + } + for (int id = 0; id < g_device_count; ++id) { + g_tensor_split[id] /= total_vram; + } + + for (int id = 0; id < g_device_count; ++id) { + ggml_v3_cuda_set_device(id); + + // create cuda streams + for (int is = 0; is < MAX_STREAMS; ++is) { + CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking)); + } + + // create cublas handle + CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id])); + CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH)); + } + + // configure logging to stdout + // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr)); + + initialized = true; + g_cublas_loaded = true; + } +} + +void ggml_v3_cuda_set_tensor_split(const float * tensor_split) { + if (tensor_split == nullptr) { + return; + } + bool all_zero = true; + for (int i = 0; i < g_device_count; ++i) { + if (tensor_split[i] != 0.0f) { + all_zero = false; + break; + } + } + if (all_zero) { + return; + } + float split_sum = 0.0f; + for (int i = 0; i < g_device_count; ++i) { + g_tensor_split[i] = split_sum; + split_sum += tensor_split[i]; + } + for (int i = 0; i < g_device_count; ++i) { + g_tensor_split[i] /= split_sum; + } +} + +void * ggml_v3_cuda_host_malloc(size_t size) { + if (getenv("GGML_V3_CUDA_NO_PINNED") != nullptr) { + return nullptr; + } + + void * ptr = nullptr; + cudaError_t err = cudaMallocHost((void **) &ptr, size); + if (err != cudaSuccess) { + // clear the error + cudaGetLastError(); + fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n", + size/1024.0/1024.0, cudaGetErrorString(err)); + return nullptr; + } + + return ptr; +} + +void ggml_v3_cuda_host_free(void * ptr) { + CUDA_CHECK(cudaFreeHost(ptr)); +} + +static cudaError_t ggml_v3_cuda_cpy_tensor_2d( + void * dst, const struct ggml_v3_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) { + + cudaMemcpyKind kind; + char * src_ptr; + if (src->backend == GGML_V3_BACKEND_CPU) { + kind = cudaMemcpyHostToDevice; + src_ptr = (char *) src->data; + } else if (src->backend == GGML_V3_BACKEND_GPU || src->backend == GGML_V3_BACKEND_GPU_SPLIT) { + GGML_V3_ASSERT(src->backend != GGML_V3_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1])); + kind = cudaMemcpyDeviceToDevice; + ggml_v3_tensor_extra_gpu * extra = (ggml_v3_tensor_extra_gpu *) src->extra; + int id; + CUDA_CHECK(cudaGetDevice(&id)); + src_ptr = (char *) extra->data_device[id]; + } else { + GGML_V3_ASSERT(false); + } + char * dst_ptr = (char *) dst; + + const int64_t ne0 = src->ne[0]; + const int64_t nb0 = src->nb[0]; + const int64_t nb1 = src->nb[1]; + const int64_t nb2 = src->nb[2]; + const int64_t nb3 = src->nb[3]; + const enum ggml_v3_type type = src->type; + const int64_t ts = ggml_v3_type_size(type); + const int64_t bs = ggml_v3_blck_size(type); + int64_t i1_diff = i1_high - i1_low; + + const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3; + if (nb0 == ts && nb1 == ts*ne0/bs) { + return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream); + } else if (nb0 == ts) { + return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream); + } else 
{ + for (int64_t i1 = 0; i1 < i1_diff; i1++) { + const void * rx = (const void *) ((const char *) x + i1*nb1); + void * rd = (void *) (dst_ptr + i1*ts*ne0/bs); + // pretend the row is a matrix with cols=1 + cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream); + if (r != cudaSuccess) return r; + } + return cudaSuccess; + } +} + +static void ggml_v3_cuda_op_get_rows( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const float * src0_d, const float * src1_d, float * dst_d, cudaStream_t stream) { + + GGML_V3_ASSERT(src1->type == GGML_V3_TYPE_I32); + GGML_V3_ASSERT(dst->type == GGML_V3_TYPE_F32); + + GGML_V3_ASSERT(src0->nb[0] == ggml_v3_type_size(src0->type)); + GGML_V3_ASSERT(src1->nb[0] == ggml_v3_type_size(src1->type)); + GGML_V3_ASSERT(dst->nb[0] == ggml_v3_type_size(dst->type)); + + const int32_t * src1_i32 = (const int32_t *) src1_d; + + switch (src0->type) { + case GGML_V3_TYPE_F16: + get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream); + break; + case GGML_V3_TYPE_F32: + get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + break; + case GGML_V3_TYPE_Q4_0: + get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + break; + case GGML_V3_TYPE_Q4_1: + get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + break; + case GGML_V3_TYPE_Q5_0: + get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + break; + case GGML_V3_TYPE_Q5_1: + get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + break; + case GGML_V3_TYPE_Q8_0: + get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + break; + default: + // TODO: k-quants + fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_v3_type_name(src0->type)); + GGML_V3_ASSERT(false); + break; + } +} + +template +static void ggml_v3_cuda_op_bin_bcast( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + GGML_V3_ASSERT(src1->type == GGML_V3_TYPE_F32); + + if (src0->type == GGML_V3_TYPE_F32 && dst->type == GGML_V3_TYPE_F32) { + op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); + } else if (src0->type == GGML_V3_TYPE_F16 && dst->type == GGML_V3_TYPE_F16) { + op()(src0, src1, dst, (const half *) src0_dd, src1_dd, (half *) dst_dd, main_stream); + } else if (src0->type == GGML_V3_TYPE_F16 && dst->type == GGML_V3_TYPE_F32) { + op()(src0, src1, dst, (const half *) src0_dd, src1_dd, dst_dd, main_stream); + } else { + fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, + ggml_v3_type_name(dst->type), ggml_v3_type_name(src0->type), ggml_v3_type_name(src1->type)); + GGML_V3_ASSERT(false); + } +} + +static void ggml_v3_cuda_op_repeat( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const float * src0_d, const float * src1_d, float * dst_d, cudaStream_t main_stream) { + + ggml_v3_cuda_op_bin_bcast>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream); + + (void) src1; + (void) src1_d; +} + +static void ggml_v3_cuda_op_add( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + ggml_v3_cuda_op_bin_bcast>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); +} + +static void ggml_v3_cuda_op_acc( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, 
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT(src1->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT( dst->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT(dst->ne[3] == 1); // just 3D tensors supported + + int nb1 = dst->op_params[0] / 4; // 4 bytes of float32 + int nb2 = dst->op_params[1] / 4; // 4 bytes of float32 + // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused + int offset = dst->op_params[3] / 4; // offset in bytes + + acc_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_v3_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream); + + (void) dst; +} + +static void ggml_v3_cuda_op_mul( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + ggml_v3_cuda_op_bin_bcast>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); +} + +static void ggml_v3_cuda_op_div( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + ggml_v3_cuda_op_bin_bcast>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); +} + +static void ggml_v3_cuda_op_gelu( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT( dst->type == GGML_V3_TYPE_F32); + + gelu_f32_cuda(src0_dd, dst_dd, ggml_v3_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +static void ggml_v3_cuda_op_silu( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT( dst->type == GGML_V3_TYPE_F32); + + silu_f32_cuda(src0_dd, dst_dd, ggml_v3_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +static void ggml_v3_cuda_op_gelu_quick( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT( dst->type == GGML_V3_TYPE_F32); + + gelu_quick_f32_cuda(src0_dd, dst_dd, ggml_v3_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +static void ggml_v3_cuda_op_tanh( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT( dst->type == GGML_V3_TYPE_F32); + + tanh_f32_cuda(src0_dd, dst_dd, ggml_v3_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +static void ggml_v3_cuda_op_relu( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT( dst->type == GGML_V3_TYPE_F32); + + relu_f32_cuda(src0_dd, dst_dd, ggml_v3_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +static void 
ggml_v3_cuda_op_leaky_relu( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT( dst->type == GGML_V3_TYPE_F32); + + float negative_slope; + memcpy(&negative_slope, dst->op_params, sizeof(float)); + + leaky_relu_f32_cuda(src0_dd, dst_dd, ggml_v3_nelements(src0), negative_slope, main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +static void ggml_v3_cuda_op_sqr( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT( dst->type == GGML_V3_TYPE_F32); + + sqr_f32_cuda(src0_dd, dst_dd, ggml_v3_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +static void ggml_v3_cuda_op_norm( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT( dst->type == GGML_V3_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t nrows = ggml_v3_nrows(src0); + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +static void ggml_v3_cuda_op_group_norm( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT( dst->type == GGML_V3_TYPE_F32); + + int num_groups = dst->op_params[0]; + int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups); + group_norm_f32_cuda(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +static void ggml_v3_cuda_op_concat( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT(src1->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT(dst->type == GGML_V3_TYPE_F32); + + for (int i3 = 0; i3 < dst->ne[3]; i3++) { + concat_f32_cuda(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream); + } + + (void) src1; + (void) dst; +} + +static void ggml_v3_cuda_op_upscale( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT(dst->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors + + const int scale_factor = dst->op_params[0]; + + upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +static void ggml_v3_cuda_op_pad( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const float * src0_dd, const float * 
src1_dd, float * dst_dd, cudaStream_t main_stream) { + + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT(dst->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors + + pad_f32_cuda(src0_dd, dst_dd, + src0->ne[0], src0->ne[1], src0->ne[2], + dst->ne[0], dst->ne[1], dst->ne[2], main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +static void ggml_v3_cuda_op_rms_norm( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT( dst->type == GGML_V3_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t nrows = ggml_v3_nrows(src0); + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +static void ggml_v3_cuda_op_mul_mat_q( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, cudaStream_t stream) { + + const int64_t ne00 = src0->ne[0]; + + const int64_t ne10 = src1->ne[0]; + GGML_V3_ASSERT(ne10 % QK8_1 == 0); + + const int64_t ne0 = dst->ne[0]; + + const int64_t row_diff = row_high - row_low; + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + + // the main device has a larger memory buffer to hold the results from all GPUs + // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into + const int64_t nrows_dst = dst->backend == GGML_V3_BACKEND_GPU && id == g_main_device ? 
ne0 : row_diff; + + switch (src0->type) { + case GGML_V3_TYPE_Q4_0: + ggml_v3_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_V3_TYPE_Q4_1: + ggml_v3_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_V3_TYPE_Q5_0: + ggml_v3_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_V3_TYPE_Q5_1: + ggml_v3_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_V3_TYPE_Q8_0: + ggml_v3_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_V3_TYPE_Q2_K: + ggml_v3_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_V3_TYPE_Q3_K: + ggml_v3_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_V3_TYPE_Q4_K: + ggml_v3_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_V3_TYPE_Q5_K: + ggml_v3_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_V3_TYPE_Q6_K: + ggml_v3_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + default: + GGML_V3_ASSERT(false); + break; + } + + (void) src1; + (void) dst; + (void) src1_ddf_i; +} + +static int64_t get_row_rounding(ggml_v3_type type) { + int64_t min_compute_capability = INT_MAX; + int64_t max_compute_capability = INT_MIN; + for (int id = 0; id < g_device_count; ++id) { + if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) { + if (min_compute_capability > g_device_caps[id].cc) { + min_compute_capability = g_device_caps[id].cc; + } + if (max_compute_capability < g_device_caps[id].cc) { + max_compute_capability = g_device_caps[id].cc; + } + } + } + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + switch(type) { + case GGML_V3_TYPE_Q4_0: + case GGML_V3_TYPE_Q4_1: + case GGML_V3_TYPE_Q5_0: + case GGML_V3_TYPE_Q5_1: + case GGML_V3_TYPE_Q8_0: + return max_compute_capability >= CC_RDNA2 ? 128 : 64; + case GGML_V3_TYPE_F16: + case GGML_V3_TYPE_F32: + return 1; + case GGML_V3_TYPE_Q2_K: + return max_compute_capability >= CC_RDNA2 ? 128 : 32; + case GGML_V3_TYPE_Q3_K: + return min_compute_capability < CC_RDNA2 ? 128 : 64; + case GGML_V3_TYPE_Q4_K: + case GGML_V3_TYPE_Q5_K: + case GGML_V3_TYPE_Q6_K: + case GGML_V3_TYPE_IQ2_XXS: + case GGML_V3_TYPE_IQ2_XS: + return max_compute_capability >= CC_RDNA2 ? 128 : 64; + default: + GGML_V3_ASSERT(false); + } +#else + switch(type) { + case GGML_V3_TYPE_Q4_0: + case GGML_V3_TYPE_Q4_1: + return max_compute_capability >= CC_VOLTA ? 
128 : 64; + case GGML_V3_TYPE_Q5_0: + case GGML_V3_TYPE_Q5_1: + case GGML_V3_TYPE_Q8_0: + return 64; + case GGML_V3_TYPE_F16: + case GGML_V3_TYPE_F32: + return 1; + case GGML_V3_TYPE_Q2_K: + case GGML_V3_TYPE_Q3_K: + case GGML_V3_TYPE_Q4_K: + case GGML_V3_TYPE_Q5_K: + case GGML_V3_TYPE_IQ2_XXS: + case GGML_V3_TYPE_IQ2_XS: + return max_compute_capability >= CC_VOLTA ? 128 : 64; + case GGML_V3_TYPE_Q6_K: + return 64; + default: + GGML_V3_ASSERT(false); + } +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +} + +static void ggml_v3_cuda_op_mul_mat_vec_q( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, cudaStream_t stream) { + + GGML_V3_ASSERT(ggml_v3_nrows(src1) == 1); + + const int64_t ne00 = src0->ne[0]; + const int64_t row_diff = row_high - row_low; + + switch (src0->type) { + case GGML_V3_TYPE_Q4_0: + mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_V3_TYPE_Q4_1: + mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_V3_TYPE_Q5_0: + mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_V3_TYPE_Q5_1: + mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_V3_TYPE_Q8_0: + mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_V3_TYPE_Q2_K: + mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_V3_TYPE_Q3_K: + mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_V3_TYPE_Q4_K: + mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_V3_TYPE_Q5_K: + mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_V3_TYPE_Q6_K: + mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_V3_TYPE_IQ2_XXS: + mul_mat_vec_iq2_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_V3_TYPE_IQ2_XS: + mul_mat_vec_iq2_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + default: + GGML_V3_ASSERT(false); + break; + } + + (void) src1; + (void) dst; + (void) src1_ddf_i; + (void) src1_ncols; + (void) src1_padded_row_size; +} + +static void ggml_v3_cuda_op_dequantize_mul_mat_vec( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, cudaStream_t stream) { + + const int64_t ne00 = src0->ne[0]; + const int64_t row_diff = row_high - row_low; + + GGML_V3_ASSERT(src1->type == GGML_V3_TYPE_F32); + + // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics +#ifdef GGML_V3_CUDA_F16 + cuda_pool_alloc<half> src1_dfloat_a; + half * src1_dfloat = nullptr; // dfloat == half + + bool src1_convert_f16 = + src0->type == GGML_V3_TYPE_Q4_0 || src0->type == GGML_V3_TYPE_Q4_1 || + src0->type == GGML_V3_TYPE_Q5_0 || src0->type == GGML_V3_TYPE_Q5_1 || +
src0->type == GGML_V3_TYPE_Q8_0 || src0->type == GGML_V3_TYPE_F16; + + if (src1_convert_f16) { + src1_dfloat = src1_dfloat_a.alloc(ne00); + ggml_v3_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00, + ne00, 1, sizeof(float), 0, 0, + ne00, 1, sizeof(half), 0, 0, stream); + } +#else + const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion +#endif // GGML_V3_CUDA_F16 + + switch (src0->type) { + case GGML_V3_TYPE_Q4_0: + dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_V3_TYPE_Q4_1: + dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_V3_TYPE_Q5_0: + dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_V3_TYPE_Q5_1: + dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_V3_TYPE_Q8_0: + dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_V3_TYPE_Q2_K: + dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_V3_TYPE_Q3_K: + dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_V3_TYPE_Q4_K: + dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_V3_TYPE_Q5_K: + dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_V3_TYPE_Q6_K: + dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_V3_TYPE_F16: + convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + default: + GGML_V3_ASSERT(false); + break; + } + + (void) src1; + (void) dst; + (void) src1_ddq_i; + (void) src1_ncols; + (void) src1_padded_row_size; +} + +static void ggml_v3_cuda_op_mul_mat_cublas( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, cudaStream_t stream) { + + GGML_V3_ASSERT(src0_dd_i != nullptr); + GGML_V3_ASSERT(src1_ddf_i != nullptr); + GGML_V3_ASSERT(dst_dd_i != nullptr); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = dst->ne[0]; + + const int64_t row_diff = row_high - row_low; + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + + // the main device has a larger memory buffer to hold the results from all GPUs + // ldc == nrows of the matrix that cuBLAS writes into + int ldc = dst->backend == GGML_V3_BACKEND_GPU && id == g_main_device ? 
ne0 : row_diff; + + const int compute_capability = g_device_caps[id].cc; + + if (compute_capability >= CC_VOLTA && (src0->type == GGML_V3_TYPE_F16 || ggml_v3_is_quantized(src0->type)) && ggml_v3_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_V3_PREC_DEFAULT) { + //printf("this branch\n"); + // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32 + cuda_pool_alloc<half> src0_as_f16; + if (src0->type != GGML_V3_TYPE_F16) { + const to_fp16_cuda_t to_fp16_cuda = ggml_v3_get_to_fp16_cuda(src0->type); + GGML_V3_ASSERT(to_fp16_cuda != nullptr); + size_t ne = row_diff*ne00; + src0_as_f16.alloc(ne); + to_fp16_cuda(src0_dd_i, src0_as_f16.get(), ne, stream); + } + const half * src0_ptr = src0->type == GGML_V3_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get(); + + cuda_pool_alloc<half> src1_as_f16; + if (src1->type != GGML_V3_TYPE_F16) { + const to_fp16_cuda_t to_fp16_cuda = ggml_v3_get_to_fp16_cuda(src1->type); + GGML_V3_ASSERT(to_fp16_cuda != nullptr); + size_t ne = src1_ncols*ne10; + src1_as_f16.alloc(ne); + to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream); + } + const half * src1_ptr = src1->type == GGML_V3_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get(); + cuda_pool_alloc<half> dst_f16(row_diff*src1_ncols); + + const half alpha_f16 = 1.0f; + const half beta_f16 = 0.0f; + + CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream)); + CUBLAS_CHECK( + cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, + row_diff, src1_ncols, ne10, + &alpha_f16, src0_ptr, CUDA_R_16F, ne00, + src1_ptr, CUDA_R_16F, ne10, + &beta_f16, dst_f16.get(), CUDA_R_16F, ldc, + CUBLAS_COMPUTE_16F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + const to_fp32_cuda_t to_fp32_cuda = ggml_v3_get_to_fp32_cuda(GGML_V3_TYPE_F16); + to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream); + } else { + cuda_pool_alloc<float> src0_ddq_as_f32; + cuda_pool_alloc<float> src1_ddq_as_f32; + + if (src0->type != GGML_V3_TYPE_F32) { + const to_fp32_cuda_t to_fp32_cuda = ggml_v3_get_to_fp32_cuda(src0->type); + GGML_V3_ASSERT(to_fp32_cuda != nullptr); + src0_ddq_as_f32.alloc(row_diff*ne00); + to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream); + } + if (src1->type != GGML_V3_TYPE_F32) { + const to_fp32_cuda_t to_fp32_cuda = ggml_v3_get_to_fp32_cuda(src1->type); + GGML_V3_ASSERT(to_fp32_cuda != nullptr); + src1_ddq_as_f32.alloc(src1_ncols*ne10); + to_fp32_cuda(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream); + } + + const float * src0_ddf_i = src0->type == GGML_V3_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get(); + const float * src1_ddf1_i = src1->type == GGML_V3_TYPE_F32 ?
(const float *) src1_ddf_i : src1_ddq_as_f32.get(); + + const float alpha = 1.0f; + const float beta = 0.0f; + + CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream)); + CUBLAS_CHECK( + cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, + row_diff, src1_ncols, ne10, + &alpha, src0_ddf_i, ne00, + src1_ddf1_i, ne10, + &beta, dst_dd_i, ldc)); + } + + (void) dst; + (void) src1_ddq_i; + (void) src1_padded_row_size; +} + +static void ggml_v3_cuda_op_rope( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F32 || src0->type == GGML_V3_TYPE_F16); + GGML_V3_ASSERT( dst->type == GGML_V3_TYPE_F32 || dst->type == GGML_V3_TYPE_F16); + GGML_V3_ASSERT(src0->type == dst->type); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t nrows = ggml_v3_nrows(src0); + + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + + // RoPE alteration for extended context + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + + const int32_t * pos = nullptr; + if ((mode & 1) == 0) { + GGML_V3_ASSERT(src1->type == GGML_V3_TYPE_I32); + GGML_V3_ASSERT(src1->ne[0] == ne2); + pos = (const int32_t *) src1_dd; + } + + const bool is_neox = mode & 2; + const bool is_glm = mode & 4; + + rope_corr_dims corr_dims; + ggml_v3_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v); + + // compute + if (is_glm) { + GGML_V3_ASSERT(false); + rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream); + } else if (is_neox) { + if (src0->type == GGML_V3_TYPE_F32) { + rope_neox_cuda( + (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, main_stream + ); + } else if (src0->type == GGML_V3_TYPE_F16) { + rope_neox_cuda( + (const half *)src0_dd, (half *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, main_stream + ); + } else { + GGML_V3_ASSERT(false); + } + } else { + if (src0->type == GGML_V3_TYPE_F32) { + rope_cuda( + (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, main_stream + ); + } else if (src0->type == GGML_V3_TYPE_F16) { + rope_cuda( + (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, main_stream + ); + } else { + GGML_V3_ASSERT(false); + } + } + + (void) src1; + (void) dst; + (void) src1_dd; +} + +static void ggml_v3_cuda_op_alibi( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, 
cudaStream_t main_stream) { + + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT( dst->type == GGML_V3_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t nrows = ggml_v3_nrows(src0); + + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_head = ((int32_t *) dst->op_params)[1]; + float max_bias; + memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); + + //GGML_V3_ASSERT(ne01 + n_past == ne00); + GGML_V3_ASSERT(n_head == ne02); + + const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); + + const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); + + alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream); + + (void) src1; + (void) src1_dd; +} + +static void ggml_v3_cuda_op_im2col( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F16); + GGML_V3_ASSERT(src1->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT( dst->type == GGML_V3_TYPE_F16); + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; + + const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1; + + const int64_t IC = src1->ne[is_2D ? 2 : 1]; + const int64_t IH = is_2D ? src1->ne[1] : 1; + const int64_t IW = src1->ne[0]; + + const int64_t KH = is_2D ? src0->ne[1] : 1; + const int64_t KW = src0->ne[0]; + + const int64_t OH = is_2D ? dst->ne[2] : 1; + const int64_t OW = dst->ne[1]; + + const size_t delta_offset = src1->nb[is_2D ? 
2 : 1] / 4; // nb is byte offset, src is type float32 + + im2col_f32_f16_cuda(src1_dd, (half*) dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream); + + (void) src0; + (void) src0_dd; +} + +static void ggml_v3_cuda_op_sum_rows( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT( dst->type == GGML_V3_TYPE_F32); + + const int64_t ncols = src0->ne[0]; + const int64_t nrows = ggml_v3_nrows(src0); + + sum_rows_f32_cuda(src0_dd, dst_dd, ncols, nrows, main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +static void ggml_v3_cuda_op_argsort( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT( dst->type == GGML_V3_TYPE_I32); + + const int64_t ncols = src0->ne[0]; + const int64_t nrows = ggml_v3_nrows(src0); + + enum ggml_v3_sort_order order = (enum ggml_v3_sort_order) dst->op_params[0]; + + argsort_f32_i32_cuda(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +static void ggml_v3_cuda_op_diag_mask_inf( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT( dst->type == GGML_V3_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int nrows0 = ggml_v3_nrows(src0); + + const int n_past = ((int32_t *) dst->op_params)[0]; + + diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +static void ggml_v3_cuda_op_soft_max( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT( dst->type == GGML_V3_TYPE_F32); + + GGML_V3_ASSERT(!src1 || src1->type == GGML_V3_TYPE_F32); // src1 contains mask and it is optional + + const int64_t ne00 = src0->ne[0]; + const int64_t nrows_x = ggml_v3_nrows(src0); + const int64_t nrows_y = src1 ? ggml_v3_nrows(src1) : 1; + + float scale = 1.0f; + memcpy(&scale, dst->op_params, sizeof(float)); + +#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION >= CUDART_HMAX +#ifdef GGML_V3_CUDA_F16 + const bool use_f16_soft_max = true; +#else + const bool use_f16_soft_max = false; +#endif // GGML_V3_CUDA_F16 +#else + const bool use_f16_soft_max = false; +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && CUDART_VERSION >= CUDART_HMAX + + if (use_f16_soft_max) { + soft_max_f16_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream); + } else { + soft_max_f32_cuda(src0_dd, src1 ? 
src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream); + } + + (void) dst; +} + +static void ggml_v3_cuda_op_scale( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT( dst->type == GGML_V3_TYPE_F32); + + float scale; + memcpy(&scale, dst->op_params, sizeof(float)); + + scale_f32_cuda(src0_dd, dst_dd, scale, ggml_v3_nelements(src0), main_stream); + CUDA_CHECK(cudaGetLastError()); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +static void ggml_v3_cuda_op_clamp( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT( dst->type == GGML_V3_TYPE_F32); + + float min; + float max; + memcpy(&min, dst->op_params, sizeof(float)); + memcpy(&max, (float *) dst->op_params + 1, sizeof(float)); + + clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_v3_nelements(src0), main_stream); + CUDA_CHECK(cudaGetLastError()); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +static void ggml_v3_cuda_op_flatten(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, const ggml_v3_cuda_op_flatten_t op) { + const int64_t nrows0 = ggml_v3_nrows(src0); + + const bool use_src1 = src1 != nullptr; + const int64_t nrows1 = use_src1 ? ggml_v3_nrows(src1) : 1; + + GGML_V3_ASSERT(!use_src1 || src1->backend != GGML_V3_BACKEND_GPU_SPLIT); + GGML_V3_ASSERT( dst->backend != GGML_V3_BACKEND_GPU_SPLIT); + + ggml_v3_tensor_extra_gpu * src0_extra = (ggml_v3_tensor_extra_gpu *) src0->extra; + ggml_v3_tensor_extra_gpu * src1_extra = use_src1 ? 
(ggml_v3_tensor_extra_gpu *) src1->extra : nullptr; + ggml_v3_tensor_extra_gpu * dst_extra = (ggml_v3_tensor_extra_gpu *) dst->extra; + + const bool src0_on_device = src0->backend == GGML_V3_BACKEND_GPU || src0->backend == GGML_V3_BACKEND_GPU_SPLIT; + const bool src1_on_device = use_src1 && src1->backend == GGML_V3_BACKEND_GPU; + const bool dst_on_device = dst->backend == GGML_V3_BACKEND_GPU; + + // dd = data device + float * src0_ddf = nullptr; + float * src1_ddf = nullptr; + float * dst_ddf = nullptr; + + cuda_pool_alloc<float> src0_f; + cuda_pool_alloc<float> src1_f; + cuda_pool_alloc<float> dst_f; + + ggml_v3_cuda_set_device(g_main_device); + cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; + + if (src0_on_device) { + src0_ddf = (float *) src0_extra->data_device[g_main_device]; + } else { + src0_ddf = src0_f.alloc(ggml_v3_nelements(src0)); + CUDA_CHECK(ggml_v3_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream)); + } + + if (use_src1) { + if (src1_on_device) { + src1_ddf = (float *) src1_extra->data_device[g_main_device]; + } else { + src1_ddf = src1_f.alloc(ggml_v3_nelements(src1)); + CUDA_CHECK(ggml_v3_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream)); + } + } + if (dst_on_device) { + dst_ddf = (float *) dst_extra->data_device[g_main_device]; + } else { + dst_ddf = dst_f.alloc(ggml_v3_nelements(dst)); + } + + // do the computation + op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream); + CUDA_CHECK(cudaGetLastError()); + + // copy dst to host if necessary + if (!dst_on_device) { + CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_v3_nbytes(dst), cudaMemcpyDeviceToHost, main_stream)); + } + + if (dst->backend == GGML_V3_BACKEND_CPU) { + CUDA_CHECK(cudaDeviceSynchronize()); + } +} + +static void ggml_v3_cuda_set_peer_access(const int n_tokens) { + static bool peer_access_enabled = false; + + const bool enable_peer_access = n_tokens <= GGML_V3_CUDA_PEER_MAX_BATCH_SIZE; + + if (peer_access_enabled == enable_peer_access) { + return; + } + +#ifdef NDEBUG + for (int id = 0; id < g_device_count; ++id) { + ggml_v3_cuda_set_device(id); + CUDA_CHECK(cudaDeviceSynchronize()); + } + + for (int id = 0; id < g_device_count; ++id) { + ggml_v3_cuda_set_device(id); + + for (int id_other = 0; id_other < g_device_count; ++id_other) { + if (id == id_other) { + continue; + } + if (id != g_main_device && id_other != g_main_device) { + continue; + } + + int can_access_peer; + CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other)); + if (can_access_peer) { + if (enable_peer_access) { + CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0)); + } else { + CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other)); + } + } + } + } +#endif // NDEBUG + + peer_access_enabled = enable_peer_access; +} + +static void ggml_v3_cuda_op_mul_mat( + const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, ggml_v3_cuda_op_mul_mat_t op, + const bool convert_src1_to_q8_1) { + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + const int64_t nrows1 = ggml_v3_nrows(src1); + + GGML_V3_ASSERT(ne03 == ne13); + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + GGML_V3_ASSERT(dst->backend != GGML_V3_BACKEND_GPU_SPLIT); + GGML_V3_ASSERT(src1->backend !=
GGML_V3_BACKEND_GPU_SPLIT); + GGML_V3_ASSERT(src1->type == GGML_V3_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1)); + + GGML_V3_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0); + + const int64_t i02_divisor = ne12 / ne02; + + const size_t src0_ts = ggml_v3_type_size(src0->type); + const size_t src0_bs = ggml_v3_blck_size(src0->type); + const size_t q8_1_ts = sizeof(block_q8_1); + const size_t q8_1_bs = QK8_1; + + ggml_v3_tensor_extra_gpu * src0_extra = (ggml_v3_tensor_extra_gpu *) src0->extra; + ggml_v3_tensor_extra_gpu * src1_extra = (ggml_v3_tensor_extra_gpu *) src1->extra; + ggml_v3_tensor_extra_gpu * dst_extra = (ggml_v3_tensor_extra_gpu *) dst->extra; + + const bool src0_on_device = src0->backend == GGML_V3_BACKEND_GPU || src0->backend == GGML_V3_BACKEND_GPU_SPLIT; + const bool src0_is_contiguous = ggml_v3_is_contiguous(src0); + const bool src1_is_contiguous = ggml_v3_is_contiguous(src1); + + const int64_t src1_padded_col_size = GGML_V3_PAD(ne10, MATRIX_ROW_PADDING); + + const bool split = src0->backend == GGML_V3_BACKEND_GPU_SPLIT; + GGML_V3_ASSERT(!(split && ne02 > 1)); + GGML_V3_ASSERT(!(split && ne03 > 1)); + GGML_V3_ASSERT(!(split && ne02 < ne12)); + + struct dev_data { + cuda_pool_alloc<char> src0_dd_alloc; + cuda_pool_alloc<float> src1_ddf_alloc; + cuda_pool_alloc<char> src1_ddq_alloc; + cuda_pool_alloc<float> dst_dd_alloc; + + char * src0_dd = nullptr; + float * src1_ddf = nullptr; // float + char * src1_ddq = nullptr; // q8_1 + float * dst_dd = nullptr; + + int64_t row_low; + int64_t row_high; + }; + + dev_data dev[GGML_V3_CUDA_MAX_DEVICES]; + + int used_devices = 0; + + for (int id = 0; id < g_device_count; ++id) { + // by default, use all rows + dev[id].row_low = 0; + dev[id].row_high = ne01; + + // for multi GPU, get the row boundaries from tensor split + // and round to mul_mat_q tile sizes + if (split) { + const int64_t rounding = get_row_rounding(src0->type); + + if (id != 0) { + dev[id].row_low = ne01*g_tensor_split[id]; + if (dev[id].row_low < ne01) { + dev[id].row_low -= dev[id].row_low % rounding; + } + } + + if (id != g_device_count - 1) { + dev[id].row_high = ne01*g_tensor_split[id + 1]; + if (dev[id].row_high < ne01) { + dev[id].row_high -= dev[id].row_high % rounding; + } + } + } + } + + for (int id = 0; id < g_device_count; ++id) { + if ((!split && id != g_main_device) || dev[id].row_low == dev[id].row_high) { + continue; + } + + used_devices++; + + const bool src1_on_device = src1->backend == GGML_V3_BACKEND_GPU && id == g_main_device; + const bool dst_on_device = dst->backend == GGML_V3_BACKEND_GPU && id == g_main_device; + + ggml_v3_cuda_set_device(id); + cudaStream_t stream = g_cudaStreams[id][0]; + + if (src0_on_device && src0_is_contiguous) { + dev[id].src0_dd = (char *) src0_extra->data_device[id]; + } else { + dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ggml_v3_nbytes(src0)); + } + + if (src1_on_device && src1_is_contiguous) { + dev[id].src1_ddf = (float *) src1_extra->data_device[id]; + } else { + dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ggml_v3_nelements(src1)); + } + + if (convert_src1_to_q8_1) { + dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs); + + if (src1_on_device && src1_is_contiguous) { + quantize_row_q8_1_cuda(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream); + CUDA_CHECK(cudaGetLastError()); + } + } + + if (dst_on_device) { + dev[id].dst_dd = (float *) dst_extra->data_device[id]; + } else { + const size_t size_dst_ddf = split ?
(dev[id].row_high - dev[id].row_low)*ne1 : ggml_v3_nelements(dst); + dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(size_dst_ddf); + } + } + + // if multiple devices are used they need to wait for the main device + // here an event is recorded that signals that the main device has finished calculating the input data + if (split && used_devices > 1) { + ggml_v3_cuda_set_device(g_main_device); + CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0])); + } + + const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11; + for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) { + const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0; + const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride; + + for (int id = 0; id < g_device_count; ++id) { + if ((!split && id != g_main_device) || dev[id].row_low == dev[id].row_high) { + continue; + } + + const bool src1_on_device = src1->backend == GGML_V3_BACKEND_GPU && id == g_main_device; + const bool dst_on_device = dst->backend == GGML_V3_BACKEND_GPU && id == g_main_device; + const int64_t row_diff = dev[id].row_high - dev[id].row_low; + + ggml_v3_cuda_set_device(id); + cudaStream_t stream = g_cudaStreams[id][is]; + + // wait for main GPU data if necessary + if (split && (id != g_main_device || is != 0)) { + CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0], 0)); + } + + for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) { + const int64_t i03 = i0 / ne12; + const int64_t i02 = i0 % ne12; + + const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs; + + // for split tensors the data begins at i0 == i0_offset_low + char * src0_dd_i = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs; + float * src1_ddf_i = dev[id].src1_ddf + (i0*ne11 + src1_col_0) * ne10; + char * src1_ddq_i = dev[id].src1_ddq + src1_ddq_i_offset; + float * dst_dd_i = dev[id].dst_dd + (i0*ne1 + src1_col_0) * (dst_on_device ? 
ne0 : row_diff); + + // the main device memory buffer can be on VRAM scratch, with space for all partial results + // in that case an offset on dst_ddf_i is needed + if (dst->backend == GGML_V3_BACKEND_GPU && id == g_main_device) { + dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split + } + + // copy src0, src1 to device if necessary + if (src1->backend == GGML_V3_BACKEND_GPU && src1_is_contiguous) { + if (id != g_main_device) { + if (convert_src1_to_q8_1) { + char * src1_ddq_i_source = dev[g_main_device].src1_ddq + src1_ddq_i_offset; + CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddq_i, id, src1_ddq_i_source, g_main_device, + src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream)); + } else { + float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device]; + src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10; + CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddf_i, id, src1_ddf_i_source, g_main_device, + src1_ncols*ne10*sizeof(float), stream)); + } + } + } else if (src1->backend == GGML_V3_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) { + CUDA_CHECK(ggml_v3_cuda_cpy_tensor_2d( + src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream)); + } else { + GGML_V3_ASSERT(false); + } + + if (convert_src1_to_q8_1 && (src1->backend == GGML_V3_BACKEND_CPU || !src1_is_contiguous)) { + quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream); + CUDA_CHECK(cudaGetLastError()); + } + + if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) { + CUDA_CHECK(ggml_v3_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream)); + } + + // do the computation + op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i, + dev[id].row_low, dev[id].row_high, src1_ncols, src1_padded_col_size, stream); + CUDA_CHECK(cudaGetLastError()); + + // copy dst to host or other device if necessary + if (!dst_on_device) { + void * dst_off_device; + cudaMemcpyKind kind; + if (dst->backend == GGML_V3_BACKEND_CPU) { + dst_off_device = dst->data; + kind = cudaMemcpyDeviceToHost; + } else if (dst->backend == GGML_V3_BACKEND_GPU) { + dst_off_device = dst_extra->data_device[g_main_device]; + kind = cudaMemcpyDeviceToDevice; + } else { + GGML_V3_ASSERT(false); + } + if (split) { + // src0 = weight matrix is saved as a transposed matrix for better memory layout. + // dst is NOT transposed. + // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU. + // Instead they need to be copied to the correct slice in ne0 = dst row index. + // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results. 
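+ // Concretely: this device computed the slice [row_low, row_high) of dst along ne0 for the current block of src1 columns, + // so its densely packed partial result (row_diff values per column) has to land in that row slice of the full dst buffer, hence the strided 2D copy below.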
+ float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); + GGML_V3_ASSERT(dst->nb[1] == ne0*sizeof(float)); + dhf_dst_i += src1_col_0*ne0 + dev[id].row_low; +#if !defined(GGML_USE_HIPBLAS) + if (kind == cudaMemcpyDeviceToDevice) { + // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices + cudaMemcpy3DPeerParms p = {}; + p.dstDevice = g_main_device; + p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols); + p.srcDevice = id; + p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols); + p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1); + CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream)); + } else +#endif + { + CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), + dst_dd_i, row_diff*sizeof(float), + row_diff*sizeof(float), src1_ncols, + kind, stream)); + } + } else { + float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); + GGML_V3_ASSERT(dst->nb[1] == ne0*sizeof(float)); + dhf_dst_i += src1_col_0*ne0; + CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), kind, stream)); + } + } + + // add event for the main device to wait on until other device is done + if (split && (id != g_main_device || is != 0)) { + CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream)); + } + } + } + } + + // main device waits for all other devices to be finished + if (split && g_device_count > 1) { + int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE; + is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS; + + ggml_v3_cuda_set_device(g_main_device); + for (int id = 0; id < g_device_count; ++id) { + if (dev[id].row_low == dev[id].row_high) { + continue; + } + for (int64_t is = 0; is < is_max; ++is) { + CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0)); + } + } + } + + if (dst->backend == GGML_V3_BACKEND_CPU) { + ggml_v3_cuda_set_device(g_main_device); + CUDA_CHECK(cudaDeviceSynchronize()); + } +} + +static void ggml_v3_cuda_repeat(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + ggml_v3_cuda_op_flatten(src0, src1, dst, ggml_v3_cuda_op_repeat); +} + +static void ggml_v3_cuda_get_rows(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + ggml_v3_cuda_op_flatten(src0, src1, dst, ggml_v3_cuda_op_get_rows); +} + +static void ggml_v3_cuda_add(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + ggml_v3_cuda_op_flatten(src0, src1, dst, ggml_v3_cuda_op_add); +} + +static void ggml_v3_cuda_acc(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + ggml_v3_cuda_op_flatten(src0, src1, dst, ggml_v3_cuda_op_acc); +} + +static void ggml_v3_cuda_mul(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + ggml_v3_cuda_op_flatten(src0, src1, dst, ggml_v3_cuda_op_mul); +} + +static void ggml_v3_cuda_div(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + ggml_v3_cuda_op_flatten(src0, src1, dst, ggml_v3_cuda_op_div); +} + +static void ggml_v3_cuda_gelu(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + ggml_v3_cuda_op_flatten(src0, src1, dst, ggml_v3_cuda_op_gelu); +} + +static void ggml_v3_cuda_silu(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + ggml_v3_cuda_op_flatten(src0, src1, dst, ggml_v3_cuda_op_silu); +} + +static void 
ggml_v3_cuda_gelu_quick(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + ggml_v3_cuda_op_flatten(src0, src1, dst, ggml_v3_cuda_op_gelu_quick); +} + +static void ggml_v3_cuda_tanh(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + ggml_v3_cuda_op_flatten(src0, src1, dst, ggml_v3_cuda_op_tanh); +} + +static void ggml_v3_cuda_relu(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + ggml_v3_cuda_op_flatten(src0, src1, dst, ggml_v3_cuda_op_relu); +} + +static void ggml_v3_cuda_leaky_relu(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + ggml_v3_cuda_op_flatten(src0, src1, dst, ggml_v3_cuda_op_leaky_relu); +} + +static void ggml_v3_cuda_sqr(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + ggml_v3_cuda_op_flatten(src0, src1, dst, ggml_v3_cuda_op_sqr); +} + +static void ggml_v3_cuda_norm(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + ggml_v3_cuda_op_flatten(src0, src1, dst, ggml_v3_cuda_op_norm); +} + +static void ggml_v3_cuda_group_norm(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + ggml_v3_cuda_op_flatten(src0, src1, dst, ggml_v3_cuda_op_group_norm); +} + +static void ggml_v3_cuda_concat(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + ggml_v3_cuda_op_flatten(src0, src1, dst, ggml_v3_cuda_op_concat); +} + +static void ggml_v3_cuda_upscale(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + ggml_v3_cuda_op_flatten(src0, src1, dst, ggml_v3_cuda_op_upscale); +} + +static void ggml_v3_cuda_pad(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + ggml_v3_cuda_op_flatten(src0, src1, dst, ggml_v3_cuda_op_pad); +} + +static void ggml_v3_cuda_rms_norm(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + ggml_v3_cuda_op_flatten(src0, src1, dst, ggml_v3_cuda_op_rms_norm); +} + +bool ggml_v3_cuda_can_mul_mat(const struct ggml_v3_tensor * src0, const struct ggml_v3_tensor * src1, struct ggml_v3_tensor * dst) { + if (!g_cublas_loaded) return false; + + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + + // TODO: find the optimal values for these + return (src0->type == GGML_V3_TYPE_F32 || src0->type == GGML_V3_TYPE_F16 || ggml_v3_is_quantized(src0->type)) && + src1->type == GGML_V3_TYPE_F32 && + dst->type == GGML_V3_TYPE_F32 && + (ne0 >= 32 && ne1 >= 32 && ne10 >= 32); +} + +static void ggml_v3_cuda_mul_mat_vec_p021(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst){ + GGML_V3_ASSERT(ggml_v3_is_permuted(src0) && ggml_v3_is_permuted(src1)); + GGML_V3_ASSERT(src0->backend != GGML_V3_BACKEND_GPU_SPLIT); + GGML_V3_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation + GGML_V3_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F16); + GGML_V3_ASSERT(src1->type == GGML_V3_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + + const int64_t ne12 = src1->ne[2]; + + ggml_v3_cuda_set_device(g_main_device); + cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; + + ggml_v3_tensor_extra_gpu * src0_extra = (ggml_v3_tensor_extra_gpu *) src0->extra; + void * src0_ddq = 
src0_extra->data_device[g_main_device]; + + ggml_v3_tensor_extra_gpu * src1_extra = (ggml_v3_tensor_extra_gpu *) src1->extra; + float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; + + ggml_v3_tensor_extra_gpu * dst_extra = (ggml_v3_tensor_extra_gpu *) dst->extra; + float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; + + ggml_v3_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream); +} + +static void ggml_v3_cuda_mul_mat_vec_nc(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst){ + GGML_V3_ASSERT(!ggml_v3_is_transposed(src0)); + GGML_V3_ASSERT(!ggml_v3_is_transposed(src1)); + GGML_V3_ASSERT(!ggml_v3_is_permuted(src0)); + GGML_V3_ASSERT(src0->backend != GGML_V3_BACKEND_GPU_SPLIT); + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F16); + GGML_V3_ASSERT(src1->type == GGML_V3_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + + const int64_t nb01 = src0->nb[1]; + const int64_t nb02 = src0->nb[2]; + + const int64_t ne12 = src1->ne[2]; + + ggml_v3_cuda_set_device(g_main_device); + cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; + + ggml_v3_tensor_extra_gpu * src0_extra = (ggml_v3_tensor_extra_gpu *) src0->extra; + void * src0_ddq = src0_extra->data_device[g_main_device]; + + ggml_v3_tensor_extra_gpu * src1_extra = (ggml_v3_tensor_extra_gpu *) src1->extra; + float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; + + ggml_v3_tensor_extra_gpu * dst_extra = (ggml_v3_tensor_extra_gpu *) dst->extra; + float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; + + const int64_t row_stride_x = nb01 / sizeof(half); + const int64_t channel_stride_x = nb02 / sizeof(half); + + ggml_v3_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream); +} + +static __global__ void k_compute_batched_ptrs( + const half * src0_as_f16, const half * src1_as_f16, char * dst, + const void ** ptrs_src, void ** ptrs_dst, + int64_t ne12, int64_t ne13, + int64_t ne23, + size_t nb02, size_t nb03, + size_t nb12, size_t nb13, + size_t nbd2, size_t nbd3, + int64_t r2, int64_t r3) { + int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x; + int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y; + + if (i13 >= ne13 || i12 >= ne12) { + return; + } + + int64_t i03 = i13 / r3; + int64_t i02 = i12 / r2; + + ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03; + ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13; + ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3; +} + +static void ggml_v3_cuda_mul_mat_mat_batched_cublas(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + GGML_V3_ASSERT(!ggml_v3_is_transposed(src0)); + GGML_V3_ASSERT(!ggml_v3_is_transposed(src1)); + + GGML_V3_ASSERT(src0->backend != GGML_V3_BACKEND_GPU_SPLIT); + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F16); + + GGML_V3_TENSOR_BINARY_OP_LOCALS + + const int64_t ne_dst = ggml_v3_nelements(dst); + + ggml_v3_cuda_set_device(g_main_device); + cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; + + CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream)); + + ggml_v3_tensor_extra_gpu * src0_extra = (ggml_v3_tensor_extra_gpu *) src0->extra; + void * src0_ddq = src0_extra->data_device[g_main_device]; + half * src0_f16 = (half *) src0_ddq; + + ggml_v3_tensor_extra_gpu * 
src1_extra = (ggml_v3_tensor_extra_gpu *) src1->extra; + float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; + + ggml_v3_tensor_extra_gpu * dst_extra = (ggml_v3_tensor_extra_gpu *) dst->extra; + float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; + + // convert src1 to fp16 + cuda_pool_alloc<half> src1_f16_alloc; + if (src1->type != GGML_V3_TYPE_F16) { + const to_fp16_cuda_t to_fp16_cuda = ggml_v3_get_to_fp16_cuda(src1->type); + const int64_t ne_src1 = ggml_v3_nelements(src1); + src1_f16_alloc.alloc(ne_src1); + GGML_V3_ASSERT(to_fp16_cuda != nullptr); + to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream); + } + half * src1_f16 = src1->type == GGML_V3_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get(); + + cuda_pool_alloc<half> dst_f16; + char * dst_t; + + cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F; + cudaDataType_t cu_data_type = CUDA_R_16F; + + // dst strides + size_t nbd2 = dst->nb[2]; + size_t nbd3 = dst->nb[3]; + + const half alpha_f16 = 1.0f; + const half beta_f16 = 0.0f; + + const float alpha_f32 = 1.0f; + const float beta_f32 = 0.0f; + + const void * alpha = &alpha_f16; + const void * beta = &beta_f16; + + if (dst->op_params[0] == GGML_V3_PREC_DEFAULT) { + dst_t = (char *) dst_f16.alloc(ne_dst); + + nbd2 /= sizeof(float) / sizeof(half); + nbd3 /= sizeof(float) / sizeof(half); + } else { + dst_t = (char *) dst_ddf; + + cu_compute_type = CUBLAS_COMPUTE_32F; + cu_data_type = CUDA_R_32F; + + alpha = &alpha_f32; + beta = &beta_f32; + } + + GGML_V3_ASSERT(ne12 % ne02 == 0); + GGML_V3_ASSERT(ne13 % ne03 == 0); + + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + +#if 0 + // use cublasGemmEx + { + for (int i13 = 0; i13 < ne13; ++i13) { + for (int i12 = 0; i12 < ne12; ++i12) { + int i03 = i13 / r3; + int i02 = i12 / r2; + + CUBLAS_CHECK( + cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half), + (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float), + beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01, + cu_compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + } + } +#else + if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) { + // there is no broadcast and src0, src1 are contiguous across dims 2, 3 + // use cublasGemmStridedBatchedEx + CUBLAS_CHECK( + cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + alpha, (const char *) src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA + (const char *) src1_f16, CUDA_R_16F, nb11/nb10, nb12/nb10, // strideB + beta, ( char *) dst_t, cu_data_type, ne01, nb2/nb0, // strideC + ne12*ne13, + cu_compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } else { + // use cublasGemmBatchedEx + const int ne23 = ne12*ne13; + + cuda_pool_alloc<const void *> ptrs_src(2*ne23); + cuda_pool_alloc< void *> ptrs_dst(1*ne23); + + dim3 block_dims(ne13, ne12); + k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>( + src0_f16, src1_f16, dst_t, + ptrs_src.get(), ptrs_dst.get(), + ne12, ne13, + ne23, + nb02, nb03, + src1->type == GGML_V3_TYPE_F16 ? nb12 : nb12/2, + src1->type == GGML_V3_TYPE_F16 ?
nb13 : nb13/2, + nbd2, nbd3, + r2, r3); + CUDA_CHECK(cudaGetLastError()); + + CUBLAS_CHECK( + cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00, + (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/nb10, + beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01, + ne23, + cu_compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } +#endif + + if (dst->op_params[0] == GGML_V3_PREC_DEFAULT) { + const to_fp32_cuda_t to_fp32_cuda = ggml_v3_get_to_fp32_cuda(GGML_V3_TYPE_F16); + to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream); + } +} + +static void ggml_v3_cuda_mul_mat(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + const bool all_on_device = + (src0->backend == GGML_V3_BACKEND_GPU || src0->backend == GGML_V3_BACKEND_GPU_SPLIT) && + (src1->backend == GGML_V3_BACKEND_GPU) && + ( dst->backend == GGML_V3_BACKEND_GPU); + + const bool split = src0->backend == GGML_V3_BACKEND_GPU_SPLIT; + + int64_t min_compute_capability = INT_MAX; + for (int id = 0; id < g_device_count; ++id) { + if (min_compute_capability > g_device_caps[id].cc && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) { + min_compute_capability = g_device_caps[id].cc; + } + } + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + + const bool fp16_performance_good = min_compute_capability >= CC_RDNA1; + bool use_mul_mat_q = ggml_v3_is_quantized(src0->type); + + if(!g_mul_mat_q) + { + use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3; + } + + +#else + + const bool fp16_performance_good = min_compute_capability >= CC_VOLTA; + bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_v3_is_quantized(src0->type); + + // when tensor cores are available, use them for large batch size + // ref: https://github.com/ggerganov/llama.cpp/pull/3776 + if(!g_mul_mat_q) + { + use_mul_mat_q = use_mul_mat_q && !(fp16_performance_good && src1->ne[1] > MMQ_MAX_BATCH_SIZE); + } + +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + + use_mul_mat_q = use_mul_mat_q && ggml_v3_cuda_supports_mmq(src0->type); + + // debug helpers + //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]); + //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]); + //printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]); + //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_v3_is_contiguous(src0), ggml_v3_is_transposed(src0), ggml_v3_type_name(src0->type), src0->name); + //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_v3_is_contiguous(src1), ggml_v3_is_transposed(src1), ggml_v3_type_name(src1->type), src1->name); + + if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_V3_TYPE_F16 && ggml_v3_is_permuted(src0) && ggml_v3_is_permuted(src1) && src1->ne[1] == 1) { + // KQ single-batch + ggml_v3_cuda_mul_mat_vec_p021(src0, src1, dst); + } else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_V3_TYPE_F16 && !ggml_v3_is_contiguous(src0) && !ggml_v3_is_transposed(src1) && src1->ne[1] == 1) { + // KQV single-batch + ggml_v3_cuda_mul_mat_vec_nc(src0, src1, dst); + } else if (!split && all_on_device && fp16_performance_good && src0->type == 
GGML_V3_TYPE_F16 && !ggml_v3_is_transposed(src0) && !ggml_v3_is_transposed(src1)) { + // KQ + KQV multi-batch + ggml_v3_cuda_mul_mat_mat_batched_cublas(src0, src1, dst); + } else if (src0->type == GGML_V3_TYPE_F32) { + ggml_v3_cuda_op_mul_mat(src0, src1, dst, ggml_v3_cuda_op_mul_mat_cublas, false); + } else if (ggml_v3_is_quantized(src0->type) || src0->type == GGML_V3_TYPE_F16) { + if (src1->ne[1] == 1 && src0->ne[0] % GGML_V3_CUDA_DMMV_X == 0 && src1->type == GGML_V3_TYPE_F32) { +#ifdef GGML_V3_CUDA_FORCE_DMMV + const bool use_mul_mat_vec_q = false; +#else + const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_v3_is_quantized(src0->type) && ggml_v3_nrows(src1) == 1; +#endif // GGML_V3_CUDA_FORCE_DMMV + + if (use_mul_mat_vec_q) { + // NOTE: this kernel does not support ggml_v3_nrows(src1) > 1 + ggml_v3_cuda_op_mul_mat(src0, src1, dst, ggml_v3_cuda_op_mul_mat_vec_q, true); + } else { + ggml_v3_cuda_op_mul_mat(src0, src1, dst, ggml_v3_cuda_op_dequantize_mul_mat_vec, false); + } + } else { + if (use_mul_mat_q) { + ggml_v3_cuda_op_mul_mat(src0, src1, dst, ggml_v3_cuda_op_mul_mat_q, true); + } else { + ggml_v3_cuda_op_mul_mat(src0, src1, dst, ggml_v3_cuda_op_mul_mat_cublas, false); + } + } + } else { + GGML_V3_ASSERT(false); + } +} + +#if 0 +template <typename ... Srcs> +static __global__ void k_compute_batched_ptrs_id( + const void ** ptrs_src, void ** ptrs_dst, + int ne12, int ne13, + int ne23, + int nb02, int nb03, + int nb12, int nb13, + int nb2, int nb3, + int r2, int r3, + ggml_v3_type src0_type, half * src0_as_f16, int64_t src0_ne, + const half * src1_f16, half * dst_f16, + const int32_t * ids, const int id, + Srcs... src0s) { + + int i = ids[id]; + + half * src0_f16; + const void * srcs_ar[] = { (const half *) src0s... }; + if (src0_type == GGML_V3_TYPE_F16) { + src0_f16 = (half *) srcs_ar[i]; + } else { + src0_f16 = src0_as_f16; + if (threadIdx.x == 0 && threadIdx.y == 0) { + const to_fp16_cuda_t to_fp16 = ggml_v3_get_to_fp16_cuda(src0_type); + to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget); + } + } + + int i13 = blockIdx.x * blockDim.x + threadIdx.x; + int i12 = blockIdx.y * blockDim.y + threadIdx.y; + + if (i13 >= ne13 || i12 >= ne12) { + return; + } + + int i03 = i13 / r3; + int i02 = i12 / r2; + + ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02 + i03*nb03; + ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2; + ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2; +} + +static void ggml_v3_cuda_mul_mat_id_cublas(ggml_v3_tensor * dst) { + const struct ggml_v3_tensor * ids = dst->src[0]; + const struct ggml_v3_tensor * src1 = dst->src[1]; + const struct ggml_v3_tensor * src00 = dst->src[2]; + + const int id = dst->op_params[0]; + + GGML_V3_ASSERT(!ggml_v3_is_transposed(src00)); + GGML_V3_ASSERT(!ggml_v3_is_transposed(src1)); + + GGML_V3_ASSERT(src00->backend != GGML_V3_BACKEND_GPU_SPLIT); + GGML_V3_ASSERT(src1->type == GGML_V3_TYPE_F32); + + const int64_t ne00 = src00->ne[0]; GGML_V3_UNUSED(ne00); + const int64_t ne01 = src00->ne[1]; + const int64_t ne02 = src00->ne[2]; + const int64_t ne03 = src00->ne[3]; + + //const int64_t nb01 = src00->nb[1]; + const int64_t nb02 = src00->nb[2]; GGML_V3_UNUSED(nb02); + const int64_t nb03 = src00->nb[3]; GGML_V3_UNUSED(nb03); + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + //const int64_t nb11 = src1->nb[1]; + const int64_t nb12 = src1->nb[2];
GGML_V3_UNUSED(nb12); + const int64_t nb13 = src1->nb[3]; GGML_V3_UNUSED(nb13); + + const int64_t ne1 = ggml_v3_nelements(src1); + const int64_t ne = ggml_v3_nelements(dst); + + ggml_v3_cuda_set_device(g_main_device); + cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; + + CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream)); + + //ggml_v3_tensor_extra_gpu * src0_extra = (ggml_v3_tensor_extra_gpu *) src0->extra; + //void * src0_ddq = src0_extra->data_device[g_main_device]; + //half * src0_as_f16 = (half *) src0_ddq; + + ggml_v3_tensor_extra_gpu * src1_extra = (ggml_v3_tensor_extra_gpu *) src1->extra; + float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; + + ggml_v3_tensor_extra_gpu * dst_extra = (ggml_v3_tensor_extra_gpu *) dst->extra; + float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; + + // convert src1 to fp16 + const to_fp16_cuda_t to_fp16_cuda = ggml_v3_get_to_fp16_cuda(src1->type); + GGML_V3_ASSERT(to_fp16_cuda != nullptr); + + size_t src1_as = 0; + half * src1_as_f16 = (half *) ggml_v3_cuda_pool_malloc(ne1 * sizeof(half), &src1_as); + to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream); + + size_t dst_as = 0; + half * dst_f16 = (half *) ggml_v3_cuda_pool_malloc(ne * sizeof(half), &dst_as); + + GGML_V3_ASSERT(ne12 % ne02 == 0); + GGML_V3_ASSERT(ne13 % ne03 == 0); + + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + + const half alpha_f16 = 1.0f; + const half beta_f16 = 0.0f; + + // use cublasGemmBatchedEx + const int ne23 = ne12*ne13; + + const void ** ptrs_src = nullptr; + void ** ptrs_dst = nullptr; + + size_t ptrs_src_s = 0; + size_t ptrs_dst_s = 0; + + ptrs_src = (const void **) ggml_v3_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s); + ptrs_dst = ( void **) ggml_v3_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s); + + int64_t src0_ne = ggml_v3_nelements(src00); + half * src0_as_f16 = nullptr; + size_t src0_as = 0; + if (src00->type != GGML_V3_TYPE_F16) { + src0_as_f16 = (half *) ggml_v3_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as); + } + + static_assert(GGML_V3_MAX_SRC == 6, "GGML_V3_MAX_SRC == 6"); + dim3 block_dims(ne13, ne12); + k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>( + ptrs_src, ptrs_dst, + ne12, ne13, + ne23, + ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half), + nb12, nb13, + dst->nb[2], dst->nb[3], + r2, r3, + src00->type, src0_as_f16, src0_ne, + src1_as_f16, dst_f16, + (const int *)((ggml_v3_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id, + dst->src[2] ? (const half *)((ggml_v3_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr, + dst->src[3] ? (const half *)((ggml_v3_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr, + dst->src[4] ? (const half *)((ggml_v3_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr, + dst->src[5] ? 
(const half *)((ggml_v3_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr + ); + CUDA_CHECK(cudaGetLastError()); + + CUBLAS_CHECK( + cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00, + (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10, + &beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01, + ne23, + CUBLAS_COMPUTE_16F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + if (src0_as != 0) { + ggml_v3_cuda_pool_free(src0_as_f16, src0_as); + } + if (ptrs_src_s != 0) { + ggml_v3_cuda_pool_free(ptrs_src, ptrs_src_s); + } + if (ptrs_dst_s != 0) { + ggml_v3_cuda_pool_free(ptrs_dst, ptrs_dst_s); + } + + const to_fp32_cuda_t to_fp32_cuda = ggml_v3_get_to_fp32_cuda(GGML_V3_TYPE_F16); + to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream); + + ggml_v3_cuda_pool_free(src1_as_f16, src1_as); + ggml_v3_cuda_pool_free(dst_f16, dst_as); +} +#endif + +static void ggml_v3_cuda_mul_mat_id(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { +#if 0 + ggml_v3_cuda_mul_mat_id_cublas(dst); + // TODO: mmq/mmv support +#endif + + const int64_t nb11 = src1->nb[1]; + const int64_t nb1 = dst->nb[1]; + + const struct ggml_v3_tensor * ids = src0; + const int32_t id = ((int32_t *) dst->op_params)[0]; + const int32_t n_as = ((int32_t *) dst->op_params)[1]; + + std::vector ids_host(ggml_v3_nbytes(ids)); + + cudaStream_t stream = g_cudaStreams[g_main_device][0]; + + if (ids->backend == GGML_V3_BACKEND_GPU) { + const char * ids_dev = (const char *)((const ggml_v3_tensor_extra_gpu *)ids->extra)->data_device[g_main_device]; + CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_v3_nbytes(ids), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } else { + memcpy(ids_host.data(), ids->data, ggml_v3_nbytes(ids)); + } + + const ggml_v3_tensor_extra_gpu * src1_extra = (const ggml_v3_tensor_extra_gpu *) src1->extra; + const ggml_v3_tensor_extra_gpu * dst_extra = (const ggml_v3_tensor_extra_gpu *) dst->extra; + + ggml_v3_tensor_extra_gpu src1_row_extra; + ggml_v3_tensor_extra_gpu dst_row_extra; + + ggml_v3_tensor src1_row = *src1; + ggml_v3_tensor dst_row = *dst; + + src1_row.backend = GGML_V3_BACKEND_GPU; + dst_row.backend = GGML_V3_BACKEND_GPU; + + src1_row.extra = &src1_row_extra; + dst_row.extra = &dst_row_extra; + + char * src1_original = src1->backend == GGML_V3_BACKEND_CPU ? + (char *) src1->data : (char *) src1_extra->data_device[g_main_device]; + char * dst_original = dst->backend == GGML_V3_BACKEND_CPU ? + (char *) dst->data : (char *) dst_extra->data_device[g_main_device]; + + if (src1->ne[1] == 1) { + GGML_V3_ASSERT(src1->backend == GGML_V3_BACKEND_GPU); + GGML_V3_ASSERT(dst->backend == GGML_V3_BACKEND_GPU); + + for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { + //int32_t row_id; + //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0])); + //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0])); + + const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); + + GGML_V3_ASSERT(row_id >= 0 && row_id < n_as); + + const struct ggml_v3_tensor * src0_row = dst->src[row_id + 2]; + + src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1]; + src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set? 
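// Editor note (not part of the original patch): in this single-row branch, each row
// i01 of the id matrix looks up its expert index in the host copy of `ids`, picks the
// matching expert weight tensor dst->src[row_id + 2], and runs a regular
// ggml_v3_cuda_mul_mat() on a one-row view of src1/dst whose device pointers are
// redirected through src1_row_extra above and dst_row_extra just below.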
+ + dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1]; + dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set? + + ggml_v3_cuda_mul_mat(src0_row, &src1_row, &dst_row); + } + } else { + cuda_pool_alloc src1_contiguous(sizeof(float)*ggml_v3_nelements(src1)); + cuda_pool_alloc dst_contiguous(sizeof(float)*ggml_v3_nelements(dst)); + + src1_row_extra.data_device[g_main_device] = src1_contiguous.get(); + dst_row_extra.data_device[g_main_device] = dst_contiguous.get(); + + const cudaMemcpyKind src1_kind = src1->backend == GGML_V3_BACKEND_CPU ? + cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice; + const cudaMemcpyKind dst_kind = dst->backend == GGML_V3_BACKEND_CPU ? + cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice; + + for (int32_t row_id = 0; row_id < n_as; ++row_id) { + const struct ggml_v3_tensor * src0_row = dst->src[row_id + 2]; + + int64_t num_src1_rows = 0; + for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { + const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); + + if (row_id_i != row_id) { + continue; + } + + GGML_V3_ASSERT(row_id >= 0 && row_id < n_as); + + CUDA_CHECK(cudaMemcpyAsync(src1_contiguous.get() + num_src1_rows*nb11, src1_original + i01*nb11, + nb11, src1_kind, stream)); + num_src1_rows++; + } + + if (num_src1_rows == 0) { + continue; + } + + src1_row.ne[1] = num_src1_rows; + dst_row.ne[1] = num_src1_rows; + + src1_row.nb[1] = nb11; + src1_row.nb[2] = num_src1_rows*nb11; + src1_row.nb[3] = num_src1_rows*nb11; + + dst_row.nb[1] = nb1; + dst_row.nb[2] = num_src1_rows*nb1; + dst_row.nb[3] = num_src1_rows*nb1; + + ggml_v3_cuda_mul_mat(src0_row, &src1_row, &dst_row); + + num_src1_rows = 0; + for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { + const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); + + if (row_id_i != row_id) { + continue; + } + + GGML_V3_ASSERT(row_id >= 0 && row_id < n_as); + + CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous.get() + num_src1_rows*nb1, + nb1, dst_kind, stream)); + num_src1_rows++; + } + } + } + + if (dst->backend == GGML_V3_BACKEND_CPU) { + CUDA_CHECK(cudaStreamSynchronize(stream)); + } +} + +static void ggml_v3_cuda_scale(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + ggml_v3_cuda_op_flatten(src0, src1, dst, ggml_v3_cuda_op_scale); +} + +static void ggml_v3_cuda_clamp(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + ggml_v3_cuda_op_flatten(src0, src1, dst, ggml_v3_cuda_op_clamp); +} + +static void ggml_v3_cuda_cpy(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + const int64_t ne = ggml_v3_nelements(src0); + GGML_V3_ASSERT(ne == ggml_v3_nelements(src1)); + + GGML_V3_ASSERT(src0->backend == GGML_V3_BACKEND_GPU); + GGML_V3_ASSERT(src1->backend == GGML_V3_BACKEND_GPU); + + GGML_V3_ASSERT(ggml_v3_nbytes(src0) <= INT_MAX); + GGML_V3_ASSERT(ggml_v3_nbytes(src1) <= INT_MAX); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + GGML_V3_ASSERT(src0->ne[3] == 1); + + const int64_t nb00 = src0->nb[0]; + const int64_t nb01 = src0->nb[1]; + const int64_t nb02 = src0->nb[2]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + GGML_V3_ASSERT(src1->ne[3] == 1); + + const int64_t nb10 = src1->nb[0]; + const int64_t nb11 = src1->nb[1]; + const int64_t nb12 = src1->nb[2]; + + ggml_v3_cuda_set_device(g_main_device); + cudaStream_t main_stream = 
g_cudaStreams[g_main_device][0]; + + const ggml_v3_tensor_extra_gpu * src0_extra = (ggml_v3_tensor_extra_gpu *) src0->extra; + const ggml_v3_tensor_extra_gpu * src1_extra = (ggml_v3_tensor_extra_gpu *) src1->extra; + + char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; + char * src1_ddc = (char *) src1_extra->data_device[g_main_device]; + + if (src0->type == GGML_V3_TYPE_F32 && src1->type == GGML_V3_TYPE_F32) { + ggml_v3_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + } else if (src0->type == GGML_V3_TYPE_F32 && src1->type == GGML_V3_TYPE_F16) { + ggml_v3_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + } else if (src0->type == GGML_V3_TYPE_F32 && src1->type == GGML_V3_TYPE_Q8_0) { + ggml_v3_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + } else if (src0->type == GGML_V3_TYPE_F32 && src1->type == GGML_V3_TYPE_Q4_0) { + ggml_v3_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + } else if (src0->type == GGML_V3_TYPE_F32 && src1->type == GGML_V3_TYPE_Q4_1) { + ggml_v3_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + } else if (src0->type == GGML_V3_TYPE_F16 && src1->type == GGML_V3_TYPE_F16) { + ggml_v3_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + } else { + fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__, + ggml_v3_type_name(src0->type), ggml_v3_type_name(src1->type)); + GGML_V3_ASSERT(false); + } + + (void) dst; +} + +static void ggml_v3_cuda_dup(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + // TODO: why do we pass dst as src1 here? 
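// Editor note (not part of the original patch): judging from ggml_v3_cuda_cpy() above,
// its second parameter is the copy *destination* (src0 is copied into src1). For
// DUP/CONT that destination is the output tensor itself, which is presumably why dst
// is passed in the src1 slot here; the third argument is unused by the copy.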
+ ggml_v3_cuda_cpy(src0, dst, nullptr); + (void) src1; +} + +static void ggml_v3_cuda_diag_mask_inf(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + ggml_v3_cuda_op_flatten(src0, src1, dst, ggml_v3_cuda_op_diag_mask_inf); +} + +static void ggml_v3_cuda_soft_max(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + ggml_v3_cuda_op_flatten(src0, src1, dst, ggml_v3_cuda_op_soft_max); +} + +static void ggml_v3_cuda_rope(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + GGML_V3_ASSERT(ggml_v3_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented + ggml_v3_cuda_op_flatten(src0, src1, dst, ggml_v3_cuda_op_rope); +} + +static void ggml_v3_cuda_alibi(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + ggml_v3_cuda_op_flatten(src0, src1, dst, ggml_v3_cuda_op_alibi); +} + +static void ggml_v3_cuda_im2col(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + ggml_v3_cuda_op_flatten(src0, src1, dst, ggml_v3_cuda_op_im2col); +} + +static void ggml_v3_cuda_sum_rows(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + GGML_V3_ASSERT(ggml_v3_is_contiguous(src0)); + ggml_v3_cuda_op_flatten(src0, src1, dst, ggml_v3_cuda_op_sum_rows); +} + +static void ggml_v3_cuda_argsort(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + GGML_V3_ASSERT(ggml_v3_is_contiguous(src0)); + ggml_v3_cuda_op_flatten(src0, src1, dst, ggml_v3_cuda_op_argsort); +} + +static void ggml_v3_cuda_nop(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + (void) src0; + (void) src1; + (void) dst; +} + +static size_t ggml_v3_nbytes_split(const struct ggml_v3_tensor * tensor, int nrows_split) { + static_assert(GGML_V3_MAX_DIMS == 4, "GGML_V3_MAX_DIMS is not 4 - update this function"); + + return nrows_split*ggml_v3_row_size(tensor->type, tensor->ne[0]); +} + +void ggml_v3_cuda_transform_tensor(void * data, struct ggml_v3_tensor * tensor) { + const int64_t nrows = ggml_v3_nrows(tensor); + + const int64_t ne0 = tensor->ne[0]; + + const size_t nb1 = tensor->nb[1]; + + ggml_v3_backend_type backend = tensor->backend; + ggml_v3_tensor_extra_gpu * extra = new struct ggml_v3_tensor_extra_gpu; + memset(extra, 0, sizeof(*extra)); + + for (int id = 0; id < g_device_count; ++id) { + if (backend == GGML_V3_BACKEND_GPU && id != g_main_device) { + continue; + } + + ggml_v3_cuda_set_device(id); + + int64_t row_low, row_high; + if (backend == GGML_V3_BACKEND_GPU) { + row_low = 0; + row_high = nrows; + } else if (backend == GGML_V3_BACKEND_GPU_SPLIT) { + const int64_t rounding = get_row_rounding(tensor->type); + + row_low = id == 0 ? 
0 : nrows*g_tensor_split[id]; + row_low -= row_low % rounding; + + if (id == g_device_count - 1) { + row_high = nrows; + } else { + row_high = nrows*g_tensor_split[id + 1]; + row_high -= row_high % rounding; + } + } else { + GGML_V3_ASSERT(false); + } + if (row_low == row_high) { + continue; + } + + int64_t nrows_split = row_high - row_low; + + const size_t offset_split = row_low*nb1; + size_t size = ggml_v3_nbytes_split(tensor, nrows_split); + const size_t original_size = size; + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_v3_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + + char * buf; + CUDA_CHECK(cudaMalloc(&buf, size)); + char * buf_host = (char *)data + offset_split; + + // set padding to 0 to avoid possible NaN values + if (size > original_size) { + CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size)); + } + + CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice)); + + extra->data_device[id] = buf; + + if (backend == GGML_V3_BACKEND_GPU_SPLIT) { + for (int64_t is = 0; is < MAX_STREAMS; ++is) { + CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming)); + } + } + } + + tensor->extra = extra; +} + +void ggml_v3_cuda_free_data(struct ggml_v3_tensor * tensor) { + if (!tensor || !tensor->extra || (tensor->backend != GGML_V3_BACKEND_GPU && tensor->backend != GGML_V3_BACKEND_GPU_SPLIT) ) { + return; + } + + ggml_v3_tensor_extra_gpu * extra = (ggml_v3_tensor_extra_gpu *) tensor->extra; + + for (int id = 0; id < g_device_count; ++id) { + ggml_v3_cuda_set_device(id); + if (extra->data_device[id] != nullptr) { + CUDA_CHECK(cudaFree(extra->data_device[id])); + } + + for (int64_t is = 0; is < MAX_STREAMS; ++is) { + if (extra->events[id][is] != nullptr) { + CUDA_CHECK(cudaEventDestroy(extra->events[id][is])); + } + } + } + + delete extra; +} + +static ggml_v3_tensor_extra_gpu * g_temp_tensor_extras = nullptr; +static size_t g_temp_tensor_extra_index = 0; + +static ggml_v3_tensor_extra_gpu * ggml_v3_cuda_alloc_temp_tensor_extra() { + if (g_temp_tensor_extras == nullptr) { + g_temp_tensor_extras = new ggml_v3_tensor_extra_gpu[GGML_V3_CUDA_MAX_NODES]; + } + + size_t alloc_index = g_temp_tensor_extra_index; + g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_V3_CUDA_MAX_NODES; + ggml_v3_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index]; + memset(extra, 0, sizeof(*extra)); + + return extra; +} + +static void ggml_v3_cuda_assign_buffers_impl(struct ggml_v3_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) { + if (scratch && g_scratch_size == 0) { + return; + } + + tensor->backend = GGML_V3_BACKEND_GPU; + + // recursively assign CUDA buffers until a compute tensor is found + if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_V3_BACKEND_CPU) { + const ggml_v3_op src0_op = tensor->src[0]->op; + if (src0_op == GGML_V3_OP_RESHAPE || src0_op == GGML_V3_OP_TRANSPOSE || src0_op == GGML_V3_OP_VIEW || src0_op == GGML_V3_OP_PERMUTE) { + ggml_v3_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc); + } + } + if (tensor->op == GGML_V3_OP_CPY && tensor->src[1]->backend == GGML_V3_BACKEND_CPU) { + ggml_v3_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc); + } + + if (scratch && no_alloc) { + return; + } + + ggml_v3_tensor_extra_gpu * extra; + + const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == 
tensor->data) || + tensor->op == GGML_V3_OP_VIEW || + force_inplace; + const size_t size = ggml_v3_nbytes(tensor); + + ggml_v3_cuda_set_device(g_main_device); + if (inplace && (tensor->src[0]->backend == GGML_V3_BACKEND_GPU || tensor->src[0]->backend == GGML_V3_BACKEND_GPU_SPLIT)) { + ggml_v3_tensor_extra_gpu * src0_extra = (ggml_v3_tensor_extra_gpu * ) tensor->src[0]->extra; + char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; + size_t offset = 0; + if (tensor->op == GGML_V3_OP_VIEW) { + memcpy(&offset, tensor->op_params, sizeof(size_t)); + } + extra = ggml_v3_cuda_alloc_temp_tensor_extra(); + extra->data_device[g_main_device] = src0_ddc + offset; + } else if (tensor->op == GGML_V3_OP_CPY) { + ggml_v3_tensor_extra_gpu * src1_extra = (ggml_v3_tensor_extra_gpu * ) tensor->src[1]->extra; + void * src1_ddv = src1_extra->data_device[g_main_device]; + extra = ggml_v3_cuda_alloc_temp_tensor_extra(); + extra->data_device[g_main_device] = src1_ddv; + } else if (scratch) { + GGML_V3_ASSERT(size <= g_scratch_size); + if (g_scratch_offset + size > g_scratch_size) { + g_scratch_offset = 0; + } + + char * data = (char *) g_scratch_buffer; + if (data == nullptr) { + CUDA_CHECK(cudaMalloc(&data, g_scratch_size)); + g_scratch_buffer = data; + } + extra = ggml_v3_cuda_alloc_temp_tensor_extra(); + extra->data_device[g_main_device] = data + g_scratch_offset; + + g_scratch_offset += size; + + GGML_V3_ASSERT(g_scratch_offset <= g_scratch_size); + } else { // allocate new buffers outside of scratch + void * data; + CUDA_CHECK(cudaMalloc(&data, size)); + CUDA_CHECK(cudaMemset(data, 0, size)); + extra = new ggml_v3_tensor_extra_gpu; + memset(extra, 0, sizeof(*extra)); + extra->data_device[g_main_device] = data; + } + + tensor->extra = extra; +} + +void ggml_v3_cuda_assign_scratch_offset(struct ggml_v3_tensor * tensor, size_t offset) { + if (g_scratch_size == 0) { + return; + } + if (g_scratch_buffer == nullptr) { + ggml_v3_cuda_set_device(g_main_device); + CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size)); + } + + ggml_v3_tensor_extra_gpu * extra = ggml_v3_cuda_alloc_temp_tensor_extra(); + + const bool inplace = tensor->view_src != nullptr; + + if (inplace && (tensor->view_src->backend == GGML_V3_BACKEND_GPU || tensor->view_src->backend == GGML_V3_BACKEND_GPU_SPLIT)) { + ggml_v3_tensor_extra_gpu * src0_extra = (ggml_v3_tensor_extra_gpu * ) tensor->view_src->extra; + char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; + size_t view_offset = 0; + if (tensor->op == GGML_V3_OP_VIEW) { + memcpy(&view_offset, tensor->op_params, sizeof(size_t)); + } + extra->data_device[g_main_device] = src0_ddc + view_offset; + } else { + extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset; + } + + tensor->extra = extra; +} + +void ggml_v3_cuda_copy_to_device(struct ggml_v3_tensor * tensor) { + GGML_V3_ASSERT(tensor->backend == GGML_V3_BACKEND_GPU); + GGML_V3_ASSERT(ggml_v3_is_contiguous(tensor)); + + ggml_v3_tensor_extra_gpu * extra = (ggml_v3_tensor_extra_gpu *) tensor->extra; + ggml_v3_cuda_set_device(g_main_device); + CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_v3_nbytes(tensor), cudaMemcpyHostToDevice)); +} + +void ggml_v3_cuda_assign_buffers(struct ggml_v3_tensor * tensor) { + ggml_v3_cuda_assign_buffers_impl(tensor, true, false, false); +} + +void ggml_v3_cuda_assign_buffers_no_alloc(struct ggml_v3_tensor * tensor) { + ggml_v3_cuda_assign_buffers_impl(tensor, true, false, true); +} + +void ggml_v3_cuda_assign_buffers_no_scratch(struct 
ggml_v3_tensor * tensor) { + ggml_v3_cuda_assign_buffers_impl(tensor, false, false, false); +} + +void ggml_v3_cuda_assign_buffers_force_inplace(struct ggml_v3_tensor * tensor) { + ggml_v3_cuda_assign_buffers_impl(tensor, false, true, false); +} + +void ggml_v3_cuda_set_main_device(const int main_device) { + if (main_device >= g_device_count) { + fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n", + main_device, g_device_count, g_main_device); + return; + } + + if (g_main_device != main_device && g_device_count > 1) { + g_main_device = main_device; + cudaDeviceProp prop; + CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device)); + fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name); + } +} + +void ggml_v3_cuda_set_mul_mat_q(const bool mul_mat_q) { + g_mul_mat_q = mul_mat_q; +} + +void ggml_v3_cuda_set_scratch_size(const size_t scratch_size) { + // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously + // it still won't always work as expected, but it's better than nothing + if (scratch_size > g_scratch_size) { + ggml_v3_cuda_free_scratch(); + } + g_scratch_size = std::max(g_scratch_size, scratch_size); +} + +void ggml_v3_cuda_free_scratch() { + if (g_scratch_buffer == nullptr) { + return; + } + + CUDA_CHECK(cudaFree(g_scratch_buffer)); + g_scratch_buffer = nullptr; +} + +bool ggml_v3_cuda_compute_forward(struct ggml_v3_compute_params * params, struct ggml_v3_tensor * tensor) { + if (!g_cublas_loaded) return false; + + ggml_v3_cuda_func_t func; + const bool any_on_device = tensor->backend == GGML_V3_BACKEND_GPU + || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_V3_BACKEND_GPU || tensor->src[0]->backend == GGML_V3_BACKEND_GPU_SPLIT)) + || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_V3_BACKEND_GPU); + + if (!any_on_device && tensor->op != GGML_V3_OP_MUL_MAT && tensor->op != GGML_V3_OP_MUL_MAT_ID) { + return false; + } + + if (tensor->op == GGML_V3_OP_MUL_MAT) { + if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) { +#ifndef NDEBUG + fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]); +#endif + return false; + } + } + + switch (tensor->op) { + case GGML_V3_OP_REPEAT: + func = ggml_v3_cuda_repeat; + break; + case GGML_V3_OP_GET_ROWS: + func = ggml_v3_cuda_get_rows; + break; + case GGML_V3_OP_DUP: + func = ggml_v3_cuda_dup; + break; + case GGML_V3_OP_ADD: + func = ggml_v3_cuda_add; + break; + case GGML_V3_OP_ACC: + func = ggml_v3_cuda_acc; + break; + case GGML_V3_OP_MUL: + func = ggml_v3_cuda_mul; + break; + case GGML_V3_OP_DIV: + func = ggml_v3_cuda_div; + break; + case GGML_V3_OP_UNARY: + switch (ggml_v3_get_unary_op(tensor)) { + case GGML_V3_UNARY_OP_GELU: + func = ggml_v3_cuda_gelu; + break; + case GGML_V3_UNARY_OP_SILU: + func = ggml_v3_cuda_silu; + break; + case GGML_V3_UNARY_OP_GELU_QUICK: + func = ggml_v3_cuda_gelu_quick; + break; + case GGML_V3_UNARY_OP_TANH: + func = ggml_v3_cuda_tanh; + break; + case GGML_V3_UNARY_OP_RELU: + func = ggml_v3_cuda_relu; + break; + default: + return false; + } + break; + case GGML_V3_OP_NORM: + func = ggml_v3_cuda_norm; + break; + case GGML_V3_OP_GROUP_NORM: + func = ggml_v3_cuda_group_norm; + break; + case GGML_V3_OP_CONCAT: + func = ggml_v3_cuda_concat; + break; + case GGML_V3_OP_UPSCALE: + func = ggml_v3_cuda_upscale; + break; 
+ case GGML_V3_OP_PAD: + func = ggml_v3_cuda_pad; + break; + case GGML_V3_OP_LEAKY_RELU: + func = ggml_v3_cuda_leaky_relu; + break; + case GGML_V3_OP_RMS_NORM: + func = ggml_v3_cuda_rms_norm; + break; + case GGML_V3_OP_MUL_MAT: + if (!any_on_device && !ggml_v3_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) { + return false; + } + func = ggml_v3_cuda_mul_mat; + break; + case GGML_V3_OP_MUL_MAT_ID: + if (!any_on_device && !ggml_v3_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) { + return false; + } + func = ggml_v3_cuda_mul_mat_id; + break; + case GGML_V3_OP_SCALE: + func = ggml_v3_cuda_scale; + break; + case GGML_V3_OP_SQR: + func = ggml_v3_cuda_sqr; + break; + case GGML_V3_OP_CLAMP: + func = ggml_v3_cuda_clamp; + break; + case GGML_V3_OP_CPY: + func = ggml_v3_cuda_cpy; + break; + case GGML_V3_OP_CONT: + func = ggml_v3_cuda_dup; + break; + case GGML_V3_OP_NONE: + case GGML_V3_OP_RESHAPE: + case GGML_V3_OP_VIEW: + case GGML_V3_OP_PERMUTE: + case GGML_V3_OP_TRANSPOSE: + func = ggml_v3_cuda_nop; + break; + case GGML_V3_OP_DIAG_MASK_INF: + func = ggml_v3_cuda_diag_mask_inf; + break; + case GGML_V3_OP_SOFT_MAX: + func = ggml_v3_cuda_soft_max; + break; + case GGML_V3_OP_ROPE: + func = ggml_v3_cuda_rope; + break; + case GGML_V3_OP_ALIBI: + func = ggml_v3_cuda_alibi; + break; + case GGML_V3_OP_IM2COL: + func = ggml_v3_cuda_im2col; + break; + case GGML_V3_OP_SUM_ROWS: + func = ggml_v3_cuda_sum_rows; + break; + case GGML_V3_OP_ARGSORT: + func = ggml_v3_cuda_argsort; + break; + default: + return false; + } + + if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_V3_BACKEND_GPU_SPLIT) { + ggml_v3_cuda_set_peer_access(tensor->src[1]->ne[1]); + } + + if (params->ith != 0) { + return true; + } + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return true; + } + func(tensor->src[0], tensor->src[1], tensor); + return true; +} + +int ggml_v3_cuda_get_device_count() { + int device_count; + if (cudaGetDeviceCount(&device_count) != cudaSuccess) { + return 0; + } + return device_count; +} + +void ggml_v3_cuda_get_device_description(int device, char * description, size_t description_size) { + cudaDeviceProp prop; + CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); + snprintf(description, description_size, "%s", prop.name); +} + +//////////////////////////////////////////////////////////////////////////////// diff --git a/otherarch/ggml_v3-cuda.h b/otherarch/ggml_v3-cuda.h new file mode 100644 index 000000000..976dfb344 --- /dev/null +++ b/otherarch/ggml_v3-cuda.h @@ -0,0 +1,53 @@ +#pragma once + +#include "ggml_v3.h" + +#ifdef GGML_USE_HIPBLAS +#define GGML_V3_CUDA_NAME "ROCm" +#define GGML_V3_CUBLAS_NAME "hipBLAS" +#else +#define GGML_V3_CUDA_NAME "CUDA" +#define GGML_V3_CUBLAS_NAME "cuBLAS" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_V3_CUDA_MAX_DEVICES 16 + +// Always success. To check if CUDA is actually loaded, use `ggml_v3_cublas_loaded`. +GGML_V3_API void ggml_v3_init_cublas(void); + +// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`. 
+GGML_V3_API bool ggml_v3_cublas_loaded(void); + +GGML_V3_API void * ggml_v3_cuda_host_malloc(size_t size); +GGML_V3_API void ggml_v3_cuda_host_free(void * ptr); + +GGML_V3_API bool ggml_v3_cuda_can_mul_mat(const struct ggml_v3_tensor * src0, const struct ggml_v3_tensor * src1, struct ggml_v3_tensor * dst); +GGML_V3_API void ggml_v3_cuda_set_tensor_split(const float * tensor_split); +GGML_V3_API void ggml_v3_cuda_transform_tensor(void * data, struct ggml_v3_tensor * tensor); +GGML_V3_API void ggml_v3_cuda_free_data(struct ggml_v3_tensor * tensor); + +GGML_V3_API void ggml_v3_cuda_assign_buffers(struct ggml_v3_tensor * tensor); +GGML_V3_API void ggml_v3_cuda_assign_buffers_no_scratch(struct ggml_v3_tensor * tensor); +GGML_V3_API void ggml_v3_cuda_assign_buffers_force_inplace(struct ggml_v3_tensor * tensor); + +GGML_V3_API void ggml_v3_cuda_assign_buffers_no_alloc(struct ggml_v3_tensor * tensor); +GGML_V3_API void ggml_v3_cuda_assign_scratch_offset(struct ggml_v3_tensor * tensor, size_t offset); +GGML_V3_API void ggml_v3_cuda_copy_to_device(struct ggml_v3_tensor * tensor); + +GGML_V3_API void ggml_v3_cuda_set_main_device(int main_device); +GGML_V3_API void ggml_v3_cuda_set_mul_mat_q(bool mul_mat_q); +GGML_V3_API void ggml_v3_cuda_set_scratch_size(size_t scratch_size); +GGML_V3_API void ggml_v3_cuda_free_scratch(void); +GGML_V3_API bool ggml_v3_cuda_compute_forward(struct ggml_v3_compute_params * params, struct ggml_v3_tensor * tensor); + +GGML_V3_API int ggml_v3_cuda_get_device_count(void); +GGML_V3_API void ggml_v3_cuda_get_device_description(int device, char * description, size_t description_size); + + +#ifdef __cplusplus +} +#endif diff --git a/otherarch/ggml_v3-opencl.cpp b/otherarch/ggml_v3-opencl.cpp new file mode 100644 index 000000000..ac218bb0e --- /dev/null +++ b/otherarch/ggml_v3-opencl.cpp @@ -0,0 +1,1908 @@ +#include "ggml_v3.h" +#include "ggml_v3-opencl.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#define CL_TARGET_OPENCL_VERSION 110 +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +#define CL_DMMV_LOCAL_SIZE 32 + +#ifndef K_QUANTS_PER_ITERATION +#define K_QUANTS_PER_ITERATION 1 +#else +static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2"); +#endif + +#define MULTILINE_QUOTE(...) 
#__VA_ARGS__ +static std::string program_source = MULTILINE_QUOTE( + +typedef char int8_t; +typedef uchar uint8_t; +typedef short int16_t; +typedef ushort uint16_t; +typedef int int32_t; +typedef uint uint32_t; + +struct __attribute__ ((packed)) block_q4_0 +{ + half d; + uint8_t qs[QK4_0 / 2]; +}; + +struct __attribute__ ((packed)) block_q4_1 +{ + half d; + half m; + uint8_t qs[QK4_1 / 2]; +}; + +struct __attribute__ ((packed)) block_q5_0 +{ + half d; + uint32_t qh; + uint8_t qs[QK5_0 / 2]; +}; + +struct __attribute__ ((packed)) block_q5_1 +{ + half d; + half m; + uint32_t qh; + uint8_t qs[QK5_1 / 2]; +}; + +struct __attribute__ ((packed)) block_q8_0 +{ + half d; + int8_t qs[QK8_0]; +}; + +struct __attribute__((packed)) block_q2_K +{ + uint8_t scales[16]; + uint8_t qs[64]; + half d; + half dmin; +}; + +struct __attribute__((packed)) block_q3_K +{ + uint8_t hmask[32]; + uint8_t qs[64]; + uint8_t scales[12]; + half d; +}; + +struct __attribute__((packed)) block_q4_K +{ + half d; + half dmin; + uint8_t scales[12]; + uint8_t qs[128]; +}; + +struct __attribute__((packed)) block_q5_K +{ + half d; + half dmin; + uint8_t scales[12]; + uint8_t qh[32]; + uint8_t qs[128]; +}; + +struct __attribute__((packed)) block_q6_K +{ + uint8_t ql[128]; + uint8_t qh[64]; + int8_t scales[16]; + half d; +}; + +__kernel void convert_fp16_to_fp32(__global half* x, __global float* y) { + const uint i = get_global_id(0); + + y[i] = vload_half(0, &x[i]); +} + +void dequantize_q4_0(__global const struct block_q4_0* x, const int ib, const int iqs, float* v0, float* v1) { + const float d = vload_half(0, &x[ib].d); + + const uint8_t vui = x[ib].qs[iqs]; + + const int8_t vi0 = vui & 0xF; + const int8_t vi1 = vui >> 4; + + *v0 = (vi0 - 8)*d; + *v1 = (vi1 - 8)*d; +} +void dequantize_q4_1(__global const struct block_q4_1* x, const int ib, const int iqs, float* v0, float* v1) { + const float d = vload_half(0, &x[ib].d); + const float m = vload_half(0, &x[ib].m); + + const uint8_t vui = x[ib].qs[iqs]; + + const int8_t vi0 = vui & 0xF; + const int8_t vi1 = vui >> 4; + + *v0 = vi0*d + m; + *v1 = vi1*d + m; +} +void dequantize_q5_0(__global const struct block_q5_0* x, const int ib, const int iqs, float* v0, float* v1) { + const float d = vload_half(0, &x[ib].d); + + uint32_t qh = x[ib].qh; + + const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16; + const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1) - 16; + + *v0 = x0*d; + *v1 = x1*d; +} +void dequantize_q5_1(__global const struct block_q5_1* x, const int ib, const int iqs, float* v0, float* v1) { + const float d = vload_half(0, &x[ib].d); + const float m = vload_half(0, &x[ib].m); + + uint32_t qh = x[ib].qh; + + const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0); + const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1); + + *v0 = x0*d + m; + *v1 = x1*d + m; +} +void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const int iqs, float* v0, float* v1) { + const float d = vload_half(0, &x[ib].d); + + const int8_t vi0 = x[ib].qs[iqs + 0]; + const int8_t vi1 = x[ib].qs[iqs + 1]; + + *v0 = vi0*d; + *v1 = vi1*d; +} +void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){ + *v0 = vload_half(0, &x[ib + 0]); + *v1 = vload_half(0, &x[ib + 1]); +} +); + +static std::string k_quants_source = MULTILINE_QUOTE( +inline void 
get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m) +{ + if (j < 4) + { + *d = q[j] & 63; + *m = q[j + 4] & 63; + } + else + { + *d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4); + *m = (q[j + 4] >> 4) | ((q[j - 0] >> 6) << 4); + } +} + +__kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __global float *yy) +{ + const int i = get_group_id(0) + get_global_offset(0); + const int tid = get_local_id(0); + const int n = tid / 32; + const int l = tid - 32 * n; + const int is = 8 * n + l / 16; + + const uint8_t q = x[i].qs[32 * n + l]; + __global float *y = yy + get_group_id(0) * QK_K + 128 * n; + + const float dall = vload_half(0, &x[i].d); + const float dmin = vload_half(0, &x[i].dmin); + + y[l + 0] = dall * (x[i].scales[is + 0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is + 0] >> 4); + y[l + 32] = dall * (x[i].scales[is + 2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is + 2] >> 4); + y[l + 64] = dall * (x[i].scales[is + 4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is + 4] >> 4); + y[l + 96] = dall * (x[i].scales[is + 6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is + 6] >> 4); +} + +__kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __global float *yy) +{ + int r = get_local_id(0) / 4; + int i = get_group_id(0) + get_global_offset(0); + int tid = r / 2; + int is0 = r % 2; + int l0 = 16 * is0 + 4 * (get_local_id(0) % 4); + int n = tid / 4; + int j = tid - 4 * n; + + uint8_t m = 1 << (4 * n + j); + int is = 8 * n + 2 * j + is0; + int shift = 2 * j; + + int8_t us = is < 4 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 8] >> 0) & 3) << 4) + : is < 8 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 4] >> 2) & 3) << 4) + : is < 12 ? (x[i].scales[is - 8] >> 4) | (((x[i].scales[is + 0] >> 4) & 3) << 4) + : (x[i].scales[is - 8] >> 4) | (((x[i].scales[is - 4] >> 6) & 3) << 4); + float d_all = vload_half(0, &x[i].d); + float dl = d_all * (us - 32); + + __global float *y = yy + get_group_id(0) * QK_K + 128 * n + 32 * j; + const __global uint8_t *q = x[i].qs + 32 * n; + const __global uint8_t *hm = x[i].hmask; + + for (int l = l0; l < l0 + 4; ++l) + y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 
0 : 4)); +} + +__kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __global float *yy) +{ + const int i = get_group_id(0) + get_global_offset(0); + const int tid = get_local_id(0); + const int il = tid / 8; + const int ir = tid % 8; + const int is = 2 * il; + const int n = 4; + + __global float *y = yy + get_group_id(0) * QK_K + 64 * il + n * ir; + + const float dall = vload_half(0, &x[i].d); + const float dmin = vload_half(0, &x[i].dmin); + + __global const uint8_t *q = x[i].qs + 32 * il + n * ir; + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[i].scales, &sc, &m); + float d1 = dall * sc; + float m1 = dmin * m; + get_scale_min_k4(is + 1, x[i].scales, &sc, &m); + float d2 = dall * sc; + float m2 = dmin * m; + for (int l = 0; l < n; ++l) + { + y[l + 0] = d1 * (q[l] & 0xF) - m1; + y[l + 32] = d2 * (q[l] >> 4) - m2; + } +} + +__kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __global float *yy) +{ + const int i = get_group_id(0) + get_global_offset(0); + const int tid = get_local_id(0); + const int il = tid / 16; + const int ir = tid % 16; + const int is = 2 * il; + + __global float *y = yy + get_group_id(0) * QK_K + 64 * il + 2 * ir; + + const float dall = vload_half(0, &x[i].d); + const float dmin = vload_half(0, &x[i].dmin); + + __global const uint8_t *ql = x[i].qs + 32 * il + 2 * ir; + __global const uint8_t *qh = x[i].qh + 2 * ir; + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[i].scales, &sc, &m); + const float d1 = dall * sc; + const float m1 = dmin * m; + get_scale_min_k4(is + 1, x[i].scales, &sc, &m); + const float d2 = dall * sc; + const float m2 = dmin * m; + + uint8_t hm = 1 << (2 * il); + y[0] = d1 * ((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0)) - m1; + y[1] = d1 * ((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0)) - m1; + hm <<= 1; + y[32] = d2 * ((ql[0] >> 4) + (qh[0] & hm ? 16 : 0)) - m2; + y[33] = d2 * ((ql[1] >> 4) + (qh[1] & hm ? 16 : 0)) - m2; +} + +__kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __global float *yy) +{ + const int i = get_group_id(0) + get_global_offset(0); + const int tid = get_local_id(0); + const int ip = tid / 32; + const int il = tid - 32 * ip; + const int is = 8 * ip + il / 16; + + __global float *y = yy + get_group_id(0) * QK_K + 128 * ip + il; + + const float d = vload_half(0, &x[i].d); + + __global const uint8_t *ql = x[i].ql + 64 * ip + il; + const uint8_t qh = x[i].qh[32 * ip + il]; + __global const int8_t *sc = x[i].scales + is; + + y[0] = d * sc[0] * ((int8_t)((ql[0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32); + y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); + y[64] = d * sc[4] * ((int8_t)((ql[0] >> 4) | (((qh >> 4) & 3) << 4)) - 32); + y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); +} + +__kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { + + const int row = get_group_id(0); + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row + get_global_offset(0); + + __global const struct block_q2_K * x = xx + ib0; + + const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...15 + const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int step = 16/K_QUANTS_PER_ITERATION; + + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
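// Editor note (illustrative, not part of the original patch): a worked example of the
// index arithmetic in this kernel, assuming K_QUANTS_PER_ITERATION == 2 and a
// work-group of CL_DMMV_LOCAL_SIZE == 32 work-items. For get_local_id(0) == 13:
//   tid = 13/2 = 6,  ix = 13%2 = 1,  step = 16/2 = 8,  im = 6/8 = 0
// and, continuing with the lines below,
//   in = 6,  l0 = 2*6 = 12,  q_offset = 12,  s_offset = 0,  y_offset = 12
// so this work-item reads bytes 12..13 (and 28..29 via the +16 offsets) of the qs[]
// data of every second block (i = 1, 3, 5, ...), accumulating into tmp[16*ix + tid]
// = tmp[22]; the partial sums are then combined by the local-memory reduction at the
// end of the kernel.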
+ const int in = tid - step*im; // 0...15 or 0...7 + + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2 + const int q_offset = 32*im + l0; + const int s_offset = 8*im; + const int y_offset = 128*im + l0; + + tmp[16 * ix + tid] = 0; + + uint32_t aux[4]; + const uint8_t * d = (const uint8_t *)aux; + const uint8_t * m = (const uint8_t *)(aux + 2); + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + __global const float * y = yy + i * QK_K + y_offset; + __global const uint8_t * q = x[i].qs + q_offset; + + const float dall = vload_half(0, &x[i].d); + const float dmin = vload_half(0, &x[i].dmin); + + __global const uint32_t * a = (__global const uint32_t *)(x[i].scales + s_offset); + aux[0] = a[0] & 0x0f0f0f0f; + aux[1] = a[1] & 0x0f0f0f0f; + aux[2] = (a[0] >> 4) & 0x0f0f0f0f; + aux[3] = (a[1] >> 4) & 0x0f0f0f0f; + + float sum1 = 0, sum2 = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3) + + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3) + + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3) + + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3) + + y[l+16] * d[1] * ((q[l+16] >> 0) & 3) + + y[l+48] * d[3] * ((q[l+16] >> 2) & 3) + + y[l+80] * d[5] * ((q[l+16] >> 4) & 3) + +y[l+112] * d[7] * ((q[l+16] >> 6) & 3); + sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6] + + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7]; + + } + tmp[16 * ix + tid] += dall * sum1 - dmin * sum2; + + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=16; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +__kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { + const uint16_t kmask1 = 0x0303; + const uint16_t kmask2 = 0x0f0f; + + const int row = get_group_id(0); + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row + get_global_offset(0); + + __global const struct block_q3_K * x = xx + ib0; + + const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop + const int step = 16/K_QUANTS_PER_ITERATION; + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
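// Editor note (not part of the original patch): the 12 `scales` bytes of a block_q3_K
// pack sixteen 6-bit scales. The utmp[]/kmask1/kmask2 code further down in this kernel
// reassembles each 6-bit value from its low 4 bits and high 2 bits, and the dot
// product then uses (s[j] - 32) so the scales act as signed values centred on zero.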
+ const int in = tid - step*im; // 0....15 or 0...7 + + const uint8_t m = 1 << (4*im); + + const int l0 = n*in; // 0...15 or 0...14 in steps of 2 + const int q_offset = 32*im + l0; + const int y_offset = 128*im + l0; + + uint16_t utmp[4]; + const int8_t * s = (const int8_t *)utmp; + + const uint16_t s_shift = 4*im; + + tmp[16 * ix + tid] = 0; + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + __global const float * y = yy + i * QK_K + y_offset; + __global const uint8_t * q = x[i].qs + q_offset; + __global const uint8_t * h = x[i].hmask + l0; + + __global const uint16_t * a = (__global const uint16_t *)x[i].scales; + utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4); + utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4); + utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4); + utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4); + + const float d = vload_half(0, &x[i].d); + + float sum = 0; + for (int l = 0; l < n; ++l) { + sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4)) + + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4)) + + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4)) + + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4)); + sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4)) + + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4)) + + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4)) + + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4)); + } + tmp[16 * ix + tid] += d * sum; + + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=16; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +__kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { + + //to rename it later, just to test now + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int row = get_group_id(0); + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row + get_global_offset(0); + + const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...15 + const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; + + const int step = 8/K_QUANTS_PER_ITERATION; + + const int il = tid/step; // 0...3 + const int ir = tid - step*il;// 0...3 + const int n = 2*K_QUANTS_PER_ITERATION; + + const int im = il/2; // 0 or 1. 
0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 + const int in = il%2; + + const int l0 = n*(2*ir + in); + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + + __global const struct block_q4_K * x = xx + ib0; + + tmp[16 * ix + tid] = 0; + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + __global const uint8_t * q1 = x[i].qs + q_offset; + __global const uint8_t * q2 = q1 + 64; + __global const float * y1 = yy + i*QK_K + y_offset; + __global const float * y2 = y1 + 128; + + const float dall = vload_half(0, &x[i].d); + const float dmin = vload_half(0, &x[i].dmin); + + __global const uint16_t * a = (__global const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + + float4 s = (float4)(0.f); + float smin = 0; + for (int l = 0; l < n; ++l) { + s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4); + s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4); + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp[16 * ix + tid] += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin; + + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=16; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +__kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { + + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int row = get_group_id(0); + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row + get_global_offset(0); + + const int tid = get_local_id(0)/2; // 0...15 + const int ix = get_local_id(0)%2; + + const int il = tid/4; // 0...3 + const int ir = tid - 4*il;// 0...3 + const int n = 2; + + const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 + const int in = il%2; + + const int l0 = n*(2*ir + in); + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + const uint8_t hm1 = 1 << (2*im); + const uint8_t hm2 = hm1 << 4; + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + + __global const struct block_q5_K * x = xx + ib0; + + tmp[16 * ix + tid] = 0; + + for (int i = ix; i < num_blocks_per_row; i += 2) { + + __global const uint8_t * ql1 = x[i].qs + q_offset; + __global const uint8_t * ql2 = ql1 + 64; + __global const uint8_t * qh = x[i].qh + l0; + __global const float * y1 = yy + i*QK_K + y_offset; + __global const float * y2 = y1 + 128; + + const float dall = vload_half(0, &x[i].d); + const float dmin = vload_half(0, &x[i].dmin); + + __global const uint16_t * a = (__global const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + + float4 sum = (float4)(0.f); + float smin = 0; + for (int l = 0; l < n; ++l) { + sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0)) + + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 
16 : 0)); + sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0)) + + y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0)); + sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0)) + + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0)); + sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0)) + + y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0)); + smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3] + + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7]; + } + tmp[16 * ix + tid] += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin; + + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=16; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +__kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, __local float* tmp, __global const float * yy, __global float * dst, const int ncols) { + + const int row = get_group_id(0); + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row + get_global_offset(0); + + __global const struct block_q6_K * x = xx + ib0; + + const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0, 1 + + const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 + + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... + const int in = tid - step*im; // 0...15 or 0...7 + +\n#if K_QUANTS_PER_ITERATION == 1\n + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 + const int is = 0; + +\n#else\n + + const int l0 = 4 * in; // 0, 4, 8, ..., 28 + const int is = in / 4; + +\n#endif\n + + const int ql_offset = 64*im + l0; + const int qh_offset = 32*im + l0; + const int s_offset = 8*im + is; + const int y_offset = 128*im + l0; + + tmp[16 * ix + tid] = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + __global const float * y = yy + i * QK_K + y_offset; + __global const uint8_t * ql = x[i].ql + ql_offset; + __global const uint8_t * qh = x[i].qh + qh_offset; + __global const int8_t * s = x[i].scales + s_offset; + + const float d = vload_half(0, &x[i].d); + +\n#if K_QUANTS_PER_ITERATION == 1\n + float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) + + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) + + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) + + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32) + + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32) + + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32) + + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) + +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); + tmp[16 * ix + tid] += sum; +\n#else\n + float sum = 0; + for (int l = 0; l < 4; ++l) { + sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) + + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32) + + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32) + + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); + } + tmp[16 * 
ix + tid] += sum; +\n#endif\n + + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=16; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +); + + +static std::string dequant_template = MULTILINE_QUOTE( +__kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) { + const int i = get_group_id(0)*get_local_size(0) + get_local_id(0)*2; + + if (i >= get_global_size(0)) { + return; + } + + const uint qk = QUANT_K; + const uint qr = QUANT_R; + + const int ib = i/qk + get_global_offset(0); // block index + const int iqs = (i%qk)/qr; // quant index + const int iybs = i - i%qk; // y block start index + const int y_offset = qr == 1 ? 1 : qk/2; + + // dequantize + float v0, v1; + DEQUANT_FUNC(x, ib, iqs, &v0, &v1); + y[iybs + iqs + 0] = v0; + y[iybs + iqs + y_offset] = v1; +} +); + +static std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE( +__kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { + const int local_size = get_local_size(0); + const int row = get_group_id(0); + const int tid = get_local_id(0); + + const uint qk = QUANT_K; + const uint qr = QUANT_R; + + const int col_step = local_size * 2; + const int y_offset = qr == 1 ? 1 : qk/2; + + x += get_global_offset(0); + + tmp[tid] = 0; + + for (int col = tid*2; col < ncols; col += col_step) { + const int ib = (row*ncols + col)/qk; // block index + const int iqs = (col%qk)/qr; // quant index + const int iybs = col - col%qk; // y block start index + + // dequantize + float v0, v1; + DEQUANT_FUNC(x, ib, iqs, &v0, &v1); + + // matrix multiplication + tmp[tid] += v0 * y[iybs + iqs + 0]; + tmp[tid] += v1 * y[iybs + iqs + y_offset]; + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=local_size/2; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} +); + + +static std::string mul_template = MULTILINE_QUOTE( +__kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) { + const int i = get_group_id(0)*get_local_size(0) + get_local_id(0); + + if (i >= get_global_size(0)) { + return; + } + + dst[dst_offset + i] = x[x_offset + i] * y[y_offset + i%ky]; +} +); + +#define CL_CHECK(err) \ + do { \ + cl_int err_ = (err); \ + if (err_ != CL_SUCCESS) { \ + fprintf(stderr, "ggml_v3_opencl: %s error %d at %s:%d\n", \ + #err, err_, __FILE__, __LINE__); \ + fprintf(stderr, "You may be out of VRAM. Please check if you have enough.\n");\ + exit(1); \ + } \ + } while (0) + +#define CLBLAST_CHECK(err) \ + do { \ + CLBlastStatusCode err_ = (err); \ + if (err_ != CLBlastSuccess) { \ + fprintf(stderr, "ggml_v3_opencl: %s error %d at %s:%d\n", \ + #err, err_, __FILE__, __LINE__); \ + fprintf(stderr, "You may be out of VRAM. 
Please check if you have enough.\n");\ + exit(1); \ + } \ + } while (0) + +static std::array dequant_str_keys = { + "KERNEL_NAME", "X_TYPE", "QUANT_K", "QUANT_R", "DEQUANT_FUNC" +}; + +static std::array dequant_str_values = { + "dequantize_row_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0", + "dequantize_row_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1", + "dequantize_row_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0", + "dequantize_row_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1", + "dequantize_row_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0", + "convert_row_f16", "half", "1", "1", "convert_f16" +}; + +static std::array dequant_mul_mat_vec_str_values = { + "dequantize_mul_mat_vec_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0", + "dequantize_mul_mat_vec_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1", + "dequantize_mul_mat_vec_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0", + "dequantize_mul_mat_vec_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1", + "dequantize_mul_mat_vec_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0", + "convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16" +}; + +static std::array mul_str_keys = { + "KERNEL_NAME", "TYPE" +}; +static std::array mul_str_values = { + "mul_f32", "float" +}; + +static std::string& replace(std::string& s, const std::string& from, const std::string& to) { + size_t pos = 0; + while ((pos = s.find(from, pos)) != std::string::npos) { + s.replace(pos, from.length(), to); + pos += to.length(); + } + return s; +} + +static std::string generate_kernels() { + std::stringstream src; + src << program_source << '\n'; + src << k_quants_source << '\n'; + for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) { + std::string dequant_kernel = dequant_template; + std::string dmmv_kernel = dequant_mul_mat_vec_template; + for (size_t j = 0; j < dequant_str_keys.size(); j++) { + replace(dequant_kernel, dequant_str_keys[j], dequant_str_values[i + j]); + replace(dmmv_kernel, dequant_str_keys[j], dequant_mul_mat_vec_str_values[i + j]); + } + src << dequant_kernel << '\n'; + src << dmmv_kernel << '\n'; + } + for (size_t i = 0; i < mul_str_values.size(); i += mul_str_keys.size()) { + std::string mul_kernel = mul_template; + for (size_t j = 0; j < mul_str_keys.size(); j++) { + replace(mul_kernel, mul_str_keys[j], mul_str_values[i + j]); + } + src << mul_kernel << '\n'; + } + + return src.str(); +} + +static cl_platform_id platform; +static cl_device_id device; +static cl_context context; +static cl_command_queue queue; +static cl_program program; +static cl_kernel convert_row_f16_cl; +static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl; +static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl; +static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q5_k_cl, dequantize_block_q6_k_cl; +static cl_kernel dequantize_mul_mat_vec_q2_K_cl, dequantize_mul_mat_vec_q3_K_cl, dequantize_mul_mat_vec_q4_K_cl, dequantize_mul_mat_vec_q5_K_cl, dequantize_mul_mat_vec_q6_K_cl; +static cl_kernel mul_f32_cl; +static bool fp16_support; + +static cl_program build_program_from_source(cl_context ctx, 
cl_device_id dev, const char* program_buffer) { + cl_program p; + char *program_log; + size_t program_size; + size_t log_size; + int err; + + program_size = strlen(program_buffer); + + p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err); + if(err < 0) { + fprintf(stderr, "OpenCL error creating program"); + exit(1); + } + + std::string compile_opts = "-cl-mad-enable -cl-unsafe-math-optimizations -cl-finite-math-only -cl-fast-relaxed-math " + "-DQK4_0=32 -DQR4_0=2 -DQK4_1=32 -DQR4_1=2 -DQK5_0=32 -DQR5_0=2 -DQK5_1=32 -DQR5_1=2 -DQK8_0=32 -DQR8_0=1 " + "-DQK_K=256 -DK_QUANTS_PER_ITERATION=" + std::to_string(K_QUANTS_PER_ITERATION); + + err = clBuildProgram(p, 0, NULL, compile_opts.c_str(), NULL, NULL); + if(err < 0) { + + clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + program_log = (char*) malloc(log_size + 1); + program_log[log_size] = '\0'; + clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL); + fprintf(stderr, "ggml_v3_opencl: kernel compile error:\n\n%s\n", program_log); + free(program_log); + exit(1); + } + + return p; +} + +void ggml_v3_cl_init(void) { + cl_int err; + + struct cl_device; + struct cl_platform { + cl_platform_id id; + unsigned number; + char name[128]; + char vendor[128]; + struct cl_device * devices; + unsigned n_devices; + struct cl_device * default_device; + }; + + struct cl_device { + struct cl_platform * platform; + cl_device_id id; + unsigned number; + cl_device_type type; + char name[128]; + }; + + enum { NPLAT = 16, NDEV = 16 }; + + struct cl_platform platforms[NPLAT]; + unsigned n_platforms = 0; + struct cl_device devices[NDEV]; + unsigned n_devices = 0; + struct cl_device * default_device = NULL; + + platform = NULL; + device = NULL; + + cl_platform_id platform_ids[NPLAT]; + CL_CHECK(clGetPlatformIDs(NPLAT, platform_ids, &n_platforms)); + + for (unsigned i = 0; i < n_platforms; i++) { + struct cl_platform * p = &platforms[i]; + p->number = i; + p->id = platform_ids[i]; + CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_NAME, sizeof(p->name), &p->name, NULL)); + CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_VENDOR, sizeof(p->vendor), &p->vendor, NULL)); + + cl_device_id device_ids[NDEV]; + cl_int clGetDeviceIDsError = clGetDeviceIDs(p->id, CL_DEVICE_TYPE_ALL, NDEV, device_ids, &p->n_devices); + if (clGetDeviceIDsError == CL_DEVICE_NOT_FOUND) { + p->n_devices = 0; + } else { + CL_CHECK(clGetDeviceIDsError); + } + p->devices = p->n_devices > 0 ? 
&devices[n_devices] : NULL; + p->default_device = NULL; + + for (unsigned j = 0; j < p->n_devices; j++) { + struct cl_device * d = &devices[n_devices]; + d->number = n_devices++; + d->id = device_ids[j]; + d->platform = p; + CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL)); + CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL)); + printf("\nPlatform:%d Device:%d - %s with %s",i,j,p->name,d->name); + + if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) { + p->default_device = d; + } + } + + if (default_device == NULL && p->default_device != NULL) { + default_device = p->default_device; + } + } + + printf("\n\n"); + + if (n_devices == 0) { + fprintf(stderr, "ggml_v3_opencl: could find any OpenCL devices.\n"); + exit(1); + } + + char * user_platform_string = getenv("GGML_OPENCL_PLATFORM"); + char * user_device_string = getenv("GGML_OPENCL_DEVICE"); + int user_platform_number = -1; + int user_device_number = -1; + + unsigned n; + if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) { + user_platform_number = (int)n; + } + if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < n_devices) { + user_device_number = (int)n; + } + if (user_platform_number != -1 && user_device_number != -1) { + cl_platform* platform = &platforms[user_platform_number]; + if ((unsigned)user_device_number >= platform->n_devices) { + fprintf(stderr, "ggml_v3_opencl: invalid device number %d\n", user_device_number); + exit(1); + } + default_device = &platform->devices[user_device_number]; + } else { + + struct cl_device * selected_devices = devices; + unsigned n_selected_devices = n_devices; + + if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) { + for (unsigned i = 0; i < n_platforms; i++) { + struct cl_platform * p = &platforms[i]; + if (strstr(p->name, user_platform_string) != NULL || + strstr(p->vendor, user_platform_string) != NULL) { + user_platform_number = (int)i; + break; + } + } + if (user_platform_number == -1) { + fprintf(stderr, "ggml_v3_opencl: no platform matching '%s' was found.\n", user_platform_string); + exit(1); + } + } + if (user_platform_number != -1) { + struct cl_platform * p = &platforms[user_platform_number]; + selected_devices = p->devices; + n_selected_devices = p->n_devices; + default_device = p->default_device; + if (n_selected_devices == 0) { + fprintf(stderr, "ggml_v3_opencl: selected platform '%s' does not have any devices.\n", p->name); + exit(1); + } + } + + if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) { + for (unsigned i = 0; i < n_selected_devices; i++) { + struct cl_device * d = &selected_devices[i]; + if (strstr(d->name, user_device_string) != NULL) { + user_device_number = d->number; + break; + } + } + if (user_device_number == -1) { + fprintf(stderr, "ggml_v3_opencl: no device matching '%s' was found.\n", user_device_string); + exit(1); + } + } + if (user_device_number != -1) { + selected_devices = &devices[user_device_number]; + n_selected_devices = 1; + default_device = &selected_devices[0]; + } + + GGML_V3_ASSERT(n_selected_devices > 0); + + if (default_device == NULL) { + default_device = &selected_devices[0]; + } + } + + fprintf(stderr, "ggml_v3_opencl: selecting platform: '%s'\n", default_device->platform->name); + fprintf(stderr, "ggml_v3_opencl: selecting device: '%s'\n", default_device->name); + if (default_device->type != 
CL_DEVICE_TYPE_GPU) { + fprintf(stderr, "ggml_v3_opencl: warning, not a GPU: '%s'.\n", default_device->name); + } + + platform = default_device->platform->id; + device = default_device->id; + + size_t ext_str_size; + clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size); + char *ext_buffer = (char *)alloca(ext_str_size + 1); + clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL); + ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated + // Check if ext_buffer contains cl_khr_fp16 + fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL; + fprintf(stderr, "ggml_v3_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false"); + fp16_support = false; + printf("CL FP16 temporarily disabled pending further optimization.\n"); + + cl_context_properties properties[] = { + (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0 + }; + + CL_CHECK((context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err)); + + CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err), + (err != CL_INVALID_QUEUE_PROPERTIES && err != CL_INVALID_VALUE ? err : + (queue = clCreateCommandQueue(context, device, 0, &err), err) + ))); + + const std::string kernel_src = generate_kernels(); + + program = build_program_from_source(context, device, kernel_src.c_str()); + + // FP16 to FP32 kernel + CL_CHECK((convert_row_f16_cl = clCreateKernel(program, "convert_row_f16", &err), err)); + + // Dequantize kernels + CL_CHECK((dequantize_row_q4_0_cl = clCreateKernel(program, "dequantize_row_q4_0", &err), err)); + CL_CHECK((dequantize_row_q4_1_cl = clCreateKernel(program, "dequantize_row_q4_1", &err), err)); + CL_CHECK((dequantize_row_q5_0_cl = clCreateKernel(program, "dequantize_row_q5_0", &err), err)); + CL_CHECK((dequantize_row_q5_1_cl = clCreateKernel(program, "dequantize_row_q5_1", &err), err)); + CL_CHECK((dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err), err)); + CL_CHECK((dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err), err)); + CL_CHECK((dequantize_block_q2_k_cl = clCreateKernel(program, "dequantize_block_q2_K", &err), err)); + CL_CHECK((dequantize_block_q3_k_cl = clCreateKernel(program, "dequantize_block_q3_K", &err), err)); + CL_CHECK((dequantize_block_q4_k_cl = clCreateKernel(program, "dequantize_block_q4_K", &err), err)); + CL_CHECK((dequantize_block_q5_k_cl = clCreateKernel(program, "dequantize_block_q5_K", &err), err)); + CL_CHECK((dequantize_block_q6_k_cl = clCreateKernel(program, "dequantize_block_q6_K", &err), err)); + + // dequant mul mat kernel + CL_CHECK((dequantize_mul_mat_vec_q4_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_0", &err), err)); + CL_CHECK((dequantize_mul_mat_vec_q4_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_1", &err), err)); + CL_CHECK((dequantize_mul_mat_vec_q5_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_0", &err), err)); + CL_CHECK((dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err), err)); + CL_CHECK((dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err), err)); + CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err)); + CL_CHECK((dequantize_mul_mat_vec_q2_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q2_K", &err), err)); + CL_CHECK((dequantize_mul_mat_vec_q3_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q3_K", &err), err)); + 
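// A note on the pattern used for every kernel created above and below: clCreateKernel
// reports its status through the last out-parameter rather than the return value, so each
// call is wrapped in CL_CHECK with the comma operator - the assignment runs first and the
// parenthesised expression evaluates to err, which the macro (defined earlier in this file)
// then tests, printing a message and exiting on failure. A minimal sketch of the idiom,
// using a hypothetical kernel name:
//
//     cl_int err;
//     cl_kernel k;
//     CL_CHECK((k = clCreateKernel(program, "my_kernel", &err), err));  // value of (a, b) is b
//
// The same wrapping is applied to clCreateContext and clCreateCommandQueue earlier in
// ggml_v3_cl_init.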
CL_CHECK((dequantize_mul_mat_vec_q4_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_K", &err), err)); + CL_CHECK((dequantize_mul_mat_vec_q5_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_K", &err), err)); + CL_CHECK((dequantize_mul_mat_vec_q6_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q6_K", &err), err)); + + // mul kernel + CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err)); +} + +static cl_kernel* ggml_v3_get_to_fp32_cl(ggml_v3_type type) { + switch (type) { + case GGML_V3_TYPE_Q4_0: + return &dequantize_row_q4_0_cl; + case GGML_V3_TYPE_Q4_1: + return &dequantize_row_q4_1_cl; + case GGML_V3_TYPE_Q5_0: + return &dequantize_row_q5_0_cl; + case GGML_V3_TYPE_Q5_1: + return &dequantize_row_q5_1_cl; + case GGML_V3_TYPE_Q8_0: + return &dequantize_row_q8_0_cl; + case GGML_V3_TYPE_Q2_K: + return &dequantize_block_q2_k_cl; + case GGML_V3_TYPE_Q3_K: + return &dequantize_block_q3_k_cl; + case GGML_V3_TYPE_Q4_K: + return &dequantize_block_q4_k_cl; + case GGML_V3_TYPE_Q5_K: + return &dequantize_block_q5_k_cl; + case GGML_V3_TYPE_Q6_K: + return &dequantize_block_q6_k_cl; + case GGML_V3_TYPE_F16: + return &convert_row_f16_cl; + default: + return nullptr; + } +} + +static size_t ggml_v3_cl_global_denom(ggml_v3_type type) { + switch (type) { + case GGML_V3_TYPE_Q4_0: + case GGML_V3_TYPE_Q4_1: + case GGML_V3_TYPE_Q5_0: + case GGML_V3_TYPE_Q5_1: + case GGML_V3_TYPE_Q8_0: + return 1; + case GGML_V3_TYPE_Q2_K: + case GGML_V3_TYPE_Q3_K: + return 4; + case GGML_V3_TYPE_Q4_K: + return 8; + case GGML_V3_TYPE_Q5_K: + case GGML_V3_TYPE_Q6_K: + return 4; + case GGML_V3_TYPE_F16: + default: + return 1; + } +} + +static size_t ggml_v3_cl_local_size(ggml_v3_type type) { + switch (type) { + case GGML_V3_TYPE_Q4_0: + case GGML_V3_TYPE_Q4_1: + case GGML_V3_TYPE_Q5_0: + case GGML_V3_TYPE_Q5_1: + case GGML_V3_TYPE_Q8_0: + return 0; + case GGML_V3_TYPE_Q2_K: + case GGML_V3_TYPE_Q3_K: + return 64; + case GGML_V3_TYPE_Q4_K: + return 32; + case GGML_V3_TYPE_Q5_K: + case GGML_V3_TYPE_Q6_K: + return 64; + case GGML_V3_TYPE_F16: + default: + return 0; + } +} + +static cl_kernel* ggml_v3_get_dequantize_mul_mat_vec_cl(ggml_v3_type type) { + switch (type) { + case GGML_V3_TYPE_Q4_0: + return &dequantize_mul_mat_vec_q4_0_cl; + case GGML_V3_TYPE_Q4_1: + return &dequantize_mul_mat_vec_q4_1_cl; + case GGML_V3_TYPE_Q5_0: + return &dequantize_mul_mat_vec_q5_0_cl; + case GGML_V3_TYPE_Q5_1: + return &dequantize_mul_mat_vec_q5_1_cl; + case GGML_V3_TYPE_Q8_0: + return &dequantize_mul_mat_vec_q8_0_cl; + case GGML_V3_TYPE_F16: + return &convert_mul_mat_vec_f16_cl; + case GGML_V3_TYPE_Q2_K: + return &dequantize_mul_mat_vec_q2_K_cl; + case GGML_V3_TYPE_Q3_K: + return &dequantize_mul_mat_vec_q3_K_cl; + case GGML_V3_TYPE_Q4_K: + return &dequantize_mul_mat_vec_q4_K_cl; + case GGML_V3_TYPE_Q5_K: + return &dequantize_mul_mat_vec_q5_K_cl; + case GGML_V3_TYPE_Q6_K: + return &dequantize_mul_mat_vec_q6_K_cl; + default: + return nullptr; + } +} + +// buffer pool for cl +#define MAX_CL_BUFFERS 400 + +struct scoped_spin_lock { + std::atomic_flag& lock; + scoped_spin_lock(std::atomic_flag& lock) : lock(lock) { + while (lock.test_and_set(std::memory_order_acquire)) { + ; // spin + } + } + ~scoped_spin_lock() { + lock.clear(std::memory_order_release); + } + scoped_spin_lock(const scoped_spin_lock&) = delete; + scoped_spin_lock& operator=(const scoped_spin_lock&) = delete; +}; + +struct cl_buffer { + cl_mem mem; + size_t size = 0; +}; + +static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS]; +static 
std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT; + +static cl_mem ggml_v3_cl_pool_malloc(size_t size, size_t * actual_size) { + scoped_spin_lock lock(g_cl_pool_lock); + cl_int err; + + int best_i = -1; + size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs + int worst_i = -1; + size_t worst_size = 0; //largest unused buffer seen so far + for (int i = 0; i < MAX_CL_BUFFERS; ++i) { + cl_buffer &b = g_cl_buffer_pool[i]; + if (b.size > 0 && b.size >= size && b.size < best_size) + { + best_i = i; + best_size = b.size; + } + if (b.size > 0 && b.size > worst_size) + { + worst_i = i; + worst_size = b.size; + } + } + if(best_i!=-1) //found the smallest buffer that fits our needs + { + cl_buffer& b = g_cl_buffer_pool[best_i]; + cl_mem mem = b.mem; + *actual_size = b.size; + b.size = 0; + return mem; + } + if(worst_i!=-1) //no buffer that fits our needs, resize largest one to save memory + { + cl_buffer& b = g_cl_buffer_pool[worst_i]; + cl_mem mem = b.mem; + b.size = 0; + clReleaseMemObject(mem); + } + cl_mem mem; + CL_CHECK((mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err)); + *actual_size = size; + return mem; +} + +static void ggml_v3_cl_pool_free(cl_mem mem, size_t size) { + scoped_spin_lock lock(g_cl_pool_lock); + + for (int i = 0; i < MAX_CL_BUFFERS; ++i) { + cl_buffer& b = g_cl_buffer_pool[i]; + if (b.size == 0) { + b.mem = mem; + b.size = size; + return; + } + } + fprintf(stderr, "WARNING: cl buffer pool full, increase MAX_CL_BUFFERS\n"); + clReleaseMemObject(mem); +} + +void ggml_v3_cl_free_data(const struct ggml_v3_tensor* tensor) { + if (tensor->backend != GGML_V3_BACKEND_GPU) { + return; + } + + cl_mem mem = (cl_mem)tensor->extra; + clReleaseMemObject(mem); +} + +static cl_int ggml_v3_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_v3_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) { + cl_int err; + const uint64_t ne0 = src->ne[0]; + const uint64_t ne1 = src->ne[1]; + const uint64_t nb0 = src->nb[0]; + const uint64_t nb1 = src->nb[1]; + const uint64_t nb2 = src->nb[2]; + const uint64_t nb3 = src->nb[3]; + const enum ggml_v3_type type = src->type; + const size_t ts = ggml_v3_type_size(type); + const size_t bs = ggml_v3_blck_size(type); + const uint64_t row_size = ts*ne0/bs; + + const char * x = (const char *) src->data + i2*nb2 + i3*nb3; + if (nb0 == ts && nb1 == row_size) { + return clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*row_size, x, 0, NULL, ev); + } + if (nb0 == ts) { + const size_t buffer_origin[3] = { offset, 0, 0 }; + const size_t host_origin[3] = { 0, 0, 0 }; + const size_t region[3] = { row_size, ne1, 1 }; + return clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, row_size, 0, nb1, 0, x, 0, NULL, ev); + } + std::vector<cl_event> events; + if (ev && ne1>1) events.reserve(ne1-1); + for (uint64_t i1 = 0; i1 < ne1; i1++) { + // pretend the row is a matrix with cols=1 + const size_t buffer_origin[3] = { offset + i1*row_size, 0, 0 }; + const size_t host_origin[3] = { 0, 0, 0 }; + const size_t region[3] = { ts, ne0/bs, 1 }; + // if an event is requested, make the last write wait for all previous writes to complete + if (ev && i1) { + events.push_back(*ev); + } + cl_uint nevents = i1 == ne1-1 ? events.size() : 0U; + err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts, 0, nb0, 0, x + i1*nb1, nevents, nevents ?
events.data() : nullptr, ev); + if (err != CL_SUCCESS) { + for (auto event : events) { + clReleaseEvent(event); + } + return err; + } + } + for (auto event : events) { + CL_CHECK(clReleaseEvent(event)); + } + return CL_SUCCESS; +} + +static void ggml_v3_cl_mul_f32(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + GGML_V3_ASSERT(src1->backend == GGML_V3_BACKEND_GPU); + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + size_t x_size; + size_t d_size; + + cl_mem d_X = ggml_v3_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0 + cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted. + cl_mem d_D = ggml_v3_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst + + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + cl_event ev; + + // copy src0 to device + CL_CHECK(ggml_v3_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev)); + + const int64_t i13 = i03%ne13; + const int64_t i12 = i02%ne12; + const int i1 = i13*ne12*ne11 + i12*ne11; + + cl_int x_offset = 0; + cl_int y_offset = i1*ne10; + cl_int d_offset = 0; + + size_t global = ne00 * ne01; + cl_int ky = ne10 * ne11; + + CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky)); + CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL)); + + CL_CHECK(clReleaseEvent(ev)); + CL_CHECK(clFinish(queue)); + + // copy dst to host + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL)); + } + } + ggml_v3_cl_pool_free(d_X, x_size); + ggml_v3_cl_pool_free(d_D, d_size); +} + +void ggml_v3_cl_mul(const struct ggml_v3_tensor * src0, const struct ggml_v3_tensor * src1, struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F32 && src1->type == GGML_V3_TYPE_F32 && dst->type == GGML_V3_TYPE_F32); + ggml_v3_cl_mul_f32(src0, src1, dst); +} + +static void ggml_v3_cl_mul_mat_f32(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const int64_t r2 = ne12 / ne02; + const int64_t r3 = ne13 / ne03; + + const float alpha = 1.0f; + const float beta = 0.0f; + const int x_ne = ne01 * ne00; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + + size_t x_size; + size_t y_size; + size_t d_size; + cl_mem d_X; + if (src0->backend == GGML_V3_BACKEND_GPU) { // NOLINT + d_X = (cl_mem) src0->extra; + } else { + d_X = 
ggml_v3_cl_pool_malloc(sizeof(float) * x_ne, &x_size); + } + cl_mem d_Y = ggml_v3_cl_pool_malloc(sizeof(float) * y_ne, &y_size); + cl_mem d_D = ggml_v3_cl_pool_malloc(sizeof(float) * d_ne, &d_size); + + size_t x_offset = 0; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + // TODO: copy src0 here when r3>1 + for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + if (src0->backend == GGML_V3_BACKEND_GPU) { + x_offset = (i03 * ne02 + i02) * x_ne; + } else { + // copy src0 to device + CL_CHECK(ggml_v3_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); + } + + for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { + // copy src1 to device + CL_CHECK(ggml_v3_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); + + CL_CHECK(clFinish(queue)); + + // compute + cl_event ev_sgemm; + clblast::StatusCode status = (clblast::StatusCode)CLBlastSgemm((CLBlastLayout)clblast::Layout::kColMajor, + (CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo, + ne01, ne11, ne10, + alpha, + d_X, x_offset, ne00, + d_Y, 0, ne10, + beta, + d_D, 0, ne01, + &queue, &ev_sgemm); + + if (status != clblast::StatusCode::kSuccess) { + printf("\nF32 Matmul Failed (%d): [dims: %ld,%ld,%ld,%ld] You may be out of VRAM. Please check if you have enough.\n",static_cast(status),ne00,ne01,ne10,ne11); + GGML_V3_ASSERT(false); + } + + // copy dst to host + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL)); + } + } + } + } + + if (src0->backend != GGML_V3_BACKEND_GPU) { + ggml_v3_cl_pool_free(d_X, x_size); + } + ggml_v3_cl_pool_free(d_Y, y_size); + ggml_v3_cl_pool_free(d_D, d_size); +} + +static void ggml_v3_cl_mul_mat_f16(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, void * wdata, size_t wsize) { + GGML_V3_ASSERT(fp16_support); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + const int nb10 = src1->nb[0]; + const int nb11 = src1->nb[1]; + const int nb12 = src1->nb[2]; + const int nb13 = src1->nb[3]; + + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const int64_t r2 = ne12 / ne02; + const int64_t r3 = ne13 / ne03; + + const ggml_v3_fp16_t alpha = ggml_v3_fp32_to_fp16(1.0f); + const ggml_v3_fp16_t beta = ggml_v3_fp32_to_fp16(0.0f); + const int x_ne = ne01 * ne00; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + + GGML_V3_ASSERT(wsize >= sizeof(ggml_v3_fp16_t) * y_ne); + GGML_V3_ASSERT(wsize >= sizeof(ggml_v3_fp16_t) * d_ne); + ggml_v3_fp16_t * const tmp = (ggml_v3_fp16_t *) wdata; + + size_t x_size; + size_t y_size; + size_t d_size; + cl_mem d_X; + if (src0->backend == GGML_V3_BACKEND_GPU) { // NOLINT + d_X = (cl_mem) src0->extra; + } else { + d_X = ggml_v3_cl_pool_malloc(sizeof(ggml_v3_fp16_t) * x_ne, &x_size); + } + cl_mem d_Y = ggml_v3_cl_pool_malloc(sizeof(ggml_v3_fp16_t) * y_ne, &y_size); + cl_mem d_D = ggml_v3_cl_pool_malloc(sizeof(ggml_v3_fp16_t) * d_ne, &d_size); + + bool src1_cont_rows = nb10 == sizeof(float); + bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float); + + size_t x_offset = 0; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + // TODO: copy src0 here when r3>1 + for (int64_t i13 = i03 * r3, e13 
= i13 + r3; i13 < e13; i13++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + if (src0->backend == GGML_V3_BACKEND_GPU) { + x_offset = (i03 * ne02 + i02) * x_ne; + } else { + // copy src0 to device + CL_CHECK(ggml_v3_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); + } + + for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { + // convert src1 to fp16 + // TODO: use multiple threads + char * src1i = (char *) src1->data + i13*nb13 + i12*nb12; + if (src1_cont_rows) { + if (src1_cont_cols) { + ggml_v3_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11); + } + else { + for (int64_t i11 = 0; i11 < ne11; i11++) { + ggml_v3_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10); + } + } + } + else { + for (int64_t i11 = 0; i11 < ne11; i11++) { + for (int64_t i10 = 0; i10 < ne10; i10++) { + // very slow due to no inlining + tmp[i11*ne10 + i10] = ggml_v3_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10)); + } + } + } + + // copy src1 to device + CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_v3_fp16_t) * y_ne, tmp, 0, NULL, NULL)); + + CL_CHECK(clFinish(queue)); + + // compute + cl_event ev_sgemm; + clblast::StatusCode status = (clblast::StatusCode)CLBlastHgemm((CLBlastLayout)clblast::Layout::kColMajor, + (CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo, + ne01, ne11, ne10, + alpha, + d_X, x_offset, ne00, + d_Y, 0, ne10, + beta, + d_D, 0, ne01, + &queue, &ev_sgemm); + + if (status != clblast::StatusCode::kSuccess) { + printf("\nF16 Matmul Failed (%d): [dims: %ld,%ld,%ld,%ld] You may be out of VRAM. Please check if you have enough.\n",static_cast(status),ne00,ne01,ne10,ne11); + GGML_V3_ASSERT(false); + } + + // copy dst to host, then convert to float + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_v3_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL)); + + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + + ggml_v3_fp16_to_fp32_row(tmp, d, d_ne); + } + } + } + } + + if (src0->backend != GGML_V3_BACKEND_GPU) { + ggml_v3_cl_pool_free(d_X, x_size); + } + ggml_v3_cl_pool_free(d_Y, y_size); + ggml_v3_cl_pool_free(d_D, d_size); +} + +static void ggml_v3_cl_mul_mat_q_f32(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + const ggml_v3_type type = src0->type; + const bool mul_mat_vec = ne11 == 1 && ne00%2 == 0; + + const int64_t r2 = ne12 / ne02; + const int64_t r3 = ne13 / ne03; + + const float alpha = 1.0f; + const float beta = 0.0f; + const int x_ne = ne01 * ne00; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + const int x_bps = x_ne / ggml_v3_blck_size(type); // blocks per 2D slice + const size_t q_sz = ggml_v3_type_size(type) * x_bps; + + size_t x_size; + size_t y_size; + size_t d_size; + size_t q_size; + cl_mem d_X; + if (!mul_mat_vec) { + d_X = ggml_v3_cl_pool_malloc(sizeof(float) * x_ne, &x_size); + } + cl_mem d_Y = ggml_v3_cl_pool_malloc(sizeof(float) * y_ne, &y_size); + cl_mem d_D = ggml_v3_cl_pool_malloc(sizeof(float) * d_ne, &d_size); + cl_mem d_Q; + if (src0->backend == GGML_V3_BACKEND_CPU) { + d_Q = ggml_v3_cl_pool_malloc(q_sz, &q_size); + } + + cl_kernel* to_fp32_cl = 
ggml_v3_get_to_fp32_cl(type); + cl_kernel* dmmv = ggml_v3_get_dequantize_mul_mat_vec_cl(type); + GGML_V3_ASSERT(to_fp32_cl != nullptr); + + const size_t global_denom = ggml_v3_cl_global_denom(type); + const size_t local = mul_mat_vec ? CL_DMMV_LOCAL_SIZE : ggml_v3_cl_local_size(type); + + size_t ev_idx = 0; + std::vector events; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + // TODO: copy and dequantize src0 here when r3>1 + for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + // copy src0 to device if necessary + if (src0->backend == GGML_V3_BACKEND_CPU) { + events.emplace_back(); + CL_CHECK(ggml_v3_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++)); + } else if (src0->backend == GGML_V3_BACKEND_GPU) { + d_Q = (cl_mem) src0->extra; + } else { + GGML_V3_ASSERT(false); + } + + if (!mul_mat_vec) { + // convert src0 to fp32 on device + const size_t global = x_ne / global_denom; + const size_t offset = src0->backend == GGML_V3_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0; + CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q)); + CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X)); + CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL)); + } + + for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { + if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel + // copy src1 to device + events.emplace_back(); + CL_CHECK(ggml_v3_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++)); + + // compute + const size_t global = ne01 * local; + const size_t offset = src0->backend == GGML_V3_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0; + const cl_int ncols = ne00; + events.emplace_back(); + CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q)); + CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL)); + CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y)); + CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D)); + CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols)); + CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++)); + } else { // CLBlast matrix matrix multiplication + // copy src1 to device + CL_CHECK(ggml_v3_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); + + // wait for conversion + CL_CHECK(clFinish(queue)); + + // compute + events.emplace_back(); + clblast::StatusCode status = (clblast::StatusCode)CLBlastSgemm((CLBlastLayout)clblast::Layout::kColMajor, + (CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo, + ne01, ne11, ne10, + alpha, + d_X, 0, ne00, + d_Y, 0, ne10, + beta, + d_D, 0, ne01, + &queue, events.data() + ev_idx++); + + if (status != clblast::StatusCode::kSuccess) { + printf("\nQF32 Matmul Failed (%d): [dims: %ld,%ld,%ld,%ld] You may be out of VRAM. 
Please check if you have enough.\n",static_cast(status),ne00,ne01,ne10,ne11); + GGML_V3_ASSERT(false); + } + } + + // copy dst to host + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL)); + for (auto *event : events) { + clReleaseEvent(event); + } + + ev_idx = 0; + events.clear(); + } + } + } + } + + if (!mul_mat_vec) { + ggml_v3_cl_pool_free(d_X, x_size); + } + ggml_v3_cl_pool_free(d_Y, y_size); + ggml_v3_cl_pool_free(d_D, d_size); + if (src0->backend == GGML_V3_BACKEND_CPU) { + ggml_v3_cl_pool_free(d_Q, q_size); + } +} + + +bool ggml_v3_cl_can_mul_mat(const struct ggml_v3_tensor * src0, const struct ggml_v3_tensor * src1, struct ggml_v3_tensor * dst) { + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + + // TODO: find the optimal values for these + if ((src0->type == GGML_V3_TYPE_F32 || src0->type == GGML_V3_TYPE_F16 || ggml_v3_is_quantized(src0->type)) && + src1->type == GGML_V3_TYPE_F32 && + dst->type == GGML_V3_TYPE_F32 && + ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_V3_BACKEND_GPU)) { + return true; + } + + return false; +} + +static bool ggml_v3_cl_mul_mat_use_f16(const struct ggml_v3_tensor * src0, const struct ggml_v3_tensor * src1, struct ggml_v3_tensor * /* dst */) { + // If device doesn't support FP16 + if (!fp16_support) { + return false; + } + + size_t src0_sz = ggml_v3_nbytes(src0); + size_t src1_sz = ggml_v3_nbytes(src1); + + // mul_mat_q: src0 is converted to fp32 on device + size_t mul_mat_q_transfer = src0_sz + src1_sz; + + // mul_mat_f16: src1 is converted to fp16 on cpu + size_t mul_mat_f16_transfer = src0_sz + sizeof(ggml_v3_fp16_t) * ggml_v3_nelements(src1); + + // choose the smaller one to transfer to the device + // TODO: this is not always the best choice due to the overhead of converting to fp16 + return mul_mat_f16_transfer < mul_mat_q_transfer; +} + +void ggml_v3_cl_mul_mat(const struct ggml_v3_tensor * src0, const struct ggml_v3_tensor * src1, struct ggml_v3_tensor * dst, void * wdata, size_t wsize) { + GGML_V3_ASSERT(ggml_v3_cl_can_mul_mat(src0, src1, dst)); + + if (src0->type == GGML_V3_TYPE_F32) { + ggml_v3_cl_mul_mat_f32(src0, src1, dst); + } + else if (src0->type == GGML_V3_TYPE_F16) { + if (ggml_v3_cl_mul_mat_use_f16(src0, src1, dst)) { + ggml_v3_cl_mul_mat_f16(src0, src1, dst, wdata, wsize); + } + else { + ggml_v3_cl_mul_mat_q_f32(src0, src1, dst); + } + } + else if (ggml_v3_is_quantized(src0->type)) { + ggml_v3_cl_mul_mat_q_f32(src0, src1, dst); + } + else { + GGML_V3_ASSERT(false); + } +} + +size_t ggml_v3_cl_mul_mat_get_wsize(const struct ggml_v3_tensor * src0, const struct ggml_v3_tensor * src1, struct ggml_v3_tensor * dst) { + if (src0->type == GGML_V3_TYPE_F16 && ggml_v3_cl_mul_mat_use_f16(src0, src1, dst)) { + return sizeof(ggml_v3_fp16_t) * std::max(src1->ne[0] * src1->ne[1], dst->ne[0] * dst->ne[1]); + } + return 0; +} + +void ggml_v3_cl_transform_tensor(void * data, ggml_v3_tensor * tensor) { + const int64_t ne0 = tensor->ne[0]; + const int64_t ne1 = tensor->ne[1]; + const int64_t ne2 = tensor->ne[2]; + const int64_t ne3 = tensor->ne[3]; + + const ggml_v3_type type = tensor->type; + const size_t s_sz = ggml_v3_type_size(type) * (size_t) (ne0 * ne1 / ggml_v3_blck_size(type)); + const size_t q_sz = s_sz * (size_t) (ne2 * ne3); + + size_t q_size; + cl_mem dst = ggml_v3_cl_pool_malloc(q_sz, &q_size); + + tensor->data = data; + // copy tensor 
to device + size_t offset = 0; + for (int64_t i3 = 0; i3 < ne3; i3++) { + for (int64_t i2 = 0; i2 < ne2; i2++) { + CL_CHECK(ggml_v3_cl_h2d_tensor_2d(queue, dst, offset, tensor, i3, i2, NULL)); + offset += s_sz; + } + } + + CL_CHECK(clFinish(queue)); + + tensor->extra = dst; + GGML_V3_ASSERT(tensor->backend == GGML_V3_BACKEND_GPU); +} \ No newline at end of file diff --git a/otherarch/ggml_v3-opencl.h b/otherarch/ggml_v3-opencl.h new file mode 100644 index 000000000..3aa499ba9 --- /dev/null +++ b/otherarch/ggml_v3-opencl.h @@ -0,0 +1,25 @@ +#pragma once + +#include "ggml_v3.h" + +#ifdef __cplusplus +extern "C" { +#endif + +GGML_V3_API void ggml_v3_cl_init(void); + +GGML_V3_API void ggml_v3_cl_mul(const struct ggml_v3_tensor * src0, const struct ggml_v3_tensor * src1, struct ggml_v3_tensor * dst); +GGML_V3_API bool ggml_v3_cl_can_mul_mat(const struct ggml_v3_tensor * src0, const struct ggml_v3_tensor * src1, struct ggml_v3_tensor * dst); +GGML_V3_API size_t ggml_v3_cl_mul_mat_get_wsize(const struct ggml_v3_tensor * src0, const struct ggml_v3_tensor * src1, struct ggml_v3_tensor * dst); +GGML_V3_API void ggml_v3_cl_mul_mat(const struct ggml_v3_tensor * src0, const struct ggml_v3_tensor * src1, struct ggml_v3_tensor * dst, void * wdata, size_t wsize); + +GGML_V3_API void * ggml_v3_cl_host_malloc(size_t size); +GGML_V3_API void ggml_v3_cl_host_free(void * ptr); + +GGML_V3_API void ggml_v3_cl_free_data(const struct ggml_v3_tensor* tensor); + +GGML_V3_API void ggml_v3_cl_transform_tensor(void * data, struct ggml_v3_tensor * tensor); + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/otherarch/ggml_v3.c b/otherarch/ggml_v3.c new file mode 100644 index 000000000..fb600a8c8 --- /dev/null +++ b/otherarch/ggml_v3.c @@ -0,0 +1,28222 @@ +#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows +#define _USE_MATH_DEFINES // For M_PI on MSVC + + +/// Start ggml-impl.h +#include "ggml_v3.h" +// GGML internal header + +#include +#include // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/ +#include +#include +#include // memcpy +#include // fabsf + +#ifdef __cplusplus +extern "C" { +#endif + +// static_assert should be a #define, but if it's not, +// fall back to the _Static_assert C11 keyword. 
+// if C99 - static_assert is noop +// ref: https://stackoverflow.com/a/53923785/4039976 +#ifndef static_assert +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) +#define static_assert(cond, msg) _Static_assert(cond, msg) +#else +#define static_assert(cond, msg) struct global_scope_noop_trick +#endif +#endif + +// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 +#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)) +#ifndef __FMA__ +#define __FMA__ +#endif +#ifndef __F16C__ +#define __F16C__ +#endif +#ifndef __SSE3__ +#define __SSE3__ +#endif +#endif + +// 16-bit float +// on Arm, we use __fp16 +// on x86, we use uint16_t +#if defined(__ARM_NEON) && !defined(_MSC_VER) + +// if YCM cannot find , make a symbolic link to it, for example: +// +// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ +// +#include + +#define GGML_V3_COMPUTE_FP16_TO_FP32(x) ((float) (x)) +#define GGML_V3_COMPUTE_FP32_TO_FP16(x) (x) + +#define GGML_V3_FP16_TO_FP32(x) ((float) (x)) +#define GGML_V3_FP32_TO_FP16(x) (x) + +#else + +#ifdef __wasm_simd128__ +#include +#else +#ifdef __POWER9_VECTOR__ +#include +#undef bool +#define bool _Bool +#else +#if defined(_MSC_VER) || defined(__MINGW32__) +#include +#else +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) +#if !defined(__riscv) +#include +#endif +#endif +#endif +#endif +#endif + +#ifdef __riscv_v_intrinsic +#include +#endif + +#ifdef __F16C__ + +#ifdef _MSC_VER +#define GGML_V3_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) +#define GGML_V3_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) +#else +#define GGML_V3_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) +#define GGML_V3_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) +#endif + +#elif defined(__POWER9_VECTOR__) + +#define GGML_V3_COMPUTE_FP16_TO_FP32(x) ggml_v3_compute_fp16_to_fp32(x) +#define GGML_V3_COMPUTE_FP32_TO_FP16(x) ggml_v3_compute_fp32_to_fp16(x) +/* the inline asm below is about 12% faster than the lookup method */ +#define GGML_V3_FP16_TO_FP32(x) GGML_V3_COMPUTE_FP16_TO_FP32(x) +#define GGML_V3_FP32_TO_FP16(x) GGML_V3_COMPUTE_FP32_TO_FP16(x) + +static inline float ggml_v3_compute_fp16_to_fp32(ggml_v3_fp16_t h) { + register float f; + register double d; + __asm__( + "mtfprd %0,%2\n" + "xscvhpdp %0,%0\n" + "frsp %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=f"(f): + /* in */ "r"(h)); + return f; +} + +static inline ggml_v3_fp16_t ggml_v3_compute_fp32_to_fp16(float f) { + register double d; + register ggml_v3_fp16_t r; + __asm__( /* xscvdphp can work on double or single precision */ + "xscvdphp %0,%2\n" + "mffprd %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=r"(r): + /* in */ "f"(f)); + return r; +} + +#else + +// FP16 <-> FP32 +// ref: https://github.com/Maratyszcza/FP16 + +static inline float fp32_from_bits(uint32_t w) { + union { + uint32_t as_bits; + float as_value; + } fp32; + fp32.as_bits = w; + return fp32.as_value; +} + +static inline uint32_t fp32_to_bits(float f) { + union { + float as_value; + uint32_t as_bits; + } fp32; + fp32.as_value = f; + return fp32.as_bits; +} + +static inline float ggml_v3_compute_fp16_to_fp32(ggml_v3_fp16_t h) { + const uint32_t w = (uint32_t) h << 16; + const uint32_t sign = w & UINT32_C(0x80000000); + const uint32_t two_w = w + w; + + const uint32_t exp_offset = UINT32_C(0xE0) << 23; +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || 
defined(__GNUC__) && !defined(__STRICT_ANSI__) + const float exp_scale = 0x1.0p-112f; +#else + const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); +#endif + const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; + + const uint32_t magic_mask = UINT32_C(126) << 23; + const float magic_bias = 0.5f; + const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; + + const uint32_t denormalized_cutoff = UINT32_C(1) << 27; + const uint32_t result = sign | + (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); + return fp32_from_bits(result); +} + +static inline ggml_v3_fp16_t ggml_v3_compute_fp32_to_fp16(float f) { +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const float scale_to_inf = 0x1.0p+112f; + const float scale_to_zero = 0x1.0p-110f; +#else + const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); + const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); +#endif + float base = (fabsf(f) * scale_to_inf) * scale_to_zero; + + const uint32_t w = fp32_to_bits(f); + const uint32_t shl1_w = w + w; + const uint32_t sign = w & UINT32_C(0x80000000); + uint32_t bias = shl1_w & UINT32_C(0xFF000000); + if (bias < UINT32_C(0x71000000)) { + bias = UINT32_C(0x71000000); + } + + base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; + const uint32_t bits = fp32_to_bits(base); + const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); + const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); + const uint32_t nonsign = exp_bits + mantissa_bits; + return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); +} + +#define GGML_V3_COMPUTE_FP16_TO_FP32(x) ggml_v3_compute_fp16_to_fp32(x) +#define GGML_V3_COMPUTE_FP32_TO_FP16(x) ggml_v3_compute_fp32_to_fp16(x) + +#endif // __F16C__ + +#endif // __ARM_NEON + +// precomputed f32 table for f16 (256 KB) +// defined in ggml.c, initialized in ggml_v3_init() +extern float ggml_v3_table_f32_f16[1 << 16]; + +// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_v3_lookup_fp16_to_fp32, +// so we define GGML_V3_FP16_TO_FP32 and GGML_V3_FP32_TO_FP16 elsewhere for NEON. +// This is also true for POWER9. 
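// How the table-based path below works: ggml_v3_table_f32_f16 holds one precomputed float
// for every possible 16-bit half pattern (65536 entries * 4 bytes = 256 KB), so
// GGML_V3_FP16_TO_FP32 becomes a single array lookup keyed on the raw bits. A minimal
// sketch of the idea, assuming the table is filled once at startup (per the comment above,
// the actual fill lives in ggml_v3_init(), which is not part of this hunk):
//
//     for (uint32_t i = 0; i < (1 << 16); ++i) {
//         uint16_t u = (uint16_t) i;
//         ggml_v3_fp16_t h;
//         memcpy(&h, &u, sizeof(h));                                    // reinterpret the bit pattern
//         ggml_v3_table_f32_f16[i] = GGML_V3_COMPUTE_FP16_TO_FP32(h);   // slow conversion, done once
//     }
//
// After that, converting a half is just ggml_v3_table_f32_f16[bits]; the #if block below only
// installs this lookup when the target has not already defined a cheaper direct conversion
// (NEON, POWER9).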
+#if !defined(GGML_V3_FP16_TO_FP32) || !defined(GGML_V3_FP32_TO_FP16) + +inline static float ggml_v3_lookup_fp16_to_fp32(ggml_v3_fp16_t f) { + uint16_t s; + memcpy(&s, &f, sizeof(uint16_t)); + return ggml_v3_table_f32_f16[s]; +} + +#define GGML_V3_FP16_TO_FP32(x) ggml_v3_lookup_fp16_to_fp32(x) +#define GGML_V3_FP32_TO_FP16(x) GGML_V3_COMPUTE_FP32_TO_FP16(x) + +#endif + +#define GGML_V3_HASHTABLE_FULL ((size_t)-1) +#define GGML_V3_HASHTABLE_ALREADY_EXISTS ((size_t)-2) + +bool ggml_v3_hash_contains (const struct ggml_v3_hash_set hash_set, struct ggml_v3_tensor * key); + +// returns GGML_V3_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted +size_t ggml_v3_hash_find (const struct ggml_v3_hash_set hash_set, struct ggml_v3_tensor * key); + +// returns GGML_V3_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full +size_t ggml_v3_hash_insert ( struct ggml_v3_hash_set hash_set, struct ggml_v3_tensor * key); + +// return index, asserts if table is full +size_t ggml_v3_hash_find_or_insert( struct ggml_v3_hash_set hash_set, struct ggml_v3_tensor * key); + +#ifdef __cplusplus +} +#endif +/// end ggml-imph.h + +#include +#include + + +#define QK4_0 32 +typedef struct { + ggml_v3_fp16_t d; // delta + uint8_t qs[QK4_0 / 2]; // nibbles / quants +} block_q4_0; +static_assert(sizeof(block_q4_0) == sizeof(ggml_v3_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding"); + +#define QK4_1 32 +typedef struct { + ggml_v3_fp16_t d; // delta + ggml_v3_fp16_t m; // min + uint8_t qs[QK4_1 / 2]; // nibbles / quants +} block_q4_1; +static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_v3_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding"); + +#define QK5_0 32 +typedef struct { + ggml_v3_fp16_t d; // delta + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_0 / 2]; // nibbles / quants +} block_q5_0; +static_assert(sizeof(block_q5_0) == sizeof(ggml_v3_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding"); + +#define QK5_1 32 +typedef struct { + ggml_v3_fp16_t d; // delta + ggml_v3_fp16_t m; // min + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_1 / 2]; // nibbles / quants +} block_q5_1; +static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_v3_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding"); + +#define QK8_0 32 +typedef struct { + ggml_v3_fp16_t d; // delta + int8_t qs[QK8_0]; // quants +} block_q8_0; +static_assert(sizeof(block_q8_0) == sizeof(ggml_v3_fp16_t) + QK8_0, "wrong q8_0 block size/padding"); + +#define QK8_1 32 +typedef struct { + float d; // delta + float s; // d * sum(qs[i]) + int8_t qs[QK8_1]; // quants +} block_q8_1; +static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding"); + +// +// Super-block quantization structures +// + +// Super-block size +#ifdef GGML_V3_QKK_64 +#define QK_K 64 +#define K_SCALE_SIZE 4 +#else +#define QK_K 256 +#define K_SCALE_SIZE 12 +#endif + +// 2-bit quantization +// weight is represented as x = a * q + b +// 16 blocks of 16 elements each +// Effectively 2.625 bits per weight +typedef struct { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + ggml_v3_fp16_t d; // super-block scale for quantized scales + ggml_v3_fp16_t dmin; // super-block scale for quantized mins +} block_q2_K; +static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_v3_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); + +// 3-bit quantization +// weight is represented as x = a * 
q +// 16 blocks of 16 elements each +// Effectively 3.4375 bits per weight +#ifdef GGML_V3_QKK_64 +typedef struct { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits + uint8_t scales[2]; + ggml_v3_fp16_t d; // super-block scale +} block_q3_K; +static_assert(sizeof(block_q3_K) == sizeof(ggml_v3_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding"); +#else +typedef struct { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits + uint8_t scales[12]; // scales, quantized with 6 bits + ggml_v3_fp16_t d; // super-block scale +} block_q3_K; +static_assert(sizeof(block_q3_K) == sizeof(ggml_v3_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding"); +#endif + +// 4-bit quantization +// 8 blocks of 32 elements each +// weight is represented as x = a * q + b +// Effectively 4.5 bits per weight +#ifdef GGML_V3_QKK_64 +typedef struct { + ggml_v3_fp16_t d[2]; // super-block scales/mins + uint8_t scales[2]; // 4-bit block scales/mins + uint8_t qs[QK_K/2]; // 4--bit quants +} block_q4_K; +static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_v3_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding"); +#else +typedef struct { + ggml_v3_fp16_t d; // super-block scale for quantized scales + ggml_v3_fp16_t dmin; // super-block scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +} block_q4_K; +static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_v3_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding"); +#endif + +// 5-bit quantization +// 8 blocks of 32 elements each +// weight is represented as x = a * q + b +// Effectively 5.5 bits per weight +#ifdef GGML_V3_QKK_64 +typedef struct { + ggml_v3_fp16_t d; // super-block scale + int8_t scales[QK_K/16]; // 8-bit block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +} block_q5_K; +static_assert(sizeof(block_q5_K) == sizeof(ggml_v3_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding"); +#else +typedef struct { + ggml_v3_fp16_t d; // super-block scale for quantized scales + ggml_v3_fp16_t dmin; // super-block scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +} block_q5_K; +static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_v3_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); +#endif + +// 6-bit quantization +// weight is represented as x = a * q +// 16 blocks of 16 elements each +// Effectively 6.5625 bits per weight +typedef struct { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + ggml_v3_fp16_t d; // super-block scale +} block_q6_K; +static_assert(sizeof(block_q6_K) == sizeof(ggml_v3_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding"); + +// This is only used for intermediate quantization and dot products +typedef struct { + float d; // delta + int8_t qs[QK_K]; // quants + int16_t bsums[QK_K/16]; // sum of quants in groups of 16 +} block_q8_K; +static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding"); + +// (Almost) "true" 2-bit quantization. +// Due to the need to use blocks as per ggml dsign, it ends up using +// 2.0625 bpw because of the 16-bit scale for each block of 256. 
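// The bits-per-weight figures quoted in these comments follow directly from the struct
// layouts; worked examples for the default (non-GGML_V3_QKK_64) build where QK_K = 256:
//
//   block_q2_K:   2*sizeof(ggml_v3_fp16_t) + QK_K/16 + QK_K/4 = 4 + 16 + 64 = 84 bytes
//                 -> 84*8 / 256 = 2.625 bpw, matching the q2_K comment above.
//   block_iq2_xxs (declared just below): sizeof(ggml_v3_fp16_t) + (QK_K/8)*sizeof(uint16_t)
//                 = 2 + 64 = 66 bytes -> 66*8 / 256 = 2.0625 bpw, as stated here.
//
// The static_asserts that follow each struct pin these sizes down, so the arithmetic is
// checked at compile time.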
+typedef struct { + ggml_v3_fp16_t d; + uint16_t qs[QK_K/8]; +} block_iq2_xxs; +static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_v3_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding"); + +// 2.3125 bpw quants +typedef struct { + ggml_v3_fp16_t d; + uint16_t qs[QK_K/8]; + uint8_t scales[QK_K/32]; +} block_iq2_xs; +static_assert(sizeof(block_iq2_xs) == sizeof(ggml_v3_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding"); + +// Quantization +static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k); +static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k); +static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k); +static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k); +static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k); +static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k); + +static void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k); +static void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k); +static void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k); +static void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k); +static void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k); +static void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k); +static void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k); +static void quantize_row_iq2_xs_reference (const float * restrict x, block_iq2_xs * restrict y, int k); + +static void quantize_row_q4_0(const float * restrict x, void * restrict y, int k); +static void quantize_row_q4_1(const float * restrict x, void * restrict y, int k); +static void quantize_row_q5_0(const float * restrict x, void * restrict y, int k); +static void quantize_row_q5_1(const float * restrict x, void * restrict y, int k); +static void quantize_row_q8_0(const float * restrict x, void * restrict y, int k); +static void quantize_row_q8_1(const float * restrict x, void * restrict y, int k); + +static void quantize_row_q2_K(const float * restrict x, void * restrict y, int k); +static void quantize_row_q3_K(const float * restrict x, void * restrict y, int k); +static void quantize_row_q4_K(const float * restrict x, void * restrict y, int k); +static void quantize_row_q5_K(const float * restrict x, void * restrict y, int k); +static void quantize_row_q6_K(const float * restrict x, void * restrict y, int k); +static void quantize_row_q8_K(const float * restrict x, void * restrict y, int k); +static void quantize_row_iq2_xxs(const float * restrict x, void * restrict y, int k); +static void quantize_row_iq2_xs (const float * restrict x, void * restrict y, int k); + +// Dequantization +static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k); +static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k); +static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k); +static void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k); +static void dequantize_row_q8_0(const block_q8_0 * restrict x, float * 
restrict y, int k); +//static void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k); + +static void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k); +static void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k); +static void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k); +static void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k); +static void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k); +static void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k); +static void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k); +static void dequantize_row_iq2_xs (const block_iq2_xs * restrict x, float * restrict y, int k); + +// Dot product +static void ggml_v3_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_v3_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_v3_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_v3_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_v3_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); + +static void ggml_v3_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_v3_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_v3_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_v3_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_v3_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_v3_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_v3_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy); + + +#if defined(_MSC_VER) || defined(__MINGW32__) +#include // using malloc.h with MSC/MINGW +#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef GGML_USE_METAL +#include +#endif + +#if defined(_MSC_VER) +// disable "possible loss of data" to avoid hundreds of casts +// we should just be careful :) +#pragma warning(disable: 4244 4267) + +// disable POSIX deprecation warnings +// these functions are never going away, anyway +#pragma warning(disable: 4996) +#endif + +#if defined(_WIN32) + +#include + +typedef volatile LONG atomic_int; +typedef atomic_int atomic_bool; + +static void atomic_store(atomic_int * ptr, LONG val) { + InterlockedExchange(ptr, val); +} +static LONG atomic_load(atomic_int * ptr) { + return InterlockedCompareExchange(ptr, 0, 0); +} +static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) { + return InterlockedExchangeAdd(ptr, inc); +} +static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) { + return atomic_fetch_add(ptr, -(dec)); +} + +typedef HANDLE pthread_t; + +typedef DWORD thread_ret_t; +static int 
pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) { + (void) unused; + HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL); + if (handle == NULL) + { + return EAGAIN; + } + + *out = handle; + return 0; +} + +static int pthread_join(pthread_t thread, void * unused) { + (void) unused; + int ret = (int) WaitForSingleObject(thread, INFINITE); + CloseHandle(thread); + return ret; +} + +static int sched_yield (void) { + Sleep (0); + return 0; +} +#else +#include +#include + +typedef void * thread_ret_t; + +#include +#include +#include + +#endif + +#ifdef GGML_USE_CPU_HBM +#include +#endif + +#if defined(__APPLE__) +#include +#endif + +#if (defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)) && \ + (!defined(TARGET_OS_TV) && !defined(TARGET_OS_WATCH)) + +#include + +void ggml_v3_print_backtrace(void) { + /* + #include + #include + + void * trace[100]; + + int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0])); + + backtrace_symbols_fd(trace, nptrs, STDERR_FILENO); + */ + + // backtrack_symbols does not show line numbers, use gdb instead + char attach[32]; + snprintf(attach, sizeof(attach), "attach %d", getpid()); + int pid = fork(); + if (pid == 0) { + execlp("gdb", "gdb", "--batch", + "-ex", "set style enabled on", + "-ex", attach, + "-ex", "bt -frame-info source-and-location", + "-ex", "detach", + "-ex", "quit", + (char *) NULL); + } else { + waitpid(pid, NULL, 0); + } +} +#else +void ggml_v3_print_backtrace(void) { + // platform not supported +} +#endif + +/*#define GGML_V3_PERF*/ +#define GGML_V3_DEBUG 0 +#define GGML_V3_GELU_FP16 +#define GGML_V3_GELU_QUICK_FP16 +#define GGML_V3_SILU_FP16 +// #define GGML_V3_CROSS_ENTROPY_EXP_FP16 +// #define GGML_V3_FLASH_ATTN_EXP_FP16 + +#define GGML_V3_SOFT_MAX_UNROLL 4 +#define GGML_V3_VEC_DOT_UNROLL 2 +#define GGML_V3_VEC_MAD_UNROLL 32 + +// +// logging +// + +#if (GGML_V3_DEBUG >= 1) +#define GGML_V3_PRINT_DEBUG(...) printf(__VA_ARGS__) +#else +#define GGML_V3_PRINT_DEBUG(...) +#endif + +#if (GGML_V3_DEBUG >= 5) +#define GGML_V3_PRINT_DEBUG_5(...) printf(__VA_ARGS__) +#else +#define GGML_V3_PRINT_DEBUG_5(...) +#endif + +#if (GGML_V3_DEBUG >= 10) +#define GGML_V3_PRINT_DEBUG_10(...) printf(__VA_ARGS__) +#else +#define GGML_V3_PRINT_DEBUG_10(...) +#endif + +#define GGML_V3_PRINT(...) 
printf(__VA_ARGS__) + +// +// end of logging block +// + +#ifdef GGML_USE_ACCELERATE +// uncomment to use vDSP for soft max computation +// note: not sure if it is actually faster +//#define GGML_V3_SOFT_MAX_ACCELERATE +#endif + +#if defined(_MSC_VER) || defined(__MINGW32__) +#define GGML_V3_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_V3_MEM_ALIGN) +#define GGML_V3_ALIGNED_FREE(ptr) _aligned_free(ptr) +#else +inline static void * ggml_v3_aligned_malloc(size_t size) { + if (size == 0) { + GGML_V3_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_v3_aligned_malloc!\n"); + return NULL; + } + void * aligned_memory = NULL; +#ifdef GGML_USE_CPU_HBM + int result = hbw_posix_memalign(&aligned_memory, 16, size); +#elif GGML_USE_METAL + int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size); +#else + int result = posix_memalign(&aligned_memory, GGML_V3_MEM_ALIGN, size); +#endif + if (result != 0) { + // Handle allocation failure + const char *error_desc = "unknown allocation error"; + switch (result) { + case EINVAL: + error_desc = "invalid alignment value"; + break; + case ENOMEM: + error_desc = "insufficient memory"; + break; + } + GGML_V3_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0)); + return NULL; + } + return aligned_memory; +} +#define GGML_V3_ALIGNED_MALLOC(size) ggml_v3_aligned_malloc(size) +#ifdef GGML_USE_CPU_HBM +#define GGML_V3_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr) +#else +#define GGML_V3_ALIGNED_FREE(ptr) free(ptr) +#endif +#endif + +#define UNUSED GGML_V3_UNUSED +#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0) + +#if defined(GGML_USE_ACCELERATE) +#include +#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions +#include "ggml_v3-opencl.h" +#endif +#elif defined(GGML_USE_OPENBLAS) +#if defined(GGML_V3_BLAS_USE_MKL) +#include +#else +#include +#endif +#elif defined(GGML_USE_CUBLAS) +#include "ggml_v3-cuda.h" +#elif defined(GGML_USE_CLBLAST) +#include "ggml_v3-opencl.h" +#endif + +// floating point type used to accumulate sums +typedef double ggml_v3_float; + +#undef MIN +#undef MAX + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) + +// +// global data +// + +// precomputed gelu table for f16 (128 KB) +static ggml_v3_fp16_t ggml_v3_table_gelu_f16[1 << 16]; + +// precomputed quick gelu table for f16 (128 KB) +static ggml_v3_fp16_t ggml_v3_table_gelu_quick_f16[1 << 16]; + +// precomputed silu table for f16 (128 KB) +static ggml_v3_fp16_t ggml_v3_table_silu_f16[1 << 16]; + +// precomputed exp table for f16 (128 KB) +static ggml_v3_fp16_t ggml_v3_table_exp_f16[1 << 16]; + +// precomputed f32 table for f16 (256 KB) (ggml-impl.h) +float ggml_v3_table_f32_f16[1 << 16]; + +// note: do not use these inside ggml.c +// these are meant to be used via the ggml.h API +float ggml_v3_fp16_to_fp32(ggml_v3_fp16_t x) { + return (float) GGML_V3_FP16_TO_FP32(x); +} + +ggml_v3_fp16_t ggml_v3_fp32_to_fp16(float x) { + return GGML_V3_FP32_TO_FP16(x); +} + +void ggml_v3_fp16_to_fp32_row(const ggml_v3_fp16_t * x, float * y, int n) { + for (int i = 0; i < n; i++) { + y[i] = GGML_V3_FP16_TO_FP32(x[i]); + } +} + +void ggml_v3_fp32_to_fp16_row(const float * x, ggml_v3_fp16_t * y, int n) { + int i = 0; +#if defined(__F16C__) + for (; i + 7 < n; i += 8) { + __m256 x_vec = _mm256_loadu_ps(x + i); + __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); + _mm_storeu_si128((__m128i *)(y + i), y_vec); + } + for(; i + 3 < n; i += 4) { + __m128 x_vec = _mm_loadu_ps(x + i); + __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); + _mm_storel_epi64((__m128i *)(y + i), y_vec); + } +#endif + for (; i < n; i++) { + y[i] = GGML_V3_FP32_TO_FP16(x[i]); + } +} + +// +// timing +// + +#if defined(_MSC_VER) || defined(__MINGW32__) +static int64_t timer_freq, timer_start; +void ggml_v3_time_init(void) { + LARGE_INTEGER t; + QueryPerformanceFrequency(&t); + timer_freq = t.QuadPart; + + // The multiplication by 1000 or 1000000 below can cause an overflow if timer_freq + // and the uptime is high enough. + // We subtract the program start time to reduce the likelihood of that happening. 
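    // Illustrative arithmetic (assuming a 10 MHz QueryPerformanceCounter frequency, a common
    // value on recent Windows systems): ggml_v3_time_us() computes (ticks * 1000000) / timer_freq
    // in signed 64-bit math, so the product overflows once ticks exceeds INT64_MAX / 1000000,
    // roughly 9.2e12 ticks, i.e. about 9.2e5 seconds (~10.7 days) of raw counter value.
    // Subtracting timer_start keeps ticks small relative to that limit for typical process runs.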
+ QueryPerformanceCounter(&t); + timer_start = t.QuadPart; +} +int64_t ggml_v3_time_ms(void) { + LARGE_INTEGER t; + QueryPerformanceCounter(&t); + return ((t.QuadPart-timer_start) * 1000) / timer_freq; +} +int64_t ggml_v3_time_us(void) { + LARGE_INTEGER t; + QueryPerformanceCounter(&t); + return ((t.QuadPart-timer_start) * 1000000) / timer_freq; +} +#else +void ggml_v3_time_init(void) {} +int64_t ggml_v3_time_ms(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000; +} + +int64_t ggml_v3_time_us(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000; +} +#endif + +int64_t ggml_v3_cycles(void) { + return clock(); +} + +int64_t ggml_v3_cycles_per_ms(void) { + return CLOCKS_PER_SEC/1000; +} + +#ifdef GGML_V3_PERF +#define ggml_v3_perf_time_ms() ggml_v3_time_ms() +#define ggml_v3_perf_time_us() ggml_v3_time_us() +#define ggml_v3_perf_cycles() ggml_v3_cycles() +#define ggml_v3_perf_cycles_per_ms() ggml_v3_cycles_per_ms() +#else +#define ggml_v3_perf_time_ms() 0 +#define ggml_v3_perf_time_us() 0 +#define ggml_v3_perf_cycles() 0 +#define ggml_v3_perf_cycles_per_ms() 0 +#endif + +// +// cache line +// + +#if defined(__cpp_lib_hardware_interference_size) +#define CACHE_LINE_SIZE hardware_destructive_interference_size +#else +#if defined(__POWER9_VECTOR__) +#define CACHE_LINE_SIZE 128 +#else +#define CACHE_LINE_SIZE 64 +#endif +#endif + +static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); + +static void ggml_v3_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y); +static void ggml_v3_vec_dot_f16(const int n, float * restrict s, ggml_v3_fp16_t * restrict x, ggml_v3_fp16_t * restrict y); + +ggml_v3_collect_imatrix_t g_imatrix_collect_v3 = NULL; + +void ggml_v3_set_imatrix_collection(ggml_v3_collect_imatrix_t imatrix_collect) { + g_imatrix_collect_v3 = imatrix_collect; +} + +static const ggml_v3_type_traits_t type_traits[GGML_V3_TYPE_COUNT] = { + [GGML_V3_TYPE_I8] = { + .type_name = "i8", + .blck_size = 1, + .type_size = sizeof(int8_t), + .is_quantized = false, + }, + [GGML_V3_TYPE_I16] = { + .type_name = "i16", + .blck_size = 1, + .type_size = sizeof(int16_t), + .is_quantized = false, + }, + [GGML_V3_TYPE_I32] = { + .type_name = "i32", + .blck_size = 1, + .type_size = sizeof(int32_t), + .is_quantized = false, + }, + [GGML_V3_TYPE_F32] = { + .type_name = "f32", + .blck_size = 1, + .type_size = sizeof(float), + .is_quantized = false, + .vec_dot = (ggml_v3_vec_dot_t) ggml_v3_vec_dot_f32, + .vec_dot_type = GGML_V3_TYPE_F32, + }, + [GGML_V3_TYPE_F16] = { + .type_name = "f16", + .blck_size = 1, + .type_size = sizeof(ggml_v3_fp16_t), + .is_quantized = false, + .to_float = (ggml_v3_to_float_t) ggml_v3_fp16_to_fp32_row, + .from_float = (ggml_v3_from_float_t) ggml_v3_fp32_to_fp16_row, + .from_float_reference = (ggml_v3_from_float_t) ggml_v3_fp32_to_fp16_row, + .vec_dot = (ggml_v3_vec_dot_t) ggml_v3_vec_dot_f16, + .vec_dot_type = GGML_V3_TYPE_F16, + }, + [GGML_V3_TYPE_Q4_0] = { + .type_name = "q4_0", + .blck_size = QK4_0, + .type_size = sizeof(block_q4_0), + .is_quantized = true, + .to_float = (ggml_v3_to_float_t) dequantize_row_q4_0, + .from_float = quantize_row_q4_0, + .from_float_reference = (ggml_v3_from_float_t) quantize_row_q4_0_reference, + .vec_dot = ggml_v3_vec_dot_q4_0_q8_0, + .vec_dot_type = GGML_V3_TYPE_Q8_0, + }, + [GGML_V3_TYPE_Q4_1] = { + .type_name = "q4_1", + .blck_size = QK4_1, 
+ .type_size = sizeof(block_q4_1), + .is_quantized = true, + .to_float = (ggml_v3_to_float_t) dequantize_row_q4_1, + .from_float = quantize_row_q4_1, + .from_float_reference = (ggml_v3_from_float_t) quantize_row_q4_1_reference, + .vec_dot = ggml_v3_vec_dot_q4_1_q8_1, + .vec_dot_type = GGML_V3_TYPE_Q8_1, + }, + [4] = { // GGML_V3_TYPE_Q4_2 + .type_name = "DEPRECATED", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, + .to_float = NULL, + .from_float = NULL, + .from_float_reference = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_V3_TYPE_COUNT, + }, + [5] = { // GGML_V3_TYPE_Q4_3 + .type_name = "DEPRECATED", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, + .to_float = NULL, + .from_float = NULL, + .from_float_reference = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_V3_TYPE_COUNT, + }, + [GGML_V3_TYPE_Q5_0] = { + .type_name = "q5_0", + .blck_size = QK5_0, + .type_size = sizeof(block_q5_0), + .is_quantized = true, + .to_float = (ggml_v3_to_float_t) dequantize_row_q5_0, + .from_float = quantize_row_q5_0, + .from_float_reference = (ggml_v3_from_float_t) quantize_row_q5_0_reference, + .vec_dot = ggml_v3_vec_dot_q5_0_q8_0, + .vec_dot_type = GGML_V3_TYPE_Q8_0, + }, + [GGML_V3_TYPE_Q5_1] = { + .type_name = "q5_1", + .blck_size = QK5_1, + .type_size = sizeof(block_q5_1), + .is_quantized = true, + .to_float = (ggml_v3_to_float_t) dequantize_row_q5_1, + .from_float = quantize_row_q5_1, + .from_float_reference = (ggml_v3_from_float_t) quantize_row_q5_1_reference, + .vec_dot = ggml_v3_vec_dot_q5_1_q8_1, + .vec_dot_type = GGML_V3_TYPE_Q8_1, + }, + [GGML_V3_TYPE_Q8_0] = { + .type_name = "q8_0", + .blck_size = QK8_0, + .type_size = sizeof(block_q8_0), + .is_quantized = true, + .to_float = (ggml_v3_to_float_t) dequantize_row_q8_0, + .from_float = quantize_row_q8_0, + .from_float_reference = (ggml_v3_from_float_t) quantize_row_q8_0_reference, + .vec_dot = ggml_v3_vec_dot_q8_0_q8_0, + .vec_dot_type = GGML_V3_TYPE_Q8_0, + }, + [GGML_V3_TYPE_Q8_1] = { + .type_name = "q8_1", + .blck_size = QK8_1, + .type_size = sizeof(block_q8_1), + .is_quantized = true, + .from_float = quantize_row_q8_1, + .from_float_reference = (ggml_v3_from_float_t) quantize_row_q8_1_reference, + .vec_dot_type = GGML_V3_TYPE_Q8_1, + }, + [GGML_V3_TYPE_Q2_K] = { + .type_name = "q2_K", + .blck_size = QK_K, + .type_size = sizeof(block_q2_K), + .is_quantized = true, + .to_float = (ggml_v3_to_float_t) dequantize_row_q2_K, + .from_float = quantize_row_q2_K, + .from_float_reference = (ggml_v3_from_float_t) quantize_row_q2_K_reference, + .vec_dot = ggml_v3_vec_dot_q2_K_q8_K, + .vec_dot_type = GGML_V3_TYPE_Q8_K, + }, + [GGML_V3_TYPE_Q3_K] = { + .type_name = "q3_K", + .blck_size = QK_K, + .type_size = sizeof(block_q3_K), + .is_quantized = true, + .to_float = (ggml_v3_to_float_t) dequantize_row_q3_K, + .from_float = quantize_row_q3_K, + .from_float_reference = (ggml_v3_from_float_t) quantize_row_q3_K_reference, + .vec_dot = ggml_v3_vec_dot_q3_K_q8_K, + .vec_dot_type = GGML_V3_TYPE_Q8_K, + }, + [GGML_V3_TYPE_Q4_K] = { + .type_name = "q4_K", + .blck_size = QK_K, + .type_size = sizeof(block_q4_K), + .is_quantized = true, + .to_float = (ggml_v3_to_float_t) dequantize_row_q4_K, + .from_float = quantize_row_q4_K, + .from_float_reference = (ggml_v3_from_float_t) quantize_row_q4_K_reference, + .vec_dot = ggml_v3_vec_dot_q4_K_q8_K, + .vec_dot_type = GGML_V3_TYPE_Q8_K, + }, + [GGML_V3_TYPE_Q5_K] = { + .type_name = "q5_K", + .blck_size = QK_K, + .type_size = sizeof(block_q5_K), + .is_quantized = true, + .to_float = 
(ggml_v3_to_float_t) dequantize_row_q5_K, + .from_float = quantize_row_q5_K, + .from_float_reference = (ggml_v3_from_float_t) quantize_row_q5_K_reference, + .vec_dot = ggml_v3_vec_dot_q5_K_q8_K, + .vec_dot_type = GGML_V3_TYPE_Q8_K, + }, + [GGML_V3_TYPE_Q6_K] = { + .type_name = "q6_K", + .blck_size = QK_K, + .type_size = sizeof(block_q6_K), + .is_quantized = true, + .to_float = (ggml_v3_to_float_t) dequantize_row_q6_K, + .from_float = quantize_row_q6_K, + .from_float_reference = (ggml_v3_from_float_t) quantize_row_q6_K_reference, + .vec_dot = ggml_v3_vec_dot_q6_K_q8_K, + .vec_dot_type = GGML_V3_TYPE_Q8_K, + }, + [GGML_V3_TYPE_IQ2_XXS] = { + .type_name = "iq2_xxs", + .blck_size = QK_K, + .type_size = sizeof(block_iq2_xxs), + .is_quantized = true, + .to_float = (ggml_v3_to_float_t) dequantize_row_iq2_xxs, + .from_float = quantize_row_iq2_xxs, + .from_float_reference = (ggml_v3_from_float_t) quantize_row_iq2_xxs_reference, + .vec_dot = ggml_v3_vec_dot_iq2_xxs_q8_K, + .vec_dot_type = GGML_V3_TYPE_Q8_K, + }, + [GGML_V3_TYPE_IQ2_XS] = { + .type_name = "iq2_xs", + .blck_size = QK_K, + .type_size = sizeof(block_iq2_xs), + .is_quantized = true, + .to_float = (ggml_v3_to_float_t) dequantize_row_iq2_xs, + .from_float = quantize_row_iq2_xs, + .from_float_reference = (ggml_v3_from_float_t) quantize_row_iq2_xs_reference, + .vec_dot = ggml_v3_vec_dot_iq2_xs_q8_K, + .vec_dot_type = GGML_V3_TYPE_Q8_K, + }, + [GGML_V3_TYPE_Q8_K] = { + .type_name = "q8_K", + .blck_size = QK_K, + .type_size = sizeof(block_q8_K), + .is_quantized = true, + .from_float = quantize_row_q8_K, + } +}; + +// For internal test use +ggml_v3_type_traits_t ggml_v3_internal_get_type_traits(enum ggml_v3_type type) { + GGML_V3_ASSERT(type < GGML_V3_TYPE_COUNT); + return type_traits[type]; +} + +// +// simd mappings +// + +#if defined(__ARM_NEON) +#if !defined(__aarch64__) + +// 64-bit compatibility + +inline static float vaddvq_f32(float32x4_t v) { + return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); +} + +#endif +#endif + +// we define a common set of C macros which map to specific intrinsics based on the current architecture +// we then implement the fundamental computation operations below using only these macros +// adding support for new architectures requires to define the corresponding SIMD macros +// +// GGML_V3_F32_STEP / GGML_V3_F16_STEP +// number of elements to process in a single step +// +// GGML_V3_F32_EPR / GGML_V3_F16_EPR +// number of elements to fit in a single register +// + +#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) + +#define GGML_V3_SIMD + +// F32 NEON + +#define GGML_V3_F32_STEP 16 +#define GGML_V3_F32_EPR 4 + +#define GGML_V3_F32x4 float32x4_t +#define GGML_V3_F32x4_ZERO vdupq_n_f32(0.0f) +#define GGML_V3_F32x4_SET1(x) vdupq_n_f32(x) +#define GGML_V3_F32x4_LOAD vld1q_f32 +#define GGML_V3_F32x4_STORE vst1q_f32 +#define GGML_V3_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c) +#define GGML_V3_F32x4_ADD vaddq_f32 +#define GGML_V3_F32x4_MUL vmulq_f32 +#define GGML_V3_F32x4_REDUCE_ONE(x) vaddvq_f32(x) +#define GGML_V3_F32x4_REDUCE(res, x) \ +{ \ + int offset = GGML_V3_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vaddq_f32(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vaddq_f32(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vaddq_f32(x[i], x[offset+i]); \ + } \ + res = GGML_V3_F32x4_REDUCE_ONE(x[0]); \ +} + +#define GGML_V3_F32_VEC GGML_V3_F32x4 +#define 
GGML_V3_F32_VEC_ZERO GGML_V3_F32x4_ZERO +#define GGML_V3_F32_VEC_SET1 GGML_V3_F32x4_SET1 +#define GGML_V3_F32_VEC_LOAD GGML_V3_F32x4_LOAD +#define GGML_V3_F32_VEC_STORE GGML_V3_F32x4_STORE +#define GGML_V3_F32_VEC_FMA GGML_V3_F32x4_FMA +#define GGML_V3_F32_VEC_ADD GGML_V3_F32x4_ADD +#define GGML_V3_F32_VEC_MUL GGML_V3_F32x4_MUL +#define GGML_V3_F32_VEC_REDUCE GGML_V3_F32x4_REDUCE + +// F16 NEON + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + #define GGML_V3_F16_STEP 32 + #define GGML_V3_F16_EPR 8 + + #define GGML_V3_F16x8 float16x8_t + #define GGML_V3_F16x8_ZERO vdupq_n_f16(0.0f) + #define GGML_V3_F16x8_SET1(x) vdupq_n_f16(x) + #define GGML_V3_F16x8_LOAD vld1q_f16 + #define GGML_V3_F16x8_STORE vst1q_f16 + #define GGML_V3_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) + #define GGML_V3_F16x8_ADD vaddq_f16 + #define GGML_V3_F16x8_MUL vmulq_f16 + #define GGML_V3_F16x8_REDUCE(res, x) \ + do { \ + int offset = GGML_V3_F16_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vaddq_f16(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vaddq_f16(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vaddq_f16(x[i], x[offset+i]); \ + } \ + const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \ + const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \ + res = (ggml_v3_float) vaddvq_f32(vaddq_f32(t0, t1)); \ + } while (0) + + #define GGML_V3_F16_VEC GGML_V3_F16x8 + #define GGML_V3_F16_VEC_ZERO GGML_V3_F16x8_ZERO + #define GGML_V3_F16_VEC_SET1 GGML_V3_F16x8_SET1 + #define GGML_V3_F16_VEC_LOAD(p, i) GGML_V3_F16x8_LOAD(p) + #define GGML_V3_F16_VEC_STORE(p, r, i) GGML_V3_F16x8_STORE(p, r[i]) + #define GGML_V3_F16_VEC_FMA GGML_V3_F16x8_FMA + #define GGML_V3_F16_VEC_ADD GGML_V3_F16x8_ADD + #define GGML_V3_F16_VEC_MUL GGML_V3_F16x8_MUL + #define GGML_V3_F16_VEC_REDUCE GGML_V3_F16x8_REDUCE +#else + // if FP16 vector arithmetic is not supported, we use FP32 instead + // and take advantage of the vcvt_ functions to convert to/from FP16 + + #define GGML_V3_F16_STEP 16 + #define GGML_V3_F16_EPR 4 + + #define GGML_V3_F32Cx4 float32x4_t + #define GGML_V3_F32Cx4_ZERO vdupq_n_f32(0.0f) + #define GGML_V3_F32Cx4_SET1(x) vdupq_n_f32(x) + #define GGML_V3_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16(x)) + #define GGML_V3_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y)) + #define GGML_V3_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c) + #define GGML_V3_F32Cx4_ADD vaddq_f32 + #define GGML_V3_F32Cx4_MUL vmulq_f32 + #define GGML_V3_F32Cx4_REDUCE GGML_V3_F32x4_REDUCE + + #define GGML_V3_F16_VEC GGML_V3_F32Cx4 + #define GGML_V3_F16_VEC_ZERO GGML_V3_F32Cx4_ZERO + #define GGML_V3_F16_VEC_SET1 GGML_V3_F32Cx4_SET1 + #define GGML_V3_F16_VEC_LOAD(p, i) GGML_V3_F32Cx4_LOAD(p) + #define GGML_V3_F16_VEC_STORE(p, r, i) GGML_V3_F32Cx4_STORE(p, r[i]) + #define GGML_V3_F16_VEC_FMA GGML_V3_F32Cx4_FMA + #define GGML_V3_F16_VEC_ADD GGML_V3_F32Cx4_ADD + #define GGML_V3_F16_VEC_MUL GGML_V3_F32Cx4_MUL + #define GGML_V3_F16_VEC_REDUCE GGML_V3_F32Cx4_REDUCE +#endif + +#elif defined(__AVX__) + +#define GGML_V3_SIMD + +// F32 AVX + +#define GGML_V3_F32_STEP 32 +#define GGML_V3_F32_EPR 8 + +#define GGML_V3_F32x8 __m256 +#define GGML_V3_F32x8_ZERO _mm256_setzero_ps() +#define GGML_V3_F32x8_SET1(x) _mm256_set1_ps(x) +#define GGML_V3_F32x8_LOAD _mm256_loadu_ps +#define GGML_V3_F32x8_STORE _mm256_storeu_ps +#if defined(__FMA__) + #define GGML_V3_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a) +#else + #define GGML_V3_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a) +#endif 
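// Both branches above give GGML_V3_F32x8_FMA(a, b, c) the same meaning, a + b*c per lane;
// the __FMA__ path merely fuses the multiply and add into a single instruction. A scalar
// sketch of the contract the kernels further down rely on (illustrative only):
//
//   acc = GGML_V3_F32x8_FMA(acc, x, y);   // behaves like: for each lane, acc += x*y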
+#define GGML_V3_F32x8_ADD _mm256_add_ps +#define GGML_V3_F32x8_MUL _mm256_mul_ps +#define GGML_V3_F32x8_REDUCE(res, x) \ +do { \ + int offset = GGML_V3_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm256_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm256_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm256_add_ps(x[i], x[offset+i]); \ + } \ + const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \ + _mm256_extractf128_ps(x[0], 1)); \ + const __m128 t1 = _mm_hadd_ps(t0, t0); \ + res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \ +} while (0) +// TODO: is this optimal ? + +#define GGML_V3_F32_VEC GGML_V3_F32x8 +#define GGML_V3_F32_VEC_ZERO GGML_V3_F32x8_ZERO +#define GGML_V3_F32_VEC_SET1 GGML_V3_F32x8_SET1 +#define GGML_V3_F32_VEC_LOAD GGML_V3_F32x8_LOAD +#define GGML_V3_F32_VEC_STORE GGML_V3_F32x8_STORE +#define GGML_V3_F32_VEC_FMA GGML_V3_F32x8_FMA +#define GGML_V3_F32_VEC_ADD GGML_V3_F32x8_ADD +#define GGML_V3_F32_VEC_MUL GGML_V3_F32x8_MUL +#define GGML_V3_F32_VEC_REDUCE GGML_V3_F32x8_REDUCE + +// F16 AVX + +#define GGML_V3_F16_STEP 32 +#define GGML_V3_F16_EPR 8 + +// F16 arithmetic is not supported by AVX, so we use F32 instead + +#define GGML_V3_F32Cx8 __m256 +#define GGML_V3_F32Cx8_ZERO _mm256_setzero_ps() +#define GGML_V3_F32Cx8_SET1(x) _mm256_set1_ps(x) + +#if defined(__F16C__) +// the _mm256_cvt intrinsics require F16C +#define GGML_V3_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x))) +#define GGML_V3_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0)) +#else +static inline __m256 __avx_f32cx8_load(ggml_v3_fp16_t *x) { + float tmp[8]; + + for (int i = 0; i < 8; i++) { + tmp[i] = GGML_V3_FP16_TO_FP32(x[i]); + } + + return _mm256_loadu_ps(tmp); +} +static inline void __avx_f32cx8_store(ggml_v3_fp16_t *x, __m256 y) { + float arr[8]; + + _mm256_storeu_ps(arr, y); + + for (int i = 0; i < 8; i++) + x[i] = GGML_V3_FP32_TO_FP16(arr[i]); +} +#define GGML_V3_F32Cx8_LOAD(x) __avx_f32cx8_load(x) +#define GGML_V3_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y) +#endif + +#define GGML_V3_F32Cx8_FMA GGML_V3_F32x8_FMA +#define GGML_V3_F32Cx8_ADD _mm256_add_ps +#define GGML_V3_F32Cx8_MUL _mm256_mul_ps +#define GGML_V3_F32Cx8_REDUCE GGML_V3_F32x8_REDUCE + +#define GGML_V3_F16_VEC GGML_V3_F32Cx8 +#define GGML_V3_F16_VEC_ZERO GGML_V3_F32Cx8_ZERO +#define GGML_V3_F16_VEC_SET1 GGML_V3_F32Cx8_SET1 +#define GGML_V3_F16_VEC_LOAD(p, i) GGML_V3_F32Cx8_LOAD(p) +#define GGML_V3_F16_VEC_STORE(p, r, i) GGML_V3_F32Cx8_STORE(p, r[i]) +#define GGML_V3_F16_VEC_FMA GGML_V3_F32Cx8_FMA +#define GGML_V3_F16_VEC_ADD GGML_V3_F32Cx8_ADD +#define GGML_V3_F16_VEC_MUL GGML_V3_F32Cx8_MUL +#define GGML_V3_F16_VEC_REDUCE GGML_V3_F32Cx8_REDUCE + +#elif defined(__POWER9_VECTOR__) + +#define GGML_V3_SIMD + +// F32 POWER9 + +#define GGML_V3_F32_STEP 32 +#define GGML_V3_F32_EPR 4 + +#define GGML_V3_F32x4 vector float +#define GGML_V3_F32x4_ZERO 0.0f +#define GGML_V3_F32x4_SET1 vec_splats +#define GGML_V3_F32x4_LOAD(p) vec_xl(0, p) +#define GGML_V3_F32x4_STORE(p, r) vec_xst(r, 0, p) +#define GGML_V3_F32x4_FMA(a, b, c) vec_madd(b, c, a) +#define GGML_V3_F32x4_ADD vec_add +#define GGML_V3_F32x4_MUL vec_mul +#define GGML_V3_F32x4_REDUCE(res, x) \ +{ \ + int offset = GGML_V3_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vec_add(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vec_add(x[i], x[offset+i]); \ + } \ + 
offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vec_add(x[i], x[offset+i]); \ + } \ + res = vec_extract(x[0], 0) + \ + vec_extract(x[0], 1) + \ + vec_extract(x[0], 2) + \ + vec_extract(x[0], 3); \ +} + +#define GGML_V3_F32_VEC GGML_V3_F32x4 +#define GGML_V3_F32_VEC_ZERO GGML_V3_F32x4_ZERO +#define GGML_V3_F32_VEC_SET1 GGML_V3_F32x4_SET1 +#define GGML_V3_F32_VEC_LOAD GGML_V3_F32x4_LOAD +#define GGML_V3_F32_VEC_STORE GGML_V3_F32x4_STORE +#define GGML_V3_F32_VEC_FMA GGML_V3_F32x4_FMA +#define GGML_V3_F32_VEC_ADD GGML_V3_F32x4_ADD +#define GGML_V3_F32_VEC_MUL GGML_V3_F32x4_MUL +#define GGML_V3_F32_VEC_REDUCE GGML_V3_F32x4_REDUCE + +// F16 POWER9 +#define GGML_V3_F16_STEP GGML_V3_F32_STEP +#define GGML_V3_F16_EPR GGML_V3_F32_EPR +#define GGML_V3_F16_VEC GGML_V3_F32x4 +#define GGML_V3_F16_VEC_ZERO GGML_V3_F32x4_ZERO +#define GGML_V3_F16_VEC_SET1 GGML_V3_F32x4_SET1 +#define GGML_V3_F16_VEC_FMA GGML_V3_F32x4_FMA +#define GGML_V3_F16_VEC_REDUCE GGML_V3_F32x4_REDUCE +// Use vec_xl, not vec_ld, in case the load address is not aligned. +#define GGML_V3_F16_VEC_LOAD(p, i) (i & 0x1) ? \ + vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_V3_F16_EPR)) : \ + vec_extract_fp32_from_shortl(vec_xl(0, p)) +#define GGML_V3_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i] +#define GGML_V3_F16_VEC_STORE(p, r, i) \ + if (i & 0x1) \ + vec_xst(vec_pack_to_short_fp32(r[i - GGML_V3_ENDIAN_BYTE(1)], \ + r[i - GGML_V3_ENDIAN_BYTE(0)]), \ + 0, p - GGML_V3_F16_EPR) + +#elif defined(__wasm_simd128__) + +#define GGML_V3_SIMD + +// F32 WASM + +#define GGML_V3_F32_STEP 16 +#define GGML_V3_F32_EPR 4 + +#define GGML_V3_F32x4 v128_t +#define GGML_V3_F32x4_ZERO wasm_f32x4_splat(0.0f) +#define GGML_V3_F32x4_SET1(x) wasm_f32x4_splat(x) +#define GGML_V3_F32x4_LOAD wasm_v128_load +#define GGML_V3_F32x4_STORE wasm_v128_store +#define GGML_V3_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a) +#define GGML_V3_F32x4_ADD wasm_f32x4_add +#define GGML_V3_F32x4_MUL wasm_f32x4_mul +#define GGML_V3_F32x4_REDUCE(res, x) \ +{ \ + int offset = GGML_V3_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ + } \ + res = wasm_f32x4_extract_lane(x[0], 0) + \ + wasm_f32x4_extract_lane(x[0], 1) + \ + wasm_f32x4_extract_lane(x[0], 2) + \ + wasm_f32x4_extract_lane(x[0], 3); \ +} + +#define GGML_V3_F32_VEC GGML_V3_F32x4 +#define GGML_V3_F32_VEC_ZERO GGML_V3_F32x4_ZERO +#define GGML_V3_F32_VEC_SET1 GGML_V3_F32x4_SET1 +#define GGML_V3_F32_VEC_LOAD GGML_V3_F32x4_LOAD +#define GGML_V3_F32_VEC_STORE GGML_V3_F32x4_STORE +#define GGML_V3_F32_VEC_FMA GGML_V3_F32x4_FMA +#define GGML_V3_F32_VEC_ADD GGML_V3_F32x4_ADD +#define GGML_V3_F32_VEC_MUL GGML_V3_F32x4_MUL +#define GGML_V3_F32_VEC_REDUCE GGML_V3_F32x4_REDUCE + +// F16 WASM + +#define GGML_V3_F16_STEP 16 +#define GGML_V3_F16_EPR 4 + +inline static v128_t __wasm_f16x4_load(const ggml_v3_fp16_t * p) { + float tmp[4]; + + tmp[0] = GGML_V3_FP16_TO_FP32(p[0]); + tmp[1] = GGML_V3_FP16_TO_FP32(p[1]); + tmp[2] = GGML_V3_FP16_TO_FP32(p[2]); + tmp[3] = GGML_V3_FP16_TO_FP32(p[3]); + + return wasm_v128_load(tmp); +} + +inline static void __wasm_f16x4_store(ggml_v3_fp16_t * p, v128_t x) { + float tmp[4]; + + wasm_v128_store(tmp, x); + + p[0] = GGML_V3_FP32_TO_FP16(tmp[0]); + p[1] = GGML_V3_FP32_TO_FP16(tmp[1]); + p[2] = 
GGML_V3_FP32_TO_FP16(tmp[2]); + p[3] = GGML_V3_FP32_TO_FP16(tmp[3]); +} + +#define GGML_V3_F16x4 v128_t +#define GGML_V3_F16x4_ZERO wasm_f32x4_splat(0.0f) +#define GGML_V3_F16x4_SET1(x) wasm_f32x4_splat(x) +#define GGML_V3_F16x4_LOAD(x) __wasm_f16x4_load(x) +#define GGML_V3_F16x4_STORE(x, y) __wasm_f16x4_store(x, y) +#define GGML_V3_F16x4_FMA GGML_V3_F32x4_FMA +#define GGML_V3_F16x4_ADD wasm_f32x4_add +#define GGML_V3_F16x4_MUL wasm_f32x4_mul +#define GGML_V3_F16x4_REDUCE(res, x) \ +{ \ + int offset = GGML_V3_F16_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ + } \ + res = wasm_f32x4_extract_lane(x[0], 0) + \ + wasm_f32x4_extract_lane(x[0], 1) + \ + wasm_f32x4_extract_lane(x[0], 2) + \ + wasm_f32x4_extract_lane(x[0], 3); \ +} + +#define GGML_V3_F16_VEC GGML_V3_F16x4 +#define GGML_V3_F16_VEC_ZERO GGML_V3_F16x4_ZERO +#define GGML_V3_F16_VEC_SET1 GGML_V3_F16x4_SET1 +#define GGML_V3_F16_VEC_LOAD(p, i) GGML_V3_F16x4_LOAD(p) +#define GGML_V3_F16_VEC_STORE(p, r, i) GGML_V3_F16x4_STORE(p, r[i]) +#define GGML_V3_F16_VEC_FMA GGML_V3_F16x4_FMA +#define GGML_V3_F16_VEC_ADD GGML_V3_F16x4_ADD +#define GGML_V3_F16_VEC_MUL GGML_V3_F16x4_MUL +#define GGML_V3_F16_VEC_REDUCE GGML_V3_F16x4_REDUCE + +#elif defined(__SSE3__) + +#define GGML_V3_SIMD + +// F32 SSE + +#define GGML_V3_F32_STEP 32 +#define GGML_V3_F32_EPR 4 + +#define GGML_V3_F32x4 __m128 +#define GGML_V3_F32x4_ZERO _mm_setzero_ps() +#define GGML_V3_F32x4_SET1(x) _mm_set1_ps(x) +#define GGML_V3_F32x4_LOAD _mm_loadu_ps +#define GGML_V3_F32x4_STORE _mm_storeu_ps +#if defined(__FMA__) + // TODO: Does this work? + #define GGML_V3_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a) +#else + #define GGML_V3_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a) +#endif +#define GGML_V3_F32x4_ADD _mm_add_ps +#define GGML_V3_F32x4_MUL _mm_mul_ps +#define GGML_V3_F32x4_REDUCE(res, x) \ +{ \ + int offset = GGML_V3_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm_add_ps(x[i], x[offset+i]); \ + } \ + const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \ + res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \ +} +// TODO: is this optimal ? 
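// All the *_REDUCE macros in this file share the same shape: pairwise-add the GGML_V3_F32_ARR
// (or GGML_V3_F16_ARR) partial accumulators into x[0] using three unrolled halving passes
// (passes beyond log2(ARR) simply run zero iterations), then horizontally add the lanes of
// x[0] into a scalar. A rough scalar sketch of the idea (illustrative only; lane_add and
// horizontal_add are placeholder names, not functions defined in this file):
//
//   for (int off = ARR >> 1; off > 0; off >>= 1)        // 8 -> 4 -> 2 -> 1 accumulators
//       for (int i = 0; i < off; ++i)
//           lane_add(x[i], x[off + i]);                  // x[i] += x[off + i], per lane
//   res = horizontal_add(x[0]);                          // sum the remaining lanes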
+ +#define GGML_V3_F32_VEC GGML_V3_F32x4 +#define GGML_V3_F32_VEC_ZERO GGML_V3_F32x4_ZERO +#define GGML_V3_F32_VEC_SET1 GGML_V3_F32x4_SET1 +#define GGML_V3_F32_VEC_LOAD GGML_V3_F32x4_LOAD +#define GGML_V3_F32_VEC_STORE GGML_V3_F32x4_STORE +#define GGML_V3_F32_VEC_FMA GGML_V3_F32x4_FMA +#define GGML_V3_F32_VEC_ADD GGML_V3_F32x4_ADD +#define GGML_V3_F32_VEC_MUL GGML_V3_F32x4_MUL +#define GGML_V3_F32_VEC_REDUCE GGML_V3_F32x4_REDUCE + +// F16 SSE + +#define GGML_V3_F16_STEP 32 +#define GGML_V3_F16_EPR 4 + +static inline __m128 __sse_f16x4_load(ggml_v3_fp16_t *x) { + float tmp[4]; + + tmp[0] = GGML_V3_FP16_TO_FP32(x[0]); + tmp[1] = GGML_V3_FP16_TO_FP32(x[1]); + tmp[2] = GGML_V3_FP16_TO_FP32(x[2]); + tmp[3] = GGML_V3_FP16_TO_FP32(x[3]); + + return _mm_loadu_ps(tmp); +} + +static inline void __sse_f16x4_store(ggml_v3_fp16_t *x, __m128 y) { + float arr[4]; + + _mm_storeu_ps(arr, y); + + x[0] = GGML_V3_FP32_TO_FP16(arr[0]); + x[1] = GGML_V3_FP32_TO_FP16(arr[1]); + x[2] = GGML_V3_FP32_TO_FP16(arr[2]); + x[3] = GGML_V3_FP32_TO_FP16(arr[3]); +} + +#define GGML_V3_F32Cx4 __m128 +#define GGML_V3_F32Cx4_ZERO _mm_setzero_ps() +#define GGML_V3_F32Cx4_SET1(x) _mm_set1_ps(x) +#define GGML_V3_F32Cx4_LOAD(x) __sse_f16x4_load(x) +#define GGML_V3_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y) +#define GGML_V3_F32Cx4_FMA GGML_V3_F32x4_FMA +#define GGML_V3_F32Cx4_ADD _mm_add_ps +#define GGML_V3_F32Cx4_MUL _mm_mul_ps +#define GGML_V3_F32Cx4_REDUCE GGML_V3_F32x4_REDUCE + +#define GGML_V3_F16_VEC GGML_V3_F32Cx4 +#define GGML_V3_F16_VEC_ZERO GGML_V3_F32Cx4_ZERO +#define GGML_V3_F16_VEC_SET1 GGML_V3_F32Cx4_SET1 +#define GGML_V3_F16_VEC_LOAD(p, i) GGML_V3_F32Cx4_LOAD(p) +#define GGML_V3_F16_VEC_STORE(p, r, i) GGML_V3_F32Cx4_STORE(p, r[i]) +#define GGML_V3_F16_VEC_FMA GGML_V3_F32Cx4_FMA +#define GGML_V3_F16_VEC_ADD GGML_V3_F32Cx4_ADD +#define GGML_V3_F16_VEC_MUL GGML_V3_F32Cx4_MUL +#define GGML_V3_F16_VEC_REDUCE GGML_V3_F32Cx4_REDUCE + +#endif + +// GGML_V3_F32_ARR / GGML_V3_F16_ARR +// number of registers to use per step +#ifdef GGML_V3_SIMD +#define GGML_V3_F32_ARR (GGML_V3_F32_STEP/GGML_V3_F32_EPR) +#define GGML_V3_F16_ARR (GGML_V3_F16_STEP/GGML_V3_F16_EPR) +#endif + +// +// fundamental operations +// + +inline static void ggml_v3_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; } + +inline static void ggml_v3_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } + +inline static void ggml_v3_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } + +inline static void ggml_v3_vec_set_f16(const int n, ggml_v3_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } + +inline static void ggml_v3_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } +inline static void ggml_v3_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; } +inline static void ggml_v3_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; } +inline static void ggml_v3_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; } +inline static void ggml_v3_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; } +inline static void ggml_v3_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; } +inline 
static void ggml_v3_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; } +inline static void ggml_v3_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; } +inline static void ggml_v3_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } +inline static void ggml_v3_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } + +static void ggml_v3_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) { +#ifdef GGML_V3_SIMD + float sumf = 0.0f; + const int np = (n & ~(GGML_V3_F32_STEP - 1)); + + GGML_V3_F32_VEC sum[GGML_V3_F32_ARR] = { GGML_V3_F32_VEC_ZERO }; + + GGML_V3_F32_VEC ax[GGML_V3_F32_ARR]; + GGML_V3_F32_VEC ay[GGML_V3_F32_ARR]; + + for (int i = 0; i < np; i += GGML_V3_F32_STEP) { + for (int j = 0; j < GGML_V3_F32_ARR; j++) { + ax[j] = GGML_V3_F32_VEC_LOAD(x + i + j*GGML_V3_F32_EPR); + ay[j] = GGML_V3_F32_VEC_LOAD(y + i + j*GGML_V3_F32_EPR); + + sum[j] = GGML_V3_F32_VEC_FMA(sum[j], ax[j], ay[j]); + } + } + + // reduce sum0..sum3 to sum0 + GGML_V3_F32_VEC_REDUCE(sumf, sum); + + // leftovers + for (int i = np; i < n; ++i) { + sumf += x[i]*y[i]; + } +#else + // scalar + ggml_v3_float sumf = 0.0; + for (int i = 0; i < n; ++i) { + sumf += (ggml_v3_float)(x[i]*y[i]); + } +#endif + + *s = sumf; +} + +static void ggml_v3_vec_dot_f16(const int n, float * restrict s, ggml_v3_fp16_t * restrict x, ggml_v3_fp16_t * restrict y) { + ggml_v3_float sumf = 0.0; + +#if defined(GGML_V3_SIMD) + const int np = (n & ~(GGML_V3_F16_STEP - 1)); + + GGML_V3_F16_VEC sum[GGML_V3_F16_ARR] = { GGML_V3_F16_VEC_ZERO }; + + GGML_V3_F16_VEC ax[GGML_V3_F16_ARR]; + GGML_V3_F16_VEC ay[GGML_V3_F16_ARR]; + + for (int i = 0; i < np; i += GGML_V3_F16_STEP) { + for (int j = 0; j < GGML_V3_F16_ARR; j++) { + ax[j] = GGML_V3_F16_VEC_LOAD(x + i + j*GGML_V3_F16_EPR, j); + ay[j] = GGML_V3_F16_VEC_LOAD(y + i + j*GGML_V3_F16_EPR, j); + + sum[j] = GGML_V3_F16_VEC_FMA(sum[j], ax[j], ay[j]); + } + } + + // reduce sum0..sum3 to sum0 + GGML_V3_F16_VEC_REDUCE(sumf, sum); + + // leftovers + for (int i = np; i < n; ++i) { + sumf += (ggml_v3_float)(GGML_V3_FP16_TO_FP32(x[i])*GGML_V3_FP16_TO_FP32(y[i])); + } +#else + for (int i = 0; i < n; ++i) { + sumf += (ggml_v3_float)(GGML_V3_FP16_TO_FP32(x[i])*GGML_V3_FP16_TO_FP32(y[i])); + } +#endif + + *s = sumf; +} + +// compute GGML_V3_VEC_DOT_UNROLL dot products at once +// xs - x row stride in bytes +inline static void ggml_v3_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_v3_fp16_t * restrict y) { + ggml_v3_float sumf[GGML_V3_VEC_DOT_UNROLL] = { 0.0 }; + + ggml_v3_fp16_t * restrict x[GGML_V3_VEC_DOT_UNROLL]; + + for (int i = 0; i < GGML_V3_VEC_DOT_UNROLL; ++i) { + x[i] = (ggml_v3_fp16_t *) ((char *) xv + i*xs); + } + +#if defined(GGML_V3_SIMD) + const int np = (n & ~(GGML_V3_F16_STEP - 1)); + + GGML_V3_F16_VEC sum[GGML_V3_VEC_DOT_UNROLL][GGML_V3_F16_ARR] = { { GGML_V3_F16_VEC_ZERO } }; + + GGML_V3_F16_VEC ax[GGML_V3_F16_ARR]; + GGML_V3_F16_VEC ay[GGML_V3_F16_ARR]; + + for (int i = 0; i < np; i += GGML_V3_F16_STEP) { + for (int j = 0; j < GGML_V3_F16_ARR; j++) { + ay[j] = GGML_V3_F16_VEC_LOAD(y + i + j*GGML_V3_F16_EPR, j); + + for (int k = 0; k < GGML_V3_VEC_DOT_UNROLL; ++k) { + ax[j] = GGML_V3_F16_VEC_LOAD(x[k] + i + j*GGML_V3_F16_EPR, j); + + sum[k][j] = GGML_V3_F16_VEC_FMA(sum[k][j], ax[j], 
ay[j]); + } + } + } + + // reduce sum0..sum3 to sum0 + for (int k = 0; k < GGML_V3_VEC_DOT_UNROLL; ++k) { + GGML_V3_F16_VEC_REDUCE(sumf[k], sum[k]); + } + + // leftovers + for (int i = np; i < n; ++i) { + for (int j = 0; j < GGML_V3_VEC_DOT_UNROLL; ++j) { + sumf[j] += (ggml_v3_float)(GGML_V3_FP16_TO_FP32(x[j][i])*GGML_V3_FP16_TO_FP32(y[i])); + } + } +#else + for (int i = 0; i < n; ++i) { + for (int j = 0; j < GGML_V3_VEC_DOT_UNROLL; ++j) { + sumf[j] += (ggml_v3_float)(GGML_V3_FP16_TO_FP32(x[j][i])*GGML_V3_FP16_TO_FP32(y[i])); + } + } +#endif + + for (int i = 0; i < GGML_V3_VEC_DOT_UNROLL; ++i) { + s[i] = sumf[i]; + } +} + +inline static void ggml_v3_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) { +#if defined(GGML_V3_SIMD) + const int np = (n & ~(GGML_V3_F32_STEP - 1)); + + GGML_V3_F32_VEC vx = GGML_V3_F32_VEC_SET1(v); + + GGML_V3_F32_VEC ax[GGML_V3_F32_ARR]; + GGML_V3_F32_VEC ay[GGML_V3_F32_ARR]; + + for (int i = 0; i < np; i += GGML_V3_F32_STEP) { + for (int j = 0; j < GGML_V3_F32_ARR; j++) { + ax[j] = GGML_V3_F32_VEC_LOAD(x + i + j*GGML_V3_F32_EPR); + ay[j] = GGML_V3_F32_VEC_LOAD(y + i + j*GGML_V3_F32_EPR); + ay[j] = GGML_V3_F32_VEC_FMA(ay[j], ax[j], vx); + + GGML_V3_F32_VEC_STORE(y + i + j*GGML_V3_F32_EPR, ay[j]); + } + } + + // leftovers + for (int i = np; i < n; ++i) { + y[i] += x[i]*v; + } +#else + // scalar + for (int i = 0; i < n; ++i) { + y[i] += x[i]*v; + } +#endif +} + +// xs and vs are byte strides of x and v +inline static void ggml_v3_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) { + + const float * restrict x[GGML_V3_VEC_MAD_UNROLL]; + const float * restrict v[GGML_V3_VEC_MAD_UNROLL]; + + for (int i = 0; i < GGML_V3_VEC_MAD_UNROLL; ++i) { + x[i] = (const float *) ((const char *) xv + i*xs); + v[i] = (const float *) ((const char *) vv + i*vs); + } + +#if defined(GGML_V3_SIMD) + const int np = (n & ~(GGML_V3_F32_STEP - 1)); + + GGML_V3_F32_VEC vx[GGML_V3_VEC_MAD_UNROLL]; + + for (int k = 0; k < GGML_V3_VEC_MAD_UNROLL; ++k) { + vx[k] = GGML_V3_F32_VEC_SET1(v[k][0]); + } + + GGML_V3_F32_VEC ax[GGML_V3_VEC_MAD_UNROLL][GGML_V3_F32_ARR]; + GGML_V3_F32_VEC ay[GGML_V3_F32_ARR]; + + for (int i = 0; i < np; i += GGML_V3_F32_STEP) { + for (int j = 0; j < GGML_V3_F32_ARR; j++) { + ay[j] = GGML_V3_F32_VEC_LOAD(y + i + j*GGML_V3_F32_EPR); + + for (int k = 0; k < GGML_V3_VEC_MAD_UNROLL; ++k) { + ax[k][j] = GGML_V3_F32_VEC_LOAD(x[k] + i + j*GGML_V3_F32_EPR); + ay[j] = GGML_V3_F32_VEC_FMA(ay[j], ax[k][j], vx[k]); + } + + GGML_V3_F32_VEC_STORE(y + i + j*GGML_V3_F32_EPR, ay[j]); + } + } + + // leftovers + for (int k = 0; k < GGML_V3_VEC_MAD_UNROLL; ++k) { + for (int i = np; i < n; ++i) { + y[i] += x[k][i]*v[k][0]; + } + } +#else + // scalar + for (int k = 0; k < GGML_V3_VEC_MAD_UNROLL; ++k) { + for (int i = 0; i < n; ++i) { + y[i] += x[k][i]*v[k][0]; + } + } +#endif +} + +//inline static void ggml_v3_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; } +inline static void ggml_v3_vec_scale_f32(const int n, float * y, const float v) { +#if defined(GGML_USE_ACCELERATE) + vDSP_vsmul(y, 1, &v, y, 1, n); +#elif defined(GGML_V3_SIMD) + const int np = (n & ~(GGML_V3_F32_STEP - 1)); + + GGML_V3_F32_VEC vx = GGML_V3_F32_VEC_SET1(v); + + GGML_V3_F32_VEC ay[GGML_V3_F32_ARR]; + + for (int i = 0; i < np; i += GGML_V3_F32_STEP) { + for (int j = 0; j < GGML_V3_F32_ARR; j++) { + ay[j] = GGML_V3_F32_VEC_LOAD(y + i + j*GGML_V3_F32_EPR); + 
ay[j] = GGML_V3_F32_VEC_MUL(ay[j], vx); + + GGML_V3_F32_VEC_STORE(y + i + j*GGML_V3_F32_EPR, ay[j]); + } + } + + // leftovers + for (int i = np; i < n; ++i) { + y[i] *= v; + } +#else + // scalar + for (int i = 0; i < n; ++i) { + y[i] *= v; + } +#endif +} + +inline static void ggml_v3_vec_norm_f32 (const int n, float * s, const float * x) { ggml_v3_vec_dot_f32(n, s, x, x); *s = sqrtf(*s); } +inline static void ggml_v3_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; } +inline static void ggml_v3_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); } +inline static void ggml_v3_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); } +inline static void ggml_v3_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); } +inline static void ggml_v3_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); } +inline static void ggml_v3_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; } +inline static void ggml_v3_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); } +inline static void ggml_v3_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; } +inline static void ggml_v3_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; } +inline static void ggml_v3_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? 
x[i] : 0.f); } + +static const float GELU_COEF_A = 0.044715f; +static const float GELU_QUICK_COEF = -1.702f; +static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; + +inline static float ggml_v3_gelu_f32(float x) { + return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); +} + +inline static void ggml_v3_vec_gelu_f16(const int n, ggml_v3_fp16_t * y, const ggml_v3_fp16_t * x) { + const uint16_t * i16 = (const uint16_t *) x; + for (int i = 0; i < n; ++i) { + y[i] = ggml_v3_table_gelu_f16[i16[i]]; + } +} + +#ifdef GGML_V3_GELU_FP16 +inline static void ggml_v3_vec_gelu_f32(const int n, float * y, const float * x) { + uint16_t t; + for (int i = 0; i < n; ++i) { + ggml_v3_fp16_t fp16 = GGML_V3_FP32_TO_FP16(x[i]); + memcpy(&t, &fp16, sizeof(uint16_t)); + y[i] = GGML_V3_FP16_TO_FP32(ggml_v3_table_gelu_f16[t]); + } +} +#else +inline static void ggml_v3_vec_gelu_f32(const int n, float * y, const float * x) { + for (int i = 0; i < n; ++i) { + y[i] = ggml_v3_gelu_f32(x[i]); + } +} +#endif + +inline static float ggml_v3_gelu_quick_f32(float x) { + return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x))); +} + +//inline static void ggml_v3_vec_gelu_quick_f16(const int n, ggml_v3_fp16_t * y, const ggml_v3_fp16_t * x) { +// const uint16_t * i16 = (const uint16_t *) x; +// for (int i = 0; i < n; ++i) { +// y[i] = ggml_v3_table_gelu_quick_f16[i16[i]]; +// } +//} + +#ifdef GGML_V3_GELU_QUICK_FP16 +inline static void ggml_v3_vec_gelu_quick_f32(const int n, float * y, const float * x) { + uint16_t t; + for (int i = 0; i < n; ++i) { + ggml_v3_fp16_t fp16 = GGML_V3_FP32_TO_FP16(x[i]); + memcpy(&t, &fp16, sizeof(uint16_t)); + y[i] = GGML_V3_FP16_TO_FP32(ggml_v3_table_gelu_quick_f16[t]); + } +} +#else +inline static void ggml_v3_vec_gelu_quick_f32(const int n, float * y, const float * x) { + for (int i = 0; i < n; ++i) { + y[i] = ggml_v3_gelu_quick_f32(x[i]); + } +} +#endif + +// Sigmoid Linear Unit (SiLU) function +inline static float ggml_v3_silu_f32(float x) { + return x/(1.0f + expf(-x)); +} + +//inline static void ggml_v3_vec_silu_f16(const int n, ggml_v3_fp16_t * y, const ggml_v3_fp16_t * x) { +// const uint16_t * i16 = (const uint16_t *) x; +// for (int i = 0; i < n; ++i) { +// y[i] = ggml_v3_table_silu_f16[i16[i]]; +// } +//} + +#ifdef GGML_V3_SILU_FP16 +inline static void ggml_v3_vec_silu_f32(const int n, float * y, const float * x) { + uint16_t t; + for (int i = 0; i < n; ++i) { + ggml_v3_fp16_t fp16 = GGML_V3_FP32_TO_FP16(x[i]); + memcpy(&t, &fp16, sizeof(uint16_t)); + y[i] = GGML_V3_FP16_TO_FP32(ggml_v3_table_silu_f16[t]); + } +} +#else +inline static void ggml_v3_vec_silu_f32(const int n, float * y, const float * x) { + for (int i = 0; i < n; ++i) { + y[i] = ggml_v3_silu_f32(x[i]); + } +} +#endif + +inline static float ggml_v3_silu_backward_f32(float x, float dy) { + const float s = 1.0f/(1.0f + expf(-x)); + return dy*s*(1.0f + x*(1.0f - s)); +} + +#ifdef GGML_V3_SILU_FP16 +inline static void ggml_v3_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) { + for (int i = 0; i < n; ++i) { + // we did not use x[i] to compute forward silu but its f16 equivalent + // take derivative at f16 of x[i]: + ggml_v3_fp16_t fp16 = GGML_V3_FP32_TO_FP16(x[i]); + float usedx = GGML_V3_FP16_TO_FP32(fp16); + dx[i] = ggml_v3_silu_backward_f32(usedx, dy[i]); + } +} +#else +inline static void ggml_v3_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) { + for (int i = 0; i < n; ++i) { + dx[i] = ggml_v3_silu_backward_f32(x[i], dy[i]); + } +} 
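// Note on the *_FP16 variants above: GELU, quick GELU and SiLU are evaluated by table lookup.
// The input is rounded to half precision, the 16 raw bits of that half are used as an index,
// and ggml_v3_init() (further down in this file) fills the 1 << 16 entry tables with the
// function value (stored as fp16) for every representable half input. Sketch of the lookup,
// assuming the tables have already been initialized by ggml_v3_init():
//
//   ggml_v3_fp16_t h = GGML_V3_FP32_TO_FP16(x);                      // quantize input to fp16
//   uint16_t idx;
//   memcpy(&idx, &h, sizeof(idx));                                   // reinterpret bits as index
//   float y = GGML_V3_FP16_TO_FP32(ggml_v3_table_silu_f16[idx]);     // precomputed silu(x)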
+#endif + +inline static void ggml_v3_vec_sum_f32(const int n, float * s, const float * x) { +#ifndef GGML_USE_ACCELERATE + ggml_v3_float sum = 0.0; + for (int i = 0; i < n; ++i) { + sum += (ggml_v3_float)x[i]; + } + *s = sum; +#else + vDSP_sve(x, 1, s, n); +#endif +} + +inline static void ggml_v3_vec_sum_f32_ggf(const int n, ggml_v3_float * s, const float * x) { + ggml_v3_float sum = 0.0; + for (int i = 0; i < n; ++i) { + sum += (ggml_v3_float)x[i]; + } + *s = sum; +} + +inline static void ggml_v3_vec_sum_f16_ggf(const int n, float * s, const ggml_v3_fp16_t * x) { + float sum = 0.0f; + for (int i = 0; i < n; ++i) { + sum += GGML_V3_FP16_TO_FP32(x[i]); + } + *s = sum; +} + +inline static void ggml_v3_vec_max_f32(const int n, float * s, const float * x) { +#ifndef GGML_USE_ACCELERATE + float max = -INFINITY; + for (int i = 0; i < n; ++i) { + max = MAX(max, x[i]); + } + *s = max; +#else + vDSP_maxv(x, 1, s, n); +#endif +} + +inline static void ggml_v3_vec_norm_inv_f32(const int n, float * s, const float * x) { + ggml_v3_vec_norm_f32(n, s, x); + *s = 1.f/(*s); +} + +inline static void ggml_v3_vec_argmax_f32(const int n, int * s, const float * x) { + float max = -INFINITY; + int idx = 0; + for (int i = 0; i < n; ++i) { + max = MAX(max, x[i]); + if (max == x[i]) { idx = i; } + } + *s = idx; +} + +// +// data types +// + +static const char * GGML_V3_OP_NAME[GGML_V3_OP_COUNT] = { + "NONE", + + "DUP", + "ADD", + "ADD1", + "ACC", + "SUB", + "MUL", + "DIV", + "SQR", + "SQRT", + "LOG", + "SUM", + "SUM_ROWS", + "MEAN", + "ARGMAX", + "REPEAT", + "REPEAT_BACK", + "CONCAT", + "SILU_BACK", + "NORM", + "RMS_NORM", + "RMS_NORM_BACK", + "GROUP_NORM", + + "MUL_MAT", + "MUL_MAT_ID", + "OUT_PROD", + + "SCALE", + "SET", + "CPY", + "CONT", + "RESHAPE", + "VIEW", + "PERMUTE", + "TRANSPOSE", + "GET_ROWS", + "GET_ROWS_BACK", + "DIAG", + "DIAG_MASK_INF", + "DIAG_MASK_ZERO", + "SOFT_MAX", + "SOFT_MAX_BACK", + "ROPE", + "ROPE_BACK", + "ALIBI", + "CLAMP", + "CONV_TRANSPOSE_1D", + "IM2COL", + "CONV_TRANSPOSE_2D", + "POOL_1D", + "POOL_2D", + "UPSCALE", + "PAD", + "ARGSORT", + "LEAKY_RELU", + + "FLASH_ATTN", + "FLASH_FF", + "FLASH_ATTN_BACK", + "WIN_PART", + "WIN_UNPART", + "GET_REL_POS", + "ADD_REL_POS", + + "UNARY", + + "MAP_UNARY", + "MAP_BINARY", + + "MAP_CUSTOM1_F32", + "MAP_CUSTOM2_F32", + "MAP_CUSTOM3_F32", + + "MAP_CUSTOM1", + "MAP_CUSTOM2", + "MAP_CUSTOM3", + + "CROSS_ENTROPY_LOSS", + "CROSS_ENTROPY_LOSS_BACK", +}; + +static_assert(GGML_V3_OP_COUNT == 72, "GGML_V3_OP_COUNT != 72"); + +static const char * GGML_V3_OP_SYMBOL[GGML_V3_OP_COUNT] = { + "none", + + "x", + "x+y", + "x+y", + "view(x,nb,offset)+=y->x", + "x-y", + "x*y", + "x/y", + "x^2", + "√x", + "log(x)", + "Σx", + "Σx_k", + "Σx/n", + "argmax(x)", + "repeat(x)", + "repeat_back(x)", + "concat(x, y)", + "silu_back(x)", + "norm(x)", + "rms_norm(x)", + "rms_norm_back(x)", + "group_norm(x)", + + "X*Y", + "X[i]*Y", + "X*Y", + + "x*v", + "y-\\>view(x)", + "x-\\>y", + "cont(x)", + "reshape(x)", + "view(x)", + "permute(x)", + "transpose(x)", + "get_rows(x)", + "get_rows_back(x)", + "diag(x)", + "diag_mask_inf(x)", + "diag_mask_zero(x)", + "soft_max(x)", + "soft_max_back(x)", + "rope(x)", + "rope_back(x)", + "alibi(x)", + "clamp(x)", + "conv_transpose_1d(x)", + "im2col(x)", + "conv_transpose_2d(x)", + "pool_1d(x)", + "pool_2d(x)", + "upscale(x)", + "pad(x)", + "argsort(x)", + "leaky_relu(x)", + + "flash_attn(x)", + "flash_ff(x)", + "flash_attn_back(x)", + "win_part(x)", + "win_unpart(x)", + "get_rel_pos(x)", + "add_rel_pos(x)", + + "unary(x)", + + "f(x)", + 
"f(x,y)", + + "custom_f32(x)", + "custom_f32(x,y)", + "custom_f32(x,y,z)", + + "custom(x)", + "custom(x,y)", + "custom(x,y,z)", + + "cross_entropy_loss(x,y)", + "cross_entropy_loss_back(x,y)", +}; + +static_assert(GGML_V3_OP_COUNT == 72, "GGML_V3_OP_COUNT != 72"); + +static_assert(GGML_V3_OP_POOL_COUNT == 2, "GGML_V3_OP_POOL_COUNT != 2"); + + +static const char * GGML_V3_UNARY_OP_NAME[GGML_V3_UNARY_OP_COUNT] = { + "ABS", + "SGN", + "NEG", + "STEP", + "TANH", + "ELU", + "RELU", + "GELU", + "GELU_QUICK", + "SILU", +}; + +static_assert(GGML_V3_UNARY_OP_COUNT == 10, "GGML_V3_UNARY_OP_COUNT != 10"); + + +static_assert(sizeof(struct ggml_v3_object)%GGML_V3_MEM_ALIGN == 0, "ggml_v3_object size must be a multiple of GGML_V3_MEM_ALIGN"); +static_assert(sizeof(struct ggml_v3_tensor)%GGML_V3_MEM_ALIGN == 0, "ggml_v3_tensor size must be a multiple of GGML_V3_MEM_ALIGN"); + +// WARN: +// Mis-configuration can lead to problem that's hard to reason about: +// * At best it crash or talks nosense. +// * At worst it talks slightly difference but hard to perceive. +// +// An op has to enable INIT or FINALIZE when any of it's branch needs that pass. +// Take care about compile options (e.g., GGML_USE_xxx). +static bool GGML_V3_OP_HAS_INIT [GGML_V3_OP_COUNT] = { 0 }; +static bool GGML_V3_OP_HAS_FINALIZE[GGML_V3_OP_COUNT] = { 0 }; + +static void ggml_v3_setup_op_has_task_pass(void) { + { // INIT + bool * p = GGML_V3_OP_HAS_INIT; + + p[GGML_V3_OP_ACC ] = true; + p[GGML_V3_OP_MUL_MAT ] = true; + p[GGML_V3_OP_MUL_MAT_ID ] = true; + p[GGML_V3_OP_OUT_PROD ] = true; + p[GGML_V3_OP_SET ] = true; + p[GGML_V3_OP_GET_ROWS_BACK ] = true; + p[GGML_V3_OP_DIAG_MASK_INF ] = true; + p[GGML_V3_OP_DIAG_MASK_ZERO ] = true; + p[GGML_V3_OP_CONV_TRANSPOSE_1D ] = true; + p[GGML_V3_OP_CONV_TRANSPOSE_2D ] = true; + p[GGML_V3_OP_FLASH_ATTN_BACK ] = true; + p[GGML_V3_OP_CROSS_ENTROPY_LOSS ] = true; + p[GGML_V3_OP_ADD_REL_POS ] = true; + } + + { // FINALIZE + bool * p = GGML_V3_OP_HAS_FINALIZE; + + p[GGML_V3_OP_CROSS_ENTROPY_LOSS ] = true; + } +} + +// +// ggml context +// + +struct ggml_v3_context { + size_t mem_size; + void * mem_buffer; + bool mem_buffer_owned; + bool no_alloc; + bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers + + int n_objects; + + struct ggml_v3_object * objects_begin; + struct ggml_v3_object * objects_end; + + struct ggml_v3_scratch scratch; + struct ggml_v3_scratch scratch_save; +}; + +struct ggml_v3_context_container { + bool used; + + struct ggml_v3_context context; +}; + +// +// NUMA support +// + +#define GGML_V3_NUMA_MAX_NODES 8 +#define GGML_V3_NUMA_MAX_CPUS 512 + +struct ggml_v3_numa_node { + uint32_t cpus[GGML_V3_NUMA_MAX_CPUS]; // hardware threads on this node + uint32_t n_cpus; +}; + +struct ggml_v3_numa_nodes { + struct ggml_v3_numa_node nodes[GGML_V3_NUMA_MAX_NODES]; + uint32_t n_nodes; + uint32_t total_cpus; // hardware threads on system +}; + +// +// ggml state +// + +struct ggml_v3_state { + struct ggml_v3_context_container contexts[GGML_V3_MAX_CONTEXTS]; + struct ggml_v3_numa_nodes numa; +}; + +// global state +static struct ggml_v3_state g_state; +static atomic_int g_state_barrier = 0; + +// barrier via spin lock +inline static void ggml_v3_critical_section_start(void) { + int processing = atomic_fetch_add(&g_state_barrier, 1); + + while (processing > 0) { + // wait for other threads to finish + atomic_fetch_sub(&g_state_barrier, 1); + sched_yield(); // TODO: reconsider this + processing = atomic_fetch_add(&g_state_barrier, 1); + } +} + +// TODO: make this 
somehow automatically executed +// some sort of "sentry" mechanism +inline static void ggml_v3_critical_section_end(void) { + atomic_fetch_sub(&g_state_barrier, 1); +} + +void ggml_v3_numa_init(void) { + if (g_state.numa.n_nodes > 0) { + fprintf(stderr, "ggml_v3_numa_init: NUMA already initialized\n"); + + return; + } + +#ifdef __linux__ + struct stat st; + char path[256]; + int rv; + + // enumerate nodes + while (g_state.numa.n_nodes < GGML_V3_NUMA_MAX_NODES) { + rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes); + GGML_V3_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); + if (stat(path, &st) != 0) { break; } + ++g_state.numa.n_nodes; + } + + // enumerate CPUs + while (g_state.numa.total_cpus < GGML_V3_NUMA_MAX_CPUS) { + rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus); + GGML_V3_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); + if (stat(path, &st) != 0) { break; } + ++g_state.numa.total_cpus; + } + + GGML_V3_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus); + + if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) { + g_state.numa.n_nodes = 0; + return; + } + + for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) { + struct ggml_v3_numa_node * node = &g_state.numa.nodes[n]; + GGML_V3_PRINT_DEBUG("CPUs on node %u:", n); + node->n_cpus = 0; + for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) { + rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c); + GGML_V3_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); + if (stat(path, &st) == 0) { + node->cpus[node->n_cpus++] = c; + GGML_V3_PRINT_DEBUG(" %u", c); + } + } + GGML_V3_PRINT_DEBUG("\n"); + } + + if (ggml_v3_is_numa()) { + FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r"); + if (fptr != NULL) { + char buf[42]; + if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) { + GGML_V3_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n"); + } + fclose(fptr); + } + } +#else + // TODO +#endif +} + +bool ggml_v3_is_numa(void) { + return g_state.numa.n_nodes > 1; +} + +//////////////////////////////////////////////////////////////////////////////// + +void ggml_v3_print_object(const struct ggml_v3_object * obj) { + GGML_V3_PRINT(" - ggml_v3_object: type = %d, offset = %zu, size = %zu, next = %p\n", + obj->type, obj->offs, obj->size, (const void *) obj->next); +} + +void ggml_v3_print_objects(const struct ggml_v3_context * ctx) { + struct ggml_v3_object * obj = ctx->objects_begin; + + GGML_V3_PRINT("%s: objects in context %p:\n", __func__, (const void *) ctx); + + while (obj != NULL) { + ggml_v3_print_object(obj); + obj = obj->next; + } + + GGML_V3_PRINT("%s: --- end ---\n", __func__); +} + +int64_t ggml_v3_nelements(const struct ggml_v3_tensor * tensor) { + static_assert(GGML_V3_MAX_DIMS == 4, "GGML_V3_MAX_DIMS is not 4 - update this function"); + + return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; +} + +int64_t ggml_v3_nrows(const struct ggml_v3_tensor * tensor) { + static_assert(GGML_V3_MAX_DIMS == 4, "GGML_V3_MAX_DIMS is not 4 - update this function"); + + return tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; +} + +size_t ggml_v3_nbytes(const struct ggml_v3_tensor * tensor) { + size_t nbytes; + size_t blck_size = ggml_v3_blck_size(tensor->type); + if (blck_size == 1) { + nbytes = ggml_v3_type_size(tensor->type); + for (int i = 0; i < GGML_V3_MAX_DIMS; ++i) { + nbytes += (tensor->ne[i] - 
1)*tensor->nb[i]; + } + } + else { + nbytes = tensor->ne[0]*tensor->nb[0]/blck_size; + for (int i = 1; i < GGML_V3_MAX_DIMS; ++i) { + nbytes += (tensor->ne[i] - 1)*tensor->nb[i]; + } + } + + return nbytes; +} + +size_t ggml_v3_nbytes_pad(const struct ggml_v3_tensor * tensor) { + return GGML_V3_PAD(ggml_v3_nbytes(tensor), GGML_V3_MEM_ALIGN); +} + +int ggml_v3_blck_size(enum ggml_v3_type type) { + return type_traits[type].blck_size; +} + +size_t ggml_v3_type_size(enum ggml_v3_type type) { + return type_traits[type].type_size; +} + +size_t ggml_v3_row_size(enum ggml_v3_type type, int64_t ne) { + assert(ne % ggml_v3_blck_size(type) == 0); + return ggml_v3_type_size(type)*ne/ggml_v3_blck_size(type); +} + +double ggml_v3_type_sizef(enum ggml_v3_type type) { + return ((double)(type_traits[type].type_size))/type_traits[type].blck_size; +} + +const char * ggml_v3_type_name(enum ggml_v3_type type) { + return type_traits[type].type_name; +} + +bool ggml_v3_is_quantized(enum ggml_v3_type type) { + return type_traits[type].is_quantized; +} + +const char * ggml_v3_op_name(enum ggml_v3_op op) { + return GGML_V3_OP_NAME[op]; +} + +const char * ggml_v3_op_symbol(enum ggml_v3_op op) { + return GGML_V3_OP_SYMBOL[op]; +} + +const char * ggml_v3_unary_op_name(enum ggml_v3_unary_op op) { + return GGML_V3_UNARY_OP_NAME[op]; +} + +const char * ggml_v3_op_desc(const struct ggml_v3_tensor * t) { + if (t->op == GGML_V3_OP_UNARY) { + enum ggml_v3_unary_op uop = ggml_v3_get_unary_op(t); + return ggml_v3_unary_op_name(uop); + } + else { + return ggml_v3_op_name(t->op); + } +} + +size_t ggml_v3_element_size(const struct ggml_v3_tensor * tensor) { + return ggml_v3_type_size(tensor->type); +} + +bool ggml_v3_is_scalar(const struct ggml_v3_tensor * tensor) { + static_assert(GGML_V3_MAX_DIMS == 4, "GGML_V3_MAX_DIMS is not 4 - update this function"); + + return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1; +} + +bool ggml_v3_is_vector(const struct ggml_v3_tensor * tensor) { + static_assert(GGML_V3_MAX_DIMS == 4, "GGML_V3_MAX_DIMS is not 4 - update this function"); + + return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1; +} + +bool ggml_v3_is_matrix(const struct ggml_v3_tensor * tensor) { + static_assert(GGML_V3_MAX_DIMS == 4, "GGML_V3_MAX_DIMS is not 4 - update this function"); + + return tensor->ne[2] == 1 && tensor->ne[3] == 1; +} + +bool ggml_v3_is_3d(const struct ggml_v3_tensor * tensor) { + return tensor->ne[3] == 1; +} + +int ggml_v3_n_dims(const struct ggml_v3_tensor * tensor) { + for (int i = GGML_V3_MAX_DIMS - 1; i >= 1; --i) { + if (tensor->ne[i] > 1) { + return i + 1; + } + } + return 1; +} + +static inline bool ggml_v3_can_mul_mat(const struct ggml_v3_tensor * t0, const struct ggml_v3_tensor * t1) { + static_assert(GGML_V3_MAX_DIMS == 4, "GGML_V3_MAX_DIMS is not 4 - update this function"); + + return (t0->ne[0] == t1->ne[0]) && + (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable + (t1->ne[3]%t0->ne[3] == 0); +} + +static inline bool ggml_v3_can_out_prod(const struct ggml_v3_tensor * t0, const struct ggml_v3_tensor * t1) { + static_assert(GGML_V3_MAX_DIMS == 4, "GGML_V3_MAX_DIMS is not 4 - update this function"); + + return (t0->ne[1] == t1->ne[1]) && + (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable + (t1->ne[3]%t0->ne[3] == 0); +} + +enum ggml_v3_type ggml_v3_ftype_to_ggml_v3_type(enum ggml_v3_ftype ftype) { + enum ggml_v3_type wtype = GGML_V3_TYPE_COUNT; + + switch (ftype) { + case GGML_V3_FTYPE_ALL_F32: wtype = GGML_V3_TYPE_F32; 
break; + case GGML_V3_FTYPE_MOSTLY_F16: wtype = GGML_V3_TYPE_F16; break; + case GGML_V3_FTYPE_MOSTLY_Q4_0: wtype = GGML_V3_TYPE_Q4_0; break; + case GGML_V3_FTYPE_MOSTLY_Q4_1: wtype = GGML_V3_TYPE_Q4_1; break; + case GGML_V3_FTYPE_MOSTLY_Q5_0: wtype = GGML_V3_TYPE_Q5_0; break; + case GGML_V3_FTYPE_MOSTLY_Q5_1: wtype = GGML_V3_TYPE_Q5_1; break; + case GGML_V3_FTYPE_MOSTLY_Q8_0: wtype = GGML_V3_TYPE_Q8_0; break; + case GGML_V3_FTYPE_MOSTLY_Q2_K: wtype = GGML_V3_TYPE_Q2_K; break; + case GGML_V3_FTYPE_MOSTLY_Q3_K: wtype = GGML_V3_TYPE_Q3_K; break; + case GGML_V3_FTYPE_MOSTLY_Q4_K: wtype = GGML_V3_TYPE_Q4_K; break; + case GGML_V3_FTYPE_MOSTLY_Q5_K: wtype = GGML_V3_TYPE_Q5_K; break; + case GGML_V3_FTYPE_MOSTLY_Q6_K: wtype = GGML_V3_TYPE_Q6_K; break; + case GGML_V3_FTYPE_MOSTLY_IQ2_XXS: wtype = GGML_V3_TYPE_IQ2_XXS; break; + case GGML_V3_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_V3_TYPE_IQ2_XS; break; + case GGML_V3_FTYPE_UNKNOWN: wtype = GGML_V3_TYPE_COUNT; break; + case GGML_V3_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_V3_TYPE_COUNT; break; + } + + GGML_V3_ASSERT(wtype != GGML_V3_TYPE_COUNT); + + return wtype; +} + +size_t ggml_v3_tensor_overhead(void) { + return GGML_V3_OBJECT_SIZE + GGML_V3_TENSOR_SIZE; +} + +bool ggml_v3_is_transposed(const struct ggml_v3_tensor * tensor) { + return tensor->nb[0] > tensor->nb[1]; +} + +bool ggml_v3_is_contiguous(const struct ggml_v3_tensor * tensor) { + static_assert(GGML_V3_MAX_DIMS == 4, "GGML_V3_MAX_DIMS is not 4 - update this function"); + + return + tensor->nb[0] == ggml_v3_type_size(tensor->type) && + tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_v3_blck_size(tensor->type) && + tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && + tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; +} + +static inline bool ggml_v3_is_contiguous_except_dim_1(const struct ggml_v3_tensor * tensor) { + static_assert(GGML_V3_MAX_DIMS == 4, "GGML_V3_MAX_DIMS is not 4 - update this function"); + + return + tensor->nb[0] == ggml_v3_type_size(tensor->type) && + tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && + tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; +} + +bool ggml_v3_is_permuted(const struct ggml_v3_tensor * tensor) { + static_assert(GGML_V3_MAX_DIMS == 4, "GGML_V3_MAX_DIMS is not 4 - update this function"); + + return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3]; +} + +static inline bool ggml_v3_is_padded_1d(const struct ggml_v3_tensor * tensor) { + static_assert(GGML_V3_MAX_DIMS == 4, "GGML_V3_MAX_DIMS is not 4 - update this function"); + + return + tensor->nb[0] == ggml_v3_type_size(tensor->type) && + tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && + tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; +} + +bool ggml_v3_are_same_shape(const struct ggml_v3_tensor * t0, const struct ggml_v3_tensor * t1) { + static_assert(GGML_V3_MAX_DIMS == 4, "GGML_V3_MAX_DIMS is not 4 - update this function"); + + return + (t0->ne[0] == t1->ne[0] ) && + (t0->ne[1] == t1->ne[1] ) && + (t0->ne[2] == t1->ne[2] ) && + (t0->ne[3] == t1->ne[3] ); +} + +// check if t1 can be represented as a repeatition of t0 +static inline bool ggml_v3_can_repeat(const struct ggml_v3_tensor * t0, const struct ggml_v3_tensor * t1) { + static_assert(GGML_V3_MAX_DIMS == 4, "GGML_V3_MAX_DIMS is not 4 - update this function"); + + return + (t1->ne[0]%t0->ne[0] == 0) && + (t1->ne[1]%t0->ne[1] == 0) && + (t1->ne[2]%t0->ne[2] == 0) && + (t1->ne[3]%t0->ne[3] == 0); +} + +static inline bool ggml_v3_can_repeat_rows(const struct ggml_v3_tensor * t0, const struct ggml_v3_tensor * t1) { + 
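    // Broadcast semantics used here: ggml_v3_can_repeat() accepts t1 when every dimension of
    // t1 is an integer multiple of the matching dimension of t0; can_repeat_rows() additionally
    // requires the row length ne[0] to match exactly. Illustrative example (hypothetical shapes):
    // t0 with ne = {4096, 1, 1, 1} can repeat over t1 with ne = {4096, 32, 1, 1}, and that pair
    // also satisfies can_repeat_rows() because ne[0] is equal on both tensors.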
static_assert(GGML_V3_MAX_DIMS == 4, "GGML_V3_MAX_DIMS is not 4 - update this function"); + + return (t0->ne[0] == t1->ne[0]) && ggml_v3_can_repeat(t0, t1); +} + +static inline int ggml_v3_up32(int n) { + return (n + 31) & ~31; +} + +//static inline int ggml_v3_up64(int n) { +// return (n + 63) & ~63; +//} + +static inline int ggml_v3_up(int n, int m) { + // assert m is a power of 2 + GGML_V3_ASSERT((m & (m - 1)) == 0); + return (n + m - 1) & ~(m - 1); +} + +// assert that pointer is aligned to GGML_V3_MEM_ALIGN +#define ggml_v3_assert_aligned(ptr) \ + GGML_V3_ASSERT(((uintptr_t) (ptr))%GGML_V3_MEM_ALIGN == 0) + +//////////////////////////////////////////////////////////////////////////////// + +struct ggml_v3_context * ggml_v3_init(struct ggml_v3_init_params params) { + // make this function thread safe + ggml_v3_critical_section_start(); + + static bool is_first_call = true; + + if (is_first_call) { + // initialize time system (required on Windows) + ggml_v3_time_init(); + + // initialize GELU, Quick GELU, SILU and EXP F32 tables + { + const uint64_t t_start = ggml_v3_time_us(); UNUSED(t_start); + + ggml_v3_fp16_t ii; + for (int i = 0; i < (1 << 16); ++i) { + uint16_t ui = i; + memcpy(&ii, &ui, sizeof(ii)); + const float f = ggml_v3_table_f32_f16[i] = GGML_V3_COMPUTE_FP16_TO_FP32(ii); + ggml_v3_table_gelu_f16[i] = GGML_V3_FP32_TO_FP16(ggml_v3_gelu_f32(f)); + ggml_v3_table_gelu_quick_f16[i] = GGML_V3_FP32_TO_FP16(ggml_v3_gelu_quick_f32(f)); + ggml_v3_table_silu_f16[i] = GGML_V3_FP32_TO_FP16(ggml_v3_silu_f32(f)); + ggml_v3_table_exp_f16[i] = GGML_V3_FP32_TO_FP16(expf(f)); + } + + const uint64_t t_end = ggml_v3_time_us(); UNUSED(t_end); + + GGML_V3_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); + } + + // initialize g_state + { + const uint64_t t_start = ggml_v3_time_us(); UNUSED(t_start); + + g_state = (struct ggml_v3_state) { + /*.contexts =*/ { { 0 } }, + /*.numa =*/ { + .n_nodes = 0, + .total_cpus = 0, + }, + }; + + for (int i = 0; i < GGML_V3_MAX_CONTEXTS; ++i) { + g_state.contexts[i].used = false; + } + + const uint64_t t_end = ggml_v3_time_us(); UNUSED(t_end); + + GGML_V3_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); + } + +#if defined(GGML_USE_CUBLAS) + ggml_v3_init_cublas(); +#elif defined(GGML_USE_CLBLAST) + ggml_v3_cl_init(); +#endif + + ggml_v3_setup_op_has_task_pass(); + + is_first_call = false; + } + + // find non-used context in g_state + struct ggml_v3_context * ctx = NULL; + + for (int i = 0; i < GGML_V3_MAX_CONTEXTS; i++) { + if (!g_state.contexts[i].used) { + g_state.contexts[i].used = true; + ctx = &g_state.contexts[i].context; + + GGML_V3_PRINT_DEBUG("%s: found unused context %d\n", __func__, i); + break; + } + } + + if (ctx == NULL) { + GGML_V3_PRINT_DEBUG("%s: no unused context found\n", __func__); + + ggml_v3_critical_section_end(); + + return NULL; + } + + // allow to call ggml_v3_init with 0 size + if (params.mem_size == 0) { + params.mem_size = GGML_V3_MEM_ALIGN; + } + + const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_V3_PAD(params.mem_size, GGML_V3_MEM_ALIGN); + + *ctx = (struct ggml_v3_context) { + /*.mem_size =*/ mem_size, + /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_V3_ALIGNED_MALLOC(mem_size), + /*.mem_buffer_owned =*/ params.mem_buffer ? 
false : true, + /*.no_alloc =*/ params.no_alloc, + /*.no_alloc_save =*/ params.no_alloc, + /*.n_objects =*/ 0, + /*.objects_begin =*/ NULL, + /*.objects_end =*/ NULL, + /*.scratch =*/ { 0, 0, NULL, }, + /*.scratch_save =*/ { 0, 0, NULL, }, + }; + + GGML_V3_ASSERT(ctx->mem_buffer != NULL); + + ggml_v3_assert_aligned(ctx->mem_buffer); + + GGML_V3_PRINT_DEBUG("%s: context initialized\n", __func__); + + ggml_v3_critical_section_end(); + + return ctx; +} + +void ggml_v3_free(struct ggml_v3_context * ctx) { + // make this function thread safe + ggml_v3_critical_section_start(); + + bool found = false; + + for (int i = 0; i < GGML_V3_MAX_CONTEXTS; i++) { + if (&g_state.contexts[i].context == ctx) { + g_state.contexts[i].used = false; + + GGML_V3_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n", + __func__, i, ggml_v3_used_mem(ctx)); + + if (ctx->mem_buffer_owned) { + GGML_V3_ALIGNED_FREE(ctx->mem_buffer); + } + + found = true; + break; + } + } + + if (!found) { + GGML_V3_PRINT_DEBUG("%s: context not found\n", __func__); + } + + ggml_v3_critical_section_end(); +} + +size_t ggml_v3_used_mem(const struct ggml_v3_context * ctx) { + return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size; +} + +size_t ggml_v3_set_scratch(struct ggml_v3_context * ctx, struct ggml_v3_scratch scratch) { + const size_t result = ctx->scratch.data ? ctx->scratch.offs : 0; + + ctx->scratch = scratch; + + return result; +} + +bool ggml_v3_get_no_alloc(struct ggml_v3_context * ctx) { + return ctx->no_alloc; +} + +void ggml_v3_set_no_alloc(struct ggml_v3_context * ctx, bool no_alloc) { + ctx->no_alloc = no_alloc; +} + +void * ggml_v3_get_mem_buffer(const struct ggml_v3_context * ctx) { + return ctx->mem_buffer; +} + +size_t ggml_v3_get_mem_size(const struct ggml_v3_context * ctx) { + return ctx->mem_size; +} + +size_t ggml_v3_get_max_tensor_size(const struct ggml_v3_context * ctx) { + size_t max_size = 0; + + for (struct ggml_v3_tensor * tensor = ggml_v3_get_first_tensor(ctx); tensor != NULL; tensor = ggml_v3_get_next_tensor(ctx, tensor)) { + max_size = MAX(max_size, ggml_v3_nbytes(tensor)); + } + + return max_size; +} + +// IMPORTANT: +// when creating "opt" tensors, always save and load the scratch buffer +// this is an error prone process, but it is necessary to support inplace +// operators when using scratch buffers +// TODO: implement a better way +static void ggml_v3_scratch_save(struct ggml_v3_context * ctx) { + // this is needed to allow opt tensors to store their data + // TODO: again, need to find a better way + ctx->no_alloc_save = ctx->no_alloc; + ctx->no_alloc = false; + + ctx->scratch_save = ctx->scratch; + ctx->scratch.data = NULL; +} + +static void ggml_v3_scratch_load(struct ggml_v3_context * ctx) { + ctx->no_alloc = ctx->no_alloc_save; + + ctx->scratch = ctx->scratch_save; +} + +//////////////////////////////////////////////////////////////////////////////// + +static struct ggml_v3_object * ggml_v3_new_object(struct ggml_v3_context * ctx, enum ggml_v3_object_type type, size_t size) { + // always insert objects at the end of the context's memory pool + struct ggml_v3_object * obj_cur = ctx->objects_end; + + const size_t cur_offs = obj_cur == NULL ? 0 : obj_cur->offs; + const size_t cur_size = obj_cur == NULL ? 
0 : obj_cur->size; + const size_t cur_end = cur_offs + cur_size; + + // align to GGML_V3_MEM_ALIGN + size_t size_needed = GGML_V3_PAD(size, GGML_V3_MEM_ALIGN); + + char * const mem_buffer = ctx->mem_buffer; + struct ggml_v3_object * const obj_new = (struct ggml_v3_object *)(mem_buffer + cur_end); + + if (cur_end + size_needed + GGML_V3_OBJECT_SIZE > ctx->mem_size) { + GGML_V3_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", + __func__, cur_end + size_needed, ctx->mem_size); + assert(false); + return NULL; + } + + *obj_new = (struct ggml_v3_object) { + .offs = cur_end + GGML_V3_OBJECT_SIZE, + .size = size_needed, + .next = NULL, + .type = type, + }; + + ggml_v3_assert_aligned(mem_buffer + obj_new->offs); + + if (obj_cur != NULL) { + obj_cur->next = obj_new; + } else { + // this is the first object in this context + ctx->objects_begin = obj_new; + } + + ctx->objects_end = obj_new; + + //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size); + + return obj_new; +} + +static struct ggml_v3_tensor * ggml_v3_new_tensor_impl( + struct ggml_v3_context * ctx, + enum ggml_v3_type type, + int n_dims, + const int64_t * ne, + struct ggml_v3_tensor * view_src, + size_t view_offs) { + + assert(n_dims >= 1 && n_dims <= GGML_V3_MAX_DIMS); + + // find the base tensor and absolute offset + if (view_src != NULL && view_src->view_src != NULL) { + view_offs += view_src->view_offs; + view_src = view_src->view_src; + } + + size_t data_size = ggml_v3_row_size(type, ne[0]); + for (int i = 1; i < n_dims; i++) { + data_size *= ne[i]; + } + + GGML_V3_ASSERT(view_src == NULL || data_size + view_offs <= ggml_v3_nbytes(view_src)); + + void * data = view_src != NULL ? view_src->data : NULL; + if (data != NULL) { + data = (char *) data + view_offs; + } + + size_t obj_alloc_size = 0; + + if (view_src == NULL && !ctx->no_alloc) { + if (ctx->scratch.data != NULL) { + // allocate tensor data in the scratch buffer + if (ctx->scratch.offs + data_size > ctx->scratch.size) { + GGML_V3_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n", + __func__, ctx->scratch.offs + data_size, ctx->scratch.size); + assert(false); + return NULL; + } + + data = (char * const) ctx->scratch.data + ctx->scratch.offs; + + ctx->scratch.offs += data_size; + } else { + // allocate tensor data in the context's memory pool + obj_alloc_size = data_size; + } + } + + struct ggml_v3_object * const obj_new = ggml_v3_new_object(ctx, GGML_V3_OBJECT_TENSOR, GGML_V3_TENSOR_SIZE + obj_alloc_size); + + // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here + + struct ggml_v3_tensor * const result = (struct ggml_v3_tensor *)((char *)ctx->mem_buffer + obj_new->offs); + + *result = (struct ggml_v3_tensor) { + /*.type =*/ type, + /*.backend =*/ GGML_V3_BACKEND_CPU, + /*.buffer =*/ NULL, + /*.ne =*/ { 1, 1, 1, 1 }, + /*.nb =*/ { 0, 0, 0, 0 }, + /*.op =*/ GGML_V3_OP_NONE, + /*.op_params =*/ { 0 }, + /*.is_param =*/ false, + /*.grad =*/ NULL, + /*.src =*/ { NULL }, + /*.perf_runs =*/ 0, + /*.perf_cycles =*/ 0, + /*.perf_time_us =*/ 0, + /*.view_src =*/ view_src, + /*.view_offs =*/ view_offs, + /*.data =*/ obj_alloc_size > 0 ? 
(void *)(result + 1) : data, + /*.name =*/ { 0 }, + /*.extra =*/ NULL, + /*.padding =*/ { 0 }, + }; + + // TODO: this should not be needed as long as we don't rely on aligned SIMD loads + //ggml_v3_assert_aligned(result->data); + + for (int i = 0; i < n_dims; i++) { + result->ne[i] = ne[i]; + } + + result->nb[0] = ggml_v3_type_size(type); + result->nb[1] = result->nb[0]*(result->ne[0]/ggml_v3_blck_size(type)); + for (int i = 2; i < GGML_V3_MAX_DIMS; i++) { + result->nb[i] = result->nb[i - 1]*result->ne[i - 1]; + } + + ctx->n_objects++; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_new_tensor( + struct ggml_v3_context * ctx, + enum ggml_v3_type type, + int n_dims, + const int64_t * ne) { + return ggml_v3_new_tensor_impl(ctx, type, n_dims, ne, NULL, 0); +} + +struct ggml_v3_tensor * ggml_v3_new_tensor_1d( + struct ggml_v3_context * ctx, + enum ggml_v3_type type, + int64_t ne0) { + return ggml_v3_new_tensor(ctx, type, 1, &ne0); +} + +struct ggml_v3_tensor * ggml_v3_new_tensor_2d( + struct ggml_v3_context * ctx, + enum ggml_v3_type type, + int64_t ne0, + int64_t ne1) { + const int64_t ne[2] = { ne0, ne1 }; + return ggml_v3_new_tensor(ctx, type, 2, ne); +} + +struct ggml_v3_tensor * ggml_v3_new_tensor_3d( + struct ggml_v3_context * ctx, + enum ggml_v3_type type, + int64_t ne0, + int64_t ne1, + int64_t ne2) { + const int64_t ne[3] = { ne0, ne1, ne2 }; + return ggml_v3_new_tensor(ctx, type, 3, ne); +} + +struct ggml_v3_tensor * ggml_v3_new_tensor_4d( + struct ggml_v3_context * ctx, + enum ggml_v3_type type, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3) { + const int64_t ne[4] = { ne0, ne1, ne2, ne3 }; + return ggml_v3_new_tensor(ctx, type, 4, ne); +} + +struct ggml_v3_tensor * ggml_v3_new_i32(struct ggml_v3_context * ctx, int32_t value) { + ggml_v3_scratch_save(ctx); + + struct ggml_v3_tensor * result = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_I32, 1); + + ggml_v3_scratch_load(ctx); + + ggml_v3_set_i32(result, value); + + return result; +} + +struct ggml_v3_tensor * ggml_v3_new_f32(struct ggml_v3_context * ctx, float value) { + ggml_v3_scratch_save(ctx); + + struct ggml_v3_tensor * result = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, 1); + + ggml_v3_scratch_load(ctx); + + ggml_v3_set_f32(result, value); + + return result; +} + +struct ggml_v3_tensor * ggml_v3_dup_tensor(struct ggml_v3_context * ctx, const struct ggml_v3_tensor * src) { + return ggml_v3_new_tensor(ctx, src->type, GGML_V3_MAX_DIMS, src->ne); +} + +static void ggml_v3_set_op_params(struct ggml_v3_tensor * tensor, const void * params, size_t params_size) { + GGML_V3_ASSERT(tensor != NULL); // silence -Warray-bounds warnings + assert(params_size <= GGML_V3_MAX_OP_PARAMS); + memcpy(tensor->op_params, params, params_size); +} + +static int32_t ggml_v3_get_op_params_i32(const struct ggml_v3_tensor * tensor, uint32_t i) { + assert(i < GGML_V3_MAX_OP_PARAMS / sizeof(int32_t)); + return ((const int32_t *)(tensor->op_params))[i]; +} + +static void ggml_v3_set_op_params_i32(struct ggml_v3_tensor * tensor, uint32_t i, int32_t value) { + assert(i < GGML_V3_MAX_OP_PARAMS / sizeof(int32_t)); + ((int32_t *)(tensor->op_params))[i] = value; +} + +struct ggml_v3_tensor * ggml_v3_set_zero(struct ggml_v3_tensor * tensor) { + memset(tensor->data, 0, ggml_v3_nbytes(tensor)); + return tensor; +} + +struct ggml_v3_tensor * ggml_v3_set_i32 (struct ggml_v3_tensor * tensor, int32_t value) { + const int n = ggml_v3_nrows(tensor); + const int nc = tensor->ne[0]; + const size_t n1 = tensor->nb[1]; + + char * const data = tensor->data; + 
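+ // write `value` into every row: each case asserts the expected element stride nb[0], then fills nc elements per row, with rows n1 bytes apart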
+ switch (tensor->type) { + case GGML_V3_TYPE_I8: + { + assert(tensor->nb[0] == sizeof(int8_t)); + for (int i = 0; i < n; i++) { + ggml_v3_vec_set_i8(nc, (int8_t *)(data + i*n1), value); + } + } break; + case GGML_V3_TYPE_I16: + { + assert(tensor->nb[0] == sizeof(int16_t)); + for (int i = 0; i < n; i++) { + ggml_v3_vec_set_i16(nc, (int16_t *)(data + i*n1), value); + } + } break; + case GGML_V3_TYPE_I32: + { + assert(tensor->nb[0] == sizeof(int32_t)); + for (int i = 0; i < n; i++) { + ggml_v3_vec_set_i32(nc, (int32_t *)(data + i*n1), value); + } + } break; + case GGML_V3_TYPE_F16: + { + assert(tensor->nb[0] == sizeof(ggml_v3_fp16_t)); + for (int i = 0; i < n; i++) { + ggml_v3_vec_set_f16(nc, (ggml_v3_fp16_t *)(data + i*n1), GGML_V3_FP32_TO_FP16(value)); + } + } break; + case GGML_V3_TYPE_F32: + { + assert(tensor->nb[0] == sizeof(float)); + for (int i = 0; i < n; i++) { + ggml_v3_vec_set_f32(nc, (float *)(data + i*n1), value); + } + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } + + return tensor; +} + +struct ggml_v3_tensor * ggml_v3_set_f32(struct ggml_v3_tensor * tensor, float value) { + const int n = ggml_v3_nrows(tensor); + const int nc = tensor->ne[0]; + const size_t n1 = tensor->nb[1]; + + char * const data = tensor->data; + + switch (tensor->type) { + case GGML_V3_TYPE_I8: + { + assert(tensor->nb[0] == sizeof(int8_t)); + for (int i = 0; i < n; i++) { + ggml_v3_vec_set_i8(nc, (int8_t *)(data + i*n1), value); + } + } break; + case GGML_V3_TYPE_I16: + { + assert(tensor->nb[0] == sizeof(int16_t)); + for (int i = 0; i < n; i++) { + ggml_v3_vec_set_i16(nc, (int16_t *)(data + i*n1), value); + } + } break; + case GGML_V3_TYPE_I32: + { + assert(tensor->nb[0] == sizeof(int32_t)); + for (int i = 0; i < n; i++) { + ggml_v3_vec_set_i32(nc, (int32_t *)(data + i*n1), value); + } + } break; + case GGML_V3_TYPE_F16: + { + assert(tensor->nb[0] == sizeof(ggml_v3_fp16_t)); + for (int i = 0; i < n; i++) { + ggml_v3_vec_set_f16(nc, (ggml_v3_fp16_t *)(data + i*n1), GGML_V3_FP32_TO_FP16(value)); + } + } break; + case GGML_V3_TYPE_F32: + { + assert(tensor->nb[0] == sizeof(float)); + for (int i = 0; i < n; i++) { + ggml_v3_vec_set_f32(nc, (float *)(data + i*n1), value); + } + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } + + return tensor; +} + +void ggml_v3_unravel_index(const struct ggml_v3_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3) { + const int64_t ne2 = tensor->ne[2]; + const int64_t ne1 = tensor->ne[1]; + const int64_t ne0 = tensor->ne[0]; + + const int64_t i3_ = (i/(ne2*ne1*ne0)); + const int64_t i2_ = (i - i3_*ne2*ne1*ne0)/(ne1*ne0); + const int64_t i1_ = (i - i3_*ne2*ne1*ne0 - i2_*ne1*ne0)/ne0; + const int64_t i0_ = (i - i3_*ne2*ne1*ne0 - i2_*ne1*ne0 - i1_*ne0); + + if (i0) { + * i0 = i0_; + } + if (i1) { + * i1 = i1_; + } + if (i2) { + * i2 = i2_; + } + if (i3) { + * i3 = i3_; + } +} + +int32_t ggml_v3_get_i32_1d(const struct ggml_v3_tensor * tensor, int i) { + if (!ggml_v3_is_contiguous(tensor)) { + int64_t id[4] = { 0, 0, 0, 0 }; + ggml_v3_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); + return ggml_v3_get_i32_nd(tensor, id[0], id[1], id[2], id[3]); + } + switch (tensor->type) { + case GGML_V3_TYPE_I8: + { + GGML_V3_ASSERT(tensor->nb[0] == sizeof(int8_t)); + return ((int8_t *)(tensor->data))[i]; + } + case GGML_V3_TYPE_I16: + { + GGML_V3_ASSERT(tensor->nb[0] == sizeof(int16_t)); + return ((int16_t *)(tensor->data))[i]; + } + case GGML_V3_TYPE_I32: + { + GGML_V3_ASSERT(tensor->nb[0] == sizeof(int32_t)); + return 
((int32_t *)(tensor->data))[i]; + } + case GGML_V3_TYPE_F16: + { + GGML_V3_ASSERT(tensor->nb[0] == sizeof(ggml_v3_fp16_t)); + return GGML_V3_FP16_TO_FP32(((ggml_v3_fp16_t *)(tensor->data))[i]); + } + case GGML_V3_TYPE_F32: + { + GGML_V3_ASSERT(tensor->nb[0] == sizeof(float)); + return ((float *)(tensor->data))[i]; + } + default: + { + GGML_V3_ASSERT(false); + } + } + + return 0.0f; +} + +void ggml_v3_set_i32_1d(const struct ggml_v3_tensor * tensor, int i, int32_t value) { + if (!ggml_v3_is_contiguous(tensor)) { + int64_t id[4] = { 0, 0, 0, 0 }; + ggml_v3_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); + ggml_v3_set_i32_nd(tensor, id[0], id[1], id[2], id[3], value); + return; + } + switch (tensor->type) { + case GGML_V3_TYPE_I8: + { + GGML_V3_ASSERT(tensor->nb[0] == sizeof(int8_t)); + ((int8_t *)(tensor->data))[i] = value; + } break; + case GGML_V3_TYPE_I16: + { + GGML_V3_ASSERT(tensor->nb[0] == sizeof(int16_t)); + ((int16_t *)(tensor->data))[i] = value; + } break; + case GGML_V3_TYPE_I32: + { + GGML_V3_ASSERT(tensor->nb[0] == sizeof(int32_t)); + ((int32_t *)(tensor->data))[i] = value; + } break; + case GGML_V3_TYPE_F16: + { + GGML_V3_ASSERT(tensor->nb[0] == sizeof(ggml_v3_fp16_t)); + ((ggml_v3_fp16_t *)(tensor->data))[i] = GGML_V3_FP32_TO_FP16(value); + } break; + case GGML_V3_TYPE_F32: + { + GGML_V3_ASSERT(tensor->nb[0] == sizeof(float)); + ((float *)(tensor->data))[i] = value; + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +int32_t ggml_v3_get_i32_nd(const struct ggml_v3_tensor * tensor, int i0, int i1, int i2, int i3) { + void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; + switch (tensor->type) { + case GGML_V3_TYPE_I8: + return ((int8_t *) data)[0]; + case GGML_V3_TYPE_I16: + return ((int16_t *) data)[0]; + case GGML_V3_TYPE_I32: + return ((int32_t *) data)[0]; + case GGML_V3_TYPE_F16: + return GGML_V3_FP16_TO_FP32(((ggml_v3_fp16_t *) data)[0]); + case GGML_V3_TYPE_F32: + return ((float *) data)[0]; + default: + GGML_V3_ASSERT(false); + } + + return 0.0f; +} + +void ggml_v3_set_i32_nd(const struct ggml_v3_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value) { + void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; + switch (tensor->type) { + case GGML_V3_TYPE_I8: + { + ((int8_t *)(data))[0] = value; + } break; + case GGML_V3_TYPE_I16: + { + ((int16_t *)(data))[0] = value; + } break; + case GGML_V3_TYPE_I32: + { + ((int32_t *)(data))[0] = value; + } break; + case GGML_V3_TYPE_F16: + { + ((ggml_v3_fp16_t *)(data))[0] = GGML_V3_FP32_TO_FP16(value); + } break; + case GGML_V3_TYPE_F32: + { + ((float *)(data))[0] = value; + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +float ggml_v3_get_f32_1d(const struct ggml_v3_tensor * tensor, int i) { + if (!ggml_v3_is_contiguous(tensor)) { + int64_t id[4] = { 0, 0, 0, 0 }; + ggml_v3_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); + return ggml_v3_get_f32_nd(tensor, id[0], id[1], id[2], id[3]); + } + switch (tensor->type) { + case GGML_V3_TYPE_I8: + { + GGML_V3_ASSERT(tensor->nb[0] == sizeof(int8_t)); + return ((int8_t *)(tensor->data))[i]; + } + case GGML_V3_TYPE_I16: + { + GGML_V3_ASSERT(tensor->nb[0] == sizeof(int16_t)); + return ((int16_t *)(tensor->data))[i]; + } + case GGML_V3_TYPE_I32: + { + GGML_V3_ASSERT(tensor->nb[0] == sizeof(int32_t)); + return ((int32_t *)(tensor->data))[i]; + } + case GGML_V3_TYPE_F16: + { + GGML_V3_ASSERT(tensor->nb[0] == 
sizeof(ggml_v3_fp16_t)); + return GGML_V3_FP16_TO_FP32(((ggml_v3_fp16_t *)(tensor->data))[i]); + } + case GGML_V3_TYPE_F32: + { + GGML_V3_ASSERT(tensor->nb[0] == sizeof(float)); + return ((float *)(tensor->data))[i]; + } + default: + { + GGML_V3_ASSERT(false); + } + } + + return 0.0f; +} + +void ggml_v3_set_f32_1d(const struct ggml_v3_tensor * tensor, int i, float value) { + if (!ggml_v3_is_contiguous(tensor)) { + int64_t id[4] = { 0, 0, 0, 0 }; + ggml_v3_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); + ggml_v3_set_f32_nd(tensor, id[0], id[1], id[2], id[3], value); + return; + } + switch (tensor->type) { + case GGML_V3_TYPE_I8: + { + GGML_V3_ASSERT(tensor->nb[0] == sizeof(int8_t)); + ((int8_t *)(tensor->data))[i] = value; + } break; + case GGML_V3_TYPE_I16: + { + GGML_V3_ASSERT(tensor->nb[0] == sizeof(int16_t)); + ((int16_t *)(tensor->data))[i] = value; + } break; + case GGML_V3_TYPE_I32: + { + GGML_V3_ASSERT(tensor->nb[0] == sizeof(int32_t)); + ((int32_t *)(tensor->data))[i] = value; + } break; + case GGML_V3_TYPE_F16: + { + GGML_V3_ASSERT(tensor->nb[0] == sizeof(ggml_v3_fp16_t)); + ((ggml_v3_fp16_t *)(tensor->data))[i] = GGML_V3_FP32_TO_FP16(value); + } break; + case GGML_V3_TYPE_F32: + { + GGML_V3_ASSERT(tensor->nb[0] == sizeof(float)); + ((float *)(tensor->data))[i] = value; + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +float ggml_v3_get_f32_nd(const struct ggml_v3_tensor * tensor, int i0, int i1, int i2, int i3) { + void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; + switch (tensor->type) { + case GGML_V3_TYPE_I8: + return ((int8_t *) data)[0]; + case GGML_V3_TYPE_I16: + return ((int16_t *) data)[0]; + case GGML_V3_TYPE_I32: + return ((int32_t *) data)[0]; + case GGML_V3_TYPE_F16: + return GGML_V3_FP16_TO_FP32(((ggml_v3_fp16_t *) data)[0]); + case GGML_V3_TYPE_F32: + return ((float *) data)[0]; + default: + GGML_V3_ASSERT(false); + } + + return 0.0f; +} + +void ggml_v3_set_f32_nd(const struct ggml_v3_tensor * tensor, int i0, int i1, int i2, int i3, float value) { + void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; + switch (tensor->type) { + case GGML_V3_TYPE_I8: + { + ((int8_t *)(data))[0] = value; + } break; + case GGML_V3_TYPE_I16: + { + ((int16_t *)(data))[0] = value; + } break; + case GGML_V3_TYPE_I32: + { + ((int32_t *)(data))[0] = value; + } break; + case GGML_V3_TYPE_F16: + { + ((ggml_v3_fp16_t *)(data))[0] = GGML_V3_FP32_TO_FP16(value); + } break; + case GGML_V3_TYPE_F32: + { + ((float *)(data))[0] = value; + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +void * ggml_v3_get_data(const struct ggml_v3_tensor * tensor) { + return tensor->data; +} + +float * ggml_v3_get_data_f32(const struct ggml_v3_tensor * tensor) { + assert(tensor->type == GGML_V3_TYPE_F32); + return (float *)(tensor->data); +} + +enum ggml_v3_unary_op ggml_v3_get_unary_op(const struct ggml_v3_tensor * tensor) { + GGML_V3_ASSERT(tensor->op == GGML_V3_OP_UNARY); + return (enum ggml_v3_unary_op) ggml_v3_get_op_params_i32(tensor, 0); +} + +const char * ggml_v3_get_name(const struct ggml_v3_tensor * tensor) { + return tensor->name; +} + +struct ggml_v3_tensor * ggml_v3_set_name(struct ggml_v3_tensor * tensor, const char * name) { + strncpy(tensor->name, name, sizeof(tensor->name)); + tensor->name[sizeof(tensor->name) - 1] = '\0'; + return tensor; +} + +struct ggml_v3_tensor * ggml_v3_format_name(struct ggml_v3_tensor * tensor, 
const char * fmt, ...) { + va_list args; + va_start(args, fmt); + vsnprintf(tensor->name, sizeof(tensor->name), fmt, args); + va_end(args); + return tensor; +} + +struct ggml_v3_tensor * ggml_v3_view_tensor( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * src) { + struct ggml_v3_tensor * result = ggml_v3_new_tensor_impl(ctx, src->type, GGML_V3_MAX_DIMS, src->ne, src, 0); + ggml_v3_format_name(result, "%s (view)", src->name); + + for (int i = 0; i < GGML_V3_MAX_DIMS; i++) { + result->nb[i] = src->nb[i]; + } + + return result; +} + +struct ggml_v3_tensor * ggml_v3_get_first_tensor(const struct ggml_v3_context * ctx) { + struct ggml_v3_object * obj = ctx->objects_begin; + + char * const mem_buffer = ctx->mem_buffer; + + while (obj != NULL) { + if (obj->type == GGML_V3_OBJECT_TENSOR) { + return (struct ggml_v3_tensor *)(mem_buffer + obj->offs); + } + + obj = obj->next; + } + + return NULL; +} + +struct ggml_v3_tensor * ggml_v3_get_next_tensor(const struct ggml_v3_context * ctx, struct ggml_v3_tensor * tensor) { + struct ggml_v3_object * obj = (struct ggml_v3_object *) ((char *)tensor - GGML_V3_OBJECT_SIZE); + obj = obj->next; + + char * const mem_buffer = ctx->mem_buffer; + + while (obj != NULL) { + if (obj->type == GGML_V3_OBJECT_TENSOR) { + return (struct ggml_v3_tensor *)(mem_buffer + obj->offs); + } + + obj = obj->next; + } + + return NULL; +} + +struct ggml_v3_tensor * ggml_v3_get_tensor(struct ggml_v3_context * ctx, const char * name) { + struct ggml_v3_object * obj = ctx->objects_begin; + + char * const mem_buffer = ctx->mem_buffer; + + while (obj != NULL) { + if (obj->type == GGML_V3_OBJECT_TENSOR) { + struct ggml_v3_tensor * cur = (struct ggml_v3_tensor *)(mem_buffer + obj->offs); + if (strcmp(cur->name, name) == 0) { + return cur; + } + } + + obj = obj->next; + } + + return NULL; +} + +//////////////////////////////////////////////////////////////////////////////// + +// ggml_v3_dup + +static struct ggml_v3_tensor * ggml_v3_dup_impl( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + + result->op = GGML_V3_OP_DUP; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_dup( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_dup_impl(ctx, a, false); +} + +struct ggml_v3_tensor * ggml_v3_dup_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_dup_impl(ctx, a, true); +} + +// ggml_v3_add + +static struct ggml_v3_tensor * ggml_v3_add_impl( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + bool inplace) { + GGML_V3_ASSERT(ggml_v3_can_repeat(b, a)); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + // TODO: support backward pass for broadcasting + GGML_V3_ASSERT(ggml_v3_are_same_shape(a, b)); + is_node = true; + } + + struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + + result->op = GGML_V3_OP_ADD; + result->grad = is_node ? 
ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_add( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b) { + return ggml_v3_add_impl(ctx, a, b, false); +} + +struct ggml_v3_tensor * ggml_v3_add_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b) { + return ggml_v3_add_impl(ctx, a, b, true); +} + +// ggml_v3_add_cast + +static struct ggml_v3_tensor * ggml_v3_add_cast_impl( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + enum ggml_v3_type type) { + // TODO: support less-strict constraint + // GGML_V3_ASSERT(ggml_v3_can_repeat(b, a)); + GGML_V3_ASSERT(ggml_v3_can_repeat_rows(b, a)); + GGML_V3_ASSERT(ggml_v3_is_quantized(a->type) || a->type == GGML_V3_TYPE_F16); // currently only supported for quantized input and f16 + + bool is_node = false; + + if (a->grad || b->grad) { + // TODO: support backward pass for broadcasting + GGML_V3_ASSERT(ggml_v3_are_same_shape(a, b)); + is_node = true; + } + + struct ggml_v3_tensor * result = ggml_v3_new_tensor(ctx, type, GGML_V3_MAX_DIMS, a->ne); + + result->op = GGML_V3_OP_ADD; + result->grad = is_node ? ggml_v3_new_tensor(ctx, GGML_V3_TYPE_F32, GGML_V3_MAX_DIMS, a->ne) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_add_cast( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + enum ggml_v3_type type) { + return ggml_v3_add_cast_impl(ctx, a, b, type); +} + +// ggml_v3_add1 + +static struct ggml_v3_tensor * ggml_v3_add1_impl( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + bool inplace) { + GGML_V3_ASSERT(ggml_v3_is_scalar(b)); + GGML_V3_ASSERT(ggml_v3_is_padded_1d(a)); + + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + + result->op = GGML_V3_OP_ADD1; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_add1( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b) { + return ggml_v3_add1_impl(ctx, a, b, false); +} + +struct ggml_v3_tensor * ggml_v3_add1_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b) { + return ggml_v3_add1_impl(ctx, a, b, true); +} + +// ggml_v3_acc + +static struct ggml_v3_tensor * ggml_v3_acc_impl( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset, + bool inplace) { + GGML_V3_ASSERT(ggml_v3_nelements(b) <= ggml_v3_nelements(a)); + GGML_V3_ASSERT(ggml_v3_is_contiguous(a)); + GGML_V3_ASSERT(a->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT(b->type == GGML_V3_TYPE_F32); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + + int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 }; + ggml_v3_set_op_params(result, params, sizeof(params)); + + result->op = GGML_V3_OP_ACC; + result->grad = is_node ? 
ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_acc( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset) { + return ggml_v3_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false); +} + +struct ggml_v3_tensor * ggml_v3_acc_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset) { + return ggml_v3_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, true); +} + +// ggml_v3_sub + +static struct ggml_v3_tensor * ggml_v3_sub_impl( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + bool inplace) { + GGML_V3_ASSERT(ggml_v3_are_same_shape(a, b)); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + + result->op = GGML_V3_OP_SUB; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_sub( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b) { + return ggml_v3_sub_impl(ctx, a, b, false); +} + +struct ggml_v3_tensor * ggml_v3_sub_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b) { + return ggml_v3_sub_impl(ctx, a, b, true); +} + +// ggml_v3_mul + +static struct ggml_v3_tensor * ggml_v3_mul_impl( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + bool inplace) { + GGML_V3_ASSERT(ggml_v3_can_repeat(b, a)); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + // TODO: support backward pass for broadcasting + GGML_V3_ASSERT(ggml_v3_are_same_shape(a, b)); + is_node = true; + } + + if (inplace) { + GGML_V3_ASSERT(!is_node); + } + + struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + + result->op = GGML_V3_OP_MUL; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_mul( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b) { + return ggml_v3_mul_impl(ctx, a, b, false); +} + +struct ggml_v3_tensor * ggml_v3_mul_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b) { + return ggml_v3_mul_impl(ctx, a, b, true); +} + +// ggml_v3_div + +static struct ggml_v3_tensor * ggml_v3_div_impl( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + bool inplace) { + GGML_V3_ASSERT(ggml_v3_can_repeat(b, a)); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + if (inplace) { + GGML_V3_ASSERT(!is_node); + } + + struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + + result->op = GGML_V3_OP_DIV; + result->grad = is_node ? 
ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_div( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b) { + return ggml_v3_div_impl(ctx, a, b, false); +} + +struct ggml_v3_tensor * ggml_v3_div_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b) { + return ggml_v3_div_impl(ctx, a, b, true); +} + +// ggml_v3_sqr + +static struct ggml_v3_tensor * ggml_v3_sqr_impl( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + + result->op = GGML_V3_OP_SQR; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_sqr( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_sqr_impl(ctx, a, false); +} + +struct ggml_v3_tensor * ggml_v3_sqr_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_sqr_impl(ctx, a, true); +} + +// ggml_v3_sqrt + +static struct ggml_v3_tensor * ggml_v3_sqrt_impl( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + + result->op = GGML_V3_OP_SQRT; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_sqrt( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_sqrt_impl(ctx, a, false); +} + +struct ggml_v3_tensor * ggml_v3_sqrt_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_sqrt_impl(ctx, a, true); +} + +// ggml_v3_log + +static struct ggml_v3_tensor * ggml_v3_log_impl( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + + result->op = GGML_V3_OP_LOG; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_log( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_log_impl(ctx, a, false); +} + +struct ggml_v3_tensor * ggml_v3_log_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_log_impl(ctx, a, true); +} + +// ggml_v3_sum + +struct ggml_v3_tensor * ggml_v3_sum( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_v3_tensor * result = ggml_v3_new_tensor_1d(ctx, a->type, 1); + + result->op = GGML_V3_OP_SUM; + result->grad = is_node ? 
ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_v3_sum_rows + +struct ggml_v3_tensor * ggml_v3_sum_rows( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + int64_t ne[GGML_V3_MAX_DIMS] = { 1 }; + for (int i = 1; i < GGML_V3_MAX_DIMS; ++i) { + ne[i] = a->ne[i]; + } + + struct ggml_v3_tensor * result = ggml_v3_new_tensor(ctx, a->type, GGML_V3_MAX_DIMS, ne); + + result->op = GGML_V3_OP_SUM_ROWS; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_v3_mean + +struct ggml_v3_tensor * ggml_v3_mean( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + bool is_node = false; + + if (a->grad) { + GGML_V3_ASSERT(false); // TODO: implement + is_node = true; + } + + int64_t ne[4] = { 1, a->ne[1], a->ne[2], a->ne[3] }; + struct ggml_v3_tensor * result = ggml_v3_new_tensor(ctx, GGML_V3_TYPE_F32, 4, ne); + + result->op = GGML_V3_OP_MEAN; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_v3_argmax + +struct ggml_v3_tensor * ggml_v3_argmax( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + GGML_V3_ASSERT(ggml_v3_is_matrix(a)); + bool is_node = false; + + if (a->grad) { + GGML_V3_ASSERT(false); + is_node = true; + } + + struct ggml_v3_tensor * result = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_I32, a->ne[1]); + + result->op = GGML_V3_OP_ARGMAX; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_v3_repeat + +struct ggml_v3_tensor * ggml_v3_repeat( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b) { + GGML_V3_ASSERT(ggml_v3_can_repeat(a, b)); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_v3_tensor * result = ggml_v3_new_tensor(ctx, a->type, GGML_V3_MAX_DIMS, b->ne); + + result->op = GGML_V3_OP_REPEAT; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_v3_repeat_back + +struct ggml_v3_tensor * ggml_v3_repeat_back( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b) { + GGML_V3_ASSERT(ggml_v3_can_repeat(b, a)); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + if (ggml_v3_are_same_shape(a, b) && !is_node) { + return a; + } + + struct ggml_v3_tensor * result = ggml_v3_new_tensor(ctx, a->type, GGML_V3_MAX_DIMS, b->ne); + + result->op = GGML_V3_OP_REPEAT_BACK; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_v3_concat + +struct ggml_v3_tensor * ggml_v3_concat( + struct ggml_v3_context* ctx, + struct ggml_v3_tensor* a, + struct ggml_v3_tensor* b) { + GGML_V3_ASSERT(a->ne[0] == b->ne[0] && a->ne[1] == b->ne[1] && a->ne[3] == b->ne[3]); + + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + struct ggml_v3_tensor * result = ggml_v3_new_tensor_4d(ctx, a->type, a->ne[0], a->ne[1], a->ne[2] + b->ne[2], a->ne[3]); + + result->op = GGML_V3_OP_CONCAT; + result->grad = is_node ? 
ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_v3_abs + +struct ggml_v3_tensor * ggml_v3_abs( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_unary(ctx, a, GGML_V3_UNARY_OP_ABS); +} + +struct ggml_v3_tensor * ggml_v3_abs_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_unary_inplace(ctx, a, GGML_V3_UNARY_OP_ABS); +} + +// ggml_v3_sgn + +struct ggml_v3_tensor * ggml_v3_sgn( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_unary(ctx, a, GGML_V3_UNARY_OP_SGN); +} + +struct ggml_v3_tensor * ggml_v3_sgn_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_unary_inplace(ctx, a, GGML_V3_UNARY_OP_SGN); +} + +// ggml_v3_neg + +struct ggml_v3_tensor * ggml_v3_neg( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_unary(ctx, a, GGML_V3_UNARY_OP_NEG); +} + +struct ggml_v3_tensor * ggml_v3_neg_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_unary_inplace(ctx, a, GGML_V3_UNARY_OP_NEG); +} + +// ggml_v3_step + +struct ggml_v3_tensor * ggml_v3_step( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_unary(ctx, a, GGML_V3_UNARY_OP_STEP); +} + +struct ggml_v3_tensor * ggml_v3_step_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_unary_inplace(ctx, a, GGML_V3_UNARY_OP_STEP); +} + +// ggml_v3_tanh + +struct ggml_v3_tensor * ggml_v3_tanh( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_unary(ctx, a, GGML_V3_UNARY_OP_TANH); +} + +struct ggml_v3_tensor * ggml_v3_tanh_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_unary_inplace(ctx, a, GGML_V3_UNARY_OP_TANH); +} + +// ggml_v3_elu + +struct ggml_v3_tensor * ggml_v3_elu( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_unary(ctx, a, GGML_V3_UNARY_OP_ELU); +} + +struct ggml_v3_tensor * ggml_v3_elu_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_unary_inplace(ctx, a, GGML_V3_UNARY_OP_ELU); +} + +// ggml_v3_relu + +struct ggml_v3_tensor * ggml_v3_relu( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_unary(ctx, a, GGML_V3_UNARY_OP_RELU); +} + +struct ggml_v3_tensor * ggml_v3_relu_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_unary_inplace(ctx, a, GGML_V3_UNARY_OP_RELU); +} + +// ggml_v3_leaky_relu + +struct ggml_v3_tensor * ggml_v3_leaky_relu( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, float negative_slope, bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + ggml_v3_set_op_params(result, &negative_slope, sizeof(negative_slope)); + + result->op = GGML_V3_OP_LEAKY_RELU; + result->grad = is_node ? 
ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_v3_gelu + +struct ggml_v3_tensor * ggml_v3_gelu( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_unary(ctx, a, GGML_V3_UNARY_OP_GELU); +} + +struct ggml_v3_tensor * ggml_v3_gelu_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_unary_inplace(ctx, a, GGML_V3_UNARY_OP_GELU); +} + +// ggml_v3_gelu_quick + +struct ggml_v3_tensor * ggml_v3_gelu_quick( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_unary(ctx, a, GGML_V3_UNARY_OP_GELU_QUICK); +} + +struct ggml_v3_tensor * ggml_v3_gelu_quick_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_unary_inplace(ctx, a, GGML_V3_UNARY_OP_GELU_QUICK); +} + +// ggml_v3_silu + +struct ggml_v3_tensor * ggml_v3_silu( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_unary(ctx, a, GGML_V3_UNARY_OP_SILU); +} + +struct ggml_v3_tensor * ggml_v3_silu_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_unary_inplace(ctx, a, GGML_V3_UNARY_OP_SILU); +} + +// ggml_v3_silu_back + +struct ggml_v3_tensor * ggml_v3_silu_back( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b) { + bool is_node = false; + + if (a->grad || b->grad) { + // TODO: implement backward + is_node = true; + } + + struct ggml_v3_tensor * result = ggml_v3_dup_tensor(ctx, a); + + result->op = GGML_V3_OP_SILU_BACK; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_v3_norm + +static struct ggml_v3_tensor * ggml_v3_norm_impl( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + float eps, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + GGML_V3_ASSERT(false); // TODO: implement backward + is_node = true; + } + + struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + + ggml_v3_set_op_params(result, &eps, sizeof(eps)); + + result->op = GGML_V3_OP_NORM; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_norm( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + float eps) { + return ggml_v3_norm_impl(ctx, a, eps, false); +} + +struct ggml_v3_tensor * ggml_v3_norm_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + float eps) { + return ggml_v3_norm_impl(ctx, a, eps, true); +} + +// ggml_v3_rms_norm + +static struct ggml_v3_tensor * ggml_v3_rms_norm_impl( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + float eps, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + + ggml_v3_set_op_params(result, &eps, sizeof(eps)); + + result->op = GGML_V3_OP_RMS_NORM; + result->grad = is_node ? 
ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_rms_norm( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + float eps) { + return ggml_v3_rms_norm_impl(ctx, a, eps, false); +} + +struct ggml_v3_tensor * ggml_v3_rms_norm_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + float eps) { + return ggml_v3_rms_norm_impl(ctx, a, eps, true); +} + +// ggml_v3_rms_norm_back + +struct ggml_v3_tensor * ggml_v3_rms_norm_back( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + float eps) { + bool is_node = false; + + if (a->grad) { + // TODO: implement backward + is_node = true; + } + + struct ggml_v3_tensor * result = ggml_v3_dup_tensor(ctx, a); + + ggml_v3_set_op_params(result, &eps, sizeof(eps)); + + result->op = GGML_V3_OP_RMS_NORM_BACK; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_v3_group_norm + +static struct ggml_v3_tensor * ggml_v3_group_norm_impl( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int n_groups, + bool inplace) { + + bool is_node = false; + if (!inplace && (a->grad)) { + GGML_V3_ASSERT(false); // TODO: implement backward + is_node = true; + } + + struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + + result->op_params[0] = n_groups; + + result->op = GGML_V3_OP_GROUP_NORM; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_group_norm( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int n_groups) { + return ggml_v3_group_norm_impl(ctx, a, n_groups, false); +} + +struct ggml_v3_tensor * ggml_v3_group_norm_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int n_groups) { + return ggml_v3_group_norm_impl(ctx, a, n_groups, true); +} + +// ggml_v3_mul_mat + +struct ggml_v3_tensor * ggml_v3_mul_mat( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b) { + GGML_V3_ASSERT(ggml_v3_can_mul_mat(a, b)); + GGML_V3_ASSERT(!ggml_v3_is_transposed(a)); + + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] }; + struct ggml_v3_tensor * result = ggml_v3_new_tensor(ctx, GGML_V3_TYPE_F32, 4, ne); + + result->op = GGML_V3_OP_MUL_MAT; + result->grad = is_node ? 
ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +void ggml_v3_mul_mat_set_prec( + struct ggml_v3_tensor * a, + enum ggml_v3_prec prec) { + const int32_t prec_i32 = (int32_t) prec; + + ggml_v3_set_op_params_i32(a, 0, prec_i32); +} + +// ggml_v3_mul_mat_id + +struct ggml_v3_tensor * ggml_v3_mul_mat_id( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * const as[], + int n_as, + struct ggml_v3_tensor * ids, + int id, + struct ggml_v3_tensor * b) { + + GGML_V3_ASSERT(ids->type == GGML_V3_TYPE_I32); + GGML_V3_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); + GGML_V3_ASSERT(ids->ne[1] == b->ne[1]); + GGML_V3_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]); + GGML_V3_ASSERT(n_as > 0 && n_as <= GGML_V3_MAX_SRC - 2); + GGML_V3_ASSERT(id >= 0 && id < ids->ne[0]); + + bool is_node = false; + + if (as[0]->grad || b->grad) { + is_node = true; + } + + const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] }; + struct ggml_v3_tensor * result = ggml_v3_new_tensor(ctx, GGML_V3_TYPE_F32, 4, ne); + + ggml_v3_set_op_params_i32(result, 0, id); + ggml_v3_set_op_params_i32(result, 1, n_as); + + result->op = GGML_V3_OP_MUL_MAT_ID; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = ids; + result->src[1] = b; + + for (int i = 0; i < n_as; i++) { + struct ggml_v3_tensor * a = as[i]; + GGML_V3_ASSERT(ggml_v3_are_same_shape(as[0], a)); + GGML_V3_ASSERT(ggml_v3_can_mul_mat(a, b)); + GGML_V3_ASSERT(!ggml_v3_is_transposed(a)); + result->src[i + 2] = a; + } + + return result; +} + +// ggml_v3_out_prod + +struct ggml_v3_tensor * ggml_v3_out_prod( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b) { + GGML_V3_ASSERT(ggml_v3_can_out_prod(a, b)); + GGML_V3_ASSERT(!ggml_v3_is_transposed(a)); + + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + // a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3] + const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] }; + struct ggml_v3_tensor * result = ggml_v3_new_tensor(ctx, GGML_V3_TYPE_F32, 4, ne); + + result->op = GGML_V3_OP_OUT_PROD; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_v3_scale + +static struct ggml_v3_tensor * ggml_v3_scale_impl( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + float s, + bool inplace) { + GGML_V3_ASSERT(ggml_v3_is_padded_1d(a)); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + + ggml_v3_set_op_params(result, &s, sizeof(s)); + + result->op = GGML_V3_OP_SCALE; + result->grad = is_node ? 
ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_scale( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + float s) { + return ggml_v3_scale_impl(ctx, a, s, false); +} + +struct ggml_v3_tensor * ggml_v3_scale_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + float s) { + return ggml_v3_scale_impl(ctx, a, s, true); +} + +// ggml_v3_set + +static struct ggml_v3_tensor * ggml_v3_set_impl( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset, + bool inplace) { + GGML_V3_ASSERT(ggml_v3_nelements(a) >= ggml_v3_nelements(b)); + + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + // make a view of the destination + struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + + int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 }; + ggml_v3_set_op_params(result, params, sizeof(params)); + + result->op = GGML_V3_OP_SET; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_set( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset) { + return ggml_v3_set_impl(ctx, a, b, nb1, nb2, nb3, offset, false); +} + +struct ggml_v3_tensor * ggml_v3_set_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset) { + return ggml_v3_set_impl(ctx, a, b, nb1, nb2, nb3, offset, true); +} + +struct ggml_v3_tensor * ggml_v3_set_1d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + size_t offset) { + return ggml_v3_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, false); +} + +struct ggml_v3_tensor * ggml_v3_set_1d_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + size_t offset) { + return ggml_v3_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, true); +} + +struct ggml_v3_tensor * ggml_v3_set_2d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + size_t nb1, + size_t offset) { + return ggml_v3_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false); +} + +struct ggml_v3_tensor * ggml_v3_set_2d_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + size_t nb1, + size_t offset) { + return ggml_v3_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, true); +} + +// ggml_v3_cpy + +static struct ggml_v3_tensor * ggml_v3_cpy_impl( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b) { + GGML_V3_ASSERT(ggml_v3_nelements(a) == ggml_v3_nelements(b)); + + bool is_node = false; + + if (a->grad || b->grad) { + // inplace is false and either one have a grad + is_node = true; + } + + // make a view of the destination + struct ggml_v3_tensor * result = ggml_v3_view_tensor(ctx, b); + if (strlen(b->name) > 0) { + ggml_v3_format_name(result, "%s (copy of %s)", b->name, a->name); + } else { + ggml_v3_format_name(result, "%s (copy)", a->name); + } + + result->op = GGML_V3_OP_CPY; + result->grad = is_node ? 
ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_cpy( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b) { + return ggml_v3_cpy_impl(ctx, a, b); +} + +// ggml_v3_cont + +static struct ggml_v3_tensor * ggml_v3_cont_impl( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_v3_tensor * result = ggml_v3_dup_tensor(ctx, a); + ggml_v3_format_name(result, "%s (cont)", a->name); + + result->op = GGML_V3_OP_CONT; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_cont( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_cont_impl(ctx, a); +} + +// make contiguous, with new shape +GGML_V3_API struct ggml_v3_tensor * ggml_v3_cont_1d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int64_t ne0) { + return ggml_v3_cont_4d(ctx, a, ne0, 1, 1, 1); +} + +GGML_V3_API struct ggml_v3_tensor * ggml_v3_cont_2d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int64_t ne0, + int64_t ne1) { + return ggml_v3_cont_4d(ctx, a, ne0, ne1, 1, 1); +} + +GGML_V3_API struct ggml_v3_tensor * ggml_v3_cont_3d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2) { + return ggml_v3_cont_4d(ctx, a, ne0, ne1, ne2, 1); +} + +struct ggml_v3_tensor * ggml_v3_cont_4d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3) { + GGML_V3_ASSERT(ggml_v3_nelements(a) == (ne0*ne1*ne2*ne3)); + + bool is_node = false; + + struct ggml_v3_tensor * result = ggml_v3_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3); + ggml_v3_format_name(result, "%s (cont)", a->name); + + result->op = GGML_V3_OP_CONT; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_v3_reshape + +struct ggml_v3_tensor * ggml_v3_reshape( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b) { + GGML_V3_ASSERT(ggml_v3_is_contiguous(a)); + // as only the shape of b is relevant, and not its memory layout, b is allowed to be non contiguous. + GGML_V3_ASSERT(ggml_v3_nelements(a) == ggml_v3_nelements(b)); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + if (b->grad) { + // gradient propagation is not supported + //GGML_V3_ASSERT(false); + } + + struct ggml_v3_tensor * result = ggml_v3_new_tensor_impl(ctx, a->type, GGML_V3_MAX_DIMS, b->ne, a, 0); + ggml_v3_format_name(result, "%s (reshaped)", a->name); + + result->op = GGML_V3_OP_RESHAPE; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_reshape_1d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int64_t ne0) { + GGML_V3_ASSERT(ggml_v3_is_contiguous(a)); + GGML_V3_ASSERT(ggml_v3_nelements(a) == ne0); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + const int64_t ne[1] = { ne0 }; + struct ggml_v3_tensor * result = ggml_v3_new_tensor_impl(ctx, a->type, 1, ne, a, 0); + ggml_v3_format_name(result, "%s (reshaped)", a->name); + + result->op = GGML_V3_OP_RESHAPE; + result->grad = is_node ? 
ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_reshape_2d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int64_t ne0, + int64_t ne1) { + GGML_V3_ASSERT(ggml_v3_is_contiguous(a)); + GGML_V3_ASSERT(ggml_v3_nelements(a) == ne0*ne1); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + const int64_t ne[2] = { ne0, ne1 }; + struct ggml_v3_tensor * result = ggml_v3_new_tensor_impl(ctx, a->type, 2, ne, a, 0); + ggml_v3_format_name(result, "%s (reshaped)", a->name); + + result->op = GGML_V3_OP_RESHAPE; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_reshape_3d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2) { + GGML_V3_ASSERT(ggml_v3_is_contiguous(a)); + GGML_V3_ASSERT(ggml_v3_nelements(a) == ne0*ne1*ne2); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + const int64_t ne[3] = { ne0, ne1, ne2 }; + struct ggml_v3_tensor * result = ggml_v3_new_tensor_impl(ctx, a->type, 3, ne, a, 0); + ggml_v3_format_name(result, "%s (reshaped)", a->name); + + result->op = GGML_V3_OP_RESHAPE; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_reshape_4d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3) { + GGML_V3_ASSERT(ggml_v3_is_contiguous(a)); + GGML_V3_ASSERT(ggml_v3_nelements(a) == ne0*ne1*ne2*ne3); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + const int64_t ne[4] = { ne0, ne1, ne2, ne3 }; + struct ggml_v3_tensor * result = ggml_v3_new_tensor_impl(ctx, a->type, 4, ne, a, 0); + ggml_v3_format_name(result, "%s (reshaped)", a->name); + + result->op = GGML_V3_OP_RESHAPE; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +static struct ggml_v3_tensor * ggml_v3_view_impl( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int n_dims, + const int64_t * ne, + size_t offset) { + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_v3_tensor * result = ggml_v3_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset); + ggml_v3_format_name(result, "%s (view)", a->name); + + ggml_v3_set_op_params(result, &offset, sizeof(offset)); + + result->op = GGML_V3_OP_VIEW; + result->grad = is_node ? 
ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_v3_view_1d + +struct ggml_v3_tensor * ggml_v3_view_1d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int64_t ne0, + size_t offset) { + + struct ggml_v3_tensor * result = ggml_v3_view_impl(ctx, a, 1, &ne0, offset); + + return result; +} + +// ggml_v3_view_2d + +struct ggml_v3_tensor * ggml_v3_view_2d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int64_t ne0, + int64_t ne1, + size_t nb1, + size_t offset) { + + const int64_t ne[2] = { ne0, ne1 }; + + struct ggml_v3_tensor * result = ggml_v3_view_impl(ctx, a, 2, ne, offset); + + result->nb[1] = nb1; + result->nb[2] = result->nb[1]*ne1; + result->nb[3] = result->nb[2]; + + return result; +} + +// ggml_v3_view_3d + +struct ggml_v3_tensor * ggml_v3_view_3d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + size_t nb1, + size_t nb2, + size_t offset) { + + const int64_t ne[3] = { ne0, ne1, ne2 }; + + struct ggml_v3_tensor * result = ggml_v3_view_impl(ctx, a, 3, ne, offset); + + result->nb[1] = nb1; + result->nb[2] = nb2; + result->nb[3] = result->nb[2]*ne2; + + return result; +} + +// ggml_v3_view_4d + +struct ggml_v3_tensor * ggml_v3_view_4d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset) { + + const int64_t ne[4] = { ne0, ne1, ne2, ne3 }; + + struct ggml_v3_tensor * result = ggml_v3_view_impl(ctx, a, 4, ne, offset); + + result->nb[1] = nb1; + result->nb[2] = nb2; + result->nb[3] = nb3; + + return result; +} + +// ggml_v3_permute + +struct ggml_v3_tensor * ggml_v3_permute( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int axis0, + int axis1, + int axis2, + int axis3) { + GGML_V3_ASSERT(axis0 >= 0 && axis0 < GGML_V3_MAX_DIMS); + GGML_V3_ASSERT(axis1 >= 0 && axis1 < GGML_V3_MAX_DIMS); + GGML_V3_ASSERT(axis2 >= 0 && axis2 < GGML_V3_MAX_DIMS); + GGML_V3_ASSERT(axis3 >= 0 && axis3 < GGML_V3_MAX_DIMS); + + GGML_V3_ASSERT(axis0 != axis1); + GGML_V3_ASSERT(axis0 != axis2); + GGML_V3_ASSERT(axis0 != axis3); + GGML_V3_ASSERT(axis1 != axis2); + GGML_V3_ASSERT(axis1 != axis3); + GGML_V3_ASSERT(axis2 != axis3); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_v3_tensor * result = ggml_v3_view_tensor(ctx, a); + ggml_v3_format_name(result, "%s (permuted)", a->name); + + int ne[GGML_V3_MAX_DIMS]; + int nb[GGML_V3_MAX_DIMS]; + + ne[axis0] = a->ne[0]; + ne[axis1] = a->ne[1]; + ne[axis2] = a->ne[2]; + ne[axis3] = a->ne[3]; + + nb[axis0] = a->nb[0]; + nb[axis1] = a->nb[1]; + nb[axis2] = a->nb[2]; + nb[axis3] = a->nb[3]; + + result->ne[0] = ne[0]; + result->ne[1] = ne[1]; + result->ne[2] = ne[2]; + result->ne[3] = ne[3]; + + result->nb[0] = nb[0]; + result->nb[1] = nb[1]; + result->nb[2] = nb[2]; + result->nb[3] = nb[3]; + + result->op = GGML_V3_OP_PERMUTE; + result->grad = is_node ? 
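+/* Worked example of the view/permute stride bookkeeping above (a sketch; the
+   tensor sizes are assumptions, not taken from any caller in this patch).
+   For a contiguous F32 tensor t with ne = {8, 4, 1, 1}, so nb = {4, 32, 128, 128}:
+
+       // 4-wide, 2-row window starting one row in (byte offset t->nb[1])
+       struct ggml_v3_tensor * v = ggml_v3_view_2d(ctx, t, 4, 2, t->nb[1], t->nb[1]);
+       // v->ne = {4, 2, 1, 1}, v->nb[1] = t->nb[1]; the data is shared, not copied
+
+       // swap the first two axes without touching the data
+       struct ggml_v3_tensor * p = ggml_v3_permute(ctx, t, 1, 0, 2, 3);
+       // p->ne = {4, 8, 1, 1}, p->nb = {32, 4, 128, 128}
+
+   ggml_v3_permute only exchanges the ne/nb entries of a view; the elements are
+   physically reordered only when a later op such as ggml_v3_cont materializes it. */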
ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + int32_t params[] = { axis0, axis1, axis2, axis3 }; + ggml_v3_set_op_params(result, params, sizeof(params)); + + return result; +} + +// ggml_v3_transpose + +struct ggml_v3_tensor * ggml_v3_transpose( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_v3_tensor * result = ggml_v3_view_tensor(ctx, a); + ggml_v3_format_name(result, "%s (transposed)", a->name); + + result->ne[0] = a->ne[1]; + result->ne[1] = a->ne[0]; + + result->nb[0] = a->nb[1]; + result->nb[1] = a->nb[0]; + + result->op = GGML_V3_OP_TRANSPOSE; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_v3_get_rows + +struct ggml_v3_tensor * ggml_v3_get_rows( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b) { + GGML_V3_ASSERT(a->ne[2] == b->ne[1]); + GGML_V3_ASSERT(b->ne[3] == 1); + GGML_V3_ASSERT(b->type == GGML_V3_TYPE_I32); + + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + // TODO: implement non F32 return + enum ggml_v3_type type = GGML_V3_TYPE_F32; + if (a->type == GGML_V3_TYPE_I32) { + type = a->type; + } + struct ggml_v3_tensor * result = ggml_v3_new_tensor_4d(ctx, type, a->ne[0], b->ne[0], b->ne[1], b->ne[2]); + + result->op = GGML_V3_OP_GET_ROWS; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_v3_get_rows_back + +struct ggml_v3_tensor * ggml_v3_get_rows_back( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + struct ggml_v3_tensor * c) { + GGML_V3_ASSERT(ggml_v3_is_matrix(a) && ggml_v3_is_vector(b) && b->type == GGML_V3_TYPE_I32); + GGML_V3_ASSERT(ggml_v3_is_matrix(c) && (a->ne[0] == c->ne[0])); + + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + // TODO: implement non F32 return + //struct ggml_v3_tensor * result = ggml_v3_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]); + struct ggml_v3_tensor * result = ggml_v3_new_tensor_2d(ctx, GGML_V3_TYPE_F32, c->ne[0], c->ne[1]); + + result->op = GGML_V3_OP_GET_ROWS_BACK; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_v3_diag + +struct ggml_v3_tensor * ggml_v3_diag( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + GGML_V3_ASSERT(a->ne[1] == 1); + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] }; + struct ggml_v3_tensor * result = ggml_v3_new_tensor(ctx, a->type, 4, ne); + + result->op = GGML_V3_OP_DIAG; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_v3_diag_mask_inf + +static struct ggml_v3_tensor * ggml_v3_diag_mask_inf_impl( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int n_past, + bool inplace) { + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + + int32_t params[] = { n_past }; + ggml_v3_set_op_params(result, params, sizeof(params)); + + result->op = GGML_V3_OP_DIAG_MASK_INF; + result->grad = is_node ? 
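+/* Usage sketch for ggml_v3_get_rows above (tensor names are assumptions): the
+   typical case is an embedding lookup, with tok_embd->ne = {n_embd, n_vocab, 1, 1}
+   and tokens an I32 vector holding n_tokens ids:
+
+       struct ggml_v3_tensor * cur = ggml_v3_get_rows(ctx, tok_embd, tokens);
+       // cur->ne = {n_embd, n_tokens, 1, 1}; result is F32 unless a is I32
+
+   The asserts above also permit a 3-D a with a matching b->ne[1], i.e. batched
+   per-row selection. */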
ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_diag_mask_inf( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int n_past) { + return ggml_v3_diag_mask_inf_impl(ctx, a, n_past, false); +} + +struct ggml_v3_tensor * ggml_v3_diag_mask_inf_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int n_past) { + return ggml_v3_diag_mask_inf_impl(ctx, a, n_past, true); +} + +// ggml_v3_diag_mask_zero + +static struct ggml_v3_tensor * ggml_v3_diag_mask_zero_impl( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int n_past, + bool inplace) { + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + + int32_t params[] = { n_past }; + ggml_v3_set_op_params(result, params, sizeof(params)); + + result->op = GGML_V3_OP_DIAG_MASK_ZERO; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_diag_mask_zero( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int n_past) { + return ggml_v3_diag_mask_zero_impl(ctx, a, n_past, false); +} + +struct ggml_v3_tensor * ggml_v3_diag_mask_zero_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int n_past) { + return ggml_v3_diag_mask_zero_impl(ctx, a, n_past, true); +} + +// ggml_v3_soft_max + +static struct ggml_v3_tensor * ggml_v3_soft_max_impl( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * mask, + float scale, + bool inplace) { + GGML_V3_ASSERT(ggml_v3_is_contiguous(a)); + if (mask) { + GGML_V3_ASSERT(ggml_v3_is_contiguous(mask)); + GGML_V3_ASSERT(mask->ne[2] == 1); + GGML_V3_ASSERT(mask->ne[3] == 1); + GGML_V3_ASSERT(ggml_v3_can_repeat_rows(mask, a)); + } + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + + float params[] = { scale }; + ggml_v3_set_op_params(result, params, sizeof(params)); + + result->op = GGML_V3_OP_SOFT_MAX; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = mask; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_soft_max( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_soft_max_impl(ctx, a, NULL, 1.0f, false); +} + +struct ggml_v3_tensor * ggml_v3_soft_max_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a) { + return ggml_v3_soft_max_impl(ctx, a, NULL, 1.0f, true); +} + +struct ggml_v3_tensor * ggml_v3_soft_max_ext( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * mask, + float scale) { + return ggml_v3_soft_max_impl(ctx, a, mask, scale, false); +} + +// ggml_v3_soft_max_back + +static struct ggml_v3_tensor * ggml_v3_soft_max_back_impl( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + bool inplace) { + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; // TODO : implement backward pass + } + + struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + + result->op = GGML_V3_OP_SOFT_MAX_BACK; + result->grad = is_node ? 
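+/* Usage sketch for the masking/softmax ops above (shapes and names are assumed,
+   not from this patch): for causal attention scores kq with ne = {n_kv, n_q, n_head, 1},
+
+       kq = ggml_v3_diag_mask_inf_inplace(ctx, kq, n_past);  // hide future positions
+       kq = ggml_v3_soft_max_inplace(ctx, kq);               // row-wise softmax
+
+   or, folding an explicit mask and the 1/sqrt(d_head) scaling into one op:
+
+       kq = ggml_v3_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf((float) d_head));
+
+   ggml_v3_soft_max_ext requires the mask to be contiguous, 2-D (ne[2] == ne[3] == 1)
+   and broadcastable over the rows of kq (ggml_v3_can_repeat_rows). */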
ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_soft_max_back( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b) { + return ggml_v3_soft_max_back_impl(ctx, a, b, false); +} + +struct ggml_v3_tensor * ggml_v3_soft_max_back_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b) { + return ggml_v3_soft_max_back_impl(ctx, a, b, true); +} + +// ggml_v3_rope + +static struct ggml_v3_tensor * ggml_v3_rope_impl( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + int n_dims, + int mode, + int n_ctx, + int n_orig_ctx, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow, + float xpos_base, + bool xpos_down, + bool inplace) { + GGML_V3_ASSERT(ggml_v3_is_vector(b)); + GGML_V3_ASSERT(b->type == GGML_V3_TYPE_I32); + GGML_V3_ASSERT(a->ne[2] == b->ne[0]); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + + int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx }; + memcpy(params + 5, &freq_base, sizeof(float)); + memcpy(params + 6, &freq_scale, sizeof(float)); + memcpy(params + 7, &ext_factor, sizeof(float)); + memcpy(params + 8, &attn_factor, sizeof(float)); + memcpy(params + 9, &beta_fast, sizeof(float)); + memcpy(params + 10, &beta_slow, sizeof(float)); + memcpy(params + 11, &xpos_base, sizeof(float)); + memcpy(params + 12, &xpos_down, sizeof(bool)); + ggml_v3_set_op_params(result, params, sizeof(params)); + + result->op = GGML_V3_OP_ROPE; + result->grad = is_node ? 
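+/* Layout of the 13 int32 op_params slots packed just above: slot 0 is n_past
+   (always 0 here), then n_dims, mode, n_ctx, n_orig_ctx in slots 1..4; slots
+   5..11 hold freq_base, freq_scale, ext_factor, attn_factor, beta_fast,
+   beta_slow and xpos_base as floats, and slot 12 holds xpos_down as a bool,
+   all stored bit-for-bit via memcpy rather than converted. The compute side
+   reads them back with matching memcpy calls, so the order must stay in sync. */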
ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_rope( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + int n_dims, + int mode, + int n_ctx) { + return ggml_v3_rope_impl( + ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, false + ); +} + +struct ggml_v3_tensor * ggml_v3_rope_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + int n_dims, + int mode, + int n_ctx) { + return ggml_v3_rope_impl( + ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, true + ); +} + +struct ggml_v3_tensor * ggml_v3_rope_custom( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + int n_dims, + int mode, + int n_ctx, + int n_orig_ctx, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow) { + return ggml_v3_rope_impl( + ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false + ); +} + +struct ggml_v3_tensor * ggml_v3_rope_custom_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + int n_dims, + int mode, + int n_ctx, + int n_orig_ctx, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow) { + return ggml_v3_rope_impl( + ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true + ); +} + +struct ggml_v3_tensor * ggml_v3_rope_xpos_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + int n_dims, + float base, + bool down) { + return ggml_v3_rope_impl(ctx, a, b, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true); +} + +// ggml_v3_rope_back + +struct ggml_v3_tensor * ggml_v3_rope_back( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + int n_dims, + int mode, + int n_ctx, + int n_orig_ctx, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow, + float xpos_base, + bool xpos_down) { + GGML_V3_ASSERT(ggml_v3_is_vector(b)); + GGML_V3_ASSERT(b->type == GGML_V3_TYPE_I32); + GGML_V3_ASSERT(a->ne[2] == b->ne[0]); + + GGML_V3_ASSERT((mode & 4) == 0 && "ggml_v3_rope_back() for ChatGLM not implemented yet"); + + bool is_node = false; + + if (a->grad) { + is_node = false; // TODO: implement backward + } + + struct ggml_v3_tensor * result = ggml_v3_dup_tensor(ctx, a); + + int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx }; + memcpy(params + 5, &freq_base, sizeof(float)); + memcpy(params + 6, &freq_scale, sizeof(float)); + memcpy(params + 7, &ext_factor, sizeof(float)); + memcpy(params + 8, &attn_factor, sizeof(float)); + memcpy(params + 9, &beta_fast, sizeof(float)); + memcpy(params + 10, &beta_slow, sizeof(float)); + memcpy(params + 11, &xpos_base, sizeof(float)); + memcpy(params + 12, &xpos_down, sizeof(bool)); + ggml_v3_set_op_params(result, params, sizeof(params)); + + result->op = GGML_V3_OP_ROPE_BACK; + result->grad = is_node ? 
ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_v3_alibi + +struct ggml_v3_tensor * ggml_v3_alibi( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int n_past, + int n_head, + float bias_max) { + GGML_V3_ASSERT(n_past >= 0); + bool is_node = false; + + if (a->grad) { + GGML_V3_ASSERT(false); // TODO: implement backward + is_node = true; + } + + // TODO: when implement backward, fix this: + //struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + struct ggml_v3_tensor * result = ggml_v3_view_tensor(ctx, a); + + int32_t op_params[3] = { n_past, n_head }; + memcpy(op_params + 2, &bias_max, sizeof(float)); + ggml_v3_set_op_params(result, op_params, sizeof(op_params)); + + result->op = GGML_V3_OP_ALIBI; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_v3_clamp + +struct ggml_v3_tensor * ggml_v3_clamp( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + float min, + float max) { + bool is_node = false; + + if (a->grad) { + GGML_V3_ASSERT(false); // TODO: implement backward + is_node = true; + } + + // TODO: when implement backward, fix this: + struct ggml_v3_tensor * result = ggml_v3_view_tensor(ctx, a); + + float params[] = { min, max }; + ggml_v3_set_op_params(result, params, sizeof(params)); + + result->op = GGML_V3_OP_CLAMP; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_v3_conv_1d + +static int64_t ggml_v3_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) { + return (ins + 2 * p - d * (ks - 1) - 1) / s + 1; +} + +GGML_V3_API struct ggml_v3_tensor * ggml_v3_conv_1d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + int s0, + int p0, + int d0) { + struct ggml_v3_tensor * im2col = ggml_v3_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OL, IC * K] + + struct ggml_v3_tensor * result = + ggml_v3_mul_mat(ctx, + ggml_v3_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K] + ggml_v3_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2])); // [OC,IC, K] => [OC, IC * K] + + result = ggml_v3_reshape_3d(ctx, result, im2col->ne[1], a->ne[2], im2col->ne[2]); // [N, OC, OL] + + return result; +} + +// ggml_v3_conv_1d_ph + +struct ggml_v3_tensor* ggml_v3_conv_1d_ph( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + int s, + int d) { + return ggml_v3_conv_1d(ctx, a, b, s, a->ne[0] / 2, d); +} + +// ggml_v3_conv_transpose_1d + +static int64_t ggml_v3_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) { + return (ins - 1) * s - 2 * p + d * (ks - 1) + 1; +} + +GGML_V3_API struct ggml_v3_tensor * ggml_v3_conv_transpose_1d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + int s0, + int p0, + int d0) { + GGML_V3_ASSERT(ggml_v3_is_matrix(b)); + GGML_V3_ASSERT(a->ne[2] == b->ne[1]); + GGML_V3_ASSERT(a->ne[3] == 1); + + GGML_V3_ASSERT(p0 == 0); + GGML_V3_ASSERT(d0 == 1); + + bool is_node = false; + + if (a->grad || b->grad) { + GGML_V3_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { + ggml_v3_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/), + a->ne[1], b->ne[2], 1, + }; + struct ggml_v3_tensor * result = 
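+/* Worked example for ggml_v3_conv_1d above (assumed sizes; ne[] is listed
+   fastest dimension first): kernel a->ne = {K=3, IC=16, OC=32, 1}, input
+   b->ne = {IL=100, IC=16, N=1, 1}, s0=1, p0=1, d0=1. Then
+
+       OL = (IL + 2*p0 - d0*(K-1) - 1)/s0 + 1 = (100 + 2 - 2 - 1)/1 + 1 = 100,
+
+   im2col yields ne = {IC*K, OL, N, 1} = {48, 100, 1, 1}, the mul_mat against
+   the {IC*K, OC} reshape of a gives {N*OL, OC} = {100, 32}, and the final
+   reshape_3d restores ne = {OL, OC, N} = {100, 32, 1}. */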
ggml_v3_new_tensor(ctx, GGML_V3_TYPE_F32, 4, ne); + + int32_t params[] = { s0, p0, d0 }; + ggml_v3_set_op_params(result, params, sizeof(params)); + + result->op = GGML_V3_OP_CONV_TRANSPOSE_1D; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_v3_conv_2d + +// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] +// a: [OC,IC, KH, KW] +// b: [N, IC, IH, IW] +// result: [N, OH, OW, IC*KH*KW] +struct ggml_v3_tensor * ggml_v3_im2col( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1, + bool is_2D) { + + if(is_2D) { + GGML_V3_ASSERT(a->ne[2] == b->ne[2]); + } else { + GGML_V3_ASSERT(a->ne[1] == b->ne[1]); + } + bool is_node = false; + + if (a->grad || b->grad) { + GGML_V3_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t OH = is_2D ? ggml_v3_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0; + const int64_t OW = ggml_v3_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); + + const int64_t ne[4] = { + is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0], + OW, + is_2D ? OH : b->ne[2], + is_2D ? b->ne[3] : 1, + }; + + struct ggml_v3_tensor * result = ggml_v3_new_tensor(ctx, GGML_V3_TYPE_F16, 4, ne); + int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) }; + ggml_v3_set_op_params(result, params, sizeof(params)); + + result->op = GGML_V3_OP_IM2COL; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// a: [OC,IC, KH, KW] +// b: [N, IC, IH, IW] +// result: [N, OC, OH, OW] +struct ggml_v3_tensor * ggml_v3_conv_2d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { + struct ggml_v3_tensor * im2col = ggml_v3_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW] + + struct ggml_v3_tensor * result = + ggml_v3_mul_mat(ctx, + ggml_v3_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW] + ggml_v3_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3])); // [OC,IC, KH, KW] => [OC, IC * KH * KW] + + result = ggml_v3_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], a->ne[3], im2col->ne[3]); // [N, OC, OH, OW] + + return result; +} + +// ggml_v3_conv_2d_sk_p0 +struct ggml_v3_tensor * ggml_v3_conv_2d_sk_p0( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b) { + return ggml_v3_conv_2d(ctx, a, b, a->ne[0], a->ne[1], 0, 0, 1, 1); +} + +// ggml_v3_conv_2d_s1_ph + +struct ggml_v3_tensor * ggml_v3_conv_2d_s1_ph( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b) { + return ggml_v3_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1); +} + +// ggml_v3_conv_transpose_2d_p0 + +static int64_t ggml_v3_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) { + return (ins - 1) * s - 2 * p + ks; +} + +struct ggml_v3_tensor * ggml_v3_conv_transpose_2d_p0( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + int stride) { + GGML_V3_ASSERT(a->ne[3] == b->ne[2]); + + bool is_node = false; + + if (a->grad || b->grad) { + GGML_V3_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { + 
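+/* The two spatial entries just below come from
+   ggml_v3_calc_conv_transpose_output_size: out = (in - 1)*stride + kernel with
+   zero padding. For example (a sketch, assumed sizes), an 8x8 input upsampled
+   with a 4x4 kernel at stride 2 gives (8 - 1)*2 + 4 = 18 per spatial dimension,
+   with a->ne[2] output channels and b->ne[3] batch entries. */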
ggml_v3_calc_conv_transpose_output_size(b->ne[0], a->ne[0], stride, 0 /*p0*/), + ggml_v3_calc_conv_transpose_output_size(b->ne[1], a->ne[1], stride, 0 /*p1*/), + a->ne[2], b->ne[3], + }; + + struct ggml_v3_tensor* result = ggml_v3_new_tensor(ctx, GGML_V3_TYPE_F32, 4, ne); + + ggml_v3_set_op_params_i32(result, 0, stride); + + result->op = GGML_V3_OP_CONV_TRANSPOSE_2D; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_v3_pool_* + +static int64_t ggml_v3_calc_pool_output_size(int64_t ins, int ks, int s, float p) { + return (ins + 2 * p - ks) / s + 1; +} + +// ggml_v3_pool_1d + +struct ggml_v3_tensor * ggml_v3_pool_1d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + enum ggml_v3_op_pool op, + int k0, + int s0, + int p0) { + + bool is_node = false; + + if (a->grad) { + GGML_V3_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[2] = { + ggml_v3_calc_pool_output_size(a->ne[0], k0, s0, p0), + a->ne[1], + }; + struct ggml_v3_tensor * result = ggml_v3_new_tensor(ctx, GGML_V3_TYPE_F32, 2, ne); + + int32_t params[] = { op, k0, s0, p0 }; + ggml_v3_set_op_params(result, params, sizeof(params)); + + result->op = GGML_V3_OP_POOL_1D; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_v3_pool_2d + +struct ggml_v3_tensor * ggml_v3_pool_2d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + enum ggml_v3_op_pool op, + int k0, + int k1, + int s0, + int s1, + float p0, + float p1) { + + bool is_node = false; + + if (a->grad) { + GGML_V3_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[3] = { + ggml_v3_calc_pool_output_size(a->ne[0], k0, s0, p0), + ggml_v3_calc_pool_output_size(a->ne[1], k1, s1, p1), + a->ne[2], + }; + struct ggml_v3_tensor * result = ggml_v3_new_tensor(ctx, GGML_V3_TYPE_F32, 3, ne); + + int32_t params[] = { op, k0, k1, s0, s1, p0, p1 }; + ggml_v3_set_op_params(result, params, sizeof(params)); + + result->op = GGML_V3_OP_POOL_2D; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_v3_upscale + +static struct ggml_v3_tensor * ggml_v3_upscale_impl( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int scale_factor) { + bool is_node = false; + + if (a->grad) { + GGML_V3_ASSERT(false); // TODO: implement backward + is_node = true; + } + + struct ggml_v3_tensor * result = ggml_v3_new_tensor_4d(ctx, a->type, + a->ne[0] * scale_factor, + a->ne[1] * scale_factor, + a->ne[2], a->ne[3]); + + result->op = GGML_V3_OP_UPSCALE; + result->op_params[0] = scale_factor; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_pad( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int p0, int p1, int p2, int p3) { + bool is_node = false; + + if (a->grad) { + GGML_V3_ASSERT(false); // TODO: implement backward + is_node = true; + } + + struct ggml_v3_tensor * result = ggml_v3_new_tensor_4d(ctx, a->type, + a->ne[0] + p0, + a->ne[1] + p1, + a->ne[2] + p2, + a->ne[3] + p3); + + result->op = GGML_V3_OP_PAD; + result->grad = is_node ? 
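+/* Shape sketch for the ops above (assumed sizes): ggml_v3_pool_2d with
+   k0 = k1 = 2, s0 = s1 = 2, p0 = p1 = 0 on an input of ne = {32, 32, C, 1}
+   produces (32 + 0 - 2)/2 + 1 = 16 per spatial dimension, i.e. ne = {16, 16, C},
+   always as F32 and without carrying over a->ne[3]. ggml_v3_pad simply grows
+   each dimension by the requested amount, so the same input padded with
+   (1, 1, 0, 0) becomes ne = {33, 33, C, 1}. */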
ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_upscale( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int scale_factor) { + return ggml_v3_upscale_impl(ctx, a, scale_factor); +} + +// ggml_v3_argsort + +struct ggml_v3_tensor * ggml_v3_argsort( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + enum ggml_v3_sort_order order) { + bool is_node = false; + + struct ggml_v3_tensor * result = ggml_v3_new_tensor(ctx, GGML_V3_TYPE_I32, GGML_V3_MAX_DIMS, a->ne); + + ggml_v3_set_op_params_i32(result, 0, (int32_t) order); + + result->op = GGML_V3_OP_ARGSORT; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_v3_top_k + +struct ggml_v3_tensor * ggml_v3_top_k( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int k) { + GGML_V3_ASSERT(a->ne[0] >= k); + + struct ggml_v3_tensor * result = ggml_v3_argsort(ctx, a, GGML_V3_SORT_DESC); + + result = ggml_v3_view_4d(ctx, result, + k, result->ne[1], result->ne[2], result->ne[3], + result->nb[1], result->nb[2], result->nb[3], + 0); + + return result; +} + +// ggml_v3_flash_attn + +struct ggml_v3_tensor * ggml_v3_flash_attn( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * q, + struct ggml_v3_tensor * k, + struct ggml_v3_tensor * v, + bool masked) { + GGML_V3_ASSERT(ggml_v3_can_mul_mat(k, q)); + // TODO: check if vT can be multiplied by (k*qT) + + bool is_node = false; + + if (q->grad || k->grad || v->grad) { + is_node = true; + } + + //struct ggml_v3_tensor * result = ggml_v3_dup_tensor(ctx, q); + struct ggml_v3_tensor * result = ggml_v3_new_tensor(ctx, GGML_V3_TYPE_F32, GGML_V3_MAX_DIMS, q->ne); + + int32_t t = masked ? 1 : 0; + ggml_v3_set_op_params(result, &t, sizeof(t)); + + result->op = GGML_V3_OP_FLASH_ATTN; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = q; + result->src[1] = k; + result->src[2] = v; + + return result; +} + +// ggml_v3_flash_ff + +struct ggml_v3_tensor * ggml_v3_flash_ff( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b0, + struct ggml_v3_tensor * b1, + struct ggml_v3_tensor * c0, + struct ggml_v3_tensor * c1) { + GGML_V3_ASSERT(ggml_v3_can_mul_mat(b0, a)); + // TODO: more checks + + bool is_node = false; + + if (a->grad || b0->grad || b1->grad || c0->grad || c1->grad) { + is_node = true; + } + + //struct ggml_v3_tensor * result = ggml_v3_dup_tensor(ctx, a); + struct ggml_v3_tensor * result = ggml_v3_new_tensor(ctx, GGML_V3_TYPE_F32, GGML_V3_MAX_DIMS, a->ne); + + result->op = GGML_V3_OP_FLASH_FF; + result->grad = is_node ? 
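+/* ggml_v3_top_k above is a pure composition of existing ops: argsort the rows
+   in descending order, then take a k-wide view of the resulting index tensor.
+   The output is therefore GGML_V3_TYPE_I32 indices of the k largest values in
+   each row, not the values themselves. */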
ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b0; + result->src[2] = b1; + result->src[3] = c0; + result->src[4] = c1; + + return result; +} + +// ggml_v3_flash_attn_back + +struct ggml_v3_tensor * ggml_v3_flash_attn_back( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * q, + struct ggml_v3_tensor * k, + struct ggml_v3_tensor * v, + struct ggml_v3_tensor * d, + bool masked) { + GGML_V3_ASSERT(ggml_v3_can_mul_mat(k, q)); + // TODO: check if vT can be multiplied by (k*qT) + + // d shape [D,N,ne2,ne3] + // q shape [D,N,ne2,ne3] + // k shape [D,M,kvne2,ne3] + // v shape [M,D,kvne2,ne3] + + const int64_t D = q->ne[0]; + const int64_t N = q->ne[1]; + const int64_t M = k->ne[1]; + const int64_t ne2 = q->ne[2]; + const int64_t ne3 = q->ne[3]; + const int64_t kvne2 = k->ne[2]; + + GGML_V3_ASSERT(k->ne[0] == D); + GGML_V3_ASSERT(v->ne[0] == M); + GGML_V3_ASSERT(v->ne[1] == D); + GGML_V3_ASSERT(d->ne[0] == D); + GGML_V3_ASSERT(d->ne[1] == N); + GGML_V3_ASSERT(k->ne[2] == kvne2); + GGML_V3_ASSERT(k->ne[3] == ne3); + GGML_V3_ASSERT(v->ne[2] == kvne2); + GGML_V3_ASSERT(v->ne[3] == ne3); + GGML_V3_ASSERT(d->ne[2] == ne2); + GGML_V3_ASSERT(d->ne[3] == ne3); + + GGML_V3_ASSERT(ne2 % kvne2 == 0); + + bool is_node = false; + + if (q->grad || k->grad || v->grad) { + // when using this operation (in backwards pass) these grads are set. + // we don't want to create (big) grad of our result, so is_node is false. + is_node = false; + } + + // store gradients of q, k and v as continuous tensors concatenated in result. + // note: v and gradv are actually transposed, i.e. v->ne[0] != D. + const int64_t elem_q = ggml_v3_nelements(q); + const int64_t elem_k = ggml_v3_nelements(k); + const int64_t elem_v = ggml_v3_nelements(v); + + enum ggml_v3_type result_type = GGML_V3_TYPE_F32; + GGML_V3_ASSERT(ggml_v3_blck_size(result_type) == 1); + const size_t tsize = ggml_v3_type_size(result_type); + + const size_t offs_q = 0; + const size_t offs_k = offs_q + GGML_V3_PAD(elem_q * tsize, GGML_V3_MEM_ALIGN); + const size_t offs_v = offs_k + GGML_V3_PAD(elem_k * tsize, GGML_V3_MEM_ALIGN); + const size_t end = offs_v + GGML_V3_PAD(elem_v * tsize, GGML_V3_MEM_ALIGN); + + const size_t nelements = (end + tsize - 1)/tsize; + + struct ggml_v3_tensor * result = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, nelements); + + int32_t masked_i = masked ? 1 : 0; + ggml_v3_set_op_params(result, &masked_i, sizeof(masked_i)); + + result->op = GGML_V3_OP_FLASH_ATTN_BACK; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = q; + result->src[1] = k; + result->src[2] = v; + result->src[3] = d; + + return result; +} + +// ggml_v3_win_part + +struct ggml_v3_tensor * ggml_v3_win_part( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int w) { + GGML_V3_ASSERT(a->ne[3] == 1); + GGML_V3_ASSERT(a->type == GGML_V3_TYPE_F32); + + bool is_node = false; + + if (a->grad) { + GGML_V3_ASSERT(false); // TODO: implement backward + is_node = true; + } + + // padding + const int px = (w - a->ne[1]%w)%w; + const int py = (w - a->ne[2]%w)%w; + + const int npx = (px + a->ne[1])/w; + const int npy = (py + a->ne[2])/w; + const int np = npx*npy; + + const int64_t ne[4] = { a->ne[0], w, w, np, }; + struct ggml_v3_tensor * result = ggml_v3_new_tensor(ctx, GGML_V3_TYPE_F32, 4, ne); + + int32_t params[] = { npx, npy, w }; + ggml_v3_set_op_params(result, params, sizeof(params)); + + result->op = GGML_V3_OP_WIN_PART; + result->grad = is_node ? 
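+/* Worked example for the window-partition math above (assumed sizes): with
+   w = 7 and a->ne = {C, 20, 20, 1}, the padding is px = (7 - 20%7)%7 = 1 and
+   py = 1, so npx = npy = (1 + 20)/7 = 3 and np = 9 windows, giving a result of
+   shape {C, 7, 7, 9}. ggml_v3_win_unpart(ctx, part, 20, 20, 7) reverses the
+   split and drops the padding again. */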
ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_v3_win_unpart + +struct ggml_v3_tensor * ggml_v3_win_unpart( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int w0, + int h0, + int w) { + GGML_V3_ASSERT(a->type == GGML_V3_TYPE_F32); + + bool is_node = false; + + if (a->grad) { + GGML_V3_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { a->ne[0], w0, h0, 1, }; + struct ggml_v3_tensor * result = ggml_v3_new_tensor(ctx, GGML_V3_TYPE_F32, 3, ne); + + int32_t params[] = { w }; + ggml_v3_set_op_params(result, params, sizeof(params)); + + result->op = GGML_V3_OP_WIN_UNPART; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_v3_get_rel_pos + +struct ggml_v3_tensor * ggml_v3_get_rel_pos( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int qh, + int kh) { + GGML_V3_ASSERT(qh == kh); + GGML_V3_ASSERT(2*MAX(qh, kh) - 1 == a->ne[1]); + + bool is_node = false; + + if (a->grad) { + GGML_V3_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { a->ne[0], kh, qh, 1, }; + struct ggml_v3_tensor * result = ggml_v3_new_tensor(ctx, GGML_V3_TYPE_F16, 3, ne); + + result->op = GGML_V3_OP_GET_REL_POS; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_v3_add_rel_pos + +static struct ggml_v3_tensor * ggml_v3_add_rel_pos_impl( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * pw, + struct ggml_v3_tensor * ph, + bool inplace) { + GGML_V3_ASSERT(ggml_v3_are_same_shape(pw, ph)); + GGML_V3_ASSERT(ggml_v3_is_contiguous(a)); + GGML_V3_ASSERT(ggml_v3_is_contiguous(pw)); + GGML_V3_ASSERT(ggml_v3_is_contiguous(ph)); + GGML_V3_ASSERT(ph->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT(pw->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT(pw->ne[3] == a->ne[2]); + GGML_V3_ASSERT(pw->ne[0]*pw->ne[0] == a->ne[0]); + GGML_V3_ASSERT(pw->ne[1]*pw->ne[2] == a->ne[1]); + + bool is_node = false; + + if (!inplace && (a->grad || pw->grad || ph->grad)) { + is_node = true; + } + + struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + ggml_v3_set_op_params_i32(result, 0, inplace ? 1 : 0); + + result->op = GGML_V3_OP_ADD_REL_POS; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = pw; + result->src[2] = ph; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_add_rel_pos( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * pw, + struct ggml_v3_tensor * ph) { + return ggml_v3_add_rel_pos_impl(ctx, a, pw, ph, false); +} + +struct ggml_v3_tensor * ggml_v3_add_rel_pos_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * pw, + struct ggml_v3_tensor * ph) { + return ggml_v3_add_rel_pos_impl(ctx, a, pw, ph, true); +} + +// gmml_unary + +static struct ggml_v3_tensor * ggml_v3_unary_impl( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + enum ggml_v3_unary_op op, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + + ggml_v3_set_op_params_i32(result, 0, (int32_t) op); + + result->op = GGML_V3_OP_UNARY; + result->grad = is_node ? 
ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_unary( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + enum ggml_v3_unary_op op) { + return ggml_v3_unary_impl(ctx, a, op, false); +} + +struct ggml_v3_tensor * ggml_v3_unary_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + enum ggml_v3_unary_op op) { + return ggml_v3_unary_impl(ctx, a, op, true); +} + +// ggml_v3_map_unary + +static struct ggml_v3_tensor * ggml_v3_map_unary_impl_f32( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + const ggml_v3_unary_op_f32_t fun, + bool inplace) { + bool is_node = false; + + if (!inplace && a->grad) { + is_node = true; + } + + struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + + ggml_v3_set_op_params(result, (const void *) &fun, sizeof(fun)); + + result->op = GGML_V3_OP_MAP_UNARY; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_map_unary_f32( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + const ggml_v3_unary_op_f32_t fun) { + return ggml_v3_map_unary_impl_f32(ctx, a, fun, false); +} + +struct ggml_v3_tensor * ggml_v3_map_unary_inplace_f32( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + const ggml_v3_unary_op_f32_t fun) { + return ggml_v3_map_unary_impl_f32(ctx, a, fun, true); +} + +// ggml_v3_map_binary + +static struct ggml_v3_tensor * ggml_v3_map_binary_impl_f32( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + const ggml_v3_binary_op_f32_t fun, + bool inplace) { + GGML_V3_ASSERT(ggml_v3_are_same_shape(a, b)); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + + ggml_v3_set_op_params(result, (const void *) &fun, sizeof(fun)); + + result->op = GGML_V3_OP_MAP_BINARY; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_map_binary_f32( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + const ggml_v3_binary_op_f32_t fun) { + return ggml_v3_map_binary_impl_f32(ctx, a, b, fun, false); +} + +struct ggml_v3_tensor * ggml_v3_map_binary_inplace_f32( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + const ggml_v3_binary_op_f32_t fun) { + return ggml_v3_map_binary_impl_f32(ctx, a, b, fun, true); +} + +// ggml_v3_map_custom1_f32 + +static struct ggml_v3_tensor * ggml_v3_map_custom1_impl_f32( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + const ggml_v3_custom1_op_f32_t fun, + bool inplace) { + bool is_node = false; + + if (!inplace && a->grad) { + is_node = true; + } + + struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + + ggml_v3_set_op_params(result, (const void *) &fun, sizeof(fun)); + + result->op = GGML_V3_OP_MAP_CUSTOM1_F32; + result->grad = is_node ? 
ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_map_custom1_f32( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + const ggml_v3_custom1_op_f32_t fun) { + return ggml_v3_map_custom1_impl_f32(ctx, a, fun, false); +} + +struct ggml_v3_tensor * ggml_v3_map_custom1_inplace_f32( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + const ggml_v3_custom1_op_f32_t fun) { + return ggml_v3_map_custom1_impl_f32(ctx, a, fun, true); +} + +// ggml_v3_map_custom2_f32 + +static struct ggml_v3_tensor * ggml_v3_map_custom2_impl_f32( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + const ggml_v3_custom2_op_f32_t fun, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + + ggml_v3_set_op_params(result, (const void *) &fun, sizeof(fun)); + + result->op = GGML_V3_OP_MAP_CUSTOM2_F32; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_map_custom2_f32( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + const ggml_v3_custom2_op_f32_t fun) { + return ggml_v3_map_custom2_impl_f32(ctx, a, b, fun, false); +} + +struct ggml_v3_tensor * ggml_v3_map_custom2_inplace_f32( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + const ggml_v3_custom2_op_f32_t fun) { + return ggml_v3_map_custom2_impl_f32(ctx, a, b, fun, true); +} + +// ggml_v3_map_custom3_f32 + +static struct ggml_v3_tensor * ggml_v3_map_custom3_impl_f32( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + struct ggml_v3_tensor * c, + const ggml_v3_custom3_op_f32_t fun, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad || b->grad || c->grad)) { + is_node = true; + } + + struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + + ggml_v3_set_op_params(result, (const void *) &fun, sizeof(fun)); + + result->op = GGML_V3_OP_MAP_CUSTOM3_F32; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_map_custom3_f32( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + struct ggml_v3_tensor * c, + const ggml_v3_custom3_op_f32_t fun) { + return ggml_v3_map_custom3_impl_f32(ctx, a, b, c, fun, false); +} + +struct ggml_v3_tensor * ggml_v3_map_custom3_inplace_f32( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + struct ggml_v3_tensor * c, + const ggml_v3_custom3_op_f32_t fun) { + return ggml_v3_map_custom3_impl_f32(ctx, a, b, c, fun, true); +} + +// ggml_v3_map_custom1 +struct ggml_v3_map_custom1_op_params { + ggml_v3_custom1_op_t fun; + int n_tasks; + void * userdata; +}; + +static struct ggml_v3_tensor * ggml_v3_map_custom1_impl( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + const ggml_v3_custom1_op_t fun, + int n_tasks, + void * userdata, + bool inplace) { + GGML_V3_ASSERT(n_tasks == GGML_V3_N_TASKS_MAX || n_tasks > 0); + + bool is_node = false; + + if (!inplace && a->grad) { + is_node = true; + } + + struct ggml_v3_tensor * result = inplace ? 
ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + + struct ggml_v3_map_custom1_op_params params = { + /*.fun =*/ fun, + /*.n_tasks =*/ n_tasks, + /*.userdata =*/ userdata + }; + ggml_v3_set_op_params(result, (const void *) ¶ms, sizeof(params)); + + result->op = GGML_V3_OP_MAP_CUSTOM1; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_map_custom1( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + const ggml_v3_custom1_op_t fun, + int n_tasks, + void * userdata) { + return ggml_v3_map_custom1_impl(ctx, a, fun, n_tasks, userdata, false); +} + +struct ggml_v3_tensor * ggml_v3_map_custom1_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + const ggml_v3_custom1_op_t fun, + int n_tasks, + void * userdata) { + return ggml_v3_map_custom1_impl(ctx, a, fun, n_tasks, userdata, true); +} + +// ggml_v3_map_custom2 + +struct ggml_v3_map_custom2_op_params { + ggml_v3_custom2_op_t fun; + int n_tasks; + void * userdata; +}; + +static struct ggml_v3_tensor * ggml_v3_map_custom2_impl( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + const ggml_v3_custom2_op_t fun, + int n_tasks, + void * userdata, + bool inplace) { + GGML_V3_ASSERT(n_tasks == GGML_V3_N_TASKS_MAX || n_tasks > 0); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + + struct ggml_v3_map_custom2_op_params params = { + /*.fun =*/ fun, + /*.n_tasks =*/ n_tasks, + /*.userdata =*/ userdata + }; + ggml_v3_set_op_params(result, (const void *) ¶ms, sizeof(params)); + + result->op = GGML_V3_OP_MAP_CUSTOM2; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_map_custom2( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + const ggml_v3_custom2_op_t fun, + int n_tasks, + void * userdata) { + return ggml_v3_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, false); +} + +struct ggml_v3_tensor * ggml_v3_map_custom2_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + const ggml_v3_custom2_op_t fun, + int n_tasks, + void * userdata) { + return ggml_v3_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, true); +} + +// ggml_v3_map_custom3 + +struct ggml_v3_map_custom3_op_params { + ggml_v3_custom3_op_t fun; + int n_tasks; + void * userdata; +}; + +static struct ggml_v3_tensor * ggml_v3_map_custom3_impl( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + struct ggml_v3_tensor * c, + const ggml_v3_custom3_op_t fun, + int n_tasks, + void * userdata, + bool inplace) { + GGML_V3_ASSERT(n_tasks == GGML_V3_N_TASKS_MAX || n_tasks > 0); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad || c->grad)) { + is_node = true; + } + + struct ggml_v3_tensor * result = inplace ? ggml_v3_view_tensor(ctx, a) : ggml_v3_dup_tensor(ctx, a); + + struct ggml_v3_map_custom3_op_params params = { + /*.fun =*/ fun, + /*.n_tasks =*/ n_tasks, + /*.userdata =*/ userdata + }; + ggml_v3_set_op_params(result, (const void *) ¶ms, sizeof(params)); + + result->op = GGML_V3_OP_MAP_CUSTOM3; + result->grad = is_node ? 
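+/* Usage sketch for the map_custom* wrappers above (the callback, its body and
+   the variable names are assumptions; the signature is the one expected by the
+   ggml_v3_custom1_op_t typedef): the callback receives the destination, its
+   source(s), the worker index/count and the opaque pointer stored in
+   ggml_v3_map_custom1_op_params, e.g.
+
+       static void my_scale(struct ggml_v3_tensor * dst, const struct ggml_v3_tensor * a,
+                            int ith, int nth, void * userdata) {
+           // split rows across nth workers and scale by *(float *) userdata ...
+       }
+
+       cur = ggml_v3_map_custom1(ctx, cur, my_scale, GGML_V3_N_TASKS_MAX, &scale);
+
+   Passing GGML_V3_N_TASKS_MAX as n_tasks lets the scheduler use every available
+   thread for the op. */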
ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; + + return result; +} + +struct ggml_v3_tensor * ggml_v3_map_custom3( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + struct ggml_v3_tensor * c, + const ggml_v3_custom3_op_t fun, + int n_tasks, + void * userdata) { + return ggml_v3_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, false); +} + +struct ggml_v3_tensor * ggml_v3_map_custom3_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + struct ggml_v3_tensor * c, + const ggml_v3_custom3_op_t fun, + int n_tasks, + void * userdata) { + return ggml_v3_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true); +} + +// ggml_v3_cross_entropy_loss + +struct ggml_v3_tensor * ggml_v3_cross_entropy_loss( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b) { + GGML_V3_ASSERT(ggml_v3_are_same_shape(a, b)); + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + struct ggml_v3_tensor * result = ggml_v3_new_tensor_1d(ctx, a->type, 1); + + result->op = GGML_V3_OP_CROSS_ENTROPY_LOSS; + result->grad = is_node ? ggml_v3_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_v3_cross_entropy_loss_back + +struct ggml_v3_tensor * ggml_v3_cross_entropy_loss_back( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + struct ggml_v3_tensor * c) { + GGML_V3_ASSERT(ggml_v3_are_same_shape(a, b)); + GGML_V3_ASSERT(ggml_v3_is_scalar(c)); + + struct ggml_v3_tensor * result = ggml_v3_dup_tensor(ctx, a); + + result->op = GGML_V3_OP_CROSS_ENTROPY_LOSS_BACK; + result->grad = NULL; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; + + return result; +} + +//////////////////////////////////////////////////////////////////////////////// + +void ggml_v3_set_param( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * tensor) { + tensor->is_param = true; + + GGML_V3_ASSERT(tensor->grad == NULL); + tensor->grad = ggml_v3_dup_tensor(ctx, tensor); + ggml_v3_format_name(tensor->grad, "%s (grad)", tensor->name); +} + +// ggml_v3_compute_forward_dup + +static void ggml_v3_compute_forward_dup_same_cont( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(ggml_v3_nelements(dst) == ggml_v3_nelements(src0)); + GGML_V3_ASSERT(ggml_v3_is_contiguous(dst) && ggml_v3_is_contiguous(src0)); + GGML_V3_ASSERT(src0->type == dst->type); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const size_t nb00 = src0->nb[0]; + const size_t nb0 = dst->nb[0]; + + const int ith = params->ith; // thread index + const int nth = params->nth; // number of threads + + // parallelize by elements + const int ne = ggml_v3_nelements(dst); + const int dr = (ne + nth - 1) / nth; + const int ie0 = dr * ith; + const int ie1 = MIN(ie0 + dr, ne); + + if (ie0 < ie1) { + memcpy( + ((char *) dst->data + ie0*nb0), + ((char *) src0->data + ie0*nb00), + (ie1 - ie0) * ggml_v3_type_size(src0->type)); + } + +} +static void ggml_v3_compute_forward_dup_f16( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(ggml_v3_nelements(dst) == ggml_v3_nelements(src0)); + + if (params->type == GGML_V3_TASK_INIT || params->type == 
GGML_V3_TASK_FINALIZE) { + return; + } + + GGML_V3_TENSOR_UNARY_OP_LOCALS + + const int ith = params->ith; // thread index + const int nth = params->nth; // number of threads + + if (ggml_v3_is_contiguous(src0) && ggml_v3_is_contiguous(dst) && src0->type == dst->type) { + ggml_v3_compute_forward_dup_same_cont(params, src0, dst); + return; + } + + // parallelize by rows + const int nr = ne01; + // number of rows per thread + const int dr = (nr + nth - 1) / nth; + // row range for this thread + const int ir0 = dr * ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (src0->type == dst->type && + ne00 == ne0 && + nb00 == ggml_v3_type_size(src0->type) && nb0 == ggml_v3_type_size(dst->type)) { + // copy by rows + const size_t rs = ne00*nb00; + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ir0; i01 < ir1; i01++) { + memcpy( + ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), + ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), + rs); + } + } + } + return; + } + + // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy + + if (ggml_v3_is_contiguous(dst)) { + if (nb00 == sizeof(ggml_v3_fp16_t)) { + if (dst->type == GGML_V3_TYPE_F16) { + size_t id = 0; + const size_t rs = ne00 * nb00; + char * dst_ptr = (char *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + memcpy(dst_ptr + id, src0_ptr, rs); + id += rs; + } + id += rs * (ne01 - ir1); + } + } + } else if (dst->type == GGML_V3_TYPE_F32) { + size_t id = 0; + float * dst_ptr = (float *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const ggml_v3_fp16_t * src0_ptr = (ggml_v3_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + for (int i00 = 0; i00 < ne00; i00++) { + dst_ptr[id] = GGML_V3_FP16_TO_FP32(src0_ptr[i00]); + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else if (type_traits[dst->type].from_float) { + ggml_v3_from_float_t const quantize_row_q = type_traits[dst->type].from_float; + float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; + + size_t id = 0; + size_t rs = nb0 * (ne00 / ggml_v3_blck_size(dst->type)); + char * dst_ptr = (char *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const ggml_v3_fp16_t * src0_ptr = (ggml_v3_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + + for (int i00 = 0; i00 < ne00; i00++) { + src0_f32[i00] = GGML_V3_FP16_TO_FP32(src0_ptr[i00]); + } + + quantize_row_q(src0_f32, dst_ptr + id, ne00); + id += rs; + } + id += rs * (ne01 - ir1); + } + } + } else { + GGML_V3_ASSERT(false); // TODO: implement + } + } else { + //printf("%s: this is not optimal - fix me\n", __func__); + + if (dst->type == GGML_V3_TYPE_F32) { + size_t id = 0; + float * dst_ptr = (float *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const ggml_v3_fp16_t * src0_ptr = (ggml_v3_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = GGML_V3_FP16_TO_FP32(*src0_ptr); + id++; 
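+/* The id jumps bracketing this loop (id += ne00 * ir0 before it, and
+   id += ne00 * (ne01 - ir1) after it) skip the destination elements that belong
+   to other threads: each worker handles only rows [ir0, ir1) out of
+   dr = (ne01 + nth - 1)/nth rows per slice, while keeping id consistent with the
+   full contiguous destination. Example (a sketch): ne01 = 10 rows and nth = 4
+   threads give dr = 3, so thread 2 copies rows [6, 9) and starts writing at
+   id = 6*ne00 within each (i03, i02) slice. */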
+ } + } + id += ne00 * (ne01 - ir1); + } + } + } else if (dst->type == GGML_V3_TYPE_F16) { + size_t id = 0; + ggml_v3_fp16_t * dst_ptr = (ggml_v3_fp16_t *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const ggml_v3_fp16_t * src0_ptr = (ggml_v3_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = *src0_ptr; + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else { + GGML_V3_ASSERT(false); // TODO: implement + } + } + return; + } + + // dst counters + int64_t i10 = 0; + int64_t i11 = 0; + int64_t i12 = 0; + int64_t i13 = 0; + + if (dst->type == GGML_V3_TYPE_F16) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + memcpy(dst_ptr, src0_ptr, sizeof(ggml_v3_fp16_t)); + + if (++i10 == ne00) { + i10 = 0; + if (++i11 == ne01) { + i11 = 0; + if (++i12 == ne02) { + i12 = 0; + if (++i13 == ne03) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + } else if (dst->type == GGML_V3_TYPE_F32) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + *(float *) dst_ptr = GGML_V3_FP16_TO_FP32(*(const ggml_v3_fp16_t *) src0_ptr); + + if (++i10 == ne0) { + i10 = 0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + } else { + GGML_V3_ASSERT(false); // TODO: implement + } +} + +static void ggml_v3_compute_forward_dup_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(ggml_v3_nelements(dst) == ggml_v3_nelements(src0)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + GGML_V3_TENSOR_UNARY_OP_LOCALS + + const int ith = params->ith; // thread index + const int nth = params->nth; // number of threads + + if (ggml_v3_is_contiguous(src0) && ggml_v3_is_contiguous(dst) && src0->type == dst->type) { + ggml_v3_compute_forward_dup_same_cont(params, src0, dst); + return; + } + + // parallelize by rows + const int nr = ne01; + // number of rows per thread + const int dr = (nr + nth - 1) / nth; + // row range for 
this thread + const int ir0 = dr * ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (src0->type == dst->type && + ne00 == ne0 && + nb00 == ggml_v3_type_size(src0->type) && nb0 == ggml_v3_type_size(dst->type)) { + // copy by rows + const size_t rs = ne00*nb00; + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ir0; i01 < ir1; i01++) { + memcpy( + ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), + ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), + rs); + } + } + } + return; + } + + if (ggml_v3_is_contiguous(dst)) { + // TODO: simplify + if (nb00 == sizeof(float)) { + if (dst->type == GGML_V3_TYPE_F32) { + size_t id = 0; + const size_t rs = ne00 * nb00; + char * dst_ptr = (char *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + memcpy(dst_ptr + id, src0_ptr, rs); + id += rs; + } + id += rs * (ne01 - ir1); + } + } + } else if (type_traits[dst->type].from_float) { + ggml_v3_from_float_t const quantize_row_q = type_traits[dst->type].from_float; + + size_t id = 0; + size_t rs = nb0 * (ne00 / ggml_v3_blck_size(dst->type)); + char * dst_ptr = (char *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + quantize_row_q(src0_ptr, dst_ptr + id, ne00); + id += rs; + } + id += rs * (ne01 - ir1); + } + } + } else { + GGML_V3_ASSERT(false); // TODO: implement + } + } else { + //printf("%s: this is not optimal - fix me\n", __func__); + + if (dst->type == GGML_V3_TYPE_F32) { + size_t id = 0; + float * dst_ptr = (float *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = *src0_ptr; + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else if (dst->type == GGML_V3_TYPE_F16) { + size_t id = 0; + ggml_v3_fp16_t * dst_ptr = (ggml_v3_fp16_t *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = GGML_V3_FP32_TO_FP16(*src0_ptr); + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else { + GGML_V3_ASSERT(false); // TODO: implement + } + } + + return; + } + + // dst counters + + int64_t i10 = 0; + int64_t i11 = 0; + int64_t i12 = 0; + int64_t i13 = 0; + + if (dst->type == GGML_V3_TYPE_F32) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + memcpy(dst_ptr, 
src0_ptr, sizeof(float)); + + if (++i10 == ne0) { + i10 = 0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + } else if (dst->type == GGML_V3_TYPE_F16) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + *(ggml_v3_fp16_t *) dst_ptr = GGML_V3_FP32_TO_FP16(*(const float *) src0_ptr); + + if (++i10 == ne0) { + i10 = 0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + } else { + GGML_V3_ASSERT(false); // TODO: implement + } +} + +// A simplified version of ggml_v3_compute_forward_dup that doesn't do float upcasting, and just plain old memcpy. +static void ggml_v3_compute_forward_dup_bytes( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(ggml_v3_nelements(dst) == ggml_v3_nelements(src0)); + GGML_V3_ASSERT(src0->type == dst->type); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + if (ggml_v3_is_contiguous(src0) && ggml_v3_is_contiguous(dst)) { + ggml_v3_compute_forward_dup_same_cont(params, src0, dst); + return; + } + + GGML_V3_TENSOR_UNARY_OP_LOCALS; + + const size_t type_size = ggml_v3_type_size(src0->type); + const int ith = params->ith; // thread index + const int nth = params->nth; // number of threads + + + // parallelize by rows + const int nr = ne01; + // number of rows per thread + const int dr = (nr + nth - 1) / nth; + // row range for this thread + const int ir0 = dr * ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (src0->type == dst->type && + ne00 == ne0 && + nb00 == type_size && nb0 == type_size) { + // copy by rows + const size_t rs = ne00 * type_size; + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ir0; i01 < ir1; i01++) { + memcpy( + ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), + ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), + rs); + } + } + } + return; + } + + if (ggml_v3_is_contiguous(dst)) { + size_t id = 0; + char * dst_ptr = (char *) dst->data; + const size_t rs = ne00 * type_size; + + if (nb00 == type_size) { + // src0 is contigous on first dimension, copy by rows + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int64_t i01 = ir0; i01 < ir1; i01++) { + const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + memcpy(dst_ptr + id, src0_ptr, rs); + id += rs; + } + id += rs * (ne01 - ir1); + } + } + } else { + //printf("%s: this is not optimal - fix me\n", __func__); + + for 
(int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = (char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03; + memcpy(dst_ptr + id, src0_ptr, type_size); + + id += type_size; + } + } + id += rs * (ne01 - ir1); + } + } + } + + return; + } + + // dst counters + + int64_t i10 = 0; + int64_t i11 = 0; + int64_t i12 = 0; + int64_t i13 = 0; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + memcpy(dst_ptr, src0_ptr, type_size); + + if (++i10 == ne0) { + i10 = 0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } +} + +static void ggml_v3_compute_forward_dup( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + if (src0->type == dst->type) { + ggml_v3_compute_forward_dup_bytes(params, src0, dst); + return; + } + + switch (src0->type) { + case GGML_V3_TYPE_F16: + { + ggml_v3_compute_forward_dup_f16(params, src0, dst); + } break; + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_dup_f32(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_add + +static void ggml_v3_compute_forward_add_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(ggml_v3_can_repeat(src1, src0) && ggml_v3_are_same_shape(src0, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_v3_nrows(src0); + + GGML_V3_TENSOR_BINARY_OP_LOCALS + + GGML_V3_ASSERT( nb0 == sizeof(float)); + GGML_V3_ASSERT(nb00 == sizeof(float)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (nb10 == sizeof(float)) { + for (int ir = ir0; ir < ir1; ++ir) { + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + const int64_t nr0 = ne00 / ne10; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + + for (int64_t r = 0; r < nr0; ++r) { +#ifdef GGML_USE_ACCELERATE + vDSP_vadd(src0_ptr + r*ne10, 1, 
src1_ptr, 1, dst_ptr + r*ne10, 1, ne10); +#else + ggml_v3_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); +#endif + } + } + } else { + // src1 is not contiguous + for (int ir = ir0; ir < ir1; ++ir) { + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + + for (int64_t i0 = 0; i0 < ne0; ++i0) { + const int64_t i10 = i0 % ne10; + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); + + dst_ptr[i0] = src0_ptr[i0] + *src1_ptr; + } + } + } +} + +static void ggml_v3_compute_forward_add_f16_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(ggml_v3_are_same_shape(src0, src1) && ggml_v3_are_same_shape(src0, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_v3_nrows(src0); + + GGML_V3_TENSOR_BINARY_OP_LOCALS + + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F16); + GGML_V3_ASSERT(src1->type == GGML_V3_TYPE_F32); + + if (dst->type == GGML_V3_TYPE_F32) { + GGML_V3_ASSERT( nb0 == sizeof(float)); + } + else { + GGML_V3_ASSERT(dst->type == GGML_V3_TYPE_F16); + GGML_V3_ASSERT( nb0 == sizeof(ggml_v3_fp16_t)); + } + + GGML_V3_ASSERT(nb00 == sizeof(ggml_v3_fp16_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (nb10 == sizeof(float)) { + if (dst->type == GGML_V3_TYPE_F16) { + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + ggml_v3_fp16_t * dst_ptr = (ggml_v3_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + ggml_v3_fp16_t * src0_ptr = (ggml_v3_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_V3_FP32_TO_FP16(GGML_V3_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]); + } + } + } else { + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + ggml_v3_fp16_t * src0_ptr = (ggml_v3_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_V3_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]; + } + } + } + } + else { + // src1 is not contiguous + GGML_V3_ASSERT(false); + } +} + +static void ggml_v3_compute_forward_add_f16_f16( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) 
{ + GGML_V3_ASSERT(ggml_v3_are_same_shape(src0, src1) && ggml_v3_are_same_shape(src0, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_v3_nrows(src0); + + GGML_V3_TENSOR_BINARY_OP_LOCALS + + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F16); + GGML_V3_ASSERT(src1->type == GGML_V3_TYPE_F16); + GGML_V3_ASSERT(dst->type == GGML_V3_TYPE_F16); + + GGML_V3_ASSERT( nb0 == sizeof(ggml_v3_fp16_t)); + GGML_V3_ASSERT(nb00 == sizeof(ggml_v3_fp16_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (nb10 == sizeof(ggml_v3_fp16_t)) { + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + ggml_v3_fp16_t * dst_ptr = (ggml_v3_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + ggml_v3_fp16_t * src0_ptr = (ggml_v3_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + ggml_v3_fp16_t * src1_ptr = (ggml_v3_fp16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_V3_FP32_TO_FP16(GGML_V3_FP16_TO_FP32(src0_ptr[i]) + GGML_V3_FP16_TO_FP32(src1_ptr[i])); + } + } + } + else { + // src1 is not contiguous + GGML_V3_ASSERT(false); + } +} + +static void ggml_v3_compute_forward_add_q_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(ggml_v3_are_same_shape(src0, src1) && ggml_v3_are_same_shape(src0, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int nr = ggml_v3_nrows(src0); + + GGML_V3_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + const enum ggml_v3_type type = src0->type; + const enum ggml_v3_type dtype = dst->type; + ggml_v3_to_float_t const dequantize_row_q = type_traits[type].to_float; + ggml_v3_from_float_t const quantize_row_q = type_traits[dtype].from_float; + + // we don't support permuted src0 or src1 + GGML_V3_ASSERT(nb00 == ggml_v3_type_size(type)); + GGML_V3_ASSERT(nb10 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_V3_ASSERT(nb0 <= nb1); + GGML_V3_ASSERT(nb1 <= nb2); + GGML_V3_ASSERT(nb2 <= nb3); + + GGML_V3_ASSERT(ggml_v3_is_quantized(src0->type)); + GGML_V3_ASSERT(src1->type == GGML_V3_TYPE_F32); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + float * wdata = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 indices + const int i03 = ir/(ne02*ne01); + const int i02 = (ir - i03*ne02*ne01)/ne01; + const int i01 = (ir - i03*ne02*ne01 - i02*ne01); + + // src1 and dst are same shape as src0 => same indices + const int i13 = i03; + const int i12 = i02; + const int i11 = i01; + + const int i3 = i03; + const int i2 = i02; + const int i1 = i01; + + void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); + float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13)); + void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + 
i3*nb3)); + + assert(ne00 % 32 == 0); + + // unquantize row from src0 to temp buffer + dequantize_row_q(src0_row, wdata, ne00); + // add src1 + ggml_v3_vec_acc_f32(ne00, wdata, src1_row); + // quantize row to dst + if (quantize_row_q != NULL) { + quantize_row_q(wdata, dst_row, ne00); + } else { + memcpy(dst_row, wdata, ne0*nb0); + } + } +} + +static void ggml_v3_compute_forward_add( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_add_f32(params, src0, src1, dst); + } break; + case GGML_V3_TYPE_F16: + { + if (src1->type == GGML_V3_TYPE_F16) { + ggml_v3_compute_forward_add_f16_f16(params, src0, src1, dst); + } + else if (src1->type == GGML_V3_TYPE_F32) { + ggml_v3_compute_forward_add_f16_f32(params, src0, src1, dst); + } + else { + GGML_V3_ASSERT(false); + } + } break; + case GGML_V3_TYPE_Q4_0: + case GGML_V3_TYPE_Q4_1: + case GGML_V3_TYPE_Q5_0: + case GGML_V3_TYPE_Q5_1: + case GGML_V3_TYPE_Q8_0: + case GGML_V3_TYPE_Q2_K: + case GGML_V3_TYPE_Q3_K: + case GGML_V3_TYPE_Q4_K: + case GGML_V3_TYPE_Q5_K: + case GGML_V3_TYPE_Q6_K: + case GGML_V3_TYPE_IQ2_XXS: + case GGML_V3_TYPE_IQ2_XS: + { + ggml_v3_compute_forward_add_q_f32(params, src0, src1, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_add1 + +static void ggml_v3_compute_forward_add1_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(ggml_v3_are_same_shape(src0, dst)); + GGML_V3_ASSERT(ggml_v3_is_scalar(src1)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_v3_nrows(src0); + + GGML_V3_TENSOR_UNARY_OP_LOCALS + + GGML_V3_ASSERT( nb0 == sizeof(float)); + GGML_V3_ASSERT(nb00 == sizeof(float)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + +#ifdef GGML_USE_ACCELERATE + UNUSED(ggml_v3_vec_add1_f32); + + vDSP_vadd( + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, + (float *) ((char *) src1->data), 0, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, + ne0); +#else + ggml_v3_vec_add1_f32(ne0, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), + *(float *) src1->data); +#endif + } +} + +static void ggml_v3_compute_forward_add1_f16_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(ggml_v3_are_same_shape(src0, dst)); + GGML_V3_ASSERT(ggml_v3_is_scalar(src1)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + // scalar to add + const float v = *(float *) src1->data; + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_v3_nrows(src0); + + GGML_V3_TENSOR_UNARY_OP_LOCALS + + GGML_V3_ASSERT(src0->type == 
GGML_V3_TYPE_F16); + GGML_V3_ASSERT(src1->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT(dst->type == GGML_V3_TYPE_F16); + + GGML_V3_ASSERT( nb0 == sizeof(ggml_v3_fp16_t)); + GGML_V3_ASSERT(nb00 == sizeof(ggml_v3_fp16_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + ggml_v3_fp16_t * dst_ptr = (ggml_v3_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); + ggml_v3_fp16_t * src0_ptr = (ggml_v3_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_V3_FP32_TO_FP16(GGML_V3_FP16_TO_FP32(src0_ptr[i]) + v); + } + } +} + +static void ggml_v3_compute_forward_add1_f16_f16( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(ggml_v3_are_same_shape(src0, dst)); + GGML_V3_ASSERT(ggml_v3_is_scalar(src1)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + // scalar to add + const float v = GGML_V3_FP16_TO_FP32(*(ggml_v3_fp16_t *) src1->data); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_v3_nrows(src0); + + GGML_V3_TENSOR_UNARY_OP_LOCALS + + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F16); + GGML_V3_ASSERT(src1->type == GGML_V3_TYPE_F16); + GGML_V3_ASSERT(dst->type == GGML_V3_TYPE_F16); + + GGML_V3_ASSERT( nb0 == sizeof(ggml_v3_fp16_t)); + GGML_V3_ASSERT(nb00 == sizeof(ggml_v3_fp16_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + ggml_v3_fp16_t * dst_ptr = (ggml_v3_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); + ggml_v3_fp16_t * src0_ptr = (ggml_v3_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_V3_FP32_TO_FP16(GGML_V3_FP16_TO_FP32(src0_ptr[i]) + v); + } + } +} + +static void ggml_v3_compute_forward_add1_q_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(ggml_v3_are_same_shape(src0, dst)); + GGML_V3_ASSERT(ggml_v3_is_scalar(src1)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + // scalar to add + const float v = *(float *) src1->data; + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_v3_nrows(src0); + + GGML_V3_TENSOR_UNARY_OP_LOCALS + + const enum ggml_v3_type type = src0->type; + ggml_v3_to_float_t const dequantize_row_q = type_traits[type].to_float; + ggml_v3_from_float_t const quantize_row_q = type_traits[type].from_float; + + // we don't support permuted src0 + GGML_V3_ASSERT(nb00 == ggml_v3_type_size(type)); + + // dst cannot be transposed or permuted + GGML_V3_ASSERT(nb0 <= nb1); + GGML_V3_ASSERT(nb1 <= nb2); + GGML_V3_ASSERT(nb2 <= nb3); + + 
GGML_V3_ASSERT(ggml_v3_is_quantized(src0->type)); + GGML_V3_ASSERT(dst->type == src0->type); + GGML_V3_ASSERT(src1->type == GGML_V3_TYPE_F32); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + float * wdata = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32) * ith; + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + void * src0_row = (void *) ((char *) src0->data + (i1*nb01 + i2*nb02 + i3*nb03)); + void * dst_row = (void *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb0 )); + + assert(ne0 % 32 == 0); + + // unquantize row from src0 to temp buffer + dequantize_row_q(src0_row, wdata, ne0); + // add src1 + ggml_v3_vec_acc1_f32(ne0, wdata, v); + // quantize row to dst + quantize_row_q(wdata, dst_row, ne0); + } +} + +static void ggml_v3_compute_forward_add1( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_add1_f32(params, src0, src1, dst); + } break; + case GGML_V3_TYPE_F16: + { + if (src1->type == GGML_V3_TYPE_F16) { + ggml_v3_compute_forward_add1_f16_f16(params, src0, src1, dst); + } + else if (src1->type == GGML_V3_TYPE_F32) { + ggml_v3_compute_forward_add1_f16_f32(params, src0, src1, dst); + } + else { + GGML_V3_ASSERT(false); + } + } break; + case GGML_V3_TYPE_Q4_0: + case GGML_V3_TYPE_Q4_1: + case GGML_V3_TYPE_Q5_0: + case GGML_V3_TYPE_Q5_1: + case GGML_V3_TYPE_Q8_0: + case GGML_V3_TYPE_Q8_1: + case GGML_V3_TYPE_Q2_K: + case GGML_V3_TYPE_Q3_K: + case GGML_V3_TYPE_Q4_K: + case GGML_V3_TYPE_Q5_K: + case GGML_V3_TYPE_Q6_K: + case GGML_V3_TYPE_IQ2_XXS: + case GGML_V3_TYPE_IQ2_XS: + { + ggml_v3_compute_forward_add1_q_f32(params, src0, src1, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_acc + +static void ggml_v3_compute_forward_acc_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(ggml_v3_are_same_shape(src0, dst)); + GGML_V3_ASSERT(ggml_v3_is_contiguous(dst) && ggml_v3_is_contiguous(src0)); + + // view src0 and dst with these strides and data offset inbytes during acc + // nb0 is implicitly element_size because src0 and dst are contiguous + size_t nb1 = ((int32_t *) dst->op_params)[0]; + size_t nb2 = ((int32_t *) dst->op_params)[1]; + size_t nb3 = ((int32_t *) dst->op_params)[2]; + size_t offset = ((int32_t *) dst->op_params)[3]; + bool inplace = (bool) ((int32_t *) dst->op_params)[4]; + + if (!inplace && (params->type == GGML_V3_TASK_INIT)) { + // memcpy needs to be synchronized across threads to avoid race conditions. 
+ // => do it in INIT phase + memcpy( + ((char *) dst->data), + ((char *) src0->data), + ggml_v3_nbytes(dst)); + } + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_v3_nrows(src1); + const int nc = src1->ne[0]; + + GGML_V3_TENSOR_LOCALS(int64_t, ne1, src1, ne) + GGML_V3_TENSOR_LOCALS(size_t, nb1, src1, nb) + + // src0 and dst as viewed during acc + const size_t nb0 = ggml_v3_element_size(src0); + + const size_t nb00 = nb0; + const size_t nb01 = nb1; + const size_t nb02 = nb2; + const size_t nb03 = nb3; + + GGML_V3_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb0 + (ne11 == 0 ? 0 : ne11-1)*nb1 + (ne12 == 0 ? 0 : ne12-1)*nb2 + (ne13 == 0 ? 0 : ne13-1)*nb3 < ggml_v3_nbytes(dst)); + GGML_V3_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb00 + (ne11 == 0 ? 0 : ne11-1)*nb01 + (ne12 == 0 ? 0 : ne12-1)*nb02 + (ne13 == 0 ? 0 : ne13-1)*nb03 < ggml_v3_nbytes(src0)); + + GGML_V3_ASSERT(nb10 == sizeof(float)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are viewed with shape of src1 and offset + // => same indices + const int i3 = ir/(ne12*ne11); + const int i2 = (ir - i3*ne12*ne11)/ne11; + const int i1 = (ir - i3*ne12*ne11 - i2*ne11); + +#ifdef GGML_USE_ACCELERATE + vDSP_vadd( + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset), 1, + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), 1, nc); +#else + ggml_v3_vec_add_f32(nc, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset), + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); +#endif + } +} + +static void ggml_v3_compute_forward_acc( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_acc_f32(params, src0, src1, dst); + } break; + case GGML_V3_TYPE_F16: + case GGML_V3_TYPE_Q4_0: + case GGML_V3_TYPE_Q4_1: + case GGML_V3_TYPE_Q5_0: + case GGML_V3_TYPE_Q5_1: + case GGML_V3_TYPE_Q8_0: + case GGML_V3_TYPE_Q8_1: + case GGML_V3_TYPE_Q2_K: + case GGML_V3_TYPE_Q3_K: + case GGML_V3_TYPE_Q4_K: + case GGML_V3_TYPE_Q5_K: + case GGML_V3_TYPE_Q6_K: + case GGML_V3_TYPE_IQ2_XXS: + case GGML_V3_TYPE_IQ2_XS: + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_sub + +static void ggml_v3_compute_forward_sub_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + assert(params->ith == 0); + assert(ggml_v3_are_same_shape(src0, src1) && ggml_v3_are_same_shape(src0, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int nr = ggml_v3_nrows(src0); + + GGML_V3_TENSOR_BINARY_OP_LOCALS + + GGML_V3_ASSERT( nb0 == sizeof(float)); + GGML_V3_ASSERT(nb00 == sizeof(float)); + + if (nb10 == sizeof(float)) { + for (int ir = 0; ir < nr; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - 
i3*ne2*ne1 - i2*ne1); + +#ifdef GGML_USE_ACCELERATE + vDSP_vsub( + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, + ne0); +#else + ggml_v3_vec_sub_f32(ne0, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); +#endif + // } + // } + } + } else { + // src1 is not contiguous + for (int ir = 0; ir < nr; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + for (int i0 = 0; i0 < ne0; i0++) { + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10); + + dst_ptr[i0] = src0_ptr[i0] - *src1_ptr; + } + } + } +} + +static void ggml_v3_compute_forward_sub( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_sub_f32(params, src0, src1, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_mul + +static void ggml_v3_compute_forward_mul_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(ggml_v3_can_repeat(src1, src0) && ggml_v3_are_same_shape(src0, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + const int ith = params->ith; + const int nth = params->nth; + +#ifdef GGML_USE_CLBLAST + if (src1->backend == GGML_V3_BACKEND_GPU) { + // TODO: OpenCL kernel support full broadcast + GGML_V3_ASSERT(ggml_v3_can_repeat_rows(src1, src0)); + if (ith == 0) { + ggml_v3_cl_mul(src0, src1, dst); + } + return; + } +#endif + + const int64_t nr = ggml_v3_nrows(src0); + + GGML_V3_TENSOR_BINARY_OP_LOCALS + + GGML_V3_ASSERT( nb0 == sizeof(float)); + GGML_V3_ASSERT(nb00 == sizeof(float)); + + if (nb10 == sizeof(float)) { + for (int64_t ir = ith; ir < nr; ir += nth) { + // src0 and dst are same shape => same indices + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + const int64_t nr0 = ne00 / ne10; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + + for (int64_t r = 0 ; r < nr0; ++r) { +#ifdef GGML_USE_ACCELERATE + UNUSED(ggml_v3_vec_mul_f32); + + vDSP_vmul(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10); +#else + ggml_v3_vec_mul_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); +#endif + } + } + } else { + // src1 is not contiguous + for (int64_t ir = ith; ir < nr; ir += nth) { + // src0 and dst are same shape => same indices + // src1 is broadcastable across src0 and dst in i1, 
i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + + for (int64_t i0 = 0; i0 < ne00; ++i0) { + const int64_t i10 = i0 % ne10; + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); + + dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr); + } + } + } +} + +static void ggml_v3_compute_forward_mul( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(src1->type == GGML_V3_TYPE_F32 && "only f32 src1 supported for now"); + + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_mul_f32(params, src0, src1, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_div + +static void ggml_v3_compute_forward_div_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(ggml_v3_can_repeat(src1, src0) && ggml_v3_are_same_shape(src0, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t nr = ggml_v3_nrows(src0); + + GGML_V3_TENSOR_BINARY_OP_LOCALS + + GGML_V3_ASSERT( nb0 == sizeof(float)); + GGML_V3_ASSERT(nb00 == sizeof(float)); + + if (nb10 == sizeof(float)) { + for (int64_t ir = ith; ir < nr; ir += nth) { + // src0 and dst are same shape => same indices + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + const int64_t nr0 = ne00 / ne10; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + + for (int64_t r = 0; r < nr0; ++r) { +#ifdef GGML_USE_ACCELERATE + UNUSED(ggml_v3_vec_div_f32); + + vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10); +#else + ggml_v3_vec_div_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); +#endif + } + } + } else { + // src1 is not contiguous + for (int64_t ir = ith; ir < nr; ir += nth) { + // src0 and dst are same shape => same indices + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + + for (int64_t i0 = 0; i0 < ne00; ++i0) { + const int64_t i10 = i0 % ne10; + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); + + dst_ptr[i0] = src0_ptr[i0] / 
(*src1_ptr); + } + } + } +} + +static void ggml_v3_compute_forward_div( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_div_f32(params, src0, src1, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_sqr + +static void ggml_v3_compute_forward_sqr_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + assert(params->ith == 0); + assert(ggml_v3_are_same_shape(src0, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int n = ggml_v3_nrows(src0); + const int nc = src0->ne[0]; + + assert( dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_v3_vec_sqr_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_v3_compute_forward_sqr( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_sqr_f32(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_sqrt + +static void ggml_v3_compute_forward_sqrt_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + assert(params->ith == 0); + assert(ggml_v3_are_same_shape(src0, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int n = ggml_v3_nrows(src0); + const int nc = src0->ne[0]; + + assert( dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_v3_vec_sqrt_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_v3_compute_forward_sqrt( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_sqrt_f32(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_log + +static void ggml_v3_compute_forward_log_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(params->ith == 0); + GGML_V3_ASSERT(ggml_v3_are_same_shape(src0, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int n = ggml_v3_nrows(src0); + const int nc = src0->ne[0]; + + GGML_V3_ASSERT( dst->nb[0] == sizeof(float)); + GGML_V3_ASSERT(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_v3_vec_log_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_v3_compute_forward_log( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_log_f32(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + 
} +} + +// ggml_v3_compute_forward_sum + +static void ggml_v3_compute_forward_sum_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + assert(params->ith == 0); + assert(ggml_v3_is_scalar(dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + assert(ggml_v3_is_scalar(dst)); + assert(src0->nb[0] == sizeof(float)); + + GGML_V3_TENSOR_LOCALS(int64_t, ne0, src0, ne) + GGML_V3_TENSOR_LOCALS(size_t, nb0, src0, nb) + + ggml_v3_float sum = 0; + ggml_v3_float row_sum = 0; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + ggml_v3_vec_sum_f32_ggf(ne00, + &row_sum, + (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03)); + sum += row_sum; + } + } + } + ((float *) dst->data)[0] = sum; +} + +static void ggml_v3_compute_forward_sum_f16( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + assert(params->ith == 0); + assert(ggml_v3_is_scalar(dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + assert(src0->nb[0] == sizeof(ggml_v3_fp16_t)); + + GGML_V3_TENSOR_LOCALS(int64_t, ne0, src0, ne) + GGML_V3_TENSOR_LOCALS(size_t, nb0, src0, nb) + + float sum = 0; + float row_sum = 0; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + ggml_v3_vec_sum_f16_ggf(ne00, + &row_sum, + (ggml_v3_fp16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03)); + sum += row_sum; + } + } + } + ((ggml_v3_fp16_t *) dst->data)[0] = GGML_V3_FP32_TO_FP16(sum); +} + +static void ggml_v3_compute_forward_sum( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_sum_f32(params, src0, dst); + } break; + case GGML_V3_TYPE_F16: + { + ggml_v3_compute_forward_sum_f16(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_sum_rows + +static void ggml_v3_compute_forward_sum_rows_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(params->ith == 0); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + GGML_V3_ASSERT(src0->nb[0] == sizeof(float)); + GGML_V3_ASSERT(dst->nb[0] == sizeof(float)); + + GGML_V3_TENSOR_UNARY_OP_LOCALS + + GGML_V3_ASSERT(ne0 == 1); + GGML_V3_ASSERT(ne1 == ne01); + GGML_V3_ASSERT(ne2 == ne02); + GGML_V3_ASSERT(ne3 == ne03); + + for (int64_t i3 = 0; i3 < ne03; i3++) { + for (int64_t i2 = 0; i2 < ne02; i2++) { + for (int64_t i1 = 0; i1 < ne01; i1++) { + float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03); + float * dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3); + float row_sum = 0; + ggml_v3_vec_sum_f32(ne00, &row_sum, src_row); + dst_row[0] = row_sum; + } + } + } +} + +static void ggml_v3_compute_forward_sum_rows( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_sum_rows_f32(params, src0, dst); + } break; + default: + { + 
GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_mean + +static void ggml_v3_compute_forward_mean_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + assert(params->ith == 0); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + assert(src0->nb[0] == sizeof(float)); + + GGML_V3_TENSOR_UNARY_OP_LOCALS + + assert(ne0 == 1); + assert(ne1 == ne01); + assert(ne2 == ne02); + assert(ne3 == ne03); + + UNUSED(ne0); + UNUSED(ne1); + UNUSED(ne2); + UNUSED(ne3); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + ggml_v3_vec_sum_f32(ne00, + (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), + (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03)); + + *(float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3) /= (float) ne00; + } + } + } +} + +static void ggml_v3_compute_forward_mean( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_mean_f32(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_argmax + +static void ggml_v3_compute_forward_argmax_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + assert(params->ith == 0); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + assert(src0->nb[0] == sizeof(float)); + assert(dst->nb[0] == sizeof(float)); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + + const size_t nb01 = src0->nb[1]; + const size_t nb0 = dst->nb[0]; + + for (int64_t i1 = 0; i1 < ne01; i1++) { + float * src = (float *) ((char *) src0->data + i1*nb01); + int32_t * dst_ = (int32_t *) ((char *) dst->data + i1*nb0); + int v = 0; + ggml_v3_vec_argmax_f32(ne00, &v, src); + dst_[0] = v; + } +} + +static void ggml_v3_compute_forward_argmax( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_argmax_f32(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_repeat + +static void ggml_v3_compute_forward_repeat_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(params->ith == 0); + GGML_V3_ASSERT(ggml_v3_can_repeat(src0, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + GGML_V3_TENSOR_UNARY_OP_LOCALS + + // guaranteed to be an integer due to the check in ggml_v3_can_repeat + const int nr0 = (int)(ne0/ne00); + const int nr1 = (int)(ne1/ne01); + const int nr2 = (int)(ne2/ne02); + const int nr3 = (int)(ne3/ne03); + + // TODO: support for transposed / permuted tensors + GGML_V3_ASSERT(nb0 == sizeof(float)); + GGML_V3_ASSERT(nb00 == sizeof(float)); + + // TODO: maybe this is not optimal? 
+ for (int i3 = 0; i3 < nr3; i3++) { + for (int k3 = 0; k3 < ne03; k3++) { + for (int i2 = 0; i2 < nr2; i2++) { + for (int k2 = 0; k2 < ne02; k2++) { + for (int i1 = 0; i1 < nr1; i1++) { + for (int k1 = 0; k1 < ne01; k1++) { + for (int i0 = 0; i0 < nr0; i0++) { + ggml_v3_vec_cpy_f32(ne00, + (float *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0), + (float *) ((char *) src0->data + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01)); + } + } + } + } + } + } + } +} + +static void ggml_v3_compute_forward_repeat_f16( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(params->ith == 0); + GGML_V3_ASSERT(ggml_v3_can_repeat(src0, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + GGML_V3_TENSOR_UNARY_OP_LOCALS + + // guaranteed to be an integer due to the check in ggml_v3_can_repeat + const int nr0 = (int)(ne0/ne00); + const int nr1 = (int)(ne1/ne01); + const int nr2 = (int)(ne2/ne02); + const int nr3 = (int)(ne3/ne03); + + // TODO: support for transposed / permuted tensors + GGML_V3_ASSERT(nb0 == sizeof(ggml_v3_fp16_t)); + GGML_V3_ASSERT(nb00 == sizeof(ggml_v3_fp16_t)); + + // TODO: maybe this is not optimal? + for (int i3 = 0; i3 < nr3; i3++) { + for (int k3 = 0; k3 < ne03; k3++) { + for (int i2 = 0; i2 < nr2; i2++) { + for (int k2 = 0; k2 < ne02; k2++) { + for (int i1 = 0; i1 < nr1; i1++) { + for (int k1 = 0; k1 < ne01; k1++) { + for (int i0 = 0; i0 < nr0; i0++) { + ggml_v3_fp16_t * y = (ggml_v3_fp16_t *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0); + ggml_v3_fp16_t * x = (ggml_v3_fp16_t *) ((char *) src0->data + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01); + // ggml_v3_vec_cpy_f16(ne00, y, x) + for (int i = 0; i < ne00; ++i) { + y[i] = x[i]; + } + } + } + } + } + } + } + } +} + +static void ggml_v3_compute_forward_repeat( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F16: + case GGML_V3_TYPE_I16: + { + ggml_v3_compute_forward_repeat_f16(params, src0, dst); + } break; + case GGML_V3_TYPE_F32: + case GGML_V3_TYPE_I32: + { + ggml_v3_compute_forward_repeat_f32(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_repeat_back + +static void ggml_v3_compute_forward_repeat_back_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(params->ith == 0); + GGML_V3_ASSERT(ggml_v3_can_repeat(dst, src0)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + GGML_V3_TENSOR_UNARY_OP_LOCALS + + // guaranteed to be an integer due to the check in ggml_v3_can_repeat + const int nr0 = (int)(ne00/ne0); + const int nr1 = (int)(ne01/ne1); + const int nr2 = (int)(ne02/ne2); + const int nr3 = (int)(ne03/ne3); + + // TODO: support for transposed / permuted tensors + GGML_V3_ASSERT(nb0 == sizeof(float)); + GGML_V3_ASSERT(nb00 == sizeof(float)); + + if (ggml_v3_is_contiguous(dst)) { + ggml_v3_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + } else { + for (int k3 = 0; k3 < ne3; k3++) { + for (int k2 = 0; k2 < ne2; k2++) { + for (int k1 = 0; k1 < ne1; k1++) { + ggml_v3_vec_set_f32(ne0, + (float *) ((char *) dst->data + k1*nb1 + k2*nb2 + k3*nb3), + 0); + } + } + } + } + + // TODO: 
maybe this is not optimal? + for (int i3 = 0; i3 < nr3; i3++) { + for (int k3 = 0; k3 < ne3; k3++) { + for (int i2 = 0; i2 < nr2; i2++) { + for (int k2 = 0; k2 < ne2; k2++) { + for (int i1 = 0; i1 < nr1; i1++) { + for (int k1 = 0; k1 < ne1; k1++) { + for (int i0 = 0; i0 < nr0; i0++) { + ggml_v3_vec_acc_f32(ne0, + (float *) ((char *) dst->data + ( k3)*nb3 + ( k2)*nb2 + ( k1)*nb1), + (float *) ((char *) src0->data + (i3*ne3 + k3)*nb03 + (i2*ne2 + k2)*nb02 + (i1*ne1 + k1)*nb01 + (i0*ne0)*nb00)); + } + } + } + } + } + } + } +} + +static void ggml_v3_compute_forward_repeat_back( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_repeat_back_f32(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_concat + +static void ggml_v3_compute_forward_concat_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + GGML_V3_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + GGML_V3_TENSOR_BINARY_OP_LOCALS + + // TODO: support for transposed / permuted tensors + GGML_V3_ASSERT(nb0 == sizeof(float)); + GGML_V3_ASSERT(nb00 == sizeof(float)); + GGML_V3_ASSERT(nb10 == sizeof(float)); + + for (int i3 = 0; i3 < ne3; i3++) { + for (int i2 = ith; i2 < ne2; i2 += nth) { + if (i2 < ne02) { // src0 + for (int i1 = 0; i1 < ne1; i1++) { + for (int i0 = 0; i0 < ne0; i0++) { + const float * x = (float *)((char *) src0->data + i0 * nb00 + i1 * nb01 + i2 * nb02 + i3 * nb03); + + float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3); + *y = *x; + } + } + } // src1 + else { + for (int i1 = 0; i1 < ne1; i1++) { + for (int i0 = 0; i0 < ne0; i0++) { + const float * x = (float *)((char *) src1->data + i0 * nb10 + i1 * nb11 + (i2 - ne02) * nb12 + i3 * nb13); + + float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3); + *y = *x; + } + } + } + } + } +} + +static void ggml_v3_compute_forward_concat( + const struct ggml_v3_compute_params* params, + const struct ggml_v3_tensor* src0, + const struct ggml_v3_tensor* src1, + struct ggml_v3_tensor* dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + case GGML_V3_TYPE_I32: + { + ggml_v3_compute_forward_concat_f32(params, src0, src1, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_abs + +static void ggml_v3_compute_forward_abs_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + assert(params->ith == 0); + assert(ggml_v3_are_same_shape(src0, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int n = ggml_v3_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_v3_vec_abs_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_v3_compute_forward_abs( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + 
switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_abs_f32(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_sgn + +static void ggml_v3_compute_forward_sgn_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + assert(params->ith == 0); + assert(ggml_v3_are_same_shape(src0, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int n = ggml_v3_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_v3_vec_sgn_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_v3_compute_forward_sgn( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_sgn_f32(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_neg + +static void ggml_v3_compute_forward_neg_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + assert(params->ith == 0); + assert(ggml_v3_are_same_shape(src0, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int n = ggml_v3_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_v3_vec_neg_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_v3_compute_forward_neg( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_neg_f32(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_step + +static void ggml_v3_compute_forward_step_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + assert(params->ith == 0); + assert(ggml_v3_are_same_shape(src0, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int n = ggml_v3_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_v3_vec_step_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_v3_compute_forward_step( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_step_f32(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_tanh + +static void ggml_v3_compute_forward_tanh_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + assert(params->ith == 0); + 
assert(ggml_v3_are_same_shape(src0, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int n = ggml_v3_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_v3_vec_tanh_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_v3_compute_forward_tanh( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_tanh_f32(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_elu + +static void ggml_v3_compute_forward_elu_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + assert(params->ith == 0); + assert(ggml_v3_are_same_shape(src0, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int n = ggml_v3_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_v3_vec_elu_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_v3_compute_forward_elu( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_elu_f32(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_relu + +static void ggml_v3_compute_forward_relu_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + assert(params->ith == 0); + assert(ggml_v3_are_same_shape(src0, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int n = ggml_v3_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_v3_vec_relu_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_v3_compute_forward_relu( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_relu_f32(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_gelu + +static void ggml_v3_compute_forward_gelu_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(ggml_v3_is_contiguous_except_dim_1(src0)); + GGML_V3_ASSERT(ggml_v3_is_contiguous_except_dim_1(dst)); + GGML_V3_ASSERT(ggml_v3_are_same_shape(src0, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_v3_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; 
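+    // e.g. nr = 10 rows split across nth = 4 threads gives dr = 3, so thread ith
+    // covers rows [3*ith, MIN(3*ith + 3, 10)): threads 0..3 handle 3, 3, 3 and 1 rows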
+ + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_v3_vec_gelu_f32(nc, + (float *) ((char *) dst->data + i1*( dst->nb[1])), + (float *) ((char *) src0->data + i1*(src0->nb[1]))); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + UNUSED(x); + assert(!isnan(x)); + assert(!isinf(x)); + } +#endif + } +} + +static void ggml_v3_compute_forward_gelu( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_gelu_f32(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_gelu_quick + +static void ggml_v3_compute_forward_gelu_quick_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(ggml_v3_is_contiguous_except_dim_1(src0)); + GGML_V3_ASSERT(ggml_v3_is_contiguous_except_dim_1(dst)); + GGML_V3_ASSERT(ggml_v3_are_same_shape(src0, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_v3_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_v3_vec_gelu_quick_f32(nc, + (float *) ((char *) dst->data + i1*( dst->nb[1])), + (float *) ((char *) src0->data + i1*(src0->nb[1]))); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + UNUSED(x); + assert(!isnan(x)); + assert(!isinf(x)); + } +#endif + } +} + +static void ggml_v3_compute_forward_gelu_quick( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_gelu_quick_f32(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_silu + +static void ggml_v3_compute_forward_silu_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(ggml_v3_is_contiguous_except_dim_1(src0)); + GGML_V3_ASSERT(ggml_v3_is_contiguous_except_dim_1(dst)); + GGML_V3_ASSERT(ggml_v3_are_same_shape(src0, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_v3_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_v3_vec_silu_f32(nc, + (float *) ((char *) dst->data + i1*( dst->nb[1])), + (float *) ((char *) src0->data + i1*(src0->nb[1]))); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k]; + UNUSED(x); + assert(!isnan(x)); + assert(!isinf(x)); + } +#endif + } +} + +static void ggml_v3_compute_forward_silu( + const struct ggml_v3_compute_params * params, 
+ const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_silu_f32(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} +// ggml_v3_compute_forward_leaky_relu + +static void ggml_v3_compute_forward_leaky_relu_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + assert(params->ith == 0); + assert(ggml_v3_are_same_shape(src0, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int n = ggml_v3_nrows(src0); + const int nc = src0->ne[0]; + + float negative_slope; + memcpy(&negative_slope, dst->op_params, sizeof(float)); + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_v3_vec_leaky_relu_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1])), negative_slope); + } +} + +static void ggml_v3_compute_forward_leaky_relu( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_leaky_relu_f32(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_silu_back + +static void ggml_v3_compute_forward_silu_back_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * grad, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(ggml_v3_is_contiguous_except_dim_1(grad)); + GGML_V3_ASSERT(ggml_v3_is_contiguous_except_dim_1(src0)); + GGML_V3_ASSERT(ggml_v3_is_contiguous_except_dim_1(dst)); + GGML_V3_ASSERT(ggml_v3_are_same_shape(src0, dst)); + GGML_V3_ASSERT(ggml_v3_are_same_shape(src0, grad)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_v3_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_v3_vec_silu_backward_f32(nc, + (float *) ((char *) dst->data + i1*( dst->nb[1])), + (float *) ((char *) src0->data + i1*(src0->nb[1])), + (float *) ((char *) grad->data + i1*(grad->nb[1]))); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + UNUSED(x); + assert(!isnan(x)); + assert(!isinf(x)); + } +#endif + } +} + +static void ggml_v3_compute_forward_silu_back( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * grad, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_silu_back_f32(params, src0, grad, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_norm + +static void ggml_v3_compute_forward_norm_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(ggml_v3_are_same_shape(src0, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + 
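+    // per row of ne00 elements: y = (x - mean(x)) / sqrt(var(x) + eps)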
GGML_V3_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + GGML_V3_TENSOR_UNARY_OP_LOCALS + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + GGML_V3_ASSERT(eps > 0.0f); + + // TODO: optimize + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ith; i01 < ne01; i01 += nth) { + const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + + ggml_v3_float sum = 0.0; + for (int64_t i00 = 0; i00 < ne00; i00++) { + sum += (ggml_v3_float)x[i00]; + } + + float mean = sum/ne00; + + float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); + + ggml_v3_float sum2 = 0.0; + for (int64_t i00 = 0; i00 < ne00; i00++) { + float v = x[i00] - mean; + y[i00] = v; + sum2 += (ggml_v3_float)(v*v); + } + + float variance = sum2/ne00; + const float scale = 1.0f/sqrtf(variance + eps); + + ggml_v3_vec_scale_f32(ne00, y, scale); + } + } + } +} + +static void ggml_v3_compute_forward_norm( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_norm_f32(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_group_rms_norm + +static void ggml_v3_compute_forward_rms_norm_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(ggml_v3_are_same_shape(src0, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + GGML_V3_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + GGML_V3_TENSOR_UNARY_OP_LOCALS + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + GGML_V3_ASSERT(eps > 0.0f); + + // TODO: optimize + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ith; i01 < ne01; i01 += nth) { + const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + + ggml_v3_float sum = 0.0; + for (int64_t i00 = 0; i00 < ne00; i00++) { + sum += (ggml_v3_float)(x[i00] * x[i00]); + } + + const float mean = sum/ne00; + + float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); + + memcpy(y, x, ne00 * sizeof(float)); + // for (int i00 = 0; i00 < ne00; i00++) { + // y[i00] = x[i00]; + // } + + const float scale = 1.0f/sqrtf(mean + eps); + + ggml_v3_vec_scale_f32(ne00, y, scale); + } + } + } +} + +static void ggml_v3_compute_forward_rms_norm( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_rms_norm_f32(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +static void ggml_v3_compute_forward_rms_norm_back_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(ggml_v3_are_same_shape(src0, dst) && ggml_v3_are_same_shape(src0, src1)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + GGML_V3_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + 
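+    // note: the step-by-step derivation in the loop body below collapses to
+    //   dx = rrms * (dz - x * sum_xdz / (sum_xx + ne00*eps))
+    // which is what the final vec_cpy/vec_scale/vec_acc/vec_scale sequence computes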
GGML_V3_TENSOR_BINARY_OP_LOCALS + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + // TODO: optimize + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ith; i01 < ne01; i01 += nth) { + // src1 is same shape as src0 => same indices + const int64_t i11 = i01; + const int64_t i12 = i02; + const int64_t i13 = i03; + + const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + const float * dz = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13); + + ggml_v3_float sum_xx = 0.0; + ggml_v3_float sum_xdz = 0.0; + + for (int64_t i00 = 0; i00 < ne00; i00++) { + sum_xx += (ggml_v3_float)(x[i00] * x[i00]); + sum_xdz += (ggml_v3_float)(x[i00] * dz[i00]); + } + + //const float mean = (float)(sum_xx)/ne00; + const float mean_eps = (float)(sum_xx)/ne00 + eps; + const float sum_eps = (float)(sum_xx) + eps*ne00; + //const float mean_xdz = (float)(sum_xdz)/ne00; + // we could cache rms from forward pass to improve performance. + // to do this implement ggml_v3_rms and compose ggml_v3_rms_norm using ggml_v3_rms. + //const float rms = sqrtf(mean_eps); + const float rrms = 1.0f / sqrtf(mean_eps); + //const float scale = -rrms/(ne00 * mean_eps); // -1/(n*rms**3) + + { + // z = rms_norm(x) + // + // rms_norm(src0) = + // scale( + // src0, + // div( + // 1, + // sqrt( + // add( + // scale( + // sum( + // sqr( + // src0)), + // (1.0/N)), + // eps)))); + + // postorder: + // ## op args grad + // 00 param src0 grad[#00] + // 01 const 1 + // 02 sqr (#00) grad[#02] + // 03 sum (#02) grad[#03] + // 04 const 1/N + // 05 scale (#03, #04) grad[#05] + // 06 const eps + // 07 add (#05, #06) grad[#07] + // 08 sqrt (#07) grad[#08] + // 09 div (#01,#08) grad[#09] + // 10 scale (#00,#09) grad[#10] + // + // backward pass, given grad[#10] + // #10: scale + // grad[#00] += scale(grad[#10],#09) + // grad[#09] += sum(mul(grad[#10],#00)) + // #09: div + // grad[#08] += neg(mul(grad[#09], div(#09,#08))) + // #08: sqrt + // grad[#07] += mul(grad[#08], div(0.5, #08)) + // #07: add + // grad[#05] += grad[#07] + // #05: scale + // grad[#03] += scale(grad[#05],#04) + // #03: sum + // grad[#02] += repeat(grad[#03], #02) + // #02: + // grad[#00] += scale(mul(#00, grad[#02]), 2.0) + // + // substitute and simplify: + // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, grad[#02]), 2.0) + // grad[#02] = repeat(grad[#03], #02) + // grad[#02] = repeat(scale(grad[#05],#04), #02) + // grad[#02] = repeat(scale(grad[#07],#04), #02) + // grad[#02] = repeat(scale(mul(grad[#08], div(0.5, #08)),#04), #02) + // grad[#02] = repeat(scale(mul(neg(mul(grad[#09], div(#09,#08))), div(0.5, #08)),#04), #02) + // grad[#02] = repeat(scale(mul(neg(mul(sum(mul(grad[#10],#00)), div(#09,#08))), div(0.5, #08)),#04), #02) + // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(#09,#08) * div(0.5, #08) * (1/N)), #02) + // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(div(#01,#08),#08) * div(0.5, #08) * (1/N)), #02) + // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(1,#08*#08) * div(0.5, #08) * (1/N)), #02) + // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N)), #02) + // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, grad[#02]), 2.0) + // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, repeat(-(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N)), #02)), 2.0) + // grad[#00] = scale(grad(#10), #09) + scale(scale(#00, -(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N))), 2.0) + // 
grad[#00] = scale(grad(#10), #09) + scale(#00, -(sum(mul(grad[#10],#00)) * div(1,#07) * div(1,#08) * (1/N))) + // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,#07*#08) * (-1/N)) + // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,#07*#08) * (-1/N)) + // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,mean_eps*rms) * (-1/N)) + // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*mean_eps)) + // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*(sum_xx/N+eps))) + // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*sum_xx+rms*N*eps)) + // grad[#00] = scale(dz, rrms) + scale(x, sum(mul(dz,x)) * div(-1,rms*N*mean_eps)) + // grad[#00] = scale(dz, rrms) + scale(x, sum_xdz * div(-1,rms*N*mean_eps)) + // a = b*c + d*e + // a = b*c*f/f + d*e*f/f + // a = (b*c*f + d*e*f)*(1/f) + // a = (b*c*(1/c) + d*e*(1/c))*(1/(1/c)) + // a = (b + d*e/c)*c + // b = dz, c = rrms, d = x, e = sum_xdz * div(-1,rms*N*mean_eps) + // a = (dz + x*sum_xdz * div(-1,rms*N*mean_eps)/rrms)*rrms + // a = (dz + x*sum_xdz * div(-1,rms*N*mean_eps)*rms)*rrms + // a = (dz + x*sum_xdz * div(-rms,rms*N*mean_eps))*rrms + // a = (dz + x*sum_xdz * div(-1,N*mean_eps))*rrms + // a = (dz + x*div(-sum_xdz,N*mean_eps))*rrms + // a = (dz + x*div(-mean_xdz,mean_eps))*rrms + // grad[#00] = scale(dz + scale(x, div(-mean_xdz,mean_eps)),rrms) + // grad[#00] = scale(dz + scale(x, -mean_xdz/mean_eps),rrms) + // dx = scale(dz + scale(x, -mean_xdz/mean_eps),rrms) + } + // dx = scale(dz + scale(x, -mean_xdz/mean_eps),rrms) + // post-order: + // dx := x + // dx := scale(dx,-mean_xdz/mean_eps) + // dx := add(dx, dz) + // dx := scale(dx, rrms) + float * dx = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); + + ggml_v3_vec_cpy_f32 (ne00, dx, x); + // ggml_v3_vec_scale_f32(ne00, dx, -mean_xdz/mean_eps); + ggml_v3_vec_scale_f32(ne00, dx, (float)(-sum_xdz)/sum_eps); + ggml_v3_vec_acc_f32 (ne00, dx, dz); + ggml_v3_vec_scale_f32(ne00, dx, rrms); + } + } + } +} + +static void ggml_v3_compute_forward_rms_norm_back( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_rms_norm_back_f32(params, src0, src1, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_group_norm + +static void ggml_v3_compute_forward_group_norm_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(ggml_v3_are_same_shape(src0, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + GGML_V3_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + GGML_V3_TENSOR_UNARY_OP_LOCALS + + const float eps = 1e-6f; // TODO: make this a parameter + + // TODO: optimize + + int n_channels = src0->ne[2]; + int n_groups = dst->op_params[0]; + int n_channels_per_group = (n_channels + n_groups - 1) / n_groups; + for (int i = ith; i < n_groups; i+=nth) { + int start = i * n_channels_per_group; + int end = start + n_channels_per_group; + if (end > n_channels) { + end = n_channels; + } + int step = end - start; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + ggml_v3_float sum = 
0.0; + for (int64_t i02 = start; i02 < end; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03); + + for (int64_t i00 = 0; i00 < ne00; i00++) { + sum += (ggml_v3_float)x[i00]; + } + } + } + float mean = sum / (ne00 * ne01 * step); + ggml_v3_float sum2 = 0.0; + + for (int64_t i02 = start; i02 < end; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03); + + float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3); + + for (int64_t i00 = 0; i00 < ne00; i00++) { + float v = x[i00] - mean; + y[i00] = v; + sum2 += (ggml_v3_float)(v * v); + } + } + } + float variance = sum2 / (ne00 * ne01 * step); + const float scale = 1.0f / sqrtf(variance + eps); + + for (int64_t i02 = start; i02 < end; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3); + ggml_v3_vec_scale_f32(ne00, y, scale); + } + } + } + } +} + +static void ggml_v3_compute_forward_group_norm( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_group_norm_f32(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_mul_mat + +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +// helper function to determine if it is better to use BLAS or not +// for large matrices, BLAS is faster +static bool ggml_v3_compute_forward_mul_mat_use_blas(struct ggml_v3_tensor * dst) { + const struct ggml_v3_tensor * src0 = dst->src[0]; + const struct ggml_v3_tensor * src1 = dst->src[1]; + + //const int64_t ne00 = src0->ne[0]; + //const int64_t ne01 = src0->ne[1]; + + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + + // NOTE: with GGML_V3_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float) + // all the experts for each batch element and the processing would become incredibly slow + // TODO: find the optimal values for these + if (dst->op != GGML_V3_OP_MUL_MAT_ID && + ggml_v3_is_contiguous(src0) && + ggml_v3_is_contiguous(src1) && + //src0->type == GGML_V3_TYPE_F32 && + src1->type == GGML_V3_TYPE_F32 && + (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { + + /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ + return true; + } + + return false; +} +#endif + +static void ggml_v3_compute_forward_mul_mat( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + int64_t t0 = ggml_v3_perf_time_us(); + UNUSED(t0); + + GGML_V3_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + if (ith == 1 && g_imatrix_collect_v3) { + g_imatrix_collect_v3(src0, src1); + } + + const enum ggml_v3_type type = src0->type; + + const bool src1_cont = ggml_v3_is_contiguous(src1); + + ggml_v3_vec_dot_t const vec_dot = type_traits[type].vec_dot; + enum ggml_v3_type const vec_dot_type = type_traits[type].vec_dot_type; + ggml_v3_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; + + GGML_V3_ASSERT(ne0 == ne01); + GGML_V3_ASSERT(ne1 == ne11); + GGML_V3_ASSERT(ne2 == ne12); + GGML_V3_ASSERT(ne3 == ne13); + + // we don't 
support permuted src0 or src1 + GGML_V3_ASSERT(nb00 == ggml_v3_type_size(type)); + GGML_V3_ASSERT(nb10 == ggml_v3_type_size(src1->type)); + + // dst cannot be transposed or permuted + GGML_V3_ASSERT(nb0 == sizeof(float)); + GGML_V3_ASSERT(nb0 <= nb1); + GGML_V3_ASSERT(nb1 <= nb2); + GGML_V3_ASSERT(nb2 <= nb3); + + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + +#if defined(GGML_USE_CLBLAST) + if (ggml_v3_cl_can_mul_mat(src0, src1, dst)) { + if (params->ith == 0 && params->type == GGML_V3_TASK_COMPUTE) { + ggml_v3_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); + } + return; + } +#endif + +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + if (ggml_v3_compute_forward_mul_mat_use_blas(dst)) { + if (params->ith != 0) { + return; + } + + if (params->type == GGML_V3_TASK_INIT) { + return; + } + + if (params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + for (int64_t i13 = 0; i13 < ne13; i13++) { + for (int64_t i12 = 0; i12 < ne12; i12++) { + // broadcast src0 into src1 across 2nd,3rd dimension + const int64_t i03 = i13/r3; + const int64_t i02 = i12/r2; + + const void * x = (char *) src0->data + i02*nb02 + i03*nb03; + const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13); + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + + if (type != GGML_V3_TYPE_F32) { + float * const wdata = params->wdata; + ggml_v3_to_float_t const to_float = type_traits[type].to_float; + + size_t id = 0; + for (int64_t i01 = 0; i01 < ne01; ++i01) { + to_float((const char *) x + i01*nb01, wdata + id, ne00); + id += ne00; + } + + assert(id*sizeof(float) <= params->wsize); + x = wdata; + } + + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, + ne1, ne01, ne10, + 1.0f, y, ne10, + x, ne00, + 0.0f, d, ne01); + } + } + + //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_v3_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); + + return; + } +#endif + + if (params->type == GGML_V3_TASK_INIT) { + if (src1->type != vec_dot_type) { + char * wdata = params->wdata; + const size_t row_size = ggml_v3_row_size(vec_dot_type, ne10); + + assert(params->wsize >= ne11*ne12*ne13*row_size); + GGML_V3_ASSERT(src1->type == GGML_V3_TYPE_F32); + + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + wdata += row_size; + } + } + } + } + + return; + } + + if (params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t row_size = ggml_v3_row_size(vec_dot_type, ne10); + + const int64_t nr0 = ne01; // src0 rows + const int64_t nr1 = ne1*ne12*ne13; // src1 rows + + //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1); + + // distribute the thread work across the inner or outer loop based on which one is larger + + const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows + const int64_t nth1 = nr0 > nr1 ? 
1 : nth; // parallelize by src1 rows + + const int64_t ith0 = ith % nth0; + const int64_t ith1 = ith / nth0; + + const int64_t dr0 = (nr0 + nth0 - 1)/nth0; + const int64_t dr1 = (nr1 + nth1 - 1)/nth1; + + const int64_t ir010 = dr0*ith0; + const int64_t ir011 = MIN(ir010 + dr0, nr0); + + const int64_t ir110 = dr1*ith1; + const int64_t ir111 = MIN(ir110 + dr1, nr1); + + //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111); + + // threads with no work simply yield (not sure if it helps) + if (ir010 >= ir011 || ir110 >= ir111) { + sched_yield(); + return; + } + + assert(ne12 % ne02 == 0); + assert(ne13 % ne03 == 0); + + // block-tiling attempt + const int64_t blck_0 = 16; + const int64_t blck_1 = 16; + + // attempt to reduce false-sharing (does not seem to make a difference) + float tmp[16]; + + for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) { + for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { + for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) { + const int64_t i13 = (ir1/(ne12*ne1)); + const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1; + const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1); + + // broadcast src0 into src1 + const int64_t i03 = i13/r3; + const int64_t i02 = i12/r2; + + const int64_t i1 = i11; + const int64_t i2 = i12; + const int64_t i3 = i13; + + const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03); + + // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides + // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using + // the original src1 data pointer, so we should index using the indices directly + // TODO: this is a bit of a hack, we should probably have a better way to handle this + const char * src1_col = (const char *) wdata + + (src1_cont || src1->type != vec_dot_type + ? 
(i11 + i12*ne11 + i13*ne12*ne11)*row_size + : (i11*nb11 + i12*nb12 + i13*nb13)); + + float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); + + //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { + // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); + //} + + for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { + vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col); + } + memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float)); + } + } + } +} + +// ggml_v3_compute_forward_mul_mat_id + +static void ggml_v3_compute_forward_mul_mat_id( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * ids, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + + const struct ggml_v3_tensor * src0 = dst->src[2]; // only for GGML_V3_TENSOR_BINARY_OP_LOCALS + + GGML_V3_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + const enum ggml_v3_type type = src0->type; + + const bool src1_cont = ggml_v3_is_contiguous(src1); + + ggml_v3_vec_dot_t const vec_dot = type_traits[type].vec_dot; + enum ggml_v3_type const vec_dot_type = type_traits[type].vec_dot_type; + ggml_v3_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; + + GGML_V3_ASSERT(ne0 == ne01); + GGML_V3_ASSERT(ne1 == ne11); + GGML_V3_ASSERT(ne2 == ne12); + GGML_V3_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_V3_ASSERT(nb00 == ggml_v3_type_size(type)); + GGML_V3_ASSERT(nb10 == ggml_v3_type_size(src1->type)); + + // dst cannot be transposed or permuted + GGML_V3_ASSERT(nb0 == sizeof(float)); + GGML_V3_ASSERT(nb0 <= nb1); + GGML_V3_ASSERT(nb1 <= nb2); + GGML_V3_ASSERT(nb2 <= nb3); + + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + + // row groups + const int id = ggml_v3_get_op_params_i32(dst, 0); + const int n_as = ggml_v3_get_op_params_i32(dst, 1); + + char * wdata_src1_end = (src1->type == vec_dot_type) ? 
+ (char *) params->wdata : + (char *) params->wdata + GGML_V3_PAD(ggml_v3_row_size(vec_dot_type, ggml_v3_nelements(src1)), sizeof(int64_t)); + + int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as] + int64_t * matrix_rows = matrix_row_counts + n_as; // [n_as][ne11] + + #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)] + + if (params->type == GGML_V3_TASK_INIT) { + char * wdata = params->wdata; + if (src1->type != vec_dot_type) { + const size_t row_size = ggml_v3_row_size(vec_dot_type, ne10); + + assert(params->wsize >= ne11*ne12*ne13*row_size); + assert(src1->type == GGML_V3_TYPE_F32); + + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + wdata += row_size; + } + } + } + } + + // initialize matrix_row_counts + GGML_V3_ASSERT(wdata == wdata_src1_end); + memset(matrix_row_counts, 0, n_as*sizeof(int64_t)); + + // group rows by src0 matrix + for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { + const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]); + + GGML_V3_ASSERT(row_id >= 0 && row_id < n_as); + MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01; + matrix_row_counts[row_id] += 1; + } + + return; + } + + if (params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + // compute each matrix multiplication in sequence + for (int cur_a = 0; cur_a < n_as; ++cur_a) { + const int64_t cne1 = matrix_row_counts[cur_a]; + + if (cne1 == 0) { + continue; + } + + const struct ggml_v3_tensor * src0_cur = dst->src[cur_a + 2]; + + if (ith == 1 && g_imatrix_collect_v3) { + g_imatrix_collect_v3(src0_cur, src1); + } + + const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t row_size = ggml_v3_row_size(vec_dot_type, ne10); + + const int64_t nr0 = ne01; // src0 rows + const int64_t nr1 = cne1*ne12*ne13; // src1 rows + + //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1); + + // distribute the thread work across the inner or outer loop based on which one is larger + + const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows + const int64_t nth1 = nr0 > nr1 ? 
1 : nth; // parallelize by src1 rows + + const int64_t ith0 = ith % nth0; + const int64_t ith1 = ith / nth0; + + const int64_t dr0 = (nr0 + nth0 - 1)/nth0; + const int64_t dr1 = (nr1 + nth1 - 1)/nth1; + + const int64_t ir010 = dr0*ith0; + const int64_t ir011 = MIN(ir010 + dr0, nr0); + + const int64_t ir110 = dr1*ith1; + const int64_t ir111 = MIN(ir110 + dr1, nr1); + + //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111); + + // threads with no work simply yield (not sure if it helps) + if (ir010 >= ir011 || ir110 >= ir111) { + sched_yield(); + continue; + } + + assert(ne12 % ne02 == 0); + assert(ne13 % ne03 == 0); + + // block-tiling attempt + const int64_t blck_0 = 16; + const int64_t blck_1 = 16; + + // attempt to reduce false-sharing (does not seem to make a difference) + float tmp[16]; + + for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) { + for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { + for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) { + const int64_t i13 = (ir1/(ne12*cne1)); // Note: currently, src1 is always a matrix + const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1; + const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1); + const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11); + + // broadcast src0 into src1 + const int64_t i03 = i13/r3; + const int64_t i02 = i12/r2; + + const int64_t i1 = i11; + const int64_t i2 = i12; + const int64_t i3 = i13; + + const char * src0_row = (const char *) src0_cur->data + (0 + i02*nb02 + i03*nb03); + + // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides + // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using + // the original src1 data pointer, so we should index using the indices directly + // TODO: this is a bit of a hack, we should probably have a better way to handle this + const char * src1_col = (const char *) wdata + + (src1_cont || src1->type != vec_dot_type + ? 
(i11 + i12*ne11 + i13*ne12*ne11)*row_size + : (i11*nb11 + i12*nb12 + i13*nb13)); + + float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); + + //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { + // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); + //} + + for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { + vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col); + } + memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float)); + } + } + } + } + + #undef MMID_MATRIX_ROW +} + +// ggml_v3_compute_forward_out_prod + +static void ggml_v3_compute_forward_out_prod_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + // int64_t t0 = ggml_v3_perf_time_us(); + // UNUSED(t0); + + GGML_V3_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + GGML_V3_ASSERT(ne0 == ne00); + GGML_V3_ASSERT(ne1 == ne10); + GGML_V3_ASSERT(ne2 == ne02); + GGML_V3_ASSERT(ne02 == ne12); + GGML_V3_ASSERT(ne3 == ne13); + GGML_V3_ASSERT(ne03 == ne13); + + // we don't support permuted src0 or src1 + GGML_V3_ASSERT(nb00 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_V3_ASSERT(nb0 == sizeof(float)); + // GGML_V3_ASSERT(nb0 <= nb1); + // GGML_V3_ASSERT(nb1 <= nb2); + // GGML_V3_ASSERT(nb2 <= nb3); + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + + // TODO: #if defined(GGML_USE_CUBLAS) ggml_v3_cuda_out_prod + // TODO: #if defined(GGML_USE_CLBLAST) + +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + bool use_blas = ggml_v3_is_matrix(src0) && + ggml_v3_is_matrix(src1) && + ggml_v3_is_contiguous(src0) && + (ggml_v3_is_contiguous(src1) || ggml_v3_is_transposed(src1)); +#endif + + if (params->type == GGML_V3_TASK_INIT) { +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst + if (use_blas) { + return; + } +#endif + ggml_v3_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + return; + } + + if (params->type == GGML_V3_TASK_FINALIZE) { + return; + } + +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + if (use_blas) { + if (params->ith != 0) { // All threads other than the first do no work. + return; + } + // Arguments to ggml_v3_compute_forward_out_prod (expressed as major,minor) + // src0: (k,n) + // src1: (k,m) + // dst: (m,n) + // + // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f) + // Also expressed as (major,minor) + // a: (m,k): so src1 transposed + // b: (k,n): so src0 + // c: (m,n) + // + // However, if ggml_v3_is_transposed(src1) is true, then + // src1->data already contains a transposed version, so sgemm mustn't + // transpose it further. 
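+    // e.g. src0->ne = {32, 4096} (n = 32, k = 4096) and src1->ne = {8, 4096} (m = 8, k = 4096)
+    // give dst->ne = {32, 8}; sgemm then computes c(8 x 32) = a(8 x 4096) * b(4096 x 32),
+    // with a taken from src1 (transposed by sgemm unless src1 is already stored transposed) and b = src0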
+ + int n = src0->ne[0]; + int k = src0->ne[1]; + int m = src1->ne[0]; + + int transposeA, lda; + + if (!ggml_v3_is_transposed(src1)) { + transposeA = CblasTrans; + lda = m; + } else { + transposeA = CblasNoTrans; + lda = k; + } + + float * a = (float *) ((char *) src1->data); + float * b = (float *) ((char *) src0->data); + float * c = (float *) ((char *) dst->data); + + cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n); + + return; + } +#endif + + // dst[:,:,:,:] = 0 + // for i2,i3: + // for i1: + // for i01: + // for i0: + // dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3] + + // parallelize by last three dimensions + + // total rows in dst + const int64_t nr = ne1*ne2*ne3; + + // rows per thread + const int64_t dr = (nr + nth - 1)/nth; + + // row range for this thread + const int64_t ir0 = dr*ith; + const int64_t ir1 = MIN(ir0 + dr, nr); + + // block-tiling attempt + const int64_t blck_0 = MAX(GGML_V3_VEC_MAD_UNROLL, 32); + const int64_t blck_1 = 16; + + for (int64_t bir = ir0; bir < ir1; bir += blck_1) { + const int64_t bir1 = MIN(bir + blck_1, ir1); + for (int64_t bi01 = 0; bi01 < ne01; bi01 += blck_0) { + const int64_t bne01 = MIN(bi01 + blck_0, ne01); + for (int64_t ir = bir; ir < bir1; ++ir) { + // dst indices + const int64_t i3 = ir/(ne2*ne1); + const int64_t i2 = (ir - i3*ne2*ne1)/ne1; + const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1); + + const int64_t i02 = i2; + const int64_t i03 = i3; + + //const int64_t i10 = i1; + const int64_t i12 = i2; + const int64_t i13 = i3; + +#if GGML_V3_VEC_MAD_UNROLL > 2 + const int64_t bne01_unroll = bne01 - (bne01 % GGML_V3_VEC_MAD_UNROLL); + for (int64_t i01 = bi01; i01 < bne01_unroll; i01 += GGML_V3_VEC_MAD_UNROLL) { + const int64_t i11 = i01; + + float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + + ggml_v3_vec_mad_f32_unroll(ne0, nb01, nb11, d, s0, s1); + } + for (int64_t i01 = bne01_unroll; i01 < bne01; ++i01) { + const int64_t i11 = i01; + + float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + + ggml_v3_vec_mad_f32(ne0, d, s0, *s1); + } +#else + for (int64_t i01 = bi01; i01 < bne01; ++i01) { + const int64_t i11 = i01; + + float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + + ggml_v3_vec_mad_f32(ne0, d, s0, *s1); + } +#endif + } + } + } + + //int64_t t1 = ggml_v3_perf_time_us(); + //static int64_t acc = 0; + //acc += t1 - t0; + //if (t1 - t0 > 10) { + // printf("\n"); + // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); + // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); + // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); + // printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13); + + // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); + //} +} + +static void 
ggml_v3_compute_forward_out_prod_q_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + // int64_t t0 = ggml_v3_perf_time_us(); + // UNUSED(t0); + + GGML_V3_TENSOR_BINARY_OP_LOCALS; + + const int ith = params->ith; + const int nth = params->nth; + + const enum ggml_v3_type type = src0->type; + ggml_v3_to_float_t const dequantize_row_q = type_traits[type].to_float; + + GGML_V3_ASSERT(ne02 == ne12); + GGML_V3_ASSERT(ne03 == ne13); + GGML_V3_ASSERT(ne2 == ne12); + GGML_V3_ASSERT(ne3 == ne13); + + // we don't support permuted src0 dim0 + GGML_V3_ASSERT(nb00 == ggml_v3_type_size(type)); + + // dst dim0 cannot be transposed or permuted + GGML_V3_ASSERT(nb0 == sizeof(float)); + // GGML_V3_ASSERT(nb0 <= nb1); + // GGML_V3_ASSERT(nb1 <= nb2); + // GGML_V3_ASSERT(nb2 <= nb3); + + GGML_V3_ASSERT(ne0 == ne00); + GGML_V3_ASSERT(ne1 == ne10); + GGML_V3_ASSERT(ne2 == ne02); + GGML_V3_ASSERT(ne3 == ne03); + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + + // TODO: #if defined(GGML_USE_CUBLAS) ggml_v3_cuda_out_prod + // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) + + if (params->type == GGML_V3_TASK_INIT) { + ggml_v3_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + return; + } + + if (params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + // parallelize by last three dimensions + + // total rows in dst + const int64_t nr = ne1*ne2*ne3; + + // rows per thread + const int64_t dr = (nr + nth - 1)/nth; + + // row range for this thread + const int64_t ir0 = dr*ith; + const int64_t ir1 = MIN(ir0 + dr, nr); + + // dst[:,:,:,:] = 0 + // for i2,i3: + // for i1: + // for i01: + // for i0: + // dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3] + + float * wdata = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32) * ith; + + for (int64_t ir = ir0; ir < ir1; ++ir) { + // dst indices + const int64_t i3 = ir/(ne2*ne1); + const int64_t i2 = (ir - i3*ne2*ne1)/ne1; + const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1); + + const int64_t i02 = i2; + const int64_t i03 = i3; + + //const int64_t i10 = i1; + const int64_t i12 = i2; + const int64_t i13 = i3; + + for (int64_t i01 = 0; i01 < ne01; ++i01) { + const int64_t i11 = i01; + + float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + + dequantize_row_q(s0, wdata, ne0); + ggml_v3_vec_mad_f32(ne0, d, wdata, *s1); + } + } + + //int64_t t1 = ggml_v3_perf_time_us(); + //static int64_t acc = 0; + //acc += t1 - t0; + //if (t1 - t0 > 10) { + // printf("\n"); + // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); + // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); + // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); + // printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13); + + // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); + //} +} + +static void ggml_v3_compute_forward_out_prod( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case 
GGML_V3_TYPE_Q4_0: + case GGML_V3_TYPE_Q4_1: + case GGML_V3_TYPE_Q5_0: + case GGML_V3_TYPE_Q5_1: + case GGML_V3_TYPE_Q8_0: + case GGML_V3_TYPE_Q2_K: + case GGML_V3_TYPE_Q3_K: + case GGML_V3_TYPE_Q4_K: + case GGML_V3_TYPE_Q5_K: + case GGML_V3_TYPE_Q6_K: + case GGML_V3_TYPE_IQ2_XXS: + case GGML_V3_TYPE_IQ2_XS: + { + ggml_v3_compute_forward_out_prod_q_f32(params, src0, src1, dst); + } break; + case GGML_V3_TYPE_F16: + { + GGML_V3_ASSERT(false); // todo + // ggml_v3_compute_forward_out_prod_f16_f32(params, src0, src1, dst); + } break; + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_out_prod_f32(params, src0, src1, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_scale + +static void ggml_v3_compute_forward_scale_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(ggml_v3_is_contiguous(src0)); + GGML_V3_ASSERT(ggml_v3_is_contiguous(dst)); + GGML_V3_ASSERT(ggml_v3_are_same_shape(src0, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + // scale factor + float v; + memcpy(&v, dst->op_params, sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_v3_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + const size_t nb01 = src0->nb[1]; + + const size_t nb1 = dst->nb[1]; + + for (int i1 = ir0; i1 < ir1; i1++) { + if (dst->data != src0->data) { + // src0 is same shape as dst => same indices + memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float)); + } + ggml_v3_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), v); + } +} + +static void ggml_v3_compute_forward_scale( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_scale_f32(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_set + +static void ggml_v3_compute_forward_set_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(ggml_v3_are_same_shape(src0, dst)); + GGML_V3_ASSERT(ggml_v3_is_contiguous(dst) && ggml_v3_is_contiguous(src0)); + + // view src0 and dst with these strides and data offset inbytes during set + // nb0 is implicitly element_size because src0 and dst are contiguous + size_t nb1 = ((int32_t *) dst->op_params)[0]; + size_t nb2 = ((int32_t *) dst->op_params)[1]; + size_t nb3 = ((int32_t *) dst->op_params)[2]; + size_t offset = ((int32_t *) dst->op_params)[3]; + bool inplace = (bool) ((int32_t *) dst->op_params)[4]; + + if (!inplace && (params->type == GGML_V3_TASK_INIT)) { + // memcpy needs to be synchronized across threads to avoid race conditions. 
+ // => do it in INIT phase + memcpy( + ((char *) dst->data), + ((char *) src0->data), + ggml_v3_nbytes(dst)); + } + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_v3_nrows(src1); + const int nc = src1->ne[0]; + + GGML_V3_TENSOR_LOCALS(int64_t, ne1, src1, ne) + GGML_V3_TENSOR_LOCALS(size_t, nb1, src1, nb) + + // src0 and dst as viewed during set + const size_t nb0 = ggml_v3_element_size(src0); + + const int im0 = (ne10 == 0 ? 0 : ne10-1); + const int im1 = (ne11 == 0 ? 0 : ne11-1); + const int im2 = (ne12 == 0 ? 0 : ne12-1); + const int im3 = (ne13 == 0 ? 0 : ne13-1); + + GGML_V3_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= ggml_v3_nbytes(dst)); + + GGML_V3_ASSERT(nb10 == sizeof(float)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are viewed with shape of src1 and offset + // => same indices + const int i3 = ir/(ne12*ne11); + const int i2 = (ir - i3*ne12*ne11)/ne11; + const int i1 = (ir - i3*ne12*ne11 - i2*ne11); + + ggml_v3_vec_cpy_f32(nc, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); + } +} + +static void ggml_v3_compute_forward_set( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_set_f32(params, src0, src1, dst); + } break; + case GGML_V3_TYPE_F16: + case GGML_V3_TYPE_Q4_0: + case GGML_V3_TYPE_Q4_1: + case GGML_V3_TYPE_Q5_0: + case GGML_V3_TYPE_Q5_1: + case GGML_V3_TYPE_Q8_0: + case GGML_V3_TYPE_Q8_1: + case GGML_V3_TYPE_Q2_K: + case GGML_V3_TYPE_Q3_K: + case GGML_V3_TYPE_Q4_K: + case GGML_V3_TYPE_Q5_K: + case GGML_V3_TYPE_Q6_K: + case GGML_V3_TYPE_IQ2_XXS: + case GGML_V3_TYPE_IQ2_XS: + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_cpy + +static void ggml_v3_compute_forward_cpy( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + ggml_v3_compute_forward_dup(params, src0, dst); +} + +// ggml_v3_compute_forward_cont + +static void ggml_v3_compute_forward_cont( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + ggml_v3_compute_forward_dup(params, src0, dst); +} + +// ggml_v3_compute_forward_reshape + +static void ggml_v3_compute_forward_reshape( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + // NOP + UNUSED(params); + UNUSED(src0); + UNUSED(dst); +} + +// ggml_v3_compute_forward_view + +static void ggml_v3_compute_forward_view( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0) { + // NOP + UNUSED(params); + UNUSED(src0); +} + +// ggml_v3_compute_forward_permute + +static void ggml_v3_compute_forward_permute( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0) { + // NOP + UNUSED(params); + UNUSED(src0); +} + +// ggml_v3_compute_forward_transpose + +static void ggml_v3_compute_forward_transpose( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0) { + 
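+    // like reshape/view/permute above, transpose only rewrites the ne/nb metadata when the
+    // graph is built, so there is nothing left to compute in the forward pass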
// NOP + UNUSED(params); + UNUSED(src0); +} + +// ggml_v3_compute_forward_get_rows + +static void ggml_v3_compute_forward_get_rows_q( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + assert(params->ith == 0); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + GGML_V3_TENSOR_BINARY_OP_LOCALS + + const int64_t nc = ne00; + const int64_t nr = ggml_v3_nelements(src1); GGML_V3_UNUSED(nr); + + const enum ggml_v3_type type = src0->type; + ggml_v3_to_float_t const dequantize_row_q = type_traits[type].to_float; + + assert(ne0 == nc); + assert(ne02 == ne11); + assert(nb00 == ggml_v3_type_size(type)); + assert(ggml_v3_nrows(dst) == nr); + + // TODO: multi-thread + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + for (int64_t i10 = 0; i10 < ne10; ++i10) { + const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + + dequantize_row_q( + (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), + (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); + } + } + } +} + +static void ggml_v3_compute_forward_get_rows_f16( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + assert(params->ith == 0); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + GGML_V3_TENSOR_BINARY_OP_LOCALS + + const int64_t nc = ne00; + const int64_t nr = ggml_v3_nelements(src1); GGML_V3_UNUSED(nr); + + assert(ne0 == nc); + assert(ne02 == ne11); + assert(nb00 == sizeof(ggml_v3_fp16_t)); + assert(ggml_v3_nrows(dst) == nr); + + // TODO: multi-thread + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + for (int64_t i10 = 0; i10 < ne10; ++i10) { + const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + + ggml_v3_fp16_to_fp32_row( + (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), + (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); + } + } + } +} + +static void ggml_v3_compute_forward_get_rows_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + assert(params->ith == 0); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + GGML_V3_TENSOR_BINARY_OP_LOCALS + + const int64_t nc = ne00; + const int64_t nr = ggml_v3_nelements(src1); GGML_V3_UNUSED(nr); + + assert(ne0 == nc); + assert(ne02 == ne11); + assert(nb00 == sizeof(float)); + assert(ggml_v3_nrows(dst) == nr); + + // TODO: multi-thread + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + for (int64_t i10 = 0; i10 < ne10; ++i10) { + const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + + ggml_v3_vec_cpy_f32(nc, + (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), + (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03)); + } + } + } +} + +static void ggml_v3_compute_forward_get_rows( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_Q4_0: + case 
GGML_V3_TYPE_Q4_1: + case GGML_V3_TYPE_Q5_0: + case GGML_V3_TYPE_Q5_1: + case GGML_V3_TYPE_Q8_0: + case GGML_V3_TYPE_Q8_1: + case GGML_V3_TYPE_Q2_K: + case GGML_V3_TYPE_Q3_K: + case GGML_V3_TYPE_Q4_K: + case GGML_V3_TYPE_Q5_K: + case GGML_V3_TYPE_Q6_K: + case GGML_V3_TYPE_IQ2_XXS: + case GGML_V3_TYPE_IQ2_XS: + { + ggml_v3_compute_forward_get_rows_q(params, src0, src1, dst); + } break; + case GGML_V3_TYPE_F16: + { + ggml_v3_compute_forward_get_rows_f16(params, src0, src1, dst); + } break; + case GGML_V3_TYPE_F32: + case GGML_V3_TYPE_I32: + { + ggml_v3_compute_forward_get_rows_f32(params, src0, src1, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } + + //static bool first = true; + //printf("ne0 = %d, ne1 = %d, ne2 = %d\n", dst->ne[0], dst->ne[1], dst->ne[2]); + //if (first) { + // first = false; + //} else { + // for (int k = 0; k < dst->ne[1]; ++k) { + // for (int j = 0; j < dst->ne[0]/16; ++j) { + // for (int i = 0; i < 16; ++i) { + // printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]); + // } + // printf("\n"); + // } + // printf("\n"); + // } + // printf("\n"); + // exit(0); + //} +} + +// ggml_v3_compute_forward_get_rows_back + +static void ggml_v3_compute_forward_get_rows_back_f32_f16( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(params->ith == 0); + GGML_V3_ASSERT(ggml_v3_is_contiguous(dst)); + + // ggml_v3_compute_forward_dup_same_cont(params, opt0, dst); + + if (params->type == GGML_V3_TASK_INIT) { + memset(dst->data, 0, ggml_v3_nbytes(dst)); + } + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int nc = src0->ne[0]; + const int nr = ggml_v3_nelements(src1); + + GGML_V3_ASSERT( dst->ne[0] == nc); + GGML_V3_ASSERT(src0->nb[0] == sizeof(ggml_v3_fp16_t)); + + for (int i = 0; i < nr; ++i) { + const int r = ((int32_t *) src1->data)[i]; + + for (int j = 0; j < nc; ++j) { + ggml_v3_fp16_t v = ((ggml_v3_fp16_t *) ((char *) src0->data + i*src0->nb[1]))[j]; + ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_V3_FP16_TO_FP32(v); + } + } +} + +static void ggml_v3_compute_forward_get_rows_back_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(params->ith == 0); + GGML_V3_ASSERT(ggml_v3_is_contiguous(dst)); + + // ggml_v3_compute_forward_dup_same_cont(params, opt0, dst); + + if (params->type == GGML_V3_TASK_INIT) { + memset(dst->data, 0, ggml_v3_nbytes(dst)); + } + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int nc = src0->ne[0]; + const int nr = ggml_v3_nelements(src1); + + GGML_V3_ASSERT( dst->ne[0] == nc); + GGML_V3_ASSERT(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < nr; ++i) { + const int r = ((int32_t *) src1->data)[i]; + + ggml_v3_vec_add_f32(nc, + (float *) ((char *) dst->data + r*dst->nb[1]), + (float *) ((char *) dst->data + r*dst->nb[1]), + (float *) ((char *) src0->data + i*src0->nb[1])); + } +} + +static void ggml_v3_compute_forward_get_rows_back( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F16: + { + ggml_v3_compute_forward_get_rows_back_f32_f16(params, src0, src1, dst); + } break; + case 
GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_get_rows_back_f32(params, src0, src1, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } + + //static bool first = true; + //printf("ne0 = %d, ne1 = %d, ne2 = %d\n", dst->ne[0], dst->ne[1], dst->ne[2]); + //if (first) { + // first = false; + //} else { + // for (int k = 0; k < dst->ne[1]; ++k) { + // for (int j = 0; j < dst->ne[0]/16; ++j) { + // for (int i = 0; i < 16; ++i) { + // printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]); + // } + // printf("\n"); + // } + // printf("\n"); + // } + // printf("\n"); + // exit(0); + //} +} + +// ggml_v3_compute_forward_diag + +static void ggml_v3_compute_forward_diag_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(params->ith == 0); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + // TODO: handle transposed/permuted matrices + + GGML_V3_TENSOR_UNARY_OP_LOCALS + + GGML_V3_ASSERT(ne00 == ne0); + GGML_V3_ASSERT(ne00 == ne1); + GGML_V3_ASSERT(ne01 == 1); + GGML_V3_ASSERT(ne02 == ne2); + GGML_V3_ASSERT(ne03 == ne3); + + GGML_V3_ASSERT(nb00 == sizeof(float)); + GGML_V3_ASSERT(nb0 == sizeof(float)); + + for (int i3 = 0; i3 < ne3; i3++) { + for (int i2 = 0; i2 < ne2; i2++) { + for (int i1 = 0; i1 < ne1; i1++) { + float * d = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + float * s = (float *)((char *) src0->data + i3*nb03 + i2*nb02); + for (int i0 = 0; i0 < i1; i0++) { + d[i0] = 0; + } + d[i1] = s[i1]; + for (int i0 = i1+1; i0 < ne0; i0++) { + d[i0] = 0; + } + } + } + } +} + +static void ggml_v3_compute_forward_diag( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_diag_f32(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_diag_mask_inf + +static void ggml_v3_compute_forward_diag_mask_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst, + const float value) { + + const int ith = params->ith; + const int nth = params->nth; + + const int n_past = ((int32_t *) dst->op_params)[0]; + const bool inplace = src0->data == dst->data; + + GGML_V3_ASSERT(n_past >= 0); + + if (!inplace && (params->type == GGML_V3_TASK_INIT)) { + // memcpy needs to be synchronized across threads to avoid race conditions. 
+ // => do it in INIT phase + GGML_V3_ASSERT(ggml_v3_nelements(dst) == ggml_v3_nelements(src0)); + GGML_V3_ASSERT(ggml_v3_is_contiguous(dst) && ggml_v3_is_contiguous(src0)); + memcpy( + ((char *) dst->data), + ((char *) src0->data), + ggml_v3_nbytes(dst)); + } + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + // TODO: handle transposed/permuted matrices + + const int n = ggml_v3_nrows(src0); + const int nc = src0->ne[0]; + const int nr = src0->ne[1]; + const int nz = n/nr; + + GGML_V3_ASSERT( dst->nb[0] == sizeof(float)); + GGML_V3_ASSERT(src0->nb[0] == sizeof(float)); + + for (int k = 0; k < nz; k++) { + for (int j = ith; j < nr; j += nth) { + for (int i = n_past; i < nc; i++) { + if (i > n_past + j) { + *(float *)((char *) dst->data + k*dst->nb[2] + j*dst->nb[1] + i*dst->nb[0]) = value; + } + } + } + } +} + +static void ggml_v3_compute_forward_diag_mask_inf( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_diag_mask_f32(params, src0, dst, -INFINITY); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +static void ggml_v3_compute_forward_diag_mask_zero( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_diag_mask_f32(params, src0, dst, 0); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_soft_max + +static void ggml_v3_compute_forward_soft_max_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + assert(ggml_v3_is_contiguous(dst)); + assert(ggml_v3_are_same_shape(src0, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + float scale = 1.0f; + memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); + + // TODO: handle transposed/permuted matrices + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t ne11 = src1 ? src1->ne[1] : 1; + + const int nc = src0->ne[0]; + const int nr = ggml_v3_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith; + + for (int i1 = ir0; i1 < ir1; i1++) { + float * sp = (float *)((char *) src0->data + i1*src0->nb[1]); + float * dp = (float *)((char *) dst->data + i1*dst->nb[1]); + + // broadcast the mask across rows + float * mp = src1 ? (float *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL; + + ggml_v3_vec_cpy_f32 (nc, wp, sp); + ggml_v3_vec_scale_f32(nc, wp, scale); + if (mp) { + ggml_v3_vec_acc_f32(nc, wp, mp); + } + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + //printf("p[%d] = %f\n", i, p[i]); + assert(!isnan(wp[i])); + } +#endif + + float max = -INFINITY; + ggml_v3_vec_max_f32(nc, &max, wp); + + ggml_v3_float sum = 0.0; + + uint16_t scvt; + for (int i = 0; i < nc; i++) { + if (wp[i] == -INFINITY) { + dp[i] = 0.0f; + } else { + // const float val = (wp[i] == -INFINITY) ? 
0.0 : exp(wp[i] - max); + ggml_v3_fp16_t s = GGML_V3_FP32_TO_FP16(wp[i] - max); + memcpy(&scvt, &s, sizeof(scvt)); + const float val = GGML_V3_FP16_TO_FP32(ggml_v3_table_exp_f16[scvt]); + sum += (ggml_v3_float)val; + dp[i] = val; + } + } + + assert(sum > 0.0); + + sum = 1.0/sum; + ggml_v3_vec_scale_f32(nc, dp, sum); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + assert(!isnan(dp[i])); + assert(!isinf(dp[i])); + } +#endif + } +} + +static void ggml_v3_compute_forward_soft_max( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_soft_max_f32(params, src0, src1, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_soft_max_back + +static void ggml_v3_compute_forward_soft_max_back_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(ggml_v3_is_contiguous(src0)); + GGML_V3_ASSERT(ggml_v3_is_contiguous(src1)); + GGML_V3_ASSERT(ggml_v3_is_contiguous(dst)); + GGML_V3_ASSERT(ggml_v3_are_same_shape(src0, dst)); + GGML_V3_ASSERT(ggml_v3_are_same_shape(src1, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + // TODO: handle transposed/permuted matrices + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_v3_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float *dy = (float *)((char *) src0->data + i1*src0->nb[1]); + float *y = (float *)((char *) src1->data + i1*src1->nb[1]); + float *dx = (float *)((char *) dst->data + i1*dst->nb[1]); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + //printf("p[%d] = %f\n", i, p[i]); + assert(!isnan(dy[i])); + assert(!isnan(y[i])); + } +#endif + // Jii = yi - yi*yi + // Jij = -yi*yj + // J = diag(y)-y.T*y + // dx = J * dy + // dxk = sum_i(Jki * dyi) + // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk + // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk + // dxk = sum_i(-yk*yi * dyi) + yk*dyk + // dxk = -yk * sum_i(yi * dyi) + yk*dyk + // dxk = -yk * dot(y, dy) + yk*dyk + // dxk = yk * (- dot(y, dy) + dyk) + // dxk = yk * (dyk - dot(y, dy)) + // + // post-order: + // dot_y_dy := dot(y, dy) + // dx := dy + // dx := dx - dot_y_dy + // dx := dx * y + + // linear runtime, no additional memory + float dot_y_dy = 0; + ggml_v3_vec_dot_f32 (nc, &dot_y_dy, y, dy); + ggml_v3_vec_cpy_f32 (nc, dx, dy); + ggml_v3_vec_acc1_f32(nc, dx, -dot_y_dy); + ggml_v3_vec_mul_f32 (nc, dx, dx, y); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + assert(!isnan(dx[i])); + assert(!isinf(dx[i])); + } +#endif + } +} + +static void ggml_v3_compute_forward_soft_max_back( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_soft_max_back_f32(params, src0, src1, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_alibi + +static void ggml_v3_compute_forward_alibi_f32( + const struct 
ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + assert(params->ith == 0); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_head = ((int32_t *) dst->op_params)[1]; + float max_bias; + memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); + + const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 + const int64_t ne1 = src0->ne[1]; // seq_len_without_past + const int64_t ne2 = src0->ne[2]; // n_head -> this is k + //const int64_t ne3 = src0->ne[3]; // 1 -> bsz + + const int64_t n = ggml_v3_nrows(src0); + const int64_t ne2_ne3 = n/ne1; // ne2*ne3 + + const size_t nb0 = src0->nb[0]; + const size_t nb1 = src0->nb[1]; + const size_t nb2 = src0->nb[2]; + //const int nb3 = src0->nb[3]; + + GGML_V3_ASSERT(nb0 == sizeof(float)); + GGML_V3_ASSERT(n_head == ne2); + + // add alibi to src0 (KQ_scaled) + const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); + + const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); + + for (int64_t i = 0; i < ne0; i++) { + for (int64_t j = 0; j < ne1; j++) { + for (int64_t k = 0; k < ne2_ne3; k++) { + float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2); + float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2); + + // TODO: k*nb2 or k*nb3 + + float m_k; + + if (k < n_heads_log2_floor) { + m_k = powf(m0, k + 1); + } else { + m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); + } + + pdst[0] = i * m_k + src[0]; + } + } + } +} + +static void ggml_v3_compute_forward_alibi_f16( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + assert(params->ith == 0); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_head = ((int32_t *) dst->op_params)[1]; + float max_bias; + memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); + + const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 + const int ne1 = src0->ne[1]; // seq_len_without_past + const int ne2 = src0->ne[2]; // n_head -> this is k + //const int ne3 = src0->ne[3]; // 1 -> bsz + + const int n = ggml_v3_nrows(src0); + const int ne2_ne3 = n/ne1; // ne2*ne3 + + const int nb0 = src0->nb[0]; + const int nb1 = src0->nb[1]; + const int nb2 = src0->nb[2]; + //const int nb3 = src0->nb[3]; + + GGML_V3_ASSERT(nb0 == sizeof(ggml_v3_fp16_t)); + //GGML_V3_ASSERT(ne1 + n_past == ne0); (void) n_past; + GGML_V3_ASSERT(n_head == ne2); + + // add alibi to src0 (KQ_scaled) + const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); + + const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); + + for (int i = 0; i < ne0; i++) { + for (int j = 0; j < ne1; j++) { + for (int k = 0; k < ne2_ne3; k++) { + ggml_v3_fp16_t * const src = (ggml_v3_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2); + float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2); + + // TODO: k*nb2 or k*nb3 + + float m_k; + + if (k < n_heads_log2_floor) { + m_k = powf(m0, k + 1); + } else { + m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); + } + + // we return F32 + pdst[0] = i * m_k + GGML_V3_FP16_TO_FP32(src[0]); + } + } + } +} + +static void 
ggml_v3_compute_forward_alibi( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F16: + { + ggml_v3_compute_forward_alibi_f16(params, src0, dst); + } break; + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_alibi_f32(params, src0, dst); + } break; + case GGML_V3_TYPE_Q4_0: + case GGML_V3_TYPE_Q4_1: + case GGML_V3_TYPE_Q5_0: + case GGML_V3_TYPE_Q5_1: + case GGML_V3_TYPE_Q8_0: + case GGML_V3_TYPE_Q8_1: + case GGML_V3_TYPE_Q2_K: + case GGML_V3_TYPE_Q3_K: + case GGML_V3_TYPE_Q4_K: + case GGML_V3_TYPE_Q5_K: + case GGML_V3_TYPE_Q6_K: + case GGML_V3_TYPE_IQ2_XXS: + case GGML_V3_TYPE_IQ2_XS: + case GGML_V3_TYPE_Q8_K: + case GGML_V3_TYPE_I8: + case GGML_V3_TYPE_I16: + case GGML_V3_TYPE_I32: + case GGML_V3_TYPE_COUNT: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_clamp + +static void ggml_v3_compute_forward_clamp_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + assert(params->ith == 0); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + float min; + float max; + memcpy(&min, (float *) dst->op_params + 0, sizeof(float)); + memcpy(&max, (float *) dst->op_params + 1, sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + const int n = ggml_v3_nrows(src0); + const int nc = src0->ne[0]; + + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + + GGML_V3_ASSERT( nb0 == sizeof(float)); + GGML_V3_ASSERT(nb00 == sizeof(float)); + + for (int j = ith; j < n; j += nth) { + float * dst_ptr = (float *) ((char *) dst->data + j*nb1); + float * src0_ptr = (float *) ((char *) src0->data + j*nb01); + + for (int i = 0; i < nc; i++) { + dst_ptr[i] = MAX(MIN(src0_ptr[i], max), min); + } + } +} + +static void ggml_v3_compute_forward_clamp( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_clamp_f32(params, src0, dst); + } break; + case GGML_V3_TYPE_F16: + case GGML_V3_TYPE_Q4_0: + case GGML_V3_TYPE_Q4_1: + case GGML_V3_TYPE_Q5_0: + case GGML_V3_TYPE_Q5_1: + case GGML_V3_TYPE_Q8_0: + case GGML_V3_TYPE_Q8_1: + case GGML_V3_TYPE_Q2_K: + case GGML_V3_TYPE_Q3_K: + case GGML_V3_TYPE_Q4_K: + case GGML_V3_TYPE_Q5_K: + case GGML_V3_TYPE_Q6_K: + case GGML_V3_TYPE_IQ2_XXS: + case GGML_V3_TYPE_IQ2_XS: + case GGML_V3_TYPE_Q8_K: + case GGML_V3_TYPE_I8: + case GGML_V3_TYPE_I16: + case GGML_V3_TYPE_I32: + case GGML_V3_TYPE_COUNT: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_rope + +static float rope_yarn_ramp(const float low, const float high, const int i0) { + const float y = (i0 / 2 - low) / MAX(0.001f, high - low); + return 1 - MIN(1, MAX(0, y)); +} + +// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn +// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. 
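For context, a minimal standalone sketch of the YaRN correction-dimension computation that the rope helpers below implement; it is not part of the patch, `corr_dim` is a local stand-in for ggml_v3_rope_yarn_corr_dim, and the parameter values (head size, original context, frequency base, betas) are assumed purely for illustration.

#include <math.h>
#include <stdio.h>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

// same closed form as the comment in the code below:
// corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2*pi)) / (2 * log(base))
static float corr_dim(int n_dims, int n_orig_ctx, float n_rot, float base) {
    return n_dims * logf(n_orig_ctx / (n_rot * 2.0f * (float)M_PI)) / (2.0f * logf(base));
}

int main(void) {
    // assumed, illustrative parameters (not taken from this patch)
    const int   n_dims     = 128;      // rotary dims per head
    const int   n_orig_ctx = 4096;     // original training context length
    const float freq_base  = 10000.0f;
    const float beta_fast  = 32.0f;
    const float beta_slow  = 1.0f;

    float start = floorf(corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base));
    float end   = ceilf (corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base));
    if (start < 0.0f)          start = 0.0f;
    if (end   > n_dims - 1.0f) end   = n_dims - 1.0f;

    // rope_yarn() uses this [start, end] range to ramp between the extrapolated
    // angle (low dims) and the interpolated, freq_scale-compressed angle (high dims)
    printf("corr_dims = [%g, %g]\n", start, end);
    return 0;
}

Compiled with -lm, this prints the dimension range over which rope_yarn_ramp() produces a nonzero blend for the assumed parameters.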
+static void rope_yarn( + float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale, + float * cos_theta, float * sin_theta +) { + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float theta = theta_interp; + if (ext_factor != 0.0f) { + float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; + theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); + } + *cos_theta = cosf(theta) * mscale; + *sin_theta = sinf(theta) * mscale; +} + +// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get +// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` +static float ggml_v3_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, float base) { + return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base)); +} + +void ggml_v3_rope_yarn_corr_dims( + int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2] +) { + // start and end correction dims + dims[0] = MAX(0, floorf(ggml_v3_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base))); + dims[1] = MIN(n_dims - 1, ceilf(ggml_v3_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base))); +} + +static void ggml_v3_compute_forward_rope_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst, + const bool forward) { + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + + // these two only relevant for xPos RoPE: + float xpos_base; + bool xpos_down; + + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + memcpy(&xpos_base, (int32_t *) dst->op_params + 11, sizeof(float)); + memcpy(&xpos_down, (int32_t *) dst->op_params + 12, sizeof(bool)); + + GGML_V3_TENSOR_UNARY_OP_LOCALS + + //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); + //printf("n_past = %d, ne2 = %d\n", n_past, ne2); + + GGML_V3_ASSERT(nb00 == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_v3_nrows(dst); + + GGML_V3_ASSERT(n_dims <= ne0); + GGML_V3_ASSERT(n_dims % 2 == 0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + // row index used to determine which thread to use + int ir = 0; + + const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float inv_ndims = -1.f/n_dims; + float corr_dims[2]; + ggml_v3_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, 
corr_dims); + + const bool is_neox = mode & 2; + const bool is_glm = mode & 4; + + // backward process uses inverse rotation by cos and sin. + // cos and sin build a rotation matrix, where the inverse is the transpose. + // this essentially just switches the sign of sin. + const float sin_sign = forward ? 1.0f : -1.0f; + + const int32_t * pos = (const int32_t *) src1->data; + + for (int64_t i3 = 0; i3 < ne3; i3++) { + for (int64_t i2 = 0; i2 < ne2; i2++) { + const int64_t p = pos[i2]; + for (int64_t i1 = 0; i1 < ne1; i1++) { + if (ir++ < ir0) continue; + if (ir > ir1) break; + + float theta_base = (float)p; + + if (is_glm) { + theta_base = MIN(p, n_ctx - 2); + float block_theta = MAX(p - (n_ctx - 2), 0); + for (int64_t i0 = 0; i0 < ne0 / 4; i0++) { + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base) * sin_sign; + const float cos_block_theta = cosf(block_theta); + const float sin_block_theta = sinf(block_theta) * sin_sign; + + theta_base *= theta_scale; + block_theta *= theta_scale; + + const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float x0 = src[0]; + const float x1 = src[n_dims/2]; + const float x2 = src[n_dims]; + const float x3 = src[n_dims/2*3]; + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; + dst_data[n_dims] = x2*cos_block_theta - x3*sin_block_theta; + dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta; + } + } else if (!is_neox) { + for (int64_t i0 = 0; i0 < ne0; i0 += 2) { + float cos_theta, sin_theta; + rope_yarn( + theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta + ); + sin_theta *= sin_sign; + + // zeta scaling for xPos only: + float zeta = xpos_base != 0.0f ? 
powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f; + if (xpos_down) zeta = 1.0f / zeta; + + theta_base *= theta_scale; + + const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float x0 = src[0]; + const float x1 = src[1]; + + dst_data[0] = x0*cos_theta*zeta - x1*sin_theta*zeta; + dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta; + } + } else { + // TODO: this might be wrong for ne0 != n_dims - need double check + // it seems we have to rope just the first n_dims elements and do nothing with the rest + // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26 + theta_base *= freq_scale; + for (int64_t ic = 0; ic < ne0; ic += 2) { + if (ic < n_dims) { + const int64_t ib = 0; + + // simplified from `(ib * n_dims + ic) * inv_ndims` + float cur_rot = inv_ndims * ic - ib; + + float cos_theta, sin_theta; + rope_yarn( + theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, + &cos_theta, &sin_theta + ); + sin_theta *= sin_sign; + + theta_base *= theta_scale; + + const int64_t i0 = ib*n_dims + ic/2; + + const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float x0 = src[0]; + const float x1 = src[n_dims/2]; + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; + } else { + const int64_t i0 = ic; + + const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + dst_data[0] = src[0]; + dst_data[1] = src[1]; + } + } + } + } + } + } +} + +static void ggml_v3_compute_forward_rope_f16( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst, + const bool forward) { + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + + GGML_V3_TENSOR_UNARY_OP_LOCALS + + //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); + //printf("n_past = %d, ne2 = %d\n", n_past, ne2); + + GGML_V3_ASSERT(nb0 == sizeof(ggml_v3_fp16_t)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_v3_nrows(dst); + + GGML_V3_ASSERT(n_dims <= ne0); + GGML_V3_ASSERT(n_dims % 2 == 0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + // 
row index used to determine which thread to use + int ir = 0; + + const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float inv_ndims = -1.f/n_dims; + float corr_dims[2]; + ggml_v3_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims); + + const bool is_neox = mode & 2; + const bool is_glm = mode & 4; + + // backward process uses inverse rotation by cos and sin. + // cos and sin build a rotation matrix, where the inverse is the transpose. + // this essentially just switches the sign of sin. + const float sin_sign = forward ? 1.0f : -1.0f; + + const int32_t * pos = (const int32_t *) src1->data; + + for (int64_t i3 = 0; i3 < ne3; i3++) { + for (int64_t i2 = 0; i2 < ne2; i2++) { + const int64_t p = pos[i2]; + for (int64_t i1 = 0; i1 < ne1; i1++) { + if (ir++ < ir0) continue; + if (ir > ir1) break; + + float theta_base = (float)p; + + if (is_glm) { + theta_base = MIN(p, n_ctx - 2); + float block_theta = MAX(p - (n_ctx - 2), 0); + for (int64_t i0 = 0; i0 < ne0 / 4; i0++) { + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base) * sin_sign; + const float cos_block_theta = cosf(block_theta); + const float sin_block_theta = sinf(block_theta) * sin_sign; + + theta_base *= theta_scale; + block_theta *= theta_scale; + + const ggml_v3_fp16_t * const src = (ggml_v3_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_v3_fp16_t * dst_data = (ggml_v3_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float x0 = GGML_V3_FP16_TO_FP32(src[0]); + const float x1 = GGML_V3_FP16_TO_FP32(src[n_dims/2]); + const float x2 = GGML_V3_FP16_TO_FP32(src[n_dims]); + const float x3 = GGML_V3_FP16_TO_FP32(src[n_dims/2*3]); + + dst_data[0] = GGML_V3_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[n_dims/2] = GGML_V3_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + dst_data[n_dims] = GGML_V3_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta); + dst_data[n_dims/2*3] = GGML_V3_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta); + } + } else if (!is_neox) { + for (int64_t i0 = 0; i0 < ne0; i0 += 2) { + float cos_theta, sin_theta; + rope_yarn( + theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta + ); + sin_theta *= sin_sign; + + theta_base *= theta_scale; + + const ggml_v3_fp16_t * const src = (ggml_v3_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_v3_fp16_t * dst_data = (ggml_v3_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float x0 = GGML_V3_FP16_TO_FP32(src[0]); + const float x1 = GGML_V3_FP16_TO_FP32(src[1]); + + dst_data[0] = GGML_V3_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[1] = GGML_V3_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + } + } else { + // TODO: this might be wrong for ne0 != n_dims - need double check + // it seems we have to rope just the first n_dims elements and do nothing with the rest + // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26 + theta_base *= freq_scale; + for (int64_t ic = 0; ic < ne0; ic += 2) { + if (ic < n_dims) { + const int64_t ib = 0; + + // simplified from `(ib * n_dims + ic) * inv_ndims` + float cur_rot = inv_ndims * ic - ib; + + float cos_theta, sin_theta; + rope_yarn( + theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, + &cos_theta, &sin_theta + ); + sin_theta *= sin_sign; + + theta_base *= theta_scale; + + const int64_t i0 = ib*n_dims + ic/2; + + 
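+                        // neox-style pairing: the element at i0 = ic/2 is rotated together with the one at i0 + n_dims/2 (first vs. second half of the rotary dims), unlike the adjacent (i0, i0+1) pairs used in the !is_neox branch above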
const ggml_v3_fp16_t * const src = (ggml_v3_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_v3_fp16_t * dst_data = (ggml_v3_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float x0 = GGML_V3_FP16_TO_FP32(src[0]); + const float x1 = GGML_V3_FP16_TO_FP32(src[n_dims/2]); + + dst_data[0] = GGML_V3_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[n_dims/2] = GGML_V3_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + } else { + const int64_t i0 = ic; + + const ggml_v3_fp16_t * const src = (ggml_v3_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_v3_fp16_t * dst_data = (ggml_v3_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + dst_data[0] = src[0]; + dst_data[1] = src[1]; + } + } + } + } + } + } +} + +static void ggml_v3_compute_forward_rope( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F16: + { + ggml_v3_compute_forward_rope_f16(params, src0, src1, dst, true); + } break; + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_rope_f32(params, src0, src1, dst, true); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_rope_back + +static void ggml_v3_compute_forward_rope_back( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F16: + { + ggml_v3_compute_forward_rope_f16(params, src0, src1, dst, false); + } break; + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_rope_f32(params, src0, src1, dst, false); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_conv_transpose_1d + +static void ggml_v3_compute_forward_conv_transpose_1d_f16_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F16); + GGML_V3_ASSERT(src1->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT( dst->type == GGML_V3_TYPE_F32); + + int64_t t0 = ggml_v3_perf_time_us(); + UNUSED(t0); + + GGML_V3_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + const int nk = ne00*ne01*ne02; + + GGML_V3_ASSERT(nb00 == sizeof(ggml_v3_fp16_t)); + GGML_V3_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_V3_TASK_INIT) { + memset(params->wdata, 0, params->wsize); + + // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) + { + ggml_v3_fp16_t * const wdata = (ggml_v3_fp16_t *) params->wdata + 0; + + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const ggml_v3_fp16_t * const src = (ggml_v3_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); + ggml_v3_fp16_t * dst_data = wdata + i01*ne00*ne02; + for (int64_t i00 = 0; i00 < ne00; i00++) { + dst_data[i00*ne02 + i02] = src[i00]; + } + } + } + } + + // permute source data (src1) from (L x Cin) to (Cin x L) + { + ggml_v3_fp16_t * const wdata = (ggml_v3_fp16_t *) params->wdata + nk; + ggml_v3_fp16_t * dst_data = wdata; + + for (int64_t i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + for (int64_t i10 = 0; i10 < ne10; i10++) { + dst_data[i10*ne11 + i11] = GGML_V3_FP32_TO_FP16(src[i10]); + } + } + } + 
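+        // wdata now holds the kernel re-laid-out as (Cin x K x Cout) in [0, nk) and the f16-converted source as (Cin x L) starting at wdata + nk, so the dot products in the compute phase can run over Cin contiguously for both operands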
+ // need to zero dst since we are accumulating into it + memset(dst->data, 0, ggml_v3_nbytes(dst)); + + return; + } + + if (params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + + // total rows in dst + const int nr = ne1; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + ggml_v3_fp16_t * const wdata = (ggml_v3_fp16_t *) params->wdata + 0; + ggml_v3_fp16_t * const wdata_src = wdata + nk; + + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i1*nb1); + ggml_v3_fp16_t * wdata_kernel = wdata + i1*ne02*ne00; + for (int i10 = 0; i10 < ne10; i10++) { + const int i1n = i10*ne11; + for (int i00 = 0; i00 < ne00; i00++) { + float v = 0; + ggml_v3_vec_dot_f16(ne02, &v, + (ggml_v3_fp16_t *) wdata_src + i1n, + (ggml_v3_fp16_t *) wdata_kernel + i00*ne02); + dst_data[i10*s0 + i00] += v; + } + } + } +} + +static void ggml_v3_compute_forward_conv_transpose_1d_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT(src1->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT( dst->type == GGML_V3_TYPE_F32); + + int64_t t0 = ggml_v3_perf_time_us(); + UNUSED(t0); + + GGML_V3_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + const int nk = ne00*ne01*ne02; + + GGML_V3_ASSERT(nb00 == sizeof(float)); + GGML_V3_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_V3_TASK_INIT) { + memset(params->wdata, 0, params->wsize); + + // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) + { + float * const wdata = (float *) params->wdata + 0; + + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); + float * dst_data = wdata + i01*ne00*ne02; + for (int64_t i00 = 0; i00 < ne00; i00++) { + dst_data[i00*ne02 + i02] = src[i00]; + } + } + } + } + + // prepare source data (src1) + { + float * const wdata = (float *) params->wdata + nk; + float * dst_data = wdata; + + for (int64_t i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + for (int64_t i10 = 0; i10 < ne10; i10++) { + dst_data[i10*ne11 + i11] = src[i10]; + } + } + } + + // need to zero dst since we are accumulating into it + memset(dst->data, 0, ggml_v3_nbytes(dst)); + + return; + } + + if (params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + + // total rows in dst + const int nr = ne1; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + float * const wdata = (float *) params->wdata + 0; + float * const wdata_src = wdata + nk; + + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i1*nb1); + float * wdata_kernel = wdata + i1*ne02*ne00; + for (int i10 = 0; i10 < ne10; i10++) { + const int i1n = i10*ne11; + for (int i00 = 0; i00 < ne00; i00++) { + float v = 0; + ggml_v3_vec_dot_f32(ne02, &v, + wdata_src + i1n, + wdata_kernel + i00*ne02); + dst_data[i10*s0 + i00] += v; + } + } + } +} + +static void ggml_v3_compute_forward_conv_transpose_1d( + const struct 
ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F16: + { + ggml_v3_compute_forward_conv_transpose_1d_f16_f32(params, src0, src1, dst); + } break; + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_conv_transpose_1d_f32(params, src0, src1, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// src0: kernel [OC, IC, KH, KW] +// src1: image [N, IC, IH, IW] +// dst: result [N, OH, OW, IC*KH*KW] +static void ggml_v3_compute_forward_im2col_f16( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F16); + GGML_V3_ASSERT(src1->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT( dst->type == GGML_V3_TYPE_F16); + + int64_t t0 = ggml_v3_perf_time_us(); + UNUSED(t0); + + GGML_V3_TENSOR_BINARY_OP_LOCALS; + + const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t *)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; + const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t N = is_2D ? ne13 : ne12; + const int64_t IC = is_2D ? ne12 : ne11; + const int64_t IH = is_2D ? ne11 : 1; + const int64_t IW = ne10; + + const int64_t KH = is_2D ? ne01 : 1; + const int64_t KW = ne00; + + const int64_t OH = is_2D ? ne2 : 1; + const int64_t OW = ne1; + + int ofs0 = is_2D ? nb13 : nb12; + int ofs1 = is_2D ? 
nb12 : nb11; + + GGML_V3_ASSERT(nb00 == sizeof(ggml_v3_fp16_t)); + GGML_V3_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_V3_TASK_INIT) { + return; + } + + if (params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] + { + ggml_v3_fp16_t * const wdata = (ggml_v3_fp16_t *) dst->data; + + for (int64_t in = 0; in < N; in++) { + for (int64_t ioh = 0; ioh < OH; ioh++) { // 1 + for (int64_t iow = 0; iow < OW; iow++) { + for (int64_t iic = ith; iic < IC; iic += nth) { + + // micro kernel + ggml_v3_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] + const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW] + + for (int64_t ikh = 0; ikh < KH; ikh++) { // 1 + for (int64_t ikw = 0; ikw < KW; ikw++) { + const int64_t iiw = iow*s0 + ikw*d0 - p0; + const int64_t iih = ioh*s1 + ikh*d1 - p1; + + if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0; + } else { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_V3_FP32_TO_FP16(src_data[iih*IW + iiw]); + } + } + } + } + } + } + } + } +} + +static void ggml_v3_compute_forward_im2col( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F16: + { + ggml_v3_compute_forward_im2col_f16(params, src0, src1, dst); + } break; + case GGML_V3_TYPE_F32: + { + GGML_V3_ASSERT(false); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_conv_transpose_2d + +static void ggml_v3_compute_forward_conv_transpose_2d( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F16); + GGML_V3_ASSERT(src1->type == GGML_V3_TYPE_F32); + GGML_V3_ASSERT( dst->type == GGML_V3_TYPE_F32); + + int64_t t0 = ggml_v3_perf_time_us(); + UNUSED(t0); + + GGML_V3_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + const int nk = ne00*ne01*ne02*ne03; + + GGML_V3_ASSERT(nb00 == sizeof(ggml_v3_fp16_t)); + GGML_V3_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_V3_TASK_INIT) { + memset(params->wdata, 0, params->wsize); + + // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout) + { + ggml_v3_fp16_t * const wdata = (ggml_v3_fp16_t *) params->wdata + 0; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const ggml_v3_fp16_t * const src = (ggml_v3_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02); + ggml_v3_fp16_t * dst_data = wdata + i02*ne01*ne00*ne03; + for (int64_t i01 = 0; i01 < ne01; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + dst_data[i01*ne00*ne03 + i00*ne03 + i03] = src[i01 * ne00 + i00]; + } + } + } + } + } + + // permute source data (src1) from (Sw x Sh x Cin) to (Cin x Sw x Sh) + { + ggml_v3_fp16_t * const wdata = (ggml_v3_fp16_t *) params->wdata + nk; + for (int i12 = 0; i12 < ne12; i12++) { + for (int i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11); + ggml_v3_fp16_t * dst_data = wdata + i11*ne10*ne12; + for (int i10 = 0; i10 < ne10; i10++) { + dst_data[i10*ne12 + i12] = GGML_V3_FP32_TO_FP16(src[i10]); + } + } + } + } + + memset(dst->data, 0, ggml_v3_nbytes(dst)); + + return; + } + + if 
(params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int32_t stride = ggml_v3_get_op_params_i32(dst, 0); + + // total patches in dst + const int np = ne2; + + // patches per thread + const int dp = (np + nth - 1)/nth; + + // patch range for this thread + const int ip0 = dp*ith; + const int ip1 = MIN(ip0 + dp, np); + + ggml_v3_fp16_t * const wdata = (ggml_v3_fp16_t *) params->wdata + 0; + ggml_v3_fp16_t * const wdata_src = wdata + nk; + + for (int i2 = ip0; i2 < ip1; i2++) { // Cout + float * dst_data = (float *)((char *) dst->data + i2*nb2); + ggml_v3_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03; + for (int i11 = 0; i11 < ne11; i11++) { + for (int i10 = 0; i10 < ne10; i10++) { + const int i1n = i11*ne10*ne12 + i10*ne12; + for (int i01 = 0; i01 < ne01; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + float v = 0; + ggml_v3_vec_dot_f16(ne03, &v, + wdata_src + i1n, + wdata_kernel + i01*ne00*ne03 + i00*ne03); + dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v; + } + } + } + } + } +} + +// ggml_v3_compute_forward_pool_1d_sk_p0 + +static void ggml_v3_compute_forward_pool_1d_sk_p0( + const struct ggml_v3_compute_params * params, + const enum ggml_v3_op_pool op, + const struct ggml_v3_tensor * src, + const int k, + struct ggml_v3_tensor * dst) { + assert(src->type == GGML_V3_TYPE_F32); + assert(params->ith == 0); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const char * cdata = (const char *)src->data; + const char * const data_end = cdata + ggml_v3_nbytes(src); + float * drow = (float *)dst->data; + + const int64_t rs = dst->ne[0]; + + while (cdata < data_end) { + const float * const srow = (const float *)cdata; + + int j = 0; + + for (int64_t i = 0; i < rs; ++i) { + switch (op) { + case GGML_V3_OP_POOL_AVG: drow[i] = 0; break; + case GGML_V3_OP_POOL_MAX: drow[i] = -FLT_MAX; break; + case GGML_V3_OP_POOL_COUNT: GGML_V3_ASSERT(false); break; + } + for (int ki = 0; ki < k; ++ki) { + switch (op) { + case GGML_V3_OP_POOL_AVG: drow[i] += srow[j]; break; + case GGML_V3_OP_POOL_MAX: if (srow[j] > drow[i]) drow[i] = srow[j]; break; + case GGML_V3_OP_POOL_COUNT: GGML_V3_ASSERT(false); break; + } + ++j; + } + switch (op) { + case GGML_V3_OP_POOL_AVG: drow[i] /= k; break; + case GGML_V3_OP_POOL_MAX: break; + case GGML_V3_OP_POOL_COUNT: GGML_V3_ASSERT(false); break; + } + } + + cdata += src->nb[1]; + drow += rs; + } +} + +// ggml_v3_compute_forward_pool_1d + +static void ggml_v3_compute_forward_pool_1d( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + + const int32_t * opts = (const int32_t *)dst->op_params; + enum ggml_v3_op_pool op = opts[0]; + const int k0 = opts[1]; + const int s0 = opts[2]; + const int p0 = opts[3]; + GGML_V3_ASSERT(p0 == 0); // padding not supported + GGML_V3_ASSERT(k0 == s0); // only s = k supported + + ggml_v3_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst); +} + +// ggml_v3_compute_forward_pool_2d + +static void ggml_v3_compute_forward_pool_2d( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src, + struct ggml_v3_tensor * dst) { + assert(src->type == GGML_V3_TYPE_F32); + assert(params->ith == 0); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int32_t * opts = (const int32_t *)dst->op_params; + enum ggml_v3_op_pool op = opts[0]; + const int k0 = opts[1]; + const int k1 = opts[2]; + const int s0 = opts[3]; + const int s1 = 
opts[4]; + const int p0 = opts[5]; + const int p1 = opts[6]; + const char * cdata = (const char*)src->data; + const char * const data_end = cdata + ggml_v3_nbytes(src); + + const int64_t px = dst->ne[0]; + const int64_t py = dst->ne[1]; + const int64_t pa = px * py; + + float * dplane = (float *)dst->data; + + const int ka = k0 * k1; + const int offset0 = -p0; + const int offset1 = -p1; + + while (cdata < data_end) { + for (int oy = 0; oy < py; ++oy) { + float * const drow = dplane + oy * px; + for (int ox = 0; ox < px; ++ox) { + float * const out = drow + ox; + switch (op) { + case GGML_V3_OP_POOL_AVG: *out = 0; break; + case GGML_V3_OP_POOL_MAX: *out = -FLT_MAX; break; + case GGML_V3_OP_POOL_COUNT: GGML_V3_ASSERT(false); break; + } + + const int ix = offset0 + ox * s0; + const int iy = offset1 + oy * s1; + + for (int ky = 0; ky < k1; ++ky) { + if (iy + ky < 0 || iy + ky >= src->ne[1]) continue; + const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky)); + for (int kx = 0; kx < k0; ++kx) { + int j = ix + kx; + if (j < 0 || j >= src->ne[0]) continue; + switch (op) { + case GGML_V3_OP_POOL_AVG: *out += srow[j]; break; + case GGML_V3_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break; + case GGML_V3_OP_POOL_COUNT: GGML_V3_ASSERT(false); break; + } + } + } + switch (op) { + case GGML_V3_OP_POOL_AVG: *out /= ka; break; + case GGML_V3_OP_POOL_MAX: break; + case GGML_V3_OP_POOL_COUNT: GGML_V3_ASSERT(false); break; + } + } + } + + cdata += src->nb[2]; + dplane += pa; + } +} + +// ggml_v3_compute_forward_upscale + +static void ggml_v3_compute_forward_upscale_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + GGML_V3_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + GGML_V3_TENSOR_UNARY_OP_LOCALS + + const int scale_factor = dst->op_params[0]; + + // TODO: optimize + + for (int64_t i3 = 0; i3 < ne3; i3++) { + const int64_t i03 = i3; + for (int64_t i2 = ith; i2 < ne2; i2 += nth) { + const int64_t i02 = i2; + for (int64_t i1 = 0; i1 < ne1; i1++) { + const int64_t i01 = i1 / scale_factor; + for (int64_t i0 = 0; i0 < ne0; i0++) { + const int64_t i00 = i0 / scale_factor; + + const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); + + *y = *x; + } + } + } + } +} + +static void ggml_v3_compute_forward_upscale( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_upscale_f32(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_pad + +static void ggml_v3_compute_forward_pad_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + GGML_V3_ASSERT(src0->nb[0] == sizeof(float)); + GGML_V3_ASSERT( dst->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + GGML_V3_TENSOR_UNARY_OP_LOCALS + + float * dst_ptr = (float *) dst->data; + + // TODO: optimize + + for (int64_t i2 = 0; i2 < ne2; ++i2) { + for (int64_t i1 = ith; i1 < ne1; i1 += 
nth) { + for (int64_t i0 = 0; i0 < ne0; ++i0) { + for (int64_t i3 = 0; i3 < ne3; ++i3) { + const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; + + const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + + if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { + dst_ptr[dst_idx] = *src_ptr; + } else { + dst_ptr[dst_idx] = 0; + } + } + } + } + } +} + +static void ggml_v3_compute_forward_pad( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_pad_f32(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_argsort + +static void ggml_v3_compute_forward_argsort_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + GGML_V3_TENSOR_UNARY_OP_LOCALS + + GGML_V3_ASSERT(nb0 == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t nr = ggml_v3_nrows(src0); + + enum ggml_v3_sort_order order = (enum ggml_v3_sort_order) ggml_v3_get_op_params_i32(dst, 0); + + for (int64_t i = ith; i < nr; i += nth) { + int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1); + const float * src_data = (float *)((char *) src0->data + i*nb01); + + for (int64_t j = 0; j < ne0; j++) { + dst_data[j] = j; + } + + // C doesn't have a functional sort, so we do a bubble sort instead + for (int64_t j = 0; j < ne0; j++) { + for (int64_t k = j + 1; k < ne0; k++) { + if ((order == GGML_V3_SORT_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) || + (order == GGML_V3_SORT_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) { + int32_t tmp = dst_data[j]; + dst_data[j] = dst_data[k]; + dst_data[k] = tmp; + } + } + } + } +} + +static void ggml_v3_compute_forward_argsort( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_argsort_f32(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_flash_attn + +static void ggml_v3_compute_forward_flash_attn_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * q, + const struct ggml_v3_tensor * k, + const struct ggml_v3_tensor * v, + const bool masked, + struct ggml_v3_tensor * dst) { + int64_t t0 = ggml_v3_perf_time_us(); + UNUSED(t0); + + GGML_V3_TENSOR_LOCALS(int64_t, neq, q, ne) + GGML_V3_TENSOR_LOCALS(size_t, nbq, q, nb) + GGML_V3_TENSOR_LOCALS(int64_t, nek, k, ne) + GGML_V3_TENSOR_LOCALS(size_t, nbk, k, nb) + GGML_V3_TENSOR_LOCALS(int64_t, nev, v, ne) + GGML_V3_TENSOR_LOCALS(size_t, nbv, v, nb) + GGML_V3_TENSOR_LOCALS(int64_t, ne, dst, ne) + GGML_V3_TENSOR_LOCALS(size_t, nb, dst, nb) + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t D = neq0; + const int64_t N = neq1; + const int64_t P = nek1 - N; + const int64_t M = P + N; + + const int Mup = ggml_v3_up(M, GGML_V3_SOFT_MAX_UNROLL); + + GGML_V3_ASSERT(ne0 == D); + GGML_V3_ASSERT(ne1 == N); + GGML_V3_ASSERT(P >= 0); + + GGML_V3_ASSERT(nbq0 == sizeof(float)); + GGML_V3_ASSERT(nbk0 == sizeof(float)); + GGML_V3_ASSERT(nbv0 == sizeof(float)); + + GGML_V3_ASSERT(neq0 == D); + 
GGML_V3_ASSERT(nek0 == D); + GGML_V3_ASSERT(nev1 == D); + + GGML_V3_ASSERT(neq1 == N); + GGML_V3_ASSERT(nek1 == N + P); + GGML_V3_ASSERT(nev1 == D); + + // dst cannot be transposed or permuted + GGML_V3_ASSERT(nb0 == sizeof(float)); + GGML_V3_ASSERT(nb0 <= nb1); + GGML_V3_ASSERT(nb1 <= nb2); + GGML_V3_ASSERT(nb2 <= nb3); + + if (params->type == GGML_V3_TASK_INIT) { + return; + } + + if (params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + // parallelize by q rows using ggml_v3_vec_dot_f32 + + // total rows in q + const int nr = neq1*neq2*neq3; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + const float scale = 1.0f/sqrtf(D); + + //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale); + + for (int ir = ir0; ir < ir1; ++ir) { + // q indices + const int iq3 = ir/(neq2*neq1); + const int iq2 = (ir - iq3*neq2*neq1)/neq1; + const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1); + + float * S = (float *) params->wdata + ith*(Mup + CACHE_LINE_SIZE_F32); + + for (int i = M; i < Mup; ++i) { + S[i] = -INFINITY; + } + + const int64_t masked_begin = masked ? (P + iq1 + 1) : M; + for (int64_t ic = 0; ic < masked_begin; ++ic) { + // k indices + const int ik3 = iq3; + const int ik2 = iq2 % nek2; + const int ik1 = ic; + + // S indices + const int i1 = ik1; + + ggml_v3_vec_dot_f32(neq0, + S + i1, + (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + } + + // scale + ggml_v3_vec_scale_f32(masked_begin, S, scale); + + for (int64_t i = masked_begin; i < M; i++) { + S[i] = -INFINITY; + } + + // softmax + // exclude known -INF S[..] values from max and loop + // dont forget to set their SW values to zero + { + float max = -INFINITY; + ggml_v3_vec_max_f32(masked_begin, &max, S); + + ggml_v3_float sum = 0.0; + { +#ifdef GGML_V3_SOFT_MAX_ACCELERATE + max = -max; + vDSP_vsadd(S, 1, &max, S, 1, Mup); + vvexpf(S, S, &Mup); + ggml_v3_vec_sum_f32(Mup, &sum, S); +#else + uint16_t scvt[GGML_V3_SOFT_MAX_UNROLL]; UNUSED(scvt); + ggml_v3_float sump[GGML_V3_SOFT_MAX_UNROLL] = { 0.0 }; + + for (int i = 0; i < Mup; i += GGML_V3_SOFT_MAX_UNROLL) { + if (i >= masked_begin) { + break; + } + float * SS = S + i; + + for (int j = 0; j < GGML_V3_SOFT_MAX_UNROLL; ++j) { + if (i + j >= masked_begin) { + break; + } else if (SS[j] == -INFINITY) { + SS[j] = 0.0f; + } else { +#ifndef GGML_V3_FLASH_ATTN_EXP_FP16 + const float val = expf(SS[j] - max); +#else + ggml_v3_fp16_t s = GGML_V3_FP32_TO_FP16(SS[j] - max); + memcpy(&scvt[j], &s, sizeof(uint16_t)); + const float val = GGML_V3_FP16_TO_FP32(ggml_v3_table_exp_f16[scvt[j]]); +#endif + sump[j] += (ggml_v3_float)val; + SS[j] = val; + } + } + } + + for (int i = 0; i < GGML_V3_SOFT_MAX_UNROLL; i++) { + sum += sump[i]; + } +#endif + } + + assert(sum > 0.0); + + sum = 1.0/sum; + ggml_v3_vec_scale_f32(masked_begin, S, sum); + +#ifndef NDEBUG + for (int i = 0; i < masked_begin; ++i) { + assert(!isnan(S[i])); + assert(!isinf(S[i])); + } +#endif + } + + for (int64_t ic = 0; ic < nev1; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + // v indices + const int iv2 = iq2 % nev2; + const int iv3 = iq3; + + ggml_v3_vec_dot_f32(masked_begin, + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), + (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), + S); + } + } +} + +static void 
ggml_v3_compute_forward_flash_attn_f16( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * q, + const struct ggml_v3_tensor * k, + const struct ggml_v3_tensor * v, + const bool masked, + struct ggml_v3_tensor * dst) { + int64_t t0 = ggml_v3_perf_time_us(); + UNUSED(t0); + + GGML_V3_TENSOR_LOCALS(int64_t, neq, q, ne) + GGML_V3_TENSOR_LOCALS(size_t, nbq, q, nb) + GGML_V3_TENSOR_LOCALS(int64_t, nek, k, ne) + GGML_V3_TENSOR_LOCALS(size_t, nbk, k, nb) + GGML_V3_TENSOR_LOCALS(int64_t, nev, v, ne) + GGML_V3_TENSOR_LOCALS(size_t, nbv, v, nb) + GGML_V3_TENSOR_LOCALS(int64_t, ne, dst, ne) + GGML_V3_TENSOR_LOCALS(size_t, nb, dst, nb) + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t D = neq0; + const int64_t N = neq1; + const int64_t P = nek1 - N; + const int64_t M = P + N; + + const int Mup = ggml_v3_up(M, GGML_V3_SOFT_MAX_UNROLL); + + GGML_V3_ASSERT(ne0 == D); + GGML_V3_ASSERT(ne1 == N); + GGML_V3_ASSERT(P >= 0); + + GGML_V3_ASSERT(nbq0 == sizeof(ggml_v3_fp16_t)); + GGML_V3_ASSERT(nbk0 == sizeof(ggml_v3_fp16_t)); + GGML_V3_ASSERT(nbv0 == sizeof(ggml_v3_fp16_t)); + + GGML_V3_ASSERT(neq0 == D); + GGML_V3_ASSERT(nek0 == D); + GGML_V3_ASSERT(nev1 == D); + + GGML_V3_ASSERT(neq1 == N); + GGML_V3_ASSERT(nek1 == N + P); + GGML_V3_ASSERT(nev1 == D); + + // dst cannot be transposed or permuted + GGML_V3_ASSERT(nb0 == sizeof(float)); + GGML_V3_ASSERT(nb0 <= nb1); + GGML_V3_ASSERT(nb1 <= nb2); + GGML_V3_ASSERT(nb2 <= nb3); + + if (params->type == GGML_V3_TASK_INIT) { + return; + } + + if (params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + // parallelize by q rows using ggml_v3_vec_dot_f32 + + // total rows in q + const int nr = neq1*neq2*neq3; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + const float scale = 1.0f/sqrtf(D); + + //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale); + + for (int ir = ir0; ir < ir1; ++ir) { + // q indices + const int iq3 = ir/(neq2*neq1); + const int iq2 = (ir - iq3*neq2*neq1)/neq1; + const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1); + + float * S = (float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32); + + for (int i = M; i < Mup; ++i) { + S[i] = -INFINITY; + } + + if (GGML_V3_VEC_DOT_UNROLL > 2 || nek1 % GGML_V3_VEC_DOT_UNROLL != 0) { + for (int64_t ic = 0; ic < nek1; ++ic) { + // k indices + const int ik3 = iq3; + const int ik2 = iq2 % nek2; + const int ik1 = ic; + + // S indices + const int i1 = ik1; + + ggml_v3_vec_dot_f16(neq0, + S + i1, + (ggml_v3_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), + (ggml_v3_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + } + } else { + for (int64_t ic = 0; ic < nek1; ic += GGML_V3_VEC_DOT_UNROLL) { + // k indices + const int ik3 = iq3; + const int ik2 = iq2 % nek2; + const int ik1 = ic; + + // S indices + const int i1 = ik1; + + ggml_v3_vec_dot_f16_unroll(neq0, nbk1, + S + i1, + ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), + (ggml_v3_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + } + } + + // scale + ggml_v3_vec_scale_f32(nek1, S, scale); + + if (masked) { + for (int64_t i = P; i < M; i++) { + if (i > P + iq1) { + S[i] = -INFINITY; + } + } + } + + // softmax + // todo: exclude known -INF S[..] values from max and loop, assuming their results to be zero. 
+ // dont forget to set their S values to zero + { + float max = -INFINITY; + ggml_v3_vec_max_f32(M, &max, S); + + ggml_v3_float sum = 0.0; + { +#ifdef GGML_V3_SOFT_MAX_ACCELERATE + max = -max; + vDSP_vsadd(S, 1, &max, S, 1, Mup); + vvexpf(S, S, &Mup); + ggml_v3_vec_sum_f32(Mup, &sum, S); +#else + uint16_t scvt[GGML_V3_SOFT_MAX_UNROLL]; + ggml_v3_float sump[GGML_V3_SOFT_MAX_UNROLL] = { 0.0 }; + + for (int i = 0; i < Mup; i += GGML_V3_SOFT_MAX_UNROLL) { + float * SS = S + i; + + for (int j = 0; j < GGML_V3_SOFT_MAX_UNROLL; ++j) { + if (SS[j] == -INFINITY) { + SS[j] = 0.0f; + } else { + ggml_v3_fp16_t s = GGML_V3_FP32_TO_FP16(SS[j] - max); + memcpy(&scvt[j], &s, sizeof(uint16_t)); + const float val = GGML_V3_FP16_TO_FP32(ggml_v3_table_exp_f16[scvt[j]]); + sump[j] += (ggml_v3_float)val; + SS[j] = val; + } + } + } + + for (int i = 0; i < GGML_V3_SOFT_MAX_UNROLL; i++) { + sum += sump[i]; + } +#endif + } + + assert(sum > 0.0); + + sum = 1.0/sum; + ggml_v3_vec_scale_f32(M, S, sum); + +#ifndef NDEBUG + for (int i = 0; i < M; ++i) { + assert(!isnan(S[i])); + assert(!isinf(S[i])); + } +#endif + } + + ggml_v3_fp16_t * S16 = (ggml_v3_fp16_t *) ((float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32) + Mup); + + for (int64_t i = 0; i < M; i++) { + S16[i] = GGML_V3_FP32_TO_FP16(S[i]); + } + + // todo: exclude known zero S[..] values from dot (reducing nev0 and increasing begin of v and S16). + if (GGML_V3_VEC_DOT_UNROLL == 1 || (nev1 % GGML_V3_VEC_DOT_UNROLL != 0)) { + for (int64_t ic = 0; ic < nev1; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + // v indices + const int iv2 = iq2 % nev2; + const int iv3 = iq3; + + ggml_v3_vec_dot_f16(nev0, + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), + (ggml_v3_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), + S16); + } + } else { + for (int64_t ic = 0; ic < nev1; ic += GGML_V3_VEC_DOT_UNROLL) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + // v indices + const int iv2 = iq2 % nev2; + const int iv3 = iq3; + + ggml_v3_vec_dot_f16_unroll(nev0, nbv1, + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), + ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), + S16); + } + } + } +} + +static void ggml_v3_compute_forward_flash_attn( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * q, + const struct ggml_v3_tensor * k, + const struct ggml_v3_tensor * v, + const bool masked, + struct ggml_v3_tensor * dst) { + switch (q->type) { + case GGML_V3_TYPE_F16: + { + ggml_v3_compute_forward_flash_attn_f16(params, q, k, v, masked, dst); + } break; + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_flash_attn_f32(params, q, k, v, masked, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_flash_ff + +static void ggml_v3_compute_forward_flash_ff_f16( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * a, // F16 + const struct ggml_v3_tensor * b0, // F16 fc_w + const struct ggml_v3_tensor * b1, // F32 fc_b + const struct ggml_v3_tensor * c0, // F16 proj_w + const struct ggml_v3_tensor * c1, // F32 proj_b + struct ggml_v3_tensor * dst) { + int64_t t0 = ggml_v3_perf_time_us(); + UNUSED(t0); + + GGML_V3_TENSOR_LOCALS(int64_t, nea, a, ne) + GGML_V3_TENSOR_LOCALS(size_t, nba, a, nb) + GGML_V3_TENSOR_LOCALS(int64_t, neb0, b0, ne) + GGML_V3_TENSOR_LOCALS(size_t, nbb0, b0, nb) + GGML_V3_TENSOR_LOCALS(int64_t, neb1, b1, ne) + 
GGML_V3_TENSOR_LOCALS(size_t, nbb1, b1, nb) + GGML_V3_TENSOR_LOCALS(int64_t, nec0, c0, ne) + GGML_V3_TENSOR_LOCALS(size_t, nbc0, c0, nb) + GGML_V3_TENSOR_LOCALS(int64_t, nec1, c1, ne) + GGML_V3_TENSOR_LOCALS(size_t, nbc1, c1, nb) + GGML_V3_TENSOR_LOCALS(int64_t, ne, dst, ne) + GGML_V3_TENSOR_LOCALS(size_t, nb, dst, nb) + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t D = nea0; + //const int64_t N = nea1; + const int64_t M = neb01; + + GGML_V3_ASSERT(ne0 == nea0); + GGML_V3_ASSERT(ne1 == nea1); + GGML_V3_ASSERT(ne2 == nea2); + + GGML_V3_ASSERT(nba0 == sizeof(ggml_v3_fp16_t)); + GGML_V3_ASSERT(nbb00 == sizeof(ggml_v3_fp16_t)); + GGML_V3_ASSERT(nbb10 == sizeof(float)); + GGML_V3_ASSERT(nbc00 == sizeof(ggml_v3_fp16_t)); + GGML_V3_ASSERT(nbc10 == sizeof(float)); + + GGML_V3_ASSERT(neb00 == D); + GGML_V3_ASSERT(neb01 == M); + GGML_V3_ASSERT(neb10 == M); + GGML_V3_ASSERT(neb11 == 1); + + GGML_V3_ASSERT(nec00 == M); + GGML_V3_ASSERT(nec01 == D); + GGML_V3_ASSERT(nec10 == D); + GGML_V3_ASSERT(nec11 == 1); + + // dst cannot be transposed or permuted + GGML_V3_ASSERT(nb0 == sizeof(float)); + GGML_V3_ASSERT(nb0 <= nb1); + GGML_V3_ASSERT(nb1 <= nb2); + GGML_V3_ASSERT(nb2 <= nb3); + + if (params->type == GGML_V3_TASK_INIT) { + return; + } + + if (params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + // parallelize by a rows using ggml_v3_vec_dot_f32 + + // total rows in a + const int nr = nea1*nea2*nea3; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // a indices + const int ia3 = ir/(nea2*nea1); + const int ia2 = (ir - ia3*nea2*nea1)/nea1; + const int ia1 = (ir - ia3*nea2*nea1 - ia2*nea1); + + float * S = (float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32); + + for (int64_t ic = 0; ic < neb01; ++ic) { + // b0 indices + const int ib03 = ia3; + const int ib02 = ia2; + const int ib01 = ic; + + // S indices + const int i1 = ib01; + + ggml_v3_vec_dot_f16(nea0, + S + i1, + (ggml_v3_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), + (ggml_v3_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3))); + } + + ggml_v3_vec_add_f32(neb01, S, S, (float *) b1->data); + //ggml_v3_vec_gelu_f32(neb01, S, S); + + ggml_v3_fp16_t * S16 = (ggml_v3_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M); + + for (int64_t i = 0; i < M; i++) { + S16[i] = GGML_V3_FP32_TO_FP16(S[i]); + } + + ggml_v3_vec_gelu_f16(neb01, S16, S16); + + { + // dst indices + const int i1 = ia1; + const int i2 = ia2; + const int i3 = ia3; + + for (int64_t ic = 0; ic < nec01; ++ic) { + + ggml_v3_vec_dot_f16(neb01, + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), + (ggml_v3_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), + S16); + } + + ggml_v3_vec_add_f32(nec01, + (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)), + (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)), + (float *) c1->data); + } + } +} + +static void ggml_v3_compute_forward_flash_ff( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * a, + const struct ggml_v3_tensor * b0, + const struct ggml_v3_tensor * b1, + const struct ggml_v3_tensor * c0, + const struct ggml_v3_tensor * c1, + struct ggml_v3_tensor * dst) { + switch (b0->type) { + case GGML_V3_TYPE_F16: + { + ggml_v3_compute_forward_flash_ff_f16(params, a, b0, b1, c0, c1, dst); + } break; + case 
GGML_V3_TYPE_F32: + { + GGML_V3_ASSERT(false); // TODO + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_flash_attn_back + +static void ggml_v3_compute_forward_flash_attn_back_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * q, + const struct ggml_v3_tensor * k, + const struct ggml_v3_tensor * v, + const struct ggml_v3_tensor * d, + const bool masked, + struct ggml_v3_tensor * dst) { + int64_t t0 = ggml_v3_perf_time_us(); + UNUSED(t0); + + GGML_V3_TENSOR_LOCALS(int64_t, neq, q, ne) + GGML_V3_TENSOR_LOCALS(size_t, nbq, q, nb) + GGML_V3_TENSOR_LOCALS(int64_t, nek, k, ne) + GGML_V3_TENSOR_LOCALS(size_t, nbk, k, nb) + GGML_V3_TENSOR_LOCALS(int64_t, nev, v, ne) + GGML_V3_TENSOR_LOCALS(size_t, nbv, v, nb) + GGML_V3_TENSOR_LOCALS(int64_t, ned, d, ne) + GGML_V3_TENSOR_LOCALS(size_t, nbd, d, nb) + GGML_V3_TENSOR_LOCALS(int64_t, ne, dst, ne) + GGML_V3_TENSOR_LOCALS(size_t, nb, dst, nb) + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t D = neq0; + const int64_t N = neq1; + const int64_t P = nek1 - N; + const int64_t M = P + N; + + const int Mup = ggml_v3_up(M, GGML_V3_SOFT_MAX_UNROLL); + const int mxDM = MAX(D, Mup); + + // GGML_V3_ASSERT(ne0 == D); + // GGML_V3_ASSERT(ne1 == N); + GGML_V3_ASSERT(P >= 0); + + GGML_V3_ASSERT(nbq0 == sizeof(float)); + GGML_V3_ASSERT(nbk0 == sizeof(float)); + GGML_V3_ASSERT(nbv0 == sizeof(float)); + + GGML_V3_ASSERT(neq0 == D); + GGML_V3_ASSERT(nek0 == D); + GGML_V3_ASSERT(nev1 == D); + GGML_V3_ASSERT(ned0 == D); + + GGML_V3_ASSERT(neq1 == N); + GGML_V3_ASSERT(nek1 == N + P); + GGML_V3_ASSERT(nev1 == D); + GGML_V3_ASSERT(ned1 == N); + + // dst cannot be transposed or permuted + GGML_V3_ASSERT(nb0 == sizeof(float)); + GGML_V3_ASSERT(nb0 <= nb1); + GGML_V3_ASSERT(nb1 <= nb2); + GGML_V3_ASSERT(nb2 <= nb3); + + if (params->type == GGML_V3_TASK_INIT) { + if (ith == 0) { + memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3); + } + return; + } + + if (params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int64_t elem_q = ggml_v3_nelements(q); + const int64_t elem_k = ggml_v3_nelements(k); + + enum ggml_v3_type result_type = dst->type; + GGML_V3_ASSERT(ggml_v3_blck_size(result_type) == 1); + const size_t tsize = ggml_v3_type_size(result_type); + + const size_t offs_q = 0; + const size_t offs_k = offs_q + GGML_V3_PAD(elem_q * tsize, GGML_V3_MEM_ALIGN); + const size_t offs_v = offs_k + GGML_V3_PAD(elem_k * tsize, GGML_V3_MEM_ALIGN); + + void * grad_q = (char *) dst->data; + void * grad_k = (char *) dst->data + offs_k; + void * grad_v = (char *) dst->data + offs_v; + + const size_t nbgq1 = nb0*neq0; + const size_t nbgq2 = nb0*neq0*neq1; + const size_t nbgq3 = nb0*neq0*neq1*neq2; + + const size_t nbgk1 = nb0*nek0; + const size_t nbgk2 = nb0*nek0*nek1; + const size_t nbgk3 = nb0*nek0*nek1*neq2; + + const size_t nbgv1 = nb0*nev0; + const size_t nbgv2 = nb0*nev0*nev1; + const size_t nbgv3 = nb0*nev0*nev1*neq2; + + // parallelize by k rows using ggml_v3_vec_dot_f32 + + // total rows in k + const int nr = nek2*nek3; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + const float scale = 1.0f/sqrtf(D); + + //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale); + + // how often k2 (and v2) is repeated in q2 + int nrep = neq2/nek2; + + for (int ir = ir0; ir < ir1; ++ir) { + // q indices + const int ik3 = ir/(nek2); + const int ik2 = ir - 
ik3*nek2; + + const int iq3 = ik3; + const int id3 = ik3; + const int iv3 = ik3; + const int iv2 = ik2; + + for (int irep = 0; irep < nrep; ++irep) { + const int iq2 = ik2 + irep*nek2; + const int id2 = iq2; + + // (ik2 + irep*nek2) % nek2 == ik2 + for (int iq1 = 0; iq1 < neq1; ++iq1) { + const int id1 = iq1; + + // not sure about CACHE_LINE_SIZE_F32.. + // - maybe it must not be multiplied by 2 and excluded from .. in SM 1*(..) offset? + float * S = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 0*(mxDM+CACHE_LINE_SIZE_F32); + float * SM = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 1*(mxDM+CACHE_LINE_SIZE_F32); + + for (int i = M; i < Mup; ++i) { + S[i] = -INFINITY; + } + + const int64_t masked_begin = masked ? (P + iq1 + 1) : M; + for (int64_t ic = 0; ic < masked_begin; ++ic) { + // k indices + const int ik1 = ic; + + // S indices + const int i1 = ik1; + + ggml_v3_vec_dot_f32(neq0, + S + i1, + (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + } + + // scale + ggml_v3_vec_scale_f32(masked_begin, S, scale); + + for (int64_t i = masked_begin; i < M; i++) { + S[i] = -INFINITY; + } + + // softmax + // exclude known -INF S[..] values from max and loop + // dont forget to set their SM values to zero + { + float max = -INFINITY; + ggml_v3_vec_max_f32(masked_begin, &max, S); + + ggml_v3_float sum = 0.0; + { +#ifdef GGML_V3_SOFT_MAX_ACCELERATE + max = -max; + vDSP_vsadd(SM, 1, &max, SM, 1, Mup); + vvexpf(SM, SM, &Mup); + ggml_v3_vec_sum_f32(Mup, &sum, SM); +#else + uint16_t scvt[GGML_V3_SOFT_MAX_UNROLL]; UNUSED(scvt); + ggml_v3_float sump[GGML_V3_SOFT_MAX_UNROLL] = { 0.0 }; + + for (int i = 0; i < Mup; i += GGML_V3_SOFT_MAX_UNROLL) { + if (i >= masked_begin) { + break; + } + float * SR = S + i; + float * SW = SM + i; + + for (int j = 0; j < GGML_V3_SOFT_MAX_UNROLL; ++j) { + if (i + j >= masked_begin) { + break; + } else if (SR[j] == -INFINITY) { + SW[j] = 0.0f; + } else { +#ifndef GGML_V3_FLASH_ATTN_EXP_FP16 + const float val = expf(SR[j] - max); +#else + ggml_v3_fp16_t s = GGML_V3_FP32_TO_FP16(SR[j] - max); + memcpy(&scvt[j], &s, sizeof(uint16_t)); + const float val = GGML_V3_FP16_TO_FP32(ggml_v3_table_exp_f16[scvt[j]]); +#endif + sump[j] += (ggml_v3_float)val; + SW[j] = val; + } + } + } + + for (int i = 0; i < GGML_V3_SOFT_MAX_UNROLL; i++) { + sum += sump[i]; + } +#endif + } + + assert(sum > 0.0); + + sum = 1.0/sum; + ggml_v3_vec_scale_f32(masked_begin, SM, sum); + + } + + // step-by-step explanation + { + // forward-process shape grads from backward process + // parallel_for ik2,ik3: + // for irep: + // iq2 = ik2 + irep*nek2 + // k[:D,:M,:,:] [D,M,:,:] grad[k][:D,:M,ik2,ik3] += grad[kcur] + // q[:D,:N,:,:] [D,N,:,:] grad[q][:D,iq1,iq2,iq3] += grad[qcur] + // v[:M,:D,:,:] [M,D,:,:] grad[v][:M,:D,iv2,iv3] += grad[vcur] + // for iq1: + // kcur = k[:D,:M,ik2,ik3] [D,M,1,1] grad[kcur] = grad[S1].T @ qcur + // qcur = q[:D,iq1,iq2,iq3] [D,1,1,1] grad[qcur] = grad[S1] @ kcur + // vcur = v[:M,:D,iv2,iv3] [M,D,1,1] grad[vcur] = grad[S5].T @ S4 + // S0 = -Inf [D,1,1,1] + // ~S1[i] = dot(kcur[:D,i], qcur) + // S1 = qcur @ kcur.T [M,1,1,1] grad[S1] = grad[S2] * scale + // S2 = S1 * scale [M,1,1,1] grad[S2] = diag_mask_zero(grad[S3], P) + // S3 = diag_mask_inf(S2, P) [M,1,1,1] grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // S4 = softmax(S3) [M,1,1,1] grad[S4] = grad[S5] @ vcur + // ~S5[i] = dot(vcur[:,i], S4) + // S5 = S4 @ vcur.T [D,1,1,1] grad[S5] = d[:D,id1,id2,id3] + // 
~dst[i,iq1,iq2,iq3] = S5[i] ^ + // dst[:D,iq1,iq2,iq3] = S5 | grad[dst[:D,iq1,iq2,iq3]] = d[:D,id1,id2,id3] + // dst backward-/ grad[dst] = d + // + // output gradients with their dependencies: + // + // grad[kcur] = grad[S1].T @ qcur + // grad[S1] = diag_mask_zero(grad[S3], P) * scale + // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // grad[S4] = grad[S5] @ vcur + // grad[S4] = d[:D,id1,id2,id3] @ vcur + // grad[qcur] = grad[S1] @ kcur + // grad[vcur] = grad[S5].T @ S4 + // grad[vcur] = d[:D,id1,id2,id3].T @ S4 + // + // in post-order: + // + // S1 = qcur @ kcur.T + // S2 = S1 * scale + // S3 = diag_mask_inf(S2, P) + // S4 = softmax(S3) + // grad[S4] = d[:D,id1,id2,id3] @ vcur + // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // grad[S1] = diag_mask_zero(grad[S3], P) * scale + // grad[qcur] = grad[S1] @ kcur + // grad[kcur] = grad[S1].T @ qcur + // grad[vcur] = d[:D,id1,id2,id3].T @ S4 + // + // using less variables (SM=S4): + // + // S = diag_mask_inf(qcur @ kcur.T * scale, P) + // SM = softmax(S) + // S = d[:D,iq1,iq2,iq3] @ vcur + // dot_SM_gradSM = dot(SM, S) + // S = SM * (S - dot(SM, S)) + // S = diag_mask_zero(S, P) * scale + // + // grad[q][:D,iq1,iq2,iq3] += S @ kcur + // grad[k][:D,:M,ik2,ik3] += S.T @ qcur + // grad[v][:M,:D,iv2,iv3] += d[:D,id1,id2,id3].T @ SM + } + + // S = gradSM = d[:D,id1,id2,id3] @ vcur[:,:,iv2,iv3] + // S = d[:D,id1,id2,id3] @ vcur[:,:,iv2,iv3] + // for ic: + // S[:M] += vcur[:M,ic,iv2,iv3] * d[ic,id1,id2,id3] + // exclude known future zero S[..] values from operation + ggml_v3_vec_set_f32(masked_begin, S, 0); + for (int64_t ic = 0; ic < D; ++ic) { + ggml_v3_vec_mad_f32(masked_begin, + S, + (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), + *(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2 + id3*nbd3))); + } + + // S = SM * (S - dot(SM, S)) + float dot_SM_gradSM = 0; + ggml_v3_vec_dot_f32 (masked_begin, &dot_SM_gradSM, SM, S); + ggml_v3_vec_acc1_f32(M, S, -dot_SM_gradSM); + ggml_v3_vec_mul_f32 (masked_begin, S, S, SM); + + // S = diag_mask_zero(S, P) * scale + // already done by above ggml_v3_vec_set_f32 + + // exclude known zero S[..] values from operation + ggml_v3_vec_scale_f32(masked_begin, S, scale); + + // S shape [M,1] + // SM shape [M,1] + // kcur shape [D,M] + // qcur shape [D,1] + // vcur shape [M,D] + + // grad[q][:D,iq1,iq2,iq3] += S @ kcur + // grad[q][:D,iq1,iq2,iq3] += shape[M,1] @ shape[D,M] + // for ic: + // grad[q][:D,iq1,iq2,iq3] += S[ic] * kcur[:D,ic,ik2,ik3] + // exclude known zero S[..] values from loop + for (int64_t ic = 0; ic < masked_begin; ++ic) { + ggml_v3_vec_mad_f32(D, + (float *) ((char *) grad_q + (iq1*nbgq1 + iq2*nbgq2 + iq3*nbgq3)), + (float *) ((char *) k->data + (ic*nbk1 + ik2*nbk2 + ik3*nbk3)), + S[ic]); + } + + // grad[k][:D,:M,iq2,iq3] += S.T @ qcur + // for ic: + // grad[k][:D,ic,iq2,iq3] += S.T[0,ic] * qcur[:D,0] + // grad[k][:D,ic,iq2,iq3] += S[ic] * qcur[:D,0] + // exclude known zero S[..] values from loop + for (int64_t ic = 0; ic < masked_begin; ++ic) { + ggml_v3_vec_mad_f32(D, + (float *) ((char *) grad_k + (ic*nbgk1 + ik2*nbgk2 + ik3*nbgk3)), + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), + S[ic]); + } + + // grad[v][:M,:D,iv2,iv3] += d[:D,id1,id2,id3].T @ SM + // for ic: + // grad[v][:M,ic,iv2,iv3] += d[:D,id1,id2,id3].T[0,ic] * SM[:M] + // grad[v][:M,ic,iv2,iv3] += d[ic,id1,id2,id3] * SM[:M] + // exclude known zero SM[..] 
values from mad + for (int64_t ic = 0; ic < D; ++ic) { + ggml_v3_vec_mad_f32(masked_begin, + (float *) ((char *) grad_v + ( ic*nbgv1 + iv2*nbgv2 + iv3*nbgv3)), + SM, + *(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2 + id3*nbd3))); + } + } + } + } +} + +static void ggml_v3_compute_forward_flash_attn_back( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * q, + const struct ggml_v3_tensor * k, + const struct ggml_v3_tensor * v, + const struct ggml_v3_tensor * d, + const bool masked, + struct ggml_v3_tensor * dst) { + switch (q->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_flash_attn_back_f32(params, q, k, v, d, masked, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_win_part + +static void ggml_v3_compute_forward_win_part_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + GGML_V3_TENSOR_LOCALS(int64_t, ne0, src0, ne) + GGML_V3_TENSOR_LOCALS(int64_t, ne, dst, ne) + + const int32_t nep0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t nep1 = ((const int32_t *)(dst->op_params))[1]; + const int32_t w = ((const int32_t *)(dst->op_params))[2]; + + assert(ne00 == ne0); + assert(ne3 == nep0*nep1); + + // TODO: optimize / multi-thread + for (int py = 0; py < nep1; ++py) { + for (int px = 0; px < nep0; ++px) { + const int64_t i3 = py*nep0 + px; + for (int64_t i2 = 0; i2 < ne2; ++i2) { + for (int64_t i1 = 0; i1 < ne1; ++i1) { + for (int64_t i0 = 0; i0 < ne0; ++i0) { + const int64_t i02 = py*w + i2; + const int64_t i01 = px*w + i1; + const int64_t i00 = i0; + + const int64_t i = i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + i0; + const int64_t j = i02*ne01*ne00 + i01*ne00 + i00; + + if (py*w + i2 >= ne02 || px*w + i1 >= ne01) { + ((float *) dst->data)[i] = 0.0f; + } else { + ((float *) dst->data)[i] = ((float *) src0->data)[j]; + } + } + } + } + } + } +} + +static void ggml_v3_compute_forward_win_part( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_win_part_f32(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_win_unpart + +static void ggml_v3_compute_forward_win_unpart_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + GGML_V3_TENSOR_LOCALS(int64_t, ne0, src0, ne) + GGML_V3_TENSOR_LOCALS(int64_t, ne, dst, ne) + + const int32_t w = ((const int32_t *)(dst->op_params))[0]; + + // padding + const int px = (w - ne1%w)%w; + //const int py = (w - ne2%w)%w; + + const int npx = (px + ne1)/w; + //const int npy = (py + ne2)/w; + + assert(ne0 == ne00); + + // TODO: optimize / multi-thread + for (int64_t i2 = 0; i2 < ne2; ++i2) { + for (int64_t i1 = 0; i1 < ne1; ++i1) { + for (int64_t i0 = 0; i0 < ne0; ++i0) { + const int ip2 = i2/w; + const int ip1 = i1/w; + + const int64_t i02 = i2%w; + const int64_t i01 = i1%w; + const int64_t i00 = i0; + + const int64_t i = (ip2*npx + ip1)*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00 + i00; + const int64_t j = i2*ne1*ne0 + i1*ne0 + i0; + + ((float *) dst->data)[j] = ((float *) src0->data)[i]; + } + } + } 
+} + +static void ggml_v3_compute_forward_win_unpart( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_win_unpart_f32(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +//gmml_compute_forward_unary + +static void ggml_v3_compute_forward_unary( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + const enum ggml_v3_unary_op op = ggml_v3_get_unary_op(dst); + + switch (op) { + case GGML_V3_UNARY_OP_ABS: + { + ggml_v3_compute_forward_abs(params, src0, dst); + } break; + case GGML_V3_UNARY_OP_SGN: + { + ggml_v3_compute_forward_sgn(params, src0, dst); + } break; + case GGML_V3_UNARY_OP_NEG: + { + ggml_v3_compute_forward_neg(params, src0, dst); + } break; + case GGML_V3_UNARY_OP_STEP: + { + ggml_v3_compute_forward_step(params, src0, dst); + } break; + case GGML_V3_UNARY_OP_TANH: + { + ggml_v3_compute_forward_tanh(params, src0, dst); + } break; + case GGML_V3_UNARY_OP_ELU: + { + ggml_v3_compute_forward_elu(params, src0, dst); + } break; + case GGML_V3_UNARY_OP_RELU: + { + ggml_v3_compute_forward_relu(params, src0, dst); + } break; + case GGML_V3_UNARY_OP_GELU: + { + ggml_v3_compute_forward_gelu(params, src0, dst); + } break; + case GGML_V3_UNARY_OP_GELU_QUICK: + { + ggml_v3_compute_forward_gelu_quick(params, src0, dst); + } break; + case GGML_V3_UNARY_OP_SILU: + { + ggml_v3_compute_forward_silu(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_get_rel_pos + +static void ggml_v3_compute_forward_get_rel_pos_f16( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322 + + GGML_V3_TENSOR_UNARY_OP_LOCALS + + const int64_t w = ne1; + + ggml_v3_fp16_t * src0_data = (ggml_v3_fp16_t *) src0->data; + ggml_v3_fp16_t * dst_data = (ggml_v3_fp16_t *) dst->data; + + for (int64_t i2 = 0; i2 < ne2; ++i2) { + for (int64_t i1 = 0; i1 < ne1; ++i1) { + const int64_t pos = (w - i1 - 1) + i2; + for (int64_t i0 = 0; i0 < ne0; ++i0) { + dst_data[i2*ne1*ne0 + i1*ne0 + i0] = src0_data[pos*ne00 + i0]; + } + } + } +} + +static void ggml_v3_compute_forward_get_rel_pos( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F16: + { + ggml_v3_compute_forward_get_rel_pos_f16(params, src0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_add_rel_pos + +static void ggml_v3_compute_forward_add_rel_pos_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + const struct ggml_v3_tensor * src2, + struct ggml_v3_tensor * dst) { + + const bool inplace = (bool) ((int32_t *) dst->op_params)[0]; + if (!inplace && params->type == GGML_V3_TASK_INIT) { + memcpy((char *) dst->data, (char *) src0->data, ggml_v3_nbytes(dst)); + return; + } + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + int64_t t0 = ggml_v3_perf_time_us(); + UNUSED(t0); + + // ref: 
https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359 + + float * src1_data = (float *) src1->data; + float * src2_data = (float *) src2->data; + float * dst_data = (float *) dst->data; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + const int ith = params->ith; + const int nth = params->nth; + + // total patches in dst + const int np = ne13; + + // patches per thread + const int dp = (np + nth - 1)/nth; + + // patch range for this thread + const int ip0 = dp*ith; + const int ip1 = MIN(ip0 + dp, np); + + for (int64_t i13 = ip0; i13 < ip1; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + const int64_t jp1 = i13*ne12*ne11*ne10 + i12*ne11*ne10 + i11*ne10; + for (int64_t i10 = 0; i10 < ne10; ++i10) { + const int64_t jp0 = jp1 + i10; + const float src1_e = src1_data[jp0]; + const float src2_e = src2_data[jp0]; + + const int64_t jdh = jp0 * ne10; + const int64_t jdw = jdh - (ne10 - 1) * i10; + + for (int64_t j = 0; j < ne10; ++j) { + dst_data[jdh + j ] += src2_e; + dst_data[jdw + j*ne10] += src1_e; + } + } + } + } + } +} + +static void ggml_v3_compute_forward_add_rel_pos( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + const struct ggml_v3_tensor * src2, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_add_rel_pos_f32(params, src0, src1, src2, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_map_unary + +static void ggml_v3_compute_forward_map_unary_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst, + const ggml_v3_unary_op_f32_t fun) { + GGML_V3_ASSERT(ggml_v3_are_same_shape(src0, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int n = ggml_v3_nrows(src0); + const int nc = src0->ne[0]; + + assert( dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + fun(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_v3_compute_forward_map_unary( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + struct ggml_v3_tensor * dst, + const ggml_v3_unary_op_f32_t fun) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_map_unary_f32(params, src0, dst, fun); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_map_binary + +static void ggml_v3_compute_forward_map_binary_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst, + const ggml_v3_binary_op_f32_t fun) { + assert(params->ith == 0); + assert(ggml_v3_are_same_shape(src0, src1) && ggml_v3_are_same_shape(src0, dst)); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const int n = ggml_v3_nrows(src0); + const int nc = src0->ne[0]; + + assert( dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + assert(src1->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + fun(nc, + (float *) ((char *) 
dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1])), + (float *) ((char *) src1->data + i*(src1->nb[1]))); + } +} + +static void ggml_v3_compute_forward_map_binary( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst, + const ggml_v3_binary_op_f32_t fun) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_map_binary_f32(params, src0, src1, dst, fun); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_map_custom1 + +static void ggml_v3_compute_forward_map_custom1_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * a, + struct ggml_v3_tensor * dst, + const ggml_v3_custom1_op_f32_t fun) { + assert(params->ith == 0); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + fun(dst, a); +} + +// ggml_v3_compute_forward_map_custom2 + +static void ggml_v3_compute_forward_map_custom2_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * a, + const struct ggml_v3_tensor * b, + struct ggml_v3_tensor * dst, + const ggml_v3_custom2_op_f32_t fun) { + assert(params->ith == 0); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + fun(dst, a, b); +} + +// ggml_v3_compute_forward_map_custom3 + +static void ggml_v3_compute_forward_map_custom3_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * a, + const struct ggml_v3_tensor * b, + const struct ggml_v3_tensor * c, + struct ggml_v3_tensor * dst, + const ggml_v3_custom3_op_f32_t fun) { + assert(params->ith == 0); + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + fun(dst, a, b, c); +} + +// ggml_v3_compute_forward_map_custom1 + +static void ggml_v3_compute_forward_map_custom1( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * a, + struct ggml_v3_tensor * dst) { + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + struct ggml_v3_map_custom1_op_params * p = (struct ggml_v3_map_custom1_op_params *) dst->op_params; + + p->fun(dst, a, params->ith, params->nth, p->userdata); +} + +// ggml_v3_compute_forward_map_custom2 + +static void ggml_v3_compute_forward_map_custom2( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * a, + const struct ggml_v3_tensor * b, + struct ggml_v3_tensor * dst) { + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + struct ggml_v3_map_custom2_op_params * p = (struct ggml_v3_map_custom2_op_params *) dst->op_params; + + p->fun(dst, a, b, params->ith, params->nth, p->userdata); +} + +// ggml_v3_compute_forward_map_custom3 + +static void ggml_v3_compute_forward_map_custom3( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * a, + const struct ggml_v3_tensor * b, + const struct ggml_v3_tensor * c, + struct ggml_v3_tensor * dst) { + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + struct ggml_v3_map_custom3_op_params * p = (struct ggml_v3_map_custom3_op_params *) dst->op_params; + + p->fun(dst, a, b, c, params->ith, params->nth, p->userdata); +} + +// ggml_v3_compute_forward_cross_entropy_loss + +static void ggml_v3_compute_forward_cross_entropy_loss_f32( + const 
struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(ggml_v3_is_contiguous(src0)); + GGML_V3_ASSERT(ggml_v3_is_contiguous(src1)); + GGML_V3_ASSERT(ggml_v3_is_scalar(dst)); + GGML_V3_ASSERT(ggml_v3_are_same_shape(src0, src1)); + + const int ith = params->ith; + const int nth = params->nth; + + float * sums = (float *) params->wdata; + + // TODO: handle transposed/permuted matrices + const int nc = src0->ne[0]; + const int nr = ggml_v3_nrows(src0); + + GGML_V3_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc)); + + if (params->type == GGML_V3_TASK_INIT) { + if (ith == 0) { + memset(sums, 0, sizeof(float) * (nth + nth * nc)); + } + return; + } + + if (params->type == GGML_V3_TASK_FINALIZE) { + if (ith == 0) { + float * dp = (float *) dst->data; + ggml_v3_vec_sum_f32(nth, dp, sums); + dp[0] *= -1.0f / (float) nr; + } + return; + } + + const double eps = 1e-9; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]); + float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]); + float * st = ((float *) params->wdata) + nth + ith*nc; + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + //printf("p[%d] = %f\n", i, p[i]); + assert(!isnan(s0[i])); + assert(!isnan(s1[i])); + } +#endif + // soft_max + ggml_v3_float sum = 0.0; + { + float max = -INFINITY; + ggml_v3_vec_max_f32(nc, &max, s0); + + uint16_t scvt; UNUSED(scvt); + for (int i = 0; i < nc; i++) { + if (s0[i] == -INFINITY) { + st[i] = 0.0f; + } else { +#ifndef GGML_V3_CROSS_ENTROPY_EXP_FP16 + const float s = s0[i] - max; + const float val = expf(s); +#else + ggml_v3_fp16_t s = GGML_V3_FP32_TO_FP16(s0[i] - max); + memcpy(&scvt, &s, sizeof(scvt)); + const float val = GGML_V3_FP16_TO_FP32(ggml_v3_table_exp_f16[scvt]); +#endif + sum += (ggml_v3_float)val; + st[i] = val; + } + } + + assert(sum > 0.0); + // sum = 1.0/sum; + } + // avoid log(0) by rescaling from [0..1] to [eps..1] + sum = (1.0 - eps) / sum; + ggml_v3_vec_scale_f32(nc, st, sum); + ggml_v3_vec_add1_f32(nc, st, st, eps); + ggml_v3_vec_log_f32(nc, st, st); + ggml_v3_vec_mul_f32(nc, st, st, s1); + + float st_sum = 0; + ggml_v3_vec_sum_f32(nc, &st_sum, st); + sums[ith] += st_sum; + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + assert(!isnan(st[i])); + assert(!isinf(st[i])); + } +#endif + } + +} + +static void ggml_v3_compute_forward_cross_entropy_loss( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_cross_entropy_loss_f32(params, src0, src1, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +// ggml_v3_compute_forward_cross_entropy_loss_back + +static void ggml_v3_compute_forward_cross_entropy_loss_back_f32( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + const struct ggml_v3_tensor * opt0, + struct ggml_v3_tensor * dst) { + GGML_V3_ASSERT(ggml_v3_is_contiguous(dst)); + GGML_V3_ASSERT(ggml_v3_is_contiguous(src0)); + GGML_V3_ASSERT(ggml_v3_is_contiguous(src1)); + GGML_V3_ASSERT(ggml_v3_is_contiguous(opt0)); + GGML_V3_ASSERT(ggml_v3_are_same_shape(src0, src1) && 
ggml_v3_are_same_shape(src0, dst)); + + const int64_t ith = params->ith; + const int64_t nth = params->nth; + + if (params->type == GGML_V3_TASK_INIT || params->type == GGML_V3_TASK_FINALIZE) { + return; + } + + const double eps = 1e-9; + + // TODO: handle transposed/permuted matrices + const int64_t nc = src0->ne[0]; + const int64_t nr = ggml_v3_nrows(src0); + + // rows per thread + const int64_t dr = (nr + nth - 1)/nth; + + // row range for this thread + const int64_t ir0 = dr*ith; + const int64_t ir1 = MIN(ir0 + dr, nr); + + float * d = (float *) opt0->data; + + for (int64_t i1 = ir0; i1 < ir1; i1++) { + float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]); + float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]); + float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + //printf("p[%d] = %f\n", i, p[i]); + assert(!isnan(s0[i])); + assert(!isnan(s1[i])); + } +#endif + + // soft_max + ggml_v3_float sum = 0.0; + { + float max = -INFINITY; + ggml_v3_vec_max_f32(nc, &max, s0); + + uint16_t scvt; UNUSED(scvt); + for (int i = 0; i < nc; i++) { + if (s0[i] == -INFINITY) { + ds0[i] = 0.0f; + } else { +#ifndef GGML_V3_CROSS_ENTROPY_EXP_FP16 + const float s = s0[i] - max; + const float val = expf(s); +#else + ggml_v3_fp16_t s = GGML_V3_FP32_TO_FP16(s0[i] - max); + memcpy(&scvt, &s, sizeof(scvt)); + const float val = GGML_V3_FP16_TO_FP32(ggml_v3_table_exp_f16[scvt]); +#endif + sum += (ggml_v3_float)val; + ds0[i] = val; + } + } + + assert(sum > 0.0); + sum = (1.0 - eps)/sum; + } + + // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr + ggml_v3_vec_scale_f32(nc, ds0, sum); + ggml_v3_vec_add1_f32(nc, ds0, ds0, eps); + ggml_v3_vec_sub_f32(nc, ds0, ds0, s1); + ggml_v3_vec_scale_f32(nc, ds0, d[0] / (float) nr); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + assert(!isnan(ds0[i])); + assert(!isinf(ds0[i])); + } +#endif + } +} + +static void ggml_v3_compute_forward_cross_entropy_loss_back( + const struct ggml_v3_compute_params * params, + const struct ggml_v3_tensor * src0, + const struct ggml_v3_tensor * src1, + const struct ggml_v3_tensor * opt0, + struct ggml_v3_tensor * dst) { + switch (src0->type) { + case GGML_V3_TYPE_F32: + { + ggml_v3_compute_forward_cross_entropy_loss_back_f32(params, src0, src1, opt0, dst); + } break; + default: + { + GGML_V3_ASSERT(false); + } break; + } +} + +///////////////////////////////// + +static void ggml_v3_compute_forward(struct ggml_v3_compute_params * params, struct ggml_v3_tensor * tensor) { + GGML_V3_ASSERT(params); + + if (tensor->op == GGML_V3_OP_NONE) { + return; + } + +#ifdef GGML_USE_CUBLAS + bool skip_cpu = ggml_v3_cuda_compute_forward(params, tensor); + if (skip_cpu) { + return; + } + GGML_V3_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_V3_BACKEND_CPU); + GGML_V3_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_V3_BACKEND_CPU); +#endif // GGML_USE_CUBLAS + + switch (tensor->op) { + case GGML_V3_OP_DUP: + { + ggml_v3_compute_forward_dup(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_ADD: + { + ggml_v3_compute_forward_add(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_V3_OP_ADD1: + { + ggml_v3_compute_forward_add1(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_V3_OP_ACC: + { + ggml_v3_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_V3_OP_SUB: + { + ggml_v3_compute_forward_sub(params, tensor->src[0], 
tensor->src[1], tensor); + } break; + case GGML_V3_OP_MUL: + { + ggml_v3_compute_forward_mul(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_V3_OP_DIV: + { + ggml_v3_compute_forward_div(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_V3_OP_SQR: + { + ggml_v3_compute_forward_sqr(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_SQRT: + { + ggml_v3_compute_forward_sqrt(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_LOG: + { + ggml_v3_compute_forward_log(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_SUM: + { + ggml_v3_compute_forward_sum(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_SUM_ROWS: + { + ggml_v3_compute_forward_sum_rows(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_MEAN: + { + ggml_v3_compute_forward_mean(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_ARGMAX: + { + ggml_v3_compute_forward_argmax(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_REPEAT: + { + ggml_v3_compute_forward_repeat(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_REPEAT_BACK: + { + ggml_v3_compute_forward_repeat_back(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_CONCAT: + { + ggml_v3_compute_forward_concat(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_V3_OP_SILU_BACK: + { + ggml_v3_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_V3_OP_NORM: + { + ggml_v3_compute_forward_norm(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_RMS_NORM: + { + ggml_v3_compute_forward_rms_norm(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_RMS_NORM_BACK: + { + ggml_v3_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_V3_OP_GROUP_NORM: + { + ggml_v3_compute_forward_group_norm(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_MUL_MAT: + { + ggml_v3_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_V3_OP_MUL_MAT_ID: + { + ggml_v3_compute_forward_mul_mat_id(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_V3_OP_OUT_PROD: + { + ggml_v3_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_V3_OP_SCALE: + { + ggml_v3_compute_forward_scale(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_SET: + { + ggml_v3_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_V3_OP_CPY: + { + ggml_v3_compute_forward_cpy(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_CONT: + { + ggml_v3_compute_forward_cont(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_RESHAPE: + { + ggml_v3_compute_forward_reshape(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_VIEW: + { + ggml_v3_compute_forward_view(params, tensor->src[0]); + } break; + case GGML_V3_OP_PERMUTE: + { + ggml_v3_compute_forward_permute(params, tensor->src[0]); + } break; + case GGML_V3_OP_TRANSPOSE: + { + ggml_v3_compute_forward_transpose(params, tensor->src[0]); + } break; + case GGML_V3_OP_GET_ROWS: + { + ggml_v3_compute_forward_get_rows(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_V3_OP_GET_ROWS_BACK: + { + ggml_v3_compute_forward_get_rows_back(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_V3_OP_DIAG: + { + ggml_v3_compute_forward_diag(params, tensor->src[0], tensor); + } break; + case 
GGML_V3_OP_DIAG_MASK_INF: + { + ggml_v3_compute_forward_diag_mask_inf(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_DIAG_MASK_ZERO: + { + ggml_v3_compute_forward_diag_mask_zero(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_SOFT_MAX: + { + ggml_v3_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_V3_OP_SOFT_MAX_BACK: + { + ggml_v3_compute_forward_soft_max_back(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_V3_OP_ROPE: + { + ggml_v3_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_V3_OP_ROPE_BACK: + { + ggml_v3_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_V3_OP_ALIBI: + { + ggml_v3_compute_forward_alibi(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_CLAMP: + { + ggml_v3_compute_forward_clamp(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_CONV_TRANSPOSE_1D: + { + ggml_v3_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_V3_OP_IM2COL: + { + ggml_v3_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_V3_OP_CONV_TRANSPOSE_2D: + { + ggml_v3_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_V3_OP_POOL_1D: + { + ggml_v3_compute_forward_pool_1d(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_POOL_2D: + { + ggml_v3_compute_forward_pool_2d(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_UPSCALE: + { + ggml_v3_compute_forward_upscale(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_PAD: + { + ggml_v3_compute_forward_pad(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_ARGSORT: + { + ggml_v3_compute_forward_argsort(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_LEAKY_RELU: + { + ggml_v3_compute_forward_leaky_relu(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_FLASH_ATTN: + { + const int32_t t = ggml_v3_get_op_params_i32(tensor, 0); + GGML_V3_ASSERT(t == 0 || t == 1); + const bool masked = t != 0; + ggml_v3_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor); + } break; + case GGML_V3_OP_FLASH_FF: + { + ggml_v3_compute_forward_flash_ff(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor); + } break; + case GGML_V3_OP_FLASH_ATTN_BACK: + { + int32_t t = ggml_v3_get_op_params_i32(tensor, 0); + GGML_V3_ASSERT(t == 0 || t == 1); + bool masked = t != 0; + ggml_v3_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor); + } break; + case GGML_V3_OP_WIN_PART: + { + ggml_v3_compute_forward_win_part(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_WIN_UNPART: + { + ggml_v3_compute_forward_win_unpart(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_UNARY: + { + ggml_v3_compute_forward_unary(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_GET_REL_POS: + { + ggml_v3_compute_forward_get_rel_pos(params, tensor->src[0], tensor); + } break; + case GGML_V3_OP_ADD_REL_POS: + { + ggml_v3_compute_forward_add_rel_pos(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + } break; + case GGML_V3_OP_MAP_UNARY: + { + ggml_v3_unary_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + ggml_v3_compute_forward_map_unary(params, tensor->src[0], tensor, fun); + 
} + break; + case GGML_V3_OP_MAP_BINARY: + { + ggml_v3_binary_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + ggml_v3_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun); + } + break; + case GGML_V3_OP_MAP_CUSTOM1_F32: + { + ggml_v3_custom1_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + ggml_v3_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun); + } + break; + case GGML_V3_OP_MAP_CUSTOM2_F32: + { + ggml_v3_custom2_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + ggml_v3_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun); + } + break; + case GGML_V3_OP_MAP_CUSTOM3_F32: + { + ggml_v3_custom3_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + ggml_v3_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun); + } + break; + case GGML_V3_OP_MAP_CUSTOM1: + { + ggml_v3_compute_forward_map_custom1(params, tensor->src[0], tensor); + } + break; + case GGML_V3_OP_MAP_CUSTOM2: + { + ggml_v3_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor); + } + break; + case GGML_V3_OP_MAP_CUSTOM3: + { + ggml_v3_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + } + break; + case GGML_V3_OP_CROSS_ENTROPY_LOSS: + { + ggml_v3_compute_forward_cross_entropy_loss(params, tensor->src[0], tensor->src[1], tensor); + } + break; + case GGML_V3_OP_CROSS_ENTROPY_LOSS_BACK: + { + ggml_v3_compute_forward_cross_entropy_loss_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + } + break; + case GGML_V3_OP_NONE: + { + // nop + } break; + case GGML_V3_OP_COUNT: + { + GGML_V3_ASSERT(false); + } break; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +static size_t ggml_v3_hash_size(size_t min_sz) { + // next primes after powers of two + static const size_t primes[] = { + 2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031, + 2053, 4099, 8209, 16411, 32771, 65537, 131101, + 262147, 524309, 1048583, 2097169, 4194319, 8388617, + 16777259, 33554467, 67108879, 134217757, 268435459, + 536870923, 1073741827, 2147483659 + }; + static const size_t n_primes = sizeof(primes)/sizeof(primes[0]); + + // find the smallest prime that is larger or equal to min_sz + size_t l = 0; + size_t r = n_primes; + while (l < r) { + size_t m = (l + r)/2; + if (primes[m] < min_sz) { + l = m + 1; + } else { + r = m; + } + } + size_t sz = l < n_primes ? 
primes[l] : min_sz | 1; + return sz; +} + +static size_t ggml_v3_hash(const void * p) { + return (size_t)p; +} + +size_t ggml_v3_hash_find(const struct ggml_v3_hash_set hash_set, struct ggml_v3_tensor * key) { + size_t h = ggml_v3_hash(key) % hash_set.size; + + // linear probing + size_t i = h; + while (hash_set.keys[i] != NULL && hash_set.keys[i] != key) { + i = (i + 1) % hash_set.size; + if (i == h) { + // visited all hash table entries -> not found + return GGML_V3_HASHTABLE_FULL; + } + } + return i; +} + +bool ggml_v3_hash_contains(struct ggml_v3_hash_set hash_set, struct ggml_v3_tensor * key) { + size_t i = ggml_v3_hash_find(hash_set, key); + return i != GGML_V3_HASHTABLE_FULL && hash_set.keys[i] == key; +} + +size_t ggml_v3_hash_insert(struct ggml_v3_hash_set hash_set, struct ggml_v3_tensor * key) { + size_t i = ggml_v3_hash_find(hash_set, key); + + GGML_V3_ASSERT(i != GGML_V3_HASHTABLE_FULL); + + if (hash_set.keys[i] == key) { + return GGML_V3_HASHTABLE_ALREADY_EXISTS; + } + + // insert + GGML_V3_ASSERT(hash_set.keys[i] == NULL); + hash_set.keys[i] = key; + return i; +} + +size_t ggml_v3_hash_find_or_insert(struct ggml_v3_hash_set hash_set, struct ggml_v3_tensor * key) { + size_t i = ggml_v3_hash_find(hash_set, key); + + GGML_V3_ASSERT(i != GGML_V3_HASHTABLE_FULL); + + hash_set.keys[i] = key; + return i; +} + +static struct ggml_v3_hash_set ggml_v3_hash_set_new(size_t size) { + size = ggml_v3_hash_size(size); + struct ggml_v3_hash_set result; + result.size = size; + result.keys = malloc(sizeof(struct ggml_v3_tensor *) * size); + memset(result.keys, 0, sizeof(struct ggml_v3_tensor *) * size); + return result; +} + +static void ggml_v3_hash_set_free(struct ggml_v3_hash_set hash_set) { + free(hash_set.keys); +} + +struct hash_map { + struct ggml_v3_hash_set set; + struct ggml_v3_tensor ** vals; +}; + +static struct hash_map * ggml_v3_new_hash_map(size_t size) { + struct hash_map * result = malloc(sizeof(struct hash_map)); + result->set = ggml_v3_hash_set_new(size); + result->vals = malloc(sizeof(struct ggml_v3_tensor *) * result->set.size); + memset(result->vals, 0, sizeof(struct ggml_v3_tensor *) * result->set.size); + return result; +} + +static void ggml_v3_hash_map_free(struct hash_map * map) { + ggml_v3_hash_set_free(map->set); + free(map->vals); + free(map); +} + +// gradient checkpointing + +static struct ggml_v3_tensor * ggml_v3_recompute_graph_node( + struct ggml_v3_context * ctx, + struct ggml_v3_cgraph * graph, + struct hash_map * replacements, + struct ggml_v3_tensor * node) { + + if (node == NULL) { + return NULL; + } + + if (node->is_param) { + return node; + } + + if (!ggml_v3_hash_contains(graph->visited_hash_table, node)) { + return node; + } + + int count_children = 0; + for (int k = 0; k < GGML_V3_MAX_SRC; ++k) { + if (node->src[k]) { + ++count_children; + } + } + + if (count_children == 0) { + return node; + } + + size_t i = ggml_v3_hash_find(replacements->set, node); + GGML_V3_ASSERT(i != GGML_V3_HASHTABLE_FULL); // assert that not full + if (replacements->set.keys[i] == node) { + return replacements->vals[i]; + } + + struct ggml_v3_tensor * clone = ggml_v3_new_tensor(ctx, node->type, GGML_V3_MAX_DIMS, node->ne); + + // insert clone into replacements + GGML_V3_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite + replacements->set.keys[i] = node; + replacements->vals[i] = clone; + + clone->op = node->op; + clone->grad = node->grad; + clone->is_param = node->is_param; + clone->extra = node->extra; + for (int k = 0; k < GGML_V3_MAX_DIMS; ++k) 
{ + clone->nb[k] = node->nb[k]; + } + for (int k = 0; k < GGML_V3_MAX_SRC; ++k) { + clone->src[k] = ggml_v3_recompute_graph_node(ctx, graph, replacements, node->src[k]); + } + if (node->view_src != NULL) { + clone->data = (node->view_src->data == NULL) + ? NULL // view_src not yet allocated + : (char *) node->view_src->data // view_src already allocated + + node->view_offs; + clone->view_src = node->view_src; + clone->view_offs = node->view_offs; + } + + GGML_V3_ASSERT(sizeof(node->op_params) == sizeof(int32_t) * (GGML_V3_MAX_OP_PARAMS / sizeof(int32_t))); + GGML_V3_ASSERT(sizeof(node->name) == GGML_V3_MAX_NAME); + memcpy(clone->op_params, node->op_params, sizeof(node->op_params)); + ggml_v3_format_name(clone, "%s (clone)", ggml_v3_get_name(node)); + + return clone; +} + +void ggml_v3_build_backward_gradient_checkpointing( + struct ggml_v3_context * ctx, + struct ggml_v3_cgraph * gf, + struct ggml_v3_cgraph * gb, + struct ggml_v3_cgraph * gb_tmp, + struct ggml_v3_tensor * * checkpoints, + int n_checkpoints) { + ggml_v3_graph_cpy(gf, gb_tmp); + ggml_v3_build_backward_expand(ctx, gf, gb_tmp, true); + + if (n_checkpoints <= 0) { + ggml_v3_graph_cpy(gb_tmp, gb); + return; + } + + struct hash_map * replacements = ggml_v3_new_hash_map(gf->n_nodes + gf->n_leafs + n_checkpoints); + + // insert checkpoints in replacements + for (int i = 0; i < n_checkpoints; ++i) { + size_t k = ggml_v3_hash_find(replacements->set, checkpoints[i]); + GGML_V3_ASSERT(k != GGML_V3_HASHTABLE_FULL); // assert that not full + GGML_V3_ASSERT(replacements->set.keys[k] == NULL); // assert that we don't overwrite + replacements->set.keys[k] = checkpoints[i]; + replacements->vals[k] = checkpoints[i]; + } + + ggml_v3_graph_cpy(gf, gb); + // rewrite gb_tmp->nodes[gf->n_nodes:gb_tmp->n_nodes], + // replacing references to gb_tmp->nodes[0:gf->n_nodes] ( == gf->nodes[0:gf->n_nodes]), + // by recomputing them from checkpoints + for (int i = gf->n_nodes; i < gb_tmp->n_nodes; ++i) { + struct ggml_v3_tensor * node = gb_tmp->nodes[i]; + for (int k = 0; k < GGML_V3_MAX_SRC; ++k) { + // insert new tensors recomputing src, reusing already made replacements, + // remember replacements: remember new tensors with mapping from corresponding gf nodes + // recurse for input tensors, + // unless (i.e.
terminating when) input tensors are replacements (like checkpoints) + node->src[k] = ggml_v3_recompute_graph_node(ctx, gf, replacements, node->src[k]); + } + // insert rewritten backward node with replacements made into resulting backward graph gb + ggml_v3_build_forward_expand(gb, node); + } + + ggml_v3_hash_map_free(replacements); +} + +// functions to change gradients considering the case that input a might be initial gradient with zero value + +static struct ggml_v3_tensor * ggml_v3_add_or_set(struct ggml_v3_context * ctx, struct ggml_v3_tensor * a, struct ggml_v3_tensor * b, struct ggml_v3_hash_set zero_table) { + if (ggml_v3_hash_contains(zero_table, a)) { + return b; + } else { + return ggml_v3_add_impl(ctx, a, b, false); + } +} + +static struct ggml_v3_tensor * ggml_v3_acc_or_set(struct ggml_v3_context * ctx, struct ggml_v3_tensor * a, struct ggml_v3_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_v3_hash_set zero_table) { + if (ggml_v3_hash_contains(zero_table, a)) { + struct ggml_v3_tensor * a_zero = ggml_v3_scale(ctx, a, 0.0f); + return ggml_v3_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false); + } else { + return ggml_v3_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false); + } +} + +static struct ggml_v3_tensor * ggml_v3_add1_or_set(struct ggml_v3_context * ctx, struct ggml_v3_tensor * a, struct ggml_v3_tensor * b, struct ggml_v3_hash_set zero_table) { + if (ggml_v3_hash_contains(zero_table, a)) { + return ggml_v3_repeat(ctx, b, a); + } else { + return ggml_v3_add1_impl(ctx, a, b, false); + } +} + +static struct ggml_v3_tensor * ggml_v3_sub_or_set(struct ggml_v3_context * ctx, struct ggml_v3_tensor * a, struct ggml_v3_tensor * b, struct ggml_v3_hash_set zero_table) { + if (ggml_v3_hash_contains(zero_table, a)) { + return ggml_v3_neg(ctx, b); + } else { + return ggml_v3_sub_impl(ctx, a, b, false); + } +} + +static void ggml_v3_compute_backward(struct ggml_v3_context * ctx, struct ggml_v3_tensor * tensor, struct ggml_v3_hash_set zero_table) { + struct ggml_v3_tensor * src0 = tensor->src[0]; + struct ggml_v3_tensor * src1 = tensor->src[1]; + + switch (tensor->op) { + case GGML_V3_OP_DUP: + { + if (src0->grad) { + src0->grad = ggml_v3_add_or_set(ctx, src0->grad, tensor->grad, zero_table); + } + } break; + case GGML_V3_OP_ADD: + { + if (src0->grad) { + src0->grad = ggml_v3_add_or_set(ctx, src0->grad, tensor->grad, zero_table); + } + if (src1->grad) { + src1->grad = ggml_v3_add_or_set(ctx, src1->grad, tensor->grad, zero_table); + } + } break; + case GGML_V3_OP_ADD1: + { + if (src0->grad) { + src0->grad = ggml_v3_add_or_set(ctx, src0->grad, tensor->grad, zero_table); + } + if (src1->grad) { + src1->grad = ggml_v3_add_or_set(ctx, + src1->grad, + ggml_v3_mean(ctx, tensor->grad), // TODO: should probably be sum instead of mean + zero_table); + } + } break; + case GGML_V3_OP_ACC: + { + if (src0->grad) { + src0->grad = ggml_v3_add_or_set(ctx, src0->grad, tensor->grad, zero_table); + } + if (src1->grad) { + const size_t nb1 = ((int32_t *) tensor->op_params)[0]; + const size_t nb2 = ((int32_t *) tensor->op_params)[1]; + const size_t nb3 = ((int32_t *) tensor->op_params)[2]; + const size_t offset = ((int32_t *) tensor->op_params)[3]; + + struct ggml_v3_tensor * tensor_grad_view = ggml_v3_view_4d(ctx, + tensor->grad, + src1->grad->ne[0], + src1->grad->ne[1], + src1->grad->ne[2], + src1->grad->ne[3], + nb1, nb2, nb3, offset); + + src1->grad = + ggml_v3_add_or_set(ctx, + src1->grad, + ggml_v3_reshape(ctx, + ggml_v3_cont(ctx, tensor_grad_view), + src1->grad), + 
zero_table); + } + } break; + case GGML_V3_OP_SUB: + { + if (src0->grad) { + src0->grad = ggml_v3_add_or_set(ctx, src0->grad, tensor->grad, zero_table); + } + if (src1->grad) { + src1->grad = ggml_v3_sub_or_set(ctx, src1->grad, tensor->grad, zero_table); + } + } break; + case GGML_V3_OP_MUL: + { + if (src0->grad) { + src0->grad = + ggml_v3_add_or_set(ctx, + src0->grad, + ggml_v3_mul(ctx, src1, tensor->grad), + zero_table); + } + if (src1->grad) { + src1->grad = + ggml_v3_add_or_set(ctx, + src1->grad, + ggml_v3_mul(ctx, src0, tensor->grad), + zero_table); + } + } break; + case GGML_V3_OP_DIV: + { + if (src0->grad) { + src0->grad = + ggml_v3_add_or_set(ctx, + src0->grad, + ggml_v3_div(ctx, tensor->grad, src1), + zero_table); + } + if (src1->grad) { + src1->grad = + ggml_v3_sub_or_set(ctx, + src1->grad, + ggml_v3_mul(ctx, + tensor->grad, + ggml_v3_div(ctx, tensor, src1)), + zero_table); + } + } break; + case GGML_V3_OP_SQR: + { + if (src0->grad) { + src0->grad = + ggml_v3_add_or_set(ctx, + src0->grad, + ggml_v3_scale(ctx, + ggml_v3_mul(ctx, src0, tensor->grad), + 2.0f), + zero_table); + } + } break; + case GGML_V3_OP_SQRT: + { + if (src0->grad) { + src0->grad = + ggml_v3_add_or_set(ctx, + src0->grad, + ggml_v3_scale(ctx, + ggml_v3_div(ctx, + tensor->grad, + tensor), + 0.5f), + zero_table); + } + } break; + case GGML_V3_OP_LOG: + { + if (src0->grad) { + src0->grad = + ggml_v3_add_or_set(ctx, + src0->grad, + ggml_v3_div(ctx, + tensor->grad, + src0), + zero_table); + } + } break; + case GGML_V3_OP_SUM: + { + if (src0->grad) { + src0->grad = + ggml_v3_add1_or_set(ctx, + src0->grad, + tensor->grad, + zero_table); + } + } break; + case GGML_V3_OP_SUM_ROWS: + { + if (src0->grad) { + src0->grad = + ggml_v3_add_or_set(ctx, + src0->grad, + ggml_v3_repeat(ctx, + tensor->grad, + src0->grad), + zero_table); + } + } break; + case GGML_V3_OP_MEAN: + case GGML_V3_OP_ARGMAX: + { + GGML_V3_ASSERT(false); // TODO: implement + } break; + case GGML_V3_OP_REPEAT: + { + // necessary for llama + if (src0->grad) { + src0->grad = ggml_v3_add_or_set(ctx, + src0->grad, + ggml_v3_repeat_back(ctx, tensor->grad, src0->grad), + zero_table); + } + } break; + case GGML_V3_OP_REPEAT_BACK: + { + if (src0->grad) { + // TODO: test this + src0->grad = ggml_v3_add_or_set(ctx, + src0->grad, + ggml_v3_repeat(ctx, tensor->grad, src0->grad), + zero_table); + } + } break; + case GGML_V3_OP_CONCAT: + { + GGML_V3_ASSERT(false); // TODO: implement + } break; + case GGML_V3_OP_SILU_BACK: + { + GGML_V3_ASSERT(false); // TODO: not implemented + } break; + case GGML_V3_OP_NORM: + { + GGML_V3_ASSERT(false); // TODO: not implemented + } break; + case GGML_V3_OP_RMS_NORM: + { + // necessary for llama + if (src0->grad) { + float eps; + memcpy(&eps, tensor->op_params, sizeof(float)); + + src0->grad = ggml_v3_add_or_set(ctx, + src0->grad, + ggml_v3_rms_norm_back(ctx, src0, tensor->grad, eps), + zero_table); + } + } break; + case GGML_V3_OP_RMS_NORM_BACK: + { + GGML_V3_ASSERT(false); // TODO: not implemented + } break; + case GGML_V3_OP_GROUP_NORM: + { + GGML_V3_ASSERT(false); // TODO: not implemented + } break; + case GGML_V3_OP_MUL_MAT: + { + // https://cs231n.github.io/optimization-2/#staged + // # forward pass + // s0 = np.random.randn(5, 10) + // s1 = np.random.randn(10, 3) + // t = s0.dot(s1) + + // # now suppose we had the gradient on t from above in the circuit + // dt = np.random.randn(*t.shape) # same shape as t + // ds0 = dt.dot(s1.T) #.T gives the transpose of the matrix + // ds1 = t.T.dot(dt) + + // tensor.shape [m,p,qq,rr] + // 
src0.shape [n,m,q1,r1] + // src1.shape [n,p,qq,rr] + + // necessary for llama + if (src0->grad) { + struct ggml_v3_tensor * s1_tg = + ggml_v3_out_prod(ctx, // [n,m,qq,rr] + src1, // [n,p,qq,rr] + tensor->grad); // [m,p,qq,rr] + const int64_t qq = s1_tg->ne[2]; + const int64_t rr = s1_tg->ne[3]; + const int64_t q1 = src0->ne[2]; + const int64_t r1 = src0->ne[3]; + const bool ne2_broadcasted = qq > q1; + const bool ne3_broadcasted = rr > r1; + if (ne2_broadcasted || ne3_broadcasted) { + // sum broadcast repetitions of s1_tg into shape of src0 + s1_tg = ggml_v3_repeat_back(ctx, s1_tg, src0); + } + src0->grad = + ggml_v3_add_or_set(ctx, + src0->grad, // [n,m,q1,r1] + s1_tg, // [n,m,q1,r1] + zero_table); + } + if (src1->grad) { + src1->grad = + ggml_v3_add_or_set(ctx, + src1->grad, // [n,p,qq,rr] + // ggml_v3_mul_mat(ctx, // [n,p,qq,rr] + // ggml_v3_cont(ctx, // [m,n,q1,r1] + // ggml_v3_transpose(ctx, src0)), // [m,n,q1,r1] + // tensor->grad), // [m,p,qq,rr] + + // // when src0 is bigger than tensor->grad (this is mostly the case in llama), + // // avoid transpose of src0, rather transpose smaller tensor->grad + // // and then use ggml_v3_out_prod + ggml_v3_out_prod(ctx, // [n,p,qq,rr] + src0, // [n,m,q1,r1] + ggml_v3_transpose(ctx, // [p,m,qq,rr] + tensor->grad)), // [m,p,qq,rr] + zero_table); + } + } break; + case GGML_V3_OP_MUL_MAT_ID: + { + GGML_V3_ASSERT(false); // TODO: not implemented + } break; + case GGML_V3_OP_OUT_PROD: + { + GGML_V3_ASSERT(false); // TODO: not implemented + } break; + case GGML_V3_OP_SCALE: + { + // necessary for llama + if (src0->grad) { + float s; + memcpy(&s, tensor->op_params, sizeof(float)); + + src0->grad = + ggml_v3_add_or_set(ctx, + src0->grad, + ggml_v3_scale_impl(ctx, tensor->grad, s, false), + zero_table); + } + } break; + case GGML_V3_OP_SET: + { + const size_t nb1 = ((int32_t *) tensor->op_params)[0]; + const size_t nb2 = ((int32_t *) tensor->op_params)[1]; + const size_t nb3 = ((int32_t *) tensor->op_params)[2]; + const size_t offset = ((int32_t *) tensor->op_params)[3]; + + struct ggml_v3_tensor * tensor_grad_view = NULL; + + if (src0->grad || src1->grad) { + GGML_V3_ASSERT(src0->type == tensor->type); + GGML_V3_ASSERT(tensor->grad->type == tensor->type); + GGML_V3_ASSERT(tensor->grad->type == src1->grad->type); + + tensor_grad_view = ggml_v3_view_4d(ctx, + tensor->grad, + src1->grad->ne[0], + src1->grad->ne[1], + src1->grad->ne[2], + src1->grad->ne[3], + nb1, nb2, nb3, offset); + } + + if (src0->grad) { + src0->grad = ggml_v3_add_or_set(ctx, + src0->grad, + ggml_v3_acc_impl(ctx, + tensor->grad, + ggml_v3_neg(ctx, tensor_grad_view), + nb1, nb2, nb3, offset, false), + zero_table); + } + + if (src1->grad) { + src1->grad = + ggml_v3_add_or_set(ctx, + src1->grad, + ggml_v3_reshape(ctx, + ggml_v3_cont(ctx, tensor_grad_view), + src1->grad), + zero_table); + } + } break; + case GGML_V3_OP_CPY: + { + // necessary for llama + // cpy overwrites value of src1 by src0 and returns view(src1) + // the overwriting is mathematically equivalent to: + // tensor = src0 * 1 + src1 * 0 + if (src0->grad) { + // dsrc0 = dtensor * 1 + src0->grad = ggml_v3_add_or_set(ctx, src0->grad, tensor->grad, zero_table); + } + if (src1->grad) { + // dsrc1 = dtensor * 0 -> noop + } + } break; + case GGML_V3_OP_CONT: + { + // same as cpy + if (src0->grad) { + GGML_V3_ASSERT(ggml_v3_is_contiguous(src0->grad)); + GGML_V3_ASSERT(ggml_v3_is_contiguous(tensor->grad)); + src0->grad = ggml_v3_add_or_set(ctx, src0->grad, tensor->grad, zero_table); + } + } break; + case GGML_V3_OP_RESHAPE: + { + // 
necessary for llama + if (src0->grad) { + src0->grad = + ggml_v3_add_or_set(ctx, src0->grad, + ggml_v3_reshape(ctx, + ggml_v3_is_contiguous(tensor->grad) + ? tensor->grad + : ggml_v3_cont(ctx, tensor->grad), + src0->grad), + zero_table); + } + } break; + case GGML_V3_OP_VIEW: + { + // necessary for llama + if (src0->grad) { + size_t offset; + + memcpy(&offset, tensor->op_params, sizeof(offset)); + + size_t nb1 = tensor->nb[1]; + size_t nb2 = tensor->nb[2]; + size_t nb3 = tensor->nb[3]; + + if (src0->type != src0->grad->type) { + // gradient is typically F32, but src0 could be other type + size_t ng = ggml_v3_element_size(src0->grad); + size_t n0 = ggml_v3_element_size(src0); + GGML_V3_ASSERT(offset % n0 == 0); + GGML_V3_ASSERT(nb1 % n0 == 0); + GGML_V3_ASSERT(nb2 % n0 == 0); + GGML_V3_ASSERT(nb3 % n0 == 0); + offset = (offset / n0) * ng; + nb1 = (nb1 / n0) * ng; + nb2 = (nb2 / n0) * ng; + nb3 = (nb3 / n0) * ng; + } + + src0->grad = ggml_v3_acc_or_set(ctx, src0->grad, tensor->grad, nb1, nb2, nb3, offset, zero_table); + } + } break; + case GGML_V3_OP_PERMUTE: + { + // necessary for llama + if (src0->grad) { + int32_t * axes = (int32_t *) tensor->op_params; + int axis0 = axes[0] & 0x3; + int axis1 = axes[1] & 0x3; + int axis2 = axes[2] & 0x3; + int axis3 = axes[3] & 0x3; + int axes_backward[4] = {0,0,0,0}; + axes_backward[axis0] = 0; + axes_backward[axis1] = 1; + axes_backward[axis2] = 2; + axes_backward[axis3] = 3; + src0->grad = + ggml_v3_add_or_set(ctx, src0->grad, + ggml_v3_permute(ctx, + tensor->grad, + axes_backward[0], + axes_backward[1], + axes_backward[2], + axes_backward[3]), + zero_table); + } + } break; + case GGML_V3_OP_TRANSPOSE: + { + // necessary for llama + if (src0->grad) { + src0->grad = + ggml_v3_add_or_set(ctx, src0->grad, + ggml_v3_transpose(ctx, tensor->grad), + zero_table); + } + } break; + case GGML_V3_OP_GET_ROWS: + { + // necessary for llama (only for tokenizer) + if (src0->grad) { + src0->grad = + ggml_v3_add_or_set(ctx, src0->grad, + // last ggml_v3_get_rows_back argument src0->grad is only + // necessary to setup correct output shape + ggml_v3_get_rows_back(ctx, tensor->grad, src1, src0->grad), + zero_table); + } + if (src1->grad) { + // noop + } + } break; + case GGML_V3_OP_GET_ROWS_BACK: + { + GGML_V3_ASSERT(false); // TODO: not implemented + } break; + case GGML_V3_OP_DIAG: + { + GGML_V3_ASSERT(false); // TODO: not implemented + } break; + case GGML_V3_OP_DIAG_MASK_INF: + { + // necessary for llama + if (src0->grad) { + const int n_past = ((int32_t *) tensor->op_params)[0]; + src0->grad = + ggml_v3_add_or_set(ctx, src0->grad, + /* ggml_v3_diag_mask_inf_impl() shouldn't be here */ + /* ref: https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */ + ggml_v3_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), + zero_table); + } + } break; + case GGML_V3_OP_DIAG_MASK_ZERO: + { + // necessary for llama + if (src0->grad) { + const int n_past = ((int32_t *) tensor->op_params)[0]; + src0->grad = + ggml_v3_add_or_set(ctx, src0->grad, + ggml_v3_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), + zero_table); + } + } break; + case GGML_V3_OP_SOFT_MAX: + { + // necessary for llama + if (src0->grad) { + src0->grad = + ggml_v3_add_or_set(ctx, src0->grad, + ggml_v3_soft_max_back(ctx, tensor->grad, tensor), + zero_table); + } + + } break; + case GGML_V3_OP_SOFT_MAX_BACK: + { + GGML_V3_ASSERT(false); // TODO: not implemented + } break; + case GGML_V3_OP_ROPE: + { + // necessary for llama + if (src0->grad) { + //const int n_past = ((int32_t *) 
tensor->op_params)[0]; + const int n_dims = ((int32_t *) tensor->op_params)[1]; + const int mode = ((int32_t *) tensor->op_params)[2]; + const int n_ctx = ((int32_t *) tensor->op_params)[3]; + const int n_orig_ctx = ((int32_t *) tensor->op_params)[4]; + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down; + + memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) tensor->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float)); + memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float)); + memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool)); + + src0->grad = ggml_v3_add_or_set(ctx, + src0->grad, + ggml_v3_rope_back(ctx, + tensor->grad, + src1, + n_dims, + mode, + n_ctx, + n_orig_ctx, + freq_base, + freq_scale, + ext_factor, + attn_factor, + beta_fast, + beta_slow, + xpos_base, + xpos_down), + zero_table); + } + } break; + case GGML_V3_OP_ROPE_BACK: + { + if (src0->grad) { + //const int n_past = ((int32_t *) tensor->op_params)[0]; + const int n_dims = ((int32_t *) tensor->op_params)[1]; + const int mode = ((int32_t *) tensor->op_params)[2]; + const int n_ctx = ((int32_t *) tensor->op_params)[3]; + const int n_orig_ctx = ((int32_t *) tensor->op_params)[4]; + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down; + + memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) tensor->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float)); + memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float)); + memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool)); + + src0->grad = ggml_v3_add_or_set(ctx, + src0->grad, + ggml_v3_rope_impl(ctx, + tensor->grad, + src1, + n_dims, + mode, + n_ctx, + n_orig_ctx, + freq_base, + freq_scale, + ext_factor, + attn_factor, + beta_fast, + beta_slow, + xpos_base, + xpos_down, + false), + zero_table); + } + } break; + case GGML_V3_OP_ALIBI: + { + GGML_V3_ASSERT(false); // TODO: not implemented + } break; + case GGML_V3_OP_CLAMP: + { + GGML_V3_ASSERT(false); // TODO: not implemented + } break; + case GGML_V3_OP_CONV_TRANSPOSE_1D: + { + GGML_V3_ASSERT(false); // TODO: not implemented + } break; + case GGML_V3_OP_IM2COL: + { + GGML_V3_ASSERT(false); // TODO: not implemented + } break; + case GGML_V3_OP_CONV_TRANSPOSE_2D: + { + GGML_V3_ASSERT(false); // TODO: not implemented + } break; + case GGML_V3_OP_POOL_1D: + { + GGML_V3_ASSERT(false); // TODO: not implemented + } break; + case GGML_V3_OP_POOL_2D: + { + GGML_V3_ASSERT(false); // TODO: not implemented + } break; + case GGML_V3_OP_UPSCALE: + { + GGML_V3_ASSERT(false); // TODO: not implemented + } break; + case GGML_V3_OP_PAD: + { + GGML_V3_ASSERT(false); // TODO: not implemented + } break; + case GGML_V3_OP_ARGSORT: + { + GGML_V3_ASSERT(false); // TODO: not implemented + } break; + case GGML_V3_OP_LEAKY_RELU: + { + GGML_V3_ASSERT(false); // TODO: not implemented + 
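+ // editor's note: the LEAKY_RELU backward above is left unimplemented; a possible formulation,
+ // kept strictly as a hedged, commented-out sketch (untested assumption, not part of the original patch,
+ // and it assumes the slope is stored in op_params[0]), would use dx = dy * (x > 0 ? 1 : slope),
+ // guarded by if (src0->grad) like the other cases:
+ //
+ //     float slope;
+ //     memcpy(&slope, tensor->op_params, sizeof(float));
+ //     struct ggml_v3_tensor * mask = ggml_v3_step(ctx, src0);                  // 1 where x > 0, else 0
+ //     struct ggml_v3_tensor * dneg = ggml_v3_scale(ctx, tensor->grad, slope);  // slope * dy
+ //     struct ggml_v3_tensor * dpos = ggml_v3_mul(ctx, mask, ggml_v3_sub(ctx, tensor->grad, dneg));
+ //     src0->grad = ggml_v3_add_or_set(ctx, src0->grad, ggml_v3_add(ctx, dpos, dneg), zero_table);
+ //
+ // i.e. dx = dy*(mask + slope*(1 - mask)), which reduces to dy for x > 0 and to slope*dy otherwise.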
} break; + case GGML_V3_OP_FLASH_ATTN: + { + struct ggml_v3_tensor * flash_grad = NULL; + if (src0->grad || src1->grad || tensor->src[2]->grad) { + int32_t t = ggml_v3_get_op_params_i32(tensor, 0); + GGML_V3_ASSERT(t == 0 || t == 1); + bool masked = t != 0; + flash_grad = + ggml_v3_flash_attn_back(ctx, + src0, + src1, + tensor->src[2], + tensor->grad, + masked); + } + + struct ggml_v3_tensor * src2 = tensor->src[2]; + const int64_t elem_q = ggml_v3_nelements(src0); + const int64_t elem_k = ggml_v3_nelements(src1); + const int64_t elem_v = ggml_v3_nelements(src2); + + enum ggml_v3_type result_type = flash_grad->type; + GGML_V3_ASSERT(ggml_v3_blck_size(result_type) == 1); + const size_t tsize = ggml_v3_type_size(result_type); + + const size_t offs_q = 0; + const size_t offs_k = offs_q + GGML_V3_PAD(elem_q * tsize, GGML_V3_MEM_ALIGN); + const size_t offs_v = offs_k + GGML_V3_PAD(elem_k * tsize, GGML_V3_MEM_ALIGN); + + if (src0->grad) { + struct ggml_v3_tensor * view_q = ggml_v3_view_1d(ctx, flash_grad, elem_q, offs_q); + struct ggml_v3_tensor * grad_q = ggml_v3_reshape(ctx, view_q, src0); + src0->grad = ggml_v3_add_or_set(ctx, + src0->grad, + grad_q, + zero_table); + } + if (src1->grad) { + struct ggml_v3_tensor * view_k = ggml_v3_view_1d(ctx, flash_grad, elem_k, offs_k); + struct ggml_v3_tensor * grad_k = ggml_v3_reshape(ctx, view_k, src1); + src1->grad = ggml_v3_add_or_set(ctx, + src1->grad, + grad_k, + zero_table); + } + if (src2->grad) { + struct ggml_v3_tensor * view_v = ggml_v3_view_1d(ctx, flash_grad, elem_v, offs_v); + struct ggml_v3_tensor * grad_v = ggml_v3_reshape(ctx, view_v, src2); + src2->grad = ggml_v3_add_or_set(ctx, + src2->grad, + grad_v, + zero_table); + } + } break; + case GGML_V3_OP_FLASH_FF: + { + GGML_V3_ASSERT(false); // not supported + } break; + case GGML_V3_OP_FLASH_ATTN_BACK: + { + GGML_V3_ASSERT(false); // not supported + } break; + case GGML_V3_OP_WIN_PART: + case GGML_V3_OP_WIN_UNPART: + case GGML_V3_OP_UNARY: + { + switch (ggml_v3_get_unary_op(tensor)) { + case GGML_V3_UNARY_OP_ABS: + { + if (src0->grad) { + src0->grad = + ggml_v3_add_or_set(ctx, + src0->grad, + ggml_v3_mul(ctx, + ggml_v3_sgn(ctx, src0), + tensor->grad), + zero_table); + } + } break; + case GGML_V3_UNARY_OP_SGN: + { + if (src0->grad) { + // noop + } + } break; + case GGML_V3_UNARY_OP_NEG: + { + if (src0->grad) { + src0->grad = ggml_v3_sub_or_set(ctx, src0->grad, tensor->grad, zero_table); + } + } break; + case GGML_V3_UNARY_OP_STEP: + { + if (src0->grad) { + // noop + } + } break; + case GGML_V3_UNARY_OP_TANH: + { + GGML_V3_ASSERT(false); // TODO: not implemented + } break; + case GGML_V3_UNARY_OP_ELU: + { + GGML_V3_ASSERT(false); // TODO: not implemented + } break; + case GGML_V3_UNARY_OP_RELU: + { + if (src0->grad) { + src0->grad = ggml_v3_add_or_set(ctx, + src0->grad, + ggml_v3_mul(ctx, + ggml_v3_step(ctx, src0), + tensor->grad), + zero_table); + } + } break; + case GGML_V3_UNARY_OP_GELU: + { + GGML_V3_ASSERT(false); // TODO: not implemented + } break; + case GGML_V3_UNARY_OP_GELU_QUICK: + { + GGML_V3_ASSERT(false); // TODO: not implemented + } break; + case GGML_V3_UNARY_OP_SILU: + { + // necessary for llama + if (src0->grad) { + src0->grad = ggml_v3_add_or_set(ctx, + src0->grad, + ggml_v3_silu_back(ctx, src0, tensor->grad), + zero_table); + } + } break; + default: + GGML_V3_ASSERT(false); + } + } break; + case GGML_V3_OP_GET_REL_POS: + case GGML_V3_OP_ADD_REL_POS: + case GGML_V3_OP_MAP_UNARY: + case GGML_V3_OP_MAP_BINARY: + case GGML_V3_OP_MAP_CUSTOM1_F32: + case 
GGML_V3_OP_MAP_CUSTOM2_F32: + case GGML_V3_OP_MAP_CUSTOM3_F32: + case GGML_V3_OP_MAP_CUSTOM1: + case GGML_V3_OP_MAP_CUSTOM2: + case GGML_V3_OP_MAP_CUSTOM3: + { + GGML_V3_ASSERT(false); // not supported + } break; + case GGML_V3_OP_CROSS_ENTROPY_LOSS: + { + if (src0->grad) { + src0->grad = ggml_v3_add_or_set(ctx, + src0->grad, + ggml_v3_cross_entropy_loss_back(ctx, + src0, + src1, + tensor->grad), + zero_table); + } + } break; + case GGML_V3_OP_CROSS_ENTROPY_LOSS_BACK: + { + GGML_V3_ASSERT(false); // not supported + } break; + case GGML_V3_OP_NONE: + { + // nop + } break; + case GGML_V3_OP_COUNT: + { + GGML_V3_ASSERT(false); + } break; + } + + for (int i = 0; i < GGML_V3_MAX_SRC; ++i) { + if (tensor->src[i] && tensor->src[i]->grad) { + GGML_V3_ASSERT(ggml_v3_are_same_shape(tensor->src[i], tensor->src[i]->grad)); + } + } +} + +static void ggml_v3_visit_parents(struct ggml_v3_cgraph * cgraph, struct ggml_v3_tensor * node) { + if (node->grad == NULL) { + // this usually happens when we generate intermediate nodes from constants in the backward pass + // it can also happen during forward pass, if the user performs computations with constants + if (node->op != GGML_V3_OP_NONE) { + //GGML_V3_PRINT_DEBUG("%s: warning: node %p has no grad, but op %d\n", __func__, (void *) node, node->op); + } + } + + // check if already visited + if (ggml_v3_hash_insert(cgraph->visited_hash_table, node) == GGML_V3_HASHTABLE_ALREADY_EXISTS) { + return; + } + + for (int i = 0; i < GGML_V3_MAX_SRC; ++i) { + const int k = + (cgraph->order == GGML_V3_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? i : + (cgraph->order == GGML_V3_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? (GGML_V3_MAX_SRC-1-i) : + /* unknown order, just fall back to using i*/ i; + if (node->src[k]) { + ggml_v3_visit_parents(cgraph, node->src[k]); + } + } + + if (node->op == GGML_V3_OP_NONE && node->grad == NULL) { + // reached a leaf node, not part of the gradient graph (e.g. 
a constant) + GGML_V3_ASSERT(cgraph->n_leafs < cgraph->size); + + if (strlen(node->name) == 0) { + ggml_v3_format_name(node, "leaf_%d", cgraph->n_leafs); + } + + cgraph->leafs[cgraph->n_leafs] = node; + cgraph->n_leafs++; + } else { + GGML_V3_ASSERT(cgraph->n_nodes < cgraph->size); + + if (strlen(node->name) == 0) { + ggml_v3_format_name(node, "node_%d", cgraph->n_nodes); + } + + cgraph->nodes[cgraph->n_nodes] = node; + if (cgraph->grads) { + cgraph->grads[cgraph->n_nodes] = node->grad; + } + cgraph->n_nodes++; + } +} + +static void ggml_v3_build_forward_impl(struct ggml_v3_cgraph * cgraph, struct ggml_v3_tensor * tensor, bool expand) { + if (!expand) { + // TODO: this branch isn't accessible anymore, maybe move this to ggml_v3_build_forward_expand + ggml_v3_graph_clear(cgraph); + } + + const int n0 = cgraph->n_nodes; + UNUSED(n0); + + ggml_v3_visit_parents(cgraph, tensor); + + const int n_new = cgraph->n_nodes - n0; + GGML_V3_PRINT_DEBUG("%s: visited %d new nodes\n", __func__, n_new); + + if (n_new > 0) { + // the last added node should always be starting point + GGML_V3_ASSERT(cgraph->nodes[cgraph->n_nodes - 1] == tensor); + } +} + +void ggml_v3_build_forward_expand(struct ggml_v3_cgraph * cgraph, struct ggml_v3_tensor * tensor) { + ggml_v3_build_forward_impl(cgraph, tensor, true); +} + +void ggml_v3_build_backward_expand(struct ggml_v3_context * ctx, struct ggml_v3_cgraph * gf, struct ggml_v3_cgraph * gb, bool keep) { + GGML_V3_ASSERT(gf->n_nodes > 0); + + // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph + if (keep) { + for (int i = 0; i < gf->n_nodes; i++) { + struct ggml_v3_tensor * node = gf->nodes[i]; + + if (node->grad) { + node->grad = ggml_v3_dup_tensor(ctx, node); + gf->grads[i] = node->grad; + } + } + } + + // remember original gradients which start with zero values + struct ggml_v3_hash_set zero_table = ggml_v3_hash_set_new(gf->size); + for (int i = 0; i < gf->n_nodes; i++) { + if (gf->grads[i]) { + ggml_v3_hash_insert(zero_table, gf->grads[i]); + } + } + + for (int i = gf->n_nodes - 1; i >= 0; i--) { + struct ggml_v3_tensor * node = gf->nodes[i]; + + // inplace operations to add gradients are not created by ggml_v3_compute_backward + // use allocator to automatically make inplace operations + if (node->grad) { + ggml_v3_compute_backward(ctx, node, zero_table); + } + } + + for (int i = 0; i < gf->n_nodes; i++) { + struct ggml_v3_tensor * node = gf->nodes[i]; + + if (node->is_param) { + GGML_V3_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node); + ggml_v3_build_forward_expand(gb, node->grad); + } + } + + ggml_v3_hash_set_free(zero_table); +} + +static size_t ggml_v3_graph_nbytes(size_t size, bool grads) { + size_t nbytes = sizeof(struct ggml_v3_cgraph); + nbytes += size * sizeof(struct ggml_v3_tensor *) * 2; // leafs + nodes + if (grads) { + nbytes += size * sizeof(struct ggml_v3_tensor *); // grads + } + nbytes += ggml_v3_hash_size(size * 2) * sizeof(struct ggml_v3_tensor *); // hash set + return nbytes; +} + +size_t ggml_v3_graph_overhead_custom(size_t size, bool grads) { + return GGML_V3_OBJECT_SIZE + GGML_V3_PAD(ggml_v3_graph_nbytes(size, grads), GGML_V3_MEM_ALIGN); +} + +size_t ggml_v3_graph_overhead(void) { + return ggml_v3_graph_overhead_custom(GGML_V3_DEFAULT_GRAPH_SIZE, false); +} + +struct ggml_v3_cgraph * ggml_v3_new_graph_custom(struct ggml_v3_context * ctx, size_t size, bool grads) { + const size_t obj_size = ggml_v3_graph_nbytes(size, grads); + struct ggml_v3_object * obj = ggml_v3_new_object(ctx, 
GGML_V3_OBJECT_GRAPH, obj_size); + struct ggml_v3_cgraph * cgraph = (struct ggml_v3_cgraph *) ((char *) ctx->mem_buffer + obj->offs); + + struct ggml_v3_tensor ** data_start = (struct ggml_v3_tensor **) (cgraph + 1); + + size_t hash_size = ggml_v3_hash_size(size * 2); + struct ggml_v3_tensor ** nodes_ptr = data_start; + struct ggml_v3_tensor ** leafs_ptr = nodes_ptr + size; + struct ggml_v3_tensor ** hash_keys_ptr = leafs_ptr + size; + struct ggml_v3_tensor ** grads_ptr = grads ? hash_keys_ptr + hash_size : NULL; + + // check that we allocated the correct amount of memory + assert(obj_size == (size_t) ( + (grads ? (char *)(grads_ptr + size) : (char *)(hash_keys_ptr + hash_size)) - (char *)cgraph)); + + memset(hash_keys_ptr, 0, hash_size * sizeof(struct ggml_v3_tensor *)); + + *cgraph = (struct ggml_v3_cgraph) { + /*.size =*/ size, + /*.n_nodes =*/ 0, + /*.n_leafs =*/ 0, + /*.nodes =*/ nodes_ptr, + /*.grads =*/ grads_ptr, + /*.leafs =*/ leafs_ptr, + /*.hash_table =*/ { hash_size, hash_keys_ptr }, + /*.order =*/ GGML_V3_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, + /*.perf_runs =*/ 0, + /*.perf_cycles =*/ 0, + /*.perf_time_us =*/ 0, + }; + + return cgraph; +} + +struct ggml_v3_cgraph * ggml_v3_new_graph(struct ggml_v3_context * ctx) { + return ggml_v3_new_graph_custom(ctx, GGML_V3_DEFAULT_GRAPH_SIZE, false); +} + +struct ggml_v3_cgraph ggml_v3_graph_view(struct ggml_v3_cgraph * cgraph0, int i0, int i1) { + struct ggml_v3_cgraph cgraph = { + /*.size =*/ 0, + /*.n_nodes =*/ i1 - i0, + /*.n_leafs =*/ 0, + /*.nodes =*/ cgraph0->nodes + i0, + /*.grads =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL, + /*.leafs =*/ NULL, + /*.hash_table =*/ { 0, NULL }, + /*.order =*/ cgraph0->order, + /*.perf_runs =*/ 0, + /*.perf_cycles =*/ 0, + /*.perf_time_us =*/ 0, + }; + + return cgraph; +} + +void ggml_v3_graph_cpy(struct ggml_v3_cgraph * src, struct ggml_v3_cgraph * dst) { + GGML_V3_ASSERT(dst->size >= src->n_leafs); + GGML_V3_ASSERT(dst->size >= src->n_nodes); + GGML_V3_ASSERT(dst->visited_hash_table.size >= src->visited_hash_table.size); + + dst->n_leafs = src->n_leafs; + dst->n_nodes = src->n_nodes; + dst->order = src->order; + + for (int i = 0; i < src->n_leafs; ++i) { + dst->leafs[i] = src->leafs[i]; + } + + for (int i = 0; i < src->n_nodes; ++i) { + dst->nodes[i] = src->nodes[i]; + } + + if (src->grads) { + GGML_V3_ASSERT(dst->grads != NULL); + for (int i = 0; i < src->n_nodes; ++i) { + dst->grads[i] = src->grads[i]; + } + } + + for (size_t i = 0; i < src->visited_hash_table.size; ++i) { + if (src->visited_hash_table.keys[i]) { + ggml_v3_hash_insert(dst->visited_hash_table, src->visited_hash_table.keys[i]); + } + } +} + +struct ggml_v3_cgraph * ggml_v3_graph_dup(struct ggml_v3_context * ctx, struct ggml_v3_cgraph * cgraph) { + struct ggml_v3_cgraph * result = ggml_v3_new_graph_custom(ctx, cgraph->size, cgraph->grads != NULL); + ggml_v3_graph_cpy(cgraph, result); + return result; +} + +void ggml_v3_graph_reset(struct ggml_v3_cgraph * cgraph) { + GGML_V3_ASSERT(cgraph->grads != NULL); + + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_v3_tensor * grad = cgraph->grads[i]; + + if (grad) { + ggml_v3_set_zero(grad); + } + } +} + +void ggml_v3_graph_clear(struct ggml_v3_cgraph * cgraph) { + cgraph->n_leafs = 0; + cgraph->n_nodes = 0; + memset(cgraph->visited_hash_table.keys, 0, cgraph->visited_hash_table.size * sizeof(struct ggml_v3_tensor *)); +} + +// +// thread data +// +// synchronization is done via busy loops +// I tried using spin locks, but not sure how to use them correctly - the things I tried 
were slower than busy loops +// + +#ifdef __APPLE__ + +//#include +// +//typedef os_unfair_lock ggml_v3_lock_t; +// +//#define ggml_v3_lock_init(x) UNUSED(x) +//#define ggml_v3_lock_destroy(x) UNUSED(x) +//#define ggml_v3_lock_lock os_unfair_lock_lock +//#define ggml_v3_lock_unlock os_unfair_lock_unlock +// +//#define GGML_V3_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT + +typedef int ggml_v3_lock_t; + +#define ggml_v3_lock_init(x) UNUSED(x) +#define ggml_v3_lock_destroy(x) UNUSED(x) +#define ggml_v3_lock_lock(x) UNUSED(x) +#define ggml_v3_lock_unlock(x) UNUSED(x) + +#define GGML_V3_LOCK_INITIALIZER 0 + +typedef pthread_t ggml_v3_thread_t; + +#define ggml_v3_thread_create pthread_create +#define ggml_v3_thread_join pthread_join + +#else + +//typedef pthread_spinlock_t ggml_v3_lock_t; + +//#define ggml_v3_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE) +//#define ggml_v3_lock_destroy pthread_spin_destroy +//#define ggml_v3_lock_lock pthread_spin_lock +//#define ggml_v3_lock_unlock pthread_spin_unlock + +typedef int ggml_v3_lock_t; + +#define ggml_v3_lock_init(x) UNUSED(x) +#define ggml_v3_lock_destroy(x) UNUSED(x) +#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) +#define ggml_v3_lock_lock(x) _mm_pause() +#else +#define ggml_v3_lock_lock(x) UNUSED(x) +#endif +#define ggml_v3_lock_unlock(x) UNUSED(x) + +#define GGML_V3_LOCK_INITIALIZER 0 + +typedef pthread_t ggml_v3_thread_t; + +#define ggml_v3_thread_create pthread_create +#define ggml_v3_thread_join pthread_join + +#endif + +// Android's libc implementation "bionic" does not support setting affinity +#if defined(__linux__) && !defined(__BIONIC__) +static void set_numa_thread_affinity(int thread_n, int n_threads) { + if (!ggml_v3_is_numa()) { + return; + } + + // run thread on node_num thread_n / (threads per node) + const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes); + struct ggml_v3_numa_node * node = &g_state.numa.nodes[node_num]; + size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus); + + cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus); + CPU_ZERO_S(setsize, cpus); + for (size_t i = 0; i < node->n_cpus; ++i) { + CPU_SET_S(node->cpus[i], setsize, cpus); + } + + int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); + if (rv) { + fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", + strerror(rv)); + } + + CPU_FREE(cpus); +} + +static void clear_numa_thread_affinity(void) { + if (!ggml_v3_is_numa()) { + return; + } + + size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus); + + cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus); + CPU_ZERO_S(setsize, cpus); + for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) { + CPU_SET_S(i, setsize, cpus); + } + + int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); + if (rv) { + fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", + strerror(rv)); + } + + CPU_FREE(cpus); +} +#else +// TODO: Windows etc. 
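+// editor's note: for win32, an untested sketch (an assumption, not part of this patch) could pin the
+// calling thread with SetThreadAffinityMask(); the affinity mask derivation from g_state.numa below is
+// hypothetical and the call ignores processor groups, so it is left commented out as a starting point only:
+//
+//     static void set_numa_thread_affinity(int thread_n, int n_threads) {
+//         // mirror the linux logic above: map this thread to one NUMA node
+//         const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes);
+//         DWORD_PTR mask = 0; // hypothetical: set one bit per CPU in g_state.numa.nodes[node_num]
+//         if (SetThreadAffinityMask(GetCurrentThread(), mask) == 0) {
+//             fprintf(stderr, "warning: SetThreadAffinityMask() failed\n");
+//         }
+//     }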
+// (the linux implementation may also work on BSD, someone should test) +static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); } +static void clear_numa_thread_affinity(void) {} +#endif + +struct ggml_v3_compute_state_shared { + const struct ggml_v3_cgraph * cgraph; + const struct ggml_v3_cplan * cplan; + + int64_t perf_node_start_cycles; + int64_t perf_node_start_time_us; + + const int n_threads; + + // synchronization primitives + atomic_int n_active; // num active threads + atomic_int node_n; // active graph node + + bool (*abort_callback)(void * data); // abort ggml_v3_graph_compute when true + void * abort_callback_data; +}; + +struct ggml_v3_compute_state { + ggml_v3_thread_t thrd; + int ith; + struct ggml_v3_compute_state_shared * shared; +}; + +static void ggml_v3_graph_compute_perf_stats_node(struct ggml_v3_tensor * node, const struct ggml_v3_compute_state_shared * st) { + int64_t cycles_cur = ggml_v3_perf_cycles() - st->perf_node_start_cycles; + int64_t time_us_cur = ggml_v3_perf_time_us() - st->perf_node_start_time_us; + + node->perf_runs++; + node->perf_cycles += cycles_cur; + node->perf_time_us += time_us_cur; +} + +static int ggml_v3_get_n_tasks(struct ggml_v3_tensor * node, int n_threads) { + int n_tasks = 0; + + switch (node->op) { + case GGML_V3_OP_CPY: + case GGML_V3_OP_DUP: + case GGML_V3_OP_ADD: + case GGML_V3_OP_ADD1: + case GGML_V3_OP_ACC: + { + n_tasks = n_threads; + } break; + case GGML_V3_OP_SUB: + case GGML_V3_OP_SQR: + case GGML_V3_OP_SQRT: + case GGML_V3_OP_LOG: + case GGML_V3_OP_SUM: + case GGML_V3_OP_SUM_ROWS: + case GGML_V3_OP_MEAN: + case GGML_V3_OP_ARGMAX: + case GGML_V3_OP_REPEAT: + case GGML_V3_OP_REPEAT_BACK: + case GGML_V3_OP_LEAKY_RELU: + { + n_tasks = 1; + } break; + case GGML_V3_OP_UNARY: + switch (ggml_v3_get_unary_op(node)) { + case GGML_V3_UNARY_OP_ABS: + case GGML_V3_UNARY_OP_SGN: + case GGML_V3_UNARY_OP_NEG: + case GGML_V3_UNARY_OP_STEP: + case GGML_V3_UNARY_OP_TANH: + case GGML_V3_UNARY_OP_ELU: + case GGML_V3_UNARY_OP_RELU: + { + n_tasks = 1; + } break; + + case GGML_V3_UNARY_OP_GELU: + case GGML_V3_UNARY_OP_GELU_QUICK: + case GGML_V3_UNARY_OP_SILU: + { + n_tasks = n_threads; + } break; + default: + GGML_V3_ASSERT(false); + } + break; + case GGML_V3_OP_SILU_BACK: + case GGML_V3_OP_MUL: + case GGML_V3_OP_DIV: + case GGML_V3_OP_NORM: + case GGML_V3_OP_RMS_NORM: + case GGML_V3_OP_RMS_NORM_BACK: + case GGML_V3_OP_GROUP_NORM: + case GGML_V3_OP_CONCAT: + { + n_tasks = n_threads; + } break; + case GGML_V3_OP_MUL_MAT: + { + n_tasks = n_threads; + + // TODO: use different scheduling for different matrix sizes + //const int nr0 = ggml_v3_nrows(node->src[0]); + //const int nr1 = ggml_v3_nrows(node->src[1]); + + //n_tasks = MIN(n_threads, MAX(1, nr0/128)); + //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks); + } break; + case GGML_V3_OP_MUL_MAT_ID: + { + n_tasks = n_threads; + } break; + case GGML_V3_OP_OUT_PROD: + { + n_tasks = n_threads; + } break; + case GGML_V3_OP_SCALE: + case GGML_V3_OP_SET: + case GGML_V3_OP_CONT: + case GGML_V3_OP_RESHAPE: + case GGML_V3_OP_VIEW: + case GGML_V3_OP_PERMUTE: + case GGML_V3_OP_TRANSPOSE: + case GGML_V3_OP_GET_ROWS: + case GGML_V3_OP_GET_ROWS_BACK: + case GGML_V3_OP_DIAG: + { + n_tasks = 1; + } break; + case GGML_V3_OP_DIAG_MASK_ZERO: + case GGML_V3_OP_DIAG_MASK_INF: + case GGML_V3_OP_SOFT_MAX_BACK: + case GGML_V3_OP_ROPE: + case GGML_V3_OP_ROPE_BACK: + case GGML_V3_OP_ADD_REL_POS: + { + n_tasks = n_threads; + } break; + case 
GGML_V3_OP_ALIBI: + { + n_tasks = 1; //TODO + } break; + case GGML_V3_OP_CLAMP: + { + n_tasks = 1; //TODO + } break; + case GGML_V3_OP_SOFT_MAX: + { + n_tasks = MIN(MIN(4, n_threads), ggml_v3_nrows(node->src[0])); + } break; + case GGML_V3_OP_CONV_TRANSPOSE_1D: + { + n_tasks = n_threads; + } break; + case GGML_V3_OP_IM2COL: + { + n_tasks = n_threads; + } break; + case GGML_V3_OP_CONV_TRANSPOSE_2D: + { + n_tasks = n_threads; + } break; + case GGML_V3_OP_POOL_1D: + case GGML_V3_OP_POOL_2D: + { + n_tasks = 1; + } break; + case GGML_V3_OP_UPSCALE: + { + n_tasks = n_threads; + } break; + case GGML_V3_OP_PAD: + { + n_tasks = n_threads; + } break; + case GGML_V3_OP_ARGSORT: + { + n_tasks = n_threads; + } break; + case GGML_V3_OP_FLASH_ATTN: + { + n_tasks = n_threads; + } break; + case GGML_V3_OP_FLASH_FF: + { + n_tasks = n_threads; + } break; + case GGML_V3_OP_FLASH_ATTN_BACK: + { + n_tasks = n_threads; + } break; + case GGML_V3_OP_WIN_PART: + case GGML_V3_OP_WIN_UNPART: + case GGML_V3_OP_GET_REL_POS: + case GGML_V3_OP_MAP_UNARY: + case GGML_V3_OP_MAP_BINARY: + case GGML_V3_OP_MAP_CUSTOM1_F32: + case GGML_V3_OP_MAP_CUSTOM2_F32: + case GGML_V3_OP_MAP_CUSTOM3_F32: + { + n_tasks = 1; + } break; + case GGML_V3_OP_MAP_CUSTOM1: + { + struct ggml_v3_map_custom1_op_params * p = (struct ggml_v3_map_custom1_op_params *) node->op_params; + if (p->n_tasks == GGML_V3_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = MIN(p->n_tasks, n_threads); + } + } break; + case GGML_V3_OP_MAP_CUSTOM2: + { + struct ggml_v3_map_custom2_op_params * p = (struct ggml_v3_map_custom2_op_params *) node->op_params; + if (p->n_tasks == GGML_V3_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = MIN(p->n_tasks, n_threads); + } + } break; + case GGML_V3_OP_MAP_CUSTOM3: + { + struct ggml_v3_map_custom3_op_params * p = (struct ggml_v3_map_custom3_op_params *) node->op_params; + if (p->n_tasks == GGML_V3_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = MIN(p->n_tasks, n_threads); + } + } break; + case GGML_V3_OP_CROSS_ENTROPY_LOSS: + { + n_tasks = n_threads; + } break; + case GGML_V3_OP_CROSS_ENTROPY_LOSS_BACK: + { + n_tasks = n_threads; + } break; + case GGML_V3_OP_NONE: + { + n_tasks = 1; + } break; + case GGML_V3_OP_COUNT: + { + GGML_V3_ASSERT(false); + } break; + default: + { + fprintf(stderr, "%s: op not implemented: ", __func__); + if (node->op < GGML_V3_OP_COUNT) { + fprintf(stderr, "%s\n", ggml_v3_op_name(node->op)); + } else { + fprintf(stderr, "%d\n", node->op); + } + GGML_V3_ASSERT(false); + } break; + } + + assert(n_tasks > 0); + + return n_tasks; +} + +static thread_ret_t ggml_v3_graph_compute_thread(void * data) { + struct ggml_v3_compute_state * state = (struct ggml_v3_compute_state *) data; + + const struct ggml_v3_cgraph * cgraph = state->shared->cgraph; + const struct ggml_v3_cplan * cplan = state->shared->cplan; + + const int n_threads = state->shared->n_threads; + + set_numa_thread_affinity(state->ith, n_threads); + + int node_n = -1; + + while (true) { + if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + state->shared->node_n += 1; + return (thread_ret_t) GGML_V3_EXIT_ABORTED; + } + + if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { + // all other threads are finished and spinning + // do finalize and init here so we don't have synchronize again + struct ggml_v3_compute_params params = { + /*.type =*/ GGML_V3_TASK_FINALIZE, + /*.ith =*/ 0, + /*.nth =*/ 0, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, + }; + + if (node_n != 
-1) { + /* FINALIZE */ + struct ggml_v3_tensor * node = cgraph->nodes[node_n]; + if (GGML_V3_OP_HAS_FINALIZE[node->op]) { + params.nth = ggml_v3_get_n_tasks(node, n_threads); + ggml_v3_compute_forward(¶ms, node); + } + ggml_v3_graph_compute_perf_stats_node(node, state->shared); + } + + // distribute new work or execute it direct if 1T + while (++node_n < cgraph->n_nodes) { + GGML_V3_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); + + struct ggml_v3_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = ggml_v3_get_n_tasks(node, n_threads); + + state->shared->perf_node_start_cycles = ggml_v3_perf_cycles(); + state->shared->perf_node_start_time_us = ggml_v3_perf_time_us(); + + params.nth = n_tasks; + + /* INIT */ + if (GGML_V3_OP_HAS_INIT[node->op]) { + params.type = GGML_V3_TASK_INIT; + ggml_v3_compute_forward(¶ms, node); + } + + if (n_tasks == 1) { + // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, + // they do something more efficient than spinning (?) + params.type = GGML_V3_TASK_COMPUTE; + ggml_v3_compute_forward(¶ms, node); + + if (GGML_V3_OP_HAS_FINALIZE[node->op]) { + params.type = GGML_V3_TASK_FINALIZE; + ggml_v3_compute_forward(¶ms, node); + } + + ggml_v3_graph_compute_perf_stats_node(node, state->shared); + } else { + break; + } + + if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + break; + } + } + + atomic_store(&state->shared->n_active, n_threads); + atomic_store(&state->shared->node_n, node_n); + } else { + // wait for other threads to finish + const int last = node_n; + + const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_V3_OP_MUL_MAT; + + while (true) { + // TODO: this sched_yield can have significant impact on the performance - either positive or negative + // depending on the workload and the operating system. 
+ // since it is not clear what is the best approach, it should potentially become user-configurable + // ref: https://github.com/ggerganov/ggml/issues/291 + // UPD: adding the do_yield flag seems to resolve the issue universally + if (do_yield) { + sched_yield(); + } + + node_n = atomic_load(&state->shared->node_n); + if (node_n != last) break; + }; + } + + // check if we should stop + if (node_n >= cgraph->n_nodes) break; + + /* COMPUTE */ + struct ggml_v3_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = ggml_v3_get_n_tasks(node, n_threads); + + struct ggml_v3_compute_params params = { + /*.type =*/ GGML_V3_TASK_COMPUTE, + /*.ith =*/ state->ith, + /*.nth =*/ n_tasks, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, + }; + + if (state->ith < n_tasks) { + ggml_v3_compute_forward(¶ms, node); + } + } + + return GGML_V3_EXIT_SUCCESS; +} + +struct ggml_v3_cplan ggml_v3_graph_plan(struct ggml_v3_cgraph * cgraph, int n_threads) { + if (n_threads <= 0) { + n_threads = GGML_V3_DEFAULT_N_THREADS; + } + + size_t work_size = 0; + + struct ggml_v3_cplan cplan; + memset(&cplan, 0, sizeof(struct ggml_v3_cplan)); + + // thread scheduling for the different operations + work buffer size estimation + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_v3_tensor * node = cgraph->nodes[i]; + + const int n_tasks = ggml_v3_get_n_tasks(node, n_threads); + + size_t cur = 0; + + switch (node->op) { + case GGML_V3_OP_CPY: + case GGML_V3_OP_DUP: + { + if (ggml_v3_is_quantized(node->type)) { + cur = ggml_v3_type_size(GGML_V3_TYPE_F32) * node->ne[0] * n_tasks; + } + } break; + case GGML_V3_OP_ADD: + case GGML_V3_OP_ADD1: + { + if (ggml_v3_is_quantized(node->src[0]->type)) { + cur = ggml_v3_type_size(GGML_V3_TYPE_F32) * node->src[0]->ne[0] * n_tasks; + } + } break; + case GGML_V3_OP_ACC: + { + if (ggml_v3_is_quantized(node->src[0]->type)) { + cur = ggml_v3_type_size(GGML_V3_TYPE_F32) * node->src[1]->ne[0] * n_tasks; + } + } break; + case GGML_V3_OP_MUL_MAT: + { + const enum ggml_v3_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type; + +#if defined(GGML_USE_CLBLAST) + if (ggml_v3_cl_can_mul_mat(node->src[0], node->src[1], node)) { + cur = ggml_v3_cl_mul_mat_get_wsize(node->src[0], node->src[1], node); + } else +#endif +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + if (ggml_v3_compute_forward_mul_mat_use_blas(node)) { + if (node->src[0]->type != GGML_V3_TYPE_F32) { + // here we need memory just for single 2D matrix from src0 + cur = ggml_v3_type_size(GGML_V3_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]); + } + } else +#endif + if (node->src[1]->type != vec_dot_type) { + cur = ggml_v3_row_size(vec_dot_type, ggml_v3_nelements(node->src[1])); + } + } break; + case GGML_V3_OP_MUL_MAT_ID: + { + const struct ggml_v3_tensor * src0 = node->src[2]; + const struct ggml_v3_tensor * src1 = node->src[1]; + const enum ggml_v3_type vec_dot_type = type_traits[src0->type].vec_dot_type; + if (src1->type != vec_dot_type) { + cur = ggml_v3_row_size(vec_dot_type, ggml_v3_nelements(src1)); + } + const int n_as = ggml_v3_get_op_params_i32(node, 1); + cur = GGML_V3_PAD(cur, sizeof(int64_t)); // align + cur += n_as * sizeof(int64_t); // matrix_row_counts + cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows + } break; + case GGML_V3_OP_OUT_PROD: + { + if (ggml_v3_is_quantized(node->src[0]->type)) { + cur = ggml_v3_type_size(GGML_V3_TYPE_F32) * node->src[0]->ne[0] * n_tasks; + } + } break; + case GGML_V3_OP_SOFT_MAX: + { + cur = ggml_v3_type_size(GGML_V3_TYPE_F32) * 
node->ne[0] * n_tasks; + } break; + case GGML_V3_OP_CONV_TRANSPOSE_1D: + { + GGML_V3_ASSERT(node->src[0]->ne[3] == 1); + GGML_V3_ASSERT(node->src[1]->ne[2] == 1); + GGML_V3_ASSERT(node->src[1]->ne[3] == 1); + + const int64_t ne00 = node->src[0]->ne[0]; // K + const int64_t ne01 = node->src[0]->ne[1]; // Cout + const int64_t ne02 = node->src[0]->ne[2]; // Cin + + const int64_t ne10 = node->src[1]->ne[0]; // L + const int64_t ne11 = node->src[1]->ne[1]; // Cin + + if (node->src[0]->type == GGML_V3_TYPE_F16 && + node->src[1]->type == GGML_V3_TYPE_F32) { + cur += sizeof(ggml_v3_fp16_t)*ne00*ne01*ne02; + cur += sizeof(ggml_v3_fp16_t)*ne10*ne11; + } else if (node->src[0]->type == GGML_V3_TYPE_F32 && + node->src[1]->type == GGML_V3_TYPE_F32) { + cur += sizeof(float)*ne00*ne01*ne02; + cur += sizeof(float)*ne10*ne11; + } else { + GGML_V3_ASSERT(false); + } + } break; + case GGML_V3_OP_CONV_TRANSPOSE_2D: + { + const int64_t ne00 = node->src[0]->ne[0]; // W + const int64_t ne01 = node->src[0]->ne[1]; // H + const int64_t ne02 = node->src[0]->ne[2]; // Channels Out + const int64_t ne03 = node->src[0]->ne[3]; // Channels In + + const int64_t ne10 = node->src[1]->ne[0]; // W + const int64_t ne11 = node->src[1]->ne[1]; // H + const int64_t ne12 = node->src[1]->ne[2]; // Channels In + + cur += sizeof(ggml_v3_fp16_t)*ne00*ne01*ne02*ne03; + cur += sizeof(ggml_v3_fp16_t)*ne10*ne11*ne12; + } break; + case GGML_V3_OP_FLASH_ATTN: + { + const int64_t ne11 = ggml_v3_up(node->src[1]->ne[1], GGML_V3_SOFT_MAX_UNROLL); + + if (node->src[1]->type == GGML_V3_TYPE_F32) { + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 + } else if (node->src[1]->type == GGML_V3_TYPE_F16) { + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 + } + } break; + case GGML_V3_OP_FLASH_FF: + { + if (node->src[1]->type == GGML_V3_TYPE_F32) { + cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2 + } else if (node->src[1]->type == GGML_V3_TYPE_F16) { + cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2 + } + } break; + case GGML_V3_OP_FLASH_ATTN_BACK: + { + const int64_t D = node->src[0]->ne[0]; + const int64_t ne11 = ggml_v3_up(node->src[1]->ne[1], GGML_V3_SOFT_MAX_UNROLL); + const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_v3_compute_forward_flash_attn_back + if (node->src[1]->type == GGML_V3_TYPE_F32) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } else if (node->src[1]->type == GGML_V3_TYPE_F16) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } + } break; + + case GGML_V3_OP_CROSS_ENTROPY_LOSS: + { + cur = ggml_v3_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks); + } break; + case GGML_V3_OP_COUNT: + { + GGML_V3_ASSERT(false); + } break; + default: + break; + } + + work_size = MAX(work_size, cur); + } + + if (work_size > 0) { + work_size += CACHE_LINE_SIZE*(n_threads - 1); + } + + cplan.n_threads = n_threads; + cplan.work_size = work_size; + cplan.work_data = NULL; + + return cplan; +} + +int 
ggml_v3_graph_compute(struct ggml_v3_cgraph * cgraph, struct ggml_v3_cplan * cplan) { + { + GGML_V3_ASSERT(cplan); + GGML_V3_ASSERT(cplan->n_threads > 0); + + if (cplan->work_size > 0) { + GGML_V3_ASSERT(cplan->work_data); + } + } + + const int n_threads = cplan->n_threads; + + struct ggml_v3_compute_state_shared state_shared = { + /*.cgraph =*/ cgraph, + /*.cgraph_plan =*/ cplan, + /*.perf_node_start_cycles =*/ 0, + /*.perf_node_start_time_us =*/ 0, + /*.n_threads =*/ n_threads, + /*.n_active =*/ n_threads, + /*.node_n =*/ -1, + /*.abort_callback =*/ NULL, + /*.abort_callback_data =*/ NULL, + }; + struct ggml_v3_compute_state * workers = alloca(sizeof(struct ggml_v3_compute_state)*n_threads); + + // create thread pool + if (n_threads > 1) { + for (int j = 1; j < n_threads; ++j) { + workers[j] = (struct ggml_v3_compute_state) { + .thrd = 0, + .ith = j, + .shared = &state_shared, + }; + + const int rc = ggml_v3_thread_create(&workers[j].thrd, NULL, ggml_v3_graph_compute_thread, &workers[j]); + GGML_V3_ASSERT(rc == 0); + UNUSED(rc); + } + } + + workers[0].ith = 0; + workers[0].shared = &state_shared; + + const int64_t perf_start_cycles = ggml_v3_perf_cycles(); + const int64_t perf_start_time_us = ggml_v3_perf_time_us(); + + // this is a work thread too + int compute_status = (size_t) ggml_v3_graph_compute_thread(&workers[0]); + + // don't leave affinity set on the main thread + clear_numa_thread_affinity(); + + // join or kill thread pool + if (n_threads > 1) { + for (int j = 1; j < n_threads; j++) { + const int rc = ggml_v3_thread_join(workers[j].thrd, NULL); + GGML_V3_ASSERT(rc == 0); + } + } + + // performance stats (graph) + { + int64_t perf_cycles_cur = ggml_v3_perf_cycles() - perf_start_cycles; + int64_t perf_time_us_cur = ggml_v3_perf_time_us() - perf_start_time_us; + + cgraph->perf_runs++; + cgraph->perf_cycles += perf_cycles_cur; + cgraph->perf_time_us += perf_time_us_cur; + + GGML_V3_PRINT_DEBUG("%s: perf (%d) - cpu = %.3f / %.3f ms, wall = %.3f / %.3f ms\n", + __func__, cgraph->perf_runs, + (double) perf_cycles_cur / (double) ggml_v3_cycles_per_ms(), + (double) cgraph->perf_cycles / (double) ggml_v3_cycles_per_ms() / (double) cgraph->perf_runs, + (double) perf_time_us_cur / 1000.0, + (double) cgraph->perf_time_us / 1000.0 / cgraph->perf_runs); + } + + return compute_status; +} + +void ggml_v3_graph_compute_with_ctx(struct ggml_v3_context * ctx, struct ggml_v3_cgraph * cgraph, int n_threads) { + struct ggml_v3_cplan cplan = ggml_v3_graph_plan(cgraph, n_threads); + + struct ggml_v3_object * obj = ggml_v3_new_object(ctx, GGML_V3_OBJECT_WORK_BUFFER, cplan.work_size); + + cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; + + ggml_v3_graph_compute(cgraph, &cplan); +} + +struct ggml_v3_tensor * ggml_v3_graph_get_tensor(struct ggml_v3_cgraph * cgraph, const char * name) { + for (int i = 0; i < cgraph->n_leafs; i++) { + struct ggml_v3_tensor * leaf = cgraph->leafs[i]; + + if (strcmp(leaf->name, name) == 0) { + return leaf; + } + } + + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_v3_tensor * node = cgraph->nodes[i]; + + if (strcmp(node->name, name) == 0) { + return node; + } + } + + return NULL; +} + +static void ggml_v3_graph_export_leaf(const struct ggml_v3_tensor * tensor, FILE * fout) { + const int64_t * ne = tensor->ne; + const size_t * nb = tensor->nb; + + fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n", + ggml_v3_type_name(tensor->type), + ggml_v3_op_name (tensor->op), + ggml_v3_n_dims(tensor), + 
ne[0], ne[1], ne[2], ne[3], + nb[0], nb[1], nb[2], nb[3], + tensor->data, + tensor->name); +} + +static void ggml_v3_graph_export_node(const struct ggml_v3_tensor * tensor, const char * arg, FILE * fout) { + const int64_t * ne = tensor->ne; + const size_t * nb = tensor->nb; + + fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n", + arg, + ggml_v3_type_name(tensor->type), + ggml_v3_op_name (tensor->op), + ggml_v3_n_dims(tensor), + ne[0], ne[1], ne[2], ne[3], + nb[0], nb[1], nb[2], nb[3], + tensor->data, + tensor->name); +} + +void ggml_v3_graph_export(const struct ggml_v3_cgraph * cgraph, const char * fname) { + uint64_t size_eval = 0; + + // compute size of intermediate results + // TODO: does not take into account scratch buffers !!!! + for (int i = 0; i < cgraph->n_nodes; ++i) { + size_eval += ggml_v3_nbytes_pad(cgraph->nodes[i]); + } + + // print + { + FILE * fout = stdout; + + fprintf(fout, "\n"); + fprintf(fout, "%-16s %8x\n", "magic", GGML_V3_FILE_MAGIC); + fprintf(fout, "%-16s %8d\n", "version", GGML_V3_FILE_VERSION); + fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs); + fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes); + fprintf(fout, "%-16s %" PRIu64 "\n", "eval", size_eval); + + // header + fprintf(fout, "\n"); + fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s %16s\n", + "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME"); + + for (int i = 0; i < cgraph->n_leafs; ++i) { + ggml_v3_graph_export_leaf(cgraph->leafs[i], fout); + + GGML_V3_ASSERT(cgraph->leafs[i]->op == GGML_V3_OP_NONE); + GGML_V3_ASSERT(cgraph->leafs[i]->src[0] == NULL); + GGML_V3_ASSERT(cgraph->leafs[i]->src[1] == NULL); + } + + // header + fprintf(fout, "\n"); + fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s %16s\n", + "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME"); + + for (int i = 0; i < cgraph->n_nodes; ++i) { + ggml_v3_graph_export_node(cgraph->nodes[i], "DST", fout); + + for (int j = 0; j < GGML_V3_MAX_SRC; ++j) { + if (cgraph->nodes[i]->src[j]) { + ggml_v3_graph_export_node(cgraph->nodes[i]->src[j], "SRC", fout); + } + } + + fprintf(fout, "\n"); + } + + fprintf(fout, "\n"); + } + + // write binary data + { + FILE * fout = fopen(fname, "wb"); + + if (!fout) { + fprintf(stderr, "%s: failed to open %s\n", __func__, fname); + return; + } + + // header + { + const uint32_t magic = GGML_V3_FILE_MAGIC; + const uint32_t version = GGML_V3_FILE_VERSION; + const uint32_t n_leafs = cgraph->n_leafs; + const uint32_t n_nodes = cgraph->n_nodes; + + fwrite(&magic, sizeof(uint32_t), 1, fout); + fwrite(&version, sizeof(uint32_t), 1, fout); + fwrite(&n_leafs, sizeof(uint32_t), 1, fout); + fwrite(&n_nodes, sizeof(uint32_t), 1, fout); + fwrite(&size_eval, sizeof(uint64_t), 1, fout); + } + + // leafs + { + for (int i = 0; i < cgraph->n_leafs; ++i) { + const struct ggml_v3_tensor * tensor = cgraph->leafs[i]; + + const uint32_t type = tensor->type; + const uint32_t op = tensor->op; + + fwrite(&type, sizeof(uint32_t), 1, fout); + fwrite(&op, sizeof(uint32_t), 1, fout); + + for (int j = 0; j < GGML_V3_MAX_DIMS; ++j) { + const uint64_t ne = tensor->ne[j]; + const uint64_t nb = tensor->nb[j]; + + fwrite(&ne, sizeof(uint64_t), 1, fout); + fwrite(&nb, sizeof(uint64_t), 1, fout); + } + + fwrite(tensor->name, sizeof(char), GGML_V3_MAX_NAME, fout); + fwrite(tensor->op_params, sizeof(char), 
GGML_V3_MAX_OP_PARAMS, fout); + + // dump the data + // TODO: pad this to 32 byte boundary + { + const size_t size = ggml_v3_nbytes(tensor); + + fwrite(tensor->data, sizeof(char), size, fout); + } + } + } + + // nodes + { + for (int i = 0; i < cgraph->n_nodes; ++i) { + const struct ggml_v3_tensor * tensor = cgraph->nodes[i]; + + const uint32_t type = tensor->type; + const uint32_t op = tensor->op; + + fwrite(&type, sizeof(uint32_t), 1, fout); + fwrite(&op, sizeof(uint32_t), 1, fout); + + for (int j = 0; j < GGML_V3_MAX_DIMS; ++j) { + const uint64_t ne = tensor->ne[j]; + const uint64_t nb = tensor->nb[j]; + + fwrite(&ne, sizeof(uint64_t), 1, fout); + fwrite(&nb, sizeof(uint64_t), 1, fout); + } + + fwrite(tensor->name, sizeof(char), GGML_V3_MAX_NAME, fout); + fwrite(tensor->op_params, sizeof(char), GGML_V3_MAX_OP_PARAMS, fout); + + // output the op arguments + { + struct ggml_v3_tensor * args[GGML_V3_MAX_SRC] = { NULL }; + + for (int j = 0; j < GGML_V3_MAX_SRC; ++j) { + args[j] = tensor->src[j]; + } + + for (int j = 0; j < GGML_V3_MAX_SRC; ++j) { + if (args[j]) { + int32_t idx = -1; + + // check if leaf + { + for (int k = 0; k < cgraph->n_leafs; ++k) { + if (args[j] == cgraph->leafs[k]) { + idx = k; + break; + } + } + } + + // check if node + if (idx == -1) { + for (int k = 0; k < cgraph->n_nodes; ++k) { + if (args[j] == cgraph->nodes[k]) { + idx = cgraph->n_leafs + k; + break; + } + } + } + + if (idx == -1) { + fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i); + fclose(fout); + return; + } + + fwrite(&idx, sizeof(int32_t), 1, fout); + } else { + const int32_t nul = -1; + + fwrite(&nul, sizeof(int32_t), 1, fout); + } + } + } + } + } + + fclose(fout); + } +} + +struct ggml_v3_cgraph * ggml_v3_graph_import(const char * fname, struct ggml_v3_context ** ctx_data, struct ggml_v3_context ** ctx_eval) { + assert(*ctx_data == NULL); + assert(*ctx_eval == NULL); + + struct ggml_v3_cgraph * result = NULL; + + struct ggml_v3_tensor * data = NULL; + + // read file into data + { + FILE * fin = fopen(fname, "rb"); + if (!fin) { + fprintf(stderr, "%s: failed to open %s\n", __func__, fname); + return result; + } + + size_t fsize = 0; + + fseek(fin, 0, SEEK_END); + fsize = ftell(fin); + fseek(fin, 0, SEEK_SET); + + // create the data context + { + const size_t overhead = 1*ggml_v3_tensor_overhead(); + + struct ggml_v3_init_params params = { + .mem_size = fsize + overhead, + .mem_buffer = NULL, + .no_alloc = false, + }; + + *ctx_data = ggml_v3_init(params); + + if (!*ctx_data) { + fprintf(stderr, "%s: failed to create ggml context\n", __func__); + fclose(fin); + return result; + } + } + + data = ggml_v3_new_tensor_1d(*ctx_data, GGML_V3_TYPE_I8, fsize); + + { + const size_t ret = fread(data->data, sizeof(char), fsize, fin); + if (ret != fsize) { + fprintf(stderr, "%s: failed to read %s\n", __func__, fname); + fclose(fin); + return result; + } + } + + fclose(fin); + } + + // populate result + { + char * ptr = (char *) data->data; + + const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic); + + if (magic != GGML_V3_FILE_MAGIC) { + fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic); + return result; + } + + const uint32_t version = *(const uint32_t *) ptr; ptr += sizeof(version); + + if (version != GGML_V3_FILE_VERSION) { + fprintf(stderr, "%s: invalid version number\n", __func__); + return result; + } + + const uint32_t n_leafs = *(const uint32_t *) ptr; ptr += sizeof(n_leafs); + const uint32_t n_nodes = *(const uint32_t *) ptr; ptr += 
sizeof(n_nodes); + const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval); + const int graph_size = MAX(n_leafs, n_nodes); + + // create the data context + { + const size_t overhead = (n_leafs + n_nodes)*ggml_v3_tensor_overhead() + ggml_v3_graph_overhead_custom(graph_size, false); + + struct ggml_v3_init_params params = { + .mem_size = size_eval + overhead, + .mem_buffer = NULL, + .no_alloc = true, + }; + + *ctx_eval = ggml_v3_init(params); + + if (!*ctx_eval) { + fprintf(stderr, "%s: failed to create ggml context\n", __func__); + return result; + } + } + + result = ggml_v3_new_graph_custom(*ctx_eval, graph_size, false); + + result->n_leafs = n_leafs; + result->n_nodes = n_nodes; + + + // leafs + { + uint32_t type; + uint32_t op; + + for (uint32_t i = 0; i < n_leafs; ++i) { + type = *(const uint32_t *) ptr; ptr += sizeof(type); + op = *(const uint32_t *) ptr; ptr += sizeof(op); + + int64_t ne[GGML_V3_MAX_DIMS]; + size_t nb[GGML_V3_MAX_DIMS]; + + for (int j = 0; j < GGML_V3_MAX_DIMS; ++j) { + uint64_t ne_cur; + uint64_t nb_cur; + + ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur); + nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur); + + ne[j] = ne_cur; + nb[j] = nb_cur; + } + + struct ggml_v3_tensor * tensor = ggml_v3_new_tensor(*ctx_eval, (enum ggml_v3_type) type, GGML_V3_MAX_DIMS, ne); + + tensor->op = (enum ggml_v3_op) op; + + memcpy(tensor->name, ptr, GGML_V3_MAX_NAME); ptr += GGML_V3_MAX_NAME; + memcpy(tensor->op_params, ptr, GGML_V3_MAX_OP_PARAMS); ptr += GGML_V3_MAX_OP_PARAMS; + + tensor->data = (void *) ptr; + + for (int j = 0; j < GGML_V3_MAX_DIMS; ++j) { + tensor->nb[j] = nb[j]; + } + + result->leafs[i] = tensor; + + ptr += ggml_v3_nbytes(tensor); + + fprintf(stderr, "%s: loaded leaf %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_v3_nbytes(tensor)); + } + } + + ggml_v3_set_no_alloc(*ctx_eval, false); + + // nodes + { + uint32_t type; + uint32_t op; + + for (uint32_t i = 0; i < n_nodes; ++i) { + type = *(const uint32_t *) ptr; ptr += sizeof(type); + op = *(const uint32_t *) ptr; ptr += sizeof(op); + + enum ggml_v3_op eop = (enum ggml_v3_op) op; + + int64_t ne[GGML_V3_MAX_DIMS]; + size_t nb[GGML_V3_MAX_DIMS]; + + for (int j = 0; j < GGML_V3_MAX_DIMS; ++j) { + uint64_t ne_cur; + uint64_t nb_cur; + + ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur); + nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur); + + ne[j] = ne_cur; + nb[j] = nb_cur; + } + + const char * ptr_name = ptr; ptr += GGML_V3_MAX_NAME; + const char * ptr_op_params = ptr; ptr += GGML_V3_MAX_OP_PARAMS; + + const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += GGML_V3_MAX_SRC*sizeof(int32_t); + + struct ggml_v3_tensor * args[GGML_V3_MAX_SRC] = { NULL }; + + // parse args + for (int j = 0; j < GGML_V3_MAX_SRC; ++j) { + const int32_t arg_idx = ptr_arg_idx[j]; + + if (arg_idx == -1) { + continue; + } + + if (arg_idx < result->n_leafs) { + args[j] = result->leafs[arg_idx]; + } else { + args[j] = result->nodes[arg_idx - result->n_leafs]; + } + } + + // create the tensor + // "view" operations are handled differently + // TODO: handle inplace ops - currently a copy is always made + + struct ggml_v3_tensor * tensor = NULL; + + switch (eop) { + // TODO: implement other view ops + case GGML_V3_OP_RESHAPE: + { + tensor = ggml_v3_reshape_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3]); + } break; + case GGML_V3_OP_VIEW: + { + tensor = ggml_v3_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0); + + size_t offs; + memcpy(&offs, ptr_op_params, 
sizeof(offs)); + + tensor->data = ((char *) tensor->data) + offs; + } break; + case GGML_V3_OP_TRANSPOSE: + { + tensor = ggml_v3_transpose(*ctx_eval, args[0]); + } break; + case GGML_V3_OP_PERMUTE: + { + tensor = ggml_v3_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0); + } break; + default: + { + tensor = ggml_v3_new_tensor(*ctx_eval, (enum ggml_v3_type) type, GGML_V3_MAX_DIMS, ne); + + tensor->op = eop; + } break; + } + + memcpy(tensor->name, ptr_name, GGML_V3_MAX_NAME); + memcpy(tensor->op_params, ptr_op_params, GGML_V3_MAX_OP_PARAMS); + + for (int j = 0; j < GGML_V3_MAX_DIMS; ++j) { + tensor->nb[j] = nb[j]; + } + + for (int j = 0; j < GGML_V3_MAX_SRC; ++j) { + tensor->src[j] = args[j]; + } + + result->nodes[i] = tensor; + + fprintf(stderr, "%s: loaded node %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_v3_nbytes(tensor)); + } + } + } + + return result; +} + +void ggml_v3_graph_print(const struct ggml_v3_cgraph * cgraph) { + int64_t perf_total_per_op_us[GGML_V3_OP_COUNT] = {0}; + + GGML_V3_PRINT("=== GRAPH ===\n"); + + GGML_V3_PRINT("n_nodes = %d\n", cgraph->n_nodes); + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_v3_tensor * node = cgraph->nodes[i]; + + perf_total_per_op_us[node->op] += MAX(1, node->perf_time_us); + + GGML_V3_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n", + i, + node->ne[0], node->ne[1], node->ne[2], + ggml_v3_op_name(node->op), node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs, + (double) node->perf_cycles / (double) ggml_v3_cycles_per_ms(), + (double) node->perf_cycles / (double) ggml_v3_cycles_per_ms() / (double) node->perf_runs, + (double) node->perf_time_us / 1000.0, + (double) node->perf_time_us / 1000.0 / node->perf_runs); + } + + GGML_V3_PRINT("n_leafs = %d\n", cgraph->n_leafs); + for (int i = 0; i < cgraph->n_leafs; i++) { + struct ggml_v3_tensor * node = cgraph->leafs[i]; + + GGML_V3_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n", + i, + node->ne[0], node->ne[1], + ggml_v3_op_name(node->op), + ggml_v3_get_name(node)); + } + + for (int i = 0; i < GGML_V3_OP_COUNT; i++) { + if (perf_total_per_op_us[i] == 0) { + continue; + } + + GGML_V3_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_v3_op_name(i), (double) perf_total_per_op_us[i] / 1000.0); + } + + GGML_V3_PRINT("========================================\n"); +} + +// check if node is part of the graph +static bool ggml_v3_graph_find(const struct ggml_v3_cgraph * cgraph, const struct ggml_v3_tensor * node) { + if (cgraph == NULL) { + return true; + } + + for (int i = 0; i < cgraph->n_nodes; i++) { + if (cgraph->nodes[i] == node) { + return true; + } + } + + return false; +} + +static struct ggml_v3_tensor * ggml_v3_graph_get_parent(const struct ggml_v3_cgraph * cgraph, const struct ggml_v3_tensor * node) { + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_v3_tensor * parent = cgraph->nodes[i]; + + if (parent->grad == node) { + return parent; + } + } + + return NULL; +} + +static void ggml_v3_graph_dump_dot_node_edge(FILE * fp, const struct ggml_v3_cgraph * gb, struct ggml_v3_tensor * node, struct ggml_v3_tensor * parent, const char * label) { + struct ggml_v3_tensor * gparent = ggml_v3_graph_get_parent(gb, node); + struct ggml_v3_tensor * gparent0 = ggml_v3_graph_get_parent(gb, parent); + fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n", + gparent0 ? (void *) gparent0 : (void *) parent, + gparent0 ? 
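/* Editor's sketch (not part of the patch): a minimal round trip through the export
 * and import functions above. ggml_v3_graph_export is assumed to take
 * (cgraph, filename), mirroring upstream ggml; the header path is assumed from the
 * repo layout. */

#include <stdio.h>
#include "otherarch/ggml_v3.h"

static void graph_roundtrip_sketch(struct ggml_v3_cgraph * gf) {
    ggml_v3_graph_export(gf, "graph.ggml");        /* writes leafs, nodes and data       */

    struct ggml_v3_context * ctx_data = NULL;      /* receives the raw file blob         */
    struct ggml_v3_context * ctx_eval = NULL;      /* receives the reconstructed tensors */

    struct ggml_v3_cgraph * gi = ggml_v3_graph_import("graph.ggml", &ctx_data, &ctx_eval);
    if (gi != NULL) {
        ggml_v3_graph_print(gi);                   /* per-node summary, defined above    */
    }

    if (ctx_eval) ggml_v3_free(ctx_eval);
    if (ctx_data) ggml_v3_free(ctx_data);
}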
"g" : "x", + gparent ? (void *) gparent : (void *) node, + gparent ? "g" : "x", + gparent ? "empty" : "vee", + gparent ? "dashed" : "solid", + label); +} + +static void ggml_v3_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_v3_tensor * node, struct ggml_v3_tensor * parent, const char * label) { + fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n", + (void *) parent, "x", + (void *) node, "x", + label); +} + +void ggml_v3_graph_dump_dot(const struct ggml_v3_cgraph * gb, const struct ggml_v3_cgraph * gf, const char * filename) { + char color[16]; + + FILE * fp = fopen(filename, "w"); + GGML_V3_ASSERT(fp); + + fprintf(fp, "digraph G {\n"); + fprintf(fp, " newrank = true;\n"); + fprintf(fp, " rankdir = LR;\n"); + + for (int i = 0; i < gb->n_nodes; i++) { + struct ggml_v3_tensor * node = gb->nodes[i]; + + if (ggml_v3_graph_get_parent(gb, node) != NULL) { + continue; + } + + if (node->is_param) { + snprintf(color, sizeof(color), "yellow"); + } else if (node->grad) { + if (ggml_v3_graph_find(gf, node)) { + snprintf(color, sizeof(color), "green"); + } else { + snprintf(color, sizeof(color), "lightblue"); + } + } else { + snprintf(color, sizeof(color), "white"); + } + + fprintf(fp, " \"%p\" [ " + "style = filled; fillcolor = %s; shape = record; " + "label=\"", + (void *) node, color); + + if (strlen(node->name) > 0) { + fprintf(fp, "%s (%s)|", node->name, ggml_v3_type_name(node->type)); + } else { + fprintf(fp, "(%s)|", ggml_v3_type_name(node->type)); + } + + if (ggml_v3_is_matrix(node)) { + fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | %s", i, node->ne[0], node->ne[1], ggml_v3_op_symbol(node->op)); + } else { + fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | %s", i, node->ne[0], node->ne[1], node->ne[2], ggml_v3_op_symbol(node->op)); + } + + if (node->grad) { + fprintf(fp, " | %s\"; ]\n", ggml_v3_op_symbol(node->grad->op)); + } else { + fprintf(fp, "\"; ]\n"); + } + } + + for (int i = 0; i < gb->n_leafs; i++) { + struct ggml_v3_tensor * node = gb->leafs[i]; + + snprintf(color, sizeof(color), "pink"); + + fprintf(fp, " \"%p\" [ " + "style = filled; fillcolor = %s; shape = record; " + "label=\"", + (void *) node, color); + + if (strlen(node->name) > 0) { + fprintf(fp, "%s (%s)|", node->name, ggml_v3_type_name(node->type)); + } else { + fprintf(fp, "(%s)|", ggml_v3_type_name(node->type)); + } + + fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]); + if (ggml_v3_nelements(node) < 5) { + fprintf(fp, " | ("); + for (int j = 0; j < ggml_v3_nelements(node); j++) { + if (node->type == GGML_V3_TYPE_I8 || node->type == GGML_V3_TYPE_I16 || node->type == GGML_V3_TYPE_I32) { + fprintf(fp, "%d", ggml_v3_get_i32_1d(node, j)); + } + else if (node->type == GGML_V3_TYPE_F32 || node->type == GGML_V3_TYPE_F16) { + fprintf(fp, "%.1e", (double)ggml_v3_get_f32_1d(node, j)); + } + else { + fprintf(fp, "#"); + } + if (j < ggml_v3_nelements(node) - 1) { + fprintf(fp, ", "); + } + } + fprintf(fp, ")"); + } + fprintf(fp, "\"; ]\n"); + } + + for (int i = 0; i < gb->n_nodes; i++) { + struct ggml_v3_tensor * node = gb->nodes[i]; + + for (int j = 0; j < GGML_V3_MAX_SRC; j++) { + if (node->src[j]) { + char label[16]; + snprintf(label, sizeof(label), "src %d", j); + ggml_v3_graph_dump_dot_node_edge(fp, gb, node, node->src[j], label); + } + } + } + + for (int i = 0; i < gb->n_leafs; i++) { + struct ggml_v3_tensor * node = gb->leafs[i]; + + for (int j = 0; j < GGML_V3_MAX_SRC; j++) { + if (node->src[j]) { + char label[16]; + snprintf(label, sizeof(label), "src %d", j); + 
ggml_v3_graph_dump_dot_leaf_edge(fp, node, node->src[j], label); + } + } + } + + fprintf(fp, "}\n"); + + fclose(fp); + + GGML_V3_PRINT("%s: dot -Tpng %s -o %s.png && open %s.png\n", __func__, filename, filename, filename); +} + +//////////////////////////////////////////////////////////////////////////////// + +static void ggml_v3_opt_set_params(int np, struct ggml_v3_tensor * const ps[], const float * x) { + int i = 0; + for (int p = 0; p < np; ++p) { + const int64_t ne = ggml_v3_nelements(ps[p]) ; + // TODO: add function to set tensor from array + for (int64_t j = 0; j < ne; ++j) { + ggml_v3_set_f32_1d(ps[p], j, x[i++]); + } + } +} + +static void ggml_v3_opt_get_params(int np, struct ggml_v3_tensor * const ps[], float * x) { + int i = 0; + for (int p = 0; p < np; ++p) { + const int64_t ne = ggml_v3_nelements(ps[p]) ; + // TODO: add function to get all elements at once + for (int64_t j = 0; j < ne; ++j) { + x[i++] = ggml_v3_get_f32_1d(ps[p], j); + } + } +} + +static void ggml_v3_opt_get_grad(int np, struct ggml_v3_tensor * const ps[], float * g) { + int64_t i = 0; + for (int p = 0; p < np; ++p) { + const int64_t ne = ggml_v3_nelements(ps[p]) ; + // TODO: add function to get all elements at once + for (int64_t j = 0; j < ne; ++j) { + g[i++] = ggml_v3_get_f32_1d(ps[p]->grad, j); + } + } +} + +static void ggml_v3_opt_acc_grad(int np, struct ggml_v3_tensor * const ps[], float * g, float scale) { + int64_t i = 0; + for (int p = 0; p < np; ++p) { + const int64_t ne = ggml_v3_nelements(ps[p]) ; + // TODO: add function to get all elements at once + for (int64_t j = 0; j < ne; ++j) { + g[i++] += ggml_v3_get_f32_1d(ps[p]->grad, j) * scale; + } + } +} + +// +// Using AdamW - ref: https://arxiv.org/pdf/1711.05101v3.pdf +// +// (Original Adam - ref: https://arxiv.org/pdf/1412.6980.pdf) +// + +static enum ggml_v3_opt_result ggml_v3_opt_adam( + struct ggml_v3_context * ctx, + struct ggml_v3_opt_context * opt, + struct ggml_v3_opt_params params, + struct ggml_v3_tensor * f, + struct ggml_v3_cgraph * gf, + struct ggml_v3_cgraph * gb, + ggml_v3_opt_callback callback, + void * callback_data) { + GGML_V3_ASSERT(ggml_v3_is_scalar(f)); + + // these will store the parameters we want to optimize + struct ggml_v3_tensor * ps[GGML_V3_MAX_PARAMS]; + + int np = 0; + int64_t nx = 0; + for (int i = 0; i < gf->n_nodes; ++i) { + if (gf->nodes[i]->is_param) { + GGML_V3_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); + + GGML_V3_ASSERT(np < GGML_V3_MAX_PARAMS); + + ps[np++] = gf->nodes[i]; + nx += ggml_v3_nelements(gf->nodes[i]); + } + } + + if ((opt->params.type != params.type) || (opt->nx != nx) || (opt->params.past != params.past)) { + int iter = opt->iter; + ggml_v3_opt_init(opt->ctx, opt, params, nx); + opt->iter = iter; + } + + // constants + float sched = params.adam.sched; + const float alpha = params.adam.alpha; + const float decay = params.adam.decay * alpha; + const float beta1 = params.adam.beta1; + const float beta2 = params.adam.beta2; + const float eps = params.adam.eps; + const float gclip = params.adam.gclip; + const int decay_min_ndim = params.adam.decay_min_ndim; + const int n_accum = MAX(1, params.n_gradient_accumulation); + const float accum_norm = 1.0f / (float) n_accum; + + float * g = opt->adam.g->data; // gradients + float * m = opt->adam.m->data; // first moment + float * v = opt->adam.v->data; // second moment + + float * pf = params.past > 0 ? 
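/* Editor's note (not part of the patch): in the DOT output above, parameter nodes are
 * filled yellow, nodes with a gradient are green when they also appear in the forward
 * graph gf (lightblue otherwise), other nodes are white and leafs are pink. A typical
 * call site - gf may be NULL, since ggml_v3_graph_find treats a NULL graph as "found": */

#include "otherarch/ggml_v3.h"     /* header path assumed from the repo layout */

static void dump_graph_sketch(struct ggml_v3_cgraph * gf, struct ggml_v3_cgraph * gb) {
    ggml_v3_graph_dump_dot(gb, gf, "opt-backward.dot");
    /* render from a shell: dot -Tpng opt-backward.dot -o opt-backward.png */
}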
opt->adam.pf->data : NULL; // past function values + + struct ggml_v3_cplan cplan = ggml_v3_graph_plan(gb, params.n_threads); + struct ggml_v3_object * obj = ggml_v3_new_object(ctx, GGML_V3_OBJECT_WORK_BUFFER, cplan.work_size); + cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; + + bool cancel = false; + + // compute the function value + float fx = 0; + ggml_v3_set_zero(opt->adam.g); + for (int accum_step = 0; accum_step < n_accum; ++accum_step) { + if (callback) { + callback(callback_data, accum_step, &sched, &cancel); + if (cancel) { + return GGML_V3_OPT_CANCEL; + } + } + // ggml_v3_graph_reset (gf); + ggml_v3_set_f32 (f->grad, 1.0f); + ggml_v3_graph_compute(gb, &cplan); + ggml_v3_opt_acc_grad(np, ps, g, accum_norm); + fx += ggml_v3_get_f32_1d(f, 0); + } + fx *= accum_norm; + + opt->adam.fx_prev = fx; + opt->adam.fx_best = opt->adam.fx_prev; + if (pf) { + pf[opt->iter % params.past] = opt->adam.fx_prev; + } + + opt->loss_before = opt->adam.fx_prev; + opt->loss_after = opt->adam.fx_prev; + + // initialize + if (opt->just_initialized) { + opt->adam.n_no_improvement = 0; + opt->just_initialized = false; + } + + float * fx_best = &opt->adam.fx_best; + float * fx_prev = &opt->adam.fx_prev; + int * n_no_improvement = &opt->adam.n_no_improvement; + + int iter0 = opt->iter; + + // run the optimizer + for (int t = 0; t < params.adam.n_iter; ++t) { + opt->iter = iter0 + t + 1; + GGML_V3_PRINT_DEBUG ("=== iter %d ===\n", t); + + GGML_V3_PRINT_DEBUG ("f = %10.6f\n", ggml_v3_get_f32_1d(f, 0)); + GGML_V3_PRINT_DEBUG_5("df/dx0 = %10.6f\n", ggml_v3_get_f32_1d(ps[0]->grad, 0)); + GGML_V3_PRINT_DEBUG_5("df/dx1 = %10.6f\n", ggml_v3_get_f32_1d(ps[1]->grad, 0)); + + for (int i = 0; i < np; ++i) { + GGML_V3_PRINT_DEBUG("param %d: %10.6f, g = %10.6f\n", i, + ggml_v3_get_f32_1d(ps[i], 0), ggml_v3_get_f32_1d(ps[i]->grad, 0)); + } + + const int64_t t_start_wall = ggml_v3_time_us(); + const int64_t t_start_cpu = ggml_v3_cycles(); + UNUSED(t_start_wall); + UNUSED(t_start_cpu); + + { + float gnorm = 1.0f; + if (gclip > 0.0f) { + // gradient clipping + ggml_v3_float sum = 0.0; + for (int64_t i = 0; i < nx; ++i) { + sum += (ggml_v3_float)(g[i]*g[i]); + } + ggml_v3_float norm = sqrt(sum); + if (norm > (ggml_v3_float) gclip) { + gnorm = (float) ((ggml_v3_float) gclip / norm); + } + } + const float beta1h = alpha*sched/(1.0f - powf(beta1, opt->iter)); + const float beta2h = 1.0f/(1.0f - powf(beta2, opt->iter)); + int64_t i = 0; + for (int p = 0; p < np; ++p) { + const int64_t ne = ggml_v3_nelements(ps[p]); + const float p_decay = ((ggml_v3_n_dims(ps[p]) >= decay_min_ndim) ? 
decay : 0.0f) * sched; + for (int64_t j = 0; j < ne; ++j) { + float x = ggml_v3_get_f32_1d(ps[p], j); + float g_ = g[i]*gnorm; + m[i] = m[i]*beta1 + g_*(1.0f - beta1); + v[i] = v[i]*beta2 + g_*g_*(1.0f - beta2); + float mh = m[i]*beta1h; + float vh = v[i]*beta2h; + vh = sqrtf(vh) + eps; + x = x*(1.0f - p_decay) - mh/vh; + ggml_v3_set_f32_1d(ps[p], j, x); + ++i; + } + } + } + + fx = 0; + ggml_v3_set_zero(opt->adam.g); + for (int accum_step = 0; accum_step < n_accum; ++accum_step) { + if (callback) { + callback(callback_data, accum_step, &sched, &cancel); + if (cancel) { + return GGML_V3_OPT_CANCEL;; + } + } + // ggml_v3_graph_reset (gf); + ggml_v3_set_f32 (f->grad, 1.0f); + ggml_v3_graph_compute(gb, &cplan); + ggml_v3_opt_acc_grad(np, ps, g, accum_norm); + fx += ggml_v3_get_f32_1d(f, 0); + } + fx *= accum_norm; + + opt->loss_after = fx; + + // check convergence + if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) { + GGML_V3_PRINT_DEBUG("converged\n"); + + return GGML_V3_OPT_OK; + } + + // delta-based convergence test + if (pf != NULL) { + // need at least params.past iterations to start checking for convergence + if (params.past <= iter0 + t) { + const float rate = (pf[(iter0 + t)%params.past] - fx)/fx; + + if (fabsf(rate) < params.delta) { + return GGML_V3_OPT_OK; + } + } + + pf[(iter0 + t)%params.past] = fx; + } + + // check for improvement + if (params.max_no_improvement > 0) { + if (fx_best[0] > fx) { + fx_best[0] = fx; + n_no_improvement[0] = 0; + } else { + ++n_no_improvement[0]; + + if (n_no_improvement[0] >= params.max_no_improvement) { + return GGML_V3_OPT_OK; + } + } + } + + fx_prev[0] = fx; + + { + const int64_t t_end_cpu = ggml_v3_cycles(); + GGML_V3_PRINT_DEBUG("time iter: %5.3f s\n", ((float)(t_end_cpu - t_start_cpu))/CLOCKS_PER_SEC); + UNUSED(t_end_cpu); + + const int64_t t_end_wall = ggml_v3_time_us(); + GGML_V3_PRINT_DEBUG("wall time iter: %5.3f s\n", (t_end_wall - t_start_wall)/1e6); + UNUSED(t_end_wall); + } + } + + return GGML_V3_OPT_DID_NOT_CONVERGE; +} + +// +// L-BFGS +// +// the L-BFGS implementation below is based on the following implementation: +// +// https://github.com/chokkan/liblbfgs +// + +struct ggml_v3_lbfgs_iteration_data { + float alpha; + float ys; + float * s; + float * y; +}; + +static enum ggml_v3_opt_result linesearch_backtracking( + const struct ggml_v3_opt_params * params, + int nx, + float * x, + float * fx, + float * g, + float * d, + float * step, + const float * xp, + struct ggml_v3_tensor * f, + struct ggml_v3_cgraph * gb, + struct ggml_v3_cplan * cplan, + const int np, + struct ggml_v3_tensor * ps[], + bool * cancel, + ggml_v3_opt_callback callback, + void * callback_data) { + int count = 0; + + float width = 0.0f; + float dg = 0.0f; + float finit = 0.0f; + float dginit = 0.0f; + float dgtest = 0.0f; + + const float dec = 0.5f; + const float inc = 2.1f; + + const int n_accum = MAX(1, params->n_gradient_accumulation); + const float accum_norm = 1.0f / (float) n_accum; + + if (*step <= 0.f) { + return GGML_V3_LINESEARCH_INVALID_PARAMETERS; + } + + // compute the initial gradient in the search direction + ggml_v3_vec_dot_f32(nx, &dginit, g, d); + + // make sure that d points to a descent direction + if (0 < dginit) { + return GGML_V3_LINESEARCH_FAIL; + } + + // initialize local variables + finit = *fx; + dgtest = params->lbfgs.ftol*dginit; + + while (true) { + ggml_v3_vec_cpy_f32(nx, x, xp); + ggml_v3_vec_mad_f32(nx, x, d, *step); + + // evaluate the function and gradient values + { + ggml_v3_opt_set_params(np, ps, x); + + *fx = 0; + memset(g, 
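/* Editor's sketch (not part of the patch): the per-element update performed by the
 * loop above, isolated - global-norm gradient clipping, bias-corrected first/second
 * moments, and decoupled (AdamW-style) weight decay. In the real code the decay is
 * additionally gated per tensor by decay_min_ndim; here it is applied uniformly, and
 * `decay` is expected to already include the factor alpha, as in the caller. */

#include <math.h>
#include <stdint.h>

static void adamw_step_sketch(float * x, const float * g, float * m, float * v,
                              int64_t nx, float alpha, float beta1, float beta2,
                              float eps, float decay, float gclip, float sched,
                              int iter /* 1-based, like opt->iter */) {
    /* gradient clipping by global L2 norm */
    float gnorm = 1.0f;
    if (gclip > 0.0f) {
        double sum = 0.0;
        for (int64_t i = 0; i < nx; ++i) {
            sum += (double) g[i] * (double) g[i];
        }
        const double norm = sqrt(sum);
        if (norm > (double) gclip) {
            gnorm = (float) ((double) gclip / norm);
        }
    }

    /* bias-corrected step sizes, exactly as computed above */
    const float beta1h = alpha*sched/(1.0f - powf(beta1, iter));
    const float beta2h =        1.0f/(1.0f - powf(beta2, iter));

    for (int64_t i = 0; i < nx; ++i) {
        const float gi = g[i]*gnorm;
        m[i] = m[i]*beta1 + gi*(1.0f - beta1);            /* first moment          */
        v[i] = v[i]*beta2 + gi*gi*(1.0f - beta2);         /* second moment         */
        const float mh = m[i]*beta1h;
        const float vh = sqrtf(v[i]*beta2h) + eps;
        x[i] = x[i]*(1.0f - decay*sched) - mh/vh;         /* decay, then Adam step */
    }
}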
0, sizeof(float)*nx); + for (int accum_step = 0; accum_step < n_accum; ++accum_step) { + if (callback) { + // LBFG-S does not support learning rate -> ignore learning schedule + float sched = 0; + callback(callback_data, accum_step, &sched, cancel); + if (*cancel) { + return GGML_V3_OPT_CANCEL; + } + } + // ggml_v3_graph_reset (gf); + ggml_v3_set_f32 (f->grad, 1.0f); + ggml_v3_graph_compute(gb, cplan); + ggml_v3_opt_acc_grad(np, ps, g, accum_norm); + *fx += ggml_v3_get_f32_1d(f, 0); + } + *fx *= accum_norm; + + } + + ++count; + + if (*fx > finit + (*step)*dgtest) { + width = dec; + } else { + // Armijo condition is satisfied + if (params->lbfgs.linesearch == GGML_V3_LINESEARCH_BACKTRACKING_ARMIJO) { + return count; + } + + ggml_v3_vec_dot_f32(nx, &dg, g, d); + + // check the Wolfe condition + if (dg < params->lbfgs.wolfe * dginit) { + width = inc; + } else { + if(params->lbfgs.linesearch == GGML_V3_LINESEARCH_BACKTRACKING_WOLFE) { + // regular Wolfe conditions + return count; + } + + if(dg > -params->lbfgs.wolfe*dginit) { + width = dec; + } else { + // strong Wolfe condition (GGML_V3_LINESEARCH_BACKTRACKING_STRONG_WOLFE) + return count; + } + } + } + + if (*step < params->lbfgs.min_step) { + return GGML_V3_LINESEARCH_MINIMUM_STEP; + } + if (*step > params->lbfgs.max_step) { + return GGML_V3_LINESEARCH_MAXIMUM_STEP; + } + if (params->lbfgs.max_linesearch <= count) { + return GGML_V3_LINESEARCH_MAXIMUM_ITERATIONS; + } + + (*step) *= width; + } + + GGML_V3_UNREACHABLE(); +} + +static enum ggml_v3_opt_result ggml_v3_opt_lbfgs( + struct ggml_v3_context * ctx, + struct ggml_v3_opt_context * opt, + struct ggml_v3_opt_params params, + struct ggml_v3_tensor * f, + struct ggml_v3_cgraph * gf, + struct ggml_v3_cgraph * gb, + ggml_v3_opt_callback callback, + void * callback_data) { + if (params.lbfgs.linesearch == GGML_V3_LINESEARCH_BACKTRACKING_WOLFE || + params.lbfgs.linesearch == GGML_V3_LINESEARCH_BACKTRACKING_STRONG_WOLFE) { + if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) { + return GGML_V3_OPT_INVALID_WOLFE; + } + } + + const int m = params.lbfgs.m; + + // these will store the parameters we want to optimize + struct ggml_v3_tensor * ps[GGML_V3_MAX_PARAMS]; + + int np = 0; + int nx = 0; + for (int i = 0; i < gf->n_nodes; ++i) { + if (gf->nodes[i]->is_param) { + GGML_V3_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); + + GGML_V3_ASSERT(np < GGML_V3_MAX_PARAMS); + + ps[np++] = gf->nodes[i]; + nx += ggml_v3_nelements(gf->nodes[i]); + } + } + + if ((opt->params.type != params.type) || (opt->nx != nx) || (opt->params.past != params.past) || (opt->params.lbfgs.m != params.lbfgs.m)) { + int iter = opt->iter; + ggml_v3_opt_init(ctx, opt, params, nx); + opt->iter = iter; + } + + struct ggml_v3_cplan cplan = ggml_v3_graph_plan(gb, params.n_threads); + struct ggml_v3_object * obj = ggml_v3_new_object(ctx, GGML_V3_OBJECT_WORK_BUFFER, cplan.work_size); + cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; + + float * x = opt->lbfgs.x->data; // current parameters + float * xp = opt->lbfgs.xp->data; // previous parameters + float * g = opt->lbfgs.g->data; // current gradient + float * gp = opt->lbfgs.gp->data; // previous gradient + float * d = opt->lbfgs.d->data; // search direction + + float * pf = params.past > 0 ? 
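/* Editor's sketch (not part of the patch): the exit logic of the backtracking loop
 * above, isolated. `dg` is the directional derivative g.d at the trial point and
 * `dginit` the one at the start of the search (both negative for a descent
 * direction); dgtest = ftol*dginit is precomputed by the caller. SHRINK multiplies
 * the step by dec (0.5), GROW by inc (2.1). */

enum ls_kind     { LS_ARMIJO, LS_WOLFE, LS_STRONG_WOLFE };
enum ls_decision { LS_ACCEPT, LS_SHRINK, LS_GROW };

static enum ls_decision linesearch_decide_sketch(enum ls_kind kind,
                                                 float fx, float finit, float step,
                                                 float dgtest, float dg, float dginit,
                                                 float wolfe) {
    if (fx > finit + step*dgtest) {
        return LS_SHRINK;          /* sufficient decrease (Armijo) violated            */
    }
    if (kind == LS_ARMIJO) {
        return LS_ACCEPT;          /* Armijo alone requested - done                    */
    }
    if (dg < wolfe*dginit) {
        return LS_GROW;            /* curvature condition violated - step too small    */
    }
    if (kind == LS_WOLFE) {
        return LS_ACCEPT;          /* regular Wolfe conditions hold                    */
    }
    if (dg > -wolfe*dginit) {
        return LS_SHRINK;          /* strong Wolfe: |dg| must not exceed -wolfe*dginit */
    }
    return LS_ACCEPT;              /* strong Wolfe conditions hold                     */
}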
opt->lbfgs.pf->data : NULL; // past function values + + const int n_accum = MAX(1, params.n_gradient_accumulation); + const float accum_norm = 1.0f / (float) n_accum; + + float fx = 0.0f; // cost function value + float xnorm = 0.0f; // ||x|| + float gnorm = 0.0f; // ||g|| + + // initialize x from the graph nodes + ggml_v3_opt_get_params(np, ps, x); + + // the L-BFGS memory + float * lm_alpha = opt->lbfgs.lmal->data; + float * lm_ys = opt->lbfgs.lmys->data; + float * lm_s = opt->lbfgs.lms->data; + float * lm_y = opt->lbfgs.lmy->data; + + bool cancel = false; + + // evaluate the function value and its gradient + { + ggml_v3_opt_set_params(np, ps, x); + + fx = 0; + memset(g, 0, sizeof(float)*nx); + for (int accum_step = 0; accum_step < n_accum; ++accum_step) { + if (callback) { + // LBFG-S does not support learning rate -> ignore learning schedule + float sched = 0; + callback(callback_data, accum_step, &sched, &cancel); + if (cancel) { + return GGML_V3_OPT_CANCEL; + } + } + // ggml_v3_graph_reset (gf); + ggml_v3_set_f32 (f->grad, 1.0f); + ggml_v3_graph_compute(gb, &cplan); + ggml_v3_opt_acc_grad(np, ps, g, accum_norm); + fx += ggml_v3_get_f32_1d(f, 0); + } + fx *= accum_norm; + + opt->loss_before = fx; + opt->loss_after = fx; + } + + // search direction = -gradient + ggml_v3_vec_neg_f32(nx, d, g); + + // ||x||, ||g|| + ggml_v3_vec_norm_f32(nx, &xnorm, x); + ggml_v3_vec_norm_f32(nx, &gnorm, g); + + if (xnorm < 1.0f) { + xnorm = 1.0f; + } + + // already optimized + if (gnorm/xnorm <= params.lbfgs.eps) { + return GGML_V3_OPT_OK; + } + + if (opt->just_initialized) { + if (pf) { + pf[0] = fx; + } + opt->lbfgs.fx_best = fx; + + // initial step + ggml_v3_vec_norm_inv_f32(nx, &opt->lbfgs.step, d); + opt->lbfgs.j = 0; + opt->lbfgs.k = 1; + opt->lbfgs.end = 0; + opt->lbfgs.n_no_improvement = 0; + opt->just_initialized = false; + } + + float * fx_best = &opt->lbfgs.fx_best; + float * step = &opt->lbfgs.step; + int * j = &opt->lbfgs.j; + int * k = &opt->lbfgs.k; + int * end = &opt->lbfgs.end; + int * n_no_improvement = &opt->lbfgs.n_no_improvement; + + int ls = 0; + int bound = 0; + + float ys = 0.0f; + float yy = 0.0f; + float beta = 0.0f; + + int it = 0; + + while (true) { + // store the current position and gradient vectors + ggml_v3_vec_cpy_f32(nx, xp, x); + ggml_v3_vec_cpy_f32(nx, gp, g); + + // TODO: instead of passing &cancel here, use the return code of the linesearch + // to determine if the optimization should be cancelled + // this is a simple change, but not doing this atm, since I don't have a nice + // way to test and don't want to break something with so many changes lined up + ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data); + if (cancel) { + return GGML_V3_OPT_CANCEL; + } + + if (ls < 0) { + // linesearch failed - go back to the previous point and return + ggml_v3_vec_cpy_f32(nx, x, xp); + ggml_v3_vec_cpy_f32(nx, g, gp); + + return ls; + } + + opt->loss_after = fx; + + ggml_v3_vec_norm_f32(nx, &xnorm, x); + ggml_v3_vec_norm_f32(nx, &gnorm, g); + + GGML_V3_PRINT_DEBUG("f = %10.6f\n", ggml_v3_get_f32_1d(f, 0)); + + if (xnorm < 1.0f) { + xnorm = 1.0f; + } + if (gnorm/xnorm <= params.lbfgs.eps) { + // converged + return GGML_V3_OPT_OK; + } + + // delta-based convergence test + if (pf != NULL) { + // need at least params.past iterations to start checking for convergence + if (params.past <= k[0]) { + const float rate = (pf[k[0]%params.past] - fx)/fx; + + if (fabsf(rate) < params.delta) { + return GGML_V3_OPT_OK; + } + } + 
+ pf[k[0]%params.past] = fx; + } + + // check for improvement + if (params.max_no_improvement > 0) { + if (fx < fx_best[0]) { + fx_best[0] = fx; + n_no_improvement[0] = 0; + } else { + n_no_improvement[0]++; + + if (n_no_improvement[0] >= params.max_no_improvement) { + return GGML_V3_OPT_OK; + } + } + } + + if (params.lbfgs.n_iter != 0 && params.lbfgs.n_iter < it + 1) { + // reached the maximum number of iterations + return GGML_V3_OPT_DID_NOT_CONVERGE; + } + + // update vectors s and y: + // s_{k+1} = x_{k+1} - x_{k} = \step * d_{k}. + // y_{k+1} = g_{k+1} - g_{k}. + // + ggml_v3_vec_sub_f32(nx, &lm_s[end[0]*nx], x, xp); + ggml_v3_vec_sub_f32(nx, &lm_y[end[0]*nx], g, gp); + + // compute scalars ys and yy: + // ys = y^t \cdot s -> 1 / \rho. + // yy = y^t \cdot y. + // + ggml_v3_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]); + ggml_v3_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]); + + lm_ys[end[0]] = ys; + + // find new search direction + // ref: https://en.wikipedia.org/wiki/Limited-memory_BFGS + + bound = (m <= k[0]) ? m : k[0]; + k[0]++; + it++; + end[0] = (end[0] + 1)%m; + + // initialize search direction with -g + ggml_v3_vec_neg_f32(nx, d, g); + + j[0] = end[0]; + for (int i = 0; i < bound; ++i) { + j[0] = (j[0] + m - 1) % m; + // \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1} + ggml_v3_vec_dot_f32(nx, &lm_alpha[j[0]], &lm_s[j[0]*nx], d); + lm_alpha[j[0]] /= lm_ys[j[0]]; + // q_{i} = q_{i+1} - \alpha_{i} y_{i} + ggml_v3_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]); + } + + ggml_v3_vec_scale_f32(nx, d, ys/yy); + + for (int i = 0; i < bound; ++i) { + // \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i} + ggml_v3_vec_dot_f32(nx, &beta, &lm_y[j[0]*nx], d); + beta /= lm_ys[j[0]]; + // \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j} + ggml_v3_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta); + j[0] = (j[0] + 1)%m; + } + + step[0] = 1.0; + } + + GGML_V3_UNREACHABLE(); +} + +struct ggml_v3_opt_params ggml_v3_opt_default_params(enum ggml_v3_opt_type type) { + struct ggml_v3_opt_params result; + + switch (type) { + case GGML_V3_OPT_ADAM: + { + result = (struct ggml_v3_opt_params) { + .type = GGML_V3_OPT_ADAM, + .graph_size = GGML_V3_DEFAULT_GRAPH_SIZE, + .n_threads = 1, // FIXME: GGML_V3_DEFAULT_N_THREADS ? 
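/* Editor's sketch (not part of the patch): the two-loop recursion above in isolation,
 * with plain dot/axpy helpers instead of the ggml_v3_vec_* primitives and the (s, y)
 * history passed as arrays of row pointers rather than one flat buffer. `gamma` is
 * the ys/yy scaling of the newest pair, `end` the ring-buffer head, `bound` = min(m, k). */

#include <stddef.h>

static float dot_sketch(const float * a, const float * b, size_t n) {
    float s = 0.0f;
    for (size_t i = 0; i < n; ++i) s += a[i]*b[i];
    return s;
}

static void axpy_sketch(float * y, const float * x, float a, size_t n) {  /* y += a*x */
    for (size_t i = 0; i < n; ++i) y[i] += a*x[i];
}

static void lbfgs_direction_sketch(float * d, const float * g,
                                   float * const * s, float * const * y,
                                   const float * ys, float * alpha,
                                   int bound, int m, int end, float gamma, size_t nx) {
    for (size_t i = 0; i < nx; ++i) d[i] = -g[i];       /* start from -gradient          */

    int j = end;
    for (int i = 0; i < bound; ++i) {                   /* first loop: newest -> oldest  */
        j = (j + m - 1) % m;
        alpha[j] = dot_sketch(s[j], d, nx) / ys[j];     /* alpha_j = rho_j * s_j . q     */
        axpy_sketch(d, y[j], -alpha[j], nx);            /* q -= alpha_j * y_j            */
    }

    for (size_t i = 0; i < nx; ++i) d[i] *= gamma;      /* initial Hessian scaling ys/yy */

    for (int i = 0; i < bound; ++i) {                   /* second loop: oldest -> newest */
        const float beta = dot_sketch(y[j], d, nx) / ys[j];
        axpy_sketch(d, s[j], alpha[j] - beta, nx);      /* r += (alpha_j - beta) * s_j   */
        j = (j + 1) % m;
    }
}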
+ .past = 0, + .delta = 1e-5f, + + .max_no_improvement = 100, + + .print_forward_graph = true, + .print_backward_graph = true, + + .n_gradient_accumulation = 1, + + .adam = { + .n_iter = 10000, + .sched = 1.000f, + .decay = 0.0f, + .decay_min_ndim = 2, + .alpha = 0.001f, + .beta1 = 0.9f, + .beta2 = 0.999f, + .eps = 1e-8f, + .eps_f = 1e-5f, + .eps_g = 1e-3f, + .gclip = 0.0f, + }, + }; + } break; + case GGML_V3_OPT_LBFGS: + { + result = (struct ggml_v3_opt_params) { + .type = GGML_V3_OPT_LBFGS, + .graph_size = GGML_V3_DEFAULT_GRAPH_SIZE, + .n_threads = 1, + .past = 0, + .delta = 1e-5f, + + .max_no_improvement = 0, + + .print_forward_graph = true, + .print_backward_graph = true, + + .n_gradient_accumulation = 1, + + .lbfgs = { + .m = 6, + .n_iter = 100, + .max_linesearch = 20, + + .eps = 1e-5f, + .ftol = 1e-4f, + .wolfe = 0.9f, + .min_step = 1e-20f, + .max_step = 1e+20f, + + .linesearch = GGML_V3_LINESEARCH_DEFAULT, + }, + }; + } break; + } + + return result; +} + +GGML_V3_API void ggml_v3_opt_init( + struct ggml_v3_context * ctx, + struct ggml_v3_opt_context * opt, + struct ggml_v3_opt_params params, + int64_t nx) { + opt->ctx = ctx; + opt->params = params; + opt->iter = 0; + opt->nx = nx; + opt->just_initialized = true; + if (opt->ctx == NULL) { + struct ggml_v3_init_params ctx_opt_params; + if (opt->params.type == GGML_V3_OPT_ADAM) { + ctx_opt_params.mem_size = GGML_V3_MEM_ALIGN*3 + ggml_v3_tensor_overhead()*3 + ggml_v3_type_size(GGML_V3_TYPE_F32)*nx*3; + if (opt->params.past > 0) { + ctx_opt_params.mem_size += GGML_V3_MEM_ALIGN + ggml_v3_tensor_overhead() + ggml_v3_type_size(GGML_V3_TYPE_F32)*opt->params.past; + } + } else if (opt->params.type == GGML_V3_OPT_LBFGS) { + ctx_opt_params.mem_size = GGML_V3_MEM_ALIGN*9 + ggml_v3_tensor_overhead()*9 + ggml_v3_type_size(GGML_V3_TYPE_F32)*(nx*5 + opt->params.lbfgs.m*2 + nx*opt->params.lbfgs.m*2); + if (opt->params.past > 0) { + ctx_opt_params.mem_size += GGML_V3_MEM_ALIGN + ggml_v3_tensor_overhead() + ggml_v3_type_size(GGML_V3_TYPE_F32)*opt->params.past; + } + } + ctx_opt_params.mem_buffer = NULL; + ctx_opt_params.no_alloc = false; + + opt->ctx = ggml_v3_init(ctx_opt_params); + } + switch (opt->params.type) { + case GGML_V3_OPT_ADAM: + { + opt->adam.g = ggml_v3_new_tensor_1d(opt->ctx, GGML_V3_TYPE_F32, nx); + opt->adam.m = ggml_v3_new_tensor_1d(opt->ctx, GGML_V3_TYPE_F32, nx); + opt->adam.v = ggml_v3_new_tensor_1d(opt->ctx, GGML_V3_TYPE_F32, nx); + opt->adam.pf = params.past > 0 + ? ggml_v3_new_tensor_1d(opt->ctx, GGML_V3_TYPE_F32, params.past) + : NULL; + ggml_v3_set_zero(opt->adam.m); + ggml_v3_set_zero(opt->adam.v); + if (opt->adam.pf) { + ggml_v3_set_zero(opt->adam.pf); + } + } break; + case GGML_V3_OPT_LBFGS: + { + opt->lbfgs.x = ggml_v3_new_tensor_1d(opt->ctx, GGML_V3_TYPE_F32, nx); + opt->lbfgs.xp = ggml_v3_new_tensor_1d(opt->ctx, GGML_V3_TYPE_F32, nx); + opt->lbfgs.g = ggml_v3_new_tensor_1d(opt->ctx, GGML_V3_TYPE_F32, nx); + opt->lbfgs.gp = ggml_v3_new_tensor_1d(opt->ctx, GGML_V3_TYPE_F32, nx); + opt->lbfgs.d = ggml_v3_new_tensor_1d(opt->ctx, GGML_V3_TYPE_F32, nx); + opt->lbfgs.pf = params.past > 0 + ? 
ggml_v3_new_tensor_1d(opt->ctx, GGML_V3_TYPE_F32, params.past) + : NULL; + opt->lbfgs.lmal = ggml_v3_new_tensor_1d(opt->ctx, GGML_V3_TYPE_F32, params.lbfgs.m); + opt->lbfgs.lmys = ggml_v3_new_tensor_1d(opt->ctx, GGML_V3_TYPE_F32, params.lbfgs.m); + opt->lbfgs.lms = ggml_v3_new_tensor_2d(opt->ctx, GGML_V3_TYPE_F32, nx, params.lbfgs.m); + opt->lbfgs.lmy = ggml_v3_new_tensor_2d(opt->ctx, GGML_V3_TYPE_F32, nx, params.lbfgs.m); + ggml_v3_set_zero(opt->lbfgs.x); + ggml_v3_set_zero(opt->lbfgs.xp); + ggml_v3_set_zero(opt->lbfgs.g); + ggml_v3_set_zero(opt->lbfgs.gp); + ggml_v3_set_zero(opt->lbfgs.d); + if (opt->lbfgs.pf) { + ggml_v3_set_zero(opt->lbfgs.pf); + } + ggml_v3_set_zero(opt->lbfgs.lmal); + ggml_v3_set_zero(opt->lbfgs.lmys); + ggml_v3_set_zero(opt->lbfgs.lms); + ggml_v3_set_zero(opt->lbfgs.lmy); + } break; + } +} + +enum ggml_v3_opt_result ggml_v3_opt( + struct ggml_v3_context * ctx, + struct ggml_v3_opt_params params, + struct ggml_v3_tensor * f) { + bool free_ctx = false; + if (ctx == NULL) { + struct ggml_v3_init_params params_ctx = { + .mem_size = 16*1024*1024, + .mem_buffer = NULL, + .no_alloc = false, + }; + + ctx = ggml_v3_init(params_ctx); + if (ctx == NULL) { + return GGML_V3_OPT_NO_CONTEXT; + } + + free_ctx = true; + } + + enum ggml_v3_opt_result result = GGML_V3_OPT_OK; + + struct ggml_v3_opt_context * opt = (struct ggml_v3_opt_context *) alloca(sizeof(struct ggml_v3_opt_context)); + + ggml_v3_opt_init(ctx, opt, params, 0); + result = ggml_v3_opt_resume(ctx, opt, f); + + if (free_ctx) { + ggml_v3_free(ctx); + } + + return result; +} + +enum ggml_v3_opt_result ggml_v3_opt_resume( + struct ggml_v3_context * ctx, + struct ggml_v3_opt_context * opt, + struct ggml_v3_tensor * f) { + + // build forward + backward compute graphs + struct ggml_v3_cgraph * gf = ggml_v3_new_graph_custom(ctx, opt->params.graph_size, true); + ggml_v3_build_forward_expand(gf, f); + + struct ggml_v3_cgraph * gb = ggml_v3_graph_dup(ctx, gf); + ggml_v3_build_backward_expand(ctx, gf, gb, true); + + return ggml_v3_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL); +} + +enum ggml_v3_opt_result ggml_v3_opt_resume_g( + struct ggml_v3_context * ctx, + struct ggml_v3_opt_context * opt, + struct ggml_v3_tensor * f, + struct ggml_v3_cgraph * gf, + struct ggml_v3_cgraph * gb, + ggml_v3_opt_callback callback, + void * callback_data) { + + // build forward + backward compute graphs + enum ggml_v3_opt_result result = GGML_V3_OPT_OK; + + switch (opt->params.type) { + case GGML_V3_OPT_ADAM: + { + result = ggml_v3_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data); + } break; + case GGML_V3_OPT_LBFGS: + { + result = ggml_v3_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data); + } break; + } + + if (opt->params.print_forward_graph) { + ggml_v3_graph_print (gf); + ggml_v3_graph_dump_dot(gf, NULL, "opt-forward.dot"); + } + + if (opt->params.print_backward_graph) { + ggml_v3_graph_print (gb); + ggml_v3_graph_dump_dot(gb, gf, "opt-backward.dot"); + } + + return result; +} + +//////////////////////////////////////////////////////////////////////////////// + +size_t ggml_v3_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % QK4_0 == 0); + const int nb = k / QK4_0; + + for (int b = 0; b < n; b += k) { + block_q4_0 * restrict y = (block_q4_0 *) dst + b/QK4_0; + + quantize_row_q4_0_reference(src + b, y, k); + + for (int i = 0; i < nb; i++) { + for (int j = 0; j < QK4_0; j += 2) { + const uint8_t vi0 = y[i].qs[j/2] & 0x0F; + const uint8_t vi1 = y[i].qs[j/2] >> 4; + + 
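/* Editor's sketch (not part of the patch): driving the optimizer entry points above
 * end to end on a toy objective f(x) = sum(x*x). The graph-building calls
 * ggml_v3_set_param, ggml_v3_mul and ggml_v3_sum are assumed to exist with their
 * usual ggml signatures under the _v3 prefix; the header path is assumed from the
 * repo layout. */

#include <stdio.h>
#include "otherarch/ggml_v3.h"

int main(void) {
    struct ggml_v3_init_params ip = {
        .mem_size   = 16*1024*1024,
        .mem_buffer = NULL,
        .no_alloc   = false,
    };
    struct ggml_v3_context * ctx = ggml_v3_init(ip);

    struct ggml_v3_tensor * x = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, 4);
    ggml_v3_set_param(ctx, x);                 /* mark x as trainable (assumed API) */
    ggml_v3_set_f32(x, 2.0f);                  /* initial guess                     */

    struct ggml_v3_tensor * f = ggml_v3_sum(ctx, ggml_v3_mul(ctx, x, x));  /* assumed ops */

    struct ggml_v3_opt_params params = ggml_v3_opt_default_params(GGML_V3_OPT_ADAM);
    const enum ggml_v3_opt_result res = ggml_v3_opt(ctx, params, f);

    printf("opt result = %d, x[0] = %f\n", (int) res, (double) ggml_v3_get_f32_1d(x, 0));

    ggml_v3_free(ctx);
    return 0;
}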
hist[vi0]++; + hist[vi1]++; + } + } + } + + return (n/QK4_0*sizeof(block_q4_0)); +} + +size_t ggml_v3_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % QK4_1 == 0); + const int nb = k / QK4_1; + + for (int b = 0; b < n; b += k) { + block_q4_1 * restrict y = (block_q4_1 *) dst + b/QK4_1; + + quantize_row_q4_1_reference(src + b, y, k); + + for (int i = 0; i < nb; i++) { + for (int j = 0; j < QK4_1; j += 2) { + const uint8_t vi0 = y[i].qs[j/2] & 0x0F; + const uint8_t vi1 = y[i].qs[j/2] >> 4; + + hist[vi0]++; + hist[vi1]++; + } + } + } + + return (n/QK4_1*sizeof(block_q4_1)); +} + +size_t ggml_v3_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % QK5_0 == 0); + const int nb = k / QK5_0; + + for (int b = 0; b < n; b += k) { + block_q5_0 * restrict y = (block_q5_0 *)dst + b/QK5_0; + + quantize_row_q5_0_reference(src + b, y, k); + + for (int i = 0; i < nb; i++) { + uint32_t qh; + memcpy(&qh, &y[i].qh, sizeof(qh)); + + for (int j = 0; j < QK5_0; j += 2) { + const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4; + const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12)); + + // cast to 16 bins + const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2; + const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2; + + hist[vi0]++; + hist[vi1]++; + } + } + } + + return (n/QK5_0*sizeof(block_q5_0)); +} + +size_t ggml_v3_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % QK5_1 == 0); + const int nb = k / QK5_1; + + for (int b = 0; b < n; b += k) { + block_q5_1 * restrict y = (block_q5_1 *)dst + b/QK5_1; + + quantize_row_q5_1_reference(src + b, y, k); + + for (int i = 0; i < nb; i++) { + uint32_t qh; + memcpy(&qh, &y[i].qh, sizeof(qh)); + + for (int j = 0; j < QK5_1; j += 2) { + const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4; + const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12)); + + // cast to 16 bins + const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2; + const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2; + + hist[vi0]++; + hist[vi1]++; + } + } + } + + return (n/QK5_1*sizeof(block_q5_1)); +} + +size_t ggml_v3_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + for (int b = 0; b < n; b += k) { + block_q8_0 * restrict y = (block_q8_0 *)dst + b/QK8_0; + + quantize_row_q8_0_reference(src + b, y, k); + + for (int i = 0; i < nb; i++) { + for (int j = 0; j < QK8_0; ++j) { + const int8_t vi = y[i].qs[j]; + + hist[vi/16 + 8]++; + } + } + } + + return (n/QK8_0*sizeof(block_q8_0)); +} + +size_t ggml_v3_quantize_chunk(enum ggml_v3_type type, const float * src, void * dst, int start, int n, int64_t * hist) { + size_t result = 0; + switch (type) { + case GGML_V3_TYPE_Q4_0: + { + GGML_V3_ASSERT(start % QK4_0 == 0); + block_q4_0 * block = (block_q4_0*)dst + start / QK4_0; + result = ggml_v3_quantize_q4_0(src + start, block, n, n, hist); + } break; + case GGML_V3_TYPE_Q4_1: + { + GGML_V3_ASSERT(start % QK4_1 == 0); + block_q4_1 * block = (block_q4_1*)dst + start / QK4_1; + result = ggml_v3_quantize_q4_1(src + start, block, n, n, hist); + } break; + case GGML_V3_TYPE_Q5_0: + { + GGML_V3_ASSERT(start % QK5_0 == 0); + block_q5_0 * block = (block_q5_0*)dst + start / QK5_0; + result = ggml_v3_quantize_q5_0(src + start, block, n, n, hist); + } break; + case GGML_V3_TYPE_Q5_1: + { + GGML_V3_ASSERT(start % QK5_1 == 0); + block_q5_1 * block = (block_q5_1*)dst + start / QK5_1; + result = 
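/* Editor's sketch (not part of the patch): every histogram loop above folds each
 * quantized weight into one of 16 buckets - the 4-bit formats use the low nibble
 * directly, the 5-bit formats recombine the spilled high bit and halve, and Q8_0
 * maps the signed byte into 0..15. The bucket math in isolation: */

#include <stdint.h>

static int hist_bin_q5_sketch(uint8_t low_nibble, uint8_t high_bit /* 0 or 1 */) {
    return (int) (((low_nibble & 0x0F) | (high_bit << 4)) / 2);   /* 32 levels -> 16 buckets */
}

static int hist_bin_q8_sketch(int8_t v) {
    return v/16 + 8;                                              /* -128..127 -> 0..15 */
}

/* Typical use of the quantizers above: pass a 16-slot int64_t histogram and a
 * destination buffer sized with ggml_v3_row_size for the chosen type, e.g. through
 * ggml_v3_quantize_chunk (whose dispatcher continues just below); n must be a
 * multiple of the block size of that type. */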
ggml_v3_quantize_q5_1(src + start, block, n, n, hist); + } break; + case GGML_V3_TYPE_Q8_0: + { + GGML_V3_ASSERT(start % QK8_0 == 0); + block_q8_0 * block = (block_q8_0*)dst + start / QK8_0; + result = ggml_v3_quantize_q8_0(src + start, block, n, n, hist); + } break; + case GGML_V3_TYPE_Q2_K: + { + GGML_V3_ASSERT(start % QK_K == 0); + block_q2_K * block = (block_q2_K*)dst + start / QK_K; + result = ggml_v3_quantize_q2_K(src + start, block, n, n, hist); + } break; + case GGML_V3_TYPE_Q3_K: + { + GGML_V3_ASSERT(start % QK_K == 0); + block_q3_K * block = (block_q3_K*)dst + start / QK_K; + result = ggml_v3_quantize_q3_K(src + start, block, n, n, hist); + } break; + case GGML_V3_TYPE_Q4_K: + { + GGML_V3_ASSERT(start % QK_K == 0); + block_q4_K * block = (block_q4_K*)dst + start / QK_K; + result = ggml_v3_quantize_q4_K(src + start, block, n, n, hist); + } break; + case GGML_V3_TYPE_Q5_K: + { + GGML_V3_ASSERT(start % QK_K == 0); + block_q5_K * block = (block_q5_K*)dst + start / QK_K; + result = ggml_v3_quantize_q5_K(src + start, block, n, n, hist); + } break; + case GGML_V3_TYPE_Q6_K: + { + GGML_V3_ASSERT(start % QK_K == 0); + block_q6_K * block = (block_q6_K*)dst + start / QK_K; + result = ggml_v3_quantize_q6_K(src + start, block, n, n, hist); + } break; + case GGML_V3_TYPE_IQ2_XXS: + { + GGML_V3_ASSERT(start % QK_K == 0); + block_iq2_xxs * block = (block_iq2_xxs*)dst + start / QK_K; + result = ggml_v3_quantize_iq2_xxs(src + start, block, n, n, hist); + } break; + case GGML_V3_TYPE_IQ2_XS: + { + GGML_V3_ASSERT(start % QK_K == 0); + block_iq2_xs * block = (block_iq2_xs*)dst + start / QK_K; + result = ggml_v3_quantize_iq2_xs(src + start, block, n, n, hist); + } break; + case GGML_V3_TYPE_F16: + { + int elemsize = sizeof(ggml_v3_fp16_t); + ggml_v3_fp32_to_fp16_row(src + start, (ggml_v3_fp16_t *)dst + start, n); + result = n * elemsize; + } break; + case GGML_V3_TYPE_F32: + { + int elemsize = sizeof(float); + result = n * elemsize; + memcpy((uint8_t *)dst + start * elemsize, src + start, result); + } break; + default: + assert(false); + } + return result; +} + +//////////////////////////////////////////////////////////////////////////////// + +struct gguf_v3_str { + uint64_t n; // GGUFv2 + char * data; +}; + +static const size_t GGUF_V3_TYPE_SIZE[GGUF_V3_TYPE_COUNT] = { + [GGUF_V3_TYPE_UINT8] = sizeof(uint8_t), + [GGUF_V3_TYPE_INT8] = sizeof(int8_t), + [GGUF_V3_TYPE_UINT16] = sizeof(uint16_t), + [GGUF_V3_TYPE_INT16] = sizeof(int16_t), + [GGUF_V3_TYPE_UINT32] = sizeof(uint32_t), + [GGUF_V3_TYPE_INT32] = sizeof(int32_t), + [GGUF_V3_TYPE_FLOAT32] = sizeof(float), + [GGUF_V3_TYPE_BOOL] = sizeof(bool), + [GGUF_V3_TYPE_STRING] = sizeof(struct gguf_v3_str), + [GGUF_V3_TYPE_UINT64] = sizeof(uint64_t), + [GGUF_V3_TYPE_INT64] = sizeof(int64_t), + [GGUF_V3_TYPE_FLOAT64] = sizeof(double), + [GGUF_V3_TYPE_ARRAY] = 0, // undefined +}; +static_assert(GGUF_V3_TYPE_COUNT == 13, "GGUF_V3_TYPE_COUNT != 13"); + +static const char * GGUF_V3_TYPE_NAME[GGUF_V3_TYPE_COUNT] = { + [GGUF_V3_TYPE_UINT8] = "u8", + [GGUF_V3_TYPE_INT8] = "i8", + [GGUF_V3_TYPE_UINT16] = "u16", + [GGUF_V3_TYPE_INT16] = "i16", + [GGUF_V3_TYPE_UINT32] = "u32", + [GGUF_V3_TYPE_INT32] = "i32", + [GGUF_V3_TYPE_FLOAT32] = "f32", + [GGUF_V3_TYPE_BOOL] = "bool", + [GGUF_V3_TYPE_STRING] = "str", + [GGUF_V3_TYPE_ARRAY] = "arr", + [GGUF_V3_TYPE_UINT64] = "u64", + [GGUF_V3_TYPE_INT64] = "i64", + [GGUF_V3_TYPE_FLOAT64] = "f64", +}; +static_assert(GGUF_V3_TYPE_COUNT == 13, "GGUF_V3_TYPE_COUNT != 13"); + +union gguf_v3_value { + uint8_t uint8; + int8_t int8; + 
uint16_t uint16; + int16_t int16; + uint32_t uint32; + int32_t int32; + float float32; + uint64_t uint64; + int64_t int64; + double float64; + bool bool_; + + struct gguf_v3_str str; + + struct { + enum gguf_v3_type type; + + uint64_t n; // GGUFv2 + void * data; + } arr; +}; + +struct gguf_v3_kv { + struct gguf_v3_str key; + + enum gguf_v3_type type; + union gguf_v3_value value; +}; + +struct gguf_v3_header { + char magic[4]; + + uint32_t version; + uint64_t n_tensors; // GGUFv2 + uint64_t n_kv; // GGUFv2 +}; + +struct gguf_v3_tensor_info { + struct gguf_v3_str name; + + uint32_t n_dims; + uint64_t ne[GGML_V3_MAX_DIMS]; + + enum ggml_v3_type type; + + uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT` + + // for writing API + const void * data; + size_t size; +}; + +struct gguf_v3_context { + struct gguf_v3_header header; + + struct gguf_v3_kv * kv; + struct gguf_v3_tensor_info * infos; + + size_t alignment; + size_t offset; // offset of `data` from beginning of file + size_t size; // size of `data` in bytes + + //uint8_t * padding; + void * data; +}; + +static bool gguf_v3_fread_el(FILE * file, void * dst, size_t size, size_t * offset) { + const size_t n = fread(dst, 1, size, file); + *offset += n; + return n == size; +} + +// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 +static bool gguf_v3_fread_str_cur(FILE * file, struct gguf_v3_str * p, size_t * offset) { + p->n = 0; + p->data = NULL; + + bool ok = true; + + ok = ok && gguf_v3_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1); + ok = ok && gguf_v3_fread_el(file, p->data, p->n, offset); + + return ok; +} + +static bool gguf_v3_fread_str_v1(FILE * file, struct gguf_v3_str * p, size_t * offset) { + p->n = 0; + p->data = NULL; + + bool ok = true; + + uint32_t n = 0; + ok = ok && gguf_v3_fread_el(file, &n, sizeof(n), offset); p->data = calloc(n + 1, 1); p->n = n; + ok = ok && gguf_v3_fread_el(file, p->data, p->n, offset); + + return ok; +} + +struct gguf_v3_context * gguf_v3_init_empty(void) { + struct gguf_v3_context * ctx = GGML_V3_ALIGNED_MALLOC(sizeof(struct gguf_v3_context)); + + memcpy(ctx->header.magic, GGUF_V3_MAGIC, sizeof(ctx->header.magic)); + ctx->header.version = GGUF_V3_VERSION; + ctx->header.n_tensors = 0; + ctx->header.n_kv = 0; + + ctx->kv = NULL; + ctx->infos = NULL; + + ctx->alignment = GGUF_V3_DEFAULT_ALIGNMENT; + ctx->offset = 0; + ctx->size = 0; + + ctx->data = NULL; + + return ctx; +} + +struct gguf_v3_context * gguf_v3_init_from_file(const char * fname, struct gguf_v3_init_params params) { + FILE * file = fopen(fname, "rb"); + if (!file) { + return NULL; + } + + // offset from start of file + size_t offset = 0; + + char magic[4]; + + // check the magic before making allocations + { + gguf_v3_fread_el(file, &magic, sizeof(magic), &offset); + + for (uint32_t i = 0; i < sizeof(magic); i++) { + if (magic[i] != GGUF_V3_MAGIC[i]) { + fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]); + fclose(file); + return NULL; + } + } + } + + bool ok = true; + + struct gguf_v3_context * ctx = GGML_V3_ALIGNED_MALLOC(sizeof(struct gguf_v3_context)); + + // read the header + { + strncpy(ctx->header.magic, magic, 4); + + ctx->kv = NULL; + ctx->infos = NULL; + ctx->data = NULL; + + ok = ok && gguf_v3_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset); + + if (ctx->header.version == 1) { + // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 + uint32_t n_tensors = 0; + 
uint32_t n_kv = 0; + + ok = ok && gguf_v3_fread_el(file, &n_tensors, sizeof(n_tensors), &offset); + ok = ok && gguf_v3_fread_el(file, &n_kv, sizeof(n_kv), &offset); + + ctx->header.n_tensors = n_tensors; + ctx->header.n_kv = n_kv; + } else { + ok = ok && gguf_v3_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset); + ok = ok && gguf_v3_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset); + } + + if (ctx->header.version == 1) { + fprintf(stderr, "%s: GGUFv1 is deprecated. please update if possible.\n", __func__); + } + + if (!ok) { + fprintf(stderr, "%s: failed to read header\n", __func__); + fclose(file); + gguf_v3_free(ctx); + return NULL; + } + } + + // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 + bool (* gguf_v3_fread_str)(FILE *, struct gguf_v3_str *, size_t *) = gguf_v3_fread_str_cur; + if (ctx->header.version == 1) { + gguf_v3_fread_str = gguf_v3_fread_str_v1; + } + + // read the kv pairs + { + ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_v3_kv)); + + for (uint64_t i = 0; i < ctx->header.n_kv; ++i) { + struct gguf_v3_kv * kv = &ctx->kv[i]; + + //fprintf(stderr, "%s: reading kv %d\n", __func__, i); + + ok = ok && gguf_v3_fread_str(file, &kv->key, &offset); + ok = ok && gguf_v3_fread_el (file, &kv->type, sizeof(kv->type), &offset); + + //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data); + + switch (kv->type) { + case GGUF_V3_TYPE_UINT8: ok = ok && gguf_v3_fread_el (file, &kv->value.uint8, sizeof(kv->value.uint8), &offset); break; + case GGUF_V3_TYPE_INT8: ok = ok && gguf_v3_fread_el (file, &kv->value.int8, sizeof(kv->value.int8), &offset); break; + case GGUF_V3_TYPE_UINT16: ok = ok && gguf_v3_fread_el (file, &kv->value.uint16, sizeof(kv->value.uint16), &offset); break; + case GGUF_V3_TYPE_INT16: ok = ok && gguf_v3_fread_el (file, &kv->value.int16, sizeof(kv->value.int16), &offset); break; + case GGUF_V3_TYPE_UINT32: ok = ok && gguf_v3_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break; + case GGUF_V3_TYPE_INT32: ok = ok && gguf_v3_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break; + case GGUF_V3_TYPE_FLOAT32: ok = ok && gguf_v3_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break; + case GGUF_V3_TYPE_UINT64: ok = ok && gguf_v3_fread_el (file, &kv->value.uint64, sizeof(kv->value.uint64), &offset); break; + case GGUF_V3_TYPE_INT64: ok = ok && gguf_v3_fread_el (file, &kv->value.int64, sizeof(kv->value.int64), &offset); break; + case GGUF_V3_TYPE_FLOAT64: ok = ok && gguf_v3_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break; + case GGUF_V3_TYPE_BOOL: ok = ok && gguf_v3_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break; + case GGUF_V3_TYPE_STRING: ok = ok && gguf_v3_fread_str(file, &kv->value.str, &offset); break; + case GGUF_V3_TYPE_ARRAY: + { + ok = ok && gguf_v3_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset); + + if (ctx->header.version == 1) { + // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 + uint32_t n = 0; + ok = ok && gguf_v3_fread_el(file, &n, sizeof(n), &offset); + kv->value.arr.n = n; + } else { + ok = ok && gguf_v3_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset); + } + + switch (kv->value.arr.type) { + case GGUF_V3_TYPE_UINT8: + case GGUF_V3_TYPE_INT8: + case GGUF_V3_TYPE_UINT16: + case GGUF_V3_TYPE_INT16: + case GGUF_V3_TYPE_UINT32: + case GGUF_V3_TYPE_INT32: + case GGUF_V3_TYPE_FLOAT32: + case 
GGUF_V3_TYPE_UINT64: + case GGUF_V3_TYPE_INT64: + case GGUF_V3_TYPE_FLOAT64: + case GGUF_V3_TYPE_BOOL: + { + kv->value.arr.data = malloc(kv->value.arr.n * GGUF_V3_TYPE_SIZE[kv->value.arr.type]); + ok = ok && gguf_v3_fread_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_V3_TYPE_SIZE[kv->value.arr.type], &offset); + } break; + case GGUF_V3_TYPE_STRING: + { + kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_v3_str)); + for (uint64_t j = 0; j < kv->value.arr.n; ++j) { + ok = ok && gguf_v3_fread_str(file, &((struct gguf_v3_str *) kv->value.arr.data)[j], &offset); + } + } break; + case GGUF_V3_TYPE_ARRAY: + case GGUF_V3_TYPE_COUNT: GGML_V3_ASSERT(false && "invalid type"); break; + } + } break; + case GGUF_V3_TYPE_COUNT: GGML_V3_ASSERT(false && "invalid type"); + } + + if (!ok) { + break; + } + } + + if (!ok) { + fprintf(stderr, "%s: failed to read key-value pairs\n", __func__); + fclose(file); + gguf_v3_free(ctx); + return NULL; + } + } + + // read the tensor infos + { + ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_v3_tensor_info)); + + for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_v3_tensor_info * info = &ctx->infos[i]; + + for (int j = 0; j < GGML_V3_MAX_DIMS; ++j) { + info->ne[j] = 1; + } + + ok = ok && gguf_v3_fread_str(file, &info->name, &offset); + ok = ok && gguf_v3_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset); + for (uint32_t j = 0; j < info->n_dims; ++j) { + if (ctx->header.version == 1) { + // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 + uint32_t t = 0; + ok = ok && gguf_v3_fread_el(file, &t, sizeof(t), &offset); + info->ne[j] = t; + } else { + ok = ok && gguf_v3_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset); + } + } + ok = ok && gguf_v3_fread_el (file, &info->type, sizeof(info->type), &offset); + ok = ok && gguf_v3_fread_el (file, &info->offset, sizeof(info->offset), &offset); + + if (!ok) { + fprintf(stderr, "%s: failed to read tensor info\n", __func__); + fclose(file); + gguf_v3_free(ctx); + return NULL; + } + } + } + + ctx->alignment = GGUF_V3_DEFAULT_ALIGNMENT; + + int alignment_idx = gguf_v3_find_key(ctx, "general.alignment"); + if (alignment_idx != -1) { + ctx->alignment = gguf_v3_get_val_u32(ctx, alignment_idx); + } + + // we require the data section to be aligned, so take into account any padding + { + const size_t offset_pad = offset % ctx->alignment; + + if (offset_pad != 0) { + offset += ctx->alignment - offset_pad; + fseek(file, offset, SEEK_SET); + } + } + + // store the current file offset - this is where the data section starts + ctx->offset = offset; + + // compute the total size of the data section, taking into account the alignment + { + ctx->size = 0; + for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_v3_tensor_info * info = &ctx->infos[i]; + + const int64_t ne = + (int64_t) info->ne[0] * + (int64_t) info->ne[1] * + (int64_t) info->ne[2] * + (int64_t) info->ne[3]; + + if (ne % ggml_v3_blck_size(info->type) != 0) { + fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%d)\n", + __func__, info->name.data, (int)info->type, ggml_v3_type_name(info->type), ne, ggml_v3_blck_size(info->type)); + fclose(file); + gguf_v3_free(ctx); + return NULL; + } + + const size_t size_cur = ggml_v3_row_size(info->type, ne); + + ctx->size += GGML_V3_PAD(size_cur, ctx->alignment); + } + } + + // load the tensor data only if requested + if (params.ctx != NULL) { + // if the provided gguf_v3_context is 
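/* Editor's sketch (not part of the patch): the padding rule used above when seeking
 * to the data section and when accumulating ctx->size with GGML_V3_PAD - round an
 * offset up to the next multiple of the alignment (GGUF_V3_DEFAULT_ALIGNMENT unless
 * a "general.alignment" key overrides it). */

#include <stddef.h>

static size_t gguf_pad_sketch(size_t offset, size_t alignment) {
    const size_t rem = offset % alignment;
    return rem == 0 ? offset : offset + (alignment - rem);
}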
no_alloc, then we create "empty" tensors and do not read the binary blob + // otherwise, we load the binary blob into the created ggml_v3_context as well, and point the "data" members of + // the ggml_v3_tensor structs to the appropriate locations in the binary blob + + // compute the exact size needed for the new ggml_v3_context + const size_t mem_size = + params.no_alloc ? + (ctx->header.n_tensors )*ggml_v3_tensor_overhead() : + (ctx->header.n_tensors + 1)*ggml_v3_tensor_overhead() + ctx->size; + + struct ggml_v3_init_params pdata = { + .mem_size = mem_size, + .mem_buffer = NULL, + .no_alloc = params.no_alloc, + }; + + *params.ctx = ggml_v3_init(pdata); + + struct ggml_v3_context * ctx_data = *params.ctx; + + struct ggml_v3_tensor * data = NULL; + + if (!params.no_alloc) { + data = ggml_v3_new_tensor_1d(ctx_data, GGML_V3_TYPE_I8, ctx->size); + + ok = ok && data != NULL; + + // read the binary blob with the tensor data + ok = ok && gguf_v3_fread_el(file, data->data, ctx->size, &offset); + + if (!ok) { + fprintf(stderr, "%s: failed to read tensor data\n", __func__); + fclose(file); + ggml_v3_free(ctx_data); + gguf_v3_free(ctx); + return NULL; + } + + ctx->data = data->data; + } + + ggml_v3_set_no_alloc(ctx_data, true); + + // create the tensors + for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { + const int64_t ne[GGML_V3_MAX_DIMS] = { + ctx->infos[i].ne[0], + ctx->infos[i].ne[1], + ctx->infos[i].ne[2], + ctx->infos[i].ne[3], + }; + + struct ggml_v3_tensor * cur = ggml_v3_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne); + + ok = ok && cur != NULL; + + ggml_v3_set_name(cur, ctx->infos[i].name.data); + + if (!ok) { + break; + } + + // point the data member to the appropriate location in the binary blob using the tensor infos + if (!params.no_alloc) { + //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file + cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data + } + } + + if (!ok) { + fprintf(stderr, "%s: failed to read the tensor data\n", __func__); + fclose(file); + ggml_v3_free(ctx_data); + gguf_v3_free(ctx); + return NULL; + } + + ggml_v3_set_no_alloc(ctx_data, params.no_alloc); + } + + fclose(file); + + return ctx; +} + +void gguf_v3_free(struct gguf_v3_context * ctx) { + if (ctx == NULL) { + return; + } + + if (ctx->kv) { + // free string memory - not great.. 
+ for (uint32_t i = 0; i < ctx->header.n_kv; ++i) { + struct gguf_v3_kv * kv = &ctx->kv[i]; + + if (kv->key.data) { + free(kv->key.data); + } + + if (kv->type == GGUF_V3_TYPE_STRING) { + if (kv->value.str.data) { + free(kv->value.str.data); + } + } + + if (kv->type == GGUF_V3_TYPE_ARRAY) { + if (kv->value.arr.data) { + if (kv->value.arr.type == GGUF_V3_TYPE_STRING) { + for (uint32_t j = 0; j < kv->value.arr.n; ++j) { + struct gguf_v3_str * str = &((struct gguf_v3_str *) kv->value.arr.data)[j]; + if (str->data) { + free(str->data); + } + } + } + free(kv->value.arr.data); + } + } + } + + free(ctx->kv); + } + + if (ctx->infos) { + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_v3_tensor_info * info = &ctx->infos[i]; + + if (info->name.data) { + free(info->name.data); + } + } + + free(ctx->infos); + } + + GGML_V3_ALIGNED_FREE(ctx); +} + +const char * gguf_v3_type_name(enum gguf_v3_type type) { + return GGUF_V3_TYPE_NAME[type]; +} + +int gguf_v3_get_version(const struct gguf_v3_context * ctx) { + return ctx->header.version; +} + +size_t gguf_v3_get_alignment(const struct gguf_v3_context * ctx) { + return ctx->alignment; +} + +size_t gguf_v3_get_data_offset(const struct gguf_v3_context * ctx) { + return ctx->offset; +} + +void * gguf_v3_get_data(const struct gguf_v3_context * ctx) { + return ctx->data; +} + +int gguf_v3_get_n_kv(const struct gguf_v3_context * ctx) { + return ctx->header.n_kv; +} + +int gguf_v3_find_key(const struct gguf_v3_context * ctx, const char * key) { + // return -1 if key not found + int keyfound = -1; + + const int n_kv = gguf_v3_get_n_kv(ctx); + + for (int i = 0; i < n_kv; ++i) { + if (strcmp(key, gguf_v3_get_key(ctx, i)) == 0) { + keyfound = i; + break; + } + } + + return keyfound; +} + +const char * gguf_v3_get_key(const struct gguf_v3_context * ctx, int key_id) { + GGML_V3_ASSERT(key_id >= 0 && key_id < gguf_v3_get_n_kv(ctx)); + return ctx->kv[key_id].key.data; +} + +enum gguf_v3_type gguf_v3_get_kv_type(const struct gguf_v3_context * ctx, int key_id) { + GGML_V3_ASSERT(key_id >= 0 && key_id < gguf_v3_get_n_kv(ctx)); + return ctx->kv[key_id].type; +} + +enum gguf_v3_type gguf_v3_get_arr_type(const struct gguf_v3_context * ctx, int key_id) { + GGML_V3_ASSERT(key_id >= 0 && key_id < gguf_v3_get_n_kv(ctx)); + GGML_V3_ASSERT(ctx->kv[key_id].type == GGUF_V3_TYPE_ARRAY); + return ctx->kv[key_id].value.arr.type; +} + +const void * gguf_v3_get_arr_data(const struct gguf_v3_context * ctx, int key_id) { + GGML_V3_ASSERT(key_id >= 0 && key_id < gguf_v3_get_n_kv(ctx)); + GGML_V3_ASSERT(ctx->kv[key_id].type == GGUF_V3_TYPE_ARRAY); + return ctx->kv[key_id].value.arr.data; +} + +const char * gguf_v3_get_arr_str(const struct gguf_v3_context * ctx, int key_id, int i) { + GGML_V3_ASSERT(key_id >= 0 && key_id < gguf_v3_get_n_kv(ctx)); + GGML_V3_ASSERT(ctx->kv[key_id].type == GGUF_V3_TYPE_ARRAY); + struct gguf_v3_kv * kv = &ctx->kv[key_id]; + struct gguf_v3_str * str = &((struct gguf_v3_str *) kv->value.arr.data)[i]; + return str->data; +} + +int gguf_v3_get_arr_n(const struct gguf_v3_context * ctx, int key_id) { + GGML_V3_ASSERT(key_id >= 0 && key_id < gguf_v3_get_n_kv(ctx)); + GGML_V3_ASSERT(ctx->kv[key_id].type == GGUF_V3_TYPE_ARRAY); + return ctx->kv[key_id].value.arr.n; +} + +uint8_t gguf_v3_get_val_u8(const struct gguf_v3_context * ctx, int key_id) { + GGML_V3_ASSERT(key_id >= 0 && key_id < gguf_v3_get_n_kv(ctx)); + GGML_V3_ASSERT(ctx->kv[key_id].type == GGUF_V3_TYPE_UINT8); + return ctx->kv[key_id].value.uint8; +} + +int8_t gguf_v3_get_val_i8(const struct 
gguf_v3_context * ctx, int key_id) { + GGML_V3_ASSERT(key_id >= 0 && key_id < gguf_v3_get_n_kv(ctx)); + GGML_V3_ASSERT(ctx->kv[key_id].type == GGUF_V3_TYPE_INT8); + return ctx->kv[key_id].value.int8; +} + +uint16_t gguf_v3_get_val_u16(const struct gguf_v3_context * ctx, int key_id) { + GGML_V3_ASSERT(key_id >= 0 && key_id < gguf_v3_get_n_kv(ctx)); + GGML_V3_ASSERT(ctx->kv[key_id].type == GGUF_V3_TYPE_UINT16); + return ctx->kv[key_id].value.uint16; +} + +int16_t gguf_v3_get_val_i16(const struct gguf_v3_context * ctx, int key_id) { + GGML_V3_ASSERT(key_id >= 0 && key_id < gguf_v3_get_n_kv(ctx)); + GGML_V3_ASSERT(ctx->kv[key_id].type == GGUF_V3_TYPE_INT16); + return ctx->kv[key_id].value.int16; +} + +uint32_t gguf_v3_get_val_u32(const struct gguf_v3_context * ctx, int key_id) { + GGML_V3_ASSERT(key_id >= 0 && key_id < gguf_v3_get_n_kv(ctx)); + GGML_V3_ASSERT(ctx->kv[key_id].type == GGUF_V3_TYPE_UINT32); + return ctx->kv[key_id].value.uint32; +} + +int32_t gguf_v3_get_val_i32(const struct gguf_v3_context * ctx, int key_id) { + GGML_V3_ASSERT(key_id >= 0 && key_id < gguf_v3_get_n_kv(ctx)); + GGML_V3_ASSERT(ctx->kv[key_id].type == GGUF_V3_TYPE_INT32); + return ctx->kv[key_id].value.int32; +} + +float gguf_v3_get_val_f32(const struct gguf_v3_context * ctx, int key_id) { + GGML_V3_ASSERT(key_id >= 0 && key_id < gguf_v3_get_n_kv(ctx)); + GGML_V3_ASSERT(ctx->kv[key_id].type == GGUF_V3_TYPE_FLOAT32); + return ctx->kv[key_id].value.float32; +} + +uint64_t gguf_v3_get_val_u64(const struct gguf_v3_context * ctx, int key_id) { + GGML_V3_ASSERT(key_id >= 0 && key_id < gguf_v3_get_n_kv(ctx)); + GGML_V3_ASSERT(ctx->kv[key_id].type == GGUF_V3_TYPE_UINT64); + return ctx->kv[key_id].value.uint64; +} + +int64_t gguf_v3_get_val_i64(const struct gguf_v3_context * ctx, int key_id) { + GGML_V3_ASSERT(key_id >= 0 && key_id < gguf_v3_get_n_kv(ctx)); + GGML_V3_ASSERT(ctx->kv[key_id].type == GGUF_V3_TYPE_INT64); + return ctx->kv[key_id].value.int64; +} + +double gguf_v3_get_val_f64(const struct gguf_v3_context * ctx, int key_id) { + GGML_V3_ASSERT(key_id >= 0 && key_id < gguf_v3_get_n_kv(ctx)); + GGML_V3_ASSERT(ctx->kv[key_id].type == GGUF_V3_TYPE_FLOAT64); + return ctx->kv[key_id].value.float64; +} + +bool gguf_v3_get_val_bool(const struct gguf_v3_context * ctx, int key_id) { + GGML_V3_ASSERT(key_id >= 0 && key_id < gguf_v3_get_n_kv(ctx)); + GGML_V3_ASSERT(ctx->kv[key_id].type == GGUF_V3_TYPE_BOOL); + return ctx->kv[key_id].value.bool_; +} + +const char * gguf_v3_get_val_str(const struct gguf_v3_context * ctx, int key_id) { + GGML_V3_ASSERT(key_id >= 0 && key_id < gguf_v3_get_n_kv(ctx)); + GGML_V3_ASSERT(ctx->kv[key_id].type == GGUF_V3_TYPE_STRING); + return ctx->kv[key_id].value.str.data; +} + +const void * gguf_v3_get_val_data(const struct gguf_v3_context * ctx, int key_id) { + GGML_V3_ASSERT(key_id >= 0 && key_id < gguf_v3_get_n_kv(ctx)); + GGML_V3_ASSERT(ctx->kv[key_id].type != GGUF_V3_TYPE_ARRAY); + GGML_V3_ASSERT(ctx->kv[key_id].type != GGUF_V3_TYPE_STRING); + return &ctx->kv[key_id].value; +} + +int gguf_v3_get_n_tensors(const struct gguf_v3_context * ctx) { + return ctx->header.n_tensors; +} + +int gguf_v3_find_tensor(const struct gguf_v3_context * ctx, const char * name) { + // return -1 if tensor not found + int tensorfound = -1; + + const int n_tensors = gguf_v3_get_n_tensors(ctx); + + for (int i = 0; i < n_tensors; ++i) { + if (strcmp(name, gguf_v3_get_tensor_name(ctx, i)) == 0) { + tensorfound = i; + break; + } + } + + return tensorfound; +} + +size_t gguf_v3_get_tensor_offset(const struct gguf_v3_context 
* ctx, int i) { + return ctx->infos[i].offset; +} + +char * gguf_v3_get_tensor_name(const struct gguf_v3_context * ctx, int i) { + return ctx->infos[i].name.data; +} + +enum ggml_v3_type gguf_v3_get_tensor_type(const struct gguf_v3_context * ctx, int i) { + return ctx->infos[i].type; +} + +// returns the index +static int gguf_v3_get_or_add_key(struct gguf_v3_context * ctx, const char * key) { + const int idx = gguf_v3_find_key(ctx, key); + if (idx >= 0) { + return idx; + } + + const int n_kv = gguf_v3_get_n_kv(ctx); + + ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_v3_kv)); + ctx->kv[n_kv].key.n = strlen(key); + ctx->kv[n_kv].key.data = strdup(key); + ctx->header.n_kv++; + + return n_kv; +} + +void gguf_v3_set_val_u8(struct gguf_v3_context * ctx, const char * key, uint8_t val) { + const int idx = gguf_v3_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_V3_TYPE_UINT8; + ctx->kv[idx].value.uint8 = val; +} + +void gguf_v3_set_val_i8(struct gguf_v3_context * ctx, const char * key, int8_t val) { + const int idx = gguf_v3_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_V3_TYPE_INT8; + ctx->kv[idx].value.int8 = val; +} + +void gguf_v3_set_val_u16(struct gguf_v3_context * ctx, const char * key, uint16_t val) { + const int idx = gguf_v3_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_V3_TYPE_UINT16; + ctx->kv[idx].value.uint16 = val; +} + +void gguf_v3_set_val_i16(struct gguf_v3_context * ctx, const char * key, int16_t val) { + const int idx = gguf_v3_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_V3_TYPE_INT16; + ctx->kv[idx].value.int16 = val; +} + +void gguf_v3_set_val_u32(struct gguf_v3_context * ctx, const char * key, uint32_t val) { + const int idx = gguf_v3_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_V3_TYPE_UINT32; + ctx->kv[idx].value.uint32 = val; +} + +void gguf_v3_set_val_i32(struct gguf_v3_context * ctx, const char * key, int32_t val) { + const int idx = gguf_v3_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_V3_TYPE_INT32; + ctx->kv[idx].value.int32 = val; +} + +void gguf_v3_set_val_f32(struct gguf_v3_context * ctx, const char * key, float val) { + const int idx = gguf_v3_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_V3_TYPE_FLOAT32; + ctx->kv[idx].value.float32 = val; +} + +void gguf_v3_set_val_u64(struct gguf_v3_context * ctx, const char * key, uint64_t val) { + const int idx = gguf_v3_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_V3_TYPE_UINT64; + ctx->kv[idx].value.uint64 = val; +} + +void gguf_v3_set_val_i64(struct gguf_v3_context * ctx, const char * key, int64_t val) { + const int idx = gguf_v3_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_V3_TYPE_INT64; + ctx->kv[idx].value.int64 = val; +} + +void gguf_v3_set_val_f64(struct gguf_v3_context * ctx, const char * key, double val) { + const int idx = gguf_v3_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_V3_TYPE_FLOAT64; + ctx->kv[idx].value.float64 = val; +} + +void gguf_v3_set_val_bool(struct gguf_v3_context * ctx, const char * key, bool val) { + const int idx = gguf_v3_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_V3_TYPE_BOOL; + ctx->kv[idx].value.bool_ = val; +} + +void gguf_v3_set_val_str(struct gguf_v3_context * ctx, const char * key, const char * val) { + const int idx = gguf_v3_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_V3_TYPE_STRING; + ctx->kv[idx].value.str.n = strlen(val); + ctx->kv[idx].value.str.data = strdup(val); +} + +void gguf_v3_set_arr_data(struct gguf_v3_context * ctx, const char * key, enum 
gguf_v3_type type, const void * data, int n) { + const int idx = gguf_v3_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_V3_TYPE_ARRAY; + ctx->kv[idx].value.arr.type = type; + ctx->kv[idx].value.arr.n = n; + ctx->kv[idx].value.arr.data = malloc(n*GGUF_V3_TYPE_SIZE[type]); + memcpy(ctx->kv[idx].value.arr.data, data, n*GGUF_V3_TYPE_SIZE[type]); +} + +void gguf_v3_set_arr_str(struct gguf_v3_context * ctx, const char * key, const char ** data, int n) { + const int idx = gguf_v3_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_V3_TYPE_ARRAY; + ctx->kv[idx].value.arr.type = GGUF_V3_TYPE_STRING; + ctx->kv[idx].value.arr.n = n; + ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_v3_str)); + for (int i = 0; i < n; i++) { + struct gguf_v3_str * str = &((struct gguf_v3_str *)ctx->kv[idx].value.arr.data)[i]; + str->n = strlen(data[i]); + str->data = strdup(data[i]); + } +} + +// set or add KV pairs from another context +void gguf_v3_set_kv(struct gguf_v3_context * ctx, struct gguf_v3_context * src) { + for (uint32_t i = 0; i < src->header.n_kv; i++) { + switch (src->kv[i].type) { + case GGUF_V3_TYPE_UINT8: gguf_v3_set_val_u8 (ctx, src->kv[i].key.data, src->kv[i].value.uint8); break; + case GGUF_V3_TYPE_INT8: gguf_v3_set_val_i8 (ctx, src->kv[i].key.data, src->kv[i].value.int8); break; + case GGUF_V3_TYPE_UINT16: gguf_v3_set_val_u16 (ctx, src->kv[i].key.data, src->kv[i].value.uint16); break; + case GGUF_V3_TYPE_INT16: gguf_v3_set_val_i16 (ctx, src->kv[i].key.data, src->kv[i].value.int16); break; + case GGUF_V3_TYPE_UINT32: gguf_v3_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32); break; + case GGUF_V3_TYPE_INT32: gguf_v3_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32); break; + case GGUF_V3_TYPE_FLOAT32: gguf_v3_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32); break; + case GGUF_V3_TYPE_UINT64: gguf_v3_set_val_u64 (ctx, src->kv[i].key.data, src->kv[i].value.uint64); break; + case GGUF_V3_TYPE_INT64: gguf_v3_set_val_i64 (ctx, src->kv[i].key.data, src->kv[i].value.int64); break; + case GGUF_V3_TYPE_FLOAT64: gguf_v3_set_val_f64 (ctx, src->kv[i].key.data, src->kv[i].value.float64); break; + case GGUF_V3_TYPE_BOOL: gguf_v3_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_); break; + case GGUF_V3_TYPE_STRING: gguf_v3_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break; + case GGUF_V3_TYPE_ARRAY: + { + if (src->kv[i].value.arr.type == GGUF_V3_TYPE_STRING) { + const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *)); + for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) { + data[j] = ((struct gguf_v3_str *)src->kv[i].value.arr.data)[j].data; + } + gguf_v3_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n); + free((void *)data); + } else if (src->kv[i].value.arr.type == GGUF_V3_TYPE_ARRAY) { + GGML_V3_ASSERT(false && "nested arrays not supported"); + } else { + gguf_v3_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n); + } + } break; + case GGUF_V3_TYPE_COUNT: GGML_V3_ASSERT(false && "invalid type"); break; + } + } +} + +void gguf_v3_add_tensor( + struct gguf_v3_context * ctx, + const struct ggml_v3_tensor * tensor) { + const int idx = ctx->header.n_tensors; + ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_v3_tensor_info)); + + ctx->infos[idx].name.n = strlen(tensor->name); + ctx->infos[idx].name.data = strdup(tensor->name); + + for (int i = 0; i < GGML_V3_MAX_DIMS; ++i) { + ctx->infos[idx].ne[i] = 1; + 
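+        // every entry defaults to 1 here; the first n_dims entries are overwritten from tensor->ne just below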
} + + ctx->infos[idx].n_dims = ggml_v3_n_dims(tensor); + for (uint32_t i = 0; i < ctx->infos[idx].n_dims; i++) { + ctx->infos[idx].ne[i] = tensor->ne[i]; + } + + ctx->infos[idx].type = tensor->type; + ctx->infos[idx].offset = 0; + ctx->infos[idx].data = tensor->data; + ctx->infos[idx].size = ggml_v3_nbytes(tensor); + + if (ctx->header.n_tensors > 0) { + ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_V3_PAD(ctx->infos[idx - 1].size, ctx->alignment); + } + + ctx->header.n_tensors++; +} + +void gguf_v3_set_tensor_type(struct gguf_v3_context * ctx, const char * name, enum ggml_v3_type type) { + const int idx = gguf_v3_find_tensor(ctx, name); + if (idx < 0) { + GGML_V3_ASSERT(false && "tensor not found"); + } + + ctx->infos[idx].type = type; +} + +void gguf_v3_set_tensor_data(struct gguf_v3_context * ctx, const char * name, const void * data, size_t size) { + const int idx = gguf_v3_find_tensor(ctx, name); + if (idx < 0) { + GGML_V3_ASSERT(false && "tensor not found"); + } + + ctx->infos[idx].data = data; + ctx->infos[idx].size = size; + + // update offsets + for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) { + ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_V3_PAD(ctx->infos[i - 1].size, ctx->alignment); + } +} + +//static void gguf_v3_fwrite_str(FILE * file, const struct gguf_v3_str * val) { +// fwrite(&val->n, sizeof(val->n), 1, file); +// fwrite(val->data, sizeof(char), val->n, file); +//} +// +//static void gguf_v3_fwrite_el(FILE * file, const void * val, size_t size) { +// fwrite(val, sizeof(char), size, file); +//} + +struct gguf_v3_buf { + void * data; + size_t size; + size_t offset; +}; + +static struct gguf_v3_buf gguf_v3_buf_init(size_t size) { + struct gguf_v3_buf buf = { + /*buf.data =*/ size == 0 ? NULL : malloc(size), + /*buf.size =*/ size, + /*buf.offset =*/ 0, + }; + + return buf; +} + +static void gguf_v3_buf_free(struct gguf_v3_buf buf) { + if (buf.data) { + free(buf.data); + } +} + +static void gguf_v3_buf_grow(struct gguf_v3_buf * buf, size_t size) { + if (buf->offset + size > buf->size) { + buf->size = 1.5*(buf->offset + size); + if (buf->data) { + buf->data = realloc(buf->data, buf->size); + } + } +} + +static void gguf_v3_bwrite_str(struct gguf_v3_buf * buf, const struct gguf_v3_str * val) { + gguf_v3_buf_grow(buf, sizeof(val->n) + val->n); + + if (buf->data) { + memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n)); + } + buf->offset += sizeof(val->n); + + if (buf->data) { + memcpy((char *) buf->data + buf->offset, val->data, val->n); + } + buf->offset += val->n; +} + +static void gguf_v3_bwrite_el(struct gguf_v3_buf * buf, const void * val, size_t el_size) { + gguf_v3_buf_grow(buf, el_size); + + if (buf->data) { + memcpy((char *) buf->data + buf->offset, val, el_size); + } + buf->offset += el_size; +} + +static void gguf_v3_write_to_buf(const struct gguf_v3_context * ctx, struct gguf_v3_buf * buf, bool only_meta) { + // write header + gguf_v3_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic)); + gguf_v3_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version)); + gguf_v3_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors)); + gguf_v3_bwrite_el(buf, &ctx->header.n_kv, sizeof(ctx->header.n_kv)); + + // write key-value pairs + for (uint32_t i = 0; i < ctx->header.n_kv; ++i) { + struct gguf_v3_kv * kv = &ctx->kv[i]; + + gguf_v3_bwrite_str(buf, &kv->key); + gguf_v3_bwrite_el (buf, &kv->type, sizeof(kv->type)); + + switch (kv->type) { + case GGUF_V3_TYPE_UINT8: gguf_v3_bwrite_el( buf, 
&kv->value.uint8, sizeof(kv->value.uint8) ); break; + case GGUF_V3_TYPE_INT8: gguf_v3_bwrite_el (buf, &kv->value.int8, sizeof(kv->value.int8) ); break; + case GGUF_V3_TYPE_UINT16: gguf_v3_bwrite_el (buf, &kv->value.uint16, sizeof(kv->value.uint16) ); break; + case GGUF_V3_TYPE_INT16: gguf_v3_bwrite_el (buf, &kv->value.int16, sizeof(kv->value.int16) ); break; + case GGUF_V3_TYPE_UINT32: gguf_v3_bwrite_el (buf, &kv->value.uint32, sizeof(kv->value.uint32) ); break; + case GGUF_V3_TYPE_INT32: gguf_v3_bwrite_el (buf, &kv->value.int32, sizeof(kv->value.int32) ); break; + case GGUF_V3_TYPE_FLOAT32: gguf_v3_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break; + case GGUF_V3_TYPE_UINT64: gguf_v3_bwrite_el (buf, &kv->value.uint64, sizeof(kv->value.uint64) ); break; + case GGUF_V3_TYPE_INT64: gguf_v3_bwrite_el (buf, &kv->value.int64, sizeof(kv->value.int64) ); break; + case GGUF_V3_TYPE_FLOAT64: gguf_v3_bwrite_el (buf, &kv->value.float64, sizeof(kv->value.float64)); break; + case GGUF_V3_TYPE_BOOL: gguf_v3_bwrite_el (buf, &kv->value.bool_, sizeof(kv->value.bool_) ); break; + case GGUF_V3_TYPE_STRING: gguf_v3_bwrite_str(buf, &kv->value.str ); break; + case GGUF_V3_TYPE_ARRAY: + { + gguf_v3_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type)); + gguf_v3_bwrite_el(buf, &kv->value.arr.n, sizeof(kv->value.arr.n) ); + + switch (kv->value.arr.type) { + case GGUF_V3_TYPE_UINT8: + case GGUF_V3_TYPE_INT8: + case GGUF_V3_TYPE_UINT16: + case GGUF_V3_TYPE_INT16: + case GGUF_V3_TYPE_UINT32: + case GGUF_V3_TYPE_INT32: + case GGUF_V3_TYPE_FLOAT32: + case GGUF_V3_TYPE_UINT64: + case GGUF_V3_TYPE_INT64: + case GGUF_V3_TYPE_FLOAT64: + case GGUF_V3_TYPE_BOOL: + { + gguf_v3_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_V3_TYPE_SIZE[kv->value.arr.type]); + } break; + case GGUF_V3_TYPE_STRING: + { + for (uint32_t j = 0; j < kv->value.arr.n; ++j) { + gguf_v3_bwrite_str(buf, &((struct gguf_v3_str *) kv->value.arr.data)[j]); + } + } break; + case GGUF_V3_TYPE_ARRAY: + case GGUF_V3_TYPE_COUNT: GGML_V3_ASSERT(false && "invalid type"); break; + } + } break; + case GGUF_V3_TYPE_COUNT: GGML_V3_ASSERT(false && "invalid type"); + } + } + + // write tensor infos + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_v3_tensor_info * info = &ctx->infos[i]; + + gguf_v3_bwrite_str(buf, &info->name); + gguf_v3_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims)); + for (uint32_t j = 0; j < info->n_dims; ++j) { + gguf_v3_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j])); + } + gguf_v3_bwrite_el(buf, &info->type, sizeof(info->type)); + gguf_v3_bwrite_el(buf, &info->offset, sizeof(info->offset)); + } + + // we require the data section to be aligned, so take into account any padding + { + const size_t offset = buf->offset; + const size_t offset_pad = GGML_V3_PAD(offset, ctx->alignment); + + if (offset_pad != offset) { + uint8_t pad = 0; + for (size_t i = 0; i < offset_pad - offset; ++i) { + gguf_v3_bwrite_el(buf, &pad, sizeof(pad)); + } + } + } + + if (only_meta) { + return; + } + + size_t offset = 0; + + // write tensor data + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_v3_tensor_info * info = &ctx->infos[i]; + + const size_t size = info->size; + const size_t size_pad = GGML_V3_PAD(size, ctx->alignment); + + gguf_v3_bwrite_el(buf, info->data, size); + + if (size_pad != size) { + uint8_t pad = 0; + for (size_t j = 0; j < size_pad - size; ++j) { + gguf_v3_bwrite_el(buf, &pad, sizeof(pad)); + } + } + + GGML_V3_ASSERT(offset == info->offset); + + offset += 
size_pad;
+    }
+}
+
+void gguf_v3_write_to_file(const struct gguf_v3_context * ctx, const char * fname, bool only_meta) {
+    FILE * file = fopen(fname, "wb");
+    if (!file) {
+        GGML_V3_ASSERT(false && "failed to open file for writing");
+    }
+
+    struct gguf_v3_buf buf = gguf_v3_buf_init(16*1024);
+
+    gguf_v3_write_to_buf(ctx, &buf, only_meta);
+
+    fwrite(buf.data, 1, buf.offset, file);
+
+    gguf_v3_buf_free(buf);
+
+    fclose(file);
+}
+
+size_t gguf_v3_get_meta_size(const struct gguf_v3_context * ctx) {
+    // no allocs - only compute size
+    struct gguf_v3_buf buf = gguf_v3_buf_init(0);
+
+    gguf_v3_write_to_buf(ctx, &buf, true);
+
+    return buf.offset;
+}
+
+void gguf_v3_get_meta_data(const struct gguf_v3_context * ctx, void * data) {
+    struct gguf_v3_buf buf = gguf_v3_buf_init(16*1024);
+
+    gguf_v3_write_to_buf(ctx, &buf, true);
+
+    memcpy(data, buf.data, buf.offset);
+
+    gguf_v3_buf_free(buf);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+int ggml_v3_cpu_has_avx(void) {
+#if defined(__AVX__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_v3_cpu_has_avx_vnni(void) {
+#if defined(__AVXVNNI__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_v3_cpu_has_avx2(void) {
+#if defined(__AVX2__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_v3_cpu_has_avx512(void) {
+#if defined(__AVX512F__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_v3_cpu_has_avx512_vbmi(void) {
+#if defined(__AVX512VBMI__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_v3_cpu_has_avx512_vnni(void) {
+#if defined(__AVX512VNNI__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_v3_cpu_has_fma(void) {
+#if defined(__FMA__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_v3_cpu_has_neon(void) {
+#if defined(__ARM_NEON)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_v3_cpu_has_arm_fma(void) {
+#if defined(__ARM_FEATURE_FMA)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_v3_cpu_has_metal(void) {
+#if defined(GGML_USE_METAL)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_v3_cpu_has_f16c(void) {
+#if defined(__F16C__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_v3_cpu_has_fp16_va(void) {
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_v3_cpu_has_wasm_simd(void) {
+#if defined(__wasm_simd128__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_v3_cpu_has_blas(void) {
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_v3_cpu_has_cublas(void) {
+#if defined(GGML_USE_CUBLAS)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_v3_cpu_has_clblast(void) {
+#if defined(GGML_USE_CLBLAST)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_v3_cpu_has_gpublas(void) {
+    return ggml_v3_cpu_has_cublas() || ggml_v3_cpu_has_clblast();
+}
+
+int ggml_v3_cpu_has_sse3(void) {
+#if defined(__SSE3__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_v3_cpu_has_ssse3(void) {
+#if defined(__SSSE3__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_v3_cpu_has_vsx(void) {
+#if defined(__POWER9_VECTOR__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+//formerly ggml-quants.c
+
+
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+#include <float.h>
+
+#ifdef __ARM_NEON
+
+// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
+//
+// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
+//
+#include <arm_neon.h>
+
+#else
+
+#ifdef __wasm_simd128__
+#include <wasm_simd128.h>
+#else
+#if defined(__POWER9_VECTOR__) || defined(__powerpc64__)
+#include <altivec.h>
+#undef bool
+#define bool _Bool
+#else
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#include <intrin.h>
+#else
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
+#if !defined(__riscv)
+#include <immintrin.h>
+#endif
+#endif
+#endif
+#endif
+#endif
+#endif
+
+#ifdef __riscv_v_intrinsic
+#include <riscv_vector.h>
+#endif
+
+#undef MIN
+#undef MAX
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+#ifndef MM256_SET_M128I
+#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+#endif
+
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
+// multiply int8_t, add results pairwise twice
+static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
+    // Get absolute values of x vectors
+    const __m128i ax = _mm_sign_epi8(x, x);
+    // Sign the values of the y vectors
+    const __m128i sy = _mm_sign_epi8(y, x);
+    // Perform multiplication and create 16-bit values
+    const __m128i dot = _mm_maddubs_epi16(ax, sy);
+    const __m128i ones = _mm_set1_epi16(1);
+    return _mm_madd_epi16(ones, dot);
+}
+
+#if __AVX__ || __AVX2__ || __AVX512F__
+// horizontally add 8 floats
+static inline float hsum_float_8(const __m256 x) {
+    __m128 res = _mm256_extractf128_ps(x, 1);
+    res = _mm_add_ps(res, _mm256_castps256_ps128(x));
+    res = _mm_add_ps(res, _mm_movehl_ps(res, res));
+    res = _mm_add_ss(res, _mm_movehdup_ps(res));
+    return _mm_cvtss_f32(res);
+}
+
+// horizontally add 8 int32_t
+static inline int hsum_i32_8(const __m256i a) {
+    const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
+    const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128);
+    const __m128i sum64 = _mm_add_epi32(hi64, sum128);
+    const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
+    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
+}
+
+// horizontally add 4 int32_t
+static inline int hsum_i32_4(const __m128i a) {
+    const __m128i hi64 = _mm_unpackhi_epi64(a, a);
+    const __m128i sum64 = _mm_add_epi32(hi64, a);
+    const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
+    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
+}
+
+#if defined(__AVX2__) || defined(__AVX512F__)
+// spread 32 bits to 32 bytes { 0x00, 0xFF }
+static inline __m256i bytes_from_bits_32(const uint8_t * x) {
+    uint32_t x32;
+    memcpy(&x32, x, sizeof(uint32_t));
+    const __m256i shuf_mask = _mm256_set_epi64x(
+            0x0303030303030303, 0x0202020202020202,
+            0x0101010101010101, 0x0000000000000000);
+    __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask);
+    const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
+    bytes = _mm256_or_si256(bytes, bit_mask);
+    return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1));
+}
+
+// Unpack 32 4-bit fields into 32 bytes
+// The output vector contains 32 bytes, each one in [ 0 .. 
15 ] interval +static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) +{ + const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); + const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp); + const __m256i lowMask = _mm256_set1_epi8( 0xF ); + return _mm256_and_si256(lowMask, bytes); +} + +// add int16_t pairwise and return as float vector +static inline __m256 sum_i16_pairs_float(const __m256i x) { + const __m256i ones = _mm256_set1_epi16(1); + const __m256i summed_pairs = _mm256_madd_epi16(ones, x); + return _mm256_cvtepi32_ps(summed_pairs); +} + +static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { +#if __AVXVNNI__ + const __m256i zero = _mm256_setzero_si256(); + const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy); + return _mm256_cvtepi32_ps(summed_pairs); +#else + // Perform multiplication and create 16-bit values + const __m256i dot = _mm256_maddubs_epi16(ax, sy); + return sum_i16_pairs_float(dot); +#endif +} + +// multiply int8_t, add results pairwise twice and return as float vector +static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { +#if __AVXVNNIINT8__ + const __m256i zero = _mm256_setzero_si256(); + const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y); + return _mm256_cvtepi32_ps(summed_pairs); +#else + // Get absolute values of x vectors + const __m256i ax = _mm256_sign_epi8(x, x); + // Sign the values of the y vectors + const __m256i sy = _mm256_sign_epi8(y, x); + return mul_sum_us8_pairs_float(ax, sy); +#endif +} + +static inline __m128i packNibbles( __m256i bytes ) +{ + // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh +#if __AVX512F__ + const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000 + bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh + return _mm256_cvtepi16_epi8(bytes); // abcd_efgh +#else + const __m256i lowByte = _mm256_set1_epi16( 0xFF ); + __m256i high = _mm256_andnot_si256( lowByte, bytes ); + __m256i low = _mm256_and_si256( lowByte, bytes ); + high = _mm256_srli_epi16( high, 4 ); + bytes = _mm256_or_si256( low, high ); + + // Compress uint16_t lanes into bytes + __m128i r0 = _mm256_castsi256_si128( bytes ); + __m128i r1 = _mm256_extracti128_si256( bytes, 1 ); + return _mm_packus_epi16( r0, r1 ); +#endif +} +#elif defined(__AVX__) +// spread 32 bits to 32 bytes { 0x00, 0xFF } +static inline __m256i bytes_from_bits_32(const uint8_t * x) { + uint32_t x32; + memcpy(&x32, x, sizeof(uint32_t)); + const __m128i shuf_maskl = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000); + const __m128i shuf_maskh = _mm_set_epi64x(0x0303030303030303, 0x0202020202020202); + __m128i bytesl = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskl); + __m128i bytesh = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskh); + const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe); + bytesl = _mm_or_si128(bytesl, bit_mask); + bytesh = _mm_or_si128(bytesh, bit_mask); + bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1)); + bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1)); + return MM256_SET_M128I(bytesh, bytesl); +} + +// Unpack 32 4-bit fields into 32 bytes +// The output vector contains 32 bytes, each one in [ 0 .. 
15 ] interval +static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) +{ + // Load 16 bytes from memory + __m128i tmpl = _mm_loadu_si128((const __m128i *)rsi); + __m128i tmph = _mm_srli_epi16(tmpl, 4); + const __m128i lowMask = _mm_set1_epi8(0xF); + tmpl = _mm_and_si128(lowMask, tmpl); + tmph = _mm_and_si128(lowMask, tmph); + return MM256_SET_M128I(tmph, tmpl); +} + +// add int16_t pairwise and return as float vector +static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) { + const __m128i ones = _mm_set1_epi16(1); + const __m128i summed_pairsl = _mm_madd_epi16(ones, xl); + const __m128i summed_pairsh = _mm_madd_epi16(ones, xh); + const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl); + return _mm256_cvtepi32_ps(summed_pairs); +} + +static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { + const __m128i axl = _mm256_castsi256_si128(ax); + const __m128i axh = _mm256_extractf128_si256(ax, 1); + const __m128i syl = _mm256_castsi256_si128(sy); + const __m128i syh = _mm256_extractf128_si256(sy, 1); + // Perform multiplication and create 16-bit values + const __m128i dotl = _mm_maddubs_epi16(axl, syl); + const __m128i doth = _mm_maddubs_epi16(axh, syh); + return sum_i16_pairs_float(doth, dotl); +} + +// multiply int8_t, add results pairwise twice and return as float vector +static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { + const __m128i xl = _mm256_castsi256_si128(x); + const __m128i xh = _mm256_extractf128_si256(x, 1); + const __m128i yl = _mm256_castsi256_si128(y); + const __m128i yh = _mm256_extractf128_si256(y, 1); + // Get absolute values of x vectors + const __m128i axl = _mm_sign_epi8(xl, xl); + const __m128i axh = _mm_sign_epi8(xh, xh); + // Sign the values of the y vectors + const __m128i syl = _mm_sign_epi8(yl, xl); + const __m128i syh = _mm_sign_epi8(yh, xh); + // Perform multiplication and create 16-bit values + const __m128i dotl = _mm_maddubs_epi16(axl, syl); + const __m128i doth = _mm_maddubs_epi16(axh, syh); + return sum_i16_pairs_float(doth, dotl); +} + +static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) +{ + // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh + const __m128i lowByte = _mm_set1_epi16( 0xFF ); + __m128i high = _mm_andnot_si128( lowByte, bytes1 ); + __m128i low = _mm_and_si128( lowByte, bytes1 ); + high = _mm_srli_epi16( high, 4 ); + bytes1 = _mm_or_si128( low, high ); + high = _mm_andnot_si128( lowByte, bytes2 ); + low = _mm_and_si128( lowByte, bytes2 ); + high = _mm_srli_epi16( high, 4 ); + bytes2 = _mm_or_si128( low, high ); + + return _mm_packus_epi16( bytes1, bytes2); +} +#endif +#elif defined(__SSSE3__) +// horizontally add 4x4 floats +static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) { + __m128 res_0 =_mm_hadd_ps(a, b); + __m128 res_1 =_mm_hadd_ps(c, d); + __m128 res =_mm_hadd_ps(res_0, res_1); + res =_mm_hadd_ps(res, res); + res =_mm_hadd_ps(res, res); + + return _mm_cvtss_f32(res); +} +#endif // __AVX__ || __AVX2__ || __AVX512F__ +#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) + +#if defined(__ARM_NEON) +#if !defined(__aarch64__) + +// 64-bit compatibility + +// vaddvq_s16 +// vpaddq_s16 +// vaddvq_s32 +// vaddvq_f32 +// vmaxvq_f32 +// vcvtnq_s32_f32 + +inline static int32_t vaddvq_s16(int16x8_t v) { + return + (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) + + (int32_t)vgetq_lane_s16(v, 
2) + (int32_t)vgetq_lane_s16(v, 3) + + (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) + + (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7); +} + +inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) { + int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a)); + int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b)); + return vcombine_s16(a0, b0); +} + +inline static int32_t vaddvq_s32(int32x4_t v) { + return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); +} + +inline static float vaddvq_f32(float32x4_t v) { + return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); +} + +inline static float vmaxvq_f32(float32x4_t v) { + return + MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), + MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); +} + +inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { + int32x4_t res; + + res[0] = roundf(vgetq_lane_f32(v, 0)); + res[1] = roundf(vgetq_lane_f32(v, 1)); + res[2] = roundf(vgetq_lane_f32(v, 2)); + res[3] = roundf(vgetq_lane_f32(v, 3)); + + return res; +} + +// vld1q_s16_x2 +// vld1q_u8_x2 +// vld1q_u8_x4 +// vld1q_s8_x2 +// vld1q_s8_x4 +// TODO: double-check these work correctly + +typedef struct ggml_v3_int16x8x2_t { + int16x8_t val[2]; +} ggml_v3_int16x8x2_t; + +inline static ggml_v3_int16x8x2_t ggml_v3_vld1q_s16_x2(const int16_t * ptr) { + ggml_v3_int16x8x2_t res; + + res.val[0] = vld1q_s16(ptr + 0); + res.val[1] = vld1q_s16(ptr + 8); + + return res; +} + +typedef struct ggml_v3_uint8x16x2_t { + uint8x16_t val[2]; +} ggml_v3_uint8x16x2_t; + +inline static ggml_v3_uint8x16x2_t ggml_v3_vld1q_u8_x2(const uint8_t * ptr) { + ggml_v3_uint8x16x2_t res; + + res.val[0] = vld1q_u8(ptr + 0); + res.val[1] = vld1q_u8(ptr + 16); + + return res; +} + +typedef struct ggml_v3_uint8x16x4_t { + uint8x16_t val[4]; +} ggml_v3_uint8x16x4_t; + +inline static ggml_v3_uint8x16x4_t ggml_v3_vld1q_u8_x4(const uint8_t * ptr) { + ggml_v3_uint8x16x4_t res; + + res.val[0] = vld1q_u8(ptr + 0); + res.val[1] = vld1q_u8(ptr + 16); + res.val[2] = vld1q_u8(ptr + 32); + res.val[3] = vld1q_u8(ptr + 48); + + return res; +} + +typedef struct ggml_v3_int8x16x2_t { + int8x16_t val[2]; +} ggml_v3_int8x16x2_t; + +inline static ggml_v3_int8x16x2_t ggml_v3_vld1q_s8_x2(const int8_t * ptr) { + ggml_v3_int8x16x2_t res; + + res.val[0] = vld1q_s8(ptr + 0); + res.val[1] = vld1q_s8(ptr + 16); + + return res; +} + +typedef struct ggml_v3_int8x16x4_t { + int8x16_t val[4]; +} ggml_v3_int8x16x4_t; + +inline static ggml_v3_int8x16x4_t ggml_v3_vld1q_s8_x4(const int8_t * ptr) { + ggml_v3_int8x16x4_t res; + + res.val[0] = vld1q_s8(ptr + 0); + res.val[1] = vld1q_s8(ptr + 16); + res.val[2] = vld1q_s8(ptr + 32); + res.val[3] = vld1q_s8(ptr + 48); + + return res; +} + +#else + +#define ggml_v3_int16x8x2_t int16x8x2_t +#define ggml_v3_uint8x16x2_t uint8x16x2_t +#define ggml_v3_uint8x16x4_t uint8x16x4_t +#define ggml_v3_int8x16x2_t int8x16x2_t +#define ggml_v3_int8x16x4_t int8x16x4_t + +#define ggml_v3_vld1q_s16_x2 vld1q_s16_x2 +#define ggml_v3_vld1q_u8_x2 vld1q_u8_x2 +#define ggml_v3_vld1q_u8_x4 vld1q_u8_x4 +#define ggml_v3_vld1q_s8_x2 vld1q_s8_x2 +#define ggml_v3_vld1q_s8_x4 vld1q_s8_x4 + +#endif + +#if !defined(__ARM_FEATURE_DOTPROD) + +inline static int32x4_t ggml_v3_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) { + const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b)); + const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b)); + + return vaddq_s32(acc, 
vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1))); +} + +#else + +#define ggml_v3_vdotq_s32(a, b, c) vdotq_s32(a, b, c) + +#endif + +#endif + +#if defined(__ARM_NEON) || defined(__wasm_simd128__) +#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s +#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s) +#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s) +#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s) +#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s) +#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s) +#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s) +#define B8(c,s ) B7(c,s, c), B7(c,s, s) + +// precomputed tables for expanding 8bits to 8 bytes: +static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4 +static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 +#endif + +// reference implementation for deterministic creation of model files +static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { + static const int qk = QK4_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; + if (amax < fabsf(v)) { + amax = fabsf(v); + max = v; + } + } + + const float d = max / -8; + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_V3_FP32_TO_FP16(d); + + for (int j = 0; j < qk/2; ++j) { + const float x0 = x[i*qk + 0 + j]*id; + const float x1 = x[i*qk + qk/2 + j]*id; + + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); + + y[i].qs[j] = xi0; + y[i].qs[j] |= xi1 << 4; + } + } +} + +static void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { + quantize_row_q4_0_reference(x, y, k); +} + +static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) { + const int qk = QK4_1; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + float min = FLT_MAX; + float max = -FLT_MAX; + + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; + + if (v < min) min = v; + if (v > max) max = v; + } + + const float d = (max - min) / ((1 << 4) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_V3_FP32_TO_FP16(d); + y[i].m = GGML_V3_FP32_TO_FP16(min); + + for (int j = 0; j < qk/2; ++j) { + const float x0 = (x[i*qk + 0 + j] - min)*id; + const float x1 = (x[i*qk + qk/2 + j] - min)*id; + + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); + + y[i].qs[j] = xi0; + y[i].qs[j] |= xi1 << 4; + } + } +} + +static void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { + quantize_row_q4_1_reference(x, y, k); +} + +static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) { + static const int qk = QK5_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; + if (amax < fabsf(v)) { + amax = fabsf(v); + max = v; + } + } + + const float d = max / -16; + const float id = d ? 
1.0f/d : 0.0f; + + y[i].d = GGML_V3_FP32_TO_FP16(d); + + uint32_t qh = 0; + + for (int j = 0; j < qk/2; ++j) { + const float x0 = x[i*qk + 0 + j]*id; + const float x1 = x[i*qk + qk/2 + j]*id; + + const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f)); + const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f)); + + y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); + + // get the 5-th bit and store it in qh at the right position + qh |= ((xi0 & 0x10u) >> 4) << (j + 0); + qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2); + } + + memcpy(&y[i].qh, &qh, sizeof(qh)); + } +} + +static void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) { + quantize_row_q5_0_reference(x, y, k); +} + +static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) { + const int qk = QK5_1; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + float min = FLT_MAX; + float max = -FLT_MAX; + + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; + + if (v < min) min = v; + if (v > max) max = v; + } + + const float d = (max - min) / ((1 << 5) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_V3_FP32_TO_FP16(d); + y[i].m = GGML_V3_FP32_TO_FP16(min); + + uint32_t qh = 0; + + for (int j = 0; j < qk/2; ++j) { + const float x0 = (x[i*qk + 0 + j] - min)*id; + const float x1 = (x[i*qk + qk/2 + j] - min)*id; + + const uint8_t xi0 = (uint8_t)(x0 + 0.5f); + const uint8_t xi1 = (uint8_t)(x1 + 0.5f); + + y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); + + // get the 5-th bit and store it in qh at the right position + qh |= ((xi0 & 0x10u) >> 4) << (j + 0); + qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2); + } + + memcpy(&y[i].qh, &qh, sizeof(y[i].qh)); + } +} + +static void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) { + quantize_row_q5_1_reference(x, y, k); +} + +// reference implementation for deterministic creation of model files +static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) { + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + const float v = x[i*QK8_0 + j]; + amax = MAX(amax, fabsf(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_V3_FP32_TO_FP16(d); + + for (int j = 0; j < QK8_0; ++j) { + const float x0 = x[i*QK8_0 + j]*id; + + y[i].qs[j] = roundf(x0); + } + } +} + +static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { + assert(QK8_0 == 32); + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + block_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + for (int i = 0; i < nb; i++) { + float32x4_t srcv [8]; + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); + + const float amax = vmaxvq_f32(amaxv[0]); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 
1.0f/d : 0.0f; + + y[i].d = GGML_V3_FP32_TO_FP16(d); + + for (int j = 0; j < 8; j++) { + const float32x4_t v = vmulq_n_f32(srcv[j], id); + const int32x4_t vi = vcvtnq_s32_f32(v); + + y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); + } + } +#elif defined(__wasm_simd128__) + for (int i = 0; i < nb; i++) { + v128_t srcv [8]; + v128_t asrcv[8]; + v128_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]); + + const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0), + wasm_f32x4_extract_lane(amaxv[0], 1)), + MAX(wasm_f32x4_extract_lane(amaxv[0], 2), + wasm_f32x4_extract_lane(amaxv[0], 3))); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_V3_FP32_TO_FP16(d); + + for (int j = 0; j < 8; j++) { + const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); + const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v); + + y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0); + y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1); + y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2); + y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3); + } + } +#elif defined(__AVX2__) || defined(__AVX__) + for (int i = 0; i < nb; i++) { + // Load elements into 4 AVX vectors + __m256 v0 = _mm256_loadu_ps( x ); + __m256 v1 = _mm256_loadu_ps( x + 8 ); + __m256 v2 = _mm256_loadu_ps( x + 16 ); + __m256 v3 = _mm256_loadu_ps( x + 24 ); + x += 32; + + // Compute max(abs(e)) for the block + const __m256 signBit = _mm256_set1_ps( -0.0f ); + __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); + + __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); + max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); + max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); + const float maxScalar = _mm_cvtss_f32( max4 ); + + // Quantize these floats + const float d = maxScalar / 127.f; + y[i].d = GGML_V3_FP32_TO_FP16(d); + const float id = ( maxScalar != 0.0f ) ? 
127.f / maxScalar : 0.0f; + const __m256 mul = _mm256_set1_ps( id ); + + // Apply the multiplier + v0 = _mm256_mul_ps( v0, mul ); + v1 = _mm256_mul_ps( v1, mul ); + v2 = _mm256_mul_ps( v2, mul ); + v3 = _mm256_mul_ps( v3, mul ); + + // Round to nearest integer + v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); + v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); + v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); + v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); + + // Convert floats to integers + __m256i i0 = _mm256_cvtps_epi32( v0 ); + __m256i i1 = _mm256_cvtps_epi32( v1 ); + __m256i i2 = _mm256_cvtps_epi32( v2 ); + __m256i i3 = _mm256_cvtps_epi32( v3 ); + +#if defined(__AVX2__) + // Convert int32 to int16 + i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 + i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 + // Convert int16 to int8 + i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 + + // We got our precious signed bytes, but the order is now wrong + // These AVX2 pack instructions process 16-byte pieces independently + // The following instruction is fixing the order + const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); + i0 = _mm256_permutevar8x32_epi32( i0, perm ); + + _mm256_storeu_si256((__m256i *)y[i].qs, i0); +#else + // Since we don't have in AVX some necessary functions, + // we split the registers in half and call AVX2 analogs from SSE + __m128i ni0 = _mm256_castsi256_si128( i0 ); + __m128i ni1 = _mm256_extractf128_si256( i0, 1); + __m128i ni2 = _mm256_castsi256_si128( i1 ); + __m128i ni3 = _mm256_extractf128_si256( i1, 1); + __m128i ni4 = _mm256_castsi256_si128( i2 ); + __m128i ni5 = _mm256_extractf128_si256( i2, 1); + __m128i ni6 = _mm256_castsi256_si128( i3 ); + __m128i ni7 = _mm256_extractf128_si256( i3, 1); + + // Convert int32 to int16 + ni0 = _mm_packs_epi32( ni0, ni1 ); + ni2 = _mm_packs_epi32( ni2, ni3 ); + ni4 = _mm_packs_epi32( ni4, ni5 ); + ni6 = _mm_packs_epi32( ni6, ni7 ); + // Convert int16 to int8 + ni0 = _mm_packs_epi16( ni0, ni2 ); + ni4 = _mm_packs_epi16( ni4, ni6 ); + + _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); + _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); +#endif + } +#elif defined(__riscv_v_intrinsic) + + size_t vl = __riscv_vsetvl_e32m4(QK8_0); + + for (int i = 0; i < nb; i++) { + // load elements + vfloat32m4_t v_x = __riscv_vle32_v_f32m4(x+i*QK8_0, vl); + + vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl); + vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl); + float amax = __riscv_vfmv_f_s_f32m1_f32(vmax); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 
1.0f/d : 0.0f; + + y[i].d = GGML_V3_FP32_TO_FP16(d); + + vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl); + + // convert to integer + vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl); + vint8m1_t vs = __riscv_vncvt_x_x_w_i8m1(vi, vl); + + // store result + __riscv_vse8_v_i8m1(y[i].qs , vs, vl); + } +#else + GGML_V3_UNUSED(nb); + // scalar + quantize_row_q8_0_reference(x, y, k); +#endif +} + +// reference implementation for deterministic creation of model files +static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) { + assert(QK8_1 == 32); + assert(k % QK8_1 == 0); + const int nb = k / QK8_1; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_1; j++) { + const float v = x[i*QK8_1 + j]; + amax = MAX(amax, fabsf(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = d; + + int sum = 0; + + for (int j = 0; j < QK8_1/2; ++j) { + const float v0 = x[i*QK8_1 + j]*id; + const float v1 = x[i*QK8_1 + QK8_1/2 + j]*id; + + y[i].qs[ j] = roundf(v0); + y[i].qs[QK8_1/2 + j] = roundf(v1); + + sum += y[i].qs[ j]; + sum += y[i].qs[QK8_1/2 + j]; + } + + y[i].s = sum*d; + } +} + +static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { + assert(k % QK8_1 == 0); + const int nb = k / QK8_1; + + block_q8_1 * restrict y = vy; + +#if defined(__ARM_NEON) + for (int i = 0; i < nb; i++) { + float32x4_t srcv [8]; + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); + + const float amax = vmaxvq_f32(amaxv[0]); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = d; + + int32x4_t accv = vdupq_n_s32(0); + + for (int j = 0; j < 8; j++) { + const float32x4_t v = vmulq_n_f32(srcv[j], id); + const int32x4_t vi = vcvtnq_s32_f32(v); + + y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); + + accv = vaddq_s32(accv, vi); + } + + y[i].s = d * vaddvq_s32(accv); + } +#elif defined(__wasm_simd128__) + for (int i = 0; i < nb; i++) { + v128_t srcv [8]; + v128_t asrcv[8]; + v128_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]); + + const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0), + wasm_f32x4_extract_lane(amaxv[0], 1)), + MAX(wasm_f32x4_extract_lane(amaxv[0], 2), + wasm_f32x4_extract_lane(amaxv[0], 3))); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 
1.0f/d : 0.0f; + + y[i].d = d; + + v128_t accv = wasm_i32x4_splat(0); + + for (int j = 0; j < 8; j++) { + const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); + const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v); + + y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0); + y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1); + y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2); + y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3); + + accv = wasm_i32x4_add(accv, vi); + } + + y[i].s = d * (wasm_i32x4_extract_lane(accv, 0) + + wasm_i32x4_extract_lane(accv, 1) + + wasm_i32x4_extract_lane(accv, 2) + + wasm_i32x4_extract_lane(accv, 3)); + } +#elif defined(__AVX2__) || defined(__AVX__) + for (int i = 0; i < nb; i++) { + // Load elements into 4 AVX vectors + __m256 v0 = _mm256_loadu_ps( x ); + __m256 v1 = _mm256_loadu_ps( x + 8 ); + __m256 v2 = _mm256_loadu_ps( x + 16 ); + __m256 v3 = _mm256_loadu_ps( x + 24 ); + x += 32; + + // Compute max(abs(e)) for the block + const __m256 signBit = _mm256_set1_ps( -0.0f ); + __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); + + __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); + max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); + max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); + const float maxScalar = _mm_cvtss_f32( max4 ); + + // Quantize these floats + const float d = maxScalar / 127.f; + y[i].d = d; + const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; + const __m256 mul = _mm256_set1_ps( id ); + + // Apply the multiplier + v0 = _mm256_mul_ps( v0, mul ); + v1 = _mm256_mul_ps( v1, mul ); + v2 = _mm256_mul_ps( v2, mul ); + v3 = _mm256_mul_ps( v3, mul ); + + // Round to nearest integer + v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); + v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); + v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); + v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); + + // Convert floats to integers + __m256i i0 = _mm256_cvtps_epi32( v0 ); + __m256i i1 = _mm256_cvtps_epi32( v1 ); + __m256i i2 = _mm256_cvtps_epi32( v2 ); + __m256i i3 = _mm256_cvtps_epi32( v3 ); + +#if defined(__AVX2__) + // Compute the sum of the quants and set y[i].s + y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))); + + // Convert int32 to int16 + i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 + i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 + // Convert int16 to int8 + i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 + + // We got our precious signed bytes, but the order is now wrong + // These AVX2 pack instructions process 16-byte pieces independently + // The following instruction is fixing the order + const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); + i0 = _mm256_permutevar8x32_epi32( i0, perm ); + + _mm256_storeu_si256((__m256i *)y[i].qs, i0); +#else + // Since we don't have in AVX some necessary functions, + // we split the registers in half and call AVX2 analogs from SSE + __m128i ni0 = _mm256_castsi256_si128( i0 ); + __m128i ni1 = _mm256_extractf128_si256( i0, 1); + __m128i ni2 = _mm256_castsi256_si128( i1 ); + __m128i ni3 = 
_mm256_extractf128_si256( i1, 1); + __m128i ni4 = _mm256_castsi256_si128( i2 ); + __m128i ni5 = _mm256_extractf128_si256( i2, 1); + __m128i ni6 = _mm256_castsi256_si128( i3 ); + __m128i ni7 = _mm256_extractf128_si256( i3, 1); + + // Compute the sum of the quants and set y[i].s + const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3)); + const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7)); + y[i].s = d * hsum_i32_4(_mm_add_epi32(s0, s1)); + + // Convert int32 to int16 + ni0 = _mm_packs_epi32( ni0, ni1 ); + ni2 = _mm_packs_epi32( ni2, ni3 ); + ni4 = _mm_packs_epi32( ni4, ni5 ); + ni6 = _mm_packs_epi32( ni6, ni7 ); + // Convert int16 to int8 + ni0 = _mm_packs_epi16( ni0, ni2 ); + ni4 = _mm_packs_epi16( ni4, ni6 ); + + _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); + _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); +#endif + } +#elif defined(__riscv_v_intrinsic) + + size_t vl = __riscv_vsetvl_e32m4(QK8_1); + + for (int i = 0; i < nb; i++) { + // load elements + vfloat32m4_t v_x = __riscv_vle32_v_f32m4(x+i*QK8_1, vl); + + vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl); + vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0, vl); + vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl); + float amax = __riscv_vfmv_f_s_f32m1_f32(vmax); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = d; + + vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl); + + // convert to integer + vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl); + vint8m1_t vs = __riscv_vncvt_x_x_w_i8m1(vi, vl); + + // store result + __riscv_vse8_v_i8m1(y[i].qs , vs, vl); + + // compute sum for y[i].s + vint16m1_t tmp2 = __riscv_vmv_v_x_i16m1(0, vl); + vint16m1_t vwrs = __riscv_vwredsum_vs_i8m1_i16m1(vs, tmp2, vl); + + // set y[i].s + int sum = __riscv_vmv_x_s_i16m1_i16(vwrs); + y[i].s = sum*d; + } +#else + GGML_V3_UNUSED(nb); + // scalar + quantize_row_q8_1_reference(x, y, k); +#endif +} + +static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) { + static const int qk = QK4_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = GGML_V3_FP16_TO_FP32(x[i].d); + + for (int j = 0; j < qk/2; ++j) { + const int x0 = (x[i].qs[j] & 0x0F) - 8; + const int x1 = (x[i].qs[j] >> 4) - 8; + + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; + } + } +} + +static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) { + static const int qk = QK4_1; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = GGML_V3_FP16_TO_FP32(x[i].d); + const float m = GGML_V3_FP16_TO_FP32(x[i].m); + + for (int j = 0; j < qk/2; ++j) { + const int x0 = (x[i].qs[j] & 0x0F); + const int x1 = (x[i].qs[j] >> 4); + + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; + } + } +} + +static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) { + static const int qk = QK5_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = GGML_V3_FP16_TO_FP32(x[i].d); + + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; + const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; + + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + 
qk/2] = x1*d; + } + } +} + +static void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k) { + static const int qk = QK5_1; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = GGML_V3_FP16_TO_FP32(x[i].d); + const float m = GGML_V3_FP16_TO_FP32(x[i].m); + + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int x0 = (x[i].qs[j] & 0x0F) | xh_0; + const int x1 = (x[i].qs[j] >> 4) | xh_1; + + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; + } + } +} + +static void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k) { + static const int qk = QK8_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = GGML_V3_FP16_TO_FP32(x[i].d); + + for (int j = 0; j < qk; ++j) { + y[i*qk + j] = x[i].qs[j]*d; + } + } +} + +// +// 2-6 bit quantization in super-blocks +// + +// +// ===================== Helper functions +// +static inline int nearest_int(float fval) { + assert(fval <= 4194303.f); + float val = fval + 12582912.f; + int i; memcpy(&i, &val, sizeof(int)); + return (i & 0x007fffff) - 0x00400000; +} + +static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type) { + float max = 0; + float amax = 0; + for (int i = 0; i < n; ++i) { + float ax = fabsf(x[i]); + if (ax > amax) { amax = ax; max = x[i]; } + } + if (amax < 1e-30f) { // all zero + for (int i = 0; i < n; ++i) { + L[i] = 0; + } + return 0.f; + } + float iscale = -nmax / max; + if (rmse_type == 0) { + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale * x[i]); + L[i] = nmax + MAX(-nmax, MIN(nmax-1, l)); + } + return 1/iscale; + } + bool return_early = false; + if (rmse_type < 0) { + rmse_type = -rmse_type; + return_early = true; + } + int weight_type = rmse_type%2; + float sumlx = 0; + float suml2 = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale * x[i]); + l = MAX(-nmax, MIN(nmax-1, l)); + L[i] = l + nmax; + float w = weight_type == 1 ? x[i] * x[i] : 1; + sumlx += w*x[i]*l; + suml2 += w*l*l; + } + float scale = sumlx/suml2; + if (return_early) return suml2 > 0 ? 0.5f*(scale + 1/iscale) : 1/iscale; + float best = scale * sumlx; + for (int is = -9; is <= 9; ++is) { + if (is == 0) { + continue; + } + iscale = -(nmax + 0.1f*is) / max; + sumlx = suml2 = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale * x[i]); + l = MAX(-nmax, MIN(nmax-1, l)); + float w = weight_type == 1 ? 
x[i] * x[i] : 1; + sumlx += w*x[i]*l; + suml2 += w*l*l; + } + if (suml2 > 0 && sumlx*sumlx > best*suml2) { + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale * x[i]); + L[i] = nmax + MAX(-nmax, MIN(nmax-1, l)); + } + scale = sumlx/suml2; best = scale*sumlx; + } + } + return scale; +} + +static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) { + float max = 0; + float amax = 0; + for (int i = 0; i < n; ++i) { + float ax = fabsf(x[i]); + if (ax > amax) { amax = ax; max = x[i]; } + } + if (!amax) { // all zero + for (int i = 0; i < n; ++i) { L[i] = 0; } + return 0.f; + } + float iscale = -nmax / max; + if (do_rmse) { + float sumlx = 0; + float suml2 = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale * x[i]); + l = MAX(-nmax, MIN(nmax-1, l)); + L[i] = l; + float w = x[i]*x[i]; + sumlx += w*x[i]*l; + suml2 += w*l*l; + } + for (int itry = 0; itry < 5; ++itry) { + int n_changed = 0; + for (int i = 0; i < n; ++i) { + float w = x[i]*x[i]; + float slx = sumlx - w*x[i]*L[i]; + if (slx > 0) { + float sl2 = suml2 - w*L[i]*L[i]; + int new_l = nearest_int(x[i] * sl2 / slx); + new_l = MAX(-nmax, MIN(nmax-1, new_l)); + if (new_l != L[i]) { + slx += w*x[i]*new_l; + sl2 += w*new_l*new_l; + if (sl2 > 0 && slx*slx*suml2 > sumlx*sumlx*sl2) { + L[i] = new_l; sumlx = slx; suml2 = sl2; + ++n_changed; + } + } + } + } + if (!n_changed) { + break; + } + } + for (int i = 0; i < n; ++i) { + L[i] += nmax; + } + return sumlx / suml2; + } + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale * x[i]); + l = MAX(-nmax, MIN(nmax-1, l)); + L[i] = l + nmax; + } + return 1/iscale; +} + +static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, + int ntry, float alpha) { + float min = x[0]; + float max = x[0]; + for (int i = 1; i < n; ++i) { + if (x[i] < min) min = x[i]; + if (x[i] > max) max = x[i]; + } + if (max == min) { + for (int i = 0; i < n; ++i) L[i] = 0; + *the_min = 0; + return 0.f; + } + if (min > 0) min = 0; + float iscale = nmax/(max - min); + float scale = 1/iscale; + for (int itry = 0; itry < ntry; ++itry) { + float sumlx = 0; int suml2 = 0; + bool did_change = false; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale*(x[i] - min)); + l = MAX(0, MIN(nmax, l)); + if (l != L[i]) { + L[i] = l; + did_change = true; + } + sumlx += (x[i] - min)*l; + suml2 += l*l; + } + scale = sumlx/suml2; + float sum = 0; + for (int i = 0; i < n; ++i) { + sum += x[i] - scale*L[i]; + } + min = alpha*min + (1 - alpha)*sum/n; + if (min > 0) min = 0; + iscale = 1/scale; + if (!did_change) break; + } + *the_min = -min; + return scale; +} + +static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights, + uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux, + float rmin, float rdelta, int nstep, bool use_mad) { + float min = x[0]; + float max = x[0]; + float sum_w = weights[0]; + float sum_x = sum_w * x[0]; +#ifdef HAVE_BUGGY_APPLE_LINKER + // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7 + for (volatile int i = 1; i < n; ++i) { +#else + for (int i = 1; i < n; ++i) { +#endif + if (x[i] < min) min = x[i]; + if (x[i] > max) max = x[i]; + float w = weights[i]; + sum_w += w; + sum_x += w * x[i]; + } + if (min > 0) min = 0; + if (max == min) { + for (int i = 0; i < n; ++i) L[i] = 0; + *the_min = -min; + return 0.f; + } + float iscale = nmax/(max - min); + float scale = 1/iscale; + float best_mad = 0; + 
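+    // note: best_mad is a weighted mean absolute deviation only when use_mad is true; otherwise it accumulates a weighted squared error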
for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale*(x[i] - min)); + L[i] = MAX(0, MIN(nmax, l)); + float diff = scale * L[i] + min - x[i]; + diff = use_mad ? fabsf(diff) : diff * diff; + float w = weights[i]; + best_mad += w * diff; + } + if (nstep < 1) { + *the_min = -min; + return scale; + } + for (int is = 0; is <= nstep; ++is) { + iscale = (rmin + rdelta*is + nmax)/(max - min); + float sum_l = 0, sum_l2 = 0, sum_xl = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale*(x[i] - min)); + l = MAX(0, MIN(nmax, l)); + Laux[i] = l; + float w = weights[i]; + sum_l += w*l; + sum_l2 += w*l*l; + sum_xl += w*l*x[i]; + } + float D = sum_w * sum_l2 - sum_l * sum_l; + if (D > 0) { + float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D; + float this_min = (sum_l2 * sum_x - sum_l * sum_xl)/D; + if (this_min > 0) { + this_min = 0; + this_scale = sum_xl / sum_l2; + } + float mad = 0; + for (int i = 0; i < n; ++i) { + float diff = this_scale * Laux[i] + this_min - x[i]; + diff = use_mad ? fabsf(diff) : diff * diff; + float w = weights[i]; + mad += w * diff; + } + if (mad < best_mad) { + for (int i = 0; i < n; ++i) { + L[i] = Laux[i]; + } + best_mad = mad; + scale = this_scale; + min = this_min; + } + } + } + *the_min = -min; + return scale; +} + +#if QK_K == 256 +static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) { + if (j < 4) { + *d = q[j] & 63; *m = q[j + 4] & 63; + } else { + *d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); + *m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); + } +} +#endif + +//========================- 2-bit (de)-quantization + +static void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + uint8_t L[QK_K]; + uint8_t Laux[16]; + float weights[16]; + float mins[QK_K/16]; + float scales[QK_K/16]; + + const float q4scale = 15.f; + + for (int i = 0; i < nb; i++) { + float max_scale = 0; // as we are deducting the min, scales are always positive + float max_min = 0; + for (int j = 0; j < QK_K/16; ++j) { + for (int l = 0; l < 16; ++l) weights[l] = fabsf(x[16*j + l]); + scales[j] = make_qkx2_quants(16, 3, x + 16*j, weights, L + 16*j, &mins[j], Laux, -0.5f, 0.1f, 15, true); + float scale = scales[j]; + if (scale > max_scale) { + max_scale = scale; + } + float min = mins[j]; + if (min > max_min) { + max_min = min; + } + } + + if (max_scale > 0) { + float iscale = q4scale/max_scale; + for (int j = 0; j < QK_K/16; ++j) { + int l = nearest_int(iscale*scales[j]); + y[i].scales[j] = l; + } + y[i].d = GGML_V3_FP32_TO_FP16(max_scale/q4scale); + } else { + for (int j = 0; j < QK_K/16; ++j) y[i].scales[j] = 0; + y[i].d = GGML_V3_FP32_TO_FP16(0.f); + } + if (max_min > 0) { + float iscale = q4scale/max_min; + for (int j = 0; j < QK_K/16; ++j) { + int l = nearest_int(iscale*mins[j]); + y[i].scales[j] |= (l << 4); + } + y[i].dmin = GGML_V3_FP32_TO_FP16(max_min/q4scale); + } else { + y[i].dmin = GGML_V3_FP32_TO_FP16(0.f); + } + for (int j = 0; j < QK_K/16; ++j) { + const float d = GGML_V3_FP16_TO_FP32(y[i].d) * (y[i].scales[j] & 0xF); + if (!d) continue; + const float dm = GGML_V3_FP16_TO_FP32(y[i].dmin) * (y[i].scales[j] >> 4); + for (int ii = 0; ii < 16; ++ii) { + int l = nearest_int((x[16*j + ii] + dm)/d); + l = MAX(0, MIN(3, l)); + L[16*j + ii] = l; + } + } + +#if QK_K == 256 + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] 
<< 6); + } + } +#else + for (int l = 0; l < 16; ++l) { + y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6); + } +#endif + + x += QK_K; + + } +} + +static void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + for (int i = 0; i < nb; i++) { + + const float d = GGML_V3_FP16_TO_FP32(x[i].d); + const float min = GGML_V3_FP16_TO_FP32(x[i].dmin); + + const uint8_t * q = x[i].qs; + +#if QK_K == 256 + int is = 0; + float dl, ml; + for (int n = 0; n < QK_K; n += 128) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + + uint8_t sc = x[i].scales[is++]; + dl = d * (sc & 0xF); ml = min * (sc >> 4); + for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml; + + sc = x[i].scales[is++]; + dl = d * (sc & 0xF); ml = min * (sc >> 4); + for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml; + + shift += 2; + } + q += 32; + } +#else + float dl1 = d * (x[i].scales[0] & 0xF), ml1 = min * (x[i].scales[0] >> 4); + float dl2 = d * (x[i].scales[1] & 0xF), ml2 = min * (x[i].scales[1] >> 4); + float dl3 = d * (x[i].scales[2] & 0xF), ml3 = min * (x[i].scales[2] >> 4); + float dl4 = d * (x[i].scales[3] & 0xF), ml4 = min * (x[i].scales[3] >> 4); + for (int l = 0; l < 16; ++l) { + y[l+ 0] = dl1 * ((int8_t)((q[l] >> 0) & 3)) - ml1; + y[l+16] = dl2 * ((int8_t)((q[l] >> 2) & 3)) - ml2; + y[l+32] = dl3 * ((int8_t)((q[l] >> 4) & 3)) - ml3; + y[l+48] = dl4 * ((int8_t)((q[l] >> 6) & 3)) - ml4; + } + y += QK_K; +#endif + } +} + +static void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) { + quantize_row_q2_K_reference(x, vy, k); +} + +size_t ggml_v3_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { + (void)hist; // TODO: collect histograms + + for (int j = 0; j < n; j += k) { + block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K; + quantize_row_q2_K_reference(src + j, y, k); + } + return (n/QK_K*sizeof(block_q2_K)); +} + +//========================= 3-bit (de)-quantization + +static void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + int8_t L[QK_K]; + float scales[QK_K / 16]; + + for (int i = 0; i < nb; i++) { + + float max_scale = 0; + float amax = 0; + for (int j = 0; j < QK_K/16; ++j) { + scales[j] = make_q3_quants(16, 4, x + 16*j, L + 16*j, true); + float scale = fabsf(scales[j]); + if (scale > amax) { + amax = scale; max_scale = scales[j]; + } + } + +#if QK_K == 256 + memset(y[i].scales, 0, 12); + if (max_scale) { + float iscale = -32.f/max_scale; + for (int j = 0; j < QK_K/16; ++j) { + int8_t l = nearest_int(iscale*scales[j]); + l = MAX(-32, MIN(31, l)) + 32; + if (j < 8) { + y[i].scales[j] = l & 0xF; + } else { + y[i].scales[j-8] |= ((l & 0xF) << 4); + } + l >>= 4; + y[i].scales[j%4 + 8] |= (l << (2*(j/4))); + } + y[i].d = GGML_V3_FP32_TO_FP16(1/iscale); + } else { + y[i].d = GGML_V3_FP32_TO_FP16(0.f); + } + + int8_t sc; + for (int j = 0; j < QK_K/16; ++j) { + sc = j < 8 ? 
y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4; + sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32; + float d = GGML_V3_FP16_TO_FP32(y[i].d) * sc; + if (!d) { + continue; + } + for (int ii = 0; ii < 16; ++ii) { + int l = nearest_int(x[16*j + ii]/d); + l = MAX(-4, MIN(3, l)); + L[16*j + ii] = l + 4; + } + } +#else + if (max_scale) { + float iscale = -8.f/max_scale; + for (int j = 0; j < QK_K/16; j+=2) { + int l1 = nearest_int(iscale*scales[j]); + l1 = 8 + MAX(-8, MIN(7, l1)); + int l2 = nearest_int(iscale*scales[j+1]); + l2 = 8 + MAX(-8, MIN(7, l2)); + y[i].scales[j/2] = l1 | (l2 << 4); + } + y[i].d = GGML_V3_FP32_TO_FP16(1/iscale); + } else { + for (int j = 0; j < QK_K/16; j+=2) { + y[i].scales[j/2] = 0; + } + y[i].d = GGML_V3_FP32_TO_FP16(0.f); + } + for (int j = 0; j < QK_K/16; ++j) { + int s = j%2 == 0 ? y[i].scales[j/2] & 0xF : y[i].scales[j/2] >> 4; + float d = GGML_V3_FP16_TO_FP32(y[i].d) * (s - 8); + if (!d) { + continue; + } + for (int ii = 0; ii < 16; ++ii) { + int l = nearest_int(x[16*j + ii]/d); + l = MAX(-4, MIN(3, l)); + L[16*j + ii] = l + 4; + } + } +#endif + + memset(y[i].hmask, 0, QK_K/8); + // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc. + int m = 0; + uint8_t hm = 1; + for (int j = 0; j < QK_K; ++j) { + if (L[j] > 3) { + y[i].hmask[m] |= hm; + L[j] -= 4; + } + if (++m == QK_K/8) { + m = 0; hm <<= 1; + } + } +#if QK_K == 256 + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6); + } + } +#else + for (int l = 0; l < 16; ++l) { + y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6); + } +#endif + + x += QK_K; + } +} + +#if QK_K == 256 +static void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + const uint32_t kmask1 = 0x03030303; + const uint32_t kmask2 = 0x0f0f0f0f; + + uint32_t aux[4]; + const int8_t * scales = (const int8_t*)aux; + + for (int i = 0; i < nb; i++) { + + const float d_all = GGML_V3_FP16_TO_FP32(x[i].d); + + const uint8_t * restrict q = x[i].qs; + const uint8_t * restrict hm = x[i].hmask; + uint8_t m = 1; + + memcpy(aux, x[i].scales, 12); + uint32_t tmp = aux[2]; + aux[2] = ((aux[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + aux[3] = ((aux[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + aux[0] = (aux[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + aux[1] = (aux[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + + int is = 0; + float dl; + for (int n = 0; n < QK_K; n += 128) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + + dl = d_all * (scales[is++] - 32); + for (int l = 0; l < 16; ++l) { + *y++ = dl * ((int8_t)((q[l+ 0] >> shift) & 3) - ((hm[l+ 0] & m) ? 0 : 4)); + } + + dl = d_all * (scales[is++] - 32); + for (int l = 0; l < 16; ++l) { + *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3) - ((hm[l+16] & m) ? 
0 : 4)); + } + + shift += 2; + m <<= 1; + } + q += 32; + } + + } +} +#else +static void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) { + assert(k % QK_K == 0); + assert(QK_K == 64); + const int nb = k / QK_K; + + for (int i = 0; i < nb; i++) { + + const float d_all = GGML_V3_FP16_TO_FP32(x[i].d); + + const uint8_t * restrict q = x[i].qs; + const uint8_t * restrict hm = x[i].hmask; + + const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8); + const float d2 = d_all * ((x[i].scales[0] >> 4) - 8); + const float d3 = d_all * ((x[i].scales[1] & 0xF) - 8); + const float d4 = d_all * ((x[i].scales[1] >> 4) - 8); + + for (int l=0; l<8; ++l) { + uint8_t h = hm[l]; + y[l+ 0] = d1 * ((int8_t)((q[l+0] >> 0) & 3) - ((h & 0x01) ? 0 : 4)); + y[l+ 8] = d1 * ((int8_t)((q[l+8] >> 0) & 3) - ((h & 0x02) ? 0 : 4)); + y[l+16] = d2 * ((int8_t)((q[l+0] >> 2) & 3) - ((h & 0x04) ? 0 : 4)); + y[l+24] = d2 * ((int8_t)((q[l+8] >> 2) & 3) - ((h & 0x08) ? 0 : 4)); + y[l+32] = d3 * ((int8_t)((q[l+0] >> 4) & 3) - ((h & 0x10) ? 0 : 4)); + y[l+40] = d3 * ((int8_t)((q[l+8] >> 4) & 3) - ((h & 0x20) ? 0 : 4)); + y[l+48] = d4 * ((int8_t)((q[l+0] >> 6) & 3) - ((h & 0x40) ? 0 : 4)); + y[l+56] = d4 * ((int8_t)((q[l+8] >> 6) & 3) - ((h & 0x80) ? 0 : 4)); + } + y += QK_K; + } +} +#endif + +static void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) { + quantize_row_q3_K_reference(x, vy, k); +} + +size_t ggml_v3_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { + (void)hist; // TODO: collect histograms + + for (int j = 0; j < n; j += k) { + block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K; + quantize_row_q3_K_reference(src + j, y, k); + } + return (n/QK_K*sizeof(block_q3_K)); +} + +// ====================== 4-bit (de)-quantization + +static void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + uint8_t L[QK_K]; + uint8_t Laux[32]; + float weights[32]; + float mins[QK_K/32]; + float scales[QK_K/32]; + + for (int i = 0; i < nb; i++) { + + float max_scale = 0; // as we are deducting the min, scales are always positive + float max_min = 0; + for (int j = 0; j < QK_K/32; ++j) { + //scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); + float sum_x2 = 0; + for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l]; + float av_x = sqrtf(sum_x2/32); + for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); + scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false); + float scale = scales[j]; + if (scale > max_scale) { + max_scale = scale; + } + float min = mins[j]; + if (min > max_min) { + max_min = min; + } + } + +#if QK_K == 256 + float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f; + float inv_min = max_min > 0 ? 
63.f/max_min : 0.f; + for (int j = 0; j < QK_K/32; ++j) { + uint8_t ls = nearest_int(inv_scale*scales[j]); + uint8_t lm = nearest_int(inv_min*mins[j]); + ls = MIN(63, ls); + lm = MIN(63, lm); + if (j < 4) { + y[i].scales[j] = ls; + y[i].scales[j+4] = lm; + } else { + y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4); + y[i].scales[j-4] |= ((ls >> 4) << 6); + y[i].scales[j-0] |= ((lm >> 4) << 6); + } + } + y[i].d = GGML_V3_FP32_TO_FP16(max_scale/63.f); + y[i].dmin = GGML_V3_FP32_TO_FP16(max_min/63.f); + + uint8_t sc, m; + for (int j = 0; j < QK_K/32; ++j) { + get_scale_min_k4(j, y[i].scales, &sc, &m); + const float d = GGML_V3_FP16_TO_FP32(y[i].d) * sc; + if (!d) continue; + const float dm = GGML_V3_FP16_TO_FP32(y[i].dmin) * m; + for (int ii = 0; ii < 32; ++ii) { + int l = nearest_int((x[32*j + ii] + dm)/d); + l = MAX(0, MIN(15, l)); + L[32*j + ii] = l; + } + } +#else + const float s_factor = 15.f; + float inv_scale = max_scale > 0 ? s_factor/max_scale : 0.f; + float inv_min = max_min > 0 ? s_factor/max_min : 0.f; + int d1 = nearest_int(inv_scale*scales[0]); + int m1 = nearest_int(inv_min*mins[0]); + int d2 = nearest_int(inv_scale*scales[1]); + int m2 = nearest_int(inv_min*mins[1]); + y[i].scales[0] = d1 | (m1 << 4); + y[i].scales[1] = d2 | (m2 << 4); + y[i].d[0] = GGML_V3_FP32_TO_FP16(max_scale/s_factor); + y[i].d[1] = GGML_V3_FP32_TO_FP16(max_min/s_factor); + + float sumlx = 0; + int suml2 = 0; + for (int j = 0; j < QK_K/32; ++j) { + const uint8_t sd = y[i].scales[j] & 0xF; + const uint8_t sm = y[i].scales[j] >> 4; + const float d = GGML_V3_FP16_TO_FP32(y[i].d[0]) * sd; + if (!d) continue; + const float m = GGML_V3_FP16_TO_FP32(y[i].d[1]) * sm; + for (int ii = 0; ii < 32; ++ii) { + int l = nearest_int((x[32*j + ii] + m)/d); + l = MAX(0, MIN(15, l)); + L[32*j + ii] = l; + sumlx += (x[32*j + ii] + m)*l*sd; + suml2 += l*l*sd*sd; + } + } + if (suml2) { + y[i].d[0] = GGML_V3_FP32_TO_FP16(sumlx/suml2); + } +#endif + uint8_t * q = y[i].qs; + for (int j = 0; j < QK_K; j += 64) { + for (int l = 0; l < 32; ++l) q[l] = L[j + l] | (L[j + l + 32] << 4); + q += 32; + } + + x += QK_K; + + } +} + +static void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + for (int i = 0; i < nb; i++) { + + const uint8_t * q = x[i].qs; + +#if QK_K == 256 + + const float d = GGML_V3_FP16_TO_FP32(x[i].d); + const float min = GGML_V3_FP16_TO_FP32(x[i].dmin); + + int is = 0; + uint8_t sc, m; + for (int j = 0; j < QK_K; j += 64) { + get_scale_min_k4(is + 0, x[i].scales, &sc, &m); + const float d1 = d * sc; const float m1 = min * m; + get_scale_min_k4(is + 1, x[i].scales, &sc, &m); + const float d2 = d * sc; const float m2 = min * m; + for (int l = 0; l < 32; ++l) *y++ = d1 * (q[l] & 0xF) - m1; + for (int l = 0; l < 32; ++l) *y++ = d2 * (q[l] >> 4) - m2; + q += 32; is += 2; + } +#else + const float dall = GGML_V3_FP16_TO_FP32(x[i].d[0]); + const float mall = GGML_V3_FP16_TO_FP32(x[i].d[1]); + const float d1 = dall * (x[i].scales[0] & 0xF), m1 = mall * (x[i].scales[0] >> 4); + const float d2 = dall * (x[i].scales[1] & 0xF), m2 = mall * (x[i].scales[1] >> 4); + for (int l = 0; l < 32; ++l) { + y[l+ 0] = d1 * (q[l] & 0xF) - m1; + y[l+32] = d2 * (q[l] >> 4) - m2; + } + y += QK_K; +#endif + + } +} + +static void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) { + assert(k % QK_K == 0); + block_q4_K * restrict y = vy; + quantize_row_q4_K_reference(x, y, k); +} + +size_t ggml_v3_quantize_q4_K(const float * restrict src, void 
* restrict dst, int n, int k, int64_t * restrict hist) { + assert(k % QK_K == 0); + (void)hist; // TODO: collect histograms + + for (int j = 0; j < n; j += k) { + block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K; + quantize_row_q4_K_reference(src + j, y, k); + } + return (n/QK_K*sizeof(block_q4_K)); +} + +// ====================== 5-bit (de)-quantization + +static void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + +#if QK_K == 256 + uint8_t L[QK_K]; + float mins[QK_K/32]; + float scales[QK_K/32]; + float weights[32]; + uint8_t Laux[32]; +#else + int8_t L[QK_K]; + float scales[QK_K/16]; +#endif + + for (int i = 0; i < nb; i++) { + +#if QK_K == 256 + + float max_scale = 0; // as we are deducting the min, scales are always positive + float max_min = 0; + for (int j = 0; j < QK_K/32; ++j) { + //scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); + float sum_x2 = 0; + for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l]; + float av_x = sqrtf(sum_x2/32); + for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); + scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false); + float scale = scales[j]; + if (scale > max_scale) { + max_scale = scale; + } + float min = mins[j]; + if (min > max_min) { + max_min = min; + } + } + + float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f; + float inv_min = max_min > 0 ? 63.f/max_min : 0.f; + for (int j = 0; j < QK_K/32; ++j) { + uint8_t ls = nearest_int(inv_scale*scales[j]); + uint8_t lm = nearest_int(inv_min*mins[j]); + ls = MIN(63, ls); + lm = MIN(63, lm); + if (j < 4) { + y[i].scales[j] = ls; + y[i].scales[j+4] = lm; + } else { + y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4); + y[i].scales[j-4] |= ((ls >> 4) << 6); + y[i].scales[j-0] |= ((lm >> 4) << 6); + } + } + y[i].d = GGML_V3_FP32_TO_FP16(max_scale/63.f); + y[i].dmin = GGML_V3_FP32_TO_FP16(max_min/63.f); + + uint8_t sc, m; + for (int j = 0; j < QK_K/32; ++j) { + get_scale_min_k4(j, y[i].scales, &sc, &m); + const float d = GGML_V3_FP16_TO_FP32(y[i].d) * sc; + if (!d) continue; + const float dm = GGML_V3_FP16_TO_FP32(y[i].dmin) * m; + for (int ii = 0; ii < 32; ++ii) { + int l = nearest_int((x[32*j + ii] + dm)/d); + l = MAX(0, MIN(31, l)); + L[32*j + ii] = l; + } + } + + uint8_t * restrict qh = y[i].qh; + uint8_t * restrict ql = y[i].qs; + memset(qh, 0, QK_K/8); + + uint8_t m1 = 1, m2 = 2; + for (int n = 0; n < QK_K; n += 64) { + for (int j = 0; j < 32; ++j) { + int l1 = L[n + j]; + if (l1 > 15) { + l1 -= 16; qh[j] |= m1; + } + int l2 = L[n + j + 32]; + if (l2 > 15) { + l2 -= 16; qh[j] |= m2; + } + ql[j] = l1 | (l2 << 4); + } + m1 <<= 2; m2 <<= 2; + ql += 32; + } +#else + float max_scale = 0, amax = 0; + for (int j = 0; j < QK_K/16; ++j) { + scales[j] = make_qx_quants(16, 16, x + 16*j, L + 16*j, 1); + float abs_scale = fabsf(scales[j]); + if (abs_scale > amax) { + amax = abs_scale; + max_scale = scales[j]; + } + } + + float iscale = -128.f/max_scale; + for (int j = 0; j < QK_K/16; ++j) { + int l = nearest_int(iscale*scales[j]); + y[i].scales[j] = MAX(-128, MIN(127, l)); + } + y[i].d = GGML_V3_FP32_TO_FP16(1/iscale); + + for (int j = 0; j < QK_K/16; ++j) { + const float d = GGML_V3_FP16_TO_FP32(y[i].d) * y[i].scales[j]; + if (!d) continue; + for (int ii = 0; ii < 16; ++ii) { + int l = nearest_int(x[16*j + ii]/d); + l = MAX(-16, MIN(15, l)); + L[16*j + ii] = l + 16; + } + } + + uint8_t * restrict qh = 
y[i].qh; + uint8_t * restrict ql = y[i].qs; + memset(qh, 0, QK_K/8); + + for (int j = 0; j < 32; ++j) { + int jm = j%8; + int is = j/8; + int l1 = L[j]; + if (l1 > 15) { + l1 -= 16; qh[jm] |= (1 << is); + } + int l2 = L[j + 32]; + if (l2 > 15) { + l2 -= 16; qh[jm] |= (1 << (4 + is)); + } + ql[j] = l1 | (l2 << 4); + } +#endif + + x += QK_K; + + } +} + +static void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + for (int i = 0; i < nb; i++) { + + const uint8_t * ql = x[i].qs; + const uint8_t * qh = x[i].qh; + +#if QK_K == 256 + + const float d = GGML_V3_FP16_TO_FP32(x[i].d); + const float min = GGML_V3_FP16_TO_FP32(x[i].dmin); + + int is = 0; + uint8_t sc, m; + uint8_t u1 = 1, u2 = 2; + for (int j = 0; j < QK_K; j += 64) { + get_scale_min_k4(is + 0, x[i].scales, &sc, &m); + const float d1 = d * sc; const float m1 = min * m; + get_scale_min_k4(is + 1, x[i].scales, &sc, &m); + const float d2 = d * sc; const float m2 = min * m; + for (int l = 0; l < 32; ++l) *y++ = d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1; + for (int l = 0; l < 32; ++l) *y++ = d2 * ((ql[l] >> 4) + (qh[l] & u2 ? 16 : 0)) - m2; + ql += 32; is += 2; + u1 <<= 2; u2 <<= 2; + } +#else + float d = GGML_V3_FP16_TO_FP32(x[i].d); + const int8_t * restrict s = x[i].scales; + for (int l = 0; l < 8; ++l) { + y[l+ 0] = d * s[0] * ((ql[l+ 0] & 0xF) - (qh[l] & 0x01 ? 0 : 16)); + y[l+ 8] = d * s[0] * ((ql[l+ 8] & 0xF) - (qh[l] & 0x02 ? 0 : 16)); + y[l+16] = d * s[1] * ((ql[l+16] & 0xF) - (qh[l] & 0x04 ? 0 : 16)); + y[l+24] = d * s[1] * ((ql[l+24] & 0xF) - (qh[l] & 0x08 ? 0 : 16)); + y[l+32] = d * s[2] * ((ql[l+ 0] >> 4) - (qh[l] & 0x10 ? 0 : 16)); + y[l+40] = d * s[2] * ((ql[l+ 8] >> 4) - (qh[l] & 0x20 ? 0 : 16)); + y[l+48] = d * s[3] * ((ql[l+16] >> 4) - (qh[l] & 0x40 ? 0 : 16)); + y[l+56] = d * s[3] * ((ql[l+24] >> 4) - (qh[l] & 0x80 ? 
0 : 16)); + } + y += QK_K; +#endif + } +} + +static void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) { + assert(k % QK_K == 0); + block_q5_K * restrict y = vy; + quantize_row_q5_K_reference(x, y, k); +} + +size_t ggml_v3_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { + assert(k % QK_K == 0); + (void)hist; // TODO: collect histograms + + for (int j = 0; j < n; j += k) { + block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K; + quantize_row_q5_K_reference(src + j, y, k); + } + return (n/QK_K*sizeof(block_q5_K)); +} + +// ====================== 6-bit (de)-quantization + +static void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + int8_t L[QK_K]; + float scales[QK_K/16]; + + for (int i = 0; i < nb; i++) { + + float max_scale = 0; + float max_abs_scale = 0; + + for (int ib = 0; ib < QK_K/16; ++ib) { + + const float scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1); + scales[ib] = scale; + + const float abs_scale = fabsf(scale); + if (abs_scale > max_abs_scale) { + max_abs_scale = abs_scale; + max_scale = scale; + } + + } + + if (!max_abs_scale) { + memset(&y[i], 0, sizeof(block_q6_K)); + y[i].d = GGML_V3_FP32_TO_FP16(0.f); + x += QK_K; + continue; + } + + float iscale = -128.f/max_scale; + y[i].d = GGML_V3_FP32_TO_FP16(1/iscale); + for (int ib = 0; ib < QK_K/16; ++ib) { + y[i].scales[ib] = MIN(127, nearest_int(iscale*scales[ib])); + } + + for (int j = 0; j < QK_K/16; ++j) { + float d = GGML_V3_FP16_TO_FP32(y[i].d) * y[i].scales[j]; + if (!d) { + continue; + } + for (int ii = 0; ii < 16; ++ii) { + int l = nearest_int(x[16*j + ii]/d); + l = MAX(-32, MIN(31, l)); + L[16*j + ii] = l + 32; + } + } + + uint8_t * restrict ql = y[i].ql; + uint8_t * restrict qh = y[i].qh; +#if QK_K == 256 + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + const uint8_t q1 = L[j + l + 0] & 0xF; + const uint8_t q2 = L[j + l + 32] & 0xF; + const uint8_t q3 = L[j + l + 64] & 0xF; + const uint8_t q4 = L[j + l + 96] & 0xF; + ql[l+ 0] = q1 | (q3 << 4); + ql[l+32] = q2 | (q4 << 4); + qh[l] = (L[j + l] >> 4) | ((L[j + l + 32] >> 4) << 2) | ((L[j + l + 64] >> 4) << 4) | ((L[j + l + 96] >> 4) << 6); + } + ql += 64; + qh += 32; + } +#else + for (int l = 0; l < 32; ++l) { + const uint8_t q1 = L[l + 0] & 0xF; + const uint8_t q2 = L[l + 32] & 0xF; + ql[l] = q1 | (q2 << 4); + } + for (int l = 0; l < 16; ++l) { + qh[l] = (L[l] >> 4) | ((L[l + 16] >> 4) << 2) | ((L[l + 32] >> 4) << 4) | ((L[l + 48] >> 4) << 6); + } +#endif + + x += QK_K; + + } +} + +static void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + for (int i = 0; i < nb; i++) { + + const float d = GGML_V3_FP16_TO_FP32(x[i].d); + + const uint8_t * restrict ql = x[i].ql; + const uint8_t * restrict qh = x[i].qh; + const int8_t * restrict sc = x[i].scales; + +#if QK_K == 256 + for (int n = 0; n < QK_K; n += 128) { + for (int l = 0; l < 32; ++l) { + int is = l/16; + const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + y[l + 0] = d * sc[is + 0] * q1; + y[l + 32] = d * sc[is + 2] * q2; + y[l + 64] = d * sc[is + 4] * q3; + 
y[l + 96] = d * sc[is + 6] * q4; + } + y += 128; + ql += 64; + qh += 32; + sc += 8; + } +#else + for (int l = 0; l < 16; ++l) { + const int8_t q1 = (int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + const int8_t q2 = (int8_t)((ql[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + const int8_t q3 = (int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + const int8_t q4 = (int8_t)((ql[l+16] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + y[l+ 0] = d * sc[0] * q1; + y[l+16] = d * sc[1] * q2; + y[l+32] = d * sc[2] * q3; + y[l+48] = d * sc[3] * q4; + } + y += 64; +#endif + + } +} + +static void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) { + assert(k % QK_K == 0); + block_q6_K * restrict y = vy; + quantize_row_q6_K_reference(x, y, k); +} + +size_t ggml_v3_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % QK_K == 0); + (void)hist; // TODO: collect histograms + + for (int j = 0; j < n; j += k) { + block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K; + quantize_row_q6_K_reference(src + j, y, k); + } + return (n/QK_K*sizeof(block_q6_K)); +} + +// ====================== "True" 2-bit (de)-quantization + +static const uint64_t iq2xxs_grid[256] = { + 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, + 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808, + 0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819, + 0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819, + 0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b, + 0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808, + 0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08, + 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b, + 0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819, + 0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08, + 0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, + 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08, + 0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808, + 0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808, + 0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919, + 0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819, + 0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08, + 0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908, + 0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819, + 0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808, + 0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808, + 0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908, + 0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808, + 0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08, + 0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819, + 0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819, + 0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819, + 0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908, + 0x082b0808192b0819, 
0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19, + 0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819, + 0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b, + 0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808, + 0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908, + 0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08, + 0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08, + 0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908, + 0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819, + 0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808, + 0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808, + 0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19, + 0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819, + 0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, + 0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b, + 0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08, + 0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808, + 0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908, + 0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b, + 0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819, + 0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08, + 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08, + 0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808, + 0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b, + 0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b, + 0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908, + 0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819, + 0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808, + 0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908, + 0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b, + 0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808, + 0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b, + 0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b, + 0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808, + 0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19, + 0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908, +}; + +static const uint64_t iq2xs_grid[512] = { + 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, + 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b, + 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919, + 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b, + 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919, + 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808, + 0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819, + 
0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819, + 0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, + 0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b, + 0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b, + 0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908, + 0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908, + 0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919, + 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808, + 0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919, + 0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908, + 0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, + 0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, + 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08, + 0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808, + 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808, + 0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819, + 0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908, + 0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819, + 0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808, + 0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b, + 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819, + 0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819, + 0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808, + 0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908, + 0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19, + 0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b, + 0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b, + 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919, + 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808, + 0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819, + 0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819, + 0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b, + 0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908, + 0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808, + 0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819, + 0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808, + 0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, + 0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808, + 0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808, + 0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908, + 0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908, + 0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808, + 0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b, + 0x08192b1908080808, 
0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819, + 0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, + 0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908, + 0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808, + 0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908, + 0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919, + 0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08, + 0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19, + 0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b, + 0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b, + 0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808, + 0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08, + 0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b, + 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908, + 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b, + 0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908, + 0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, + 0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808, + 0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808, + 0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08, + 0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819, + 0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919, + 0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808, + 0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808, + 0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819, + 0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819, + 0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908, + 0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908, + 0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b, + 0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908, + 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908, + 0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908, + 0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808, + 0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, + 0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819, + 0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819, + 0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808, + 0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b, + 0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819, + 0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819, + 0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08, + 0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808, + 0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19, + 0x19192b1908192b2b, 0x19192b1919080808, 
0x19192b191908082b, 0x19192b2b2b081919, + 0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, + 0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19, + 0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b, + 0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808, + 0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b, + 0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b, + 0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, + 0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b, + 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808, + 0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819, + 0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808, + 0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808, + 0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08, + 0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b, + 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19, + 0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08, + 0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919, + 0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08, + 0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08, + 0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908, + 0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908, + 0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b, + 0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908, + 0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808, + 0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b, + 0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808, + 0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808, + 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19, + 0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08, + 0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808, + 0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b, + 0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808, + 0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b, + 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b, +}; + +static const uint8_t ksigns_iq2xs[128] = { + 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15, + 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159, + 160, 33, 34, 163, 36, 165, 166, 39, 40, 169, 170, 43, 172, 45, 46, 175, + 48, 177, 178, 51, 180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63, + 192, 65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77, 78, 207, + 80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90, 219, 92, 221, 222, 95, + 96, 225, 226, 99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111, + 240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255, +}; + +static const uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128}; 
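
The three tables just above (iq2xxs_grid, ksigns_iq2xs, kmask_iq2xs) carry everything needed to reconstruct an IQ2_XXS sub-block: each group of 8 weights is selected by a byte-sized index into iq2xxs_grid, the low 28 bits of the sub-block's second packed word pick four 7-bit sign patterns from ksigns_iq2xs, and the top 4 bits hold the sub-block scale. As a reading aid only (not part of the patch), here is a minimal standalone sketch of that unpacking for one 32-weight sub-block; it assumes the three tables above are in scope, that <stdint.h> and <string.h> are included, and the helper name is illustrative:

// Reading aid (not part of the patch): decode one 32-weight IQ2_XXS sub-block.
// 'qs8' points at the sub-block's 8 packed bytes; 'd' is the per-block scale
// (GGML_V3_FP16_TO_FP32(x[i].d) in the code below); 'out' receives 32 floats.
void iq2xxs_decode_subblock_sketch(const uint8_t * qs8, float d, float * out) {
    uint32_t aux32[2];
    memcpy(aux32, qs8, 2*sizeof(uint32_t));                   // 4 grid indices + packed signs/scale word
    const uint8_t * idx = (const uint8_t *)&aux32[0];         // one grid index per group of 8 weights
    const float db = d * (0.5f + (aux32[1] >> 28)) * 0.25f;   // sub-block scale from the top 4 bits
    for (int l = 0; l < 4; ++l) {
        const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + idx[l]);  // 8 magnitudes per grid entry
        const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];     // 7-bit sign-pattern index
        for (int j = 0; j < 8; ++j) {
            out[8*l + j] = db * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
        }
    }
}

The dequantize_row_iq2_xxs routine that follows in the patch applies exactly this unpacking per sub-block, taking d from the block's fp16 scale.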
+ +static void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k) { + (void)x; + (void)y; + (void)k; + assert(k % QK_K == 0); + //fprintf(stderr, "=========================== %s: not implemented\n", __func__); +} + +static void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + uint32_t aux32[2]; + const uint8_t * aux8 = (const uint8_t *)aux32; + + for (int i = 0; i < nb; i++) { + + const float d = GGML_V3_FP16_TO_FP32(x[i].d); + + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + memcpy(aux32, x[i].qs + 4*ib32, 2*sizeof(uint32_t)); + const float db = d * (0.5f + (aux32[1] >> 28)) * 0.25f; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); + const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127]; + for (int j = 0; j < 8; ++j) { + y[j] = db * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); + } + y += 8; + } + } + } +} + +static void quantize_row_iq2_xxs(const float * restrict x, void * restrict vy, int k) { + assert(k % QK_K == 0); + block_iq2_xxs * restrict y = vy; + quantize_row_iq2_xxs_reference(x, y, k); +} + +size_t ggml_v3_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % QK_K == 0); + (void)hist; // TODO: collect histograms + + for (int j = 0; j < n; j += k) { + block_iq2_xxs * restrict y = (block_iq2_xxs *)dst + j/QK_K; + quantize_row_iq2_xxs_reference(src + j, y, k); + } + return (n/QK_K*sizeof(block_iq2_xxs)); +} + +// ====================== 2.3125 bpw (de)-quantization + +static void quantize_row_iq2_xs_reference(const float * restrict x, block_iq2_xs * restrict y, int k) { + (void)x; + (void)y; + (void)k; + assert(k % QK_K == 0); + //fprintf(stderr, "=========================== %s: not implemented\n", __func__); +} + +static void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + float db[2]; + + for (int i = 0; i < nb; i++) { + + const float d = GGML_V3_FP16_TO_FP32(x[i].d); + + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + db[0] = d * (0.5f + (x[i].scales[ib32] & 0xf)) * 0.25f; + db[1] = d * (0.5f + (x[i].scales[ib32] >> 4)) * 0.25f; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (x[i].qs[4*ib32 + l] & 511)); + const uint8_t signs = ksigns_iq2xs[x[i].qs[4*ib32 + l] >> 9]; + for (int j = 0; j < 8; ++j) { + y[j] = db[l/2] * grid[j] * (signs & kmask_iq2xs[j] ? 
-1.f : 1.f); + } + y += 8; + } + } + } +} + +static void quantize_row_iq2_xs(const float * restrict x, void * restrict vy, int k) { + assert(k % QK_K == 0); + block_iq2_xs * restrict y = vy; + quantize_row_iq2_xs_reference(x, y, k); +} + +size_t ggml_v3_quantize_iq2_xs(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % QK_K == 0); + (void)hist; // TODO: collect histograms + + for (int j = 0; j < n; j += k) { + block_iq2_xs * restrict y = (block_iq2_xs *)dst + j/QK_K; + quantize_row_iq2_xs_reference(src + j, y, k); + } + return (n/QK_K*sizeof(block_iq2_xs)); +} + +//===================================== Q8_K ============================================== + +static void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + for (int i = 0; i < nb; i++) { + + float max = 0; + float amax = 0; + for (int j = 0; j < QK_K; ++j) { + float ax = fabsf(x[j]); + if (ax > amax) { + amax = ax; max = x[j]; + } + } + if (!amax) { + y[i].d = 0; + memset(y[i].qs, 0, QK_K); + x += QK_K; + continue; + } + //const float iscale = -128.f/max; + // We need this change for IQ2_XXS, else the AVX implementation becomes very awkward + const float iscale = -127.f/max; + for (int j = 0; j < QK_K; ++j) { + int v = nearest_int(iscale*x[j]); + y[i].qs[j] = MIN(127, v); + } + for (int j = 0; j < QK_K/16; ++j) { + int sum = 0; + for (int ii = 0; ii < 16; ++ii) { + sum += y[i].qs[j*16 + ii]; + } + y[i].bsums[j] = sum; + } + y[i].d = 1/iscale; + x += QK_K; + } +} + +static void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + for (int i = 0; i < nb; i++) { + for (int j = 0; j < QK_K; ++j) { + *y++ = x[i].d * x[i].qs[j]; + } + } +} + +static void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) { + quantize_row_q8_K_reference(x, y, k); +} + +//===================================== Dot ptoducts ================================= + +// +// Helper functions +// +#if __AVX__ || __AVX2__ || __AVX512F__ + +// shuffles to pick the required scales in dot products +static inline __m256i get_scale_shuffle_q3k(int i) { + static const uint8_t k_shuffle[128] = { + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, + 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, + 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, + }; + return _mm256_loadu_si256((const __m256i*)k_shuffle + i); +} +static inline __m256i get_scale_shuffle_k4(int i) { + static const uint8_t k_shuffle[256] = { + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, + 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, + 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, + 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, + 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, + 
14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15 + }; + return _mm256_loadu_si256((const __m256i*)k_shuffle + i); +} +static inline __m128i get_scale_shuffle(int i) { + static const uint8_t k_shuffle[128] = { + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, + 10,10,10,10,10,10,10,10, 11,11,11,11,11,11,11,11, + 12,12,12,12,12,12,12,12, 13,13,13,13,13,13,13,13, + 14,14,14,14,14,14,14,14, 15,15,15,15,15,15,15,15 + }; + return _mm_loadu_si128((const __m128i*)k_shuffle + i); +} +#endif + +static void ggml_v3_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + + const block_q4_0 * restrict x = vx; + const block_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + assert(nb % 2 == 0); // TODO: handle odd nb + + for (int i = 0; i < nb; i += 2) { + const block_q4_0 * restrict x0 = &x[i + 0]; + const block_q4_0 * restrict x1 = &x[i + 1]; + const block_q8_0 * restrict y0 = &y[i + 0]; + const block_q8_0 * restrict y1 = &y[i + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + const int8x16_t s8b = vdupq_n_s8(0x8); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // sub 8 + const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); + const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); + const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); + const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + + // dot product into int32x4_t + const int32x4_t p_0 = ggml_v3_vdotq_s32(ggml_v3_vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h); + const int32x4_t p_1 = ggml_v3_vdotq_s32(ggml_v3_vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_V3_FP16_TO_FP32(x0->d)*GGML_V3_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_V3_FP16_TO_FP32(x1->d)*GGML_V3_FP16_TO_FP32(y1->d)); + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#elif defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; ++i) { + /* Compute combined scale for the block */ + const __m256 d = _mm256_set1_ps( GGML_V3_FP16_TO_FP32(x[i].d) * GGML_V3_FP16_TO_FP32(y[i].d) ); + + __m256i bx = bytes_from_nibbles_32(x[i].qs); + + // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. 
+ const __m256i off = _mm256_set1_epi8( 8 ); + bx = _mm256_sub_epi8( bx, off ); + + __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx, by); + + /* Multiply q with scale and accumulate */ + acc = _mm256_fmadd_ps( d, q, acc ); + } + + *s = hsum_float_8(acc); +#elif defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; ++i) { + // Compute combined scale for the block + const __m256 d = _mm256_set1_ps( GGML_V3_FP16_TO_FP32(x[i].d) * GGML_V3_FP16_TO_FP32(y[i].d) ); + + const __m128i lowMask = _mm_set1_epi8(0xF); + const __m128i off = _mm_set1_epi8(8); + + const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs); + + __m128i bx = _mm_and_si128(lowMask, tmp); + __m128i by = _mm_loadu_si128((const __m128i *)y[i].qs); + bx = _mm_sub_epi8(bx, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx, by); + + bx = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4)); + by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); + bx = _mm_sub_epi8(bx, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx, by); + + // Convert int32_t to float + __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1)); + + // Apply the scale, and accumulate + acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc); + } + + *s = hsum_float_8(acc); +#elif defined(__SSSE3__) + // set constants + const __m128i lowMask = _mm_set1_epi8(0xF); + const __m128i off = _mm_set1_epi8(8); + + // Initialize accumulator with zeros + __m128 acc_0 = _mm_setzero_ps(); + __m128 acc_1 = _mm_setzero_ps(); + __m128 acc_2 = _mm_setzero_ps(); + __m128 acc_3 = _mm_setzero_ps(); + + // First round without accumulation + { + _mm_prefetch(&x[0] + sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 0 and 1 + const __m128 d_0_1 = _mm_set1_ps( GGML_V3_FP16_TO_FP32(x[0].d) * GGML_V3_FP16_TO_FP32(y[0].d) ); + + const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs); + + __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1); + __m128i by_0 = _mm_loadu_si128((const __m128i *)y[0].qs); + bx_0 = _mm_sub_epi8(bx_0, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); + + __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4)); + __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[0].qs + 16)); + bx_1 = _mm_sub_epi8(bx_1, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); + + _mm_prefetch(&x[1] + sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 2 and 3 + const __m128 d_2_3 = _mm_set1_ps( GGML_V3_FP16_TO_FP32(x[1].d) * GGML_V3_FP16_TO_FP32(y[1].d) ); + + const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs); + + __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3); + __m128i by_2 = _mm_loadu_si128((const __m128i *)y[1].qs); + bx_2 = _mm_sub_epi8(bx_2, off); + const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); + + __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4)); + __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[1].qs + 16)); + bx_3 = _mm_sub_epi8(bx_3, off); + const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); + + // Convert int32_t to float + __m128 p0 = _mm_cvtepi32_ps(i32_0); + __m128 p1 = _mm_cvtepi32_ps(i32_1); + __m128 p2 = _mm_cvtepi32_ps(i32_2); + __m128 p3 = _mm_cvtepi32_ps(i32_3); + + // Apply the scale + acc_0 = _mm_mul_ps( d_0_1, p0 ); + acc_1 = _mm_mul_ps( d_0_1, p1 ); + acc_2 = _mm_mul_ps( d_2_3, p2 ); + acc_3 
= _mm_mul_ps( d_2_3, p3 ); + } + + assert(nb % 2 == 0); // TODO: handle odd nb + + // Main loop + for (int i = 2; i < nb; i+=2) { + _mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 0 and 1 + const __m128 d_0_1 = _mm_set1_ps( GGML_V3_FP16_TO_FP32(x[i].d) * GGML_V3_FP16_TO_FP32(y[i].d) ); + + const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs); + + __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1); + __m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs); + bx_0 = _mm_sub_epi8(bx_0, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); + + __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4)); + __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); + bx_1 = _mm_sub_epi8(bx_1, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); + + _mm_prefetch(&x[i] + 2 * sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 2 and 3 + const __m128 d_2_3 = _mm_set1_ps( GGML_V3_FP16_TO_FP32(x[i + 1].d) * GGML_V3_FP16_TO_FP32(y[i + 1].d) ); + + const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs); + + __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3); + __m128i by_2 = _mm_loadu_si128((const __m128i *)y[i + 1].qs); + bx_2 = _mm_sub_epi8(bx_2, off); + const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); + + __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4)); + __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[i + 1].qs + 16)); + bx_3 = _mm_sub_epi8(bx_3, off); + const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); + + // Convert int32_t to float + __m128 p0 = _mm_cvtepi32_ps(i32_0); + __m128 p1 = _mm_cvtepi32_ps(i32_1); + __m128 p2 = _mm_cvtepi32_ps(i32_2); + __m128 p3 = _mm_cvtepi32_ps(i32_3); + + // Apply the scale + __m128 p0_d = _mm_mul_ps( d_0_1, p0 ); + __m128 p1_d = _mm_mul_ps( d_0_1, p1 ); + __m128 p2_d = _mm_mul_ps( d_2_3, p2 ); + __m128 p3_d = _mm_mul_ps( d_2_3, p3 ); + + // Acummulate + acc_0 = _mm_add_ps(p0_d, acc_0); + acc_1 = _mm_add_ps(p1_d, acc_1); + acc_2 = _mm_add_ps(p2_d, acc_2); + acc_3 = _mm_add_ps(p3_d, acc_3); + } + + *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + // load elements + vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); + + vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); + vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); + + // mask and store lower part of x, and then upper part + vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); + vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); + + vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); + vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); + + // subtract offset + vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 8, vl); + vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 8, vl); + + vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); + vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += sumi*GGML_V3_FP16_TO_FP32(x[i].d)*GGML_V3_FP16_TO_FP32(y[i].d); + } + + *s = sumf; +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; 
i++) { + int sumi = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[i].qs[j] & 0x0F) - 8; + const int v1 = (x[i].qs[j] >> 4) - 8; + + sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); + } + + sumf += sumi*GGML_V3_FP16_TO_FP32(x[i].d)*GGML_V3_FP16_TO_FP32(y[i].d); + } + + *s = sumf; +#endif +} + +static void ggml_v3_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int qk = QK8_1; + const int nb = n / qk; + + assert(n % qk == 0); + + const block_q4_1 * restrict x = vx; + const block_q8_1 * restrict y = vy; + + // TODO: add WASM SIMD +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + float summs = 0; + + assert(nb % 2 == 0); // TODO: handle odd nb + + for (int i = 0; i < nb; i += 2) { + const block_q4_1 * restrict x0 = &x[i + 0]; + const block_q4_1 * restrict x1 = &x[i + 1]; + const block_q8_1 * restrict y0 = &y[i + 0]; + const block_q8_1 * restrict y1 = &y[i + 1]; + + summs += GGML_V3_FP16_TO_FP32(x0->m) * y0->s + GGML_V3_FP16_TO_FP32(x1->m) * y1->s; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + + // dot product into int32x4_t + const int32x4_t p_0 = ggml_v3_vdotq_s32(ggml_v3_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); + const int32x4_t p_1 = ggml_v3_vdotq_s32(ggml_v3_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_V3_FP16_TO_FP32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_V3_FP16_TO_FP32(x1->d)*y1->d); + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs; +#elif defined(__AVX2__) || defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + float summs = 0; + + // Main loop + for (int i = 0; i < nb; ++i) { + const float d0 = GGML_V3_FP16_TO_FP32(x[i].d); + const float d1 = y[i].d; + + summs += GGML_V3_FP16_TO_FP32(x[i].m) * y[i].s; + + const __m256 d0v = _mm256_set1_ps( d0 ); + const __m256 d1v = _mm256_set1_ps( d1 ); + + // Compute combined scales + const __m256 d0d1 = _mm256_mul_ps( d0v, d1v ); + + // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes + const __m256i bx = bytes_from_nibbles_32(x[i].qs); + const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs ); + + const __m256 xy = mul_sum_us8_pairs_float(bx, by); + + // Accumulate d0*d1*x*y +#if defined(__AVX2__) + acc = _mm256_fmadd_ps( d0d1, xy, acc ); +#else + acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc ); +#endif + } + + *s = hsum_float_8(acc) + summs; +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + // load elements + vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); + + vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); + vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); + + // mask and store lower part of x, and then upper part + vuint8mf2_t x_a = 
__riscv_vand_vx_u8mf2(tx, 0x0F, vl); + vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); + + vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); + vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); + + vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); + vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (GGML_V3_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_V3_FP16_TO_FP32(x[i].m)*y[i].s; + } + + *s = sumf; +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + int sumi = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[i].qs[j] & 0x0F); + const int v1 = (x[i].qs[j] >> 4); + + sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); + } + + sumf += (GGML_V3_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_V3_FP16_TO_FP32(x[i].m)*y[i].s; + } + + *s = sumf; +#endif +} + +static void ggml_v3_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(qk == QK5_0); + + const block_q5_0 * restrict x = vx; + const block_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + uint32_t qh0; + uint32_t qh1; + + uint64_t tmp0[4]; + uint64_t tmp1[4]; + + assert(nb % 2 == 0); // TODO: handle odd nb + + for (int i = 0; i < nb; i += 2) { + const block_q5_0 * restrict x0 = &x[i]; + const block_q5_0 * restrict x1 = &x[i + 1]; + const block_q8_0 * restrict y0 = &y[i]; + const block_q8_0 * restrict y1 = &y[i + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + // extract the 5th bit via lookup table ((!b) << 4) + memcpy(&qh0, x0->qh, sizeof(qh0)); + memcpy(&qh1, x1->qh, sizeof(qh1)); + + tmp0[0] = table_b2b_1[(qh0 >> 0) & 0xFF]; + tmp0[1] = table_b2b_1[(qh0 >> 8) & 0xFF]; + tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF]; + tmp0[3] = table_b2b_1[(qh0 >> 24) ]; + + tmp1[0] = table_b2b_1[(qh1 >> 0) & 0xFF]; + tmp1[1] = table_b2b_1[(qh1 >> 8) & 0xFF]; + tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF]; + tmp1[3] = table_b2b_1[(qh1 >> 24) ]; + + const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); + const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); + const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); + const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) + const int8x16_t v0_0lf = vsubq_s8(v0_0l, qhl0); + const int8x16_t v0_0hf = vsubq_s8(v0_0h, qhh0); + const int8x16_t v0_1lf = vsubq_s8(v0_1l, qhl1); + const int8x16_t v0_1hf = vsubq_s8(v0_1h, qhh1); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + ggml_v3_vdotq_s32(vdupq_n_s32(0), v0_0lf, 
v1_0l), + ggml_v3_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_V3_FP16_TO_FP32(x0->d)*GGML_V3_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + ggml_v3_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), + ggml_v3_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_V3_FP16_TO_FP32(x1->d)*GGML_V3_FP16_TO_FP32(y1->d)); + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#elif defined(__wasm_simd128__) + v128_t sumv = wasm_f32x4_splat(0.0f); + + uint32_t qh; + uint64_t tmp[4]; + + // TODO: check if unrolling this is better + for (int i = 0; i < nb; ++i) { + const block_q5_0 * restrict x0 = &x[i]; + const block_q8_0 * restrict y0 = &y[i]; + + const v128_t m4b = wasm_i8x16_splat(0x0F); + + // extract the 5th bit + memcpy(&qh, x0->qh, sizeof(qh)); + + tmp[0] = table_b2b_1[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_1[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_1[(qh >> 16) & 0xFF]; + tmp[3] = table_b2b_1[(qh >> 24) ]; + + const v128_t qhl = wasm_v128_load(tmp + 0); + const v128_t qhh = wasm_v128_load(tmp + 2); + + const v128_t v0 = wasm_v128_load(x0->qs); + + // 4-bit -> 8-bit + const v128_t v0l = wasm_v128_and (v0, m4b); + const v128_t v0h = wasm_u8x16_shr(v0, 4); + + // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) + const v128_t v0lf = wasm_i8x16_sub(v0l, qhl); + const v128_t v0hf = wasm_i8x16_sub(v0h, qhh); + + // load y + const v128_t v1l = wasm_v128_load(y0->qs); + const v128_t v1h = wasm_v128_load(y0->qs + 16); + + // int8x16 -> int16x8 + const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); + const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); + const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); + const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); + + const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); + const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); + const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); + const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); + + // dot product + sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4( + wasm_i32x4_add( + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), + wasm_i32x4_dot_i16x8(v0lfh, v1lh)), + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), + wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), + wasm_f32x4_splat(GGML_V3_FP16_TO_FP32(x0->d) * GGML_V3_FP16_TO_FP32(y0->d)))); + } + + *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3); +#elif defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; i++) { + /* Compute combined scale for the block */ + const __m256 d = _mm256_set1_ps(GGML_V3_FP16_TO_FP32(x[i].d) * GGML_V3_FP16_TO_FP32(y[i].d)); + + __m256i bx = bytes_from_nibbles_32(x[i].qs); + __m256i bxhi = bytes_from_bits_32(x[i].qh); + bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0)); + bx = _mm256_or_si256(bx, bxhi); + + __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx, by); + + /* Multiply q with scale and accumulate */ + acc = _mm256_fmadd_ps(d, q, acc); + } + + *s = hsum_float_8(acc); +#elif defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + __m128i mask = _mm_set1_epi8((char)0xF0); + + // Main loop + for (int i = 0; i < nb; i++) { + /* Compute combined scale for the block */ + const __m256 d = _mm256_set1_ps(GGML_V3_FP16_TO_FP32(x[i].d) * GGML_V3_FP16_TO_FP32(y[i].d)); + + __m256i bx = 
bytes_from_nibbles_32(x[i].qs); + const __m256i bxhi = bytes_from_bits_32(x[i].qh); + __m128i bxhil = _mm256_castsi256_si128(bxhi); + __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); + bxhil = _mm_andnot_si128(bxhil, mask); + bxhih = _mm_andnot_si128(bxhih, mask); + __m128i bxl = _mm256_castsi256_si128(bx); + __m128i bxh = _mm256_extractf128_si256(bx, 1); + bxl = _mm_or_si128(bxl, bxhil); + bxh = _mm_or_si128(bxh, bxhih); + bx = MM256_SET_M128I(bxh, bxl); + + const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx, by); + + /* Multiply q with scale and accumulate */ + acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc); + } + + *s = hsum_float_8(acc); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + uint32_t qh; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + // These temporary registers are for masking and shift operations + vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl); + vuint32m2_t vt_2 = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), vt_1, vl); + + vuint32m2_t vt_3 = __riscv_vsll_vx_u32m2(vt_2, 16, vl); + vuint32m2_t vt_4 = __riscv_vadd_vx_u32m2(vt_1, 12, vl); + + for (int i = 0; i < nb; i++) { + memcpy(&qh, x[i].qh, sizeof(uint32_t)); + + // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(vt_2, qh, vl); + vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(xha_0, vt_1, vl); + vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl); + + // ((qh & (1u << (j + 16))) >> (j + 12)); + vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(vt_3, qh, vl); + vuint32m2_t xhl_1 = __riscv_vsrl_vv_u32m2(xha_1, vt_4, vl); + + // narrowing + vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xhl_0, vl); + vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl); + + vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xhl_1, vl); + vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl); + + // load + vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); + + vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); + vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); + + vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); + vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); + + vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl); + vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl); + + vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); + vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); + + vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 16, vl); + vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 16, vl); + + vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); + vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (GGML_V3_FP16_TO_FP32(x[i].d)*GGML_V3_FP16_TO_FP32(y[i].d)) * sumi; + } + + *s = sumf; +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); + + int sumi = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; + const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; + + sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); + } + + sumf += (GGML_V3_FP16_TO_FP32(x[i].d)*GGML_V3_FP16_TO_FP32(y[i].d)) 
* sumi; + } + + *s = sumf; +#endif +} + +static void ggml_v3_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int qk = QK8_1; + const int nb = n / qk; + + assert(n % qk == 0); + assert(qk == QK5_1); + + const block_q5_1 * restrict x = vx; + const block_q8_1 * restrict y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + float summs0 = 0.0f; + float summs1 = 0.0f; + + uint32_t qh0; + uint32_t qh1; + + uint64_t tmp0[4]; + uint64_t tmp1[4]; + + assert(nb % 2 == 0); // TODO: handle odd nb + + for (int i = 0; i < nb; i += 2) { + const block_q5_1 * restrict x0 = &x[i]; + const block_q5_1 * restrict x1 = &x[i + 1]; + const block_q8_1 * restrict y0 = &y[i]; + const block_q8_1 * restrict y1 = &y[i + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + summs0 += GGML_V3_FP16_TO_FP32(x0->m) * y0->s; + summs1 += GGML_V3_FP16_TO_FP32(x1->m) * y1->s; + + // extract the 5th bit via lookup table ((b) << 4) + memcpy(&qh0, x0->qh, sizeof(qh0)); + memcpy(&qh1, x1->qh, sizeof(qh1)); + + tmp0[0] = table_b2b_0[(qh0 >> 0) & 0xFF]; + tmp0[1] = table_b2b_0[(qh0 >> 8) & 0xFF]; + tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF]; + tmp0[3] = table_b2b_0[(qh0 >> 24) ]; + + tmp1[0] = table_b2b_0[(qh1 >> 0) & 0xFF]; + tmp1[1] = table_b2b_0[(qh1 >> 8) & 0xFF]; + tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF]; + tmp1[3] = table_b2b_0[(qh1 >> 24) ]; + + const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); + const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); + const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); + const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // add high bit + const int8x16_t v0_0lf = vorrq_s8(v0_0l, qhl0); + const int8x16_t v0_0hf = vorrq_s8(v0_0h, qhh0); + const int8x16_t v0_1lf = vorrq_s8(v0_1l, qhl1); + const int8x16_t v0_1hf = vorrq_s8(v0_1h, qhh1); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + ggml_v3_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), + ggml_v3_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_V3_FP16_TO_FP32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + ggml_v3_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), + ggml_v3_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_V3_FP16_TO_FP32(x1->d)*y1->d); + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1; +#elif defined(__wasm_simd128__) + v128_t sumv = wasm_f32x4_splat(0.0f); + + float summs = 0.0f; + + uint32_t qh; + uint64_t tmp[4]; + + // TODO: check if unrolling this is better + for (int i = 0; i < nb; ++i) { + const block_q5_1 * restrict x0 = &x[i]; + const block_q8_1 * restrict y0 = &y[i]; + + summs += GGML_V3_FP16_TO_FP32(x0->m) * y0->s; + + const v128_t m4b = wasm_i8x16_splat(0x0F); + + // extract the 5th bit + memcpy(&qh, x0->qh, sizeof(qh)); + + tmp[0] = table_b2b_0[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_0[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_0[(qh >> 
16) & 0xFF]; + tmp[3] = table_b2b_0[(qh >> 24) ]; + + const v128_t qhl = wasm_v128_load(tmp + 0); + const v128_t qhh = wasm_v128_load(tmp + 2); + + const v128_t v0 = wasm_v128_load(x0->qs); + + // 4-bit -> 8-bit + const v128_t v0l = wasm_v128_and (v0, m4b); + const v128_t v0h = wasm_u8x16_shr(v0, 4); + + // add high bit + const v128_t v0lf = wasm_v128_or(v0l, qhl); + const v128_t v0hf = wasm_v128_or(v0h, qhh); + + // load y + const v128_t v1l = wasm_v128_load(y0->qs); + const v128_t v1h = wasm_v128_load(y0->qs + 16); + + // int8x16 -> int16x8 + const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); + const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); + const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); + const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); + + const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); + const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); + const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); + const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); + + // dot product + sumv = wasm_f32x4_add(sumv, + wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add( + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), + wasm_i32x4_dot_i16x8(v0lfh, v1lh)), + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), + wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), + wasm_f32x4_splat(GGML_V3_FP16_TO_FP32(x0->d) * y0->d))); + } + + *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs; +#elif defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + float summs = 0.0f; + + // Main loop + for (int i = 0; i < nb; i++) { + const __m256 dx = _mm256_set1_ps(GGML_V3_FP16_TO_FP32(x[i].d)); + + summs += GGML_V3_FP16_TO_FP32(x[i].m) * y[i].s; + + __m256i bx = bytes_from_nibbles_32(x[i].qs); + __m256i bxhi = bytes_from_bits_32(x[i].qh); + bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10)); + bx = _mm256_or_si256(bx, bxhi); + + const __m256 dy = _mm256_set1_ps(y[i].d); + const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_us8_pairs_float(bx, by); + + acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc); + } + + *s = hsum_float_8(acc) + summs; +#elif defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + __m128i mask = _mm_set1_epi8(0x10); + + float summs = 0.0f; + + // Main loop + for (int i = 0; i < nb; i++) { + const __m256 dx = _mm256_set1_ps(GGML_V3_FP16_TO_FP32(x[i].d)); + + summs += GGML_V3_FP16_TO_FP32(x[i].m) * y[i].s; + + __m256i bx = bytes_from_nibbles_32(x[i].qs); + const __m256i bxhi = bytes_from_bits_32(x[i].qh); + __m128i bxhil = _mm256_castsi256_si128(bxhi); + __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); + bxhil = _mm_and_si128(bxhil, mask); + bxhih = _mm_and_si128(bxhih, mask); + __m128i bxl = _mm256_castsi256_si128(bx); + __m128i bxh = _mm256_extractf128_si256(bx, 1); + bxl = _mm_or_si128(bxl, bxhil); + bxh = _mm_or_si128(bxh, bxhih); + bx = MM256_SET_M128I(bxh, bxl); + + const __m256 dy = _mm256_set1_ps(y[i].d); + const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_us8_pairs_float(bx, by); + + acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc); + } + + *s = hsum_float_8(acc) + summs; +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + uint32_t qh; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + // temporary registers for shift operations + vuint32m2_t vt_1 = 
__riscv_vid_v_u32m2(vl); + vuint32m2_t vt_2 = __riscv_vadd_vx_u32m2(vt_1, 12, vl); + + for (int i = 0; i < nb; i++) { + memcpy(&qh, x[i].qh, sizeof(uint32_t)); + + // load qh + vuint32m2_t vqh = __riscv_vmv_v_x_u32m2(qh, vl); + + // ((qh >> (j + 0)) << 4) & 0x10; + vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(vqh, vt_1, vl); + vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl); + vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(xhl_0, 0x10, vl); + + // ((qh >> (j + 12)) ) & 0x10; + vuint32m2_t xhr_1 = __riscv_vsrl_vv_u32m2(vqh, vt_2, vl); + vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(xhr_1, 0x10, vl); + + // narrowing + vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xha_0, vl); + vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl); + + vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xha_1, vl); + vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl); + + // load + vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); + + vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); + vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); + + vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); + vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); + + vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl); + vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl); + + vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); + vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); + + vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); + vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (GGML_V3_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_V3_FP16_TO_FP32(x[i].m)*y[i].s; + } + + *s = sumf; +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); + + int sumi = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int32_t x0 = (x[i].qs[j] & 0xF) | xh_0; + const int32_t x1 = (x[i].qs[j] >> 4) | xh_1; + + sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); + } + + sumf += (GGML_V3_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_V3_FP16_TO_FP32(x[i].m)*y[i].s; + } + + *s = sumf; +#endif +} + +static void ggml_v3_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + + const block_q8_0 * restrict x = vx; + const block_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + assert(nb % 2 == 0); // TODO: handle odd nb + + for (int i = 0; i < nb; i += 2) { + const block_q8_0 * restrict x0 = &x[i + 0]; + const block_q8_0 * restrict x1 = &x[i + 1]; + const block_q8_0 * restrict y0 = &y[i + 0]; + const block_q8_0 * restrict y1 = &y[i + 1]; + + const int8x16_t x0_0 = vld1q_s8(x0->qs); + const int8x16_t x0_1 = vld1q_s8(x0->qs + 16); + const int8x16_t x1_0 = vld1q_s8(x1->qs); + const int8x16_t x1_1 = vld1q_s8(x1->qs + 16); + + // load y + const int8x16_t y0_0 = vld1q_s8(y0->qs); + const int8x16_t y0_1 = vld1q_s8(y0->qs + 16); + const int8x16_t y1_0 = vld1q_s8(y1->qs); + const int8x16_t y1_1 = vld1q_s8(y1->qs + 16); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + 
ggml_v3_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), + ggml_v3_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_V3_FP16_TO_FP32(x0->d)*GGML_V3_FP16_TO_FP32(y0->d)); + + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + ggml_v3_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), + ggml_v3_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_V3_FP16_TO_FP32(x1->d)*GGML_V3_FP16_TO_FP32(y1->d)); + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#elif defined(__AVX2__) || defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; ++i) { + // Compute combined scale for the block + const __m256 d = _mm256_set1_ps(GGML_V3_FP16_TO_FP32(x[i].d) * GGML_V3_FP16_TO_FP32(y[i].d)); + __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs); + __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx, by); + + // Multiply q with scale and accumulate +#if defined(__AVX2__) + acc = _mm256_fmadd_ps( d, q, acc ); +#else + acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc ); +#endif + } + + *s = hsum_float_8(acc); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + size_t vl = __riscv_vsetvl_e8m1(qk); + + for (int i = 0; i < nb; i++) { + // load elements + vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl); + vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl); + + vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl); + + vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl); + vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum); + + sumf += sumi*(GGML_V3_FP16_TO_FP32(x[i].d)*GGML_V3_FP16_TO_FP32(y[i].d)); + } + + *s = sumf; +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + int sumi = 0; + + for (int j = 0; j < qk; j++) { + sumi += x[i].qs[j]*y[i].qs[j]; + } + + sumf += sumi*(GGML_V3_FP16_TO_FP32(x[i].d)*GGML_V3_FP16_TO_FP32(y[i].d)); + } + + *s = sumf; +#endif +} + +#if QK_K == 256 +static void ggml_v3_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + + const block_q2_K * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + +#ifdef __ARM_NEON + const uint8x16_t m3 = vdupq_n_u8(0x3); + const uint8x16_t m4 = vdupq_n_u8(0xF); + + const int32x4_t vzero = vdupq_n_s32(0); + + ggml_v3_int8x16x2_t q2bytes; + uint8_t aux[16]; + + float sum = 0; + + for (int i = 0; i < nb; ++i) { + const float d = y[i].d * GGML_V3_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_V3_FP16_TO_FP32(x[i].dmin); + + const uint8_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + const uint8_t * restrict sc = x[i].scales; + + const uint8x16_t mins_and_scales = vld1q_u8(sc); + const uint8x16_t scales = vandq_u8(mins_and_scales, m4); + vst1q_u8(aux, scales); + + const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4); + const ggml_v3_int16x8x2_t q8sums = ggml_v3_vld1q_s16_x2(y[i].bsums); + const ggml_v3_int16x8x2_t mins16 = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))}}; + const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])), + vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0]))); + const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])), + vmull_s16(vget_high_s16(mins16.val[1]), vget_high_s16(q8sums.val[1]))); + sum += dmin * vaddvq_s32(vaddq_s32(s0, s1)); + + int isum = 0; 
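+        // A rough summary of the integer accumulation below: the 256-quant super-block is
+        // handled in QK_K/128 chunks; each 32-byte load of q2 packs four 2-bit planes that
+        // are unpacked with shifts of 0/2/4/6 and the m3 mask, and every plane is dotted
+        // against 32 fresh int8 values from q8, weighted by the 4-bit sub-block scales in
+        // aux[]. isum gathers the result, which is finally scaled by d outside the j loop.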
+ int is = 0; + +// We use this macro instead of a function call because for some reason +// the code runs 2-3% slower, even if the function is declared inline +#define MULTIPLY_ACCUM_WITH_SCALE(index)\ + isum += vaddvq_s32(ggml_v3_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\ + isum += vaddvq_s32(ggml_v3_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)]; + +#define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\ + q8bytes = ggml_v3_vld1q_s8_x2(q8); q8 += 32;\ + q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[0], (shift)), m3));\ + q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[1], (shift)), m3));\ + MULTIPLY_ACCUM_WITH_SCALE((index)); + + for (int j = 0; j < QK_K/128; ++j) { + const ggml_v3_uint8x16x2_t q2bits = ggml_v3_vld1q_u8_x2(q2); q2 += 32; + + ggml_v3_int8x16x2_t q8bytes = ggml_v3_vld1q_s8_x2(q8); q8 += 32; + q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[0], m3)); + q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[1], m3)); + + MULTIPLY_ACCUM_WITH_SCALE(0); + + SHIFT_MULTIPLY_ACCUM_WITH_SCALE(2, 2); + SHIFT_MULTIPLY_ACCUM_WITH_SCALE(4, 4); + SHIFT_MULTIPLY_ACCUM_WITH_SCALE(6, 6); + + is += 8; + } + + sum += d * isum; + } + + *s = sum; + +#elif defined __AVX2__ + + const __m256i m3 = _mm256_set1_epi8(3); + const __m128i m4 = _mm_set1_epi8(0xF); + + __m256 acc = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_V3_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_V3_FP16_TO_FP32(x[i].dmin); + + const uint8_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); + const __m128i scales8 = _mm_and_si128(mins_and_scales, m4); + const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4); + const __m256i mins = _mm256_cvtepi8_epi16(mins8); + const __m256i prod = _mm256_madd_epi16(mins, _mm256_loadu_si256((const __m256i*)y[i].bsums)); + + acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(prod), acc); + + const __m256i all_scales = _mm256_cvtepi8_epi16(scales8); + const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0); + const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1); + const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)}; + + __m256i sumi = _mm256_setzero_si256(); + + for (int j = 0; j < QK_K/128; ++j) { + + const __m256i q2bits = _mm256_loadu_si256((const __m256i*)q2); q2 += 32; + + const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + + const __m256i q2_0 = _mm256_and_si256(q2bits, m3); + const __m256i q2_1 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 2), m3); + const __m256i q2_2 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 4), m3); + const __m256i q2_3 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 6), m3); + + __m256i p0 = _mm256_maddubs_epi16(q2_0, q8_0); + __m256i p1 = _mm256_maddubs_epi16(q2_1, q8_1); + __m256i p2 = _mm256_maddubs_epi16(q2_2, q8_2); + __m256i p3 = _mm256_maddubs_epi16(q2_3, q8_3); + + p0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(0)), p0); + p1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(1)), p1); + p2 = 
_mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(2)), p2); + p3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(3)), p3); + + p0 = _mm256_add_epi32(p0, p1); + p2 = _mm256_add_epi32(p2, p3); + + sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p0, p2)); + } + + acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc); + + } + + *s = hsum_float_8(acc); + +#elif defined __AVX__ + + const __m128i m3 = _mm_set1_epi8(0x3); + const __m128i m4 = _mm_set1_epi8(0xF); + const __m128i m2 = _mm_set1_epi8(0x2); + + __m256 acc = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + + const float dall = y[i].d * GGML_V3_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_V3_FP16_TO_FP32(x[i].dmin); + + const uint8_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + // load mins and scales from block_q2_K.scales[QK_K/16] + const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); + const __m128i scales16 = _mm_and_si128(mins_and_scales, m4); + const __m128i mins16 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4); + const __m128i mins_0 = _mm_cvtepi8_epi16(mins16); + const __m128i mins_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(mins16, mins16)); + + // summs = y[i].bsums * (x[i].scales >> 4) in 16bits*8*2 to 32bits*4*2 + const __m128i summs_0 = _mm_madd_epi16(mins_0, _mm_loadu_si128((const __m128i*)&y[i].bsums[0])); + const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8])); + + // sumf += -dmin * summs in 32bits*8 + acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc); + + const __m128i scales_0 = _mm_cvtepi8_epi16(scales16); + const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16)); + const __m128i scales[2] = { scales_0, scales_1 }; + + __m128i sumi_0 = _mm_setzero_si128(); + __m128i sumi_1 = _mm_setzero_si128(); + + for (int j = 0; j < QK_K/128; ++j) { + + // load Q8 quants int8*16*8 from block_q8_K.qs[QK_K] + const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + + // load 2bits*16*8 from block_q2_K.qs[QK_K/4] + __m128i q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16; + const __m128i q2_0 = _mm_and_si128(q2bits, m3); + const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3); + const __m128i q2_4 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3); + const __m128i q2_6 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3); + q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16; + const __m128i q2_1 = _mm_and_si128(q2bits, m3); + const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3); + const __m128i q2_5 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3); + const __m128i q2_7 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3); + + // isuml = q8[l] * ((q2[l] >> shift) & 3) in 8bits*16*8 to 16bits*8*8 + __m128i p0 = _mm_maddubs_epi16(q2_0, q8_0); + __m128i p1 = _mm_maddubs_epi16(q2_1, q8_1); + __m128i p2 = _mm_maddubs_epi16(q2_2, q8_2); + __m128i p3 
= _mm_maddubs_epi16(q2_3, q8_3); + __m128i p4 = _mm_maddubs_epi16(q2_4, q8_4); + __m128i p5 = _mm_maddubs_epi16(q2_5, q8_5); + __m128i p6 = _mm_maddubs_epi16(q2_6, q8_6); + __m128i p7 = _mm_maddubs_epi16(q2_7, q8_7); + + // isum += (x[i].scales[is++] & 0xF) * isuml in 16bits*8*8 to 32bits*4*8 + __m128i shuffle = _mm_set1_epi16(0x0100); + p0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p0); + shuffle = _mm_add_epi16(shuffle, m2); + p1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p1); + shuffle = _mm_add_epi16(shuffle, m2); + p2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p2); + shuffle = _mm_add_epi16(shuffle, m2); + p3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p3); + shuffle = _mm_add_epi16(shuffle, m2); + p4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p4); + shuffle = _mm_add_epi16(shuffle, m2); + p5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p5); + shuffle = _mm_add_epi16(shuffle, m2); + p6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p6); + shuffle = _mm_add_epi16(shuffle, m2); + p7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p7); + + p0 = _mm_add_epi32(p0, p1); + p2 = _mm_add_epi32(p2, p3); + p4 = _mm_add_epi32(p4, p5); + p6 = _mm_add_epi32(p6, p7); + + // isum in 32bits*4*2 + sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p0, p2)); + sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p4, p6)); + } + + // sumf += dall * isum - dmin * summs in 32bits + __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); + acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc); + } + + *s = hsum_float_8(acc); + +#elif defined __riscv_v_intrinsic + + float sumf = 0; + uint8_t temp_01[32] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + + for (int i = 0; i < nb; ++i) { + + const uint8_t * q2 = x[i].qs; + const int8_t * q8 = y[i].qs; + const uint8_t * sc = x[i].scales; + + const float dall = y[i].d * GGML_V3_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_V3_FP16_TO_FP32(x[i].dmin); + + size_t vl = 16; + + vuint8m1_t scales = __riscv_vle8_v_u8m1(sc, vl); + vuint8m1_t aux = __riscv_vand_vx_u8m1(scales, 0x0F, vl); + + vint16m1_t q8sums = __riscv_vle16_v_i16m1(y[i].bsums, vl); + + vuint8mf2_t scales_2 = __riscv_vle8_v_u8mf2(sc, vl); + vuint8mf2_t mins8 = __riscv_vsrl_vx_u8mf2(scales_2, 0x4, vl); + vint16m1_t mins = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(mins8, vl)); + vint32m2_t prod = __riscv_vwmul_vv_i32m2(q8sums, mins, vl); + vint32m1_t vsums = __riscv_vredsum_vs_i32m2_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl); + + sumf += dmin * __riscv_vmv_x_s_i32m1_i32(vsums); + + vl = 32; + + vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); + vuint8m1_t v_b = __riscv_vle8_v_u8m1(temp_01, vl); + + uint8_t is=0; + int isum=0; + + for (int j = 0; j < QK_K/128; ++j) { + // load Q2 + vuint8m1_t q2_x = __riscv_vle8_v_u8m1(q2, vl); + + vuint8m1_t q2_0 = __riscv_vand_vx_u8m1(q2_x, 0x03, vl); + vuint8m1_t q2_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x2, vl), 0x03 , vl); + vuint8m1_t q2_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x4, vl), 0x03 , vl); + vuint8m1_t q2_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x6, vl), 0x03 , vl); + + // duplicate scale elements for product + vuint8m1_t sc0 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 0+is, vl), vl); + vuint8m1_t sc1 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 2+is, vl), vl); + vuint8m1_t sc2 = __riscv_vrgather_vv_u8m1(aux, 
__riscv_vadd_vx_u8m1(v_b, 4+is, vl), vl); + vuint8m1_t sc3 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 6+is, vl), vl); + + vint16m2_t p0 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_0, sc0, vl)); + vint16m2_t p1 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_1, sc1, vl)); + vint16m2_t p2 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_2, sc2, vl)); + vint16m2_t p3 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_3, sc3, vl)); + + // load Q8 + vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl); + vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl); + vint8m1_t q8_2 = __riscv_vle8_v_i8m1(q8+64, vl); + vint8m1_t q8_3 = __riscv_vle8_v_i8m1(q8+96, vl); + + vint32m4_t s0 = __riscv_vwmul_vv_i32m4(p0, __riscv_vwcvt_x_x_v_i16m2(q8_0, vl), vl); + vint32m4_t s1 = __riscv_vwmul_vv_i32m4(p1, __riscv_vwcvt_x_x_v_i16m2(q8_1, vl), vl); + vint32m4_t s2 = __riscv_vwmul_vv_i32m4(p2, __riscv_vwcvt_x_x_v_i16m2(q8_2, vl), vl); + vint32m4_t s3 = __riscv_vwmul_vv_i32m4(p3, __riscv_vwcvt_x_x_v_i16m2(q8_3, vl), vl); + + vint32m1_t isum0 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s0, s1, vl), vzero, vl); + vint32m1_t isum1 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s2, s3, vl), isum0, vl); + + isum += __riscv_vmv_x_s_i32m1_i32(isum1); + + q2+=32; q8+=128; is=8; + + } + + sumf += dall * isum; + + } + + *s = sumf; + +#else + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + const uint8_t * q2 = x[i].qs; + const int8_t * q8 = y[i].qs; + const uint8_t * sc = x[i].scales; + + int summs = 0; + for (int j = 0; j < 16; ++j) { + summs += y[i].bsums[j] * (sc[j] >> 4); + } + + const float dall = y[i].d * GGML_V3_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_V3_FP16_TO_FP32(x[i].dmin); + + int isum = 0; + int is = 0; + int d; + for (int k = 0; k < QK_K/128; ++k) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + d = sc[is++] & 0xF; + int isuml = 0; + for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); + isum += d * isuml; + d = sc[is++] & 0xF; + isuml = 0; + for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); + isum += d * isuml; + shift += 2; + q8 += 32; + } + q2 += 32; + } + sumf += dall * isum - dmin * summs; + } + *s = sumf; +#endif +} + +#else + +static void ggml_v3_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + + const block_q2_K * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + +#ifdef __ARM_NEON + const uint8x16_t m3 = vdupq_n_u8(0x3); + + const int32x4_t vzero = vdupq_n_s32(0); + + ggml_v3_int8x16x4_t q2bytes; + + uint32_t aux32[2]; + const uint8_t * scales = (const uint8_t *)aux32; + + float sum = 0; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * (float)x[i].d; + const float dmin = -y[i].d * (float)x[i].dmin; + + const uint8_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + const uint32_t * restrict sc = (const uint32_t *)x[i].scales; + + aux32[0] = sc[0] & 0x0f0f0f0f; + aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f; + + sum += dmin * (scales[4] * y[i].bsums[0] + scales[5] * y[i].bsums[1] + scales[6] * y[i].bsums[2] + scales[7] * y[i].bsums[3]); + + int isum1 = 0, isum2 = 0; + + const uint8x16_t q2bits = vld1q_u8(q2); + + const ggml_v3_int8x16x4_t q8bytes = ggml_v3_vld1q_s8_x4(q8); + + q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits, m3)); + q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 2), m3)); + q2bytes.val[2] = 
vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 4), m3)); + q2bytes.val[3] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 6), m3)); + + isum1 += vaddvq_s32(ggml_v3_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0]; + isum2 += vaddvq_s32(ggml_v3_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1]; + isum1 += vaddvq_s32(ggml_v3_vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2]; + isum2 += vaddvq_s32(ggml_v3_vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3]; + + sum += d * (isum1 + isum2); + } + + *s = sum; + +#elif defined __AVX2__ + + const __m256i m3 = _mm256_set1_epi8(3); + + __m256 acc = _mm256_setzero_ps(); + + uint32_t ud, um; + const uint8_t * restrict db = (const uint8_t *)&ud; + const uint8_t * restrict mb = (const uint8_t *)&um; + + float summs = 0; + + // TODO: optimize this + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_V3_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_V3_FP16_TO_FP32(x[i].dmin); + + const uint8_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + const uint32_t * restrict sc = (const uint32_t *)x[i].scales; + ud = (sc[0] >> 0) & 0x0f0f0f0f; + um = (sc[0] >> 4) & 0x0f0f0f0f; + + int32_t smin = mb[0] * y[i].bsums[0] + mb[1] * y[i].bsums[1] + mb[2] * y[i].bsums[2] + mb[3] * y[i].bsums[3]; + summs += dmin * smin; + + const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2); + const __m256i q2_0 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 2), q2bits), m3); + const __m256i q2_1 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3); + + const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); + const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); + + const __m256i p0 = _mm256_maddubs_epi16(q2_0, q8_0); + const __m256i p1 = _mm256_maddubs_epi16(q2_1, q8_1); + + const __m256i p_0 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p0, 0)); + const __m256i p_1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p0, 1)); + const __m256i p_2 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p1, 0)); + const __m256i p_3 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p1, 1)); + + acc = _mm256_fmadd_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0), acc); + acc = _mm256_fmadd_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1), acc); + acc = _mm256_fmadd_ps(_mm256_set1_ps(d * db[2]), _mm256_cvtepi32_ps(p_2), acc); + acc = _mm256_fmadd_ps(_mm256_set1_ps(d * db[3]), _mm256_cvtepi32_ps(p_3), acc); + } + + *s = hsum_float_8(acc) + summs; + +#elif defined __AVX__ + + const __m128i m3 = _mm_set1_epi8(3); + + __m256 acc = _mm256_setzero_ps(); + + uint32_t ud, um; + const uint8_t * restrict db = (const uint8_t *)&ud; + const uint8_t * restrict mb = (const uint8_t *)&um; + + float summs = 0; + + // TODO: optimize this + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_V3_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_V3_FP16_TO_FP32(x[i].dmin); + + const uint8_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + const uint32_t * restrict sc = (const uint32_t *)x[i].scales; + ud = (sc[0] >> 0) & 0x0f0f0f0f; + um = (sc[0] >> 4) & 0x0f0f0f0f; + + int32_t smin = mb[0] * y[i].bsums[0] + mb[1] * y[i].bsums[1] + mb[2] * y[i].bsums[2] + mb[3] * y[i].bsums[3]; + summs += dmin * smin; + + const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2); + const __m128i q2_0 = _mm_and_si128(q2bits, m3); + const __m128i q2_1 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3); + const 
__m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3); + const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3); + + const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); + const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); + + const __m128i p0 = _mm_maddubs_epi16(q2_0, _mm256_extractf128_si256(q8_0, 0)); + const __m128i p1 = _mm_maddubs_epi16(q2_1, _mm256_extractf128_si256(q8_0, 1)); + const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0)); + const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1)); + + const __m256i p_0 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0)); + const __m256i p_1 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1)); + const __m256i p_2 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2)); + const __m256i p_3 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3)); + + acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc); + acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc); + acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[2]), _mm256_cvtepi32_ps(p_2)), acc); + acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[3]), _mm256_cvtepi32_ps(p_3)), acc); + } + + *s = hsum_float_8(acc) + summs; + +#elif defined __riscv_v_intrinsic + + uint32_t aux32[2]; + const uint8_t * scales = (const uint8_t *)aux32; + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * (float)x[i].d; + const float dmin = -y[i].d * (float)x[i].dmin; + + const uint8_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + const uint32_t * restrict sc = (const uint32_t *)x[i].scales; + + aux32[0] = sc[0] & 0x0f0f0f0f; + aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f; + + sumf += dmin * (scales[4] * y[i].bsums[0] + scales[5] * y[i].bsums[1] + scales[6] * y[i].bsums[2] + scales[7] * y[i].bsums[3]); + + int isum1 = 0; + int isum2 = 0; + + size_t vl = 16; + + vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1); + + // load Q2 + vuint8mf2_t q2_x = __riscv_vle8_v_u8mf2(q2, vl); + + vint8mf2_t q2_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q2_x, 0x03, vl)); + vint8mf2_t q2_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x2, vl), 0x03 , vl)); + vint8mf2_t q2_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x4, vl), 0x03 , vl)); + vint8mf2_t q2_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x6, vl), 0x03 , vl)); + + // load Q8, and take product with Q2 + vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q2_0, __riscv_vle8_v_i8mf2(q8, vl), vl); + vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q2_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl); + vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q2_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl); + vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q2_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl); + + vint16m1_t vs_0 = __riscv_vredsum_vs_i16m1_i16m1(p0, vzero, vl); + vint16m1_t vs_1 = __riscv_vredsum_vs_i16m1_i16m1(p1, vzero, vl); + vint16m1_t vs_2 = __riscv_vredsum_vs_i16m1_i16m1(p2, vzero, vl); + vint16m1_t vs_3 = __riscv_vredsum_vs_i16m1_i16m1(p3, vzero, vl); + + isum1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[0]; + isum2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[1]; + isum1 += __riscv_vmv_x_s_i16m1_i16(vs_2) * scales[2]; + 
isum2 += __riscv_vmv_x_s_i16m1_i16(vs_3) * scales[3]; + + sumf += d * (isum1 + isum2); + + } + + *s = sumf; + +#else + + float sumf = 0; + + int isum[4]; + + for (int i = 0; i < nb; ++i) { + + const uint8_t * q2 = x[i].qs; + const int8_t * q8 = y[i].qs; + const uint8_t * sc = x[i].scales; + + int summs = 0; + for (int j = 0; j < QK_K/16; ++j) { + summs += y[i].bsums[j] * (sc[j] >> 4); + } + + const float dall = y[i].d * GGML_V3_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_V3_FP16_TO_FP32(x[i].dmin); + + isum[0] = isum[1] = isum[2] = isum[3] = 0; + for (int l = 0; l < 16; ++l) { + isum[0] += q8[l+ 0] * ((q2[l] >> 0) & 3); + isum[1] += q8[l+16] * ((q2[l] >> 2) & 3); + isum[2] += q8[l+32] * ((q2[l] >> 4) & 3); + isum[3] += q8[l+48] * ((q2[l] >> 6) & 3); + } + for (int l = 0; l < 4; ++l) { + isum[l] *= (sc[l] & 0xF); + } + sumf += dall * (isum[0] + isum[1] + isum[2] + isum[3]) - dmin * summs; + } + *s = sumf; +#endif +} +#endif + +#if QK_K == 256 +static void ggml_v3_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + assert(n % QK_K == 0); + + const uint32_t kmask1 = 0x03030303; + const uint32_t kmask2 = 0x0f0f0f0f; + + const block_q3_K * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + +#ifdef __ARM_NEON + + uint32_t aux[3]; + uint32_t utmp[4]; + + const uint8x16_t m3b = vdupq_n_u8(0x3); + const int32x4_t vzero = vdupq_n_s32(0); + + const uint8x16_t m0 = vdupq_n_u8(1); + const uint8x16_t m1 = vshlq_n_u8(m0, 1); + const uint8x16_t m2 = vshlq_n_u8(m0, 2); + const uint8x16_t m3 = vshlq_n_u8(m0, 3); + const int8_t m32 = 32; + + ggml_v3_int8x16x4_t q3bytes; + + float sum = 0; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_V3_FP16_TO_FP32(x[i].d); + + const uint8_t * restrict q3 = x[i].qs; + const uint8_t * restrict qh = x[i].hmask; + const int8_t * restrict q8 = y[i].qs; + + ggml_v3_uint8x16x2_t qhbits = ggml_v3_vld1q_u8_x2(qh); + + ggml_v3_uint8x16x4_t q3h; + + int32_t isum = 0; + + // Set up scales + memcpy(aux, x[i].scales, 12); + utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); + utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); + utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4); + utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4); + + int8_t * scale = (int8_t *)utmp; + for (int j = 0; j < 16; ++j) scale[j] -= m32; + + for (int j = 0; j < QK_K/128; ++j) { + + const ggml_v3_uint8x16x2_t q3bits = ggml_v3_vld1q_u8_x2(q3); q3 += 32; + const ggml_v3_int8x16x4_t q8bytes_1 = ggml_v3_vld1q_s8_x4(q8); q8 += 64; + const ggml_v3_int8x16x4_t q8bytes_2 = ggml_v3_vld1q_s8_x4(q8); q8 += 64; + + q3h.val[0] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[0]), 2); + q3h.val[1] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[1]), 2); + q3h.val[2] = vshlq_n_u8(vbicq_u8(m1, qhbits.val[0]), 1); + q3h.val[3] = vshlq_n_u8(vbicq_u8(m1, qhbits.val[1]), 1); + + q3bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q3bits.val[0], m3b)), vreinterpretq_s8_u8(q3h.val[0])); + q3bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q3bits.val[1], m3b)), vreinterpretq_s8_u8(q3h.val[1])); + q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2])); + q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3])); + + isum += vaddvq_s32(ggml_v3_vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0]; + isum += 
vaddvq_s32(ggml_v3_vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1]; + isum += vaddvq_s32(ggml_v3_vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2]; + isum += vaddvq_s32(ggml_v3_vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3]; + + scale += 4; + + q3h.val[0] = vbicq_u8(m2, qhbits.val[0]); + q3h.val[1] = vbicq_u8(m2, qhbits.val[1]); + q3h.val[2] = vshrq_n_u8(vbicq_u8(m3, qhbits.val[0]), 1); + q3h.val[3] = vshrq_n_u8(vbicq_u8(m3, qhbits.val[1]), 1); + + q3bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 4), m3b)), vreinterpretq_s8_u8(q3h.val[0])); + q3bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 4), m3b)), vreinterpretq_s8_u8(q3h.val[1])); + q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2])); + q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3])); + + isum += vaddvq_s32(ggml_v3_vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0]; + isum += vaddvq_s32(ggml_v3_vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1]; + isum += vaddvq_s32(ggml_v3_vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2]; + isum += vaddvq_s32(ggml_v3_vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3]; + + scale += 4; + + if (j == 0) { + qhbits.val[0] = vshrq_n_u8(qhbits.val[0], 4); + qhbits.val[1] = vshrq_n_u8(qhbits.val[1], 4); + } + + } + sum += d * isum; + + } + + *s = sum; + +#elif defined __AVX2__ + + const __m256i m3 = _mm256_set1_epi8(3); + const __m256i mone = _mm256_set1_epi8(1); + const __m128i m32 = _mm_set1_epi8(32); + + __m256 acc = _mm256_setzero_ps(); + + uint32_t aux[3]; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_V3_FP16_TO_FP32(x[i].d); + + const uint8_t * restrict q3 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + // Set up scales + memcpy(aux, x[i].scales, 12); + __m128i scales128 = _mm_set_epi32( + ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4), + ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4), + (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4), + (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4)); + scales128 = _mm_sub_epi8(scales128, m32); + const __m256i all_scales = _mm256_cvtepi8_epi16(scales128); + const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0); + const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1); + const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)}; + + // high bit + const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask); + + // integer accumulator + __m256i sumi = _mm256_setzero_si256(); + + int bit = 0; + int is = 0; + + for (int j = 0; j < QK_K/128; ++j) { + // load low 2 bits + const __m256i q3bits = _mm256_loadu_si256((const __m256i*)q3); q3 += 32; + + // prepare low and high bits + const __m256i q3l_0 = _mm256_and_si256(q3bits, m3); + const __m256i q3h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); + ++bit; + + const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 2), m3); + const __m256i q3h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); + ++bit; + + const __m256i q3l_2 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 4), m3); + const __m256i q3h_2 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, 
_mm256_slli_epi16(mone, bit)), bit), 2); + ++bit; + + const __m256i q3l_3 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 6), m3); + const __m256i q3h_3 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); + ++bit; + + // load Q8 quants + const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + + // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16, + // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set, + // and 2 if the high bit was set) + __m256i q8s_0 = _mm256_maddubs_epi16(q3h_0, q8_0); + __m256i q8s_1 = _mm256_maddubs_epi16(q3h_1, q8_1); + __m256i q8s_2 = _mm256_maddubs_epi16(q3h_2, q8_2); + __m256i q8s_3 = _mm256_maddubs_epi16(q3h_3, q8_3); + + __m256i p16_0 = _mm256_maddubs_epi16(q3l_0, q8_0); + __m256i p16_1 = _mm256_maddubs_epi16(q3l_1, q8_1); + __m256i p16_2 = _mm256_maddubs_epi16(q3l_2, q8_2); + __m256i p16_3 = _mm256_maddubs_epi16(q3l_3, q8_3); + + p16_0 = _mm256_sub_epi16(p16_0, q8s_0); + p16_1 = _mm256_sub_epi16(p16_1, q8s_1); + p16_2 = _mm256_sub_epi16(p16_2, q8s_2); + p16_3 = _mm256_sub_epi16(p16_3, q8s_3); + + // multiply with scales + p16_0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 0)), p16_0); + p16_1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 1)), p16_1); + p16_2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 2)), p16_2); + p16_3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 3)), p16_3); + + // accumulate + p16_0 = _mm256_add_epi32(p16_0, p16_1); + p16_2 = _mm256_add_epi32(p16_2, p16_3); + sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_2)); + + } + + // multiply with block scale and accumulate + acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc); + + } + + *s = hsum_float_8(acc); + +#elif defined __AVX__ + + const __m128i m3 = _mm_set1_epi8(3); + const __m128i mone = _mm_set1_epi8(1); + const __m128i m32 = _mm_set1_epi8(32); + const __m128i m2 = _mm_set1_epi8(2); + + __m256 acc = _mm256_setzero_ps(); + + const uint32_t *aux; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_V3_FP16_TO_FP32(x[i].d); + + const uint8_t * restrict q3 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + // Set up scales + aux = (const uint32_t *)x[i].scales; + __m128i scales128 = _mm_set_epi32( + ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4), + ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4), + (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4), + (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4)); + scales128 = _mm_sub_epi8(scales128, m32); + const __m128i scales_0 = _mm_cvtepi8_epi16(scales128); + const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales128, scales128)); + const __m128i scales[2] = { scales_0, scales_1 }; + + // high bit *128*2 from block_q3_K.hmask[QK_K/8] + const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].hmask[0]); + const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].hmask[16]); + + // integer accumulator + __m128i sumi_0 = _mm_setzero_si128(); + __m128i sumi_1 = _mm_setzero_si128(); + + for (int j = 0; j < QK_K/128; 
++j) { + // load low 2 bits *64*2 from block_q3_K.qs[QK_K/4] + const __m128i q3bits_0 = _mm_loadu_si128((const __m128i*)q3); q3 += 16; + const __m128i q3bits_1 = _mm_loadu_si128((const __m128i*)q3); q3 += 16; + + // prepare low and high bits + const int bit = j << 2; + + const __m128i q3l_0 = _mm_and_si128(q3bits_0, m3); + const __m128i q3l_1 = _mm_and_si128(q3bits_1, m3); + const __m128i q3h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit)), bit), 2); + const __m128i q3h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit)), bit), 2); + + const __m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 2), m3); + const __m128i q3l_3 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 2), m3); + const __m128i q3h_2 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+1)), bit+1), 2); + const __m128i q3h_3 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+1)), bit+1), 2); + + const __m128i q3l_4 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 4), m3); + const __m128i q3l_5 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 4), m3); + const __m128i q3h_4 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+2)), bit+2), 2); + const __m128i q3h_5 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+2)), bit+2), 2); + + const __m128i q3l_6 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 6), m3); + const __m128i q3l_7 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 6), m3); + const __m128i q3h_6 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+3)), bit+3), 2); + const __m128i q3h_7 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+3)), bit+3), 2); + + // load Q8 quants from block_q8_K.qs[QK_K] + const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + + // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16, + // and then subtract. 
The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set, + // and 2 if the high bit was set) + __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, q8_0); + __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, q8_1); + __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, q8_2); + __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, q8_3); + __m128i q8s_4 = _mm_maddubs_epi16(q3h_4, q8_4); + __m128i q8s_5 = _mm_maddubs_epi16(q3h_5, q8_5); + __m128i q8s_6 = _mm_maddubs_epi16(q3h_6, q8_6); + __m128i q8s_7 = _mm_maddubs_epi16(q3h_7, q8_7); + + __m128i p16_0 = _mm_maddubs_epi16(q3l_0, q8_0); + __m128i p16_1 = _mm_maddubs_epi16(q3l_1, q8_1); + __m128i p16_2 = _mm_maddubs_epi16(q3l_2, q8_2); + __m128i p16_3 = _mm_maddubs_epi16(q3l_3, q8_3); + __m128i p16_4 = _mm_maddubs_epi16(q3l_4, q8_4); + __m128i p16_5 = _mm_maddubs_epi16(q3l_5, q8_5); + __m128i p16_6 = _mm_maddubs_epi16(q3l_6, q8_6); + __m128i p16_7 = _mm_maddubs_epi16(q3l_7, q8_7); + + p16_0 = _mm_sub_epi16(p16_0, q8s_0); + p16_1 = _mm_sub_epi16(p16_1, q8s_1); + p16_2 = _mm_sub_epi16(p16_2, q8s_2); + p16_3 = _mm_sub_epi16(p16_3, q8s_3); + p16_4 = _mm_sub_epi16(p16_4, q8s_4); + p16_5 = _mm_sub_epi16(p16_5, q8s_5); + p16_6 = _mm_sub_epi16(p16_6, q8s_6); + p16_7 = _mm_sub_epi16(p16_7, q8s_7); + + // multiply with scales + __m128i shuffle = _mm_set1_epi16(0x0100); + p16_0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_0); + shuffle = _mm_add_epi16(shuffle, m2); + p16_1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_1); + shuffle = _mm_add_epi16(shuffle, m2); + p16_2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_2); + shuffle = _mm_add_epi16(shuffle, m2); + p16_3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_3); + shuffle = _mm_add_epi16(shuffle, m2); + p16_4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_4); + shuffle = _mm_add_epi16(shuffle, m2); + p16_5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_5); + shuffle = _mm_add_epi16(shuffle, m2); + p16_6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_6); + shuffle = _mm_add_epi16(shuffle, m2); + p16_7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_7); + + // accumulate + p16_0 = _mm_add_epi32(p16_0, p16_1); + p16_2 = _mm_add_epi32(p16_2, p16_3); + p16_4 = _mm_add_epi32(p16_4, p16_5); + p16_6 = _mm_add_epi32(p16_6, p16_7); + sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2)); + sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_4, p16_6)); + + } + + // multiply with block scale and accumulate + __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); + acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc); + + } + + *s = hsum_float_8(acc); + +#elif defined __riscv_v_intrinsic + + uint32_t aux[3]; + uint32_t utmp[4]; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + + const uint8_t * restrict q3 = x[i].qs; + const uint8_t * restrict qh = x[i].hmask; + const int8_t * restrict q8 = y[i].qs; + + memcpy(aux, x[i].scales, 12); + utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); + utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); + utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4); + utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4); + + int8_t * scale = (int8_t *)utmp; + for (int j = 0; j < 16; ++j) scale[j] -= 32; + + + size_t vl = 32; + uint8_t m = 1; + + vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); + vuint8m1_t vqh = __riscv_vle8_v_u8m1(qh, vl); + + int sum_t = 0; + + for (int j = 0; j < QK_K; j += 128) { + + 
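// Note added for this write-up (explanatory sketch, not part of the upstream kernel): each pass of
// this loop consumes one 128-quant chunk of the super-block. The 32 bytes loaded from q3 carry four
// 2-bit planes (shifts 0, 2, 4, 6), and the bit 'm' walked through vqh supplies the third, high bit
// for each plane. Per element the decode matches the scalar fallback further below:
//     val = ((q3[l] >> shift) & 3) - ((qh[l] & m) ? 0 : 4);
// so values whose high bit is clear get 4 subtracted. The widened products with q8 are then reduced
// 16 elements at a time, each half weighted by one 6-bit scale (already de-biased by 32 above).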
vl = 32; + + // load Q3 + vuint8m1_t q3_x = __riscv_vle8_v_u8m1(q3, vl); + + vint8m1_t q3_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q3_x, 0x03, vl)); + vint8m1_t q3_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x2, vl), 0x03 , vl)); + vint8m1_t q3_2 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x4, vl), 0x03 , vl)); + vint8m1_t q3_3 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x6, vl), 0x03 , vl)); + + // compute mask for subtraction + vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl); + vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl); + vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl); + m <<= 1; + + vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl); + vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl); + vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl); + m <<= 1; + + vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl); + vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl); + vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl); + m <<= 1; + + vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl); + vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl); + vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl); + m <<= 1; + + // load Q8 and take product with Q3 + vint16m2_t a0 = __riscv_vwmul_vv_i16m2(q3_m0, __riscv_vle8_v_i8m1(q8, vl), vl); + vint16m2_t a1 = __riscv_vwmul_vv_i16m2(q3_m1, __riscv_vle8_v_i8m1(q8+32, vl), vl); + vint16m2_t a2 = __riscv_vwmul_vv_i16m2(q3_m2, __riscv_vle8_v_i8m1(q8+64, vl), vl); + vint16m2_t a3 = __riscv_vwmul_vv_i16m2(q3_m3, __riscv_vle8_v_i8m1(q8+96, vl), vl); + + vl = 16; + + // retrieve lane to multiply with scale + vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl); + vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl); + vint32m2_t aux1_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl); + vint32m2_t aux1_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 1), (scale[3]), vl); + vint32m2_t aux2_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 0), (scale[4]), vl); + vint32m2_t aux2_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 1), (scale[5]), vl); + vint32m2_t aux3_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 0), (scale[6]), vl); + vint32m2_t aux3_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 1), (scale[7]), vl); + + vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux0_0, aux0_1, vl), vzero, vl); + vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux1_0, aux1_1, vl), isum0, vl); + vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux2_0, aux2_1, vl), isum1, vl); + vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux3_0, aux3_1, vl), isum2, vl); + + sum_t += __riscv_vmv_x_s_i32m1_i32(isum3); + + q3 += 32; q8 += 128; scale += 8; + + } + + const float d = GGML_V3_FP16_TO_FP32(x[i].d) * y[i].d; + + sumf += d*sum_t; + + } + + *s = sumf; + +#else + // scalar version + // This function is written like this so the compiler can manage to vectorize most of it + // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the + // manually vectorized version above. Every other version I tried would run at least 4 times slower. 
+ // The ideal situation would be if we could just write the code once, and the compiler would + // automatically produce the best possible set of machine instructions, instead of us having to manually + // write vectorized versions for AVX, ARM_NEON, etc. + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + uint32_t auxs[4]; + const int8_t * scales = (const int8_t*)auxs; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * restrict q3 = x[i].qs; + const uint8_t * restrict hm = x[i].hmask; + const int8_t * restrict q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * restrict a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + q3 += 32; + } + a = aux8; + + memcpy(auxs, x[i].scales, 12); + uint32_t tmp = auxs[2]; + auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + for (int j = 0; j < QK_K/16; ++j) { + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_V3_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; + +#endif + +} + +#else + +static void ggml_v3_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + assert(n % QK_K == 0); + + const block_q3_K * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + +#ifdef __ARM_NEON + const int32x4_t vzero = vdupq_n_s32(0); + + const uint8x16_t m3b = vdupq_n_u8(0x3); + const uint8x16_t mh = vdupq_n_u8(4); + + ggml_v3_int8x16x4_t q3bytes; + + uint16_t aux16[2]; + int8_t * scales = (int8_t *)aux16; + + float sum = 0; + + for (int i = 0; i < nb; ++i) { + + ggml_v3_uint8x16x4_t q3h; + + const uint8x8_t hbits = vld1_u8(x[i].hmask); + const uint8x16_t q3bits = vld1q_u8(x[i].qs); + const ggml_v3_int8x16x4_t q8bytes = ggml_v3_vld1q_s8_x4(y[i].qs); + + const uint16_t a = *(const uint16_t *)x[i].scales; + aux16[0] = a & 0x0f0f; + aux16[1] = (a >> 4) & 0x0f0f; + + for (int j = 0; j < 4; ++j) scales[j] -= 8; + + int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]); + + const float d = y[i].d * (float)x[i].d; + + const uint8x16_t htmp = vcombine_u8(hbits, vshr_n_u8(hbits, 1)); + q3h.val[0] = vandq_u8(mh, vshlq_n_u8(htmp, 2)); + q3h.val[1] = vandq_u8(mh, htmp); + q3h.val[2] = vandq_u8(mh, vshrq_n_u8(htmp, 2)); + q3h.val[3] = vandq_u8(mh, vshrq_n_u8(htmp, 4)); + + q3bytes.val[0] = 
vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q3bits, m3b), q3h.val[0])); + q3bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 2), m3b), q3h.val[1])); + q3bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 4), m3b), q3h.val[2])); + q3bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q3bits, 6), q3h.val[3])); + + isum += vaddvq_s32(ggml_v3_vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0]; + isum += vaddvq_s32(ggml_v3_vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2]; + isum += vaddvq_s32(ggml_v3_vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1]; + isum += vaddvq_s32(ggml_v3_vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3]; + + sum += d * isum; + + } + + *s = sum; + +#elif defined __AVX2__ + + const __m256i m3 = _mm256_set1_epi8(3); + const __m256i m1 = _mm256_set1_epi8(1); + + __m256 acc = _mm256_setzero_ps(); + + uint64_t aux64; + + uint16_t aux16[2]; + const int8_t * aux8 = (const int8_t *)aux16; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_V3_FP16_TO_FP32(x[i].d); + + const uint8_t * restrict q3 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + const uint16_t a = *(const uint16_t *)x[i].scales; + aux16[0] = a & 0x0f0f; + aux16[1] = (a >> 4) & 0x0f0f; + + const __m256i scale_0 = MM256_SET_M128I(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8)); + const __m256i scale_1 = MM256_SET_M128I(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8)); + + memcpy(&aux64, x[i].hmask, 8); + + const __m128i haux = _mm_set_epi64x(aux64 >> 1, aux64 >> 0); + __m256i q3h_0 = MM256_SET_M128I(_mm_srli_epi16(haux, 2), haux); + __m256i q3h_1 = _mm256_srli_epi16(q3h_0, 4); + q3h_0 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_0, m1), 2); + q3h_1 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_1, m1), 2); + + // load low 2 bits + const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3); + + // prepare low and high bits + const __m256i q3aux = MM256_SET_M128I(_mm_srli_epi16(q3bits, 2), q3bits); + const __m256i q3l_0 = _mm256_and_si256(q3aux, m3); + const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3aux, 4), m3); + + // load Q8 quants + const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); + const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); + + // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16, + // and then subtract. 
The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set, + // and 2 if the high bit was set) + const __m256i q8s_0 = _mm256_maddubs_epi16(q3h_0, q8_0); + const __m256i q8s_1 = _mm256_maddubs_epi16(q3h_1, q8_1); + + __m256i p16_0 = _mm256_maddubs_epi16(q3l_0, q8_0); + __m256i p16_1 = _mm256_maddubs_epi16(q3l_1, q8_1); + + p16_0 = _mm256_sub_epi16(p16_0, q8s_0); + p16_1 = _mm256_sub_epi16(p16_1, q8s_1); + + // multiply with scales + p16_0 = _mm256_madd_epi16(scale_0, p16_0); + p16_1 = _mm256_madd_epi16(scale_1, p16_1); + + p16_0 = _mm256_add_epi32(p16_0, p16_1); + + // multiply with block scale and accumulate + acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16_0), acc); + + } + + *s = hsum_float_8(acc); + +#elif defined __AVX__ + + const __m128i m3 = _mm_set1_epi8(3); + const __m128i m1 = _mm_set1_epi8(1); + + __m256 acc = _mm256_setzero_ps(); + + uint64_t aux64; + + uint16_t aux16[2]; + const int8_t * aux8 = (const int8_t *)aux16; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_V3_FP16_TO_FP32(x[i].d); + + const uint8_t * restrict q3 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + const uint16_t a = *(const uint16_t *)x[i].scales; + aux16[0] = a & 0x0f0f; + aux16[1] = (a >> 4) & 0x0f0f; + + const __m128i scale_0 = _mm_set1_epi16(aux8[0] - 8); + const __m128i scale_1 = _mm_set1_epi16(aux8[2] - 8); + const __m128i scale_2 = _mm_set1_epi16(aux8[1] - 8); + const __m128i scale_3 = _mm_set1_epi16(aux8[3] - 8); + + memcpy(&aux64, x[i].hmask, 8); + + __m128i q3h_0 = _mm_set_epi64x(aux64 >> 1, aux64 >> 0); + __m128i q3h_1 = _mm_srli_epi16(q3h_0, 2); + __m128i q3h_2 = _mm_srli_epi16(q3h_0, 4); + __m128i q3h_3 = _mm_srli_epi16(q3h_0, 6); + q3h_0 = _mm_slli_epi16(_mm_andnot_si128(q3h_0, m1), 2); + q3h_1 = _mm_slli_epi16(_mm_andnot_si128(q3h_1, m1), 2); + q3h_2 = _mm_slli_epi16(_mm_andnot_si128(q3h_2, m1), 2); + q3h_3 = _mm_slli_epi16(_mm_andnot_si128(q3h_3, m1), 2); + + // load low 2 bits + const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3); + + // prepare low and high bits + const __m128i q3l_0 = _mm_and_si128(q3bits, m3); + const __m128i q3l_1 = _mm_and_si128(_mm_srli_epi16(q3bits, 2), m3); + const __m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits, 4), m3); + const __m128i q3l_3 = _mm_and_si128(_mm_srli_epi16(q3bits, 6), m3); + + // load Q8 quants + const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); + const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); + + // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm_maddubs_epi16, + // and then subtract. 
The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set, + // and 2 if the high bit was set) + const __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, _mm256_extractf128_si256(q8_0, 0)); + const __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, _mm256_extractf128_si256(q8_0, 1)); + const __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, _mm256_extractf128_si256(q8_1, 0)); + const __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, _mm256_extractf128_si256(q8_1, 1)); + + __m128i p16_0 = _mm_maddubs_epi16(q3l_0, _mm256_extractf128_si256(q8_0, 0)); + __m128i p16_1 = _mm_maddubs_epi16(q3l_1, _mm256_extractf128_si256(q8_0, 1)); + __m128i p16_2 = _mm_maddubs_epi16(q3l_2, _mm256_extractf128_si256(q8_1, 0)); + __m128i p16_3 = _mm_maddubs_epi16(q3l_3, _mm256_extractf128_si256(q8_1, 1)); + + p16_0 = _mm_sub_epi16(p16_0, q8s_0); + p16_1 = _mm_sub_epi16(p16_1, q8s_1); + p16_2 = _mm_sub_epi16(p16_2, q8s_2); + p16_3 = _mm_sub_epi16(p16_3, q8s_3); + + // multiply with scales + p16_0 = _mm_madd_epi16(scale_0, p16_0); + p16_1 = _mm_madd_epi16(scale_1, p16_1); + p16_2 = _mm_madd_epi16(scale_2, p16_2); + p16_3 = _mm_madd_epi16(scale_3, p16_3); + + p16_0 = _mm_add_epi32(p16_0, p16_2); + p16_1 = _mm_add_epi32(p16_1, p16_3); + __m256i p16 = MM256_SET_M128I(p16_1, p16_0); + + // multiply with block scale and accumulate + acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc); + + } + + *s = hsum_float_8(acc); + +#elif defined __riscv_v_intrinsic + + uint16_t aux16[2]; + int8_t * scales = (int8_t *)aux16; + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + const uint8_t * restrict q3 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + const uint16_t a = *(const uint16_t *)x[i].scales; + aux16[0] = a & 0x0f0f; + aux16[1] = (a >> 4) & 0x0f0f; + + for (int j = 0; j < 4; ++j) scales[j] -= 8; + + int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]); + + const float d = y[i].d * (float)x[i].d; + + vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); + + // load qh + vuint8mf4_t qh_x1 = __riscv_vle8_v_u8mf4(x[i].hmask, 8); + vuint8mf2_t qh_x2 = __riscv_vlmul_ext_v_u8mf4_u8mf2(__riscv_vsrl_vx_u8mf4(qh_x1, 1, 8)); + + size_t vl = 16; + + // extend and combine both qh_x1 and qh_x2 + vuint8mf2_t qh_x = __riscv_vslideup_vx_u8mf2(__riscv_vlmul_ext_v_u8mf4_u8mf2(qh_x1), qh_x2, vl/2, vl); + + vuint8mf2_t qh_0 = __riscv_vand_vx_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x2, vl), 0x4, vl); + vuint8mf2_t qh_1 = __riscv_vand_vx_u8mf2(qh_x, 0x4, vl); + vuint8mf2_t qh_2 = __riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl), 0x4, vl); + vuint8mf2_t qh_3 = __riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x4, vl), 0x4, vl); + + // load Q3 + vuint8mf2_t q3_x = __riscv_vle8_v_u8mf2(q3, vl); + + vuint8mf2_t q3h_0 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q3_x, 0x3, vl), qh_0, vl); + vuint8mf2_t q3h_1 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 2, vl), 0x3, vl), qh_1, vl); + vuint8mf2_t q3h_2 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 4, vl), 0x3, vl), qh_2, vl); + vuint8mf2_t q3h_3 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 0x6, vl), qh_3, vl); + + vint8mf2_t q3_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_0); + vint8mf2_t q3_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_1); + vint8mf2_t q3_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_2); + vint8mf2_t q3_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_3); + + // load Q8 and take product with Q3 + vint16m1_t p0 = 
__riscv_vwmul_vv_i16m1(q3_0, __riscv_vle8_v_i8mf2(q8, vl), vl); + vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q3_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl); + vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q3_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl); + vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q3_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl); + + vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl); + vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl); + vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl); + vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl); + + isum += __riscv_vmv_x_s_i32m1_i32(vs_0) * scales[0]; + isum += __riscv_vmv_x_s_i32m1_i32(vs_1) * scales[2]; + isum += __riscv_vmv_x_s_i32m1_i32(vs_2) * scales[1]; + isum += __riscv_vmv_x_s_i32m1_i32(vs_3) * scales[3]; + + sumf += d * isum; + + } + + *s = sumf; + +#else + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + int32_t scales[4]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * restrict q3 = x[i].qs; + const uint8_t * restrict hm = x[i].hmask; + const int8_t * restrict q8 = y[i].qs; + int8_t * restrict a = aux8; + for (int l = 0; l < 8; ++l) { + a[l+ 0] = (int8_t)((q3[l+0] >> 0) & 3) - (hm[l] & 0x01 ? 0 : 4); + a[l+ 8] = (int8_t)((q3[l+8] >> 0) & 3) - (hm[l] & 0x02 ? 0 : 4); + a[l+16] = (int8_t)((q3[l+0] >> 2) & 3) - (hm[l] & 0x04 ? 0 : 4); + a[l+24] = (int8_t)((q3[l+8] >> 2) & 3) - (hm[l] & 0x08 ? 0 : 4); + a[l+32] = (int8_t)((q3[l+0] >> 4) & 3) - (hm[l] & 0x10 ? 0 : 4); + a[l+40] = (int8_t)((q3[l+8] >> 4) & 3) - (hm[l] & 0x20 ? 0 : 4); + a[l+48] = (int8_t)((q3[l+0] >> 6) & 3) - (hm[l] & 0x40 ? 0 : 4); + a[l+56] = (int8_t)((q3[l+8] >> 6) & 3) - (hm[l] & 0x80 ? 0 : 4); + } + + scales[0] = (x[i].scales[0] & 0xF) - 8; + scales[1] = (x[i].scales[0] >> 4) - 8; + scales[2] = (x[i].scales[1] & 0xF) - 8; + scales[3] = (x[i].scales[1] >> 4) - 8; + + memset(aux32, 0, 8*sizeof(int32_t)); + for (int j = 0; j < QK_K/16; ++j) { + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] += q8[l] * a[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux32[l] += scales[j] * aux16[l]; + } + const float d = GGML_V3_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; + +#endif + +} +#endif + +#if QK_K == 256 +static void ggml_v3_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + assert(n % QK_K == 0); + + const block_q4_K * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + +#ifdef __ARM_NEON + const uint8x16_t m4b = vdupq_n_u8(0xf); + const int32x4_t mzero = vdupq_n_s32(0); + + ggml_v3_int8x16x2_t q4bytes; + ggml_v3_int8x16x2_t q8bytes; + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_V3_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_V3_FP16_TO_FP32(x[i].dmin); + + const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8)); + + memcpy(utmp, x[i].scales, 12); + + uint32x2_t mins8 = { 0 }; + mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0); + mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1); + + utmp[1] = 
(utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[0] &= kmask1; + + const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins8))); + const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)), + vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins))); + sumf -= dmin * vaddvq_s32(prod); + + const uint8_t * scales = (const uint8_t *)utmp; + + const uint8_t * restrict q4 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + int32_t sumi1 = 0; + int32_t sumi2 = 0; + + for (int j = 0; j < QK_K/64; ++j) { + const ggml_v3_uint8x16x2_t q4bits = ggml_v3_vld1q_u8_x2(q4); q4 += 32; + + q8bytes = ggml_v3_vld1q_s8_x2(q8); q8 += 32; + q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b)); + q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b)); + + const int32x4_t p1 = ggml_v3_vdotq_s32(ggml_v3_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); + sumi1 += vaddvq_s32(p1) * scales[2*j+0]; + + q8bytes = ggml_v3_vld1q_s8_x2(q8); q8 += 32; + q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4)); + q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4)); + + const int32x4_t p2 = ggml_v3_vdotq_s32(ggml_v3_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); + + sumi2 += vaddvq_s32(p2) * scales[2*j+1]; + } + + sumf += d * (sumi1 + sumi2); + + } + + *s = sumf; + +#elif defined __AVX2__ + + const __m256i m4 = _mm256_set1_epi8(0xF); + + __m256 acc = _mm256_setzero_ps(); + __m128 acc_m = _mm_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_V3_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_V3_FP16_TO_FP32(x[i].dmin); + + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + const uint8_t * restrict q4 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0])); + + const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums); + const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1)); + const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s); + acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m); + + const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0); + const __m256i scales = MM256_SET_M128I(sc128, sc128); + + __m256i sumi = _mm256_setzero_si256(); + + for (int j = 0; j < QK_K/64; ++j) { + + const __m256i scale_l = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0)); + const __m256i scale_h = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1)); + + const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); q4 += 32; + const __m256i q4l = _mm256_and_si256(q4bits, m4); + const __m256i q4h = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), m4); + + const __m256i q8l = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + __m256i p16l = _mm256_maddubs_epi16(q4l, q8l); + p16l = _mm256_madd_epi16(scale_l, p16l); + + const __m256i q8h = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + __m256i p16h = _mm256_maddubs_epi16(q4h, q8h); + p16h = _mm256_madd_epi16(scale_h, p16h); + const __m256i sumj = _mm256_add_epi32(p16l, p16h); + + sumi = _mm256_add_epi32(sumi, sumj); + } + + 
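// Explanatory comment for this write-up (not present in the original source): at this point 'sumi'
// holds sum_j scales[j] * sum_l q4[l]*q8[l] over the whole super-block, with the 4-bit quants still
// unsigned (0..15). The per-block contribution to the dot product is
//     d * sumi  -  y[i].d * x[i].dmin * sum_j mins[j] * (bsums[2j] + bsums[2j+1])
// and the min/offset term was already folded into acc_m above (dmin was computed with a negative
// sign), so the remaining step below only fuses d * sumi into the float accumulator.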
__m256 vd = _mm256_set1_ps(d); + acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc); + + } + + acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m)); + acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m)); + + *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m); + +#elif defined __AVX__ + + const __m128i m4 = _mm_set1_epi8(0xF); + const __m128i m2 = _mm_set1_epi8(0x2); + + __m256 acc = _mm256_setzero_ps(); + __m128 acc_m = _mm_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_V3_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_V3_FP16_TO_FP32(x[i].dmin); + + const uint8_t * restrict q4 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]); + const __m128i scales = _mm_cvtepu8_epi16(utmps); + const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps)); + + const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]); + const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]); + const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1); + const __m128i prod = _mm_madd_epi16(mins, q8s); + acc_m = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod)), acc_m); + + __m128i sumi_0 = _mm_setzero_si128(); + __m128i sumi_1 = _mm_setzero_si128(); + + __m128i shuffle = _mm_set1_epi16(0x0100); + for (int j = 0; j < QK_K/64; ++j) { + + const __m128i scale_l = _mm_shuffle_epi8(scales, shuffle); + shuffle = _mm_add_epi16(shuffle, m2); + const __m128i scale_h = _mm_shuffle_epi8(scales, shuffle); + shuffle = _mm_add_epi16(shuffle, m2); + + __m128i q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16; + const __m128i q4l_0 = _mm_and_si128(q4bits, m4); + const __m128i q4h_0 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4); + q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16; + const __m128i q4l_1 = _mm_and_si128(q4bits, m4); + const __m128i q4h_1 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4); + + const __m128i q8l_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + __m128i p16l = _mm_maddubs_epi16(q4l_0, q8l_0); + p16l = _mm_madd_epi16(scale_l, p16l); + sumi_0 = _mm_add_epi32(sumi_0, p16l); + const __m128i q8l_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + p16l = _mm_maddubs_epi16(q4l_1, q8l_1); + p16l = _mm_madd_epi16(scale_l, p16l); + sumi_1 = _mm_add_epi32(sumi_1, p16l); + + const __m128i q8h_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + __m128i p16h = _mm_maddubs_epi16(q4h_0, q8h_0); + p16h = _mm_madd_epi16(scale_h, p16h); + sumi_0 = _mm_add_epi32(sumi_0, p16h); + const __m128i q8h_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + p16h = _mm_maddubs_epi16(q4h_1, q8h_1); + p16h = _mm_madd_epi16(scale_h, p16h); + sumi_1 = _mm_add_epi32(sumi_1, p16h); + + } + + __m256 vd = _mm256_set1_ps(d); + __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); + acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc); + + } + + acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m)); + acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m)); + + *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m); + +#elif defined __riscv_v_intrinsic + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + 
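// Explanatory sketch added here (not part of the original source): each iteration handles one
// super-block in three steps. First, y[i].bsums are paired up via the strided loads below and
// multiplied by the 8 mins so the dmin correction can be subtracted from sumf. Second, the 12
// packed scale bytes are rearranged through kmask1/kmask2/kmask3 so that utmp[0..1] expose the
// 8 scales and utmp[2..3] the 8 mins as plain bytes. Third, the quants are walked 64 at a time:
// the low nibble of each q4 byte pairs with q8[0..31] and scales[2*j], the high nibble with
// q8[32..63] and scales[2*j+1].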
+ size_t vl = 8; + + const float d = y[i].d * GGML_V3_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_V3_FP16_TO_FP32(x[i].dmin); + + vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl); + vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl); + vint16mf2_t q8sums = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl); + + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + vuint8mf4_t mins8 = __riscv_vle8_v_u8mf4(mins, vl); + vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl)); + vint32m1_t prod = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl); + + vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl); + sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi); + + const uint8_t * restrict q4 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + vl = 32; + + int32_t sum_1 = 0; + int32_t sum_2 = 0; + + vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1); + + for (int j = 0; j < QK_K/64; ++j) { + // load Q4 + vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl); + + // load Q8 and multiply it with lower Q4 nibble + vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl); + vint8m1_t q4_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl)); + vint16m2_t qv_0 = __riscv_vwmul_vv_i16m2(q4_0, q8_0, vl); + vint16m1_t vs_0 = __riscv_vredsum_vs_i16m2_i16m1(qv_0, vzero, vl); + + sum_1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[2*j+0]; + + // load Q8 and multiply it with upper Q4 nibble + vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl); + vint8m1_t q4_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl)); + vint16m2_t qv_1 = __riscv_vwmul_vv_i16m2(q4_1, q8_1, vl); + vint16m1_t vs_1 = __riscv_vredsum_vs_i16m2_i16m1(qv_1, vzero, vl); + + sum_2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[2*j+1]; + + q4 += 32; q8 += 64; + + } + + sumf += d*(sum_1 + sum_2); + + } + + *s = sumf; + +#else + + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * restrict q4 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * restrict a = aux8; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + a += 32; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + a += 32; q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += 
scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_V3_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_V3_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} +#else +static void ggml_v3_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + assert(n % QK_K == 0); + + const block_q4_K * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + +#ifdef __ARM_NEON + const uint8x16_t m4b = vdupq_n_u8(0xf); + + const int32x4_t mzero = vdupq_n_s32(0); + + float sumf = 0; + + ggml_v3_int8x16x2_t q4bytes; + ggml_v3_int8x16x4_t q8bytes; + + float sum_mins = 0.f; + + uint16_t aux16[2]; + const uint8_t * restrict scales = (const uint8_t *)aux16; + + for (int i = 0; i < nb; ++i) { + + const uint8_t * restrict q4 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + const uint16_t * restrict a = (const uint16_t *)x[i].scales; + aux16[0] = a[0] & 0x0f0f; + aux16[1] = (a[0] >> 4) & 0x0f0f; + + const int32_t summi = scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]); + sum_mins += y[i].d * (float)x[i].d[1] * summi; + + const float d = y[i].d * (float)x[i].d[0]; + + const ggml_v3_uint8x16x2_t q4bits = ggml_v3_vld1q_u8_x2(q4); + + q8bytes = ggml_v3_vld1q_s8_x4(q8); + q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b)); + q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b)); + + const int32x4_t p1 = ggml_v3_vdotq_s32(ggml_v3_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); + const int32_t sumi1 = vaddvq_s32(p1) * scales[0]; + + q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4)); + q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4)); + + const int32x4_t p2 = ggml_v3_vdotq_s32(ggml_v3_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]); + const int32_t sumi2 = vaddvq_s32(p2) * scales[1]; + + sumf += d * (sumi1 + sumi2); + } + + *s = sumf - sum_mins; + +#elif defined __AVX2__ + + const __m256i m4 = _mm256_set1_epi8(0xF); + + __m256 acc = _mm256_setzero_ps(); + + float summs = 0; + + uint16_t aux16[2]; + const uint8_t * scales = (const uint8_t *)aux16; + + for (int i = 0; i < nb; ++i) { + + const float d = GGML_V3_FP16_TO_FP32(x[i].d[0]) * y[i].d; + const float m = GGML_V3_FP16_TO_FP32(x[i].d[1]) * y[i].d; + const __m256 vd = _mm256_set1_ps(d); + + const uint16_t * a = (const uint16_t *)x[i].scales; + aux16[0] = a[0] & 0x0f0f; + aux16[1] = (a[0] >> 4) & 0x0f0f; + + summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); + + const uint8_t * restrict q4 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); + const __m256i q4l = _mm256_and_si256(q4bits, m4); + const __m256i q4h = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), m4); + + const __m256i q8l = _mm256_loadu_si256((const __m256i*)(q8+ 0)); + const __m256i q8h = _mm256_loadu_si256((const __m256i*)(q8+32)); + + const __m256i p16l = _mm256_maddubs_epi16(q4l, q8l); + const __m256i p16h = _mm256_maddubs_epi16(q4h, q8h); + + const __m256i p32l = _mm256_madd_epi16(_mm256_set1_epi16(scales[0]), p16l); + acc = 
_mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(p32l), acc); + + const __m256i p32h = _mm256_madd_epi16(_mm256_set1_epi16(scales[1]), p16h); + acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(p32h), acc); + + } + + *s = hsum_float_8(acc) - summs; + +#elif defined __AVX__ + + const __m128i m4 = _mm_set1_epi8(0xF); + + __m256 acc = _mm256_setzero_ps(); + + float summs = 0; + + uint16_t aux16[2]; + const uint8_t * scales = (const uint8_t *)aux16; + + for (int i = 0; i < nb; ++i) { + + const float d = GGML_V3_FP16_TO_FP32(x[i].d[0]) * y[i].d; + const float m = GGML_V3_FP16_TO_FP32(x[i].d[1]) * y[i].d; + const __m256 vd = _mm256_set1_ps(d); + + const uint16_t * a = (const uint16_t *)x[i].scales; + aux16[0] = a[0] & 0x0f0f; + aux16[1] = (a[0] >> 4) & 0x0f0f; + + summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); + + const uint8_t * restrict q4 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); + const __m128i q4bits_0 = _mm256_extractf128_si256(q4bits, 0); + const __m128i q4bits_1 = _mm256_extractf128_si256(q4bits, 1); + const __m128i q4_0 = _mm_and_si128(q4bits_0, m4); + const __m128i q4_1 = _mm_and_si128(q4bits_1, m4); + const __m128i q4_2 = _mm_and_si128(_mm_srli_epi16(q4bits_0, 4), m4); + const __m128i q4_3 = _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4); + + const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); + const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); + + const __m128i p16_0 = _mm_maddubs_epi16(q4_0, _mm256_extractf128_si256(q8_0, 0)); + const __m128i p16_1 = _mm_maddubs_epi16(q4_1, _mm256_extractf128_si256(q8_0, 1)); + const __m128i p16_2 = _mm_maddubs_epi16(q4_2, _mm256_extractf128_si256(q8_1, 0)); + const __m128i p16_3 = _mm_maddubs_epi16(q4_3, _mm256_extractf128_si256(q8_1, 1)); + + const __m128i p32_0 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_0); + const __m128i p32_1 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_1); + acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_1, p32_0))), acc); + + const __m128i p32_2 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_2); + const __m128i p32_3 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_3); + acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_3, p32_2))), acc); + + } + + *s = hsum_float_8(acc) - summs; + +#elif defined __riscv_v_intrinsic + + uint16_t s16[2]; + const uint8_t * restrict scales = (const uint8_t *)s16; + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + const uint8_t * restrict q4 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + const uint16_t * restrict b = (const uint16_t *)x[i].scales; + s16[0] = b[0] & 0x0f0f; + s16[1] = (b[0] >> 4) & 0x0f0f; + + sumf -= y[i].d * GGML_V3_FP16_TO_FP32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); + const float d = y[i].d * GGML_V3_FP16_TO_FP32(x[i].d[0]); + + size_t vl = 32; + + vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1); + + // load Q4 + vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl); + + // load Q8 and multiply it with lower Q4 nibble + vint8m1_t q4_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl)); + vint16m2_t va_0 = __riscv_vwmul_vv_i16m2(q4_a, __riscv_vle8_v_i8m1(q8, vl), vl); + vint16m1_t aux1 = __riscv_vredsum_vs_i16m2_i16m1(va_0, vzero, vl); + + sumf += d*scales[0]*__riscv_vmv_x_s_i16m1_i16(aux1); + + // load Q8 and multiply it with upper Q4 nibble + vint8m1_t q4_s = 
__riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl)); + vint16m2_t va_1 = __riscv_vwmul_vv_i16m2(q4_s, __riscv_vle8_v_i8m1(q8+32, vl), vl); + vint16m1_t aux2 = __riscv_vredsum_vs_i16m2_i16m1(va_1, vzero, vl); + + sumf += d*scales[1]*__riscv_vmv_x_s_i16m1_i16(aux2); + + } + + *s = sumf; + +#else + + uint8_t aux8[QK_K]; + int16_t aux16[16]; + float sums [8]; + memset(sums, 0, 8*sizeof(float)); + + uint16_t s16[2]; + const uint8_t * restrict scales = (const uint8_t *)s16; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * restrict q4 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + uint8_t * restrict a = aux8; + for (int l = 0; l < 32; ++l) a[l+ 0] = q4[l] & 0xF; + for (int l = 0; l < 32; ++l) a[l+32] = q4[l] >> 4; + + const uint16_t * restrict b = (const uint16_t *)x[i].scales; + s16[0] = b[0] & 0x0f0f; + s16[1] = (b[0] >> 4) & 0x0f0f; + + sumf -= y[i].d * GGML_V3_FP16_TO_FP32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); + + const float d = y[i].d * GGML_V3_FP16_TO_FP32(x[i].d[0]); + + for (int j = 0; j < QK_K/32; ++j) { + for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l]; + q8 += 16; a += 16; + for (int l = 0; l < 16; ++l) aux16[l] += q8[l] * a[l]; + q8 += 16; a += 16; + const float dl = d * scales[j]; + for (int l = 0; l < 8; ++l) sums[l] += dl * (aux16[l] + aux16[l+8]); + } + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} +#endif + +#if QK_K == 256 +static void ggml_v3_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + assert(n % QK_K == 0); + + const block_q5_K * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + +#ifdef __ARM_NEON + const uint8x16_t m4b = vdupq_n_u8(0xf); + const uint8x16_t mone = vdupq_n_u8(1); + const uint8x16_t mtwo = vdupq_n_u8(2); + const int32x4_t mzero = vdupq_n_s32(0); + + ggml_v3_int8x16x4_t q5bytes; + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_V3_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_V3_FP16_TO_FP32(x[i].dmin); + + const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8)); + + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + const uint8x8_t mins8 = vld1_u8((const uint8_t*)utmp + 8); + const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(mins8)); + const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)), + vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins))); + int32_t sumi_mins = vaddvq_s32(prod); + + const uint8_t * scales = (const uint8_t *)utmp; + + const uint8_t * restrict q5 = x[i].qs; + const uint8_t * restrict qh = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + + ggml_v3_uint8x16x2_t qhbits = ggml_v3_vld1q_u8_x2(qh); + + ggml_v3_uint8x16x4_t q5h; + + int32_t sumi = 0; + + for (int j = 0; j < QK_K/64; ++j) { + + const ggml_v3_uint8x16x2_t q5bits = ggml_v3_vld1q_u8_x2(q5); q5 += 32; + const ggml_v3_int8x16x4_t q8bytes = ggml_v3_vld1q_s8_x4(q8); q8 += 64; + + q5h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4); + q5h.val[1] = 
vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4); + q5h.val[2] = vshlq_n_u8(vandq_u8(mtwo, qhbits.val[0]), 3); + q5h.val[3] = vshlq_n_u8(vandq_u8(mtwo, qhbits.val[1]), 3); + qhbits.val[0] = vshrq_n_u8(qhbits.val[0], 2); + qhbits.val[1] = vshrq_n_u8(qhbits.val[1], 2); + + q5bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q5bits.val[0], m4b), q5h.val[0])); + q5bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q5bits.val[1], m4b), q5h.val[1])); + q5bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[0], 4), q5h.val[2])); + q5bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[1], 4), q5h.val[3])); + + sumi += vaddvq_s32(ggml_v3_vdotq_s32(ggml_v3_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++; + sumi += vaddvq_s32(ggml_v3_vdotq_s32(ggml_v3_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++; + } + + sumf += d * sumi - dmin * sumi_mins; + } + + *s = sumf; + +#elif defined __AVX2__ + + const __m256i m4 = _mm256_set1_epi8(0xF); + const __m128i mzero = _mm_setzero_si128(); + const __m256i mone = _mm256_set1_epi8(1); + + __m256 acc = _mm256_setzero_ps(); + + float summs = 0.f; + + for (int i = 0; i < nb; ++i) { + + const uint8_t * restrict q5 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + +#if QK_K == 256 + const float d = y[i].d * GGML_V3_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_V3_FP16_TO_FP32(x[i].dmin); + + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; +#else + // TODO + const float d = 0, dmin = 0; +#endif + + const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0])); + + const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums); + const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1)); + const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s); + const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero); + summs += dmin * _mm_extract_epi32(hsum, 0); + + const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0); + const __m256i scales = MM256_SET_M128I(sc128, sc128); + + const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh); + __m256i hmask = mone; + + __m256i sumi = _mm256_setzero_si256(); + + int bit = 0; + + for (int j = 0; j < QK_K/64; ++j) { + + const __m256i scale_0 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0)); + const __m256i scale_1 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1)); + + const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); q5 += 32; + + const __m256i q5l_0 = _mm256_and_si256(q5bits, m4); + const __m256i q5h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4); + const __m256i q5_0 = _mm256_add_epi8(q5l_0, q5h_0); + hmask = _mm256_slli_epi16(hmask, 1); + + const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), m4); + const __m256i q5h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4); + const __m256i q5_1 = _mm256_add_epi8(q5l_1, q5h_1); + hmask = _mm256_slli_epi16(hmask, 1); + + const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + + __m256i p16_0 = 
_mm256_maddubs_epi16(q5_0, q8_0); + __m256i p16_1 = _mm256_maddubs_epi16(q5_1, q8_1); + + p16_0 = _mm256_madd_epi16(scale_0, p16_0); + p16_1 = _mm256_madd_epi16(scale_1, p16_1); + + sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1)); + + } + + __m256 vd = _mm256_set1_ps(d); + acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc); + + } + + *s = hsum_float_8(acc) + summs; + +#elif defined __AVX__ + + const __m128i m4 = _mm_set1_epi8(0xF); + const __m128i mzero = _mm_setzero_si128(); + const __m128i mone = _mm_set1_epi8(1); + const __m128i m2 = _mm_set1_epi8(2); + + __m256 acc = _mm256_setzero_ps(); + + float summs = 0.f; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_V3_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_V3_FP16_TO_FP32(x[i].dmin); + + const uint8_t * restrict q5 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]); + const __m128i scales = _mm_cvtepu8_epi16(utmps); + const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps)); + + const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]); + const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]); + const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1); + const __m128i prod = _mm_madd_epi16(mins, q8s); + const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero); + summs += dmin * _mm_extract_epi32(hsum, 0); + + const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].qh[0]); + const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].qh[16]); + __m128i hmask = mone; + + __m128i sumi_0 = _mm_setzero_si128(); + __m128i sumi_1 = _mm_setzero_si128(); + + int bit = 0; + + __m128i shuffle = _mm_set1_epi16(0x0100); + for (int j = 0; j < QK_K/64; ++j) { + + const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle); + shuffle = _mm_add_epi16(shuffle, m2); + const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle); + shuffle = _mm_add_epi16(shuffle, m2); + + const __m128i q5bits_0 = _mm_loadu_si128((const __m128i*)q5); q5 += 16; + const __m128i q5bits_1 = _mm_loadu_si128((const __m128i*)q5); q5 += 16; + + __m128i q5l_0 = _mm_and_si128(q5bits_0, m4); + __m128i q5l_1 = _mm_and_si128(q5bits_1, m4); + __m128i q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4); + __m128i q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4); + __m128i q5_0 = _mm_add_epi8(q5l_0, q5h_0); + __m128i q5_1 = _mm_add_epi8(q5l_1, q5h_1); + hmask = _mm_slli_epi16(hmask, 1); + + __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + __m128i p16_0 = _mm_maddubs_epi16(q5_0, q8_0); + __m128i p16_1 = _mm_maddubs_epi16(q5_1, q8_1); + p16_0 = _mm_madd_epi16(scale_0, p16_0); + p16_1 = _mm_madd_epi16(scale_0, p16_1); + + q5l_0 = _mm_and_si128(_mm_srli_epi16(q5bits_0, 4), m4); + q5l_1 = _mm_and_si128(_mm_srli_epi16(q5bits_1, 4), m4); + q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4); + q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4); + q5_0 = _mm_add_epi8(q5l_0, q5h_0); + q5_1 = _mm_add_epi8(q5l_1, q5h_1); + hmask = _mm_slli_epi16(hmask, 1); + + q8_0 = 
_mm_loadu_si128((const __m128i*)q8); q8 += 16; + q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + __m128i p16_2 = _mm_maddubs_epi16(q5_0, q8_0); + __m128i p16_3 = _mm_maddubs_epi16(q5_1, q8_1); + p16_2 = _mm_madd_epi16(scale_1, p16_2); + p16_3 = _mm_madd_epi16(scale_1, p16_3); + + sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2)); + sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3)); + + } + + __m256 vd = _mm256_set1_ps(d); + __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); + acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc); + + } + + *s = hsum_float_8(acc) + summs; + +#elif defined __riscv_v_intrinsic + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + float sumf = 0; + float sums = 0.0; + + size_t vl; + + for (int i = 0; i < nb; ++i) { + + vl = 8; + + const uint8_t * restrict q5 = x[i].qs; + const uint8_t * restrict hm = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + + const float d = GGML_V3_FP16_TO_FP32(x[i].d) * y[i].d; + const float dmin = GGML_V3_FP16_TO_FP32(x[i].dmin) * y[i].d; + + vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl); + vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl); + vint16mf2_t q8sums = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl); + + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + vuint8mf4_t mins8 = __riscv_vle8_v_u8mf4(mins, vl); + vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl)); + vint32m1_t prod = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl); + + vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl); + sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi); + + vl = 32; + int32_t aux32 = 0; + int is = 0; + + uint8_t m = 1; + vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); + vuint8m1_t vqh = __riscv_vle8_v_u8m1(hm, vl); + + for (int j = 0; j < QK_K/64; ++j) { + // load Q5 and Q8 + vuint8m1_t q5_x = __riscv_vle8_v_u8m1(q5, vl); + vint8m1_t q8_y1 = __riscv_vle8_v_i8m1(q8, vl); + vint8m1_t q8_y2 = __riscv_vle8_v_i8m1(q8+32, vl); + + // compute mask for addition + vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl)); + vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl); + vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl); + vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl); + m <<= 1; + + vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl)); + vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl); + vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl); + vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl); + m <<= 1; + + vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl); + vint16m2_t v1 = __riscv_vwmul_vv_i16m2(q5_m2, q8_y2, vl); + + vint32m4_t vs1 = __riscv_vwmul_vx_i32m4(v0, scales[is++], vl); + vint32m4_t vs2 = __riscv_vwmul_vx_i32m4(v1, scales[is++], vl); + + vint32m1_t vacc1 = __riscv_vredsum_vs_i32m4_i32m1(vs1, vzero, vl); + vint32m1_t vacc2 = __riscv_vredsum_vs_i32m4_i32m1(vs2, vzero, vl); + + aux32 += __riscv_vmv_x_s_i32m1_i32(vacc1) + __riscv_vmv_x_s_i32m1_i32(vacc2); + q5 += 32; q8 += 64; + + } + + vfloat32m1_t vaux = __riscv_vfmul_vf_f32m1(__riscv_vfmv_v_f_f32m1(aux32, 1), d, 1); + sums += 
__riscv_vfmv_f_s_f32m1_f32(vaux); + + } + + *s = sumf+sums; + +#else + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * restrict q4 = x[i].qs; + const uint8_t * restrict hm = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * restrict a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0); + a += 32; m <<= 1; + q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_V3_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_V3_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +#else + +static void ggml_v3_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + assert(n % QK_K == 0); + + const block_q5_K * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + +#ifdef __ARM_NEON + const uint8x16_t m4b = vdupq_n_u8(0xf); + const uint8x16_t mh = vdupq_n_u8(16); + const int32x4_t mzero = vdupq_n_s32(0); + + ggml_v3_int8x16x4_t q5bytes; + ggml_v3_uint8x16x4_t q5h; + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * (float)x[i].d; + const int8_t * sc = x[i].scales; + + const uint8_t * restrict q5 = x[i].qs; + const uint8_t * restrict qh = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + + const uint8x8_t qhbits = vld1_u8(qh); + + const ggml_v3_uint8x16x2_t q5bits = ggml_v3_vld1q_u8_x2(q5); + const ggml_v3_int8x16x4_t q8bytes = ggml_v3_vld1q_s8_x4(q8); + + const uint8x16_t htmp = vcombine_u8(qhbits, vshr_n_u8(qhbits, 1)); + q5h.val[0] = vbicq_u8(mh, vshlq_n_u8(htmp, 4)); + q5h.val[1] = vbicq_u8(mh, vshlq_n_u8(htmp, 2)); + q5h.val[2] = vbicq_u8(mh, htmp); + q5h.val[3] = vbicq_u8(mh, vshrq_n_u8(htmp, 2)); + + q5bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q5bits.val[0], m4b)), vreinterpretq_s8_u8(q5h.val[0])); + q5bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q5bits.val[1], m4b)), vreinterpretq_s8_u8(q5h.val[1])); + q5bytes.val[2] = 
vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[0], 4)), vreinterpretq_s8_u8(q5h.val[2])); + q5bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[1], 4)), vreinterpretq_s8_u8(q5h.val[3])); + + int32_t sumi1 = sc[0] * vaddvq_s32(ggml_v3_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0])); + int32_t sumi2 = sc[1] * vaddvq_s32(ggml_v3_vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1])); + int32_t sumi3 = sc[2] * vaddvq_s32(ggml_v3_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2])); + int32_t sumi4 = sc[3] * vaddvq_s32(ggml_v3_vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3])); + + sumf += d * (sumi1 + sumi2 + sumi3 + sumi4); + } + + *s = sumf; + +#elif defined __AVX2__ + + const __m256i m4 = _mm256_set1_epi8(0xF); + const __m256i mone = _mm256_set1_epi8(1); + + __m256 acc = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + + const uint8_t * restrict q5 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + const float d = y[i].d * GGML_V3_FP16_TO_FP32(x[i].d); + + const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); + + const __m256i scale_l = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0])); + const __m256i scale_h = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2])); + + int64_t aux64; + memcpy(&aux64, x[i].qh, 8); + const __m128i haux128 = _mm_set_epi64x(aux64 >> 1, aux64); + const __m256i haux256 = MM256_SET_M128I(_mm_srli_epi16(haux128, 2), haux128); + + const __m256i q5h_0 = _mm256_slli_epi16(_mm256_andnot_si256(haux256, mone), 4); + const __m256i q5h_1 = _mm256_slli_epi16(_mm256_andnot_si256(_mm256_srli_epi16(haux256, 4), mone), 4); + + const __m256i q5l_0 = _mm256_and_si256(q5bits, m4); + const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), m4); + + const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); + const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); + + const __m256i p16_0 = _mm256_madd_epi16(scale_l, _mm256_maddubs_epi16(q5l_0, q8_0)); + const __m256i p16_1 = _mm256_madd_epi16(scale_h, _mm256_maddubs_epi16(q5l_1, q8_1)); + const __m256i s16_0 = _mm256_madd_epi16(scale_l, _mm256_maddubs_epi16(q5h_0, q8_0)); + const __m256i s16_1 = _mm256_madd_epi16(scale_h, _mm256_maddubs_epi16(q5h_1, q8_1)); + + const __m256i dot = _mm256_sub_epi32(_mm256_add_epi32(p16_0, p16_1), _mm256_add_epi32(s16_0, s16_1)); + + acc = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(dot), acc); + + } + + *s = hsum_float_8(acc); + +#elif defined __AVX__ + + const __m128i m4 = _mm_set1_epi8(0xF); + const __m128i mone = _mm_set1_epi8(1); + + __m256 acc = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + + const uint8_t * restrict q5 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + const float d = y[i].d * GGML_V3_FP16_TO_FP32(x[i].d); + + const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); + + const __m128i scale_0 = _mm_set1_epi16(x[i].scales[0]); + const __m128i scale_1 = _mm_set1_epi16(x[i].scales[1]); + const __m128i scale_2 = _mm_set1_epi16(x[i].scales[2]); + const __m128i scale_3 = _mm_set1_epi16(x[i].scales[3]); + + int64_t aux64; + memcpy(&aux64, x[i].qh, 8); + const __m128i haux128_0 = _mm_set_epi64x(aux64 >> 1, aux64); + const __m128i haux128_1 = _mm_srli_epi16(haux128_0, 2); + + const __m128i q5h_0 = _mm_slli_epi16(_mm_andnot_si128(haux128_0, mone), 4); + const __m128i q5h_1 = _mm_slli_epi16(_mm_andnot_si128(haux128_1, mone), 4); + const __m128i q5h_2 = _mm_slli_epi16(_mm_andnot_si128(_mm_srli_epi16(haux128_0, 4), mone), 4); + const __m128i 
q5h_3 = _mm_slli_epi16(_mm_andnot_si128(_mm_srli_epi16(haux128_1, 4), mone), 4); + + const __m128i q5l_0 = _mm_and_si128(_mm256_extractf128_si256(q5bits, 0), m4); + const __m128i q5l_1 = _mm_and_si128(_mm256_extractf128_si256(q5bits, 1), m4); + const __m128i q5l_2 = _mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q5bits, 0), 4), m4); + const __m128i q5l_3 = _mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q5bits, 1), 4), m4); + + const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); + const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); + + const __m128i p16_0 = _mm_madd_epi16(scale_0, _mm_maddubs_epi16(q5l_0, _mm256_extractf128_si256(q8_0, 0))); + const __m128i p16_1 = _mm_madd_epi16(scale_1, _mm_maddubs_epi16(q5l_1, _mm256_extractf128_si256(q8_0, 1))); + const __m128i p16_2 = _mm_madd_epi16(scale_2, _mm_maddubs_epi16(q5l_2, _mm256_extractf128_si256(q8_1, 0))); + const __m128i p16_3 = _mm_madd_epi16(scale_3, _mm_maddubs_epi16(q5l_3, _mm256_extractf128_si256(q8_1, 1))); + const __m128i s16_0 = _mm_madd_epi16(scale_0, _mm_maddubs_epi16(q5h_0, _mm256_extractf128_si256(q8_0, 0))); + const __m128i s16_1 = _mm_madd_epi16(scale_1, _mm_maddubs_epi16(q5h_1, _mm256_extractf128_si256(q8_0, 1))); + const __m128i s16_2 = _mm_madd_epi16(scale_2, _mm_maddubs_epi16(q5h_2, _mm256_extractf128_si256(q8_1, 0))); + const __m128i s16_3 = _mm_madd_epi16(scale_3, _mm_maddubs_epi16(q5h_3, _mm256_extractf128_si256(q8_1, 1))); + + const __m128i dot_0 = _mm_sub_epi32(_mm_add_epi32(p16_0, p16_2), _mm_add_epi32(s16_0, s16_2)); + const __m128i dot_1 = _mm_sub_epi32(_mm_add_epi32(p16_1, p16_3), _mm_add_epi32(s16_1, s16_3)); + + acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(dot_1, dot_0))), acc); + + } + + *s = hsum_float_8(acc); + +#elif defined __riscv_v_intrinsic + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * (float)x[i].d; + const int8_t * sc = x[i].scales; + + const uint8_t * restrict q5 = x[i].qs; + const uint8_t * restrict qh = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + + vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); + + // load qh + vuint8mf4_t qh_x1 = __riscv_vle8_v_u8mf4(qh, 8); + vuint8mf2_t qh_x2 = __riscv_vlmul_ext_v_u8mf4_u8mf2(__riscv_vsrl_vx_u8mf4(qh_x1, 1, 8)); + + size_t vl = 16; + + // combine both qh_1 and qh_2 + vuint8mf2_t qh_x = __riscv_vslideup_vx_u8mf2(__riscv_vlmul_ext_v_u8mf4_u8mf2(qh_x1), qh_x2, vl/2, vl); + + vuint8mf2_t qh_h0 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x4, vl), vl), 16, vl); + vuint8mf2_t qh_h1 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x2, vl), vl), 16, vl); + vuint8mf2_t qh_h2 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(qh_x, vl), 16, vl); + vuint8mf2_t qh_h3 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x4, vl), vl), 16, vl); + + vint8mf2_t qh_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h0); + vint8mf2_t qh_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h1); + vint8mf2_t qh_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h2); + vint8mf2_t qh_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h3); + + // load q5 + vuint8mf2_t q5_x1 = __riscv_vle8_v_u8mf2(q5, vl); + vuint8mf2_t q5_x2 = __riscv_vle8_v_u8mf2(q5+16, vl); + + vint8mf2_t q5s_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q5_x1, 0xF, vl)); + vint8mf2_t q5s_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q5_x2, 0xF, vl)); + vint8mf2_t q5s_2 = 
__riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vsrl_vx_u8mf2(q5_x1, 0x4, vl)); + vint8mf2_t q5s_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vsrl_vx_u8mf2(q5_x2, 0x4, vl)); + + vint8mf2_t q5_0 = __riscv_vsub_vv_i8mf2(q5s_0, qh_0, vl); + vint8mf2_t q5_1 = __riscv_vsub_vv_i8mf2(q5s_1, qh_1, vl); + vint8mf2_t q5_2 = __riscv_vsub_vv_i8mf2(q5s_2, qh_2, vl); + vint8mf2_t q5_3 = __riscv_vsub_vv_i8mf2(q5s_3, qh_3, vl); + + // load Q8 and multiply it with Q5 + vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q5_0, __riscv_vle8_v_i8mf2(q8, vl), vl); + vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q5_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl); + vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q5_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl); + vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q5_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl); + + vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl); + vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl); + vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl); + vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl); + + int32_t sumi1 = sc[0] * __riscv_vmv_x_s_i32m1_i32(vs_0); + int32_t sumi2 = sc[1] * __riscv_vmv_x_s_i32m1_i32(vs_1); + int32_t sumi3 = sc[2] * __riscv_vmv_x_s_i32m1_i32(vs_2); + int32_t sumi4 = sc[3] * __riscv_vmv_x_s_i32m1_i32(vs_3); + + sumf += d * (sumi1 + sumi2 + sumi3 + sumi4); + + } + + *s = sumf; + +#else + + int8_t aux8[QK_K]; + int16_t aux16[16]; + float sums [8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * restrict q4 = x[i].qs; + const uint8_t * restrict hm = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + int8_t * restrict a = aux8; + for (int l = 0; l < 32; ++l) { + a[l+ 0] = q4[l] & 0xF; + a[l+32] = q4[l] >> 4; + } + for (int is = 0; is < 8; ++is) { + uint8_t m = 1 << is; + for (int l = 0; l < 8; ++l) a[8*is + l] -= (hm[l] & m ? 
0 : 16); + } + + const float d = y[i].d * GGML_V3_FP16_TO_FP32(x[i].d); + const int8_t * restrict sc = x[i].scales; + + for (int j = 0; j < QK_K/16; ++j) { + const float dl = d * sc[j]; + for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) sums[l] += dl * (aux16[l] + aux16[8+l]); + q8 += 16; a += 16; + } + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} +#endif + + +#if QK_K == 256 +static void ggml_v3_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + assert(n % QK_K == 0); + + const block_q6_K * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + +#ifdef __ARM_NEON + float sum = 0; + + const uint8x16_t m4b = vdupq_n_u8(0xF); + const int32x4_t vzero = vdupq_n_s32(0); + //const int8x16_t m32s = vdupq_n_s8(32); + + const uint8x16_t mone = vdupq_n_u8(3); + + ggml_v3_int8x16x4_t q6bytes; + ggml_v3_uint8x16x4_t q6h; + + for (int i = 0; i < nb; ++i) { + + const float d_all = GGML_V3_FP16_TO_FP32(x[i].d); + + const uint8_t * restrict q6 = x[i].ql; + const uint8_t * restrict qh = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + + const int8_t * restrict scale = x[i].scales; + + const ggml_v3_int16x8x2_t q8sums = ggml_v3_vld1q_s16_x2(y[i].bsums); + const int8x16_t scales = vld1q_s8(scale); + const ggml_v3_int16x8x2_t q6scales = {{vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}}; + + const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])), + vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))), + vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[1]), vget_low_s16 (q6scales.val[1])), + vmull_s16(vget_high_s16(q8sums.val[1]), vget_high_s16(q6scales.val[1])))); + int32_t isum_mins = vaddvq_s32(prod); + + int32_t isum = 0; + + for (int j = 0; j < QK_K/128; ++j) { + + ggml_v3_uint8x16x2_t qhbits = ggml_v3_vld1q_u8_x2(qh); qh += 32; + ggml_v3_uint8x16x4_t q6bits = ggml_v3_vld1q_u8_x4(q6); q6 += 64; + ggml_v3_int8x16x4_t q8bytes = ggml_v3_vld1q_s8_x4(q8); q8 += 64; + + q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4); + q6h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4); + uint8x16_t shifted = vshrq_n_u8(qhbits.val[0], 2); + q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4); + shifted = vshrq_n_u8(qhbits.val[1], 2); + q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4); + + //q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0])), m32s); + //q6bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1])), m32s); + //q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2])), m32s); + //q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3])), m32s); + q6bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0])); + q6bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1])); + q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2])); + q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3])); + + isum += vaddvq_s32(ggml_v3_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + + vaddvq_s32(ggml_v3_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] + + vaddvq_s32(ggml_v3_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + + 
vaddvq_s32(ggml_v3_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; + + scale += 4; + + q8bytes = ggml_v3_vld1q_s8_x4(q8); q8 += 64; + + shifted = vshrq_n_u8(qhbits.val[0], 4); + q6h.val[0] = vshlq_n_u8(vandq_u8(mone, shifted), 4); + shifted = vshrq_n_u8(qhbits.val[1], 4); + q6h.val[1] = vshlq_n_u8(vandq_u8(mone, shifted), 4); + shifted = vshrq_n_u8(qhbits.val[0], 6); + q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4); + shifted = vshrq_n_u8(qhbits.val[1], 6); + q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4); + + //q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[0])), m32s); + //q6bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[1])), m32s); + //q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2])), m32s); + //q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3])), m32s); + q6bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[0])); + q6bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[1])); + q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2])); + q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3])); + + isum += vaddvq_s32(ggml_v3_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + + vaddvq_s32(ggml_v3_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] + + vaddvq_s32(ggml_v3_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + + vaddvq_s32(ggml_v3_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; + scale += 4; + } + //sum += isum * d_all * y[i].d; + sum += d_all * y[i].d * (isum - 32 * isum_mins); + + } + *s = sum; + +#elif defined __AVX2__ + + const __m256i m4 = _mm256_set1_epi8(0xF); + const __m256i m2 = _mm256_set1_epi8(3); + const __m256i m32s = _mm256_set1_epi8(32); + + __m256 acc = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_V3_FP16_TO_FP32(x[i].d); + + const uint8_t * restrict q4 = x[i].ql; + const uint8_t * restrict qh = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + + const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); + + __m256i sumi = _mm256_setzero_si256(); + + int is = 0; + + for (int j = 0; j < QK_K/128; ++j) { + + const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0)); + const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1)); + const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2)); + const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3)); + is += 4; + + const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32; + const __m256i q4bits2 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32; + const __m256i q4bitsH = _mm256_loadu_si256((const __m256i*)qh); qh += 32; + + const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, m2), 4); + const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 2), m2), 4); + const __m256i q4h_2 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 4), m2), 4); + const __m256i q4h_3 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 6), m2), 4); + + const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0); + const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(q4bits2, m4), q4h_1); + const __m256i q4_2 = 
_mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_2); + const __m256i q4_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits2, 4), m4), q4h_3); + + const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + + __m256i q8s_0 = _mm256_maddubs_epi16(m32s, q8_0); + __m256i q8s_1 = _mm256_maddubs_epi16(m32s, q8_1); + __m256i q8s_2 = _mm256_maddubs_epi16(m32s, q8_2); + __m256i q8s_3 = _mm256_maddubs_epi16(m32s, q8_3); + + __m256i p16_0 = _mm256_maddubs_epi16(q4_0, q8_0); + __m256i p16_1 = _mm256_maddubs_epi16(q4_1, q8_1); + __m256i p16_2 = _mm256_maddubs_epi16(q4_2, q8_2); + __m256i p16_3 = _mm256_maddubs_epi16(q4_3, q8_3); + + p16_0 = _mm256_sub_epi16(p16_0, q8s_0); + p16_1 = _mm256_sub_epi16(p16_1, q8s_1); + p16_2 = _mm256_sub_epi16(p16_2, q8s_2); + p16_3 = _mm256_sub_epi16(p16_3, q8s_3); + + p16_0 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_0), p16_0); + p16_1 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_1), p16_1); + p16_2 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_2), p16_2); + p16_3 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_3), p16_3); + + sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1)); + sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_2, p16_3)); + + } + + acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc); + } + + *s = hsum_float_8(acc); + +#elif defined __AVX__ + + const __m128i m4 = _mm_set1_epi8(0xF); + const __m128i m3 = _mm_set1_epi8(3); + const __m128i m32s = _mm_set1_epi8(32); + const __m128i m2 = _mm_set1_epi8(2); + + __m256 acc = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_V3_FP16_TO_FP32(x[i].d); + + const uint8_t * restrict q4 = x[i].ql; + const uint8_t * restrict qh = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + + const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); + + __m128i sumi_0 = _mm_setzero_si128(); + __m128i sumi_1 = _mm_setzero_si128(); + + __m128i shuffle = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000); + for (int j = 0; j < QK_K/128; ++j) { + + const __m128i q4bitsH_0 = _mm_loadu_si128((const __m128i*)qh); qh += 16; + const __m128i q4bitsH_1 = _mm_loadu_si128((const __m128i*)qh); qh += 16; + + const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4); + const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4); + const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 2), m3), 4); + const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 2), m3), 4); + const __m128i q4h_4 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 4), m3), 4); + const __m128i q4h_5 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 4), m3), 4); + const __m128i q4h_6 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 6), m3), 4); + const __m128i q4h_7 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 6), m3), 4); + + const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; + const __m128i q4bits1_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; + const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; + const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; + + const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m4), q4h_0); + const __m128i q4_1 = 
_mm_or_si128(_mm_and_si128(q4bits1_1, m4), q4h_1); + const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m4), q4h_2); + const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m4), q4h_3); + const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m4), q4h_4); + const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m4), q4h_5); + const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m4), q4h_6); + const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m4), q4h_7); + + const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + + __m128i q8s_0 = _mm_maddubs_epi16(m32s, q8_0); + __m128i q8s_1 = _mm_maddubs_epi16(m32s, q8_1); + __m128i q8s_2 = _mm_maddubs_epi16(m32s, q8_2); + __m128i q8s_3 = _mm_maddubs_epi16(m32s, q8_3); + __m128i q8s_4 = _mm_maddubs_epi16(m32s, q8_4); + __m128i q8s_5 = _mm_maddubs_epi16(m32s, q8_5); + __m128i q8s_6 = _mm_maddubs_epi16(m32s, q8_6); + __m128i q8s_7 = _mm_maddubs_epi16(m32s, q8_7); + + __m128i p16_0 = _mm_maddubs_epi16(q4_0, q8_0); + __m128i p16_1 = _mm_maddubs_epi16(q4_1, q8_1); + __m128i p16_2 = _mm_maddubs_epi16(q4_2, q8_2); + __m128i p16_3 = _mm_maddubs_epi16(q4_3, q8_3); + __m128i p16_4 = _mm_maddubs_epi16(q4_4, q8_4); + __m128i p16_5 = _mm_maddubs_epi16(q4_5, q8_5); + __m128i p16_6 = _mm_maddubs_epi16(q4_6, q8_6); + __m128i p16_7 = _mm_maddubs_epi16(q4_7, q8_7); + + p16_0 = _mm_sub_epi16(p16_0, q8s_0); + p16_1 = _mm_sub_epi16(p16_1, q8s_1); + p16_2 = _mm_sub_epi16(p16_2, q8s_2); + p16_3 = _mm_sub_epi16(p16_3, q8s_3); + p16_4 = _mm_sub_epi16(p16_4, q8s_4); + p16_5 = _mm_sub_epi16(p16_5, q8s_5); + p16_6 = _mm_sub_epi16(p16_6, q8s_6); + p16_7 = _mm_sub_epi16(p16_7, q8s_7); + + const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle); + shuffle = _mm_add_epi8(shuffle, m2); + const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle); + shuffle = _mm_add_epi8(shuffle, m2); + const __m128i scale_2 = _mm_shuffle_epi8(scales, shuffle); + shuffle = _mm_add_epi8(shuffle, m2); + const __m128i scale_3 = _mm_shuffle_epi8(scales, shuffle); + shuffle = _mm_add_epi8(shuffle, m2); + + p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0); + p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_0, scale_0)), p16_1); + p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2); + p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_1, scale_1)), p16_3); + p16_4 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_2), p16_4); + p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_2, scale_2)), p16_5); + p16_6 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_3), p16_6); + p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_3, scale_3)), p16_7); + + sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2)); + sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3)); + sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_4, p16_6)); + sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_5, p16_7)); + + } + + __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); 
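+            // sumi now holds the two merged 128-bit partial sums; convert to float and fold in the per-block scale d with 256-bit math (AVX counterpart of the FMA used in the AVX2 path above)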
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc); + } + + *s = hsum_float_8(acc); + +#elif defined __riscv_v_intrinsic + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + + const float d = GGML_V3_FP16_TO_FP32(x[i].d) * y[i].d; + + const uint8_t * restrict q6 = x[i].ql; + const uint8_t * restrict qh = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + + const int8_t * restrict scale = x[i].scales; + + size_t vl; + + vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); + + int sum_t = 0; + int is = 0; + + for (int j = 0; j < QK_K/128; ++j) { + + vl = 32; + + // load qh + vuint8m1_t qh_x = __riscv_vle8_v_u8m1(qh, vl); + + // load Q6 + vuint8m1_t q6_0 = __riscv_vle8_v_u8m1(q6, vl); + vuint8m1_t q6_1 = __riscv_vle8_v_u8m1(q6+32, vl); + + vuint8m1_t q6a_0 = __riscv_vand_vx_u8m1(q6_0, 0x0F, vl); + vuint8m1_t q6a_1 = __riscv_vand_vx_u8m1(q6_1, 0x0F, vl); + vuint8m1_t q6s_0 = __riscv_vsrl_vx_u8m1(q6_0, 0x04, vl); + vuint8m1_t q6s_1 = __riscv_vsrl_vx_u8m1(q6_1, 0x04, vl); + + vuint8m1_t qh_0 = __riscv_vand_vx_u8m1(qh_x, 0x03, vl); + vuint8m1_t qh_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x2, vl), 0x03 , vl); + vuint8m1_t qh_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x4, vl), 0x03 , vl); + vuint8m1_t qh_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x6, vl), 0x03 , vl); + + vuint8m1_t qhi_0 = __riscv_vor_vv_u8m1(q6a_0, __riscv_vsll_vx_u8m1(qh_0, 0x04, vl), vl); + vuint8m1_t qhi_1 = __riscv_vor_vv_u8m1(q6a_1, __riscv_vsll_vx_u8m1(qh_1, 0x04, vl), vl); + vuint8m1_t qhi_2 = __riscv_vor_vv_u8m1(q6s_0, __riscv_vsll_vx_u8m1(qh_2, 0x04, vl), vl); + vuint8m1_t qhi_3 = __riscv_vor_vv_u8m1(q6s_1, __riscv_vsll_vx_u8m1(qh_3, 0x04, vl), vl); + + vint8m1_t a_0 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_0), 32, vl); + vint8m1_t a_1 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_1), 32, vl); + vint8m1_t a_2 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_2), 32, vl); + vint8m1_t a_3 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_3), 32, vl); + + // load Q8 and take product + vint16m2_t va_q_0 = __riscv_vwmul_vv_i16m2(a_0, __riscv_vle8_v_i8m1(q8, vl), vl); + vint16m2_t va_q_1 = __riscv_vwmul_vv_i16m2(a_1, __riscv_vle8_v_i8m1(q8+32, vl), vl); + vint16m2_t va_q_2 = __riscv_vwmul_vv_i16m2(a_2, __riscv_vle8_v_i8m1(q8+64, vl), vl); + vint16m2_t va_q_3 = __riscv_vwmul_vv_i16m2(a_3, __riscv_vle8_v_i8m1(q8+96, vl), vl); + + vl = 16; + + vint32m2_t vaux_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 0), scale[is+0], vl); + vint32m2_t vaux_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 1), scale[is+1], vl); + vint32m2_t vaux_2 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 0), scale[is+2], vl); + vint32m2_t vaux_3 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 1), scale[is+3], vl); + vint32m2_t vaux_4 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 0), scale[is+4], vl); + vint32m2_t vaux_5 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 1), scale[is+5], vl); + vint32m2_t vaux_6 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 0), scale[is+6], vl); + vint32m2_t vaux_7 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 1), scale[is+7], vl); + + vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_0, vaux_1, vl), vzero, vl); + vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_2, vaux_3, vl), isum0, vl); + vint32m1_t isum2 = 
__riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_4, vaux_5, vl), isum1, vl); + vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_6, vaux_7, vl), isum2, vl); + + sum_t += __riscv_vmv_x_s_i32m1_i32(isum3); + + q6 += 64; qh += 32; q8 += 128; is=8; + + } + + sumf += d * sum_t; + + } + + *s = sumf; + +#else + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * restrict q4 = x[i].ql; + const uint8_t * restrict qh = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * restrict a = aux8; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + } + a += 128; + q4 += 64; + qh += 32; + } + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/16; ++j) { + int scale = x[i].scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_V3_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +#else + +static void ggml_v3_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + assert(n % QK_K == 0); + + const block_q6_K * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + +#ifdef __ARM_NEON + float sum = 0; + + const uint8x16_t m4b = vdupq_n_u8(0xF); + const int8x16_t m32s = vdupq_n_s8(32); + const int32x4_t vzero = vdupq_n_s32(0); + + const uint8x16_t mone = vdupq_n_u8(3); + + ggml_v3_int8x16x4_t q6bytes; + ggml_v3_uint8x16x4_t q6h; + + for (int i = 0; i < nb; ++i) { + + const float d_all = (float)x[i].d; + + const uint8_t * restrict q6 = x[i].ql; + const uint8_t * restrict qh = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + + const int8_t * restrict scale = x[i].scales; + + int32_t isum = 0; + + uint8x16_t qhbits = vld1q_u8(qh); + ggml_v3_uint8x16x2_t q6bits = ggml_v3_vld1q_u8_x2(q6); + ggml_v3_int8x16x4_t q8bytes = ggml_v3_vld1q_s8_x4(q8); + + q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits), 4); + uint8x16_t shifted = vshrq_n_u8(qhbits, 2); + q6h.val[1] = vshlq_n_u8(vandq_u8(mone, shifted), 4); + shifted = vshrq_n_u8(qhbits, 4); + q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4); + shifted = vshrq_n_u8(qhbits, 6); + q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4); + + q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0])), m32s); + q6bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1])), m32s); + q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[2])), m32s); + q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[3])), m32s); + + isum += vaddvq_s32(ggml_v3_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + + vaddvq_s32(ggml_v3_vdotq_s32(vzero, q6bytes.val[1], 
q8bytes.val[1])) * scale[1] + + vaddvq_s32(ggml_v3_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + + vaddvq_s32(ggml_v3_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; + + sum += isum * d_all * y[i].d; + + } + *s = sum; + +#elif defined __AVX2__ + + const __m256i m4 = _mm256_set1_epi8(0xF); + const __m256i m2 = _mm256_set1_epi8(3); + const __m256i m32s = _mm256_set1_epi8(32); + + __m256 acc = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_V3_FP16_TO_FP32(x[i].d); + + const uint8_t * restrict q4 = x[i].ql; + const uint8_t * restrict qh = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + + const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]); + const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]); + const __m64 scales_3 = _mm_set1_pi8(x[i].scales[2]); + const __m64 scales_4 = _mm_set1_pi8(x[i].scales[3]); + + __m256i sumi = _mm256_setzero_si256(); + + const __m128i scale_0 = _mm_set_epi64(scales_2, scales_1); + const __m128i scale_1 = _mm_set_epi64(scales_4, scales_3); + + const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); + const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh); + + const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4); + const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4); + + const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0); + const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_1); + + const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); + const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); + + __m256i q8s_0 = _mm256_maddubs_epi16(m32s, q8_0); + __m256i q8s_1 = _mm256_maddubs_epi16(m32s, q8_1); + + __m256i p16_0 = _mm256_maddubs_epi16(q4_0, q8_0); + __m256i p16_1 = _mm256_maddubs_epi16(q4_1, q8_1); + + p16_0 = _mm256_sub_epi16(p16_0, q8s_0); + p16_1 = _mm256_sub_epi16(p16_1, q8s_1); + + p16_0 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_0), p16_0); + p16_1 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_1), p16_1); + + sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1)); + + acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc); + } + + *s = hsum_float_8(acc); + +#elif defined __AVX__ + + const __m128i m4 = _mm_set1_epi8(0xF); + const __m128i m2 = _mm_set1_epi8(3); + const __m128i m32s = _mm_set1_epi8(32); + + __m256 acc = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_V3_FP16_TO_FP32(x[i].d); + + const uint8_t * restrict q4 = x[i].ql; + const uint8_t * restrict qh = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + + const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]); + const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]); + const __m64 scales_3 = _mm_set1_pi8(x[i].scales[2]); + const __m64 scales_4 = _mm_set1_pi8(x[i].scales[3]); + + __m128i sumi_0 = _mm_setzero_si128(); + __m128i sumi_1 = _mm_setzero_si128(); + + const __m128i scale_0 = _mm_set_epi64(scales_2, scales_1); + const __m128i scale_1 = _mm_set_epi64(scales_4, scales_3); + + const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); + const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh); + + const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH, m2), 4); + const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 2), m2), 4); + const __m128i q4h_2 = 
_mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 4), m2), 4); + const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 6), m2), 4); + + const __m128i q4_0 = _mm_or_si128(_mm_and_si128(_mm256_extractf128_si256(q4bits1, 0), m4), q4h_0); + const __m128i q4_1 = _mm_or_si128(_mm_and_si128(_mm256_extractf128_si256(q4bits1, 1), m4), q4h_1); + const __m128i q4_2 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q4bits1, 0), 4), m4), q4h_2); + const __m128i q4_3 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q4bits1, 1), 4), m4), q4h_3); + + const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); + const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); + + __m128i q8s_0 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_0, 0)); + __m128i q8s_1 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_0, 1)); + __m128i q8s_2 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_1, 0)); + __m128i q8s_3 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_1, 1)); + + __m128i p16_0 = _mm_maddubs_epi16(q4_0, _mm256_extractf128_si256(q8_0, 0)); + __m128i p16_1 = _mm_maddubs_epi16(q4_1, _mm256_extractf128_si256(q8_0, 1)); + __m128i p16_2 = _mm_maddubs_epi16(q4_2, _mm256_extractf128_si256(q8_1, 0)); + __m128i p16_3 = _mm_maddubs_epi16(q4_3, _mm256_extractf128_si256(q8_1, 1)); + + p16_0 = _mm_sub_epi16(p16_0, q8s_0); + p16_1 = _mm_sub_epi16(p16_1, q8s_1); + p16_2 = _mm_sub_epi16(p16_2, q8s_2); + p16_3 = _mm_sub_epi16(p16_3, q8s_3); + + p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0); + p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_0, scale_0)), p16_1); + p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2); + p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_1, scale_1)), p16_3); + + sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2)); + sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3)); + + acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi_1, sumi_0))), acc); + } + + *s = hsum_float_8(acc); + +#elif defined __riscv_v_intrinsic + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + const float d_all = (float)x[i].d; + + const uint8_t * restrict q6 = x[i].ql; + const uint8_t * restrict qh = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + + const int8_t * restrict scale = x[i].scales; + + int32_t isum = 0; + + size_t vl = 16; + + vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); + + // load Q6 + vuint8mf2_t q6_0 = __riscv_vle8_v_u8mf2(q6, vl); + vuint8mf2_t q6_1 = __riscv_vle8_v_u8mf2(q6+16, vl); + + // load qh + vuint8mf2_t qh_x = __riscv_vle8_v_u8mf2(qh, vl); + + vuint8mf2_t qh0 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl); + qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl); + vuint8mf2_t qh1 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl); + qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl); + vuint8mf2_t qh2 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl); + qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl); + vuint8mf2_t qh3 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl); + + vuint8mf2_t q6h_0 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q6_0, 0xF, vl), qh0, vl); + vuint8mf2_t q6h_1 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q6_1, 0xF, vl), qh1, vl); + vuint8mf2_t q6h_2 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q6_0, 0x4, vl), qh2, vl); + vuint8mf2_t q6h_3 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q6_1, 0x4, vl), 
qh3, vl); + + vint8mf2_t q6v_0 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_0), 32, vl); + vint8mf2_t q6v_1 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_1), 32, vl); + vint8mf2_t q6v_2 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_2), 32, vl); + vint8mf2_t q6v_3 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_3), 32, vl); + + // load Q8 and take product + vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q6v_0, __riscv_vle8_v_i8mf2(q8, vl), vl); + vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q6v_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl); + vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q6v_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl); + vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q6v_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl); + + vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl); + vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl); + vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl); + vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl); + + isum += __riscv_vmv_x_s_i32m1_i32(vs_0) * scale[0]; + isum += __riscv_vmv_x_s_i32m1_i32(vs_1) * scale[1]; + isum += __riscv_vmv_x_s_i32m1_i32(vs_2) * scale[2]; + isum += __riscv_vmv_x_s_i32m1_i32(vs_3) * scale[3]; + + sumf += isum * d_all * y[i].d; + + } + + *s = sumf; + +#else + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * restrict q4 = x[i].ql; + const uint8_t * restrict qh = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * restrict a = aux8; + for (int l = 0; l < 16; ++l) { + a[l+ 0] = (int8_t)((q4[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + a[l+16] = (int8_t)((q4[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + a[l+32] = (int8_t)((q4[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + a[l+48] = (int8_t)((q4[l+16] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + } + int is = 0; + for (int j = 0; j < QK_K/16; ++j) { + int scale = x[i].scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_V3_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +#endif + +static const int8_t keven_signs_q2xs[1024] = { + 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, + 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1, + 1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, -1, + 1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, 1, + 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1, + 1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, 1, + 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, 1, + 1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1, + 1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 
1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1, + 1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, 1, + 1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1, + 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1, + 1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, 1, + 1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, -1, + 1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, -1, + 1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 1, + 1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1, + 1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1, + 1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, 1, + 1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1, + 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, 1, + 1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, -1, + 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, -1, + 1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1, + 1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, 1, + 1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, -1, + 1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, -1, + 1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1, + 1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, -1, + 1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1, + 1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1, + 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, +}; + +static void ggml_v3_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + assert(n % QK_K == 0); + + const block_iq2_xxs * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + +#if defined(__ARM_NEON) + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint32_t aux32[4]; + const uint8_t * aux8 = (const uint8_t *)aux32; + + ggml_v3_int8x16x4_t q2u; + ggml_v3_int8x16x4_t q2s; + ggml_v3_int8x16x4_t q8b; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const float d = GGML_V3_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + float sumf1 = 0, sumf2 = 0; + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + q8b = ggml_v3_vld1q_s8_x4(q8); q8 += 64; + 
memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8; + q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 0])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 1]))); + q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 2])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 3]))); + q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 8])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 9]))); + q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[10])), vld1_s8((const void *)(iq2xxs_grid + aux8[11]))); + q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 7) & 127)))); + q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127)))); + q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[3] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[3] >> 7) & 127)))); + q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[3] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[3] >> 21) & 127)))); + q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]); + q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]); + q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]); + q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]); + const int32x4_t p1 = ggml_v3_vdotq_s32(ggml_v3_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]), q2u.val[1], q8b.val[1]); + const int32x4_t p2 = ggml_v3_vdotq_s32(ggml_v3_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]), q2u.val[3], q8b.val[3]); + sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[1] >> 28)); + sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[3] >> 28)); + } + sumf += d*(sumf1 + sumf2); + } + *s = 0.25f * sumf; + +#elif defined(__AVX2__) + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint32_t aux32[4]; + const uint8_t * aux8 = (const uint8_t *)aux32; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_V3_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + __m256i sumi1 = _mm256_setzero_si256(); + __m256i sumi2 = _mm256_setzero_si256(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8; + const __m256i q2_1 = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]); + const __m256i q2_2 = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]); + const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127], + signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); + const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127], + signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]); + const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1); + const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2); + const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); + const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); + const uint16_t ls1 = aux32[1] >> 28; + const uint16_t ls2 = aux32[3] >> 28; + const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1)); + const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1)); + sumi1 = 
_mm256_add_epi32(sumi1, p1); + sumi2 = _mm256_add_epi32(sumi2, p2); + } + + accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); + + } + + *s = 0.125f * hsum_float_8(accumf); + +#else + + uint32_t aux32[2]; + const uint8_t * aux8 = (const uint8_t *)aux32; + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_V3_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + memcpy(aux32, q2, 2*sizeof(uint32_t)); + q2 += 4; + const uint32_t ls = 2*(aux32[1] >> 28) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); + const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += sumi * ls; + } + sumf += d * bsum; + } + *s = 0.125f * sumf; +#endif +} + +static void ggml_v3_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + assert(n % QK_K == 0); + + const block_iq2_xs * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + +#if defined(__ARM_NEON) + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + int8x16x4_t q2u; + int8x16x4_t q2s; + int8x16x4_t q8b; + + int32x4x4_t scales32; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const float d = GGML_V3_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + const uint8x8_t scales8 = vld1_u8(x[i].scales); + const uint8x8_t scales_l = vand_u8(scales8, vdup_n_u8(0xf)); + const uint8x8_t scales_h = vshr_n_u8(scales8, 4); + uint8x16_t scales = vcombine_u8(vzip1_u8(scales_l, scales_h), vzip2_u8(scales_l, scales_h)); + scales = vaddq_u8(vshlq_n_u8(scales, 1), vdupq_n_u8(1)); + const uint16x8_t scales1 = vmovl_u8(vget_low_u8(scales)); + const uint16x8_t scales2 = vmovl_u8(vget_high_u8(scales)); + scales32.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales1))); + scales32.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales1))); + scales32.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales2))); + scales32.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales2))); + int32x4_t sumi = vdupq_n_s32(0); + for (int ib64 = 0; ib64 < QK_K/64; ++ib64) { + q8b = vld1q_s8_x4(q8); q8 += 64; + q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[0] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[1] & 511)))); + q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[2] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[3] & 511)))); + q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[4] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[5] & 511)))); + q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[6] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[7] & 511)))); + q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[0] >> 9))), vld1_s8((const void *)(signs64 + (q2[1] >> 9)))); + q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[2] >> 9))), vld1_s8((const void *)(signs64 + (q2[3] >> 9)))); + q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[4] >> 9))), vld1_s8((const void *)(signs64 + (q2[5] >> 9)))); + q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[6] >> 9))), vld1_s8((const 
void *)(signs64 + (q2[7] >> 9)))); + q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]); + q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]); + q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]); + q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]); + const int32x4_t p1 = ggml_v3_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]); + const int32x4_t p2 = ggml_v3_vdotq_s32(vdupq_n_s32(0), q2u.val[1], q8b.val[1]); + const int32x4_t p3 = ggml_v3_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]); + const int32x4_t p4 = ggml_v3_vdotq_s32(vdupq_n_s32(0), q2u.val[3], q8b.val[3]); + const int32x4_t p = vpaddq_s32(vpaddq_s32(p1, p2), vpaddq_s32(p3, p4)); + sumi = vmlaq_s32(sumi, p, scales32.val[ib64]); + q2 += 8; + } + sumf += d*vaddvq_s32(sumi); + } + *s = 0.125f * sumf; + +#elif defined(__AVX2__) + + const __m128i m4 = _mm_set1_epi8(0xf); + const __m128i m1 = _mm_set1_epi8(1); + const __m128i m511 = _mm_set1_epi16(511); + const __m128i m127 = _mm_set1_epi16(127); + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint64_t aux64; + + // somewhat hacky, but gives a significant boost in performance + __m128i aux_gindex, aux_sindex; + const uint16_t * gindex = (const uint16_t *)&aux_gindex; + const uint16_t * sindex = (const uint16_t *)&aux_sindex; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_V3_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + memcpy(&aux64, x[i].scales, 8); + __m128i stmp = _mm_set1_epi64x(aux64); + stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4)); + const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1); + + __m256i sumi1 = _mm256_setzero_si256(); + __m256i sumi2 = _mm256_setzero_si256(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m128i q2_data = _mm_loadu_si128((const __m128i*)q2); q2 += 8; + aux_gindex = _mm_and_si128(q2_data, m511); + aux_sindex = _mm_and_si128(_mm_srli_epi16(q2_data, 9), m127); + const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]], iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]); + const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]], iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]); + const __m256i s2_1 = _mm256_set_epi64x(signs64[sindex[3]], signs64[sindex[2]], signs64[sindex[1]], signs64[sindex[0]]); + const __m256i s2_2 = _mm256_set_epi64x(signs64[sindex[7]], signs64[sindex[6]], signs64[sindex[5]], signs64[sindex[4]]); + const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1); + const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2); + const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); + const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); + + const __m256i sc1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0))); + const __m256i sc2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1))); + + sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot1, sc1)); + sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot2, sc2)); + } + + accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); + + } + + *s = 0.125f * hsum_float_8(accumf); + +#else + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_V3_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * restrict q2 = 
x[i].qs; + const uint8_t * restrict sc = x[i].scales; + const int8_t * restrict q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1; + const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1; + int32_t sumi = 0; + for (int l = 0; l < 2; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += sumi * ls1; + sumi = 0; + for (int l = 2; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += sumi * ls2; + q2 += 4; + } + sumf += d * bsum; + } + *s = 0.125f * sumf; +#endif +} diff --git a/otherarch/ggml_v3.h b/otherarch/ggml_v3.h new file mode 100644 index 000000000..e8748e029 --- /dev/null +++ b/otherarch/ggml_v3.h @@ -0,0 +1,2261 @@ +#pragma once + +// +// GGML Tensor Library +// +// This documentation is still a work in progress. +// If you wish some specific topics to be covered, feel free to drop a comment: +// +// https://github.com/ggerganov/whisper.cpp/issues/40 +// +// ## Overview +// +// This library implements: +// +// - a set of tensor operations +// - automatic differentiation +// - basic optimization algorithms +// +// The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes, +// but is not limited to, the following: +// +// - linear regression +// - support vector machines +// - neural networks +// +// The library allows the user to define a certain function using the available tensor operations. This function +// definition is represented internally via a computation graph. Each tensor operation in the function definition +// corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the +// function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized +// using one of the available optimization algorithms. +// +// For example, here we define the function: f(x) = a*x^2 + b +// +// { +// struct ggml_v3_init_params params = { +// .mem_size = 16*1024*1024, +// .mem_buffer = NULL, +// }; +// +// // memory allocation happens here +// struct ggml_v3_context * ctx = ggml_v3_init(params); +// +// struct ggml_v3_tensor * x = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, 1); +// +// ggml_v3_set_param(ctx, x); // x is an input variable +// +// struct ggml_v3_tensor * a = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, 1); +// struct ggml_v3_tensor * b = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, 1); +// struct ggml_v3_tensor * x2 = ggml_v3_mul(ctx, x, x); +// struct ggml_v3_tensor * f = ggml_v3_add(ctx, ggml_v3_mul(ctx, a, x2), b); +// +// ... +// } +// +// Notice that the function definition above does not involve any actual computation. The computation is performed only +// when the user explicitly requests it. For example, to compute the function's value at x = 2.0: +// +// { +// ... 
+// +// struct ggml_v3_cgraph * gf = ggml_v3_new_graph(ctx); +// ggml_v3_build_forward_expand(gf, f); +// +// // set the input variable and parameter values +// ggml_v3_set_f32(x, 2.0f); +// ggml_v3_set_f32(a, 3.0f); +// ggml_v3_set_f32(b, 4.0f); +// +// ggml_v3_graph_compute_with_ctx(ctx, &gf, n_threads); +// +// printf("f = %f\n", ggml_v3_get_f32_1d(f, 0)); +// +// ... +// } +// +// The actual computation is performed in the ggml_v3_graph_compute() function. +// +// The ggml_v3_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the +// ggml_v3_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know +// in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory +// and after defining the computation graph, call the ggml_v3_used_mem() function to find out how much memory was +// actually needed. +// +// The ggml_v3_set_param() function marks a tensor as an input variable. This is used by the automatic +// differentiation and optimization algorithms. +// +// The described approach allows to define the function graph once and then compute its forward or backward graphs +// multiple times. All computations will use the same memory buffer allocated in the ggml_v3_init() function. This way +// the user can avoid the memory allocation overhead at runtime. +// +// The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class +// citizens, but in theory the library can be extended to support FP8 and integer data types. +// +// Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary +// and binary operations. Most of the available operations fall into one of these two categories. With time, it became +// clear that the library needs to support more complex operations. The way to support these operations is not clear +// yet, but a few examples are demonstrated in the following operations: +// +// - ggml_v3_permute() +// - ggml_v3_conv_1d_1s() +// - ggml_v3_conv_1d_2s() +// +// For each tensor operator, the library implements a forward and backward computation function. The forward function +// computes the output tensor value given the input tensor values. The backward function computes the adjoint of the +// input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a +// calculus class, or watch the following video: +// +// What is Automatic Differentiation? +// https://www.youtube.com/watch?v=wG_nF1awSSY +// +// +// ## Tensor data (struct ggml_v3_tensor) +// +// The tensors are stored in memory via the ggml_v3_tensor struct. The structure provides information about the size of +// the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains +// pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example: +// +// { +// struct ggml_v3_tensor * c = ggml_v3_add(ctx, a, b); +// +// assert(c->src[0] == a); +// assert(c->src[1] == b); +// } +// +// The multi-dimensional tensors are stored in row-major order. The ggml_v3_tensor struct contains fields for the +// number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows +// to store tensors that are not contiguous in memory, which is useful for operations such as transposition and +// permutation. 
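To make the worked example in the comment above concrete, here is a minimal, self-contained C sketch (an editorial illustration, not part of this diff) that evaluates f(x) = a*x^2 + b through the v3 API declared in this header; the 16 MB pool, the thread count, and the include path are arbitrary example choices.

// Sketch: define and evaluate f(x) = a*x^2 + b with the ggml_v3 API.
#include <stdio.h>
#include "otherarch/ggml_v3.h"   // path of the header added by this diff

int main(void) {
    struct ggml_v3_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,  // fixed pool; all tensors live inside it
        /*.mem_buffer =*/ NULL,          // let ggml_v3_init() allocate the pool
        /*.no_alloc   =*/ false,
    };
    struct ggml_v3_context * ctx = ggml_v3_init(params);

    struct ggml_v3_tensor * x = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, 1);
    ggml_v3_set_param(ctx, x); // mark x as an input variable

    struct ggml_v3_tensor * a  = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, 1);
    struct ggml_v3_tensor * b  = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, 1);
    struct ggml_v3_tensor * x2 = ggml_v3_mul(ctx, x, x);
    struct ggml_v3_tensor * f  = ggml_v3_add(ctx, ggml_v3_mul(ctx, a, x2), b);

    // only the graph has been defined so far - nothing is computed yet
    struct ggml_v3_cgraph * gf = ggml_v3_new_graph(ctx);
    ggml_v3_build_forward_expand(gf, f);

    ggml_v3_set_f32(x, 2.0f);
    ggml_v3_set_f32(a, 3.0f);
    ggml_v3_set_f32(b, 4.0f);

    ggml_v3_graph_compute_with_ctx(ctx, gf, /*n_threads=*/4);

    printf("f = %f\n", ggml_v3_get_f32_1d(f, 0)); // 3*2^2 + 4 = 16
    ggml_v3_free(ctx);
    return 0;
}

Since ggml_v3_new_graph() returns a pointer, gf is passed to ggml_v3_graph_compute_with_ctx() directly.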
All tensor operations have to take the stride into account and not assume that the tensor is +// contiguous in memory. +// +// The data of the tensor is accessed via the "data" pointer. For example: +// +// { +// const int nx = 2; +// const int ny = 3; +// +// struct ggml_v3_tensor * a = ggml_v3_new_tensor_2d(ctx, GGML_V3_TYPE_F32, nx, ny); +// +// for (int y = 0; y < ny; y++) { +// for (int x = 0; x < nx; x++) { +// *(float *) ((char *) a->data + y*a->nb[1] + x*a->nb[0]) = x + y; +// } +// } +// +// ... +// } +// +// Alternatively, there are helper functions, such as ggml_v3_get_f32_1d() and ggml_v3_set_f32_1d() that can be used. +// +// ## The matrix multiplication operator (ggml_v3_mul_mat) +// +// TODO +// +// +// ## Multi-threading +// +// TODO +// +// +// ## Overview of ggml.c +// +// TODO +// +// +// ## SIMD optimizations +// +// TODO +// +// +// ## Debugging ggml +// +// TODO +// +// + +#ifdef GGML_V3_SHARED +# if defined(_WIN32) && !defined(__MINGW32__) +# ifdef GGML_V3_BUILD +# define GGML_V3_API __declspec(dllexport) +# else +# define GGML_V3_API __declspec(dllimport) +# endif +# else +# define GGML_V3_API __attribute__ ((visibility ("default"))) +# endif +#else +# define GGML_V3_API +#endif + +// TODO: support for clang +#ifdef __GNUC__ +# define GGML_V3_DEPRECATED(func, hint) func __attribute__((deprecated(hint))) +#elif defined(_MSC_VER) +# define GGML_V3_DEPRECATED(func, hint) __declspec(deprecated(hint)) func +#else +# define GGML_V3_DEPRECATED(func, hint) func +#endif + +#ifndef __GNUC__ +# define GGML_V3_ATTRIBUTE_FORMAT(...) +#elif defined(__MINGW32__) +# define GGML_V3_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +#else +# define GGML_V3_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) +#endif + +#include <stdint.h> +#include <stddef.h> +#include <stdbool.h> + +#define GGML_V3_FILE_MAGIC 0x67676d6c // "ggml" +#define GGML_V3_FILE_VERSION 1 + +#define GGML_V3_QNT_VERSION 2 // bump this on quantization format changes +#define GGML_V3_QNT_VERSION_FACTOR 1000 // do not change this + +#define GGML_V3_MAX_DIMS 4 +#define GGML_V3_MAX_PARAMS 2048 +#define GGML_V3_MAX_CONTEXTS 64 +#define GGML_V3_MAX_SRC 10 +#ifndef GGML_V3_MAX_NAME +#define GGML_V3_MAX_NAME 64 +#endif +#define GGML_V3_MAX_OP_PARAMS 64 +#define GGML_V3_DEFAULT_N_THREADS 4 +#define GGML_V3_DEFAULT_GRAPH_SIZE 2048 +#if UINTPTR_MAX == 0xFFFFFFFF + #define GGML_V3_MEM_ALIGN 4 +#else + #define GGML_V3_MEM_ALIGN 16 +#endif + +#define GGML_V3_EXIT_SUCCESS 0 +#define GGML_V3_EXIT_ABORTED 1 + +#define GGUF_V3_MAGIC "GGUF" + +#define GGUF_V3_VERSION 3 + +#define GGUF_V3_DEFAULT_ALIGNMENT 32 + +#define GGML_V3_UNUSED(x) (void)(x) + +#define GGML_V3_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1)) + +#define GGML_V3_ASSERT_CONTINUE(x) \ + do { \ + if (!(x)) { \ + fprintf(stderr, "GGML_V3_ASSERT_CONTINUE: %s:%d: %s\n", __FILE__, __LINE__, #x); \ + } \ + } while (0) + +#define GGML_V3_ASSERT(x) \ + do { \ + if (!(x)) { \ + fflush(stdout); \ + fprintf(stderr, "GGML_V3_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ + ggml_v3_print_backtrace(); \ + abort(); \ + } \ + } while (0) + +#ifndef NDEBUG +#define GGML_V3_UNREACHABLE() GGML_V3_ASSERT(!"statement should not be reached") +#elif defined(__GNUC__) +#define GGML_V3_UNREACHABLE() __builtin_unreachable() +#elif defined(_MSC_VER) +#define GGML_V3_UNREACHABLE() __assume(0) +#else +#define GGML_V3_UNREACHABLE() ((void) 0) +#endif + +// used to copy the number of elements and stride in bytes of tensors into local variables.
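As a small, hypothetical illustration of the ne[]/nb[] convention used in the access example above (this helper is not part of the header), reading one F32 element of a possibly non-contiguous 4-D tensor reduces to a byte-offset computation over the strides:

// Illustrative only: fetch element (i0, i1, i2, i3) of an F32 tensor by walking
// the byte strides, so it also works for transposed or permuted views.
static float tensor_get_f32(const struct ggml_v3_tensor * t, int i0, int i1, int i2, int i3) {
    const char * base = (const char *) t->data;
    return *(const float *) (base + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3]);
}

For general use, the header's own ggml_v3_get_f32_nd(), declared below, covers this (and other element types) without a hand-written helper.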
+// main purpose is to reduce code duplication and improve readability. +// +// example: +// +// GGML_V3_TENSOR_LOCALS(int64_t, ne1, src1, ne); +// GGML_V3_TENSOR_LOCALS(size_t, nb1, src1, nb); +// +#define GGML_V3_TENSOR_LOCALS_1(type, prefix, pointer, array) \ + const type prefix##0 = (pointer)->array[0]; \ + GGML_V3_UNUSED(prefix##0); +#define GGML_V3_TENSOR_LOCALS_2(type, prefix, pointer, array) \ + GGML_V3_TENSOR_LOCALS_1 (type, prefix, pointer, array) \ + const type prefix##1 = (pointer)->array[1]; \ + GGML_V3_UNUSED(prefix##1); +#define GGML_V3_TENSOR_LOCALS_3(type, prefix, pointer, array) \ + GGML_V3_TENSOR_LOCALS_2 (type, prefix, pointer, array) \ + const type prefix##2 = (pointer)->array[2]; \ + GGML_V3_UNUSED(prefix##2); +#define GGML_V3_TENSOR_LOCALS(type, prefix, pointer, array) \ + GGML_V3_TENSOR_LOCALS_3 (type, prefix, pointer, array) \ + const type prefix##3 = (pointer)->array[3]; \ + GGML_V3_UNUSED(prefix##3); + +#define GGML_V3_TENSOR_UNARY_OP_LOCALS \ + GGML_V3_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_V3_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + GGML_V3_TENSOR_LOCALS(int64_t, ne, dst, ne) \ + GGML_V3_TENSOR_LOCALS(size_t, nb, dst, nb) + +#define GGML_V3_TENSOR_BINARY_OP_LOCALS \ + GGML_V3_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_V3_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + GGML_V3_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ + GGML_V3_TENSOR_LOCALS(size_t, nb1, src1, nb) \ + GGML_V3_TENSOR_LOCALS(int64_t, ne, dst, ne) \ + GGML_V3_TENSOR_LOCALS(size_t, nb, dst, nb) + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(__ARM_NEON) && defined(__CUDACC__) + typedef half ggml_v3_fp16_t; +#elif defined(__ARM_NEON) && !defined(_MSC_VER) + typedef __fp16 ggml_v3_fp16_t; +#else + typedef uint16_t ggml_v3_fp16_t; +#endif + + // convert FP16 <-> FP32 + GGML_V3_API float ggml_v3_fp16_to_fp32(ggml_v3_fp16_t x); + GGML_V3_API ggml_v3_fp16_t ggml_v3_fp32_to_fp16(float x); + + GGML_V3_API void ggml_v3_fp16_to_fp32_row(const ggml_v3_fp16_t * x, float * y, int n); + GGML_V3_API void ggml_v3_fp32_to_fp16_row(const float * x, ggml_v3_fp16_t * y, int n); + + struct ggml_v3_object; + struct ggml_v3_context; + + enum ggml_v3_type { + GGML_V3_TYPE_F32 = 0, + GGML_V3_TYPE_F16 = 1, + GGML_V3_TYPE_Q4_0 = 2, + GGML_V3_TYPE_Q4_1 = 3, + // GGML_V3_TYPE_Q4_2 = 4, support has been removed + // GGML_V3_TYPE_Q4_3 (5) support has been removed + GGML_V3_TYPE_Q5_0 = 6, + GGML_V3_TYPE_Q5_1 = 7, + GGML_V3_TYPE_Q8_0 = 8, + GGML_V3_TYPE_Q8_1 = 9, + // k-quantizations + GGML_V3_TYPE_Q2_K = 10, + GGML_V3_TYPE_Q3_K = 11, + GGML_V3_TYPE_Q4_K = 12, + GGML_V3_TYPE_Q5_K = 13, + GGML_V3_TYPE_Q6_K = 14, + GGML_V3_TYPE_Q8_K = 15, + GGML_V3_TYPE_IQ2_XXS = 16, + GGML_V3_TYPE_IQ2_XS = 17, + GGML_V3_TYPE_I8, + GGML_V3_TYPE_I16, + GGML_V3_TYPE_I32, + GGML_V3_TYPE_COUNT, + }; + + // precision + enum ggml_v3_prec { + GGML_V3_PREC_DEFAULT, + GGML_V3_PREC_F32, + }; + + enum ggml_v3_backend_type { + GGML_V3_BACKEND_CPU = 0, + GGML_V3_BACKEND_GPU = 10, + GGML_V3_BACKEND_GPU_SPLIT = 20, + }; + + // model file types + enum ggml_v3_ftype { + GGML_V3_FTYPE_UNKNOWN = -1, + GGML_V3_FTYPE_ALL_F32 = 0, + GGML_V3_FTYPE_MOSTLY_F16 = 1, // except 1d tensors + GGML_V3_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors + GGML_V3_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors + GGML_V3_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 + GGML_V3_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors + GGML_V3_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors + GGML_V3_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors + 
GGML_V3_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors + GGML_V3_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors + GGML_V3_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors + GGML_V3_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors + GGML_V3_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors + GGML_V3_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors + GGML_V3_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors + }; + + // available tensor operations: + enum ggml_v3_op { + GGML_V3_OP_NONE = 0, + + GGML_V3_OP_DUP, + GGML_V3_OP_ADD, + GGML_V3_OP_ADD1, + GGML_V3_OP_ACC, + GGML_V3_OP_SUB, + GGML_V3_OP_MUL, + GGML_V3_OP_DIV, + GGML_V3_OP_SQR, + GGML_V3_OP_SQRT, + GGML_V3_OP_LOG, + GGML_V3_OP_SUM, + GGML_V3_OP_SUM_ROWS, + GGML_V3_OP_MEAN, + GGML_V3_OP_ARGMAX, + GGML_V3_OP_REPEAT, + GGML_V3_OP_REPEAT_BACK, + GGML_V3_OP_CONCAT, + GGML_V3_OP_SILU_BACK, + GGML_V3_OP_NORM, // normalize + GGML_V3_OP_RMS_NORM, + GGML_V3_OP_RMS_NORM_BACK, + GGML_V3_OP_GROUP_NORM, + + GGML_V3_OP_MUL_MAT, + GGML_V3_OP_MUL_MAT_ID, + GGML_V3_OP_OUT_PROD, + + GGML_V3_OP_SCALE, + GGML_V3_OP_SET, + GGML_V3_OP_CPY, + GGML_V3_OP_CONT, + GGML_V3_OP_RESHAPE, + GGML_V3_OP_VIEW, + GGML_V3_OP_PERMUTE, + GGML_V3_OP_TRANSPOSE, + GGML_V3_OP_GET_ROWS, + GGML_V3_OP_GET_ROWS_BACK, + GGML_V3_OP_DIAG, + GGML_V3_OP_DIAG_MASK_INF, + GGML_V3_OP_DIAG_MASK_ZERO, + GGML_V3_OP_SOFT_MAX, + GGML_V3_OP_SOFT_MAX_BACK, + GGML_V3_OP_ROPE, + GGML_V3_OP_ROPE_BACK, + GGML_V3_OP_ALIBI, + GGML_V3_OP_CLAMP, + GGML_V3_OP_CONV_TRANSPOSE_1D, + GGML_V3_OP_IM2COL, + GGML_V3_OP_CONV_TRANSPOSE_2D, + GGML_V3_OP_POOL_1D, + GGML_V3_OP_POOL_2D, + GGML_V3_OP_UPSCALE, // nearest interpolate + GGML_V3_OP_PAD, + GGML_V3_OP_ARGSORT, + GGML_V3_OP_LEAKY_RELU, + + GGML_V3_OP_FLASH_ATTN, + GGML_V3_OP_FLASH_FF, + GGML_V3_OP_FLASH_ATTN_BACK, + GGML_V3_OP_WIN_PART, + GGML_V3_OP_WIN_UNPART, + GGML_V3_OP_GET_REL_POS, + GGML_V3_OP_ADD_REL_POS, + + GGML_V3_OP_UNARY, + + GGML_V3_OP_MAP_UNARY, + GGML_V3_OP_MAP_BINARY, + + GGML_V3_OP_MAP_CUSTOM1_F32, + GGML_V3_OP_MAP_CUSTOM2_F32, + GGML_V3_OP_MAP_CUSTOM3_F32, + + GGML_V3_OP_MAP_CUSTOM1, + GGML_V3_OP_MAP_CUSTOM2, + GGML_V3_OP_MAP_CUSTOM3, + + GGML_V3_OP_CROSS_ENTROPY_LOSS, + GGML_V3_OP_CROSS_ENTROPY_LOSS_BACK, + + GGML_V3_OP_COUNT, + }; + + enum ggml_v3_unary_op { + GGML_V3_UNARY_OP_ABS, + GGML_V3_UNARY_OP_SGN, + GGML_V3_UNARY_OP_NEG, + GGML_V3_UNARY_OP_STEP, + GGML_V3_UNARY_OP_TANH, + GGML_V3_UNARY_OP_ELU, + GGML_V3_UNARY_OP_RELU, + GGML_V3_UNARY_OP_GELU, + GGML_V3_UNARY_OP_GELU_QUICK, + GGML_V3_UNARY_OP_SILU, + + GGML_V3_UNARY_OP_COUNT, + }; + + enum ggml_v3_object_type { + GGML_V3_OBJECT_TENSOR, + GGML_V3_OBJECT_GRAPH, + GGML_V3_OBJECT_WORK_BUFFER + }; + + enum ggml_v3_log_level { + GGML_V3_LOG_LEVEL_ERROR = 2, + GGML_V3_LOG_LEVEL_WARN = 3, + GGML_V3_LOG_LEVEL_INFO = 4, + GGML_V3_LOG_LEVEL_DEBUG = 5 + }; + + // ggml object + struct ggml_v3_object { + size_t offs; + size_t size; + + struct ggml_v3_object * next; + + enum ggml_v3_object_type type; + + char padding[4]; + }; + + static const size_t GGML_V3_OBJECT_SIZE = sizeof(struct ggml_v3_object); + + // n-dimensional tensor + struct ggml_v3_tensor { + enum ggml_v3_type type; + enum ggml_v3_backend_type backend; + + struct ggml_v3_backend_buffer * buffer; + + int64_t ne[GGML_V3_MAX_DIMS]; // number of elements + size_t nb[GGML_V3_MAX_DIMS]; // stride in bytes: + // nb[0] = ggml_v3_type_size(type) + // nb[1] = nb[0] * (ne[0] / ggml_v3_blck_size(type)) + padding + // nb[i] = nb[i-1] * ne[i-1] + + // compute data + enum ggml_v3_op op; + + // op params - allocated as int32_t for alignment + int32_t 
op_params[GGML_V3_MAX_OP_PARAMS / sizeof(int32_t)]; + + bool is_param; + + struct ggml_v3_tensor * grad; + struct ggml_v3_tensor * src[GGML_V3_MAX_SRC]; + + // performance + int perf_runs; + int64_t perf_cycles; + int64_t perf_time_us; + + struct ggml_v3_tensor * view_src; + size_t view_offs; + + void * data; + + char name[GGML_V3_MAX_NAME]; + + void * extra; // extra things e.g. for ggml-cuda.cu + + char padding[8]; + }; + + static const size_t GGML_V3_TENSOR_SIZE = sizeof(struct ggml_v3_tensor); + + // the compute plan that needs to be prepared for ggml_v3_graph_compute() + // since https://github.com/ggerganov/ggml/issues/287 + struct ggml_v3_cplan { + size_t work_size; // size of work buffer, calculated by `ggml_v3_graph_plan()` + uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_v3_graph_compute()` + + int n_threads; + + // abort ggml_v3_graph_compute when true + bool (*abort_callback)(void * data); + void * abort_callback_data; + }; + + enum ggml_v3_cgraph_eval_order { + GGML_V3_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0, + GGML_V3_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT, + GGML_V3_CGRAPH_EVAL_ORDER_COUNT + }; + + struct ggml_v3_hash_set { + size_t size; + struct ggml_v3_tensor ** keys; + }; + + // computation graph + struct ggml_v3_cgraph { + int size; + int n_nodes; + int n_leafs; + + struct ggml_v3_tensor ** nodes; + struct ggml_v3_tensor ** grads; + struct ggml_v3_tensor ** leafs; + + struct ggml_v3_hash_set visited_hash_table; + + enum ggml_v3_cgraph_eval_order order; + + // performance + int perf_runs; + int64_t perf_cycles; + int64_t perf_time_us; + }; + + // scratch buffer + struct ggml_v3_scratch { + size_t offs; + size_t size; + void * data; + }; + + struct ggml_v3_init_params { + // memory pool + size_t mem_size; // bytes + void * mem_buffer; // if NULL, memory will be allocated internally + bool no_alloc; // don't allocate memory for the tensor data + }; + + + // compute types + + // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled. + // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995. 
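A short editorial sketch of how the ggml_v3_cplan fields above are intended to be used with ggml_v3_graph_plan() and ggml_v3_graph_compute(), both declared further down in this header; it assumes a graph gf has already been built, and the malloc-backed buffer is just one possible choice for the caller-owned work area:

// Sketch: two-step plan/compute, honouring the rule that the caller allocates
// plan.work_data whenever plan.work_size > 0 (error handling omitted).
#include <stdint.h>
#include <stdlib.h>

static int compute_graph(struct ggml_v3_cgraph * gf, int n_threads) {
    struct ggml_v3_cplan plan = ggml_v3_graph_plan(gf, n_threads);

    uint8_t * work = NULL;
    if (plan.work_size > 0) {
        work = (uint8_t *) malloc(plan.work_size); // caller-owned work buffer
        plan.work_data = work;
    }

    const int rc = ggml_v3_graph_compute(gf, &plan); // GGML_V3_EXIT_SUCCESS on success

    free(work);
    return rc;
}

ggml_v3_graph_compute_with_ctx(), also declared below, wraps the same pattern but allocates the work data as part of the context instead.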
+ enum ggml_v3_task_type { + GGML_V3_TASK_INIT = 0, + GGML_V3_TASK_COMPUTE, + GGML_V3_TASK_FINALIZE, + }; + + struct ggml_v3_compute_params { + enum ggml_v3_task_type type; + + // ith = thread index, nth = number of threads + int ith, nth; + + // work buffer for all threads + size_t wsize; + void * wdata; + }; + + // misc + + GGML_V3_API void ggml_v3_time_init(void); // call this once at the beginning of the program + GGML_V3_API int64_t ggml_v3_time_ms(void); + GGML_V3_API int64_t ggml_v3_time_us(void); + GGML_V3_API int64_t ggml_v3_cycles(void); + GGML_V3_API int64_t ggml_v3_cycles_per_ms(void); + + GGML_V3_API void ggml_v3_print_backtrace(void); + + GGML_V3_API void ggml_v3_numa_init(void); // call once for better performance on NUMA systems + GGML_V3_API bool ggml_v3_is_numa(void); // true if init detected that system has >1 NUMA node + + GGML_V3_API void ggml_v3_print_object (const struct ggml_v3_object * obj); + GGML_V3_API void ggml_v3_print_objects(const struct ggml_v3_context * ctx); + + GGML_V3_API int64_t ggml_v3_nelements (const struct ggml_v3_tensor * tensor); + GGML_V3_API int64_t ggml_v3_nrows (const struct ggml_v3_tensor * tensor); + GGML_V3_API size_t ggml_v3_nbytes (const struct ggml_v3_tensor * tensor); + GGML_V3_API size_t ggml_v3_nbytes_pad (const struct ggml_v3_tensor * tensor); // same as ggml_v3_nbytes() but padded to GGML_V3_MEM_ALIGN + + GGML_V3_API int ggml_v3_blck_size(enum ggml_v3_type type); + GGML_V3_API size_t ggml_v3_type_size(enum ggml_v3_type type); // size in bytes for all elements in a block + GGML_V3_API size_t ggml_v3_row_size (enum ggml_v3_type type, int64_t ne); // size in bytes for all elements in a row + + GGML_V3_DEPRECATED( + GGML_V3_API double ggml_v3_type_sizef(enum ggml_v3_type type), // ggml_v3_type_size()/ggml_v3_blck_size() as float + "use ggml_v3_row_size() instead"); + + GGML_V3_API const char * ggml_v3_type_name(enum ggml_v3_type type); + GGML_V3_API const char * ggml_v3_op_name (enum ggml_v3_op op); + GGML_V3_API const char * ggml_v3_op_symbol(enum ggml_v3_op op); + + GGML_V3_API const char * ggml_v3_unary_op_name(enum ggml_v3_unary_op op); + GGML_V3_API const char * ggml_v3_op_desc(const struct ggml_v3_tensor * t); // unary or op name + + GGML_V3_API size_t ggml_v3_element_size(const struct ggml_v3_tensor * tensor); + + GGML_V3_API bool ggml_v3_is_quantized(enum ggml_v3_type type); + + // TODO: temporary until model loading of ggml examples is refactored + GGML_V3_API enum ggml_v3_type ggml_v3_ftype_to_ggml_v3_type(enum ggml_v3_ftype ftype); + + GGML_V3_API bool ggml_v3_is_transposed(const struct ggml_v3_tensor * tensor); + GGML_V3_API bool ggml_v3_is_contiguous(const struct ggml_v3_tensor * tensor); + GGML_V3_API bool ggml_v3_is_permuted (const struct ggml_v3_tensor * tensor); + GGML_V3_API bool ggml_v3_is_scalar (const struct ggml_v3_tensor * tensor); + GGML_V3_API bool ggml_v3_is_vector (const struct ggml_v3_tensor * tensor); + GGML_V3_API bool ggml_v3_is_matrix (const struct ggml_v3_tensor * tensor); + GGML_V3_API bool ggml_v3_is_3d (const struct ggml_v3_tensor * tensor); + GGML_V3_API int ggml_v3_n_dims (const struct ggml_v3_tensor * tensor); // returns 1 for scalars + + GGML_V3_API bool ggml_v3_are_same_shape(const struct ggml_v3_tensor * t0, const struct ggml_v3_tensor * t1); + + // use this to compute the memory overhead of a tensor + GGML_V3_API size_t ggml_v3_tensor_overhead(void); + + // main + + GGML_V3_API struct ggml_v3_context * ggml_v3_init(struct ggml_v3_init_params params); + GGML_V3_API void ggml_v3_free(struct 
ggml_v3_context * ctx); + + GGML_V3_API size_t ggml_v3_used_mem(const struct ggml_v3_context * ctx); + + GGML_V3_API size_t ggml_v3_set_scratch (struct ggml_v3_context * ctx, struct ggml_v3_scratch scratch); + GGML_V3_API bool ggml_v3_get_no_alloc(struct ggml_v3_context * ctx); + GGML_V3_API void ggml_v3_set_no_alloc(struct ggml_v3_context * ctx, bool no_alloc); + + GGML_V3_API void * ggml_v3_get_mem_buffer (const struct ggml_v3_context * ctx); + GGML_V3_API size_t ggml_v3_get_mem_size (const struct ggml_v3_context * ctx); + GGML_V3_API size_t ggml_v3_get_max_tensor_size(const struct ggml_v3_context * ctx); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_new_tensor( + struct ggml_v3_context * ctx, + enum ggml_v3_type type, + int n_dims, + const int64_t *ne); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_new_tensor_1d( + struct ggml_v3_context * ctx, + enum ggml_v3_type type, + int64_t ne0); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_new_tensor_2d( + struct ggml_v3_context * ctx, + enum ggml_v3_type type, + int64_t ne0, + int64_t ne1); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_new_tensor_3d( + struct ggml_v3_context * ctx, + enum ggml_v3_type type, + int64_t ne0, + int64_t ne1, + int64_t ne2); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_new_tensor_4d( + struct ggml_v3_context * ctx, + enum ggml_v3_type type, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_new_i32(struct ggml_v3_context * ctx, int32_t value); + GGML_V3_API struct ggml_v3_tensor * ggml_v3_new_f32(struct ggml_v3_context * ctx, float value); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_dup_tensor (struct ggml_v3_context * ctx, const struct ggml_v3_tensor * src); + GGML_V3_API struct ggml_v3_tensor * ggml_v3_view_tensor(struct ggml_v3_context * ctx, struct ggml_v3_tensor * src); + + // Context tensor enumeration and lookup + GGML_V3_API struct ggml_v3_tensor * ggml_v3_get_first_tensor(const struct ggml_v3_context * ctx); + GGML_V3_API struct ggml_v3_tensor * ggml_v3_get_next_tensor (const struct ggml_v3_context * ctx, struct ggml_v3_tensor * tensor); + GGML_V3_API struct ggml_v3_tensor * ggml_v3_get_tensor(struct ggml_v3_context * ctx, const char * name); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_set_zero(struct ggml_v3_tensor * tensor); + GGML_V3_API struct ggml_v3_tensor * ggml_v3_set_i32 (struct ggml_v3_tensor * tensor, int32_t value); + GGML_V3_API struct ggml_v3_tensor * ggml_v3_set_f32 (struct ggml_v3_tensor * tensor, float value); + + // Converts a flat index into coordinates + GGML_V3_API void ggml_v3_unravel_index(const struct ggml_v3_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3); + + GGML_V3_API int32_t ggml_v3_get_i32_1d(const struct ggml_v3_tensor * tensor, int i); + GGML_V3_API void ggml_v3_set_i32_1d(const struct ggml_v3_tensor * tensor, int i, int32_t value); + + GGML_V3_API int32_t ggml_v3_get_i32_nd(const struct ggml_v3_tensor * tensor, int i0, int i1, int i2, int i3); + GGML_V3_API void ggml_v3_set_i32_nd(const struct ggml_v3_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value); + + GGML_V3_API float ggml_v3_get_f32_1d(const struct ggml_v3_tensor * tensor, int i); + GGML_V3_API void ggml_v3_set_f32_1d(const struct ggml_v3_tensor * tensor, int i, float value); + + GGML_V3_API float ggml_v3_get_f32_nd(const struct ggml_v3_tensor * tensor, int i0, int i1, int i2, int i3); + GGML_V3_API void ggml_v3_set_f32_nd(const struct ggml_v3_tensor * tensor, int i0, int i1, int i2, int i3, 
float value); + + GGML_V3_API void * ggml_v3_get_data (const struct ggml_v3_tensor * tensor); + GGML_V3_API float * ggml_v3_get_data_f32(const struct ggml_v3_tensor * tensor); + + GGML_V3_API enum ggml_v3_unary_op ggml_v3_get_unary_op(const struct ggml_v3_tensor * tensor); + + GGML_V3_API const char * ggml_v3_get_name (const struct ggml_v3_tensor * tensor); + GGML_V3_API struct ggml_v3_tensor * ggml_v3_set_name ( struct ggml_v3_tensor * tensor, const char * name); + GGML_V3_ATTRIBUTE_FORMAT(2, 3) + GGML_V3_API struct ggml_v3_tensor * ggml_v3_format_name( struct ggml_v3_tensor * tensor, const char * fmt, ...); + + // + // operations on tensors with backpropagation + // + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_dup( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + // in-place, returns view(a) + GGML_V3_API struct ggml_v3_tensor * ggml_v3_dup_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_add( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_add_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_add_cast( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + enum ggml_v3_type type); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_add1( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_add1_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b); + + // dst = a + // view(dst, nb1, nb2, nb3, offset) += b + // return dst + GGML_V3_API struct ggml_v3_tensor * ggml_v3_acc( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_acc_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_sub( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_sub_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_mul( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_mul_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_div( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_div_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_sqr( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_sqr_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_sqrt( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_sqrt_inplace( + struct ggml_v3_context * ctx, + struct 
ggml_v3_tensor * a); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_log( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_log_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + // return scalar + GGML_V3_API struct ggml_v3_tensor * ggml_v3_sum( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d] + GGML_V3_API struct ggml_v3_tensor * ggml_v3_sum_rows( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + // mean along rows + GGML_V3_API struct ggml_v3_tensor * ggml_v3_mean( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + // argmax along rows + GGML_V3_API struct ggml_v3_tensor * ggml_v3_argmax( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + // if a is the same shape as b, and a is not parameter, return a + // otherwise, return a new tensor: repeat(a) to fit in b + GGML_V3_API struct ggml_v3_tensor * ggml_v3_repeat( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b); + + // sums repetitions in a into shape of b + GGML_V3_API struct ggml_v3_tensor * ggml_v3_repeat_back( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b); + + // concat a and b on dim 2 + // used in stable-diffusion + GGML_V3_API struct ggml_v3_tensor * ggml_v3_concat( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_abs( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_abs_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_sgn( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_sgn_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_neg( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_neg_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_step( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_step_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_tanh( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_tanh_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_elu( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_elu_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_relu( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_leaky_relu( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, float negative_slope, bool inplace); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_relu_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_gelu( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_gelu_inplace( + struct 
ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_gelu_quick( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_gelu_quick_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_silu( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_silu_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + // a - x + // b - dy + GGML_V3_API struct ggml_v3_tensor * ggml_v3_silu_back( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b); + + // normalize along rows + GGML_V3_API struct ggml_v3_tensor * ggml_v3_norm( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + float eps); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_norm_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + float eps); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_rms_norm( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + float eps); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_rms_norm_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + float eps); + + // group normalize along ne0*ne1*n_groups + // used in stable-diffusion + // TODO: eps is hardcoded to 1e-6 for now + GGML_V3_API struct ggml_v3_tensor * ggml_v3_group_norm( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int n_groups); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_group_norm_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int n_groups); + + // a - x + // b - dy + GGML_V3_API struct ggml_v3_tensor * ggml_v3_rms_norm_back( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + float eps); + + // A: k columns, n rows => [ne03, ne02, n, k] + // B: k columns, m rows (i.e. 
we transpose it internally) => [ne03 * x, ne02 * y, m, k] + // result is n columns, m rows => [ne03 * x, ne02 * y, m, n] + GGML_V3_API struct ggml_v3_tensor * ggml_v3_mul_mat( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b); + + // change the precision of a matrix multiplication + // set to GGML_V3_PREC_F32 for higher precision (useful for phi-2) + GGML_V3_API void ggml_v3_mul_mat_set_prec( + struct ggml_v3_tensor * a, + enum ggml_v3_prec prec); + + // indirect matrix multiplication + // ggml_v3_mul_mat_id(ctx, as, ids, id, b) ~= ggml_v3_mul_mat(as[ids[id]], b) + GGML_V3_API struct ggml_v3_tensor * ggml_v3_mul_mat_id( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * const as[], + int n_as, + struct ggml_v3_tensor * ids, + int id, + struct ggml_v3_tensor * b); + + // A: m columns, n rows, + // B: p columns, n rows, + // result is m columns, p rows + GGML_V3_API struct ggml_v3_tensor * ggml_v3_out_prod( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b); + + // + // operations on tensors without backpropagation + // + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_scale( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + float s); + + // in-place, returns view(a) + GGML_V3_API struct ggml_v3_tensor * ggml_v3_scale_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + float s); + + // b -> view(a,offset,nb1,nb2,3), return modified a + GGML_V3_API struct ggml_v3_tensor * ggml_v3_set( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + + // b -> view(a,offset,nb1,nb2,3), return view(a) + GGML_V3_API struct ggml_v3_tensor * ggml_v3_set_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_set_1d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + size_t offset); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_set_1d_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + size_t offset); + + // b -> view(a,offset,nb1,nb2,3), return modified a + GGML_V3_API struct ggml_v3_tensor * ggml_v3_set_2d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + size_t nb1, + size_t offset); + + // b -> view(a,offset,nb1,nb2,3), return view(a) + GGML_V3_API struct ggml_v3_tensor * ggml_v3_set_2d_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + size_t nb1, + size_t offset); + + // a -> b, return view(b) + GGML_V3_API struct ggml_v3_tensor * ggml_v3_cpy( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b); + + // make contiguous + GGML_V3_API struct ggml_v3_tensor * ggml_v3_cont( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + // make contiguous, with new shape + GGML_V3_API struct ggml_v3_tensor * ggml_v3_cont_1d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int64_t ne0); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_cont_2d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int64_t ne0, + int64_t ne1); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_cont_3d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2); + + GGML_V3_API struct 
ggml_v3_tensor * ggml_v3_cont_4d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3); + + // return view(a), b specifies the new shape + // TODO: when we start computing gradient, make a copy instead of view + GGML_V3_API struct ggml_v3_tensor * ggml_v3_reshape( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b); + + // return view(a) + // TODO: when we start computing gradient, make a copy instead of view + GGML_V3_API struct ggml_v3_tensor * ggml_v3_reshape_1d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int64_t ne0); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_reshape_2d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int64_t ne0, + int64_t ne1); + + // return view(a) + // TODO: when we start computing gradient, make a copy instead of view + GGML_V3_API struct ggml_v3_tensor * ggml_v3_reshape_3d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_reshape_4d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3); + + // offset in bytes + GGML_V3_API struct ggml_v3_tensor * ggml_v3_view_1d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int64_t ne0, + size_t offset); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_view_2d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int64_t ne0, + int64_t ne1, + size_t nb1, // row stride in bytes + size_t offset); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_view_3d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + size_t nb1, // row stride in bytes + size_t nb2, // slice stride in bytes + size_t offset); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_view_4d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3, + size_t nb1, // row stride in bytes + size_t nb2, // slice stride in bytes + size_t nb3, + size_t offset); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_permute( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int axis0, + int axis1, + int axis2, + int axis3); + + // alias for ggml_v3_permute(ctx, a, 1, 0, 2, 3) + GGML_V3_API struct ggml_v3_tensor * ggml_v3_transpose( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + // supports 3D: a->ne[2] == b->ne[1] + GGML_V3_API struct ggml_v3_tensor * ggml_v3_get_rows( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_get_rows_back( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + struct ggml_v3_tensor * c); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_diag( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + // set elements above the diagonal to -INF + GGML_V3_API struct ggml_v3_tensor * ggml_v3_diag_mask_inf( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int n_past); + + // in-place, returns view(a) + GGML_V3_API struct ggml_v3_tensor * ggml_v3_diag_mask_inf_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int n_past); + + // set elements above the diagonal to 0 + GGML_V3_API struct ggml_v3_tensor * ggml_v3_diag_mask_zero( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int n_past); + + // 
in-place, returns view(a) + GGML_V3_API struct ggml_v3_tensor * ggml_v3_diag_mask_zero_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int n_past); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_soft_max( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + // in-place, returns view(a) + GGML_V3_API struct ggml_v3_tensor * ggml_v3_soft_max_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a); + + // fused soft_max(a*scale + mask) + // mask is optional + GGML_V3_API struct ggml_v3_tensor * ggml_v3_soft_max_ext( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * mask, + float scale); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_soft_max_back( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b); + + // in-place, returns view(a) + GGML_V3_API struct ggml_v3_tensor * ggml_v3_soft_max_back_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b); + + // rotary position embedding + // if mode & 1 == 1, skip n_past elements (DEPRECATED) + // if mode & 2 == 1, GPT-NeoX style + // if mode & 4 == 1, ChatGLM style + // + // b is an int32 vector with size a->ne[2], it contains the positions + GGML_V3_API struct ggml_v3_tensor * ggml_v3_rope( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + int n_dims, + int mode, + int n_ctx); + + // in-place, returns view(a) + GGML_V3_API struct ggml_v3_tensor * ggml_v3_rope_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + int n_dims, + int mode, + int n_ctx); + + // custom RoPE + GGML_V3_API struct ggml_v3_tensor * ggml_v3_rope_custom( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + int n_dims, + int mode, + int n_ctx, + int n_orig_ctx, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow); + + // in-place, returns view(a) + GGML_V3_API struct ggml_v3_tensor * ggml_v3_rope_custom_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + int n_dims, + int mode, + int n_ctx, + int n_orig_ctx, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow); + + // compute correction dims for YaRN RoPE scaling + void ggml_v3_rope_yarn_corr_dims( + int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]); + + // xPos RoPE, in-place, returns view(a) + GGML_V3_API struct ggml_v3_tensor * ggml_v3_rope_xpos_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + int n_dims, + float base, + bool down); + + // rotary position embedding backward, i.e compute dx from dy + // a - dy + GGML_V3_API struct ggml_v3_tensor * ggml_v3_rope_back( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + int n_dims, + int mode, + int n_ctx, + int n_orig_ctx, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow, + float xpos_base, + bool xpos_down); + + // alibi position embedding + // in-place, returns view(a) + GGML_V3_API struct ggml_v3_tensor * ggml_v3_alibi( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int n_past, + int n_head, + float bias_max); + + // clamp + // in-place, returns view(a) + GGML_V3_API struct ggml_v3_tensor * 
ggml_v3_clamp( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + float min, + float max); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_im2col( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1, + bool is_2D); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_conv_1d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + int s0, // stride + int p0, // padding + int d0); // dilation + + // conv_1d with padding = half + // alias for ggml_v3_conv_1d(a, b, s, a->ne[0]/2, d) + GGML_V3_API struct ggml_v3_tensor* ggml_v3_conv_1d_ph( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + int s, + int d); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_conv_transpose_1d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + int s0, + int p0, + int d0); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_conv_2d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1); + + + // kernel size is a->ne[0] x a->ne[1] + // stride is equal to kernel size + // padding is zero + // example: + // a: 16 16 3 768 + // b: 1024 1024 3 1 + // res: 64 64 768 1 + // used in sam + GGML_V3_API struct ggml_v3_tensor * ggml_v3_conv_2d_sk_p0( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b); + + // kernel size is a->ne[0] x a->ne[1] + // stride is 1 + // padding is half + // example: + // a: 3 3 256 256 + // b: 64 64 256 1 + // res: 64 64 256 1 + // used in sam + GGML_V3_API struct ggml_v3_tensor * ggml_v3_conv_2d_s1_ph( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_conv_transpose_2d_p0( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + int stride); + + enum ggml_v3_op_pool { + GGML_V3_OP_POOL_MAX, + GGML_V3_OP_POOL_AVG, + GGML_V3_OP_POOL_COUNT, + }; + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_pool_1d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + enum ggml_v3_op_pool op, + int k0, // kernel size + int s0, // stride + int p0); // padding + + // the result will have 2*p0 padding for the first dimension + // and 2*p1 padding for the second dimension + GGML_V3_API struct ggml_v3_tensor * ggml_v3_pool_2d( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + enum ggml_v3_op_pool op, + int k0, + int k1, + int s0, + int s1, + float p0, + float p1); + + // nearest interpolate + // used in stable-diffusion + GGML_V3_API struct ggml_v3_tensor * ggml_v3_upscale( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int scale_factor); + + // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0] + GGML_V3_API struct ggml_v3_tensor * ggml_v3_pad( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int p0, + int p1, + int p2, + int p3); + + // sort rows + enum ggml_v3_sort_order { + GGML_V3_SORT_ASC, + GGML_V3_SORT_DESC, + }; + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_argsort( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + enum ggml_v3_sort_order order); + + // top k elements per row + GGML_V3_API struct ggml_v3_tensor * ggml_v3_top_k( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int k); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_flash_attn( + 
struct ggml_v3_context * ctx, + struct ggml_v3_tensor * q, + struct ggml_v3_tensor * k, + struct ggml_v3_tensor * v, + bool masked); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_flash_attn_back( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * q, + struct ggml_v3_tensor * k, + struct ggml_v3_tensor * v, + struct ggml_v3_tensor * d, + bool masked); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_flash_ff( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b0, + struct ggml_v3_tensor * b1, + struct ggml_v3_tensor * c0, + struct ggml_v3_tensor * c1); + + // partition into non-overlapping windows with padding if needed + // example: + // a: 768 64 64 1 + // w: 14 + // res: 768 14 14 25 + // used in sam + GGML_V3_API struct ggml_v3_tensor * ggml_v3_win_part( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int w); + + // reverse of ggml_v3_win_part + // used in sam + GGML_V3_API struct ggml_v3_tensor * ggml_v3_win_unpart( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int w0, + int h0, + int w); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_unary( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + enum ggml_v3_unary_op op); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_unary_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + enum ggml_v3_unary_op op); + + // used in sam + GGML_V3_API struct ggml_v3_tensor * ggml_v3_get_rel_pos( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + int qh, + int kh); + + // used in sam + GGML_V3_API struct ggml_v3_tensor * ggml_v3_add_rel_pos( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * pw, + struct ggml_v3_tensor * ph); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_add_rel_pos_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * pw, + struct ggml_v3_tensor * ph); + + // custom operators + + typedef void (*ggml_v3_unary_op_f32_t) (const int, float *, const float *); + typedef void (*ggml_v3_binary_op_f32_t)(const int, float *, const float *, const float *); + + typedef void (*ggml_v3_custom1_op_f32_t)(struct ggml_v3_tensor *, const struct ggml_v3_tensor *); + typedef void (*ggml_v3_custom2_op_f32_t)(struct ggml_v3_tensor *, const struct ggml_v3_tensor *, const struct ggml_v3_tensor *); + typedef void (*ggml_v3_custom3_op_f32_t)(struct ggml_v3_tensor *, const struct ggml_v3_tensor *, const struct ggml_v3_tensor *, const struct ggml_v3_tensor *); + + GGML_V3_DEPRECATED(GGML_V3_API struct ggml_v3_tensor * ggml_v3_map_unary_f32( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + ggml_v3_unary_op_f32_t fun), + "use ggml_v3_map_custom1 instead"); + + GGML_V3_DEPRECATED(GGML_V3_API struct ggml_v3_tensor * ggml_v3_map_unary_inplace_f32( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + ggml_v3_unary_op_f32_t fun), + "use ggml_v3_map_custom1_inplace instead"); + + GGML_V3_DEPRECATED(GGML_V3_API struct ggml_v3_tensor * ggml_v3_map_binary_f32( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + ggml_v3_binary_op_f32_t fun), + "use ggml_v3_map_custom2 instead"); + + GGML_V3_DEPRECATED(GGML_V3_API struct ggml_v3_tensor * ggml_v3_map_binary_inplace_f32( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + ggml_v3_binary_op_f32_t fun), + "use ggml_v3_map_custom2_inplace instead"); + + GGML_V3_DEPRECATED(GGML_V3_API struct ggml_v3_tensor * ggml_v3_map_custom1_f32( + 
struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + ggml_v3_custom1_op_f32_t fun), + "use ggml_v3_map_custom1 instead"); + + GGML_V3_DEPRECATED(GGML_V3_API struct ggml_v3_tensor * ggml_v3_map_custom1_inplace_f32( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + ggml_v3_custom1_op_f32_t fun), + "use ggml_v3_map_custom1_inplace instead"); + + GGML_V3_DEPRECATED(GGML_V3_API struct ggml_v3_tensor * ggml_v3_map_custom2_f32( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + ggml_v3_custom2_op_f32_t fun), + "use ggml_v3_map_custom2 instead"); + + GGML_V3_DEPRECATED(GGML_V3_API struct ggml_v3_tensor * ggml_v3_map_custom2_inplace_f32( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + ggml_v3_custom2_op_f32_t fun), + "use ggml_v3_map_custom2_inplace instead"); + + GGML_V3_DEPRECATED(GGML_V3_API struct ggml_v3_tensor * ggml_v3_map_custom3_f32( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + struct ggml_v3_tensor * c, + ggml_v3_custom3_op_f32_t fun), + "use ggml_v3_map_custom3 instead"); + + GGML_V3_DEPRECATED(GGML_V3_API struct ggml_v3_tensor * ggml_v3_map_custom3_inplace_f32( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + struct ggml_v3_tensor * c, + ggml_v3_custom3_op_f32_t fun), + "use ggml_v3_map_custom3_inplace instead"); + + // custom operators v2 + + typedef void (*ggml_v3_custom1_op_t)(struct ggml_v3_tensor * dst , const struct ggml_v3_tensor * a, int ith, int nth, void * userdata); + typedef void (*ggml_v3_custom2_op_t)(struct ggml_v3_tensor * dst , const struct ggml_v3_tensor * a, const struct ggml_v3_tensor * b, int ith, int nth, void * userdata); + typedef void (*ggml_v3_custom3_op_t)(struct ggml_v3_tensor * dst , const struct ggml_v3_tensor * a, const struct ggml_v3_tensor * b, const struct ggml_v3_tensor * c, int ith, int nth, void * userdata); + + #define GGML_V3_N_TASKS_MAX -1 + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_map_custom1( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + ggml_v3_custom1_op_t fun, + int n_tasks, + void * userdata); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_map_custom1_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + ggml_v3_custom1_op_t fun, + int n_tasks, + void * userdata); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_map_custom2( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + ggml_v3_custom2_op_t fun, + int n_tasks, + void * userdata); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_map_custom2_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + ggml_v3_custom2_op_t fun, + int n_tasks, + void * userdata); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_map_custom3( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + struct ggml_v3_tensor * c, + ggml_v3_custom3_op_t fun, + int n_tasks, + void * userdata); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_map_custom3_inplace( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + struct ggml_v3_tensor * c, + ggml_v3_custom3_op_t fun, + int n_tasks, + void * userdata); + + // loss function + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_cross_entropy_loss( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b); + + GGML_V3_API struct ggml_v3_tensor * 
ggml_v3_cross_entropy_loss_back( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * a, + struct ggml_v3_tensor * b, + struct ggml_v3_tensor * c); + + // + // automatic differentiation + // + + GGML_V3_API void ggml_v3_set_param( + struct ggml_v3_context * ctx, + struct ggml_v3_tensor * tensor); + + + GGML_V3_API void ggml_v3_build_forward_expand (struct ggml_v3_cgraph * cgraph, struct ggml_v3_tensor * tensor); + GGML_V3_API void ggml_v3_build_backward_expand(struct ggml_v3_context * ctx, struct ggml_v3_cgraph * gf, struct ggml_v3_cgraph * gb, bool keep); + + // graph allocation in a context + GGML_V3_API struct ggml_v3_cgraph * ggml_v3_new_graph (struct ggml_v3_context * ctx); // size = GGML_V3_DEFAULT_GRAPH_SIZE, grads = false + GGML_V3_API struct ggml_v3_cgraph * ggml_v3_new_graph_custom (struct ggml_v3_context * ctx, size_t size, bool grads); + GGML_V3_API struct ggml_v3_cgraph * ggml_v3_graph_dup (struct ggml_v3_context * ctx, struct ggml_v3_cgraph * cgraph); + GGML_V3_API struct ggml_v3_cgraph ggml_v3_graph_view (struct ggml_v3_cgraph * cgraph, int i0, int i1); + GGML_V3_API void ggml_v3_graph_cpy (struct ggml_v3_cgraph * src, struct ggml_v3_cgraph * dst); + GGML_V3_API void ggml_v3_graph_reset (struct ggml_v3_cgraph * cgraph); // zero grads + GGML_V3_API void ggml_v3_graph_clear (struct ggml_v3_cgraph * cgraph); + + GGML_V3_API size_t ggml_v3_graph_overhead(void); + GGML_V3_API size_t ggml_v3_graph_overhead_custom(size_t size, bool grads); + + // ggml_v3_graph_plan() has to be called before ggml_v3_graph_compute() + // when plan.work_size > 0, caller must allocate memory for plan.work_data + GGML_V3_API struct ggml_v3_cplan ggml_v3_graph_plan (struct ggml_v3_cgraph * cgraph, int n_threads /*= GGML_V3_DEFAULT_N_THREADS*/); + GGML_V3_API int ggml_v3_graph_compute(struct ggml_v3_cgraph * cgraph, struct ggml_v3_cplan * cplan); + + // same as ggml_v3_graph_compute() but the work data is allocated as a part of the context + // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data + GGML_V3_API void ggml_v3_graph_compute_with_ctx(struct ggml_v3_context * ctx, struct ggml_v3_cgraph * cgraph, int n_threads); + + GGML_V3_API struct ggml_v3_tensor * ggml_v3_graph_get_tensor(struct ggml_v3_cgraph * cgraph, const char * name); + + GGML_V3_API void ggml_v3_graph_export(const struct ggml_v3_cgraph * cgraph, const char * fname); + GGML_V3_API struct ggml_v3_cgraph * ggml_v3_graph_import(const char * fname, struct ggml_v3_context ** ctx_data, struct ggml_v3_context ** ctx_eval); + + // print info and performance information for the graph + GGML_V3_API void ggml_v3_graph_print(const struct ggml_v3_cgraph * cgraph); + + // dump the graph into a file using the dot format + GGML_V3_API void ggml_v3_graph_dump_dot(const struct ggml_v3_cgraph * gb, const struct ggml_v3_cgraph * gf, const char * filename); + + // build gradient checkpointing backward graph gb for gf using provided checkpoints + // gb_tmp will contain original backward graph with rewritten backward process nodes, + // but without the second forward pass nodes. 
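As an illustrative aside to the custom-operator and graph declarations above (not part of the patch itself): a minimal sketch of how the renamed ggml_v3 API might be driven end to end, using only symbols that appear in this header and in the *_v3.cpp hunks below. The 16 MB context size, the thread count, and the callback body are assumptions for illustration only, not taken from the koboldcpp sources.

/* Sketch only: build a graph containing one custom operator and evaluate it.
 * Assumes the context is sized generously enough for tensors, graph and work data. */
#include "ggml_v3.h"

#include <math.h>
#include <stdint.h>
#include <stdio.h>

// ggml_v3_custom1_op_t callback: dst[i] = tanh(a[i]), work split across nth threads by ith
static void tanh_op(struct ggml_v3_tensor * dst, const struct ggml_v3_tensor * a,
                    int ith, int nth, void * userdata) {
    (void) userdata;
    const int64_t n   = ggml_v3_nelements(dst);
    const int64_t per = (n + nth - 1) / nth;
    const int64_t i0  = per * ith;
    const int64_t i1  = (i0 + per < n) ? (i0 + per) : n;
    const float * src = (const float *) a->data;
    float       * out = (float *) dst->data;
    for (int64_t i = i0; i < i1; ++i) {
        out[i] = tanhf(src[i]);
    }
}

int main(void) {
    struct ggml_v3_init_params params;
    params.mem_size   = 16*1024*1024; // assumed to cover tensors + graph + work data
    params.mem_buffer = NULL;
    params.no_alloc   = false;

    struct ggml_v3_context * ctx = ggml_v3_init(params);

    struct ggml_v3_tensor * x = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, 8);
    for (int i = 0; i < 8; ++i) {
        ((float *) x->data)[i] = (float) i;
    }

    // GGML_V3_N_TASKS_MAX is assumed to let the runtime choose the task count
    struct ggml_v3_tensor * y = ggml_v3_map_custom1(ctx, x, tanh_op, GGML_V3_N_TASKS_MAX, NULL);

    struct ggml_v3_cgraph * gf = ggml_v3_new_graph(ctx);
    ggml_v3_build_forward_expand(gf, y);

    // work data is allocated from ctx, so no explicit cplan is needed in this variant
    ggml_v3_graph_compute_with_ctx(ctx, gf, 4);

    for (int i = 0; i < 8; ++i) {
        printf("tanh(%d) = %f\n", i, ((float *) ggml_v3_get_data(y))[i]);
    }

    ggml_v3_free(ctx);
    return 0;
}

Per the comments in the declarations above, the alternative path is to call ggml_v3_graph_plan() first and pass the resulting cplan to ggml_v3_graph_compute(), allocating plan.work_data yourself when plan.work_size > 0; ggml_v3_graph_compute_with_ctx() instead takes the work buffer from the context, which is why the context here is sized with headroom.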
+ GGML_V3_API void ggml_v3_build_backward_gradient_checkpointing( + struct ggml_v3_context * ctx, + struct ggml_v3_cgraph * gf, + struct ggml_v3_cgraph * gb, + struct ggml_v3_cgraph * gb_tmp, + struct ggml_v3_tensor * * checkpoints, + int n_checkpoints); + // + // optimization + // + + // optimization methods + enum ggml_v3_opt_type { + GGML_V3_OPT_ADAM, + GGML_V3_OPT_LBFGS, + }; + + // linesearch methods + enum ggml_v3_linesearch { + GGML_V3_LINESEARCH_DEFAULT = 1, + + GGML_V3_LINESEARCH_BACKTRACKING_ARMIJO = 0, + GGML_V3_LINESEARCH_BACKTRACKING_WOLFE = 1, + GGML_V3_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2, + }; + + // optimization return values + enum ggml_v3_opt_result { + GGML_V3_OPT_OK = 0, + GGML_V3_OPT_DID_NOT_CONVERGE, + GGML_V3_OPT_NO_CONTEXT, + GGML_V3_OPT_INVALID_WOLFE, + GGML_V3_OPT_FAIL, + GGML_V3_OPT_CANCEL, + + GGML_V3_LINESEARCH_FAIL = -128, + GGML_V3_LINESEARCH_MINIMUM_STEP, + GGML_V3_LINESEARCH_MAXIMUM_STEP, + GGML_V3_LINESEARCH_MAXIMUM_ITERATIONS, + GGML_V3_LINESEARCH_INVALID_PARAMETERS, + }; + + typedef void (*ggml_v3_opt_callback)(void * data, int accum_step, float * sched, bool * cancel); + typedef void (*ggml_v3_log_callback)(enum ggml_v3_log_level level, const char * text, void * user_data); + + // optimization parameters + // + // see ggml.c (ggml_v3_opt_default_params) for default values + // + struct ggml_v3_opt_params { + enum ggml_v3_opt_type type; + + size_t graph_size; + + int n_threads; + + // delta-based convergence test + // + // if past == 0 - disabled + // if past > 0: + // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|) + // + int past; + float delta; + + // maximum number of iterations without improvement + // + // if 0 - disabled + // if > 0: + // assume convergence if no cost improvement in this number of iterations + // + int max_no_improvement; + + bool print_forward_graph; + bool print_backward_graph; + + int n_gradient_accumulation; + + // ADAM parameters + struct { + int n_iter; + + float sched; // schedule multiplier (fixed, decay or warmup) + float decay; // weight decay for AdamW, use 0.0f to disable + int decay_min_ndim; // minimum number of tensor dimension to apply weight decay + float alpha; // learning rate + float beta1; + float beta2; + float eps; // epsilon for numerical stability + float eps_f; // epsilon for convergence test + float eps_g; // epsilon for convergence test + float gclip; // gradient clipping + } adam; + + // LBFGS parameters + struct { + int m; // number of corrections to approximate the inv. 
Hessian + int n_iter; + int max_linesearch; + + float eps; // convergence tolerance + float ftol; // line search tolerance + float wolfe; + float min_step; + float max_step; + + enum ggml_v3_linesearch linesearch; + } lbfgs; + }; + + struct ggml_v3_opt_context { + struct ggml_v3_context * ctx; + struct ggml_v3_opt_params params; + + int iter; + int64_t nx; // number of parameter elements + + bool just_initialized; + + float loss_before; + float loss_after; + + struct { + struct ggml_v3_tensor * g; // current gradient + struct ggml_v3_tensor * m; // first moment + struct ggml_v3_tensor * v; // second moment + struct ggml_v3_tensor * pf; // past function values + float fx_best; + float fx_prev; + int n_no_improvement; + } adam; + + struct { + struct ggml_v3_tensor * x; // current parameters + struct ggml_v3_tensor * xp; // previous parameters + struct ggml_v3_tensor * g; // current gradient + struct ggml_v3_tensor * gp; // previous gradient + struct ggml_v3_tensor * d; // search direction + struct ggml_v3_tensor * pf; // past function values + struct ggml_v3_tensor * lmal; // the L-BFGS memory alpha + struct ggml_v3_tensor * lmys; // the L-BFGS memory ys + struct ggml_v3_tensor * lms; // the L-BFGS memory s + struct ggml_v3_tensor * lmy; // the L-BFGS memory y + float fx_best; + float step; + int j; + int k; + int end; + int n_no_improvement; + } lbfgs; + }; + + GGML_V3_API struct ggml_v3_opt_params ggml_v3_opt_default_params(enum ggml_v3_opt_type type); + + // optimize the function defined by the tensor f + GGML_V3_API enum ggml_v3_opt_result ggml_v3_opt( + struct ggml_v3_context * ctx, + struct ggml_v3_opt_params params, + struct ggml_v3_tensor * f); + + // initialize optimizer context + GGML_V3_API void ggml_v3_opt_init( + struct ggml_v3_context * ctx, + struct ggml_v3_opt_context * opt, + struct ggml_v3_opt_params params, + int64_t nx); + + // continue optimizing the function defined by the tensor f + GGML_V3_API enum ggml_v3_opt_result ggml_v3_opt_resume( + struct ggml_v3_context * ctx, + struct ggml_v3_opt_context * opt, + struct ggml_v3_tensor * f); + + // continue optimizing the function defined by the tensor f + GGML_V3_API enum ggml_v3_opt_result ggml_v3_opt_resume_g( + struct ggml_v3_context * ctx, + struct ggml_v3_opt_context * opt, + struct ggml_v3_tensor * f, + struct ggml_v3_cgraph * gf, + struct ggml_v3_cgraph * gb, + ggml_v3_opt_callback callback, + void * callback_data); + + // + // quantization + // + + // TODO: these would probably get removed in favor of the more general ggml_v3_quantize_chunk + GGML_V3_API size_t ggml_v3_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_V3_API size_t ggml_v3_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_V3_API size_t ggml_v3_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_V3_API size_t ggml_v3_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_V3_API size_t ggml_v3_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist); + + GGML_V3_API size_t ggml_v3_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_V3_API size_t ggml_v3_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_V3_API size_t ggml_v3_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_V3_API size_t ggml_v3_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_V3_API size_t ggml_v3_quantize_q6_K(const 
float * src, void * dst, int n, int k, int64_t * hist); + GGML_V3_API size_t ggml_v3_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_V3_API size_t ggml_v3_quantize_iq2_xs (const float * src, void * dst, int n, int k, int64_t * hist); + + GGML_V3_API size_t ggml_v3_quantize_chunk(enum ggml_v3_type type, const float * src, void * dst, int start, int n, int64_t * hist); + + // + // Importance matrix + // + typedef void(*ggml_v3_collect_imatrix_t)(const struct ggml_v3_tensor * src0, const struct ggml_v3_tensor * src1); + GGML_V3_API void ggml_v3_set_imatrix_collection(ggml_v3_collect_imatrix_t imatrix_collect); + + // + // gguf + // + + enum gguf_v3_type { + GGUF_V3_TYPE_UINT8 = 0, + GGUF_V3_TYPE_INT8 = 1, + GGUF_V3_TYPE_UINT16 = 2, + GGUF_V3_TYPE_INT16 = 3, + GGUF_V3_TYPE_UINT32 = 4, + GGUF_V3_TYPE_INT32 = 5, + GGUF_V3_TYPE_FLOAT32 = 6, + GGUF_V3_TYPE_BOOL = 7, + GGUF_V3_TYPE_STRING = 8, + GGUF_V3_TYPE_ARRAY = 9, + GGUF_V3_TYPE_UINT64 = 10, + GGUF_V3_TYPE_INT64 = 11, + GGUF_V3_TYPE_FLOAT64 = 12, + GGUF_V3_TYPE_COUNT, // marks the end of the enum + }; + + struct gguf_v3_context; + + struct gguf_v3_init_params { + bool no_alloc; + + // if not NULL, create a ggml_v3_context and allocate the tensor data in it + struct ggml_v3_context ** ctx; + }; + + GGML_V3_API struct gguf_v3_context * gguf_v3_init_empty(void); + GGML_V3_API struct gguf_v3_context * gguf_v3_init_from_file(const char * fname, struct gguf_v3_init_params params); + //GGML_V3_API struct gguf_v3_context * gguf_v3_init_from_buffer(..); + + GGML_V3_API void gguf_v3_free(struct gguf_v3_context * ctx); + + GGML_V3_API const char * gguf_v3_type_name(enum gguf_v3_type type); + + GGML_V3_API int gguf_v3_get_version (const struct gguf_v3_context * ctx); + GGML_V3_API size_t gguf_v3_get_alignment (const struct gguf_v3_context * ctx); + GGML_V3_API size_t gguf_v3_get_data_offset(const struct gguf_v3_context * ctx); + GGML_V3_API void * gguf_v3_get_data (const struct gguf_v3_context * ctx); + + GGML_V3_API int gguf_v3_get_n_kv(const struct gguf_v3_context * ctx); + GGML_V3_API int gguf_v3_find_key(const struct gguf_v3_context * ctx, const char * key); + GGML_V3_API const char * gguf_v3_get_key (const struct gguf_v3_context * ctx, int key_id); + + GGML_V3_API enum gguf_v3_type gguf_v3_get_kv_type (const struct gguf_v3_context * ctx, int key_id); + GGML_V3_API enum gguf_v3_type gguf_v3_get_arr_type(const struct gguf_v3_context * ctx, int key_id); + + // will abort if the wrong type is used for the key + GGML_V3_API uint8_t gguf_v3_get_val_u8 (const struct gguf_v3_context * ctx, int key_id); + GGML_V3_API int8_t gguf_v3_get_val_i8 (const struct gguf_v3_context * ctx, int key_id); + GGML_V3_API uint16_t gguf_v3_get_val_u16 (const struct gguf_v3_context * ctx, int key_id); + GGML_V3_API int16_t gguf_v3_get_val_i16 (const struct gguf_v3_context * ctx, int key_id); + GGML_V3_API uint32_t gguf_v3_get_val_u32 (const struct gguf_v3_context * ctx, int key_id); + GGML_V3_API int32_t gguf_v3_get_val_i32 (const struct gguf_v3_context * ctx, int key_id); + GGML_V3_API float gguf_v3_get_val_f32 (const struct gguf_v3_context * ctx, int key_id); + GGML_V3_API uint64_t gguf_v3_get_val_u64 (const struct gguf_v3_context * ctx, int key_id); + GGML_V3_API int64_t gguf_v3_get_val_i64 (const struct gguf_v3_context * ctx, int key_id); + GGML_V3_API double gguf_v3_get_val_f64 (const struct gguf_v3_context * ctx, int key_id); + GGML_V3_API bool gguf_v3_get_val_bool(const struct gguf_v3_context * ctx, int key_id); + GGML_V3_API const 
char * gguf_v3_get_val_str (const struct gguf_v3_context * ctx, int key_id); + GGML_V3_API const void * gguf_v3_get_val_data(const struct gguf_v3_context * ctx, int key_id); + GGML_V3_API int gguf_v3_get_arr_n (const struct gguf_v3_context * ctx, int key_id); + GGML_V3_API const void * gguf_v3_get_arr_data(const struct gguf_v3_context * ctx, int key_id); + GGML_V3_API const char * gguf_v3_get_arr_str (const struct gguf_v3_context * ctx, int key_id, int i); + + GGML_V3_API int gguf_v3_get_n_tensors (const struct gguf_v3_context * ctx); + GGML_V3_API int gguf_v3_find_tensor (const struct gguf_v3_context * ctx, const char * name); + GGML_V3_API size_t gguf_v3_get_tensor_offset(const struct gguf_v3_context * ctx, int i); + GGML_V3_API char * gguf_v3_get_tensor_name (const struct gguf_v3_context * ctx, int i); + GGML_V3_API enum ggml_v3_type gguf_v3_get_tensor_type (const struct gguf_v3_context * ctx, int i); + + // overrides existing values or adds a new one + GGML_V3_API void gguf_v3_set_val_u8 (struct gguf_v3_context * ctx, const char * key, uint8_t val); + GGML_V3_API void gguf_v3_set_val_i8 (struct gguf_v3_context * ctx, const char * key, int8_t val); + GGML_V3_API void gguf_v3_set_val_u16 (struct gguf_v3_context * ctx, const char * key, uint16_t val); + GGML_V3_API void gguf_v3_set_val_i16 (struct gguf_v3_context * ctx, const char * key, int16_t val); + GGML_V3_API void gguf_v3_set_val_u32 (struct gguf_v3_context * ctx, const char * key, uint32_t val); + GGML_V3_API void gguf_v3_set_val_i32 (struct gguf_v3_context * ctx, const char * key, int32_t val); + GGML_V3_API void gguf_v3_set_val_f32 (struct gguf_v3_context * ctx, const char * key, float val); + GGML_V3_API void gguf_v3_set_val_u64 (struct gguf_v3_context * ctx, const char * key, uint64_t val); + GGML_V3_API void gguf_v3_set_val_i64 (struct gguf_v3_context * ctx, const char * key, int64_t val); + GGML_V3_API void gguf_v3_set_val_f64 (struct gguf_v3_context * ctx, const char * key, double val); + GGML_V3_API void gguf_v3_set_val_bool(struct gguf_v3_context * ctx, const char * key, bool val); + GGML_V3_API void gguf_v3_set_val_str (struct gguf_v3_context * ctx, const char * key, const char * val); + GGML_V3_API void gguf_v3_set_arr_data(struct gguf_v3_context * ctx, const char * key, enum gguf_v3_type type, const void * data, int n); + GGML_V3_API void gguf_v3_set_arr_str (struct gguf_v3_context * ctx, const char * key, const char ** data, int n); + + // set or add KV pairs from another context + GGML_V3_API void gguf_v3_set_kv(struct gguf_v3_context * ctx, struct gguf_v3_context * src); + + // manage tensor info + GGML_V3_API void gguf_v3_add_tensor(struct gguf_v3_context * ctx, const struct ggml_v3_tensor * tensor); + GGML_V3_API void gguf_v3_set_tensor_type(struct gguf_v3_context * ctx, const char * name, enum ggml_v3_type type); + GGML_V3_API void gguf_v3_set_tensor_data(struct gguf_v3_context * ctx, const char * name, const void * data, size_t size); + + // writing gguf files can be done in 2 ways: + // + // - write the entire gguf_v3_context to a binary file in a single pass: + // + // gguf_v3_write_to_file(ctx, fname); + // + // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data: + // + // FILE * f = fopen(fname, "wb"); + // fseek(f, gguf_v3_get_meta_size(ctx), SEEK_SET); + // fwrite(f, ...); + // void * data = gguf_v3_meta_get_meta_data(ctx); + // fseek(f, 0, SEEK_SET); + // fwrite(f, data, gguf_v3_get_meta_size(ctx)); + // free(data); + // fclose(f); + // + + // write 
the entire context to a binary file + GGML_V3_API void gguf_v3_write_to_file(const struct gguf_v3_context * ctx, const char * fname, bool only_meta); + + // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding + GGML_V3_API size_t gguf_v3_get_meta_size(const struct gguf_v3_context * ctx); + GGML_V3_API void gguf_v3_get_meta_data(const struct gguf_v3_context * ctx, void * data); + + // + // system info + // + + GGML_V3_API int ggml_v3_cpu_has_avx (void); + GGML_V3_API int ggml_v3_cpu_has_avx_vnni (void); + GGML_V3_API int ggml_v3_cpu_has_avx2 (void); + GGML_V3_API int ggml_v3_cpu_has_avx512 (void); + GGML_V3_API int ggml_v3_cpu_has_avx512_vbmi(void); + GGML_V3_API int ggml_v3_cpu_has_avx512_vnni(void); + GGML_V3_API int ggml_v3_cpu_has_fma (void); + GGML_V3_API int ggml_v3_cpu_has_neon (void); + GGML_V3_API int ggml_v3_cpu_has_arm_fma (void); + GGML_V3_API int ggml_v3_cpu_has_metal (void); + GGML_V3_API int ggml_v3_cpu_has_f16c (void); + GGML_V3_API int ggml_v3_cpu_has_fp16_va (void); + GGML_V3_API int ggml_v3_cpu_has_wasm_simd (void); + GGML_V3_API int ggml_v3_cpu_has_blas (void); + GGML_V3_API int ggml_v3_cpu_has_cublas (void); + GGML_V3_API int ggml_v3_cpu_has_clblast (void); + GGML_V3_API int ggml_v3_cpu_has_gpublas (void); + GGML_V3_API int ggml_v3_cpu_has_sse3 (void); + GGML_V3_API int ggml_v3_cpu_has_ssse3 (void); + GGML_V3_API int ggml_v3_cpu_has_vsx (void); + + // + // Internal types and functions exposed for tests and benchmarks + // + +#ifdef __cplusplus +// restrict not standard in C++ +#define GGML_V3_RESTRICT +#else +#define GGML_V3_RESTRICT restrict +#endif + typedef void (*ggml_v3_to_float_t) (const void * GGML_V3_RESTRICT x, float * GGML_V3_RESTRICT y, int k); + typedef void (*ggml_v3_from_float_t)(const float * GGML_V3_RESTRICT x, void * GGML_V3_RESTRICT y, int k); + typedef void (*ggml_v3_vec_dot_t) (const int n, float * GGML_V3_RESTRICT s, const void * GGML_V3_RESTRICT x, const void * GGML_V3_RESTRICT y); + + typedef struct { + const char * type_name; + int blck_size; + size_t type_size; + bool is_quantized; + ggml_v3_to_float_t to_float; + ggml_v3_from_float_t from_float; + ggml_v3_from_float_t from_float_reference; + ggml_v3_vec_dot_t vec_dot; + enum ggml_v3_type vec_dot_type; + } ggml_v3_type_traits_t; + + GGML_V3_API ggml_v3_type_traits_t ggml_v3_internal_get_type_traits(enum ggml_v3_type type); + +#ifdef __cplusplus +} +#endif diff --git a/otherarch/gpt2_v3.cpp b/otherarch/gpt2_v3.cpp index 8914df2ee..e766c963e 100644 --- a/otherarch/gpt2_v3.cpp +++ b/otherarch/gpt2_v3.cpp @@ -1,4 +1,4 @@ -#include "ggml.h" +#include "ggml_v3.h" #include "otherarch.h" #include "utils.h" @@ -17,10 +17,10 @@ #include "model_adapter.h" #ifdef GGML_USE_CUBLAS -#include "ggml-cuda.h" +#include "ggml_v3-cuda.h" #endif #if defined(GGML_USE_CLBLAST) -#include "ggml-opencl.h" +#include "ggml_v3-opencl.h" #endif @@ -57,7 +57,7 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + const int32_t qntvr = hparams.ftype / GGML_V3_QNT_VERSION_FACTOR; printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); printf("%s: n_ctx = %d (%d)\n", __func__, hparams.n_ctx,origmaxctx); @@ -67,7 +67,7 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g printf("%s: ftype = %d\n", __func__, hparams.ftype); printf("%s: qntvr = %d\n", 
__func__, qntvr); - hparams.ftype %= GGML_QNT_VERSION_FACTOR; + hparams.ftype %= GGML_V3_QNT_VERSION_FACTOR; } // load vocab @@ -113,8 +113,8 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g // for the big tensors, we have the option to store the data in 16-bit floats or quantized // in order to save memory and also to speed up the computation - ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); - if (wtype == GGML_TYPE_COUNT) { + ggml_v3_type wtype = ggml_v3_ftype_to_ggml_v3_type((ggml_v3_ftype) (model.hparams.ftype)); + if (wtype == GGML_V3_TYPE_COUNT) { fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", __func__, fname.c_str(), model.hparams.ftype); return ModelLoadResult::FAIL; @@ -136,33 +136,33 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g const int kv_heads = hparams.n_head; // 1 if MQA else hparams.n_head const int kv_dim = kv_heads * head_dim; - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + ctx_size += n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32); // ln_f_g + ctx_size += n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32); // ln_f_b - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte - ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head + ctx_size += n_vocab*n_embd*ggml_v3_type_sizef(wtype); // wte + ctx_size += n_ctx*n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32); // wpe + ctx_size += n_vocab*n_embd*ggml_v3_type_sizef(wtype); // lm_head - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + ctx_size += n_layer*(n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // ln_1_g + ctx_size += n_layer*(n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // ln_1_b - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + ctx_size += n_layer*(n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // ln_2_g + ctx_size += n_layer*(n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // ln_2_b - ctx_size += n_layer*((n_embd + 2*kv_dim)*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w // TODO: - ctx_size += n_layer*( (n_embd + 2*kv_dim)*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + ctx_size += n_layer*((n_embd + 2*kv_dim)*n_embd*ggml_v3_type_sizef(wtype)); // c_attn_attn_w // TODO: + ctx_size += n_layer*( (n_embd + 2*kv_dim)*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // c_attn_attn_b - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + ctx_size += n_layer*(n_embd*n_embd*ggml_v3_type_sizef(wtype)); // c_attn_proj_w + ctx_size += n_layer*( n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // c_attn_proj_b - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w - ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + ctx_size += n_layer*(4*n_embd*n_embd*ggml_v3_type_sizef(wtype)); // c_mlp_fc_w + ctx_size += n_layer*( 4*n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // c_mlp_fc_b - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + ctx_size += n_layer*(4*n_embd*n_embd*ggml_v3_type_sizef(wtype)); // c_mlp_proj_w + 
ctx_size += n_layer*( n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // c_mlp_proj_b - ctx_size += std::max(origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k - ctx_size += std::max(origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v + ctx_size += std::max(origmaxctx,n_ctx)*n_layer*n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F16); // memory_k + ctx_size += std::max(origmaxctx,n_ctx)*n_layer*n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F16); // memory_v ctx_size += (6 + 12*n_layer)*1024; // object overhead @@ -171,14 +171,14 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g // create the ggml context { - struct ggml_init_params params; + struct ggml_v3_init_params params; params.mem_size = ctx_size; params.mem_buffer = NULL; params.no_alloc = false; - model.ctx = ggml_init(params); + model.ctx = ggml_v3_init(params); if (!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); + fprintf(stderr, "%s: ggml_v3_init() failed\n", __func__); return ModelLoadResult::FAIL; } } @@ -198,12 +198,12 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g model.layers.resize(n_layer); - model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + model.ln_f_g = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, n_embd); + model.ln_f_b = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, n_embd); - model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx); - model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + model.wte = ggml_v3_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + model.wpe = ggml_v3_new_tensor_2d(ctx, GGML_V3_TYPE_F32, n_embd, n_ctx); + model.lm_head = ggml_v3_new_tensor_2d(ctx, wtype, n_embd, n_vocab); // map by name model.tensors["model/ln_f/g"] = model.ln_f_g; @@ -216,23 +216,23 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g for (int i = 0; i < n_layer; ++i) { auto & layer = model.layers[i]; - layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_1_g = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, n_embd); + layer.ln_1_b = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, n_embd); - layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_2_g = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, n_embd); + layer.ln_2_b = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, n_embd); - layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd + 2*kv_dim); - layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd + 2*kv_dim); + layer.c_attn_attn_w = ggml_v3_new_tensor_2d(ctx, wtype, n_embd, n_embd + 2*kv_dim); + layer.c_attn_attn_b = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, n_embd + 2*kv_dim); - layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.c_attn_proj_w = ggml_v3_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.c_attn_proj_b = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, n_embd); - layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); //TODO: 4*n_embd = config.n_inner - layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + layer.c_mlp_fc_w = ggml_v3_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); 
//TODO: 4*n_embd = config.n_inner + layer.c_mlp_fc_b = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, 4*n_embd); - layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); - layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.c_mlp_proj_w = ggml_v3_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + layer.c_mlp_proj_b = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, n_embd); // map by name model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g; @@ -266,10 +266,10 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g const int n_mem = n_layer*std::max(origmaxctx,n_ctx); const int n_elements = n_embd*n_mem; - model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + model.memory_k = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F16, n_elements); + model.memory_v = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F16, n_elements); - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + const size_t memory_size = ggml_v3_nbytes(model.memory_k) + ggml_v3_nbytes(model.memory_v); printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); } @@ -314,37 +314,37 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); return ModelLoadResult::FAIL; } - if (ggml_nelements(tensor) != nelements) { + if (ggml_v3_nelements(tensor) != nelements) { fprintf(stderr, "%s: tensor '%s' has wrong size in model file. got %d, expected %d\n", - __func__, name.data(), (int) ggml_nelements(tensor), nelements); + __func__, name.data(), (int) ggml_v3_nelements(tensor), nelements); return ModelLoadResult::FAIL; } // for debugging if (0) { - printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_v3_type_name(ggml_v3_type(ttype)), ggml_v3_nbytes(tensor)/1024.0/1024.0, ggml_v3_nbytes(tensor)); } - const size_t bpe = ggml_type_size(ggml_type(ttype)); + const size_t bpe = ggml_v3_type_size(ggml_v3_type(ttype)); - if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + if ((nelements*bpe)/ggml_v3_blck_size(tensor->type) != ggml_v3_nbytes(tensor)) { fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", - __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); + __func__, name.data(), ggml_v3_nbytes(tensor), nelements*bpe); return ModelLoadResult::FAIL; } - fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + fin.read(reinterpret_cast(tensor->data), ggml_v3_nbytes(tensor)); // GPT-2 models share the WTE tensor as the LM head if (name == "model/wte" && has_lm_head == false) { - memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor)); + memcpy(model.lm_head->data, tensor->data, ggml_v3_nbytes(tensor)); } if (name == "model/lm_head") { has_lm_head = true; } - total_size += ggml_nbytes(tensor); + total_size += ggml_v3_nbytes(tensor); } printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); @@ -366,20 +366,20 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g #endif for (int i = 0; i < n_gpu; ++i) { const auto & layer = model.layers[i]; - layer.c_attn_attn_w->backend 
= GGML_BACKEND_GPU; - layer.c_attn_proj_w->backend = GGML_BACKEND_GPU; - layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU; - layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU; + layer.c_attn_attn_w->backend = GGML_V3_BACKEND_GPU; + layer.c_attn_proj_w->backend = GGML_V3_BACKEND_GPU; + layer.c_mlp_fc_w->backend = GGML_V3_BACKEND_GPU; + layer.c_mlp_proj_w->backend = GGML_V3_BACKEND_GPU; #if defined(GGML_USE_CLBLAST) - ggml_cl_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w); - ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w); - ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w); - ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w); + ggml_v3_cl_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_v3_nbytes(layer.c_attn_attn_w); + ggml_v3_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_v3_nbytes(layer.c_attn_proj_w); + ggml_v3_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_v3_nbytes(layer.c_mlp_fc_w); + ggml_v3_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_v3_nbytes(layer.c_mlp_proj_w); #else - ggml_cuda_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w); - ggml_cuda_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w); - ggml_cuda_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w); - ggml_cuda_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w); + ggml_v3_cuda_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_v3_nbytes(layer.c_attn_attn_w); + ggml_v3_cuda_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_v3_nbytes(layer.c_attn_proj_w); + ggml_v3_cuda_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_v3_nbytes(layer.c_mlp_fc_w); + ggml_v3_cuda_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_v3_nbytes(layer.c_mlp_proj_w); #endif } #if defined(GGML_USE_CLBLAST) @@ -448,48 +448,48 @@ bool gpt2_eval( } } - struct ggml_init_params params; + struct ggml_v3_init_params params; params.mem_size = buf_size; params.mem_buffer = buf; params.no_alloc = false; - struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, 8192, false); + struct ggml_v3_context * ctx0 = ggml_v3_init(params); + struct ggml_v3_cgraph * gf = ggml_v3_new_graph_custom(ctx0, 8192, false); - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); + struct ggml_v3_tensor * embd = ggml_v3_new_tensor_1d(ctx0, GGML_V3_TYPE_I32, N); + memcpy(embd->data, embd_inp.data(), N*ggml_v3_element_size(embd)); - struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + struct ggml_v3_tensor * position = ggml_v3_new_tensor_1d(ctx0, GGML_V3_TYPE_I32, N); for (int i = 0; i < N; ++i) { ((int32_t *) position->data)[i] = n_past + i; } // wte + wpe - struct ggml_tensor * inpL = - ggml_add(ctx0, - ggml_get_rows(ctx0, model.wte, embd), - ggml_get_rows(ctx0, model.wpe, 
position)); + struct ggml_v3_tensor * inpL = + ggml_v3_add(ctx0, + ggml_v3_get_rows(ctx0, model.wte, embd), + ggml_v3_get_rows(ctx0, model.wpe, position)); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * cur; + struct ggml_v3_tensor * cur; if(use_scratch){ - ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + ggml_v3_set_scratch(ctx0, { 0, scr0_size, scr0, }); } // norm { // [ 768, N] - cur = ggml_norm(ctx0, inpL, default_norm_eps); + cur = ggml_v3_norm(ctx0, inpL, default_norm_eps); // cur = ln_1_g*cur + ln_1_b // [ 768, N] - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].ln_1_g, cur), + cur = ggml_v3_add(ctx0, + ggml_v3_mul(ctx0, + ggml_v3_repeat(ctx0, model.layers[il].ln_1_g, cur), cur), - ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); + ggml_v3_repeat(ctx0, model.layers[il].ln_1_b, cur)); } // attn @@ -501,104 +501,104 @@ bool gpt2_eval( // cur = attn_w*cur + attn_b // [2304, N] { - cur = ggml_mul_mat(ctx0, + cur = ggml_v3_mul_mat(ctx0, model.layers[il].c_attn_attn_w, cur); - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur), + cur = ggml_v3_add(ctx0, + ggml_v3_repeat(ctx0, model.layers[il].c_attn_attn_b, cur), cur); } // self-attention { - struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); - struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); - struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); + struct ggml_v3_tensor * Qcur = ggml_v3_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); + struct ggml_v3_tensor * Kcur = ggml_v3_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); + struct ggml_v3_tensor * Vcur = ggml_v3_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); // store key and value to memory if (N >= 1) { - struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); + struct ggml_v3_tensor * k = ggml_v3_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_v3_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_v3_tensor * v = ggml_v3_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_v3_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + ggml_v3_build_forward_expand(gf, ggml_v3_cpy(ctx0, Kcur, k)); + ggml_v3_build_forward_expand(gf, ggml_v3_cpy(ctx0, Vcur, v)); } // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) // [64, N, 12] - struct ggml_tensor * Q = - ggml_permute(ctx0, - ggml_cpy(ctx0, + struct ggml_v3_tensor * Q = + ggml_v3_permute(ctx0, + ggml_v3_cpy(ctx0, Qcur, - ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), + ggml_v3_new_tensor_3d(ctx0, GGML_V3_TYPE_F32, n_embd/n_head, n_head, N)), 0, 2, 1, 3); // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) // [64, n_past + N, 12] - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + struct ggml_v3_tensor * K = + ggml_v3_permute(ctx0, + ggml_v3_reshape_3d(ctx0, + ggml_v3_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, 
il*n_ctx*ggml_v3_element_size(model.memory_k)*n_embd), n_embd/n_head, n_head, n_past + N), 0, 2, 1, 3); //TODO: need to be tiled // GG: flash attention - //struct ggml_tensor * V = - // ggml_cpy(ctx0, - // ggml_permute(ctx0, - // ggml_reshape_3d(ctx0, - // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + //struct ggml_v3_tensor * V = + // ggml_v3_cpy(ctx0, + // ggml_v3_permute(ctx0, + // ggml_v3_reshape_3d(ctx0, + // ggml_v3_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_v3_element_size(model.memory_v)*n_embd), // n_embd/n_head, n_head, n_past + N), // 1, 2, 0, 3), - // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); + // ggml_v3_new_tensor_3d(ctx0, GGML_V3_TYPE_F32, n_past + N, n_embd/n_head, n_head)); - //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); + //struct ggml_v3_tensor * KQV = ggml_v3_flash_attn(ctx0, Q, K, V, true); // K * Q // [n_past + N, N, 12] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); //TODO: check if it broadcasts + struct ggml_v3_tensor * KQ = ggml_v3_mul_mat(ctx0, K, Q); //TODO: check if it broadcasts // KQ_scaled = KQ / sqrt(n_embd/n_head) // [n_past + N, N, 12] - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, + struct ggml_v3_tensor * KQ_scaled = + ggml_v3_scale_inplace(ctx0, KQ, 1.0f/sqrt(float(n_embd)/n_head) ); // KQ_masked = mask_past(KQ_scaled) // [n_past + N, N, 12] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + struct ggml_v3_tensor * KQ_masked = ggml_v3_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); // KQ = soft_max(KQ_masked) // [n_past + N, N, 12] - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + struct ggml_v3_tensor * KQ_soft_max = ggml_v3_soft_max_inplace(ctx0, KQ_masked); // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() // [n_past + N, 64, 12] - struct ggml_tensor * V_trans = - ggml_cpy(ctx0, - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + struct ggml_v3_tensor * V_trans = + ggml_v3_cpy(ctx0, + ggml_v3_permute(ctx0, + ggml_v3_reshape_3d(ctx0, + ggml_v3_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_v3_element_size(model.memory_v)*n_embd), n_embd/n_head, n_head, n_past + N), 1, 2, 0, 3), - ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head)); + ggml_v3_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head)); // KQV = transpose(V) * KQ_soft_max // [64, N, 12] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); + struct ggml_v3_tensor * KQV = ggml_v3_mul_mat(ctx0, V_trans, KQ_soft_max); // KQV_merged = KQV.permute(0, 2, 1, 3) // [64, 12, N] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + struct ggml_v3_tensor * KQV_merged = ggml_v3_permute(ctx0, KQV, 0, 2, 1, 3); // cur = KQV_merged.contiguous().view(n_embd, N) // [768, N] - cur = ggml_cpy(ctx0, + cur = ggml_v3_cpy(ctx0, KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + ggml_v3_new_tensor_2d(ctx0, GGML_V3_TYPE_F32, n_embd, N)); } // projection @@ -610,37 +610,37 @@ bool gpt2_eval( // cur = proj_w*cur + proj_b // [768, N] { - cur = ggml_mul_mat(ctx0, + cur = ggml_v3_mul_mat(ctx0, model.layers[il].c_attn_proj_w, cur); - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), + cur = 
ggml_v3_add(ctx0, + ggml_v3_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), cur); } // add the input - cur = ggml_add(ctx0, cur, inpL); + cur = ggml_v3_add(ctx0, cur, inpL); - struct ggml_tensor * inpFF = cur; + struct ggml_v3_tensor * inpFF = cur; if(use_scratch){ - ggml_set_scratch(ctx0, { 0, scr1_size, scr1, }); + ggml_v3_set_scratch(ctx0, { 0, scr1_size, scr1, }); } // feed-forward network { // norm { - cur = ggml_norm(ctx0, inpFF, default_norm_eps); + cur = ggml_v3_norm(ctx0, inpFF, default_norm_eps); // cur = ln_2_g*cur + ln_2_b // [ 768, N] - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].ln_2_g, cur), + cur = ggml_v3_add(ctx0, + ggml_v3_mul(ctx0, + ggml_v3_repeat(ctx0, model.layers[il].ln_2_g, cur), cur), - ggml_repeat(ctx0, model.layers[il].ln_2_b, cur)); + ggml_v3_repeat(ctx0, model.layers[il].ln_2_b, cur)); } // fully connected @@ -651,17 +651,17 @@ bool gpt2_eval( // // cur = fc_w*cur + fc_b // [3072, N] - cur = ggml_mul_mat(ctx0, + cur = ggml_v3_mul_mat(ctx0, model.layers[il].c_mlp_fc_w, cur); - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur), + cur = ggml_v3_add(ctx0, + ggml_v3_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur), cur); // GELU activation // [3072, N] - cur = ggml_gelu(ctx0, cur); + cur = ggml_v3_gelu(ctx0, cur); // projection // [ 768, 3072] - model.layers[il].c_mlp_proj_w @@ -671,71 +671,71 @@ bool gpt2_eval( // // cur = proj_w*cur + proj_b // [768, N] - cur = ggml_mul_mat(ctx0, + cur = ggml_v3_mul_mat(ctx0, model.layers[il].c_mlp_proj_w, cur); - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur), + cur = ggml_v3_add(ctx0, + ggml_v3_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur), cur); } // input for next layer - inpL = ggml_add(ctx0, cur, inpFF); + inpL = ggml_v3_add(ctx0, cur, inpFF); } if(use_scratch){ - ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + ggml_v3_set_scratch(ctx0, { 0, scr0_size, scr0, }); } // norm { // [ 768, N] - inpL = ggml_norm(ctx0, inpL, default_norm_eps); + inpL = ggml_v3_norm(ctx0, inpL, default_norm_eps); // inpL = ln_f_g*inpL + ln_f_b // [ 768, N] - inpL = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.ln_f_g, inpL), + inpL = ggml_v3_add(ctx0, + ggml_v3_mul(ctx0, + ggml_v3_repeat(ctx0, model.ln_f_g, inpL), inpL), - ggml_repeat(ctx0, model.ln_f_b, inpL)); + ggml_v3_repeat(ctx0, model.ln_f_b, inpL)); } if(use_scratch){ - ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + ggml_v3_set_scratch(ctx0, { 0, 0, nullptr, }); } // inpL = WTE * inpL // [ 768, 50257] - model.lm_head // [ 768, N] - inpL - inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); + inpL = ggml_v3_mul_mat(ctx0, model.lm_head, inpL); // logits -> probs - //inpL = ggml_soft_max_inplace(ctx0, inpL); + //inpL = ggml_v3_soft_max_inplace(ctx0, inpL); // run the computation - ggml_build_forward_expand(gf, inpL); + ggml_v3_build_forward_expand(gf, inpL); kcpp_graph_compute_helper(gf, n_threads); //if (n_past%100 == 0) { - // ggml_graph_print (&gf); - // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); + // ggml_v3_graph_print (&gf); + // ggml_v3_graph_dump_dot(&gf, NULL, "gpt-2.dot"); //} //embd_w.resize(n_vocab*N); - //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + //memcpy(embd_w.data(), ggml_v3_get_data(inpL), sizeof(float)*n_vocab*N); // return result just for the last token embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + memcpy(embd_w.data(), (float *) ggml_v3_get_data(inpL) + (n_vocab*(N-1)), 
sizeof(float)*n_vocab); if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0)/N; + mem_per_token = ggml_v3_used_mem(ctx0)/N; } - //printf("used_mem = %zu MB\n", ggml_used_mem(ctx0)/(1024*1024)); + //printf("used_mem = %zu MB\n", ggml_v3_used_mem(ctx0)/(1024*1024)); - ggml_free(ctx0); + ggml_v3_free(ctx0); return true; } \ No newline at end of file diff --git a/otherarch/gptj_v3.cpp b/otherarch/gptj_v3.cpp index ed7a44a5a..3a06b1c7d 100644 --- a/otherarch/gptj_v3.cpp +++ b/otherarch/gptj_v3.cpp @@ -1,4 +1,4 @@ -#include "ggml.h" +#include "ggml_v3.h" #include "otherarch.h" #include "utils.h" @@ -17,10 +17,10 @@ #include "model_adapter.h" #ifdef GGML_USE_CUBLAS -#include "ggml-cuda.h" +#include "ggml_v3-cuda.h" #endif #if defined(GGML_USE_CLBLAST) -#include "ggml-opencl.h" +#include "ggml_v3-opencl.h" #endif // load the model's weights from a file @@ -57,7 +57,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + const int32_t qntvr = hparams.ftype / GGML_V3_QNT_VERSION_FACTOR; printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); printf("%s: n_ctx = %d (%d)\n", __func__, hparams.n_ctx,origmaxctx); @@ -70,7 +70,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g hparams.n_ctx = std::max(origmaxctx,hparams.n_ctx); - hparams.ftype %= GGML_QNT_VERSION_FACTOR; + hparams.ftype %= GGML_V3_QNT_VERSION_FACTOR; } // load vocab @@ -102,8 +102,8 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g // for the big tensors, we have the option to store the data in 16-bit floats or quantized // in order to save memory and also to speed up the computation - ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); - if (wtype == GGML_TYPE_COUNT) { + ggml_v3_type wtype = ggml_v3_ftype_to_ggml_v3_type((ggml_v3_ftype) (model.hparams.ftype)); + if (wtype == GGML_V3_TYPE_COUNT) { fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", __func__, fname.c_str(), model.hparams.ftype); return ModelLoadResult::FAIL; @@ -111,7 +111,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g auto & ctx = model.ctx; - auto memory_type = GGML_TYPE_F16; + auto memory_type = GGML_V3_TYPE_F16; size_t ctx_size = 0; @@ -123,31 +123,31 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + ctx_size += n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32); // ln_f_g + ctx_size += n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32); // ln_f_b - ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte + ctx_size += n_embd*n_vocab*ggml_v3_type_sizef(wtype); // wte - ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // lmh_g - ctx_size += n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b + ctx_size += n_embd*n_vocab*ggml_v3_type_sizef(wtype); // lmh_g + ctx_size += n_vocab*ggml_v3_type_sizef(GGML_V3_TYPE_F32); // lmh_b - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + ctx_size += n_layer*(n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // ln_1_g + ctx_size += 
n_layer*(n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // ln_1_b - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_q_proj_w - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_k_proj_w - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_v_proj_w + ctx_size += n_layer*(n_embd*n_embd*ggml_v3_type_sizef(wtype)); // c_attn_q_proj_w + ctx_size += n_layer*(n_embd*n_embd*ggml_v3_type_sizef(wtype)); // c_attn_k_proj_w + ctx_size += n_layer*(n_embd*n_embd*ggml_v3_type_sizef(wtype)); // c_attn_v_proj_w - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w + ctx_size += n_layer*(n_embd*n_embd*ggml_v3_type_sizef(wtype)); // c_attn_proj_w - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w - ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + ctx_size += n_layer*(4*n_embd*n_embd*ggml_v3_type_sizef(wtype)); // c_mlp_fc_w + ctx_size += n_layer*( 4*n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // c_mlp_fc_b - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + ctx_size += n_layer*(4*n_embd*n_embd*ggml_v3_type_sizef(wtype)); // c_mlp_proj_w + ctx_size += n_layer*( n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // c_mlp_proj_b - ctx_size += std::max(origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k - ctx_size += std::max(origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v + ctx_size += std::max(origmaxctx,n_ctx)*n_layer*n_embd*ggml_v3_type_sizef(memory_type); // memory_k + ctx_size += std::max(origmaxctx,n_ctx)*n_layer*n_embd*ggml_v3_type_sizef(memory_type); // memory_v ctx_size += (5 + 10*n_layer)*512; // object overhead @@ -156,15 +156,15 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g // create the ggml context { - struct ggml_init_params params; + struct ggml_v3_init_params params; params.mem_size = ctx_size; params.mem_buffer = NULL; params.no_alloc = false; - model.ctx = ggml_init(params); + model.ctx = ggml_v3_init(params); if (!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); + fprintf(stderr, "%s: ggml_v3_init() failed\n", __func__); return ModelLoadResult::FAIL; } } @@ -179,13 +179,13 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g model.layers.resize(n_layer); - model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + model.wte = ggml_v3_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + model.ln_f_g = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, n_embd); + model.ln_f_b = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, n_embd); - model.lmh_g = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - model.lmh_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab); + model.lmh_g = ggml_v3_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + model.lmh_b = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, n_vocab); // map by name model.tensors["transformer.wte.weight"] = model.wte; @@ -199,20 +199,20 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g for (int i = 0; i < n_layer; ++i) { auto & layer = model.layers[i]; - layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_1_g = 
ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, n_embd); + layer.ln_1_b = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, n_embd); - layer.c_attn_q_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.c_attn_k_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.c_attn_v_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.c_attn_q_proj_w = ggml_v3_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.c_attn_k_proj_w = ggml_v3_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.c_attn_v_proj_w = ggml_v3_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.c_attn_proj_w = ggml_v3_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); - layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + layer.c_mlp_fc_w = ggml_v3_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); + layer.c_mlp_fc_b = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, 4*n_embd); - layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); - layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.c_mlp_proj_w = ggml_v3_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + layer.c_mlp_proj_b = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, n_embd); // map by name model.tensors["transformer.h." + std::to_string(i) + ".ln_1.weight"] = layer.ln_1_g; @@ -243,10 +243,10 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g const int n_mem = n_layer*std::max(origmaxctx,n_ctx); const int n_elements = n_embd*n_mem; - model.memory_k = ggml_new_tensor_1d(ctx, memory_type, n_elements); - model.memory_v = ggml_new_tensor_1d(ctx, memory_type, n_elements); + model.memory_k = ggml_v3_new_tensor_1d(ctx, memory_type, n_elements); + model.memory_v = ggml_v3_new_tensor_1d(ctx, memory_type, n_elements); - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + const size_t memory_size = ggml_v3_nbytes(model.memory_k) + ggml_v3_nbytes(model.memory_v); printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); } @@ -287,7 +287,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g } auto tensor = model.tensors[name.data()]; - if (ggml_nelements(tensor) != nelements) { + if (ggml_v3_nelements(tensor) != nelements) { fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); return ModelLoadResult::FAIL; } @@ -299,7 +299,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g if(tensor->ne[0]==ne[1] && tensor->ne[1]==ne[0] && should_transpose_layer(name)) { printf("\nFound a transposed tensor. This could be an older or newer model. 
Retrying load..."); - ggml_free(ctx); + ggml_v3_free(ctx); return ModelLoadResult::RETRY_LOAD; } else @@ -313,21 +313,21 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g // for debugging if (0) { - printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_v3_type_name(ggml_v3_type(ttype)), ggml_v3_nbytes(tensor)/1024.0/1024.0, ggml_v3_nbytes(tensor)); } - const size_t bpe = ggml_type_size(ggml_type(ttype)); + const size_t bpe = ggml_v3_type_size(ggml_v3_type(ttype)); - if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + if ((nelements*bpe)/ggml_v3_blck_size(tensor->type) != ggml_v3_nbytes(tensor)) { fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", - __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); + __func__, name.data(), ggml_v3_nbytes(tensor), nelements*bpe); return ModelLoadResult::FAIL; } - fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + fin.read(reinterpret_cast(tensor->data), ggml_v3_nbytes(tensor)); - //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ttype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); - total_size += ggml_nbytes(tensor); + //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ttype == 0 ? "float" : "f16", ggml_v3_nbytes(tensor)/1024.0/1024.0); + total_size += ggml_v3_nbytes(tensor); if (++n_tensors % 8 == 0) { printf("."); fflush(stdout); @@ -355,26 +355,26 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g #endif for (int i = 0; i < n_gpu; ++i) { const auto & layer = model.layers[i]; - layer.c_attn_q_proj_w->backend = GGML_BACKEND_GPU; - layer.c_attn_k_proj_w->backend = GGML_BACKEND_GPU; - layer.c_attn_v_proj_w->backend = GGML_BACKEND_GPU; - layer.c_attn_proj_w->backend = GGML_BACKEND_GPU; - layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU; - layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU; + layer.c_attn_q_proj_w->backend = GGML_V3_BACKEND_GPU; + layer.c_attn_k_proj_w->backend = GGML_V3_BACKEND_GPU; + layer.c_attn_v_proj_w->backend = GGML_V3_BACKEND_GPU; + layer.c_attn_proj_w->backend = GGML_V3_BACKEND_GPU; + layer.c_mlp_fc_w->backend = GGML_V3_BACKEND_GPU; + layer.c_mlp_proj_w->backend = GGML_V3_BACKEND_GPU; #if defined(GGML_USE_CLBLAST) - ggml_cl_transform_tensor(layer.c_attn_q_proj_w->data,layer.c_attn_q_proj_w); vram_total += ggml_nbytes(layer.c_attn_q_proj_w); - ggml_cl_transform_tensor(layer.c_attn_k_proj_w->data,layer.c_attn_k_proj_w); vram_total += ggml_nbytes(layer.c_attn_k_proj_w); - ggml_cl_transform_tensor(layer.c_attn_v_proj_w->data,layer.c_attn_v_proj_w); vram_total += ggml_nbytes(layer.c_attn_v_proj_w); - ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w); - ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w); - ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w); + ggml_v3_cl_transform_tensor(layer.c_attn_q_proj_w->data,layer.c_attn_q_proj_w); vram_total += ggml_v3_nbytes(layer.c_attn_q_proj_w); + ggml_v3_cl_transform_tensor(layer.c_attn_k_proj_w->data,layer.c_attn_k_proj_w); vram_total += 
ggml_v3_nbytes(layer.c_attn_k_proj_w); + ggml_v3_cl_transform_tensor(layer.c_attn_v_proj_w->data,layer.c_attn_v_proj_w); vram_total += ggml_v3_nbytes(layer.c_attn_v_proj_w); + ggml_v3_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_v3_nbytes(layer.c_attn_proj_w); + ggml_v3_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_v3_nbytes(layer.c_mlp_fc_w); + ggml_v3_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_v3_nbytes(layer.c_mlp_proj_w); #else - ggml_cuda_transform_tensor(layer.c_attn_q_proj_w->data,layer.c_attn_q_proj_w); vram_total += ggml_nbytes(layer.c_attn_q_proj_w); - ggml_cuda_transform_tensor(layer.c_attn_k_proj_w->data,layer.c_attn_k_proj_w); vram_total += ggml_nbytes(layer.c_attn_k_proj_w); - ggml_cuda_transform_tensor(layer.c_attn_v_proj_w->data,layer.c_attn_v_proj_w); vram_total += ggml_nbytes(layer.c_attn_v_proj_w); - ggml_cuda_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w); - ggml_cuda_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w); - ggml_cuda_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w); + ggml_v3_cuda_transform_tensor(layer.c_attn_q_proj_w->data,layer.c_attn_q_proj_w); vram_total += ggml_v3_nbytes(layer.c_attn_q_proj_w); + ggml_v3_cuda_transform_tensor(layer.c_attn_k_proj_w->data,layer.c_attn_k_proj_w); vram_total += ggml_v3_nbytes(layer.c_attn_k_proj_w); + ggml_v3_cuda_transform_tensor(layer.c_attn_v_proj_w->data,layer.c_attn_v_proj_w); vram_total += ggml_v3_nbytes(layer.c_attn_v_proj_w); + ggml_v3_cuda_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_v3_nbytes(layer.c_attn_proj_w); + ggml_v3_cuda_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_v3_nbytes(layer.c_mlp_fc_w); + ggml_v3_cuda_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_v3_nbytes(layer.c_mlp_proj_w); #endif } #if defined(GGML_USE_CLBLAST) @@ -448,45 +448,45 @@ bool gptj_eval( } } - struct ggml_init_params params; + struct ggml_v3_init_params params; params.mem_size = buf_size; params.mem_buffer = buf; params.no_alloc = false; - struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_MAX_NODES, false); + struct ggml_v3_context * ctx0 = ggml_v3_init(params); + struct ggml_v3_cgraph * gf = ggml_v3_new_graph_custom(ctx0, GGML_V3_MAX_NODES, false); - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); + struct ggml_v3_tensor * embd = ggml_v3_new_tensor_1d(ctx0, GGML_V3_TYPE_I32, N); + memcpy(embd->data, embd_inp.data(), N*ggml_v3_element_size(embd)); // wte - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd); + struct ggml_v3_tensor * inpL = ggml_v3_get_rows(ctx0, model.wte, embd); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * cur; + struct ggml_v3_tensor * cur; if(use_scratch){ - ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + ggml_v3_set_scratch(ctx0, { 0, scr0_size, scr0, }); } // norm { - cur = ggml_norm(ctx0, inpL, default_norm_eps); + cur = ggml_v3_norm(ctx0, inpL, default_norm_eps); // cur = ln_1_g*cur + ln_1_b - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].ln_1_g, cur), + cur = ggml_v3_add(ctx0, + ggml_v3_mul(ctx0, + 
ggml_v3_repeat(ctx0, model.layers[il].ln_1_g, cur), cur), - ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); + ggml_v3_repeat(ctx0, model.layers[il].ln_1_b, cur)); } - struct ggml_tensor * inpSA = cur; + struct ggml_v3_tensor * inpSA = cur; // self-attention { - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + struct ggml_v3_tensor * KQ_pos = ggml_v3_new_tensor_1d(ctx0, GGML_V3_TYPE_I32, N); { int * data = (int *) KQ_pos->data; for (int i = 0; i < N; ++i) { @@ -494,170 +494,170 @@ bool gptj_eval( } } - struct ggml_tensor *Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur), n_embd / n_head, n_head, N), KQ_pos, n_rot, 0, n_ctx, 0, freq_base, freq_scale, 0, 1, 32, 1); - struct ggml_tensor *Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur), n_embd / n_head, n_head, N), KQ_pos, n_rot, 0, n_ctx, 0, freq_base, freq_scale, 0, 1, 32, 1); + struct ggml_v3_tensor *Qcur = ggml_v3_rope_custom_inplace(ctx0, ggml_v3_reshape_3d(ctx0, ggml_v3_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur), n_embd / n_head, n_head, N), KQ_pos, n_rot, 0, n_ctx, 0, freq_base, freq_scale, 0, 1, 32, 1); + struct ggml_v3_tensor *Kcur = ggml_v3_rope_custom_inplace(ctx0, ggml_v3_reshape_3d(ctx0, ggml_v3_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur), n_embd / n_head, n_head, N), KQ_pos, n_rot, 0, n_ctx, 0, freq_base, freq_scale, 0, 1, 32, 1); // store key and value to memory { - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_v_proj_w, cur)); + struct ggml_v3_tensor * Vcur = ggml_v3_transpose(ctx0, ggml_v3_mul_mat(ctx0, model.layers[il].c_attn_v_proj_w, cur)); - struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, - ( n_ctx)*ggml_element_size(model.memory_v), - (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v)); + struct ggml_v3_tensor * k = ggml_v3_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_v3_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_v3_tensor * v = ggml_v3_view_2d(ctx0, model.memory_v, N, n_embd, + ( n_ctx)*ggml_v3_element_size(model.memory_v), + (il*n_ctx)*ggml_v3_element_size(model.memory_v)*n_embd + n_past*ggml_v3_element_size(model.memory_v)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + ggml_v3_build_forward_expand(gf, ggml_v3_cpy(ctx0, Kcur, k)); + ggml_v3_build_forward_expand(gf, ggml_v3_cpy(ctx0, Vcur, v)); } // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - struct ggml_tensor * Q = - ggml_permute(ctx0, + struct ggml_v3_tensor * Q = + ggml_v3_permute(ctx0, Qcur, 0, 2, 1, 3); // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + struct ggml_v3_tensor * K = + ggml_v3_permute(ctx0, + ggml_v3_reshape_3d(ctx0, + ggml_v3_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_v3_element_size(model.memory_k)*n_embd), n_embd/n_head, n_head, n_past + N), 0, 2, 1, 3); // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + struct ggml_v3_tensor * KQ = 
ggml_v3_mul_mat(ctx0, K, Q); // KQ_scaled = KQ / sqrt(n_embd/n_head) - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, + struct ggml_v3_tensor * KQ_scaled = + ggml_v3_scale_inplace(ctx0, KQ, 1.0f/sqrt(float(n_embd)/n_head) ); // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + struct ggml_v3_tensor * KQ_masked = ggml_v3_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + struct ggml_v3_tensor * KQ_soft_max = ggml_v3_soft_max_inplace(ctx0, KQ_masked); // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - struct ggml_tensor * V = - ggml_view_3d(ctx0, model.memory_v, + struct ggml_v3_tensor * V = + ggml_v3_view_3d(ctx0, model.memory_v, n_past + N, n_embd/n_head, n_head, - n_ctx*ggml_element_size(model.memory_v), - n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head, - il*n_ctx*ggml_element_size(model.memory_v)*n_embd); + n_ctx*ggml_v3_element_size(model.memory_v), + n_ctx*ggml_v3_element_size(model.memory_v)*n_embd/n_head, + il*n_ctx*ggml_v3_element_size(model.memory_v)*n_embd); // KQV = transpose(V) * KQ_soft_max - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + struct ggml_v3_tensor * KQV = ggml_v3_mul_mat(ctx0, V, KQ_soft_max); // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + struct ggml_v3_tensor * KQV_merged = ggml_v3_permute(ctx0, KQV, 0, 2, 1, 3); // cur = KQV_merged.contiguous().view(n_embd, N) - cur = ggml_cpy(ctx0, + cur = ggml_v3_cpy(ctx0, KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + ggml_v3_new_tensor_2d(ctx0, GGML_V3_TYPE_F32, n_embd, N)); // projection (no bias) - cur = ggml_mul_mat(ctx0, + cur = ggml_v3_mul_mat(ctx0, model.layers[il].c_attn_proj_w, cur); } if(use_scratch){ - ggml_set_scratch(ctx0, { 0, scr1_size, scr1, }); + ggml_v3_set_scratch(ctx0, { 0, scr1_size, scr1, }); } - struct ggml_tensor * inpFF = cur; + struct ggml_v3_tensor * inpFF = cur; // feed-forward network // this is independent of the self-attention result, so it could be done in parallel to the self-attention { // note here we pass inpSA instead of cur - cur = ggml_mul_mat(ctx0, + cur = ggml_v3_mul_mat(ctx0, model.layers[il].c_mlp_fc_w, inpSA); - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur), + cur = ggml_v3_add(ctx0, + ggml_v3_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur), cur); // GELU activation - cur = ggml_gelu(ctx0, cur); + cur = ggml_v3_gelu(ctx0, cur); // projection // cur = proj_w*cur + proj_b - cur = ggml_mul_mat(ctx0, + cur = ggml_v3_mul_mat(ctx0, model.layers[il].c_mlp_proj_w, cur); - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur), + cur = ggml_v3_add(ctx0, + ggml_v3_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur), cur); } // self-attention + FF - cur = ggml_add(ctx0, cur, inpFF); + cur = ggml_v3_add(ctx0, cur, inpFF); // input for next layer - inpL = ggml_add(ctx0, cur, inpL); + inpL = ggml_v3_add(ctx0, cur, inpL); } if(use_scratch){ - ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + ggml_v3_set_scratch(ctx0, { 0, scr0_size, scr0, }); } // norm { - inpL = ggml_norm(ctx0, inpL, default_norm_eps); + inpL = ggml_v3_norm(ctx0, inpL, default_norm_eps); // inpL = ln_f_g*inpL + ln_f_b - inpL = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.ln_f_g, inpL), + inpL = ggml_v3_add(ctx0, + 
ggml_v3_mul(ctx0, + ggml_v3_repeat(ctx0, model.ln_f_g, inpL), inpL), - ggml_repeat(ctx0, model.ln_f_b, inpL)); + ggml_v3_repeat(ctx0, model.ln_f_b, inpL)); } if(use_scratch){ - ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + ggml_v3_set_scratch(ctx0, { 0, 0, nullptr, }); } // lm_head { - inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL); + inpL = ggml_v3_mul_mat(ctx0, model.lmh_g, inpL); - inpL = ggml_add(ctx0, - ggml_repeat(ctx0, model.lmh_b, inpL), + inpL = ggml_v3_add(ctx0, + ggml_v3_repeat(ctx0, model.lmh_b, inpL), inpL); } // logits -> probs - //inpL = ggml_soft_max_inplace(ctx0, inpL); + //inpL = ggml_v3_soft_max_inplace(ctx0, inpL); // run the computation - ggml_build_forward_expand(gf, inpL); + ggml_v3_build_forward_expand(gf, inpL); kcpp_graph_compute_helper(gf, n_threads); //if (n_past%100 == 0) { - // ggml_graph_print (&gf); - // ggml_graph_dump_dot(&gf, NULL, "gpt-j.dot"); + // ggml_v3_graph_print (&gf); + // ggml_v3_graph_dump_dot(&gf, NULL, "gpt-j.dot"); //} //embd_w.resize(n_vocab*N); - //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + //memcpy(embd_w.data(), ggml_v3_get_data(inpL), sizeof(float)*n_vocab*N); // return result for just the last token embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + memcpy(embd_w.data(), (float *) ggml_v3_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0)/N; + mem_per_token = ggml_v3_used_mem(ctx0)/N; } - //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); + //printf("used_mem = %zu\n", ggml_v3_used_mem(ctx0)); - ggml_free(ctx0); + ggml_v3_free(ctx0); return true; } diff --git a/otherarch/llama_v3.cpp b/otherarch/llama_v3.cpp index dde7a1574..6652e67fb 100644 --- a/otherarch/llama_v3.cpp +++ b/otherarch/llama_v3.cpp @@ -11,24 +11,19 @@ #include "llama-util.h" #include "llama_v3.h" -#include "ggml.h" +#include "ggml_v3.h" #include "otherarch.h" #ifdef GGML_USE_CUBLAS -#include "ggml-cuda.h" +#include "ggml_v3-cuda.h" #endif #if defined(GGML_USE_CLBLAST) -#include "ggml-opencl.h" +#include "ggml_v3-opencl.h" #endif -#ifdef GGML_USE_METAL -#include "ggml-metal.h" -#endif -#ifdef GGML_USE_MPI -#include "ggml-mpi.h" -#endif + #ifdef GGML_USE_K_QUANTS #ifndef QK_K -#ifdef GGML_QKK_64 +#ifdef GGML_V3_QKK_64 #define QK_K 64 #else #define QK_K 256 @@ -66,13 +61,14 @@ static void llama_v3_log_callback_default(llama_v3_log_level level, const char * #define LLAMA_V3_LOG_WARN(...) llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_WARN , __VA_ARGS__) #define LLAMA_V3_LOG_ERROR(...) 
llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_ERROR, __VA_ARGS__) -#include "ggml-alloc.h" -#if !defined(GGML_USE_CUBLAS) -#define LLAMA_V3_USE_ALLOCATOR -#else +// disable allocator for backwards compatibility - to avoid gguf changes messing it up +// #include "ggml-alloc.h" +// #if !defined(GGML_USE_CUBLAS) +// #define LLAMA_V3_USE_ALLOCATOR +// #else #define LLAMA_V3_USE_SCRATCH #define LLAMA_V3_MAX_SCRATCH_BUFFERS 16 -#endif +// #endif // available llama models @@ -94,9 +90,9 @@ static const size_t MB3 = 1024*1024; // TODO: dynamically determine these sizes // needs modifications in ggml -typedef void (*offload_func_t)(struct ggml_tensor * tensor); +typedef void (*offload_func_v3_t)(struct ggml_v3_tensor * tensor); -void llama_v3_nop(struct ggml_tensor * tensor) { // don't offload by default +void llama_v3_nop(struct ggml_v3_tensor * tensor) { // don't offload by default (void) tensor; } @@ -104,15 +100,15 @@ void llama_v3_nop(struct ggml_tensor * tensor) { // don't offload by default // ggml helpers // -static void llv3_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); +static void llv3_graph_compute_helper(std::vector & buf, ggml_v3_cgraph * graph, int n_threads) { + struct ggml_v3_cplan plan = ggml_v3_graph_plan(graph, n_threads); if (plan.work_size > 0) { buf.resize(plan.work_size); plan.work_data = buf.data(); } - ggml_graph_compute(graph, &plan); + ggml_v3_graph_compute(graph, &plan); } @@ -237,35 +233,35 @@ struct llama_v3_hparams { result *= (size_t) n_embd_gqa(); result *= (size_t) n_ctx; result *= (size_t) n_layer; - result *= sizeof(ggml_fp16_t); + result *= sizeof(ggml_v3_fp16_t); return result; } }; struct llama_v3_layer { // normalization - struct ggml_tensor * attention_norm; + struct ggml_v3_tensor * attention_norm; // attention - struct ggml_tensor * wq; - struct ggml_tensor * wk; - struct ggml_tensor * wv; - struct ggml_tensor * wo; + struct ggml_v3_tensor * wq; + struct ggml_v3_tensor * wk; + struct ggml_v3_tensor * wv; + struct ggml_v3_tensor * wo; // normalization - struct ggml_tensor * ffn_norm; + struct ggml_v3_tensor * ffn_norm; // ff - struct ggml_tensor * w1; - struct ggml_tensor * w2; - struct ggml_tensor * w3; + struct ggml_v3_tensor * w1; + struct ggml_v3_tensor * w2; + struct ggml_v3_tensor * w3; }; struct llama_v3_kv_cache { - struct ggml_tensor * k = NULL; - struct ggml_tensor * v = NULL; + struct ggml_v3_tensor * k = NULL; + struct ggml_v3_tensor * v = NULL; - struct ggml_context * ctx = NULL; + struct ggml_v3_context * ctx = NULL; llama_v3_ctx_buffer buf; @@ -273,12 +269,12 @@ struct llama_v3_kv_cache { ~llama_v3_kv_cache() { if (ctx) { - ggml_free(ctx); + ggml_v3_free(ctx); } #ifdef GGML_USE_CUBLAS - ggml_cuda_free_data(k); - ggml_cuda_free_data(v); + ggml_v3_cuda_free_data(k); + ggml_v3_cuda_free_data(v); #endif // GGML_USE_CUBLAS } }; @@ -301,16 +297,16 @@ struct llama_v3_model { llama_v3_hparams hparams; - struct ggml_tensor * tok_embeddings; + struct ggml_v3_tensor * tok_embeddings; - struct ggml_tensor * norm; - struct ggml_tensor * output; + struct ggml_v3_tensor * norm; + struct ggml_v3_tensor * output; std::vector layers; int n_gpu_layers; // context - struct ggml_context * ctx = NULL; + struct ggml_v3_context * ctx = NULL; // the model memory buffer llama_v3_ctx_buffer buf; @@ -323,7 +319,7 @@ struct llama_v3_model { llama_v3_mlock mlock_mmap; // for quantize-stats only - std::vector> tensors_by_name; + std::vector> tensors_by_name; int64_t t_load_us = 0; 
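Aside on the kv_cache_size arithmetic kept intact above (only the fp16 typedef is renamed): the estimate is two buffers, K and V, of n_embd_gqa x n_ctx cells per layer. A minimal standalone sketch of that calculation follows; the helper and parameter names are illustrative only (not part of this patch), and fp16 cells are assumed as the cache type:

    #include <cstddef>
    #include <cstdio>

    // Rough KV-cache footprint: K and V caches, each holding n_embd_gqa * n_ctx cells
    // per layer, stored as 16-bit floats (2 bytes) when the memory type is f16.
    static size_t kv_cache_bytes(size_t n_embd_gqa, size_t n_ctx, size_t n_layer) {
        const size_t fp16_bytes = 2;  // sizeof fp16, hard-coded to keep the sketch self-contained
        return 2 * n_embd_gqa * n_ctx * n_layer * fp16_bytes;  // factor 2 = K cache + V cache
    }

    int main() {
        // e.g. a 7B-class model: 32 layers, 4096 KV width, 2048-token context -> about 1 GB
        std::printf("approx KV cache: %.1f MB\n",
                    kv_cache_bytes(4096, 2048, 32) / (1024.0 * 1024.0));
        return 0;
    }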
int64_t t_start_us = 0; @@ -332,17 +328,17 @@ struct llama_v3_model { ~llama_v3_model() { if (ctx) { - ggml_free(ctx); + ggml_v3_free(ctx); } #ifdef GGML_USE_CUBLAS for (size_t i = 0; i < tensors_by_name.size(); ++i) { - ggml_cuda_free_data(tensors_by_name[i].second); + ggml_v3_cuda_free_data(tensors_by_name[i].second); } - ggml_cuda_free_scratch(); + ggml_v3_cuda_free_scratch(); #elif defined(GGML_USE_CLBLAST) for (size_t i = 0; i < tensors_by_name.size(); ++i) { - ggml_cl_free_data(tensors_by_name[i].second); + ggml_v3_cl_free_data(tensors_by_name[i].second); } #endif } @@ -354,14 +350,10 @@ struct llama_v3_context { if (model_owner) { delete &model; } -#ifdef GGML_USE_METAL - if (ctx_metal) { - ggml_metal_free(ctx_metal); - } -#endif + #ifdef LLAMA_V3_USE_ALLOCATOR if (alloc) { - ggml_allocr_free(alloc); + ggml_v3_allocr_free(alloc); } #endif } @@ -397,7 +389,7 @@ struct llama_v3_context { // input embedding (1-dimensional array: [n_embd]) std::vector embedding; - // reusable buffer for `struct ggml_graph_plan.work_data` + // reusable buffer for `struct ggml_v3_graph_plan.work_data` std::vector work_buffer; // memory buffers used to evaluate the model @@ -406,7 +398,7 @@ struct llama_v3_context { #ifdef LLAMA_V3_USE_ALLOCATOR llama_v3_ctx_buffer buf_alloc; - ggml_allocr * alloc = NULL; + ggml_v3_allocr * alloc = NULL; #endif #ifdef LLAMA_V3_USE_SCRATCH @@ -415,23 +407,16 @@ struct llama_v3_context { size_t buf_max_size[LLAMA_V3_MAX_SCRATCH_BUFFERS] = { 0 }; #endif -#ifdef GGML_USE_METAL - ggml_metal_context * ctx_metal = NULL; -#endif -#ifdef GGML_USE_MPI - ggml_mpi_context * ctx_mpi = NULL; -#endif - - void use_buf(struct ggml_context * ctx, int i) { + void use_buf(struct ggml_v3_context * ctx, int i) { #if defined(LLAMA_V3_USE_SCRATCH) size_t last_size = 0; if (i == -1) { - last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, }); + last_size = ggml_v3_set_scratch(ctx, { 0, 0, nullptr, }); } else { auto & buf = buf_scratch[i]; - last_size = ggml_set_scratch(ctx, { 0, buf.size, buf.addr, }); + last_size = ggml_v3_set_scratch(ctx, { 0, buf.size, buf.addr, }); } if (buf_last >= 0) { @@ -489,21 +474,21 @@ static std::string llama_v3_format_tensor_shape(const std::vector & ne return buf; } -static size_t llama_v3_calc_tensor_size(const std::vector & ne, enum ggml_type type) { - size_t size = ggml_type_size(type); +static size_t llama_v3_calc_tensor_size(const std::vector & ne, enum ggml_v3_type type) { + size_t size = ggml_v3_type_size(type); for (uint32_t dim : ne) { size = checked_mul(size, dim); } - return size / ggml_blck_size(type); + return size / ggml_v3_blck_size(type); } struct llama_v3_load_tensor { std::string name; - enum ggml_type type = GGML_TYPE_F32; + enum ggml_v3_type type = GGML_V3_TYPE_F32; std::vector ne; size_t file_off; size_t size; - struct ggml_tensor * ggml_tensor = NULL; + struct ggml_v3_tensor * ggml_v3_tensor = NULL; uint8_t * data; }; @@ -597,7 +582,7 @@ struct llama_v3_file_loader { llama_v3_load_tensor tensor; uint32_t n_dims = file.read_u32(); uint32_t name_len = file.read_u32(); - tensor.type = (enum ggml_type) file.read_u32(); + tensor.type = (enum ggml_v3_type) file.read_u32(); tensor.ne.resize(n_dims); file.read_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * n_dims); std::string name = file.read_string(name_len); @@ -605,18 +590,18 @@ struct llama_v3_file_loader { throw std::runtime_error(format_old("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims)); } switch (tensor.type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case 
GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: + case GGML_V3_TYPE_F32: + case GGML_V3_TYPE_F16: + case GGML_V3_TYPE_Q4_0: + case GGML_V3_TYPE_Q4_1: + case GGML_V3_TYPE_Q5_0: + case GGML_V3_TYPE_Q5_1: + case GGML_V3_TYPE_Q8_0: + case GGML_V3_TYPE_Q2_K: + case GGML_V3_TYPE_Q3_K: + case GGML_V3_TYPE_Q4_K: + case GGML_V3_TYPE_Q5_K: + case GGML_V3_TYPE_Q6_K: break; default: { throw std::runtime_error(format_old("unrecognized tensor type %u\n", tensor.type)); @@ -675,20 +660,20 @@ struct llama_v3_file_saver { file.write_raw(&token_score.score, sizeof(token_score.score)); } } - void write_tensor(llama_v3_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) { + void write_tensor(llama_v3_load_tensor & tensor, enum ggml_v3_type new_type, const void * new_data, size_t new_size) { switch (new_type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: + case GGML_V3_TYPE_F32: + case GGML_V3_TYPE_F16: + case GGML_V3_TYPE_Q4_0: + case GGML_V3_TYPE_Q4_1: + case GGML_V3_TYPE_Q5_0: + case GGML_V3_TYPE_Q5_1: + case GGML_V3_TYPE_Q8_0: + case GGML_V3_TYPE_Q2_K: + case GGML_V3_TYPE_Q3_K: + case GGML_V3_TYPE_Q4_K: + case GGML_V3_TYPE_Q5_K: + case GGML_V3_TYPE_Q6_K: break; default: LLAMA_V3_ASSERT(false); } @@ -707,8 +692,8 @@ struct llama_v3_model_loader { std::unique_ptr file_loader; llama_v3_load_tensors_map tensors_map; bool use_mmap; - size_t num_ggml_tensors_created = 0; - struct ggml_context * ggml_ctx = NULL; + size_t num_ggml_v3_tensors_created = 0; + struct ggml_v3_context * ggml_v3_ctx = NULL; std::unique_ptr mapping; llama_v3_model_loader(const std::string & fname_base, bool use_mmap) { @@ -722,12 +707,12 @@ struct llama_v3_model_loader { void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const { *ctx_size_p = *mmapped_size_p = 0; for (const llama_v3_load_tensor & lt : tensors_map.tensors) { - *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE; + *ctx_size_p += sizeof(struct ggml_v3_tensor) + GGML_V3_OBJECT_SIZE; *(use_mmap ? 
mmapped_size_p : ctx_size_p) += lt.size + 16; } } - struct ggml_tensor * get_tensor(const std::string & name, const std::vector & ne, ggml_backend_type backend) { + struct ggml_v3_tensor * get_tensor(const std::string & name, const std::vector & ne, ggml_v3_backend_type backend) { auto it = tensors_map.name_to_idx.find(name); if (it == tensors_map.name_to_idx.end()) { throw std::runtime_error(std::runtime_error(format_old("llama.cpp: tensor '%s' is missing from model", name.c_str()))); @@ -741,31 +726,31 @@ struct llama_v3_model_loader { return get_tensor_for(lt, backend); } - struct ggml_tensor * get_tensor_for(llama_v3_load_tensor & lt, ggml_backend_type backend) { - struct ggml_tensor * tensor; - if (backend != GGML_BACKEND_CPU) { - ggml_set_no_alloc(ggml_ctx, true); + struct ggml_v3_tensor * get_tensor_for(llama_v3_load_tensor & lt, ggml_v3_backend_type backend) { + struct ggml_v3_tensor * tensor; + if (backend != GGML_V3_BACKEND_CPU) { + ggml_v3_set_no_alloc(ggml_v3_ctx, true); } if (lt.ne.size() == 2) { - tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1)); + tensor = ggml_v3_new_tensor_2d(ggml_v3_ctx, lt.type, lt.ne.at(0), lt.ne.at(1)); } else { LLAMA_V3_ASSERT(lt.ne.size() == 1); - tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0)); + tensor = ggml_v3_new_tensor_1d(ggml_v3_ctx, lt.type, lt.ne.at(0)); } - ggml_set_name(tensor, lt.name.c_str()); - LLAMA_V3_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor + ggml_v3_set_name(tensor, lt.name.c_str()); + LLAMA_V3_ASSERT(lt.ggml_v3_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor - if (backend != GGML_BACKEND_CPU) { - ggml_set_no_alloc(ggml_ctx, use_mmap); + if (backend != GGML_V3_BACKEND_CPU) { + ggml_v3_set_no_alloc(ggml_v3_ctx, use_mmap); } tensor->backend = backend; - lt.ggml_tensor = tensor; - num_ggml_tensors_created++; + lt.ggml_v3_tensor = tensor; + num_ggml_v3_tensors_created++; return tensor; } void done_getting_tensors() const { - if (num_ggml_tensors_created != tensors_map.tensors.size()) { + if (num_ggml_v3_tensors_created != tensors_map.tensors.size()) { throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected")); } } @@ -776,13 +761,13 @@ struct llama_v3_model_loader { size_t lock_size = 0; for (const llama_v3_load_tensor & lt : tensors_map.tensors) { data_size += lt.size; - if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) { + if (lt.ggml_v3_tensor->backend != GGML_V3_BACKEND_CPU) { prefetch_size -= lt.size; } } if (use_mmap) { - mapping.reset(new llama_v3_mmap(&file_loader->file, prefetch_size, ggml_is_numa())); + mapping.reset(new llama_v3_mmap(&file_loader->file, prefetch_size, ggml_v3_is_numa())); if (lmlock) { lmlock->init(mapping->addr); } @@ -793,36 +778,36 @@ struct llama_v3_model_loader { if (progress_callback) { progress_callback((float) done_size / data_size, progress_callback_user_data); } - LLAMA_V3_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already - lt.data = (uint8_t *) lt.ggml_tensor->data; + LLAMA_V3_ASSERT(lt.ggml_v3_tensor); // unused tensors should have been caught by load_data already + lt.data = (uint8_t *) lt.ggml_v3_tensor->data; // allocate temp buffer if not using mmap if (!use_mmap && lt.data == NULL) { - GGML_ASSERT(lt.ggml_tensor->backend != GGML_BACKEND_CPU); - lt.data = (uint8_t*)malloc(ggml_nbytes(lt.ggml_tensor)); + GGML_V3_ASSERT(lt.ggml_v3_tensor->backend != GGML_V3_BACKEND_CPU); + lt.data = 
(uint8_t*)malloc(ggml_v3_nbytes(lt.ggml_v3_tensor)); } load_data_for(lt); - switch(lt.ggml_tensor->backend) { - case GGML_BACKEND_CPU: - lt.ggml_tensor->data = lt.data; + switch(lt.ggml_v3_tensor->backend) { + case GGML_V3_BACKEND_CPU: + lt.ggml_v3_tensor->data = lt.data; if (use_mmap && lmlock) { lock_size += lt.size; lmlock->grow_to(lock_size); } break; #if defined(GGML_USE_CUBLAS) - case GGML_BACKEND_GPU: - case GGML_BACKEND_GPU_SPLIT: - ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor); + case GGML_V3_BACKEND_GPU: + case GGML_V3_BACKEND_GPU_SPLIT: + ggml_v3_cuda_transform_tensor(lt.data, lt.ggml_v3_tensor); if (!use_mmap) { free(lt.data); } break; #elif defined(GGML_USE_CLBLAST) - case GGML_BACKEND_GPU: - ggml_cl_transform_tensor(lt.data, lt.ggml_tensor); + case GGML_V3_BACKEND_GPU: + ggml_v3_cl_transform_tensor(lt.data, lt.ggml_v3_tensor); if (!use_mmap) { free(lt.data); } @@ -869,7 +854,7 @@ struct llama_v3_model_loader { static bool kv_cache_init( const struct llama_v3_hparams & hparams, struct llama_v3_kv_cache & cache, - ggml_type wtype, + ggml_v3_type wtype, int n_ctx, int n_gpu_layers) { const int n_embd = hparams.n_embd_gqa(); @@ -878,33 +863,33 @@ static bool kv_cache_init( const int64_t n_mem = n_layer*n_ctx; const int64_t n_elements = n_embd*n_mem; - cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB3); + cache.buf.resize(2u*n_elements*ggml_v3_type_size(wtype) + 2u*MB3); cache.n = 0; - struct ggml_init_params params; + struct ggml_v3_init_params params; params.mem_size = cache.buf.size; params.mem_buffer = cache.buf.addr; params.no_alloc = false; - cache.ctx = ggml_init(params); + cache.ctx = ggml_v3_init(params); if (!cache.ctx) { LLAMA_V3_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__); return false; } - cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); - cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); - ggml_set_name(cache.k, "cache_k"); - ggml_set_name(cache.v, "cache_v"); + cache.k = ggml_v3_new_tensor_1d(cache.ctx, wtype, n_elements); + cache.v = ggml_v3_new_tensor_1d(cache.ctx, wtype, n_elements); + ggml_v3_set_name(cache.k, "cache_k"); + ggml_v3_set_name(cache.v, "cache_v"); (void) n_gpu_layers; #ifdef GGML_USE_CUBLAS if (n_gpu_layers > n_layer + 1) { - ggml_cuda_assign_buffers_no_scratch(cache.v); + ggml_v3_cuda_assign_buffers_no_scratch(cache.v); } if (n_gpu_layers > n_layer + 2) { - ggml_cuda_assign_buffers_no_scratch(cache.k); + ggml_v3_cuda_assign_buffers_no_scratch(cache.k); } #endif // GGML_USE_CUBLAS @@ -967,32 +952,28 @@ int get_blas_batch_mul3(int batch) } void llama_v3_backend_init(bool numa) { - ggml_time_init(); + ggml_v3_time_init(); // needed to initialize f16 tables { - struct ggml_init_params params = { 0, NULL, false }; - struct ggml_context * ctx = ggml_init(params); - ggml_free(ctx); + struct ggml_v3_init_params params = { 0, NULL, false }; + struct ggml_v3_context * ctx = ggml_v3_init(params); + ggml_v3_free(ctx); } if (numa) { - ggml_numa_init(); + ggml_v3_numa_init(); } -#ifdef GGML_USE_MPI - ggml_mpi_backend_init(); -#endif + } void llama_v3_backend_free() { -#ifdef GGML_USE_MPI - ggml_mpi_backend_free(); -#endif + } int64_t llama_v3_time_us() { - return ggml_time_us(); + return ggml_v3_time_us(); } // @@ -1064,14 +1045,14 @@ static void llama_v3_model_load_internal( float rope_freq_base, float rope_freq_scale, bool low_vram, - ggml_type memory_type, + ggml_v3_type memory_type, bool use_mmap, bool use_mlock, bool vocab_only, llama_v3_progress_callback progress_callback, void * 
progress_callback_user_data) { - model.t_start_us = ggml_time_us(); + model.t_start_us = ggml_v3_time_us(); size_t blasbatchmul = get_blas_batch_mul3(n_batch); std::unique_ptr ml(new llama_v3_model_loader(fname, use_mmap)); @@ -1188,15 +1169,15 @@ static void llama_v3_model_load_internal( model.mlock_buf.grow_to(model.buf.size); } - struct ggml_init_params params = { + struct ggml_v3_init_params params = { /*.mem_size =*/ model.buf.size, /*.mem_buffer =*/ model.buf.addr, /*.no_alloc =*/ ml->use_mmap, }; - model.ctx = ggml_init(params); + model.ctx = ggml_v3_init(params); if (!model.ctx) { - throw std::runtime_error(format_old("ggml_init() failed")); + throw std::runtime_error(format_old("ggml_v3_init() failed")); } } @@ -1204,17 +1185,17 @@ static void llama_v3_model_load_internal( (void) mul_mat_q; #if defined(GGML_USE_CUBLAS) LLAMA_V3_LOG_INFO("%s: using CUDA for GPU acceleration\n", __func__); - ggml_cuda_set_main_device(main_gpu); - ggml_cuda_set_mul_mat_q(mul_mat_q); -#define LLAMA_V3_BACKEND_OFFLOAD GGML_BACKEND_GPU -#define LLAMA_V3_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT + ggml_v3_cuda_set_main_device(main_gpu); + ggml_v3_cuda_set_mul_mat_q(mul_mat_q); +#define LLAMA_V3_BACKEND_OFFLOAD GGML_V3_BACKEND_GPU +#define LLAMA_V3_BACKEND_OFFLOAD_SPLIT GGML_V3_BACKEND_GPU_SPLIT #elif defined(GGML_USE_CLBLAST) LLAMA_V3_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__); -#define LLAMA_V3_BACKEND_OFFLOAD GGML_BACKEND_GPU -#define LLAMA_V3_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU +#define LLAMA_V3_BACKEND_OFFLOAD GGML_V3_BACKEND_GPU +#define LLAMA_V3_BACKEND_OFFLOAD_SPLIT GGML_V3_BACKEND_GPU #else -#define LLAMA_V3_BACKEND_OFFLOAD GGML_BACKEND_CPU -#define LLAMA_V3_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU +#define LLAMA_V3_BACKEND_OFFLOAD GGML_V3_BACKEND_CPU +#define LLAMA_V3_BACKEND_OFFLOAD_SPLIT GGML_V3_BACKEND_CPU #endif // prepare memory for the weights @@ -1226,36 +1207,36 @@ static void llama_v3_model_load_internal( const uint32_t n_layer = hparams.n_layer; const uint32_t n_vocab = hparams.n_vocab; - ml->ggml_ctx = ctx; + ml->ggml_v3_ctx = ctx; - model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_V3_BACKEND_CPU); // "output" tensor { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; + ggml_v3_backend_type backend_norm; + ggml_v3_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { // NOLINT // norm is not performance relevant on its own but keeping it in VRAM reduces data copying // on Windows however this is detrimental unless everything is on the GPU #ifndef _WIN32 - backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_V3_BACKEND_OFFLOAD; + backend_norm = low_vram ? GGML_V3_BACKEND_CPU : LLAMA_V3_BACKEND_OFFLOAD; #else - backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_V3_BACKEND_OFFLOAD; + backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? 
GGML_V3_BACKEND_CPU : LLAMA_V3_BACKEND_OFFLOAD; #endif // _WIN32 backend_output = LLAMA_V3_BACKEND_OFFLOAD_SPLIT; } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; + backend_norm = GGML_V3_BACKEND_CPU; + backend_output = GGML_V3_BACKEND_CPU; } model.norm = ml->get_tensor("norm.weight", {n_embd}, backend_norm); model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output); - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.norm); + if (backend_norm == GGML_V3_BACKEND_GPU) { + vram_weights += ggml_v3_nbytes(model.norm); } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); + if (backend_output == GGML_V3_BACKEND_GPU_SPLIT) { + vram_weights += ggml_v3_nbytes(model.output); } } @@ -1263,8 +1244,8 @@ static void llama_v3_model_load_internal( model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_V3_BACKEND_OFFLOAD; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_V3_BACKEND_OFFLOAD_SPLIT; // NOLINT + const ggml_v3_backend_type backend = int(i) < i_gpu_start ? GGML_V3_BACKEND_CPU : LLAMA_V3_BACKEND_OFFLOAD; // NOLINT + const ggml_v3_backend_type backend_split = int(i) < i_gpu_start ? GGML_V3_BACKEND_CPU : LLAMA_V3_BACKEND_OFFLOAD_SPLIT; // NOLINT auto & layer = model.layers[i]; @@ -1283,11 +1264,11 @@ static void llama_v3_model_load_internal( layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split); layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split); - if (backend == GGML_BACKEND_GPU) { + if (backend == GGML_V3_BACKEND_GPU) { vram_weights += - ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + - ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + ggml_v3_nbytes(layer.attention_norm) + ggml_v3_nbytes(layer.wq) + ggml_v3_nbytes(layer.wk) + + ggml_v3_nbytes(layer.wv) + ggml_v3_nbytes(layer.wo) + ggml_v3_nbytes(layer.ffn_norm) + + ggml_v3_nbytes(layer.w1) + ggml_v3_nbytes(layer.w2) + ggml_v3_nbytes(layer.w3); } } } @@ -1296,7 +1277,7 @@ static void llama_v3_model_load_internal( // print memory requirements { - const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1; + const size_t scale = memory_type == GGML_V3_TYPE_F32 ? 
2 : 1; // this is the total memory required to run the inference size_t mem_required = @@ -1322,12 +1303,12 @@ static void llama_v3_model_load_internal( #ifdef GGML_USE_CUBLAS if (low_vram) { LLAMA_V3_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__); - ggml_cuda_set_scratch_size(0); // disable scratch + ggml_v3_cuda_set_scratch_size(0); // disable scratch } else { const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE_3().at(model.type); const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT_3().at(model.type); vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context); - ggml_cuda_set_scratch_size(vram_scratch); + ggml_v3_cuda_set_scratch_size(vram_scratch); if (n_gpu_layers > 0) { LLAMA_V3_LOG_INFO("%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n", __func__, vram_scratch_base / kB3, vram_scratch_per_context, @@ -1380,13 +1361,13 @@ static void llama_v3_model_load_internal( // populate `tensors_by_name` for (llama_v3_load_tensor & lt : ml->tensors_map.tensors) { - model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor); + model.tensors_by_name.emplace_back(lt.name, lt.ggml_v3_tensor); } (void) tensor_split; #if defined(GGML_USE_CUBLAS) { - ggml_cuda_set_tensor_split(tensor_split); + ggml_v3_cuda_set_tensor_split(tensor_split); } #endif @@ -1400,7 +1381,7 @@ static void llama_v3_model_load_internal( // loading time will be recalculate after the first eval, so // we take page faults deferred by mmap() into consideration - model.t_load_us = ggml_time_us() - model.t_start_us; + model.t_load_us = ggml_v3_time_us() - model.t_start_us; } static bool llama_v3_model_load( @@ -1418,7 +1399,7 @@ static bool llama_v3_model_load( float rope_freq_base, float rope_freq_scale, bool low_vram, - ggml_type memory_type, + ggml_v3_type memory_type, bool use_mmap, bool use_mlock, bool vocab_only, @@ -1435,7 +1416,7 @@ static bool llama_v3_model_load( } } -static struct ggml_cgraph * llama_v3_build_graph( +static struct ggml_v3_cgraph * llama_v3_build_graph( llama_v3_context & lctx, const llama_v3_token * tokens, const float * embd, @@ -1473,7 +1454,7 @@ static struct ggml_cgraph * llama_v3_build_graph( auto & buf_compute = lctx.buf_compute; - struct ggml_init_params params = { + struct ggml_v3_init_params params = { /*.mem_size =*/ buf_compute.size, /*.mem_buffer =*/ buf_compute.addr, /*.no_alloc =*/ false, @@ -1483,41 +1464,39 @@ static struct ggml_cgraph * llama_v3_build_graph( params.no_alloc = true; #endif - struct ggml_context * ctx0 = ggml_init(params); + struct ggml_v3_context * ctx0 = ggml_v3_init(params); - ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_MAX_NODES, false); + ggml_v3_cgraph * gf = ggml_v3_new_graph_custom(ctx0, GGML_V3_MAX_NODES, false); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + struct ggml_v3_tensor * cur; + struct ggml_v3_tensor * inpL; if (tokens) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + struct ggml_v3_tensor * inp_tokens = ggml_v3_new_tensor_1d(ctx0, GGML_V3_TYPE_I32, N); #ifdef LLAMA_V3_USE_ALLOCATOR - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); + ggml_v3_allocr_alloc(lctx.alloc, inp_tokens); + if (!ggml_v3_allocr_is_measure(lctx.alloc)) { + memcpy(inp_tokens->data, tokens, N*ggml_v3_element_size(inp_tokens)); } #else - memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); + 
memcpy(inp_tokens->data, tokens, N*ggml_v3_element_size(inp_tokens)); #endif - ggml_set_name(inp_tokens, "inp_tokens"); + ggml_v3_set_name(inp_tokens, "inp_tokens"); - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); + inpL = ggml_v3_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); + + inpL = ggml_v3_new_tensor_2d(ctx0, GGML_V3_TYPE_F32, n_embd, N); #ifdef LLAMA_V3_USE_ALLOCATOR - ggml_allocr_alloc(lctx.alloc, inpL); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); + ggml_v3_allocr_alloc(lctx.alloc, inpL); + if (!ggml_v3_allocr_is_measure(lctx.alloc)) { + memcpy(inpL->data, embd, N * n_embd * ggml_v3_element_size(inpL)); } #else - memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); + memcpy(inpL->data, embd, N * n_embd * ggml_v3_element_size(inpL)); #endif } @@ -1528,82 +1507,82 @@ static struct ggml_cgraph * llama_v3_build_graph( // tensors are GPU-accelerated if any input or the output has been offloaded // // with the low VRAM option VRAM scratch is disabled in llama_v3_load_model_internal - // in that case ggml_cuda_assign_buffers has no effect - offload_func_t offload_func_nr = llama_v3_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_v3_nop; - offload_func_t offload_func_v = llama_v3_nop; + // in that case ggml_v3_cuda_assign_buffers has no effect + offload_func_v3_t offload_func_nr = llama_v3_nop; // nr = non-repeating + offload_func_v3_t offload_func_kq = llama_v3_nop; + offload_func_v3_t offload_func_v = llama_v3_nop; #ifdef GGML_USE_CUBLAS if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers; + offload_func_nr = ggml_v3_cuda_assign_buffers; } if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers; + offload_func_v = ggml_v3_cuda_assign_buffers; } if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers; + offload_func_kq = ggml_v3_cuda_assign_buffers; } #endif // GGML_USE_CUBLAS - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + struct ggml_v3_tensor * KQ_scale = ggml_v3_new_tensor_1d(ctx0, GGML_V3_TYPE_F32, 1); #ifdef LLAMA_V3_USE_ALLOCATOR - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + ggml_v3_allocr_alloc(lctx.alloc, KQ_scale); + if (!ggml_v3_allocr_is_measure(lctx.alloc)) { + ggml_v3_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); } #else - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + ggml_v3_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); #endif float KQ_scale_float = 1.0f/sqrtf(float(n_embd)/n_head); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); + ggml_v3_set_name(KQ_scale, "1/sqrt(n_embd_head)"); for (int il = 0; il < n_layer; ++il) { - ggml_format_name(inpL, "layer_inp_%d", il); + ggml_v3_format_name(inpL, "layer_inp_%d", il); - offload_func_t offload_func = llama_v3_nop; + offload_func_v3_t offload_func = llama_v3_nop; #ifdef GGML_USE_CUBLAS if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers; + offload_func = ggml_v3_cuda_assign_buffers; } #endif // GGML_USE_CUBLAS - struct ggml_tensor * inpSA = inpL; + struct ggml_v3_tensor * inpSA = inpL; lctx.use_buf(ctx0, 0); // norm { - cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); + cur = ggml_v3_rms_norm(ctx0, inpL, rms_norm_eps); 
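Both the GPT-J and llama graph builders above prepare the same two pieces of attention bookkeeping before the RoPE and KQ ops: a scale of 1/sqrt(n_embd/n_head) and absolute positions n_past .. n_past+N-1 written into KQ_pos. A minimal sketch of just that arithmetic; the helper names are illustrative and not taken from the patch:

    #include <cmath>
    #include <vector>

    // Scale applied to K*Q before the softmax: 1/sqrt(d_head), where d_head = n_embd / n_head.
    // e.g. kq_scale(4096, 32) == 1/sqrt(128) ~= 0.0884
    static float kq_scale(int n_embd, int n_head) {
        return 1.0f / std::sqrt(static_cast<float>(n_embd) / n_head);
    }

    // Absolute positions of the N new tokens appended after n_past cached tokens,
    // i.e. the values the graph writes into the KQ_pos tensor before the rope call.
    static std::vector<int> kq_positions(int n_past, int N) {
        std::vector<int> pos(N);
        for (int i = 0; i < N; ++i) {
            pos[i] = n_past + i;
        }
        return pos;
    }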
offload_func(cur); - ggml_set_name(cur, "rms_norm_0"); + ggml_v3_set_name(cur, "rms_norm_0"); // cur = cur*attention_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm); + cur = ggml_v3_mul(ctx0, cur, model.layers[il].attention_norm); offload_func(cur); - ggml_set_name(cur, "attention_norm_0"); + ggml_v3_set_name(cur, "attention_norm_0"); } // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_v3_tensor * tmpk = ggml_v3_mul_mat(ctx0, model.layers[il].wk, cur); offload_func_kq(tmpk); - ggml_set_name(tmpk, "tmpk"); + ggml_v3_set_name(tmpk, "tmpk"); - struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_v3_tensor * tmpq = ggml_v3_mul_mat(ctx0, model.layers[il].wq, cur); offload_func_kq(tmpq); - ggml_set_name(tmpq, "tmpq"); + ggml_v3_set_name(tmpq, "tmpq"); - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_name(KQ_pos, "KQ_pos"); + struct ggml_v3_tensor * KQ_pos = ggml_v3_new_tensor_1d(ctx0, GGML_V3_TYPE_I32, n_tokens); + ggml_v3_set_name(KQ_pos, "KQ_pos"); #ifdef LLAMA_V3_USE_ALLOCATOR offload_func_kq(KQ_pos); //don't offload rope for cublas, its broken now since ring buffer was added - ggml_allocr_alloc(lctx.alloc, KQ_pos); - if (!ggml_allocr_is_measure(lctx.alloc)) { + ggml_v3_allocr_alloc(lctx.alloc, KQ_pos); + if (!ggml_v3_allocr_is_measure(lctx.alloc)) { int * data = (int *) KQ_pos->data; for (int i = 0; i < N; ++i) { data[i] = n_past + i; @@ -1618,171 +1597,171 @@ static struct ggml_cgraph * llama_v3_build_graph( } #endif - struct ggml_tensor *Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), KQ_pos, n_embd_head, 0, 0, 0, freq_base, freq_scale, 0, 1, 32, 1); + struct ggml_v3_tensor *Kcur = ggml_v3_rope_custom_inplace(ctx0, ggml_v3_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), KQ_pos, n_embd_head, 0, 0, 0, freq_base, freq_scale, 0, 1, 32, 1); offload_func_kq(Kcur); - ggml_set_name(Kcur, "Kcur"); + ggml_v3_set_name(Kcur, "Kcur"); - struct ggml_tensor *Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), KQ_pos, n_embd_head, 0, 0, 0, freq_base, freq_scale, 0, 1, 32, 1); + struct ggml_v3_tensor *Qcur = ggml_v3_rope_custom_inplace(ctx0, ggml_v3_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), KQ_pos, n_embd_head, 0, 0, 0, freq_base, freq_scale, 0, 1, 32, 1); offload_func_kq(Qcur); - ggml_set_name(Qcur, "Qcur"); + ggml_v3_set_name(Qcur, "Qcur"); // store key and value to memory { // compute the transposed [N, n_embd] V matrix - struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_v3_tensor * tmpv = ggml_v3_mul_mat(ctx0, model.layers[il].wv, cur); offload_func_v(tmpv); - ggml_set_name(tmpv, "tmpv"); + ggml_v3_set_name(tmpv, "tmpv"); - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N)); + struct ggml_v3_tensor * Vcur = ggml_v3_transpose(ctx0, ggml_v3_reshape_2d(ctx0, tmpv, n_embd_gqa, N)); offload_func_v(Vcur); - ggml_set_name(Vcur, "Vcur"); + ggml_v3_set_name(Vcur, "Vcur"); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past)); + struct ggml_v3_tensor * k = ggml_v3_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_v3_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past)); offload_func_kq(k); - ggml_set_name(k, "k"); + ggml_v3_set_name(k, "k"); - struct ggml_tensor * 
v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v)); + struct ggml_v3_tensor * v = ggml_v3_view_2d(ctx0, kv_self.v, N, n_embd_gqa, + ( n_ctx)*ggml_v3_element_size(kv_self.v), + (il*n_ctx)*ggml_v3_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_v3_element_size(kv_self.v)); offload_func_v(v); - ggml_set_name(v, "v"); + ggml_v3_set_name(v, "v"); // important: storing RoPE-ed version of K in the KV cache! - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + ggml_v3_build_forward_expand(gf, ggml_v3_cpy(ctx0, Kcur, k)); + ggml_v3_build_forward_expand(gf, ggml_v3_cpy(ctx0, Vcur, v)); } - struct ggml_tensor * Q = - ggml_permute(ctx0, + struct ggml_v3_tensor * Q = + ggml_v3_permute(ctx0, Qcur, 0, 2, 1, 3); offload_func_kq(Q); - ggml_set_name(Q, "Q"); + ggml_v3_set_name(Q, "Q"); - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, + struct ggml_v3_tensor * K = + ggml_v3_view_3d(ctx0, kv_self.k, n_embd_head, n_past + N, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); + ggml_v3_element_size(kv_self.k)*n_embd_gqa, + ggml_v3_element_size(kv_self.k)*n_embd_head, + ggml_v3_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); offload_func_kq(K); - ggml_set_name(K, "K"); + ggml_v3_set_name(K, "K"); // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + struct ggml_v3_tensor * KQ = ggml_v3_mul_mat(ctx0, K, Q); offload_func_kq(KQ); - ggml_set_name(KQ, "KQ"); + ggml_v3_set_name(KQ, "KQ"); // KQ_scaled = KQ / sqrt(n_embd_head) // KQ_scaled shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale_float); + struct ggml_v3_tensor * KQ_scaled = ggml_v3_scale_inplace(ctx0, KQ, KQ_scale_float); offload_func_kq(KQ_scaled); - ggml_set_name(KQ_scaled, "KQ_scaled"); + ggml_v3_set_name(KQ_scaled, "KQ_scaled"); // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + struct ggml_v3_tensor * KQ_masked = ggml_v3_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); offload_func_kq(KQ_masked); - ggml_set_name(KQ_masked, "KQ_masked"); + ggml_v3_set_name(KQ_masked, "KQ_masked"); // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + struct ggml_v3_tensor * KQ_soft_max = ggml_v3_soft_max_inplace(ctx0, KQ_masked); offload_func_v(KQ_soft_max); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); + ggml_v3_set_name(KQ_soft_max, "KQ_soft_max"); // split cached V into n_head heads - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, + struct ggml_v3_tensor * V = + ggml_v3_view_3d(ctx0, kv_self.v, n_past + N, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); + ggml_v3_element_size(kv_self.v)*n_ctx, + ggml_v3_element_size(kv_self.v)*n_ctx*n_embd_head, + ggml_v3_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); offload_func_v(V); - ggml_set_name(V, "V"); + ggml_v3_set_name(V, "V"); #if 1 - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + struct ggml_v3_tensor * KQV = ggml_v3_mul_mat(ctx0, V, KQ_soft_max); offload_func_v(KQV); - ggml_set_name(KQV, "KQV"); + ggml_v3_set_name(KQV, "KQV"); #else // make V contiguous in memory to speed up the matmul, 
however we waste time on the copy // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation // is there a better way? - struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head)); - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max); + struct ggml_v3_tensor * V_cont = ggml_v3_cpy(ctx0, V, ggml_v3_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head)); + struct ggml_v3_tensor * KQV = ggml_v3_mul_mat(ctx0, V_cont, KQ_soft_max); #endif // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + struct ggml_v3_tensor * KQV_merged = ggml_v3_permute(ctx0, KQV, 0, 2, 1, 3); offload_func_v(KQV_merged); - ggml_set_name(KQV_merged, "KQV_merged"); + ggml_v3_set_name(KQV_merged, "KQV_merged"); // cur = KQV_merged.contiguous().view(n_embd, N) - cur = ggml_cpy(ctx0, + cur = ggml_v3_cpy(ctx0, KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + ggml_v3_new_tensor_2d(ctx0, GGML_V3_TYPE_F32, n_embd, N)); offload_func_v(cur); - ggml_set_name(cur, "KQV_merged_contiguous"); + ggml_v3_set_name(cur, "KQV_merged_contiguous"); // projection (no bias) - cur = ggml_mul_mat(ctx0, + cur = ggml_v3_mul_mat(ctx0, model.layers[il].wo, cur); offload_func(cur); - ggml_set_name(cur, "result_wo"); + ggml_v3_set_name(cur, "result_wo"); } lctx.use_buf(ctx0, 1); - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); + struct ggml_v3_tensor * inpFF = ggml_v3_add(ctx0, cur, inpSA); offload_func(inpFF); - ggml_set_name(inpFF, "inpFF"); + ggml_v3_set_name(inpFF, "inpFF"); // feed-forward network { // norm { - cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); + cur = ggml_v3_rms_norm(ctx0, inpFF, rms_norm_eps); offload_func(cur); - ggml_set_name(cur, "rms_norm_1"); + ggml_v3_set_name(cur, "rms_norm_1"); // cur = cur*ffn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); + cur = ggml_v3_mul(ctx0, cur, model.layers[il].ffn_norm); offload_func(cur); - ggml_set_name(cur, "ffn_norm"); + ggml_v3_set_name(cur, "ffn_norm"); } - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, + struct ggml_v3_tensor * tmp = ggml_v3_mul_mat(ctx0, model.layers[il].w3, cur); offload_func(tmp); - ggml_set_name(tmp, "result_w3"); + ggml_v3_set_name(tmp, "result_w3"); - cur = ggml_mul_mat(ctx0, + cur = ggml_v3_mul_mat(ctx0, model.layers[il].w1, cur); offload_func(cur); - ggml_set_name(cur, "result_w1"); + ggml_v3_set_name(cur, "result_w1"); // SILU activation - cur = ggml_silu(ctx0, cur); + cur = ggml_v3_silu(ctx0, cur); offload_func(cur); - ggml_set_name(cur, "silu"); + ggml_v3_set_name(cur, "silu"); - cur = ggml_mul(ctx0, cur, tmp); + cur = ggml_v3_mul(ctx0, cur, tmp); offload_func(cur); - ggml_set_name(cur, "silu_x_result_w3"); + ggml_v3_set_name(cur, "silu_x_result_w3"); - cur = ggml_mul_mat(ctx0, + cur = ggml_v3_mul_mat(ctx0, model.layers[il].w2, cur); offload_func(cur); - ggml_set_name(cur, "result_w2"); + ggml_v3_set_name(cur, "result_w2"); } - cur = ggml_add(ctx0, cur, inpFF); + cur = ggml_v3_add(ctx0, cur, inpFF); offload_func(cur); - ggml_set_name(cur, "inpFF_+_result_w2"); + ggml_v3_set_name(cur, "inpFF_+_result_w2"); // input for next layer inpL = cur; @@ -1792,41 +1771,41 @@ static struct ggml_cgraph * llama_v3_build_graph( // norm { - cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); + cur = ggml_v3_rms_norm(ctx0, inpL, rms_norm_eps); offload_func_nr(cur); - ggml_set_name(cur, "rms_norm_2"); + ggml_v3_set_name(cur, 
"rms_norm_2"); // cur = cur*norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.norm); + cur = ggml_v3_mul(ctx0, cur, model.norm); // offload_func_nr(cur); // TODO CPU + GPU mirrored backend - ggml_set_name(cur, "result_norm"); + ggml_v3_set_name(cur, "result_norm"); } // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); + cur = ggml_v3_mul_mat(ctx0, model.output, cur); + ggml_v3_set_name(cur, "result_output"); lctx.use_buf(ctx0, -1); // logits -> probs - //cur = ggml_soft_max_inplace(ctx0, cur); + //cur = ggml_v3_soft_max_inplace(ctx0, cur); - ggml_build_forward_expand(gf, cur); + ggml_v3_build_forward_expand(gf, cur); if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0)/N; + mem_per_token = ggml_v3_used_mem(ctx0)/N; } #if 0 LLAMA_V3_LOG_INFO("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__, - ggml_used_mem(ctx0)/1024.0/1024.0, + ggml_v3_used_mem(ctx0)/1024.0/1024.0, lctx.get_buf_max_mem(0)/1024.0/1024.0, lctx.get_buf_max_mem(1)/1024.0/1024.0, lctx.work_buffer.size()/1024.0/1024.0, n_past, N); #endif - ggml_free(ctx0); + ggml_v3_free(ctx0); return gf; } @@ -1858,11 +1837,9 @@ static bool llama_v3_eval_internal( // LLAMA_V3_ASSERT(n_tokens <= n_batch); // LLAMA_V3_ASSERT(n_past + n_tokens <= n_ctx); - const int64_t t_start_us = ggml_time_us(); + const int64_t t_start_us = ggml_v3_time_us(); + -#ifdef GGML_USE_MPI - ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads); -#endif const int N = n_tokens; @@ -1877,67 +1854,47 @@ static bool llama_v3_eval_internal( const int64_t n_vocab = hparams.n_vocab; #ifdef LLAMA_V3_USE_ALLOCATOR - ggml_allocr_reset(lctx.alloc); + ggml_v3_allocr_reset(lctx.alloc); #endif - ggml_cgraph * gf = llama_v3_build_graph(lctx, tokens, embd, n_tokens, n_past); + ggml_v3_cgraph * gf = llama_v3_build_graph(lctx, tokens, embd, n_tokens, n_past); #ifdef LLAMA_V3_USE_ALLOCATOR - ggml_allocr_alloc_graph(lctx.alloc, gf); + ggml_v3_allocr_alloc_graph(lctx.alloc, gf); #endif - // LLAMA_V3_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + // LLAMA_V3_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_v3_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); // for big prompts, if BLAS is enabled, it is better to use only one thread // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance - n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; + n_threads = N >= 32 && ggml_v3_cpu_has_blas() && !ggml_v3_cpu_has_gpublas() ? 
1 : n_threads; - struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; - struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; + struct ggml_v3_tensor * res = gf->nodes[gf->n_nodes - 1]; + struct ggml_v3_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; LLAMA_V3_ASSERT(strcmp(res->name, "result_output") == 0); LLAMA_V3_ASSERT(strcmp(embeddings->name, "result_norm") == 0); -#if GGML_USE_MPI - const int64_t n_layer = hparams.n_layer; - ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); -#endif -#ifdef GGML_USE_METAL - if (lctx.ctx_metal) { - ggml_metal_set_n_cb (lctx.ctx_metal, n_threads); - ggml_metal_graph_compute(lctx.ctx_metal, gf); - ggml_metal_get_tensor (lctx.ctx_metal, res); - if (!lctx.embedding.empty()) { - ggml_metal_get_tensor(lctx.ctx_metal, embeddings); - } - } else { - llv3_graph_compute_helper(lctx.work_buffer, gf, n_threads); - } -#else llv3_graph_compute_helper(lctx.work_buffer, gf, n_threads); -#endif -#if GGML_USE_MPI - ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer); -#endif // update kv token count lctx.kv_self.n = n_past + N; if (cgraph_fname) { - ggml_graph_export(gf, cgraph_fname); + ggml_v3_graph_export(gf, cgraph_fname); } -#ifdef GGML_PERF +#ifdef GGML_V3_PERF // print timing information per ggml operation (for debugging purposes) - // requires GGML_PERF to be defined - ggml_graph_print(gf); + // requires GGML_V3_PERF to be defined + ggml_v3_graph_print(gf); #endif // plot the computation graph in dot format (for debugging purposes) //if (n_past%100 == 0) { - // ggml_graph_dump_dot(gf, NULL, "llama.dot"); + // ggml_v3_graph_dump_dot(gf, NULL, "llama.dot"); //} // extract logits @@ -1946,11 +1903,11 @@ static bool llama_v3_eval_internal( if (lctx.logits_all) { logits_out.resize(n_vocab * N); - memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N); + memcpy(logits_out.data(), (float *) ggml_v3_get_data(res), sizeof(float)*n_vocab*N); } else { // return result for just the last token logits_out.resize(n_vocab); - memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + memcpy(logits_out.data(), (float *) ggml_v3_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab); } } @@ -1959,16 +1916,16 @@ static bool llama_v3_eval_internal( auto & embedding_out = lctx.embedding; embedding_out.resize(n_embd); - memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd); + memcpy(embedding_out.data(), (float *) ggml_v3_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd); } // measure the performance only for the single-token evals if (N == 1) { - lctx.t_eval_us += ggml_time_us() - t_start_us; + lctx.t_eval_us += ggml_v3_time_us() - t_start_us; lctx.n_eval++; } else if (N > 1) { - lctx.t_p_eval_us += ggml_time_us() - t_start_us; + lctx.t_p_eval_us += ggml_v3_time_us() - t_start_us; lctx.n_p_eval += N; } @@ -2127,7 +2084,7 @@ std::vector llama_v3_tokenize( if (n_tokens < 0) { result.resize(-n_tokens); int check = llama_v3_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos); - GGML_ASSERT(check == -n_tokens); + GGML_V3_ASSERT(check == -n_tokens); } else { result.resize(n_tokens); } @@ -2532,7 +2489,7 @@ void llama_v3_grammar_free(struct llama_v3_grammar * grammar) { void llama_v3_sample_softmax(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates) { assert(candidates->size > 0); - const int64_t t_start_sample_us = ggml_time_us(); + const int64_t t_start_sample_us = ggml_v3_time_us(); // Sort the logits 
in descending order if (!candidates->sorted) { @@ -2554,12 +2511,12 @@ void llama_v3_sample_softmax(struct llama_v3_context * ctx, llama_v3_token_data_ } if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v3_time_us() - t_start_sample_us; } } void llama_v3_sample_top_k(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates, int k, size_t min_keep) { - const int64_t t_start_sample_us = ggml_time_us(); + const int64_t t_start_sample_us = ggml_v3_time_us(); k = std::max(k, (int) min_keep); k = std::min(k, (int) candidates->size); @@ -2579,7 +2536,7 @@ void llama_v3_sample_top_k(struct llama_v3_context * ctx, llama_v3_token_data_ar candidates->size = k; if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v3_time_us() - t_start_sample_us; } } @@ -2590,7 +2547,7 @@ void llama_v3_sample_top_p(struct llama_v3_context * ctx, llama_v3_token_data_ar llama_v3_sample_softmax(ctx, candidates); - const int64_t t_start_sample_us = ggml_time_us(); + const int64_t t_start_sample_us = ggml_v3_time_us(); // Compute the cumulative probabilities float cum_sum = 0.0f; @@ -2611,7 +2568,7 @@ void llama_v3_sample_top_p(struct llama_v3_context * ctx, llama_v3_token_data_ar candidates->size = last_idx; if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v3_time_us() - t_start_sample_us; } } @@ -2621,7 +2578,7 @@ void llama_v3_sample_tail_free(struct llama_v3_context * ctx, llama_v3_token_dat } llama_v3_sample_softmax(nullptr, candidates); - const int64_t t_start_sample_us = ggml_time_us(); + const int64_t t_start_sample_us = ggml_v3_time_us(); // Compute the first and second derivatives std::vector first_derivatives(candidates->size - 1); @@ -2670,7 +2627,7 @@ void llama_v3_sample_tail_free(struct llama_v3_context * ctx, llama_v3_token_dat candidates->size = last_idx; if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v3_time_us() - t_start_sample_us; } } @@ -2685,7 +2642,7 @@ void llama_v3_sample_typical(struct llama_v3_context * ctx, llama_v3_token_data_ // Compute the softmax of logits and calculate entropy llama_v3_sample_softmax(nullptr, candidates); - const int64_t t_start_sample_us = ggml_time_us(); + const int64_t t_start_sample_us = ggml_v3_time_us(); float entropy = 0.0f; for (size_t i = 0; i < candidates->size; ++i) { @@ -2737,19 +2694,19 @@ void llama_v3_sample_typical(struct llama_v3_context * ctx, llama_v3_token_data_ candidates->size = new_candidates.size(); if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v3_time_us() - t_start_sample_us; } } void llama_v3_sample_temperature(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates_p, float temp) { - const int64_t t_start_sample_us = ggml_time_us(); + const int64_t t_start_sample_us = ggml_v3_time_us(); for (size_t i = 0; i < candidates_p->size; ++i) { candidates_p->data[i].logit /= temp; } if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v3_time_us() - t_start_sample_us; } } @@ -2758,7 +2715,7 @@ void llama_v3_sample_repetition_penalty(struct llama_v3_context * ctx, llama_v3_ return; } - const int64_t t_start_sample_us = ggml_time_us(); + const int64_t t_start_sample_us = ggml_v3_time_us(); for (size_t i = 0; i < candidates->size; ++i) { const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id); @@ -2778,7 
+2735,7 @@ void llama_v3_sample_repetition_penalty(struct llama_v3_context * ctx, llama_v3_ candidates->sorted = false; if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v3_time_us() - t_start_sample_us; } } @@ -2787,7 +2744,7 @@ void llama_v3_sample_frequency_and_presence_penalties(struct llama_v3_context * return; } - const int64_t t_start_sample_us = ggml_time_us(); + const int64_t t_start_sample_us = ggml_v3_time_us(); // Create a frequency map to count occurrences of each token in last_tokens std::unordered_map token_count; @@ -2809,13 +2766,13 @@ void llama_v3_sample_frequency_and_presence_penalties(struct llama_v3_context * candidates->sorted = false; if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v3_time_us() - t_start_sample_us; } } void llama_v3_sample_grammar(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates, const struct llama_v3_grammar * grammar) { assert(ctx); - const int64_t t_start_sample_us = ggml_time_us(); + const int64_t t_start_sample_us = ggml_v3_time_us(); bool allow_eos = false; for (const auto & stack : grammar->stacks) { @@ -2853,7 +2810,7 @@ void llama_v3_sample_grammar(struct llama_v3_context * ctx, llama_v3_token_data_ candidates->data[reject.index].logit = -INFINITY; } - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v3_time_us() - t_start_sample_us; } static void llama_v3_log_softmax(float * array, size_t size) { @@ -2875,7 +2832,7 @@ void llama_v3_sample_classifier_free_guidance( llama_v3_token_data_array * candidates, struct llama_v3_context * guidance_ctx, float scale) { - int64_t t_start_sample_us = ggml_time_us(); + int64_t t_start_sample_us = ggml_v3_time_us(); assert(ctx); auto n_vocab = llama_v3_n_vocab(ctx); @@ -2899,7 +2856,7 @@ void llama_v3_sample_classifier_free_guidance( } if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v3_time_us() - t_start_sample_us; } } @@ -2907,7 +2864,7 @@ llama_v3_token llama_v3_sample_token_mirostat(struct llama_v3_context * ctx, lla assert(ctx); auto N = float(llama_v3_n_vocab(ctx)); int64_t t_start_sample_us; - t_start_sample_us = ggml_time_us(); + t_start_sample_us = ggml_v3_time_us(); llama_v3_sample_softmax(nullptr, candidates); @@ -2930,10 +2887,10 @@ llama_v3_token llama_v3_sample_token_mirostat(struct llama_v3_context * ctx, lla // Sample the next word X using top-k sampling llama_v3_sample_top_k(nullptr, candidates, int(k), 1); if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v3_time_us() - t_start_sample_us; } llama_v3_token X = llama_v3_sample_token(ctx, candidates); - t_start_sample_us = ggml_time_us(); + t_start_sample_us = ggml_v3_time_us(); // Compute error as the difference between observed surprise and target surprise value size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_v3_token_data & candidate) { @@ -2946,14 +2903,14 @@ llama_v3_token llama_v3_sample_token_mirostat(struct llama_v3_context * ctx, lla *mu = *mu - eta * e; if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v3_time_us() - t_start_sample_us; } return X; } llama_v3_token llama_v3_sample_token_mirostat_v2(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates, float tau, float eta, float * mu) { int64_t t_start_sample_us; - t_start_sample_us = ggml_time_us(); 
+ t_start_sample_us = ggml_v3_time_us(); llama_v3_sample_softmax(ctx, candidates); @@ -2967,7 +2924,7 @@ llama_v3_token llama_v3_sample_token_mirostat_v2(struct llama_v3_context * ctx, } if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v3_time_us() - t_start_sample_us; } // Normalize the probabilities of the remaining words @@ -2975,7 +2932,7 @@ llama_v3_token llama_v3_sample_token_mirostat_v2(struct llama_v3_context * ctx, // Sample the next word X from the remaining words llama_v3_token X = llama_v3_sample_token(ctx, candidates); - t_start_sample_us = ggml_time_us(); + t_start_sample_us = ggml_v3_time_us(); // Compute error as the difference between observed surprise and target surprise value size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_v3_token_data & candidate) { @@ -2988,13 +2945,13 @@ llama_v3_token llama_v3_sample_token_mirostat_v2(struct llama_v3_context * ctx, *mu = *mu - eta * e; if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v3_time_us() - t_start_sample_us; } return X; } llama_v3_token llama_v3_sample_token_greedy(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates) { - const int64_t t_start_sample_us = ggml_time_us(); + const int64_t t_start_sample_us = ggml_v3_time_us(); // Find max element auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_v3_token_data & a, const llama_v3_token_data & b) { @@ -3003,7 +2960,7 @@ llama_v3_token llama_v3_sample_token_greedy(struct llama_v3_context * ctx, llama llama_v3_token result = max_iter->id; if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v3_time_us() - t_start_sample_us; ctx->n_sample++; } return result; @@ -3011,7 +2968,7 @@ llama_v3_token llama_v3_sample_token_greedy(struct llama_v3_context * ctx, llama llama_v3_token llama_v3_sample_token(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates) { assert(ctx); - const int64_t t_start_sample_us = ggml_time_us(); + const int64_t t_start_sample_us = ggml_v3_time_us(); llama_v3_sample_softmax(nullptr, candidates); std::vector probs; @@ -3026,13 +2983,13 @@ llama_v3_token llama_v3_sample_token(struct llama_v3_context * ctx, llama_v3_tok llama_v3_token result = candidates->data[idx].id; - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v3_time_us() - t_start_sample_us; ctx->n_sample++; return result; } void llama_v3_grammar_accept_token(struct llama_v3_context * ctx, struct llama_v3_grammar * grammar, llama_v3_token token) { - const int64_t t_start_sample_us = ggml_time_us(); + const int64_t t_start_sample_us = ggml_v3_time_us(); if (token == llama_v3_token_eos()) { for (const auto & stack : grammar->stacks) { @@ -3054,7 +3011,7 @@ void llama_v3_grammar_accept_token(struct llama_v3_context * ctx, struct llama_v grammar->partial_utf8 = decoded.second; LLAMA_V3_ASSERT(!grammar->stacks.empty()); - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v3_time_us() - t_start_sample_us; } // @@ -3067,20 +3024,20 @@ static void llama_v3_convert_tensor_internal(const llama_v3_load_tensor & tensor } float * f32_output = (float *) output.addr; - ggml_type_traits_t qtype; - if (ggml_is_quantized(tensor.type)) { - qtype = ggml_internal_get_type_traits(tensor.type); + ggml_v3_type_traits_t qtype; + if 
(ggml_v3_is_quantized(tensor.type)) { + qtype = ggml_v3_internal_get_type_traits(tensor.type); if (qtype.to_float == NULL) { - throw std::runtime_error(format_old("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type))); + throw std::runtime_error(format_old("type %s unsupported for integer quantization: no dequantization available", ggml_v3_type_name(tensor.type))); } - } else if (tensor.type != GGML_TYPE_F16) { - throw std::runtime_error(format_old("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type))); + } else if (tensor.type != GGML_V3_TYPE_F16) { + throw std::runtime_error(format_old("cannot dequantize/convert tensor type %s", ggml_v3_type_name(tensor.type))); } if (nthread < 2) { - if (tensor.type == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements); - } else if (ggml_is_quantized(tensor.type)) { + if (tensor.type == GGML_V3_TYPE_F16) { + ggml_v3_fp16_to_fp32_row((ggml_v3_fp16_t *)tensor.data, f32_output, nelements); + } else if (ggml_v3_is_quantized(tensor.type)) { qtype.to_float(tensor.data, f32_output, nelements); } else { LLAMA_V3_ASSERT(false); // unreachable @@ -3088,8 +3045,8 @@ static void llama_v3_convert_tensor_internal(const llama_v3_load_tensor & tensor return; } - auto block_size = tensor.type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor.type); - auto block_size_bytes = ggml_type_size(tensor.type); + auto block_size = tensor.type == GGML_V3_TYPE_F16 ? 1 : (size_t)ggml_v3_blck_size(tensor.type); + auto block_size_bytes = ggml_v3_type_size(tensor.type); LLAMA_V3_ASSERT(nelements % block_size == 0); auto nblocks = nelements / block_size; @@ -3102,9 +3059,9 @@ static void llama_v3_convert_tensor_internal(const llama_v3_load_tensor & tensor auto thr_elems = thr_blocks * block_size; // number of elements for this thread auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread - auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) { - if (typ == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels); + auto compute = [qtype] (ggml_v3_type typ, uint8_t * inbuf, float * outbuf, int nels) { + if (typ == GGML_V3_TYPE_F16) { + ggml_v3_fp16_to_fp32_row((ggml_v3_fp16_t *)inbuf, outbuf, nels); } else { qtype.to_float(inbuf, outbuf, nels); } @@ -3120,30 +3077,30 @@ static void llama_v3_convert_tensor_internal(const llama_v3_load_tensor & tensor } static void llama_v3_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_v3_model_quantize_params * params) { - ggml_type quantized_type; + ggml_v3_type quantized_type; llama_v3_ftype ftype = params->ftype; int nthread = params->nthread; switch (params->ftype) { - case LLAMA_V3_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break; - case LLAMA_V3_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break; - case LLAMA_V3_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break; - case LLAMA_V3_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break; - case LLAMA_V3_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break; - case LLAMA_V3_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break; - case LLAMA_V3_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break; + case LLAMA_V3_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_V3_TYPE_Q4_0; break; + case LLAMA_V3_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_V3_TYPE_Q4_1; break; + case LLAMA_V3_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_V3_TYPE_Q5_0; 
break; + case LLAMA_V3_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_V3_TYPE_Q5_1; break; + case LLAMA_V3_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_V3_TYPE_Q8_0; break; + case LLAMA_V3_FTYPE_MOSTLY_F16: quantized_type = GGML_V3_TYPE_F16; break; + case LLAMA_V3_FTYPE_ALL_F32: quantized_type = GGML_V3_TYPE_F32; break; #ifdef GGML_USE_K_QUANTS // K-quants - case LLAMA_V3_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break; + case LLAMA_V3_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_V3_TYPE_Q2_K; break; case LLAMA_V3_FTYPE_MOSTLY_Q3_K_S: case LLAMA_V3_FTYPE_MOSTLY_Q3_K_M: - case LLAMA_V3_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break; + case LLAMA_V3_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_V3_TYPE_Q3_K; break; case LLAMA_V3_FTYPE_MOSTLY_Q4_K_S: - case LLAMA_V3_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break; + case LLAMA_V3_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_V3_TYPE_Q4_K; break; case LLAMA_V3_FTYPE_MOSTLY_Q5_K_S: - case LLAMA_V3_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break; - case LLAMA_V3_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break; + case LLAMA_V3_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_V3_TYPE_Q5_K; break; + case LLAMA_V3_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_V3_TYPE_Q6_K; break; #endif default: throw std::runtime_error(format_old("invalid output file type %d\n", ftype)); } @@ -3192,7 +3149,7 @@ static void llama_v3_model_quantize_internal(const std::string & fname_inp, cons LLAMA_V3_LOG_INFO("[%4zu/%4zu] %36s - %16s, type = %6s, ", ++idx, model_loader->tensors_map.tensors.size(), tensor.name.c_str(), llama_v3_format_tensor_shape(tensor.ne).c_str(), - ggml_type_name(tensor.type)); + ggml_v3_type_name(tensor.type)); // This used to be a regex, but has an extreme cost to compile times. bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'? 
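// A minimal standalone sketch of the suffix test kept in the hunk above
// (tensor.name.rfind("weight") == tensor.name.size() - 6), which replaced a
// regex purely for compile-time cost. Assumes only the C++ standard library;
// ends_with_weight is an illustrative helper name, not part of the patch.
#include <cassert>
#include <string>

static bool ends_with_weight(const std::string & name) {
    static const std::string suffix = "weight";
    return name.size() >= suffix.size() &&
           name.compare(name.size() - suffix.size(), suffix.size(), suffix) == 0;
}

int main() {
    assert( ends_with_weight("layers.0.attention.wv.weight"));
    assert(!ends_with_weight("tok_embeddings.bias"));
    return 0;
}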
@@ -3202,7 +3159,7 @@ static void llama_v3_model_quantize_internal(const std::string & fname_inp, cons quantize &= params->quantize_output_tensor || tensor.name != "output.weight"; quantize &= quantized_type != tensor.type; - enum ggml_type new_type; + enum ggml_v3_type new_type; void * new_data; size_t new_size; llama_v3_buffer work; @@ -3219,30 +3176,30 @@ static void llama_v3_model_quantize_internal(const std::string & fname_inp, cons int nx = tensor.ne.at(0); int ny = tensor.ne.at(1); if (nx % QK_K == 0 && ny % QK_K == 0) { - new_type = GGML_TYPE_Q6_K; + new_type = GGML_V3_TYPE_Q6_K; } } else if (tensor.name.find("attention.wv.weight") != std::string::npos) { - if (ftype == LLAMA_V3_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_V3_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; - else if (ftype == LLAMA_V3_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; + if (ftype == LLAMA_V3_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_V3_FTYPE_MOSTLY_Q2_K) new_type = GGML_V3_TYPE_Q4_K; + else if (ftype == LLAMA_V3_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_V3_TYPE_Q5_K; else if ((ftype == LLAMA_V3_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_V3_FTYPE_MOSTLY_Q5_K_M) && - use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K; + use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_V3_TYPE_Q6_K; else if (QK_K == 64 && (ftype == LLAMA_V3_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_V3_FTYPE_MOSTLY_Q3_K_S) && - (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K; + (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_V3_TYPE_Q6_K; ++i_attention_wv; } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) { - if (ftype == LLAMA_V3_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_V3_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; - else if (ftype == LLAMA_V3_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; + if (ftype == LLAMA_V3_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_V3_FTYPE_MOSTLY_Q2_K) new_type = GGML_V3_TYPE_Q4_K; + else if (ftype == LLAMA_V3_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_V3_TYPE_Q5_K; else if ((ftype == LLAMA_V3_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_V3_FTYPE_MOSTLY_Q5_K_M) && - use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; - //else if (ftype == LLAMA_V3_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K; + use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_V3_TYPE_Q6_K; + //else if (ftype == LLAMA_V3_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_V3_TYPE_Q6_K; ++i_feed_forward_w2; } else if (tensor.name.find("attention.wo.weight") != std::string::npos) { - if (ftype == LLAMA_V3_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_V3_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; - else if (ftype == LLAMA_V3_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; + if (ftype == LLAMA_V3_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_V3_FTYPE_MOSTLY_Q2_K) new_type = GGML_V3_TYPE_Q4_K; + else if (ftype == LLAMA_V3_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_V3_TYPE_Q5_K; } bool convert_incompatible_tensor = false; - if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || - new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) { + if (new_type == GGML_V3_TYPE_Q2_K || new_type == GGML_V3_TYPE_Q3_K || new_type == GGML_V3_TYPE_Q4_K || + new_type == GGML_V3_TYPE_Q5_K || new_type == GGML_V3_TYPE_Q6_K) { int nx = tensor.ne.at(0); int ny = tensor.ne.at(1); if (nx % QK_K != 0 || ny % QK_K != 0) { 
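// A minimal sketch of the compatibility test in the hunk above: the k-quant
// formats pack weights into super-blocks of QK_K values (256 by default; the
// hunk also handles a QK_K == 64 build), so both tensor dimensions must be
// multiples of QK_K or the quantizer falls back to another type. The constant
// and function names here are illustrative only.
#include <cstdio>

static const int QK_K_SKETCH = 256; // assumed default super-block size

static bool kquant_compatible(int nx, int ny) {
    return (nx % QK_K_SKETCH == 0) && (ny % QK_K_SKETCH == 0);
}

int main() {
    printf("4096 x 4096 -> %d\n", kquant_compatible(4096, 4096)); // ok
    printf("3200 x 4096 -> %d\n", kquant_compatible(3200, 4096)); // needs fallback
    return 0;
}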
@@ -3252,10 +3209,10 @@ static void llama_v3_model_quantize_internal(const std::string & fname_inp, cons } if (convert_incompatible_tensor) { if (tensor.name == "output.weight") { - new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing. + new_type = GGML_V3_TYPE_F16; //fall back to F16 instead of just failing. LLAMA_V3_LOG_WARN("F16 will be used for this tensor instead.\n"); } else if (tensor.name == "tok_embeddings.weight") { - new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing. + new_type = GGML_V3_TYPE_Q4_0; //fall back to Q4_0 instead of just failing. LLAMA_V3_LOG_WARN("Q4_0 will be used for this tensor instead.\n"); } else { throw std::runtime_error("Unsupported tensor size encountered\n"); @@ -3267,16 +3224,16 @@ static void llama_v3_model_quantize_internal(const std::string & fname_inp, cons size_t nelements = tensor.ne.at(0) * tensor.ne.at(1); llama_v3_buffer f32_conv_buf; - if (tensor.type == GGML_TYPE_F32) { + if (tensor.type == GGML_V3_TYPE_F32) { f32_data = (float *) tensor.data; - } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) { - throw std::runtime_error(format_old("requantizing from type %s is disabled", ggml_type_name(tensor.type))); + } else if (ggml_v3_is_quantized(tensor.type) && !params->allow_requantize) { + throw std::runtime_error(format_old("requantizing from type %s is disabled", ggml_v3_type_name(tensor.type))); } else { llama_v3_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread); f32_data = (float *) f32_conv_buf.addr; } - LLAMA_V3_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type)); + LLAMA_V3_LOG_INFO("quantizing to %s .. ", ggml_v3_type_name(new_type)); fflush(stdout); work.resize(nelements * 4); // upper bound on size @@ -3287,7 +3244,7 @@ static void llama_v3_model_quantize_internal(const std::string & fname_inp, cons const int nchunk = (nelements + chunk_size - 1)/chunk_size; const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1; if (nthread_use < 2) { - new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data()); + new_size = ggml_v3_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data()); } else { size_t counter = 0; new_size = 0; @@ -3311,7 +3268,7 @@ static void llama_v3_model_quantize_internal(const std::string & fname_inp, cons if (local_hist.empty()) { local_hist.resize(hist_cur.size(), 0); } - local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data()); + local_size += ggml_v3_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data()); } }; if ((int) workers.size() < nthread_use - 1) { @@ -3373,11 +3330,11 @@ static void llama_v3_model_quantize_internal(const std::string & fname_inp, cons struct llama_v3_model * llama_v3_load_model_from_file( const char * path_model, struct llama_v3_context_params params) { - ggml_time_init(); + ggml_v3_time_init(); llama_v3_model * model = new llama_v3_model; - ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; + ggml_v3_type memory_type = params.f16_kv ? 
GGML_V3_TYPE_F16 : GGML_V3_TYPE_F32; if (!llama_v3_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram, @@ -3430,7 +3387,7 @@ struct llama_v3_context * llama_v3_new_context_with_model( ctx->rng = std::mt19937(params.seed); ctx->logits_all = params.logits_all; - ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; + ggml_v3_type memory_type = params.f16_kv ? GGML_V3_TYPE_F16 : GGML_V3_TYPE_F32; // reserve memory for context buffers if (!params.vocab_only) { @@ -3441,7 +3398,7 @@ struct llama_v3_context * llama_v3_new_context_with_model( } { - const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v); + const size_t memory_size = ggml_v3_nbytes(ctx->kv_self.k) + ggml_v3_nbytes(ctx->kv_self.v); LLAMA_V3_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); } @@ -3462,30 +3419,19 @@ struct llama_v3_context * llama_v3_new_context_with_model( { static const size_t tensor_alignment = 32; // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data - ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead()); + ctx->buf_compute.resize(ggml_v3_tensor_overhead()*GGML_V3_MAX_NODES + ggml_v3_graph_overhead()); // create measure allocator - ctx->alloc = ggml_allocr_new_measure(tensor_alignment); + ctx->alloc = ggml_v3_allocr_new_measure(tensor_alignment); // build worst-case graph int n_tokens = std::min((int)hparams.n_ctx, params.n_batch); int n_past = hparams.n_ctx - n_tokens; llama_v3_token token = llama_v3_token_bos(); // not actually used by llama_v3_build_graph, but required to choose between token and embedding inputs graph - ggml_cgraph * gf = llama_v3_build_graph(*ctx, &token, NULL, n_tokens, n_past); -#ifdef GGML_USE_METAL - if (params.n_gpu_layers > 0) { - ctx->ctx_metal = ggml_metal_init(1); - if (!ctx->ctx_metal) { - LLAMA_V3_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__); - llama_v3_free(ctx); - return NULL; - } - ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false); - ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal)); - } -#endif + ggml_v3_cgraph * gf = llama_v3_build_graph(*ctx, &token, NULL, n_tokens, n_past); + // measure memory requirements for the graph - size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment; + size_t alloc_size = ggml_v3_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment; LLAMA_V3_LOG_INFO("%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0); @@ -3497,18 +3443,14 @@ struct llama_v3_context * llama_v3_new_context_with_model( //LLAMA_V3_LOG_INFO("%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0); // recreate allocator with exact memory requirements - ggml_allocr_free(ctx->alloc); + ggml_v3_allocr_free(ctx->alloc); ctx->buf_alloc.resize(alloc_size); - ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment); -#ifdef GGML_USE_METAL - if (ctx->ctx_metal) { - ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal)); - } -#endif + ctx->alloc = ggml_v3_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, 
tensor_alignment); + } #else - ctx->buf_compute.resize(blasbatchmul*MEM_REQ_EVAL_3().at(ctx->model.type) + ggml_graph_overhead()); + ctx->buf_compute.resize(blasbatchmul*MEM_REQ_EVAL_3().at(ctx->model.type) + ggml_v3_graph_overhead()); #endif #ifdef LLAMA_V3_USE_SCRATCH @@ -3517,54 +3459,6 @@ struct llama_v3_context * llama_v3_new_context_with_model( #endif } -#ifdef GGML_USE_METAL - if (params.n_gpu_layers > 0) { - // this allocates all Metal resources and memory buffers - - void * data_ptr = NULL; - size_t data_size = 0; - - if (params.use_mmap) { - data_ptr = ctx->model.mapping->addr; - data_size = ctx->model.mapping->size; - } else { - data_ptr = ggml_get_mem_buffer(ctx->model.ctx); - data_size = ggml_get_mem_size (ctx->model.ctx); - } - - const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx); - - LLAMA_V3_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); - -#define LLAMA_V3_METAL_CHECK_BUF(result) \ - if (!(result)) { \ - LLAMA_V3_LOG_ERROR("%s: failed to add buffer\n", __func__); \ - llama_v3_free(ctx); \ - return NULL; \ - } - - LLAMA_V3_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size)); - - LLAMA_V3_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0)); - LLAMA_V3_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0)); - - LLAMA_V3_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.addr, ctx->buf_alloc.size, 0)); -#undef LLAMA_V3_METAL_CHECK_BUF - } -#endif - -#ifdef GGML_USE_MPI - ctx->ctx_mpi = ggml_mpi_init(); - - if (ggml_mpi_rank(ctx->ctx_mpi) > 0) { - // Enter a blocking eval loop with dummy input, letting rank=0 drive the process - const std::vector tmp(ctx->model.hparams.n_ctx, llama_v3_token_bos()); - while (!llama_v3_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {}; - llama_v3_backend_free(); - exit(1); - } -#endif - return ctx; } @@ -3601,7 +3495,7 @@ int llama_v3_model_quantize( int llama_v3_apply_lora_from_file_internal(const struct llama_v3_model & model, const char * path_lora, const char * path_base_model, int n_threads) { LLAMA_V3_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); - const int64_t t_start_lora_us = ggml_time_us(); + const int64_t t_start_lora_us = ggml_v3_time_us(); auto fin = std::ifstream(path_lora, std::ios::binary); if (!fin) { @@ -3638,16 +3532,16 @@ int llama_v3_apply_lora_from_file_internal(const struct llama_v3_model & model, // create a temporary ggml context to store the lora tensors // todo: calculate size from biggest possible tensor std::vector lora_buf(1024ull * 1024ull * 1024ull); - struct ggml_init_params params; + struct ggml_v3_init_params params; params.mem_size = lora_buf.size(); params.mem_buffer = lora_buf.data(); params.no_alloc = false; - ggml_context * lora_ctx = ggml_init(params); - std::unordered_map lora_tensors; + ggml_v3_context * lora_ctx = ggml_v3_init(params); + std::unordered_map lora_tensors; // create a name -> tensor map of the model to accelerate lookups - std::unordered_map model_tensors; + std::unordered_map model_tensors; for (const auto & kv: model.tensors_by_name) { model_tensors.insert(kv); } @@ -3655,7 +3549,7 @@ int llama_v3_apply_lora_from_file_internal(const struct llama_v3_model & model, // load base model std::unique_ptr model_loader; - ggml_context * base_ctx = NULL; + ggml_v3_context * base_ctx = NULL; llama_v3_buffer base_buf; if 
(path_base_model) { LLAMA_V3_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); @@ -3666,18 +3560,18 @@ int llama_v3_apply_lora_from_file_internal(const struct llama_v3_model & model, model_loader->calc_sizes(&ctx_size, &mmapped_size); base_buf.resize(ctx_size); - ggml_init_params base_params; + ggml_v3_init_params base_params; base_params.mem_size = base_buf.size; base_params.mem_buffer = base_buf.addr; base_params.no_alloc = model_loader->use_mmap; - base_ctx = ggml_init(base_params); + base_ctx = ggml_v3_init(base_params); - model_loader->ggml_ctx = base_ctx; + model_loader->ggml_v3_ctx = base_ctx; // maybe this should in llama_v3_model_loader if (model_loader->use_mmap) { - model_loader->mapping.reset(new llama_v3_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa())); + model_loader->mapping.reset(new llama_v3_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_v3_is_numa())); } } @@ -3730,10 +3624,10 @@ int llama_v3_apply_lora_from_file_internal(const struct llama_v3_model & model, } // create ggml tensor - ggml_type wtype; + ggml_v3_type wtype; switch (ftype) { - case 0: wtype = GGML_TYPE_F32; break; - case 1: wtype = GGML_TYPE_F16; break; + case 0: wtype = GGML_V3_TYPE_F32; break; + case 1: wtype = GGML_V3_TYPE_F16; break; default: { LLAMA_V3_LOG_ERROR("%s: invalid tensor data type '%d'\n", @@ -3741,19 +3635,19 @@ int llama_v3_apply_lora_from_file_internal(const struct llama_v3_model & model, return false; } } - ggml_tensor * lora_tensor; + ggml_v3_tensor * lora_tensor; if (n_dims == 2) { - lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]); + lora_tensor = ggml_v3_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]); } else { LLAMA_V3_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims); return 1; } - ggml_set_name(lora_tensor, "lora_tensor"); + ggml_v3_set_name(lora_tensor, "lora_tensor"); // load tensor data size_t offset = fin.tellg(); - size_t tensor_data_size = ggml_nbytes(lora_tensor); + size_t tensor_data_size = ggml_v3_nbytes(lora_tensor); offset = (offset + 31) & -32; fin.seekg(offset); fin.read((char*)lora_tensor->data, tensor_data_size); @@ -3764,26 +3658,26 @@ int llama_v3_apply_lora_from_file_internal(const struct llama_v3_model & model, if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() && lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) { - ggml_tensor * dest_t = model_tensors[base_name]; + ggml_v3_tensor * dest_t = model_tensors[base_name]; - offload_func_t offload_func = llama_v3_nop; - offload_func_t offload_func_force_inplace = llama_v3_nop; + offload_func_v3_t offload_func = llama_v3_nop; + offload_func_v3_t offload_func_force_inplace = llama_v3_nop; #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) - if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) { - if (dest_t->type != GGML_TYPE_F16) { + if (dest_t->backend == GGML_V3_BACKEND_GPU || dest_t->backend == GGML_V3_BACKEND_GPU_SPLIT) { + if (dest_t->type != GGML_V3_TYPE_F16) { printf("\nError: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models\n"); throw std::runtime_error(format_old( "%s: error: lora failed", __func__)); } #if defined(GGML_USE_CUBLAS) - offload_func = ggml_cuda_assign_buffers; - offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace; + offload_func = ggml_v3_cuda_assign_buffers; + offload_func_force_inplace = ggml_v3_cuda_assign_buffers_force_inplace; #endif } #endif // GGML_USE_CUBLAS - ggml_tensor * 
base_t; + ggml_v3_tensor * base_t; if (model_loader) { // load from base model if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) { @@ -3792,16 +3686,16 @@ int llama_v3_apply_lora_from_file_internal(const struct llama_v3_model & model, } size_t idx = model_loader->tensors_map.name_to_idx[base_name]; llama_v3_load_tensor & lt = model_loader->tensors_map.tensors[idx]; - base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU); - lt.data = (uint8_t *) lt.ggml_tensor->data; + base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_V3_BACKEND_CPU); + lt.data = (uint8_t *) lt.ggml_v3_tensor->data; model_loader->load_data_for(lt); - lt.ggml_tensor->data = lt.data; + lt.ggml_v3_tensor->data = lt.data; } else { base_t = dest_t; } - if (ggml_is_quantized(base_t->type)) { + if (ggml_v3_is_quantized(base_t->type)) { if (!warned) { LLAMA_V3_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, " "use a f16 or f32 base model with --lora-base\n", __func__); @@ -3809,13 +3703,13 @@ int llama_v3_apply_lora_from_file_internal(const struct llama_v3_model & model, } } - ggml_tensor * loraA = lora_tensors[base_name + ".loraA"]; - GGML_ASSERT(loraA->type == GGML_TYPE_F32); - ggml_set_name(loraA, "loraA"); + ggml_v3_tensor * loraA = lora_tensors[base_name + ".loraA"]; + GGML_V3_ASSERT(loraA->type == GGML_V3_TYPE_F32); + ggml_v3_set_name(loraA, "loraA"); - ggml_tensor * loraB = lora_tensors[base_name + ".loraB"]; - GGML_ASSERT(loraB->type == GGML_TYPE_F32); - ggml_set_name(loraB, "loraB"); + ggml_v3_tensor * loraB = lora_tensors[base_name + ".loraB"]; + GGML_V3_ASSERT(loraB->type == GGML_V3_TYPE_F32); + ggml_v3_set_name(loraB, "loraB"); if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { LLAMA_V3_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" @@ -3824,43 +3718,43 @@ int llama_v3_apply_lora_from_file_internal(const struct llama_v3_model & model, } // w = w + BA*s - ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); + ggml_v3_tensor * BA = ggml_v3_mul_mat(lora_ctx, loraA, loraB); offload_func(BA); - ggml_set_name(BA, "BA"); + ggml_v3_set_name(BA, "BA"); if (scaling != 1.0f) { - ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); - ggml_set_name(scale_tensor, "scale_tensor"); + ggml_v3_tensor * scale_tensor = ggml_v3_new_f32(lora_ctx, scaling); + ggml_v3_set_name(scale_tensor, "scale_tensor"); - BA = ggml_scale_inplace(lora_ctx, BA, scaling); + BA = ggml_v3_scale_inplace(lora_ctx, BA, scaling); offload_func(BA); - ggml_set_name(BA, "BA_scaled"); + ggml_v3_set_name(BA, "BA_scaled"); } - ggml_tensor * r; + ggml_v3_tensor * r; if (base_t == dest_t) { - r = ggml_add_inplace(lora_ctx, dest_t, BA); + r = ggml_v3_add_inplace(lora_ctx, dest_t, BA); offload_func_force_inplace(r); - ggml_set_name(r, "r_add_inplace"); + ggml_v3_set_name(r, "r_add_inplace"); } else { - r = ggml_add(lora_ctx, base_t, BA); + r = ggml_v3_add(lora_ctx, base_t, BA); offload_func(r); - ggml_set_name(r, "r_add"); + ggml_v3_set_name(r, "r_add"); - r = ggml_cpy(lora_ctx, r, dest_t); + r = ggml_v3_cpy(lora_ctx, r, dest_t); offload_func(r); - ggml_set_name(r, "r_cpy"); + ggml_v3_set_name(r, "r_cpy"); } - struct ggml_cgraph * gf = ggml_new_graph(lora_ctx); - ggml_build_forward_expand(gf, r); + struct ggml_v3_cgraph * gf = ggml_v3_new_graph(lora_ctx); + ggml_v3_build_forward_expand(gf, r); 
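// A minimal CPU sketch of the rank-r update the graph above computes
// (w = w + BA*scaling), written with plain loops and an assumed row-major
// layout: W is n_out x n_in, A is r x n_in, B is r x n_out. This shows only
// the arithmetic behind ggml_v3_mul_mat / ggml_v3_scale_inplace / ggml_v3_add,
// not how ggml actually stores or offloads these tensors.
#include <vector>

static void apply_lora_rank_update(std::vector<float> & W,
                                   const std::vector<float> & A,
                                   const std::vector<float> & B,
                                   int n_out, int n_in, int r, float scaling) {
    for (int o = 0; o < n_out; ++o) {
        for (int i = 0; i < n_in; ++i) {
            float acc = 0.0f;
            for (int k = 0; k < r; ++k) {
                acc += B[k*n_out + o] * A[k*n_in + i]; // (BA)[o][i]
            }
            W[o*n_in + i] += scaling * acc;            // w = w + BA*s
        }
    }
}

int main() {
    const int n_out = 4, n_in = 4, r = 2;
    std::vector<float> W(n_out*n_in, 0.0f), A(r*n_in, 0.5f), B(r*n_out, 0.5f);
    apply_lora_rank_update(W, A, B, n_out, n_in, r, /*scaling=*/2.0f);
    return 0;
}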
llv3_graph_compute_helper(work_buffer, gf, n_threads); // we won't need these tensors again, reset the context to save memory - ggml_free(lora_ctx); - lora_ctx = ggml_init(params); + ggml_v3_free(lora_ctx); + lora_ctx = ggml_v3_init(params); lora_tensors.clear(); n_tensors++; @@ -3871,12 +3765,12 @@ int llama_v3_apply_lora_from_file_internal(const struct llama_v3_model & model, } // TODO: this should be in a destructor, it will leak on failure - ggml_free(lora_ctx); + ggml_v3_free(lora_ctx); if (base_ctx) { - ggml_free(base_ctx); + ggml_v3_free(base_ctx); } - const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; + const int64_t t_lora_us = ggml_v3_time_us() - t_start_lora_us; LLAMA_V3_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0); return 0; @@ -4019,32 +3913,32 @@ void llama_v3_copy_state_data_internal(struct llama_v3_context * ctx, llama_v3_d data_ctx->write(&kv_ntok, sizeof(kv_ntok)); if (kv_size) { - const size_t elt_size = ggml_element_size(kv_self.k); + const size_t elt_size = ggml_v3_element_size(kv_self.k); - ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true }); - ggml_cgraph * gf = ggml_new_graph(cpy_ctx); + ggml_v3_context * cpy_ctx = ggml_v3_init({ 4096, NULL, /* no_alloc */ true }); + ggml_v3_cgraph * gf = ggml_v3_new_graph(cpy_ctx); - ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); - std::vector kout3d_data(ggml_nbytes(kout3d), 0); + ggml_v3_tensor * kout3d = ggml_v3_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); + std::vector kout3d_data(ggml_v3_nbytes(kout3d), 0); kout3d->data = kout3d_data.data(); - ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer); - std::vector vout3d_data(ggml_nbytes(vout3d), 0); + ggml_v3_tensor * vout3d = ggml_v3_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer); + std::vector vout3d_data(ggml_v3_nbytes(vout3d), 0); vout3d->data = vout3d_data.data(); - ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k, + ggml_v3_tensor * k3d = ggml_v3_view_3d(cpy_ctx, kv_self.k, n_embd, kv_ntok, n_layer, elt_size*n_embd, elt_size*n_embd*n_ctx, 0); - ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v, + ggml_v3_tensor * v3d = ggml_v3_view_3d(cpy_ctx, kv_self.v, kv_ntok, n_embd, n_layer, elt_size*n_ctx, elt_size*n_ctx*n_embd, 0); - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d)); - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d)); + ggml_v3_build_forward_expand(gf, ggml_v3_cpy(cpy_ctx, k3d, kout3d)); + ggml_v3_build_forward_expand(gf, ggml_v3_cpy(cpy_ctx, v3d, vout3d)); llv3_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1); - ggml_free(cpy_ctx); + ggml_v3_free(cpy_ctx); // our data is now in the kout3d_data and vout3d_data buffers // write them to file @@ -4129,32 +4023,32 @@ size_t llama_v3_set_state_data(struct llama_v3_context * ctx, uint8_t * src) { if (kv_size) { LLAMA_V3_ASSERT(kv_self.buf.size == kv_size); - const size_t elt_size = ggml_element_size(kv_self.k); + const size_t elt_size = ggml_v3_element_size(kv_self.k); - ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true }); - ggml_cgraph * gf = ggml_new_graph(cpy_ctx); + ggml_v3_context * cpy_ctx = ggml_v3_init({ 4096, NULL, /* no_alloc */ true }); + ggml_v3_cgraph * gf = ggml_v3_new_graph(cpy_ctx); - ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); + ggml_v3_tensor * kin3d = ggml_v3_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); 
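// A minimal sketch of what the kout3d/kin3d copies above do for the K cache,
// assuming the layout the view strides imply: a flat buffer of n_layer planes,
// each n_ctx rows of n_embd elements (row stride elt*n_embd, plane stride
// elt*n_embd*n_ctx), of which only the first kv_ntok rows per plane are live.
// float elements are assumed here for simplicity; the real cache is often F16.
#include <cstring>
#include <vector>

static void pack_k_cache(const float * k_src, float * k_dst,
                         int n_embd, int n_ctx, int n_layer, int kv_ntok) {
    for (int il = 0; il < n_layer; ++il) {
        const float * src_plane = k_src + (size_t)il * n_ctx   * n_embd;
        float       * dst_plane = k_dst + (size_t)il * kv_ntok * n_embd;
        // the kv_ntok live rows are contiguous, so one memcpy per layer suffices
        std::memcpy(dst_plane, src_plane, sizeof(float) * (size_t)kv_ntok * n_embd);
    }
}

int main() {
    const int n_embd = 8, n_ctx = 16, n_layer = 2, kv_ntok = 4;
    std::vector<float> k(n_layer * n_ctx * n_embd, 1.0f);
    std::vector<float> out(n_layer * kv_ntok * n_embd);
    pack_k_cache(k.data(), out.data(), n_embd, n_ctx, n_layer, kv_ntok);
    return 0;
}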
kin3d->data = (void *) inp; - inp += ggml_nbytes(kin3d); + inp += ggml_v3_nbytes(kin3d); - ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer); + ggml_v3_tensor * vin3d = ggml_v3_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer); vin3d->data = (void *) inp; - inp += ggml_nbytes(vin3d); + inp += ggml_v3_nbytes(vin3d); - ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k, + ggml_v3_tensor * k3d = ggml_v3_view_3d(cpy_ctx, kv_self.k, n_embd, kv_ntok, n_layer, elt_size*n_embd, elt_size*n_embd*n_ctx, 0); - ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v, + ggml_v3_tensor * v3d = ggml_v3_view_3d(cpy_ctx, kv_self.v, kv_ntok, n_embd, n_layer, elt_size*n_ctx, elt_size*n_ctx*n_embd, 0); - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d)); - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d)); + ggml_v3_build_forward_expand(gf, ggml_v3_cpy(cpy_ctx, kin3d, k3d)); + ggml_v3_build_forward_expand(gf, ggml_v3_cpy(cpy_ctx, vin3d, v3d)); llv3_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1); - ggml_free(cpy_ctx); + ggml_v3_free(cpy_ctx); } ctx->kv_self.n = kv_ntok; @@ -4264,7 +4158,7 @@ int llama_v3_eval( // get a more accurate load time, upon first eval // TODO: fix this if (!ctx->has_evaluated_once) { - ctx->t_load_us = ggml_time_us() - ctx->t_start_us; + ctx->t_load_us = ggml_v3_time_us() - ctx->t_start_us; ctx->has_evaluated_once = true; } @@ -4286,7 +4180,7 @@ int llama_v3_eval_embd( // get a more accurate load time, upon first eval // TODO: fix this if (!ctx->has_evaluated_once) { - ctx->t_load_us = ggml_time_us() - ctx->t_start_us; + ctx->t_load_us = ggml_v3_time_us() - ctx->t_start_us; ctx->has_evaluated_once = true; } @@ -4420,7 +4314,7 @@ llama_v3_token llama_v3_token_nl() { struct llama_v3_timings llama_v3_get_timings(struct llama_v3_context * ctx) { struct llama_v3_timings result = { /*.t_start_ms =*/ 1e-3 * ctx->t_start_us, - /*.t_end_ms =*/ 1.00 * ggml_time_ms(), + /*.t_end_ms =*/ 1.00 * ggml_v3_time_ms(), /*.t_load_ms =*/ 1e-3 * ctx->t_load_us, /*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us, /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us, @@ -4449,7 +4343,7 @@ void llama_v3_print_timings(struct llama_v3_context * ctx) { } void llama_v3_reset_timings(struct llama_v3_context * ctx) { - ctx->t_start_us = ggml_time_us(); + ctx->t_start_us = ggml_v3_time_us(); ctx->t_sample_us = ctx->n_sample = 0; ctx->t_eval_us = ctx->n_eval = 0; ctx->t_p_eval_us = ctx->n_p_eval = 0; @@ -4459,26 +4353,26 @@ const char * llama_v3_print_system_info(void) { static std::string s; s = ""; - s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | "; - s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | "; - s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | "; - s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | "; - s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | "; - s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | "; - s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | "; - s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; - s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; - s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; - s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; - s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; - s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; - s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; + s += "AVX = " + 
std::to_string(ggml_v3_cpu_has_avx()) + " | "; + s += "AVX2 = " + std::to_string(ggml_v3_cpu_has_avx2()) + " | "; + s += "AVX512 = " + std::to_string(ggml_v3_cpu_has_avx512()) + " | "; + s += "AVX512_VBMI = " + std::to_string(ggml_v3_cpu_has_avx512_vbmi()) + " | "; + s += "AVX512_VNNI = " + std::to_string(ggml_v3_cpu_has_avx512_vnni()) + " | "; + s += "FMA = " + std::to_string(ggml_v3_cpu_has_fma()) + " | "; + s += "NEON = " + std::to_string(ggml_v3_cpu_has_neon()) + " | "; + s += "ARM_FMA = " + std::to_string(ggml_v3_cpu_has_arm_fma()) + " | "; + s += "F16C = " + std::to_string(ggml_v3_cpu_has_f16c()) + " | "; + s += "FP16_VA = " + std::to_string(ggml_v3_cpu_has_fp16_va()) + " | "; + s += "WASM_SIMD = " + std::to_string(ggml_v3_cpu_has_wasm_simd()) + " | "; + s += "BLAS = " + std::to_string(ggml_v3_cpu_has_blas()) + " | "; + s += "SSE3 = " + std::to_string(ggml_v3_cpu_has_sse3()) + " | "; + s += "VSX = " + std::to_string(ggml_v3_cpu_has_vsx()) + " | "; return s.c_str(); } // For internal test use -const std::vector>& llama_v3_internal_get_tensor_map(struct llama_v3_context * ctx) { +const std::vector>& llama_v3_internal_get_tensor_map(struct llama_v3_context * ctx) { return ctx->model.tensors_by_name; } diff --git a/otherarch/llama_v3.h b/otherarch/llama_v3.h index 2cc4b4707..2fe17d86f 100644 --- a/otherarch/llama_v3.h +++ b/otherarch/llama_v3.h @@ -1,10 +1,10 @@ #ifndef LLAMA_V3_H #define LLAMA_V3_H -#include "ggml.h" +#include "ggml_v3.h" #ifdef GGML_USE_CUBLAS -#include "ggml-cuda.h" -#define LLAMA_V3_MAX_DEVICES GGML_CUDA_MAX_DEVICES +#include "ggml_v3-cuda.h" +#define LLAMA_V3_MAX_DEVICES GGML_V3_CUDA_MAX_DEVICES #else #define LLAMA_V3_MAX_DEVICES 1 #endif // GGML_USE_CUBLAS @@ -477,9 +477,9 @@ extern "C" { #include #include -struct ggml_tensor; +struct ggml_v3_tensor; -const std::vector>& llama_v3_internal_get_tensor_map(struct llama_v3_context * ctx); +const std::vector>& llama_v3_internal_get_tensor_map(struct llama_v3_context * ctx); #endif diff --git a/otherarch/mpt_v3.cpp b/otherarch/mpt_v3.cpp index cf910ac4c..3372b06ce 100644 --- a/otherarch/mpt_v3.cpp +++ b/otherarch/mpt_v3.cpp @@ -1,4 +1,4 @@ -#include "ggml.h" +#include "ggml_v3.h" #include "otherarch.h" #include "utils.h" @@ -17,10 +17,10 @@ #include "model_adapter.h" #ifdef GGML_USE_CUBLAS -#include "ggml-cuda.h" +#include "ggml_v3-cuda.h" #endif #if defined(GGML_USE_CLBLAST) -#include "ggml-opencl.h" +#include "ggml_v3-opencl.h" #endif // load the model's weights from a file @@ -58,7 +58,7 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo hparams.n_ctx = std::min(hparams.max_seq_len, hparams.n_ctx); - const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + const int32_t qntvr = hparams.ftype / GGML_V3_QNT_VERSION_FACTOR; printf("%s: d_model = %d\n", __func__, hparams.d_model); printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len); @@ -71,7 +71,7 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo printf("%s: ftype = %d\n", __func__, hparams.ftype); printf("%s: qntvr = %d\n", __func__, qntvr); - hparams.ftype %= GGML_QNT_VERSION_FACTOR; + hparams.ftype %= GGML_V3_QNT_VERSION_FACTOR; } // load vocab @@ -107,8 +107,8 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo // for the big tensors, we have the option to store the data in 16-bit // floats or quantized in order to save memory and also to speed up the // computation - ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype)(model.hparams.ftype)); - if 
(wtype == GGML_TYPE_COUNT) { + ggml_v3_type wtype = ggml_v3_ftype_to_ggml_v3_type((ggml_v3_ftype)(model.hparams.ftype)); + if (wtype == GGML_V3_TYPE_COUNT) { fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", __func__, fname.c_str(), model.hparams.ftype); return false; @@ -126,18 +126,18 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo const size_t n_layer = hparams.n_layers; const size_t n_vocab = hparams.n_vocab; - ctx_size += n_embd * n_vocab * ggml_type_sizef(wtype); // wte_weight - ctx_size += n_embd * ggml_type_sizef(GGML_TYPE_F32); // norm_f_weight + ctx_size += n_embd * n_vocab * ggml_v3_type_sizef(wtype); // wte_weight + ctx_size += n_embd * ggml_v3_type_sizef(GGML_V3_TYPE_F32); // norm_f_weight - ctx_size += n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // ln_1_weight - ctx_size += n_layer * (3 * n_embd * n_embd * ggml_type_sizef(wtype)); // attn_Wqkv_weight - ctx_size += n_layer * (n_embd * n_embd * ggml_type_sizef(wtype)); // attn_out_proj_weight - ctx_size += n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // ln_2_weight - ctx_size += n_layer * (4 * n_embd * n_embd * ggml_type_sizef(wtype)); // mlp_mlp_up_weight - ctx_size += n_layer * (n_embd * n_embd * 4 * ggml_type_sizef(wtype)); // mlp_mlp_down_weight + ctx_size += n_layer * (n_embd * ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // ln_1_weight + ctx_size += n_layer * (3 * n_embd * n_embd * ggml_v3_type_sizef(wtype)); // attn_Wqkv_weight + ctx_size += n_layer * (n_embd * n_embd * ggml_v3_type_sizef(wtype)); // attn_out_proj_weight + ctx_size += n_layer * (n_embd * ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // ln_2_weight + ctx_size += n_layer * (4 * n_embd * n_embd * ggml_v3_type_sizef(wtype)); // mlp_mlp_up_weight + ctx_size += n_layer * (n_embd * n_embd * 4 * ggml_v3_type_sizef(wtype)); // mlp_mlp_down_weight - ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_k - ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_v + ctx_size += n_ctx * n_layer * n_embd * ggml_v3_type_sizef(GGML_V3_TYPE_F16); // memory_k + ctx_size += n_ctx * n_layer * n_embd * ggml_v3_type_sizef(GGML_V3_TYPE_F16); // memory_v ctx_size += (6 + 6 * n_layer) * 512; // object overhead @@ -146,14 +146,14 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo // create the ggml context { - struct ggml_init_params params; + struct ggml_v3_init_params params; params.mem_size = ctx_size; params.mem_buffer = NULL; params.no_alloc = false; - model.ctx = ggml_init(params); + model.ctx = ggml_v3_init(params); if (!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); + fprintf(stderr, "%s: ggml_v3_init() failed\n", __func__); return false; } } @@ -168,8 +168,8 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo model.layers.resize(n_layer); - model.wte_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - model.norm_f_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + model.wte_weight = ggml_v3_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + model.norm_f_weight = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, n_embd); // map by name model.tensors["transformer.wte.weight"] = model.wte_weight; @@ -178,12 +178,12 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo for (int i = 0; i < (int) n_layer; ++i) { auto & layer = model.layers[i]; - layer.norm_1_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.c_attn_wqkv_weight = 
ggml_new_tensor_2d(ctx, wtype, n_embd, 3 * n_embd); - layer.c_attn_out_proj_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.norm_2_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.ffn_up_proj = ggml_new_tensor_2d(ctx, wtype, n_embd, 4 * n_embd); - layer.ffn_down_proj = ggml_new_tensor_2d(ctx, wtype, 4 * n_embd, n_embd); + layer.norm_1_weight = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, n_embd); + layer.c_attn_wqkv_weight = ggml_v3_new_tensor_2d(ctx, wtype, n_embd, 3 * n_embd); + layer.c_attn_out_proj_weight = ggml_v3_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.norm_2_weight = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, n_embd); + layer.ffn_up_proj = ggml_v3_new_tensor_2d(ctx, wtype, n_embd, 4 * n_embd); + layer.ffn_down_proj = ggml_v3_new_tensor_2d(ctx, wtype, 4 * n_embd, n_embd); // map by name model.tensors["transformer.blocks." + std::to_string(i) + ".norm_1.weight"] = layer.norm_1_weight; @@ -205,10 +205,10 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo const int64_t n_mem = n_layer * n_ctx; const int64_t n_elements = n_embd * n_mem; - model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + model.memory_k = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F16, n_elements); + model.memory_v = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F16, n_elements); - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + const size_t memory_size = ggml_v3_nbytes(model.memory_k) + ggml_v3_nbytes(model.memory_v); printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size / 1024.0 / 1024.0, n_mem); } @@ -249,7 +249,7 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo } auto tensor = model.tensors[name.data()]; - if (ggml_nelements(tensor) != nelements) { + if (ggml_v3_nelements(tensor) != nelements) { fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); return false; } @@ -265,22 +265,22 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo // for debugging if (0) { printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], - ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor) / 1024.0 / 1024.0, ggml_nbytes(tensor)); + ggml_v3_type_name(ggml_v3_type(ttype)), ggml_v3_nbytes(tensor) / 1024.0 / 1024.0, ggml_v3_nbytes(tensor)); } - const size_t bpe = ggml_type_size(ggml_type(ttype)); + const size_t bpe = ggml_v3_type_size(ggml_v3_type(ttype)); - if ((nelements * bpe) / ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + if ((nelements * bpe) / ggml_v3_blck_size(tensor->type) != ggml_v3_nbytes(tensor)) { fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, " "expected %zu\n", - __func__, name.data(), ggml_nbytes(tensor), nelements * bpe); + __func__, name.data(), ggml_v3_nbytes(tensor), nelements * bpe); return false; } - fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + fin.read(reinterpret_cast(tensor->data), ggml_v3_nbytes(tensor)); - total_size += ggml_nbytes(tensor); + total_size += ggml_v3_nbytes(tensor); if (++n_tensors % 8 == 0) { printf("."); fflush(stdout); @@ -308,20 +308,20 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo #endif for (int i = 0; i < n_gpu; ++i) { const auto & layer = model.layers[i]; - layer.ffn_up_proj->backend = GGML_BACKEND_GPU; - layer.ffn_down_proj->backend = 
GGML_BACKEND_GPU; - layer.c_attn_wqkv_weight->backend = GGML_BACKEND_GPU; - layer.c_attn_out_proj_weight->backend = GGML_BACKEND_GPU; + layer.ffn_up_proj->backend = GGML_V3_BACKEND_GPU; + layer.ffn_down_proj->backend = GGML_V3_BACKEND_GPU; + layer.c_attn_wqkv_weight->backend = GGML_V3_BACKEND_GPU; + layer.c_attn_out_proj_weight->backend = GGML_V3_BACKEND_GPU; #if defined(GGML_USE_CLBLAST) - ggml_cl_transform_tensor(layer.ffn_up_proj->data,layer.ffn_up_proj); vram_total += ggml_nbytes(layer.ffn_up_proj); - ggml_cl_transform_tensor(layer.ffn_down_proj->data,layer.ffn_down_proj); vram_total += ggml_nbytes(layer.ffn_down_proj); - ggml_cl_transform_tensor(layer.c_attn_wqkv_weight->data,layer.c_attn_wqkv_weight); vram_total += ggml_nbytes(layer.c_attn_wqkv_weight); - ggml_cl_transform_tensor(layer.c_attn_out_proj_weight->data,layer.c_attn_out_proj_weight); vram_total += ggml_nbytes(layer.c_attn_out_proj_weight); + ggml_v3_cl_transform_tensor(layer.ffn_up_proj->data,layer.ffn_up_proj); vram_total += ggml_v3_nbytes(layer.ffn_up_proj); + ggml_v3_cl_transform_tensor(layer.ffn_down_proj->data,layer.ffn_down_proj); vram_total += ggml_v3_nbytes(layer.ffn_down_proj); + ggml_v3_cl_transform_tensor(layer.c_attn_wqkv_weight->data,layer.c_attn_wqkv_weight); vram_total += ggml_v3_nbytes(layer.c_attn_wqkv_weight); + ggml_v3_cl_transform_tensor(layer.c_attn_out_proj_weight->data,layer.c_attn_out_proj_weight); vram_total += ggml_v3_nbytes(layer.c_attn_out_proj_weight); #else - ggml_cuda_transform_tensor(layer.ffn_up_proj->data,layer.ffn_up_proj); vram_total += ggml_nbytes(layer.ffn_up_proj); - ggml_cuda_transform_tensor(layer.ffn_down_proj->data,layer.ffn_down_proj); vram_total += ggml_nbytes(layer.ffn_down_proj); - ggml_cuda_transform_tensor(layer.c_attn_wqkv_weight->data,layer.c_attn_wqkv_weight); vram_total += ggml_nbytes(layer.c_attn_wqkv_weight); - ggml_cuda_transform_tensor(layer.c_attn_out_proj_weight->data,layer.c_attn_out_proj_weight); vram_total += ggml_nbytes(layer.c_attn_out_proj_weight); + ggml_v3_cuda_transform_tensor(layer.ffn_up_proj->data,layer.ffn_up_proj); vram_total += ggml_v3_nbytes(layer.ffn_up_proj); + ggml_v3_cuda_transform_tensor(layer.ffn_down_proj->data,layer.ffn_down_proj); vram_total += ggml_v3_nbytes(layer.ffn_down_proj); + ggml_v3_cuda_transform_tensor(layer.c_attn_wqkv_weight->data,layer.c_attn_wqkv_weight); vram_total += ggml_v3_nbytes(layer.c_attn_wqkv_weight); + ggml_v3_cuda_transform_tensor(layer.c_attn_out_proj_weight->data,layer.c_attn_out_proj_weight); vram_total += ggml_v3_nbytes(layer.c_attn_out_proj_weight); #endif } #if defined(GGML_USE_CLBLAST) @@ -384,32 +384,32 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, } } - struct ggml_init_params params; + struct ggml_v3_init_params params; params.mem_size = buf_size; params.mem_buffer = buf; params.no_alloc = false; - struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_MAX_NODES, false); + struct ggml_v3_context * ctx0 = ggml_v3_init(params); + struct ggml_v3_cgraph * gf = ggml_v3_new_graph_custom(ctx0, GGML_V3_MAX_NODES, false); - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(embd->data, embd_inp.data(), N * ggml_element_size(embd)); + struct ggml_v3_tensor * embd = ggml_v3_new_tensor_1d(ctx0, GGML_V3_TYPE_I32, N); + memcpy(embd->data, embd_inp.data(), N * ggml_v3_element_size(embd)); - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte_weight, embd); + struct ggml_v3_tensor * inpL = 
ggml_v3_get_rows(ctx0, model.wte_weight, embd); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * cur; + struct ggml_v3_tensor * cur; if(use_scratch){ - ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + ggml_v3_set_scratch(ctx0, { 0, scr0_size, scr0, }); } // a = self.ln_1(x) { - cur = ggml_norm(ctx0, inpL, default_norm_eps); + cur = ggml_v3_norm(ctx0, inpL, default_norm_eps); - cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].norm_1_weight, cur), cur); + cur = ggml_v3_mul(ctx0, ggml_v3_repeat(ctx0, model.layers[il].norm_1_weight, cur), cur); } // self-attention @@ -418,164 +418,164 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, // is_causal=is_causal) { // compute QKV - cur = ggml_mul_mat(ctx0, model.layers[il].c_attn_wqkv_weight, cur); + cur = ggml_v3_mul_mat(ctx0, model.layers[il].c_attn_wqkv_weight, cur); if (model.hparams.clip_qkv > 0.0f) { - cur = ggml_clamp(ctx0, cur, -model.hparams.clip_qkv, model.hparams.clip_qkv); + cur = ggml_v3_clamp(ctx0, cur, -model.hparams.clip_qkv, model.hparams.clip_qkv); } - struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0 * sizeof(float) * n_embd); - struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1 * sizeof(float) * n_embd); - struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2 * sizeof(float) * n_embd); + struct ggml_v3_tensor * Qcur = ggml_v3_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0 * sizeof(float) * n_embd); + struct ggml_v3_tensor * Kcur = ggml_v3_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1 * sizeof(float) * n_embd); + struct ggml_v3_tensor * Vcur = ggml_v3_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2 * sizeof(float) * n_embd); // store key and value to memory { - struct ggml_tensor * k = - ggml_view_1d(ctx0, model.memory_k, N * n_embd, - (ggml_element_size(model.memory_k) * n_embd) * (il * n_ctx + n_past)); - struct ggml_tensor * v = - ggml_view_1d(ctx0, model.memory_v, N * n_embd, - (ggml_element_size(model.memory_v) * n_embd) * (il * n_ctx + n_past)); + struct ggml_v3_tensor * k = + ggml_v3_view_1d(ctx0, model.memory_k, N * n_embd, + (ggml_v3_element_size(model.memory_k) * n_embd) * (il * n_ctx + n_past)); + struct ggml_v3_tensor * v = + ggml_v3_view_1d(ctx0, model.memory_v, N * n_embd, + (ggml_v3_element_size(model.memory_v) * n_embd) * (il * n_ctx + n_past)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + ggml_v3_build_forward_expand(gf, ggml_v3_cpy(ctx0, Kcur, k)); + ggml_v3_build_forward_expand(gf, ggml_v3_cpy(ctx0, Vcur, v)); } // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, // 2, 1, 3) [64, N, 12] - struct ggml_tensor * Q = ggml_permute( - ctx0, ggml_cpy(ctx0, Qcur, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd / n_head, n_head, N)), 0, 2, + struct ggml_v3_tensor * Q = ggml_v3_permute( + ctx0, ggml_v3_cpy(ctx0, Qcur, ggml_v3_new_tensor_3d(ctx0, GGML_V3_TYPE_F32, n_embd / n_head, n_head, N)), 0, 2, 1, 3); // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, // 3) [64, n_past + N, 12] - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_k, (n_past + N) * n_embd, - il * n_ctx * ggml_element_size(model.memory_k) * n_embd), + struct ggml_v3_tensor * K = + ggml_v3_permute(ctx0, + ggml_v3_reshape_3d(ctx0, + ggml_v3_view_1d(ctx0, model.memory_k, (n_past + N) * n_embd, + il * n_ctx * ggml_v3_element_size(model.memory_k) * n_embd), n_embd / n_head, n_head, 
n_past + N), 0, 2, 1, 3); // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + struct ggml_v3_tensor * KQ = ggml_v3_mul_mat(ctx0, K, Q); // KQ_scaled = KQ / sqrt(n_embd/n_head) - struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, KQ, 1.0f / sqrt(float(n_embd) / n_head)); + struct ggml_v3_tensor * KQ_scaled = + ggml_v3_scale(ctx0, KQ, 1.0f / sqrt(float(n_embd) / n_head)); - struct ggml_tensor * KQ_scaled_alibi = - ggml_alibi(ctx0, KQ_scaled, n_past, n_head, model.hparams.alibi_bias_max); + struct ggml_v3_tensor * KQ_scaled_alibi = + ggml_v3_alibi(ctx0, KQ_scaled, n_past, n_head, model.hparams.alibi_bias_max); // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past); + struct ggml_v3_tensor * KQ_masked = ggml_v3_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past); // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + struct ggml_v3_tensor * KQ_soft_max = ggml_v3_soft_max(ctx0, KQ_masked); // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, // 2, 0, 3).contiguous() [n_past + N, 64, 12] - struct ggml_tensor * V_trans = ggml_cpy( + struct ggml_v3_tensor * V_trans = ggml_v3_cpy( ctx0, - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_v, (n_past + N) * n_embd, - il * n_ctx * ggml_element_size(model.memory_v) * n_embd), + ggml_v3_permute(ctx0, + ggml_v3_reshape_3d(ctx0, + ggml_v3_view_1d(ctx0, model.memory_v, (n_past + N) * n_embd, + il * n_ctx * ggml_v3_element_size(model.memory_v) * n_embd), n_embd / n_head, n_head, n_past + N), 1, 2, 0, 3), - ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd / n_head, n_head)); + ggml_v3_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd / n_head, n_head)); // KQV = transpose(V) * KQ_soft_max - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); + struct ggml_v3_tensor * KQV = ggml_v3_mul_mat(ctx0, V_trans, KQ_soft_max); // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + struct ggml_v3_tensor * KQV_merged = ggml_v3_permute(ctx0, KQV, 0, 2, 1, 3); // cur = KQV_merged.contiguous().view(n_embd, N) - cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + cur = ggml_v3_cpy(ctx0, KQV_merged, ggml_v3_new_tensor_2d(ctx0, GGML_V3_TYPE_F32, n_embd, N)); // projection - { cur = ggml_mul_mat(ctx0, model.layers[il].c_attn_out_proj_weight, cur); } + { cur = ggml_v3_mul_mat(ctx0, model.layers[il].c_attn_out_proj_weight, cur); } } - inpL = ggml_add(ctx0, inpL, cur); + inpL = ggml_v3_add(ctx0, inpL, cur); if(use_scratch){ - ggml_set_scratch(ctx0, { 0, scr1_size, scr1, }); + ggml_v3_set_scratch(ctx0, { 0, scr1_size, scr1, }); } // m = self.ln_2(x) { - cur = ggml_norm(ctx0, inpL, default_norm_eps); + cur = ggml_v3_norm(ctx0, inpL, default_norm_eps); - cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].norm_2_weight, cur), cur); + cur = ggml_v3_mul(ctx0, ggml_v3_repeat(ctx0, model.layers[il].norm_2_weight, cur), cur); } // n = self.mlp(m) { - cur = ggml_mul_mat(ctx0, model.layers[il].ffn_up_proj, cur); + cur = ggml_v3_mul_mat(ctx0, model.layers[il].ffn_up_proj, cur); // GELU activation - cur = ggml_gelu(ctx0, cur); + cur = ggml_v3_gelu(ctx0, cur); // projection // cur = proj_w*cur + proj_b - cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down_proj, cur); + cur = ggml_v3_mul_mat(ctx0, model.layers[il].ffn_down_proj, cur); } // x = x + n - inpL = ggml_add(ctx0, inpL, cur); + inpL = 
ggml_v3_add(ctx0, inpL, cur); } if(use_scratch){ - ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + ggml_v3_set_scratch(ctx0, { 0, scr0_size, scr0, }); } // norm { - inpL = ggml_norm(ctx0, inpL, default_norm_eps); + inpL = ggml_v3_norm(ctx0, inpL, default_norm_eps); // inpL = ln_f_g*inpL - inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model.norm_f_weight, inpL), inpL); + inpL = ggml_v3_mul(ctx0, ggml_v3_repeat(ctx0, model.norm_f_weight, inpL), inpL); } if(use_scratch){ - ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + ggml_v3_set_scratch(ctx0, { 0, 0, nullptr, }); } // output embedding weight tied to input embedding - inpL = ggml_mul_mat(ctx0, model.wte_weight, inpL); + inpL = ggml_v3_mul_mat(ctx0, model.wte_weight, inpL); // logits -> probs - // inpL = ggml_soft_max(ctx0, inpL); + // inpL = ggml_v3_soft_max(ctx0, inpL); // run the computation - ggml_build_forward_expand(gf, inpL); + ggml_v3_build_forward_expand(gf, inpL); kcpp_graph_compute_helper(gf, n_threads); // std::cout << "Qcur" << std::endl; // print_tensor(Qcur); // if (n_past%100 == 0) { - // ggml_graph_print(&gf); - // ggml_graph_dump_dot(&gf, NULL, "mpt-model.dot"); + // ggml_v3_graph_print(&gf); + // ggml_v3_graph_dump_dot(&gf, NULL, "mpt-model.dot"); // } if (logits_all) { // return result for all tokens embd_w.resize(n_vocab *N); - memcpy(embd_w.data(), (float *)ggml_get_data(inpL) , sizeof(float) * n_vocab * N); + memcpy(embd_w.data(), (float *)ggml_v3_get_data(inpL) , sizeof(float) * n_vocab * N); } else { // return result for just the last token embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *)ggml_get_data(inpL) + (n_vocab * (N - 1)), sizeof(float) * n_vocab); + memcpy(embd_w.data(), (float *)ggml_v3_get_data(inpL) + (n_vocab * (N - 1)), sizeof(float) * n_vocab); } if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0) / N; + mem_per_token = ggml_v3_used_mem(ctx0) / N; } - // printf("used_mem = %zu\n", ggml_used_mem(ctx0)); + // printf("used_mem = %zu\n", ggml_v3_used_mem(ctx0)); - ggml_free(ctx0); + ggml_v3_free(ctx0); return true; } diff --git a/otherarch/neox_v3.cpp b/otherarch/neox_v3.cpp index ba77e0ef6..e15c0c930 100644 --- a/otherarch/neox_v3.cpp +++ b/otherarch/neox_v3.cpp @@ -1,4 +1,4 @@ -#include "ggml.h" +#include "ggml_v3.h" #include "otherarch.h" #include "utils.h" @@ -15,10 +15,10 @@ #include #ifdef GGML_USE_CUBLAS -#include "ggml-cuda.h" +#include "ggml_v3-cuda.h" #endif #if defined(GGML_USE_CLBLAST) -#include "ggml-opencl.h" +#include "ggml_v3-opencl.h" #endif // load the model's weights from a file @@ -56,7 +56,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & fin.read((char *) &hparams.par_res, sizeof(hparams.par_res)); fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + const int32_t qntvr = hparams.ftype / GGML_V3_QNT_VERSION_FACTOR; printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); printf("%s: n_ctx = %d (%d)\n", __func__, hparams.n_ctx,origmaxctx); @@ -70,7 +70,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & hparams.n_ctx = std::max(origmaxctx,hparams.n_ctx); - hparams.ftype %= GGML_QNT_VERSION_FACTOR; + hparams.ftype %= GGML_V3_QNT_VERSION_FACTOR; } // load vocab @@ -96,8 +96,8 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & // for the big tensors, we have the option to store the data in 16-bit floats or quantized // in order to save memory and also to speed up the computation - ggml_type wtype = 
ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); - if (wtype == GGML_TYPE_COUNT) { + ggml_v3_type wtype = ggml_v3_ftype_to_ggml_v3_type((ggml_v3_ftype) (model.hparams.ftype)); + if (wtype == GGML_V3_TYPE_COUNT) { fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", __func__, fname.c_str(), model.hparams.ftype); return ModelLoadResult::FAIL; @@ -115,34 +115,34 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & const size_t n_ctx = hparams.n_ctx; const size_t n_vocab = hparams.n_vocab; - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + ctx_size += n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32); // ln_f_g + ctx_size += n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32); // ln_f_b - ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte + ctx_size += n_embd*n_vocab*ggml_v3_type_sizef(wtype); // wte - ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // lmh_g - //ctx_size += n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b + ctx_size += n_embd*n_vocab*ggml_v3_type_sizef(wtype); // lmh_g + //ctx_size += n_vocab*ggml_v3_type_sizef(GGML_V3_TYPE_F32); // lmh_b - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + ctx_size += n_layer*(n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // ln_1_g + ctx_size += n_layer*(n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // ln_1_b - ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w - ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + ctx_size += n_layer*(3*n_embd*n_embd*ggml_v3_type_sizef(wtype)); // c_attn_attn_w + ctx_size += n_layer*( 3*n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // c_attn_attn_b - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + ctx_size += n_layer*(n_embd*n_embd*ggml_v3_type_sizef(wtype)); // c_attn_proj_w + ctx_size += n_layer*(n_embd*n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // c_attn_proj_b - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + ctx_size += n_layer*(n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // ln_2_g + ctx_size += n_layer*(n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // ln_2_b - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w - ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + ctx_size += n_layer*(4*n_embd*n_embd*ggml_v3_type_sizef(wtype)); // c_mlp_fc_w + ctx_size += n_layer*( 4*n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // c_mlp_fc_b - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + ctx_size += n_layer*(4*n_embd*n_embd*ggml_v3_type_sizef(wtype)); // c_mlp_proj_w + ctx_size += n_layer*( n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // c_mlp_proj_b - ctx_size += std::max((size_t)origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k - ctx_size += std::max((size_t)origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v + ctx_size += std::max((size_t)origmaxctx,n_ctx)*n_layer*n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F16); // memory_k + ctx_size += 
std::max((size_t)origmaxctx,n_ctx)*n_layer*n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F16); // memory_v ctx_size += (6 + 16*n_layer)*1024; // object overhead @@ -151,14 +151,14 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & // create the ggml context { - struct ggml_init_params params; + struct ggml_v3_init_params params; params.mem_size = ctx_size; params.mem_buffer = NULL; params.no_alloc = false; - model.ctx = ggml_init(params); + model.ctx = ggml_v3_init(params); if (!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); + fprintf(stderr, "%s: ggml_v3_init() failed\n", __func__); return ModelLoadResult::FAIL; } } @@ -173,13 +173,13 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model.layers.resize(n_layer); - model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + model.wte = ggml_v3_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + model.ln_f_g = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, n_embd); + model.ln_f_b = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, n_embd); - model.lmh_g = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - //model.lmh_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab); + model.lmh_g = ggml_v3_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + //model.lmh_b = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, n_vocab); // map by name model.tensors["gpt_neox.embed_in.weight"] = model.wte; @@ -193,23 +193,23 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & for (int i = 0; i < n_layer; ++i) { auto & layer = model.layers[i]; - layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_1_g = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, n_embd); + layer.ln_1_b = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, n_embd); - layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); - layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); + layer.c_attn_attn_w = ggml_v3_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); + layer.c_attn_attn_b = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, 3*n_embd); - layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.c_attn_proj_w = ggml_v3_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.c_attn_proj_b = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, n_embd); - layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_2_g = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, n_embd); + layer.ln_2_b = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, n_embd); - layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); - layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + layer.c_mlp_fc_w = ggml_v3_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); + layer.c_mlp_fc_b = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, 4*n_embd); - layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); - layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.c_mlp_proj_w = ggml_v3_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + layer.c_mlp_proj_b = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, n_embd); // map by name model.tensors["gpt_neox.layers." 
+ std::to_string(i) + ".input_layernorm.weight"] = layer.ln_1_g; @@ -243,10 +243,10 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & const int64_t n_mem = n_layer*std::max(origmaxctx,n_ctx); const int64_t n_elements = n_embd*n_mem; - model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + model.memory_k = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F16, n_elements); + model.memory_v = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F16, n_elements); - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + const size_t memory_size = ggml_v3_nbytes(model.memory_k) + ggml_v3_nbytes(model.memory_v); printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem); } @@ -287,7 +287,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & } auto tensor = model.tensors[name.data()]; - if (ggml_nelements(tensor) != nelements) { + if (ggml_v3_nelements(tensor) != nelements) { fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); return ModelLoadResult::FAIL; } @@ -300,21 +300,21 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & // for debugging if (0) { - printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_v3_type_name(ggml_v3_type(ttype)), ggml_v3_nbytes(tensor)/1024.0/1024.0, ggml_v3_nbytes(tensor)); } - const size_t bpe = ggml_type_size(ggml_type(ttype)); + const size_t bpe = ggml_v3_type_size(ggml_v3_type(ttype)); - if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + if ((nelements*bpe)/ggml_v3_blck_size(tensor->type) != ggml_v3_nbytes(tensor)) { fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", - __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); - ggml_free(ctx); + __func__, name.data(), ggml_v3_nbytes(tensor), nelements*bpe); + ggml_v3_free(ctx); return ModelLoadResult::RETRY_LOAD; } - fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + fin.read(reinterpret_cast(tensor->data), ggml_v3_nbytes(tensor)); - total_size += ggml_nbytes(tensor); + total_size += ggml_v3_nbytes(tensor); if (++n_tensors % 8 == 0) { printf("."); fflush(stdout); @@ -342,20 +342,20 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & #endif for (int i = 0; i < n_gpu; ++i) { const auto & layer = model.layers[i]; - layer.c_attn_attn_w->backend = GGML_BACKEND_GPU; - layer.c_attn_proj_w->backend = GGML_BACKEND_GPU; - layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU; - layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU; + layer.c_attn_attn_w->backend = GGML_V3_BACKEND_GPU; + layer.c_attn_proj_w->backend = GGML_V3_BACKEND_GPU; + layer.c_mlp_fc_w->backend = GGML_V3_BACKEND_GPU; + layer.c_mlp_proj_w->backend = GGML_V3_BACKEND_GPU; #if defined(GGML_USE_CLBLAST) - ggml_cl_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w); - ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w); - ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w); - 
ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w); + ggml_v3_cl_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_v3_nbytes(layer.c_attn_attn_w); + ggml_v3_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_v3_nbytes(layer.c_attn_proj_w); + ggml_v3_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_v3_nbytes(layer.c_mlp_fc_w); + ggml_v3_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_v3_nbytes(layer.c_mlp_proj_w); #else - ggml_cuda_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w); - ggml_cuda_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w); - ggml_cuda_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w); - ggml_cuda_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w); + ggml_v3_cuda_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_v3_nbytes(layer.c_attn_attn_w); + ggml_v3_cuda_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_v3_nbytes(layer.c_attn_proj_w); + ggml_v3_cuda_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_v3_nbytes(layer.c_mlp_fc_w); + ggml_v3_cuda_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_v3_nbytes(layer.c_mlp_proj_w); #endif } #if defined(GGML_USE_CLBLAST) @@ -371,37 +371,37 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & // feed-forward network -ggml_tensor * gpt_neox_ff( +ggml_v3_tensor * gpt_neox_ff( const gpt_neox_layer &layer, - ggml_context * ctx0, - ggml_tensor * inp) { - ggml_tensor * cur = ggml_norm(ctx0, inp, default_norm_eps); + ggml_v3_context * ctx0, + ggml_v3_tensor * inp) { + ggml_v3_tensor * cur = ggml_v3_norm(ctx0, inp, default_norm_eps); - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, layer.ln_2_g, cur), + cur = ggml_v3_add(ctx0, + ggml_v3_mul(ctx0, + ggml_v3_repeat(ctx0, layer.ln_2_g, cur), cur), - ggml_repeat(ctx0, layer.ln_2_b, cur)); + ggml_v3_repeat(ctx0, layer.ln_2_b, cur)); - cur = ggml_mul_mat(ctx0, + cur = ggml_v3_mul_mat(ctx0, layer.c_mlp_fc_w, cur); - cur = ggml_add(ctx0, - ggml_repeat(ctx0, layer.c_mlp_fc_b, cur), + cur = ggml_v3_add(ctx0, + ggml_v3_repeat(ctx0, layer.c_mlp_fc_b, cur), cur); // GELU activation - cur = ggml_gelu(ctx0, cur); + cur = ggml_v3_gelu(ctx0, cur); // projection // cur = proj_w*cur + proj_b - cur = ggml_mul_mat(ctx0, + cur = ggml_v3_mul_mat(ctx0, layer.c_mlp_proj_w, cur); - cur = ggml_add(ctx0, - ggml_repeat(ctx0, layer.c_mlp_proj_b, cur), + cur = ggml_v3_add(ctx0, + ggml_v3_repeat(ctx0, layer.c_mlp_proj_b, cur), cur); return cur; } @@ -464,56 +464,56 @@ bool gpt_neox_eval( } } - struct ggml_init_params params; + struct ggml_v3_init_params params; params.mem_size = buf_size; params.mem_buffer = buf; params.no_alloc = false; - struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_MAX_NODES, false); + struct ggml_v3_context * ctx0 = ggml_v3_init(params); + struct ggml_v3_cgraph * gf = ggml_v3_new_graph_custom(ctx0, GGML_V3_MAX_NODES, false); - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(embd->data, embd_inp.data(), 
N*ggml_element_size(embd)); + struct ggml_v3_tensor * embd = ggml_v3_new_tensor_1d(ctx0, GGML_V3_TYPE_I32, N); + memcpy(embd->data, embd_inp.data(), N*ggml_v3_element_size(embd)); // wte - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd); + struct ggml_v3_tensor * inpL = ggml_v3_get_rows(ctx0, model.wte, embd); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * cur; + struct ggml_v3_tensor * cur; if(use_scratch){ - ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + ggml_v3_set_scratch(ctx0, { 0, scr0_size, scr0, }); } // self-attention { { - cur = ggml_norm(ctx0, inpL, default_norm_eps); + cur = ggml_v3_norm(ctx0, inpL, default_norm_eps); - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].ln_1_g, cur), + cur = ggml_v3_add(ctx0, + ggml_v3_mul(ctx0, + ggml_v3_repeat(ctx0, model.layers[il].ln_1_g, cur), cur), - ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); + ggml_v3_repeat(ctx0, model.layers[il].ln_1_b, cur)); } // compute QKV { - cur = ggml_mul_mat(ctx0, + cur = ggml_v3_mul_mat(ctx0, model.layers[il].c_attn_attn_w, cur); - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur), + cur = ggml_v3_add(ctx0, + ggml_v3_repeat(ctx0, model.layers[il].c_attn_attn_b, cur), cur); } - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 0*sizeof(float)*n_embd/n_head)); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 1*sizeof(float)*n_embd/n_head)); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head)); + struct ggml_v3_tensor * Qcur = ggml_v3_cont(ctx0, ggml_v3_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 0*sizeof(float)*n_embd/n_head)); + struct ggml_v3_tensor * Kcur = ggml_v3_cont(ctx0, ggml_v3_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 1*sizeof(float)*n_embd/n_head)); + struct ggml_v3_tensor * Vcur = ggml_v3_cont(ctx0, ggml_v3_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head)); - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + struct ggml_v3_tensor * KQ_pos = ggml_v3_new_tensor_1d(ctx0, GGML_V3_TYPE_I32, N); { int * data = (int *) KQ_pos->data; for (int i = 0; i < N; ++i) { @@ -522,161 +522,161 @@ bool gpt_neox_eval( } // using mode = 2 for GPT-NeoX mode - Qcur = ggml_rope_custom_inplace(ctx0, Qcur, KQ_pos, n_rot, 2, n_ctx, 0, freq_base, freq_scale, 0, 1, 32, 1); - Kcur = ggml_rope_custom_inplace(ctx0, Kcur, KQ_pos, n_rot, 2, n_ctx, 0, freq_base, freq_scale, 0, 1, 32, 1); + Qcur = ggml_v3_rope_custom_inplace(ctx0, Qcur, KQ_pos, n_rot, 2, n_ctx, 0, freq_base, freq_scale, 0, 1, 32, 1); + Kcur = ggml_v3_rope_custom_inplace(ctx0, Kcur, KQ_pos, n_rot, 2, n_ctx, 0, freq_base, freq_scale, 0, 1, 32, 1); // store key and value to memory { - Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); + Vcur = ggml_v3_transpose(ctx0, ggml_v3_reshape_2d(ctx0, Vcur, n_embd, N)); - struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, - ( n_ctx)*ggml_element_size(model.memory_v), - (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v)); + 
struct ggml_v3_tensor * k = ggml_v3_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_v3_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_v3_tensor * v = ggml_v3_view_2d(ctx0, model.memory_v, N, n_embd, + ( n_ctx)*ggml_v3_element_size(model.memory_v), + (il*n_ctx)*ggml_v3_element_size(model.memory_v)*n_embd + n_past*ggml_v3_element_size(model.memory_v)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + ggml_v3_build_forward_expand(gf, ggml_v3_cpy(ctx0, Kcur, k)); + ggml_v3_build_forward_expand(gf, ggml_v3_cpy(ctx0, Vcur, v)); } // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - struct ggml_tensor * Q = - ggml_permute(ctx0, + struct ggml_v3_tensor * Q = + ggml_v3_permute(ctx0, Qcur, 0, 2, 1, 3); // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + struct ggml_v3_tensor * K = + ggml_v3_permute(ctx0, + ggml_v3_reshape_3d(ctx0, + ggml_v3_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_v3_element_size(model.memory_k)*n_embd), n_embd/n_head, n_head, n_past + N), 0, 2, 1, 3); // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + struct ggml_v3_tensor * KQ = ggml_v3_mul_mat(ctx0, K, Q); // KQ_scaled = KQ / sqrt(n_embd/n_head) - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, + struct ggml_v3_tensor * KQ_scaled = + ggml_v3_scale_inplace(ctx0, KQ, 1.0f/sqrt(float(n_embd)/n_head) ); // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + struct ggml_v3_tensor * KQ_masked = ggml_v3_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + struct ggml_v3_tensor * KQ_soft_max = ggml_v3_soft_max_inplace(ctx0, KQ_masked); // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - struct ggml_tensor * V = - ggml_view_3d(ctx0, model.memory_v, + struct ggml_v3_tensor * V = + ggml_v3_view_3d(ctx0, model.memory_v, n_past + N, n_embd/n_head, n_head, - n_ctx*ggml_element_size(model.memory_v), - n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head, - il*n_ctx*ggml_element_size(model.memory_v)*n_embd); + n_ctx*ggml_v3_element_size(model.memory_v), + n_ctx*ggml_v3_element_size(model.memory_v)*n_embd/n_head, + il*n_ctx*ggml_v3_element_size(model.memory_v)*n_embd); // KQV = transpose(V) * KQ_soft_max - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + struct ggml_v3_tensor * KQV = ggml_v3_mul_mat(ctx0, V, KQ_soft_max); // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + struct ggml_v3_tensor * KQV_merged = ggml_v3_permute(ctx0, KQV, 0, 2, 1, 3); // cur = KQV_merged.contiguous().view(n_embd, N) - cur = ggml_cpy(ctx0, + cur = ggml_v3_cpy(ctx0, KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + ggml_v3_new_tensor_2d(ctx0, GGML_V3_TYPE_F32, n_embd, N)); // projection { - cur = ggml_mul_mat(ctx0, + cur = ggml_v3_mul_mat(ctx0, model.layers[il].c_attn_proj_w, cur); - cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), cur); + cur = ggml_v3_add(ctx0, ggml_v3_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), cur); } } if(use_scratch){ - 
ggml_set_scratch(ctx0, { 0, scr1_size, scr1, }); + ggml_v3_set_scratch(ctx0, { 0, scr1_size, scr1, }); } if (hparams.par_res == 0) { - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL); + struct ggml_v3_tensor * inpFF = ggml_v3_add(ctx0, cur, inpL); cur = gpt_neox_ff(model.layers[il], ctx0, inpFF); // input for next layer - inpL = ggml_add(ctx0, cur, inpFF); + inpL = ggml_v3_add(ctx0, cur, inpFF); } else { - struct ggml_tensor * inpFF = cur; + struct ggml_v3_tensor * inpFF = cur; // this is independent of the self-attention result, so it could be done in parallel to the self-attention // note here we pass inpL instead of cur cur = gpt_neox_ff(model.layers[il], ctx0, inpL); // layer input + FF - cur = ggml_add(ctx0, cur, inpFF); + cur = ggml_v3_add(ctx0, cur, inpFF); // input for next layer - inpL = ggml_add(ctx0, cur, inpL); + inpL = ggml_v3_add(ctx0, cur, inpL); } } if(use_scratch){ - ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + ggml_v3_set_scratch(ctx0, { 0, scr0_size, scr0, }); } // norm { - inpL = ggml_norm(ctx0, inpL, default_norm_eps); + inpL = ggml_v3_norm(ctx0, inpL, default_norm_eps); // inpL = ln_f_g*inpL + ln_f_b - inpL = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.ln_f_g, inpL), + inpL = ggml_v3_add(ctx0, + ggml_v3_mul(ctx0, + ggml_v3_repeat(ctx0, model.ln_f_g, inpL), inpL), - ggml_repeat(ctx0, model.ln_f_b, inpL)); + ggml_v3_repeat(ctx0, model.ln_f_b, inpL)); } if(use_scratch){ - ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + ggml_v3_set_scratch(ctx0, { 0, 0, nullptr, }); } // lm_head { - inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL); + inpL = ggml_v3_mul_mat(ctx0, model.lmh_g, inpL); - //inpL = ggml_add(ctx0, - // ggml_repeat(ctx0, model.lmh_b, inpL), + //inpL = ggml_v3_add(ctx0, + // ggml_v3_repeat(ctx0, model.lmh_b, inpL), // inpL); } // logits -> probs - //inpL = ggml_soft_max_inplace(ctx0, inpL); + //inpL = ggml_v3_soft_max_inplace(ctx0, inpL); // run the computation - ggml_build_forward_expand(gf, inpL); + ggml_v3_build_forward_expand(gf, inpL); kcpp_graph_compute_helper(gf, n_threads); //if (n_past%100 == 0) { - // ggml_graph_print (&gf); - // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); + // ggml_v3_graph_print (&gf); + // ggml_v3_graph_dump_dot(&gf, NULL, "gpt-2.dot"); //} //embd_w.resize(n_vocab*N); - //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + //memcpy(embd_w.data(), ggml_v3_get_data(inpL), sizeof(float)*n_vocab*N); // return result for just the last token embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + memcpy(embd_w.data(), (float *) ggml_v3_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0)/N; + mem_per_token = ggml_v3_used_mem(ctx0)/N; } - //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); + //printf("used_mem = %zu\n", ggml_v3_used_mem(ctx0)); - ggml_free(ctx0); + ggml_v3_free(ctx0); return true; } diff --git a/otherarch/otherarch.h b/otherarch/otherarch.h index 47ea0d7b3..5c7deb86d 100644 --- a/otherarch/otherarch.h +++ b/otherarch/otherarch.h @@ -31,22 +31,22 @@ struct gptj_hparams { struct gptj_layer { // normalization - struct ggml_tensor * ln_1_g; - struct ggml_tensor * ln_1_b; + struct ggml_v3_tensor * ln_1_g; + struct ggml_v3_tensor * ln_1_b; // attention - struct ggml_tensor * c_attn_q_proj_w; - struct ggml_tensor * c_attn_k_proj_w; - struct ggml_tensor * c_attn_v_proj_w; + struct ggml_v3_tensor * c_attn_q_proj_w; + struct ggml_v3_tensor * c_attn_k_proj_w; + 
struct ggml_v3_tensor * c_attn_v_proj_w; - struct ggml_tensor * c_attn_proj_w; + struct ggml_v3_tensor * c_attn_proj_w; // ff - struct ggml_tensor * c_mlp_fc_w; - struct ggml_tensor * c_mlp_fc_b; + struct ggml_v3_tensor * c_mlp_fc_w; + struct ggml_v3_tensor * c_mlp_fc_b; - struct ggml_tensor * c_mlp_proj_w; - struct ggml_tensor * c_mlp_proj_b; + struct ggml_v3_tensor * c_mlp_proj_w; + struct ggml_v3_tensor * c_mlp_proj_b; }; struct gptj_layer_v2 { // normalization @@ -139,23 +139,23 @@ struct gptj_model { gptj_hparams hparams; // normalization - struct ggml_tensor * ln_f_g; - struct ggml_tensor * ln_f_b; + struct ggml_v3_tensor * ln_f_g; + struct ggml_v3_tensor * ln_f_b; - struct ggml_tensor * wte; // position embedding + struct ggml_v3_tensor * wte; // position embedding - struct ggml_tensor * lmh_g; // language model head - struct ggml_tensor * lmh_b; // language model bias + struct ggml_v3_tensor * lmh_g; // language model head + struct ggml_v3_tensor * lmh_b; // language model bias std::vector layers; // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; + struct ggml_v3_tensor * memory_k; + struct ggml_v3_tensor * memory_v; // - struct ggml_context * ctx; - std::map tensors; + struct ggml_v3_context * ctx; + std::map tensors; }; // default hparams (GPT-2 117M) @@ -259,47 +259,47 @@ struct gpt2_v2_model { struct gpt2_layer { // normalization - struct ggml_tensor * ln_1_g; - struct ggml_tensor * ln_1_b; + struct ggml_v3_tensor * ln_1_g; + struct ggml_v3_tensor * ln_1_b; - struct ggml_tensor * ln_2_g; - struct ggml_tensor * ln_2_b; + struct ggml_v3_tensor * ln_2_g; + struct ggml_v3_tensor * ln_2_b; // attention - struct ggml_tensor * c_attn_attn_w; - struct ggml_tensor * c_attn_attn_b; + struct ggml_v3_tensor * c_attn_attn_w; + struct ggml_v3_tensor * c_attn_attn_b; - struct ggml_tensor * c_attn_proj_w; - struct ggml_tensor * c_attn_proj_b; + struct ggml_v3_tensor * c_attn_proj_w; + struct ggml_v3_tensor * c_attn_proj_b; // mlp - struct ggml_tensor * c_mlp_fc_w; - struct ggml_tensor * c_mlp_fc_b; + struct ggml_v3_tensor * c_mlp_fc_w; + struct ggml_v3_tensor * c_mlp_fc_b; - struct ggml_tensor * c_mlp_proj_w; - struct ggml_tensor * c_mlp_proj_b; + struct ggml_v3_tensor * c_mlp_proj_w; + struct ggml_v3_tensor * c_mlp_proj_b; }; struct gpt2_model { gpt2_hparams hparams; // normalization - struct ggml_tensor * ln_f_g; - struct ggml_tensor * ln_f_b; + struct ggml_v3_tensor * ln_f_g; + struct ggml_v3_tensor * ln_f_b; - struct ggml_tensor * wte; // position embedding - struct ggml_tensor * wpe; // token embedding - struct ggml_tensor * lm_head; // language model head + struct ggml_v3_tensor * wte; // position embedding + struct ggml_v3_tensor * wpe; // token embedding + struct ggml_v3_tensor * lm_head; // language model head std::vector layers; // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; + struct ggml_v3_tensor * memory_k; + struct ggml_v3_tensor * memory_v; // - struct ggml_context * ctx; - std::map tensors; + struct ggml_v3_context * ctx; + std::map tensors; }; // default hparams (StableLM 3B) @@ -351,7 +351,7 @@ struct gpt_neox_v2_model { struct ggml_v2_tensor * wte; // position embedding struct ggml_v2_tensor * lmh_g; // language model head - //struct ggml_tensor * lmh_b; // language model bias + //struct ggml_v3_tensor * lmh_b; // language model bias std::vector layers; @@ -366,49 +366,49 @@ struct gpt_neox_v2_model { struct gpt_neox_layer { // pre normalization - struct ggml_tensor * ln_1_g; - struct ggml_tensor 
* ln_1_b; + struct ggml_v3_tensor * ln_1_g; + struct ggml_v3_tensor * ln_1_b; // attention - struct ggml_tensor * c_attn_attn_w; - struct ggml_tensor * c_attn_attn_b; + struct ggml_v3_tensor * c_attn_attn_w; + struct ggml_v3_tensor * c_attn_attn_b; - struct ggml_tensor * c_attn_proj_w; - struct ggml_tensor * c_attn_proj_b; + struct ggml_v3_tensor * c_attn_proj_w; + struct ggml_v3_tensor * c_attn_proj_b; // post normalization - struct ggml_tensor * ln_2_g; - struct ggml_tensor * ln_2_b; + struct ggml_v3_tensor * ln_2_g; + struct ggml_v3_tensor * ln_2_b; // ff - struct ggml_tensor * c_mlp_fc_w; - struct ggml_tensor * c_mlp_fc_b; + struct ggml_v3_tensor * c_mlp_fc_w; + struct ggml_v3_tensor * c_mlp_fc_b; - struct ggml_tensor * c_mlp_proj_w; - struct ggml_tensor * c_mlp_proj_b; + struct ggml_v3_tensor * c_mlp_proj_w; + struct ggml_v3_tensor * c_mlp_proj_b; }; struct gpt_neox_model { gpt_neox_hparams hparams; // normalization - struct ggml_tensor * ln_f_g; - struct ggml_tensor * ln_f_b; + struct ggml_v3_tensor * ln_f_g; + struct ggml_v3_tensor * ln_f_b; - struct ggml_tensor * wte; // position embedding + struct ggml_v3_tensor * wte; // position embedding - struct ggml_tensor * lmh_g; // language model head - //struct ggml_tensor * lmh_b; // language model bias + struct ggml_v3_tensor * lmh_g; // language model head + //struct ggml_v3_tensor * lmh_b; // language model bias std::vector layers; // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; + struct ggml_v3_tensor * memory_k; + struct ggml_v3_tensor * memory_v; // - struct ggml_context * ctx; - std::map tensors; + struct ggml_v3_context * ctx; + std::map tensors; }; @@ -428,35 +428,35 @@ struct mpt_hparams { struct mpt_layer { // pre normalization - struct ggml_tensor * norm_1_weight; + struct ggml_v3_tensor * norm_1_weight; // attention - struct ggml_tensor * c_attn_wqkv_weight; - struct ggml_tensor * c_attn_out_proj_weight; + struct ggml_v3_tensor * c_attn_wqkv_weight; + struct ggml_v3_tensor * c_attn_out_proj_weight; // post normalization - struct ggml_tensor * norm_2_weight; + struct ggml_v3_tensor * norm_2_weight; // ff - struct ggml_tensor * ffn_up_proj; - struct ggml_tensor * ffn_down_proj; + struct ggml_v3_tensor * ffn_up_proj; + struct ggml_v3_tensor * ffn_down_proj; }; struct mpt_model { mpt_hparams hparams; - struct ggml_tensor * wte_weight; // position embedding - struct ggml_tensor * norm_f_weight; // language model head + struct ggml_v3_tensor * wte_weight; // position embedding + struct ggml_v3_tensor * norm_f_weight; // language model head std::vector layers; // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; + struct ggml_v3_tensor * memory_k; + struct ggml_v3_tensor * memory_v; - struct ggml_context * ctx; - std::map tensors; + struct ggml_v3_context * ctx; + std::map tensors; }; const float default_norm_eps = 1e-5f; -const size_t GGML_MAX_NODES = 8192; \ No newline at end of file +const size_t GGML_V3_MAX_NODES = 8192; \ No newline at end of file diff --git a/otherarch/rwkv_v3.cpp b/otherarch/rwkv_v3.cpp index a398b3f88..5692f743c 100644 --- a/otherarch/rwkv_v3.cpp +++ b/otherarch/rwkv_v3.cpp @@ -4,13 +4,13 @@ #include "otherarch.h" #include "rwkv_v3.h" -#include "ggml.h" +#include "ggml_v3.h" #ifdef GGML_USE_CUBLAS -#include "ggml-cuda.h" +#include "ggml_v3-cuda.h" #endif #if defined(GGML_USE_CLBLAST) -#include "ggml-opencl.h" +#include "ggml_v3-opencl.h" #endif #include "utils.h" @@ -194,23 +194,23 @@ enum rwkv_type { TYPE_COUNT }; -#define 
GGML_TYPE_UNKNOWN GGML_TYPE_COUNT +#define GGML_V3_TYPE_UNKNOWN GGML_V3_TYPE_COUNT -extern const enum ggml_type rwkv_type_to_ggml[TYPE_COUNT + 1] = { - GGML_TYPE_F32, /* FP32 */ - GGML_TYPE_F16, /* FP16 */ - GGML_TYPE_Q4_0, /* Q4_0 */ - GGML_TYPE_Q4_1, /* Q4_1 */ - GGML_TYPE_UNKNOWN, /* Q4_1_O */ - GGML_TYPE_UNKNOWN, /* Q4_2 */ - GGML_TYPE_UNKNOWN, /* Q4_3 */ - GGML_TYPE_Q5_0, /* Q5_0 */ - GGML_TYPE_Q5_1, /* Q5_1 */ - GGML_TYPE_Q8_0, /* Q8_0 */ - GGML_TYPE_COUNT /* COUNT */ +extern const enum ggml_v3_type rwkv_type_to_ggml[TYPE_COUNT + 1] = { + GGML_V3_TYPE_F32, /* FP32 */ + GGML_V3_TYPE_F16, /* FP16 */ + GGML_V3_TYPE_Q4_0, /* Q4_0 */ + GGML_V3_TYPE_Q4_1, /* Q4_1 */ + GGML_V3_TYPE_UNKNOWN, /* Q4_1_O */ + GGML_V3_TYPE_UNKNOWN, /* Q4_2 */ + GGML_V3_TYPE_UNKNOWN, /* Q4_3 */ + GGML_V3_TYPE_Q5_0, /* Q5_0 */ + GGML_V3_TYPE_Q5_1, /* Q5_1 */ + GGML_V3_TYPE_Q8_0, /* Q8_0 */ + GGML_V3_TYPE_COUNT /* COUNT */ }; -extern const enum rwkv_type rwkv_type_from_ggml[GGML_TYPE_COUNT + 1] = { +extern const enum rwkv_type rwkv_type_from_ggml[GGML_V3_TYPE_COUNT + 1] = { TYPE_FP32, /* FP32 */ TYPE_FP16, /* FP16 */ TYPE_Q4_0, /* Q4_0 */ @@ -259,11 +259,11 @@ bool rwkv_fread_file_header(FILE * file, struct rwkv_file_header & header, bool RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_DATA_TYPE, header.data_type < TYPE_COUNT, "Model data type out of range (%" PRId32 " > %" PRId32 ")", header.data_type, TYPE_COUNT - 1); if (verify_data_type) { - enum ggml_type ggml_type = rwkv_type_to_ggml[header.data_type]; + enum ggml_v3_type ggml_v3_type = rwkv_type_to_ggml[header.data_type]; RWKV_ASSERT_FALSE_MSG( RWKV_ERROR_DATA_TYPE, - ggml_type != GGML_TYPE_UNKNOWN, + ggml_v3_type != GGML_V3_TYPE_UNKNOWN, "Models in %s format cannot be loaded anymore because the format was removed.\n" "You need to quantize the model into another format or use an older version of rwkv.cpp.\n" "See https://github.com/saharNooby/rwkv.cpp#compatibility for more info", @@ -272,7 +272,7 @@ bool rwkv_fread_file_header(FILE * file, struct rwkv_file_header & header, bool RWKV_ASSERT_FALSE_MSG( RWKV_ERROR_DATA_TYPE, - (!ggml_is_quantized(ggml_type) || header.version == RWKV_FILE_VERSION_1), + (!ggml_v3_is_quantized(ggml_v3_type) || header.version == RWKV_FILE_VERSION_1), "The quantized model file in %s format was created with an old version of rwkv.cpp and can not be loaded anymore.\n" "You need to requantize the model or use an older version of rwkv.cpp.\n" "See https://github.com/saharNooby/rwkv.cpp#compatibility for more info", @@ -304,11 +304,11 @@ struct rwkv_tensor { uint8_t * data; }; -//rwkv relied on the old ggml_nbytes implementation, so backport it here. Fixes breaking change in PR 2874 -size_t rwkv_nbytes_old(const struct ggml_tensor * tensor) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); +//rwkv relied on the old ggml_v3_nbytes implementation, so backport it here. Fixes breaking change in PR 2874 +size_t rwkv_nbytes_old(const struct ggml_v3_tensor * tensor) { + static_assert(GGML_V3_MAX_DIMS == 4, "GGML_V3_MAX_DIMS is not 4 - update this function"); auto a = tensor->ne[3]*tensor->nb[3]; - auto b = (ggml_nelements(tensor)*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type); + auto b = (ggml_v3_nelements(tensor)*ggml_v3_type_size(tensor->type))/ggml_v3_blck_size(tensor->type); return ((a) > (b) ? 
(a) : (b)); } @@ -319,7 +319,7 @@ bool rwkv_fread_tensor_header(FILE * file, struct rwkv_tensor_header & header) { RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_DATA_TYPE, header.data_type < TYPE_COUNT, "Tensor data type out of range (%" PRId32 " > %" PRId32 ")", header.data_type, TYPE_COUNT - 1); RWKV_ASSERT_FALSE_MSG( RWKV_ERROR_DATA_TYPE, - rwkv_type_to_ggml[header.data_type] != GGML_TYPE_UNKNOWN, + rwkv_type_to_ggml[header.data_type] != GGML_V3_TYPE_UNKNOWN, "Tensor data type (%s) is no longer supported", rwkv_type_to_string[header.data_type] ); @@ -366,27 +366,27 @@ bool rwkv_fread_tensor(FILE * file, struct rwkv_tensor & output, void * buffer = return true; } -bool rwkv_fread_ggml_tensor_data(FILE * file, const struct rwkv_tensor_header & header, struct ggml_context * ctx, std::string & name, struct ggml_tensor *& tensor) { +bool rwkv_fread_ggml_v3_tensor_data(FILE * file, const struct rwkv_tensor_header & header, struct ggml_v3_context * ctx, std::string & name, struct ggml_v3_tensor *& tensor) { RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_FILE_READ, rwkv_fread_string(file, header.key_length, name), "Failed to read tensor name"); - enum ggml_type ggml_type = rwkv_type_to_ggml[header.data_type]; - RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_UNSUPPORTED, ggml_type != GGML_TYPE_UNKNOWN, "Unsupported tensor data type %s from %s", rwkv_type_to_string[header.data_type], name.c_str()); + enum ggml_v3_type ggml_v3_type = rwkv_type_to_ggml[header.data_type]; + RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_UNSUPPORTED, ggml_v3_type != GGML_V3_TYPE_UNKNOWN, "Unsupported tensor data type %s from %s", rwkv_type_to_string[header.data_type], name.c_str()); tensor = header.dim_count == 1 - ? ggml_new_tensor_1d(ctx, ggml_type, header.width) - : ggml_new_tensor_2d(ctx, ggml_type, header.width, header.height); + ? ggml_v3_new_tensor_1d(ctx, ggml_v3_type, header.width) + : ggml_v3_new_tensor_2d(ctx, ggml_v3_type, header.width, header.height); RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_ALLOC, tensor, "Failed to allocate tensor"); - ggml_set_name(tensor, name.c_str()); + ggml_v3_set_name(tensor, name.c_str()); RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_FILE_READ, rwkv_fread_data(file, rwkv_nbytes_old(tensor), tensor->data), "Failed to read tensor data from %s", name.c_str()); return true; } -bool rwkv_fread_ggml_tensor(FILE * file, struct ggml_context * ctx, std::string & name, struct ggml_tensor *& tensor) { +bool rwkv_fread_ggml_v3_tensor(FILE * file, struct ggml_v3_context * ctx, std::string & name, struct ggml_v3_tensor *& tensor) { struct rwkv_tensor_header header; RWKV_ENSURE_OR_FALSE_MSG(rwkv_fread_tensor_header(file, header), "Invalid tensor header"); - return rwkv_fread_ggml_tensor_data(file, header, ctx, name, tensor); + return rwkv_fread_ggml_v3_tensor_data(file, header, ctx, name, tensor); } bool rwkv_fwrite_tensor(FILE * file, const struct rwkv_tensor & tensor) { @@ -399,45 +399,45 @@ bool rwkv_fwrite_tensor(FILE * file, const struct rwkv_tensor & tensor) { // --- Model definition --- struct rwkv_layer { - struct ggml_tensor * ln1_weight; - struct ggml_tensor * ln1_bias; + struct ggml_v3_tensor * ln1_weight; + struct ggml_v3_tensor * ln1_bias; // RWKV, also called "attention" by the author. 
- struct ggml_tensor * att_time_mix_k; - struct ggml_tensor * att_time_mix_v; - struct ggml_tensor * att_time_mix_r; - struct ggml_tensor * att_time_first; - struct ggml_tensor * att_time_decay; - struct ggml_tensor * att_key; - struct ggml_tensor * att_value; - struct ggml_tensor * att_receptance; - struct ggml_tensor * att_output; + struct ggml_v3_tensor * att_time_mix_k; + struct ggml_v3_tensor * att_time_mix_v; + struct ggml_v3_tensor * att_time_mix_r; + struct ggml_v3_tensor * att_time_first; + struct ggml_v3_tensor * att_time_decay; + struct ggml_v3_tensor * att_key; + struct ggml_v3_tensor * att_value; + struct ggml_v3_tensor * att_receptance; + struct ggml_v3_tensor * att_output; - struct ggml_tensor * ln2_weight; - struct ggml_tensor * ln2_bias; + struct ggml_v3_tensor * ln2_weight; + struct ggml_v3_tensor * ln2_bias; // FFN. - struct ggml_tensor * ffn_time_mix_k; - struct ggml_tensor * ffn_time_mix_r; - struct ggml_tensor * ffn_key; - struct ggml_tensor * ffn_value; - struct ggml_tensor * ffn_receptance; + struct ggml_v3_tensor * ffn_time_mix_k; + struct ggml_v3_tensor * ffn_time_mix_r; + struct ggml_v3_tensor * ffn_key; + struct ggml_v3_tensor * ffn_value; + struct ggml_v3_tensor * ffn_receptance; }; struct rwkv_model { struct rwkv_file_header header; - struct ggml_tensor * emb; + struct ggml_v3_tensor * emb; - struct ggml_tensor * ln0_weight; - struct ggml_tensor * ln0_bias; + struct ggml_v3_tensor * ln0_weight; + struct ggml_v3_tensor * ln0_bias; std::unique_ptr layers; - struct ggml_tensor * ln_out_weight; - struct ggml_tensor * ln_out_bias; + struct ggml_v3_tensor * ln_out_weight; + struct ggml_v3_tensor * ln_out_bias; - struct ggml_tensor * head; + struct ggml_v3_tensor * head; }; // --- Operators --- @@ -466,26 +466,26 @@ void rwkv_max_impl(const int n_cols, float * dest, const float * src0, const flo } } -struct ggml_tensor * rwkv_exp(ggml_context * ctx, struct ggml_tensor * x) { - return ggml_map_unary_f32(ctx, x, rwkv_exp_impl); +struct ggml_v3_tensor * rwkv_exp(ggml_v3_context * ctx, struct ggml_v3_tensor * x) { + return ggml_v3_map_unary_f32(ctx, x, rwkv_exp_impl); } -struct ggml_tensor * rwkv_1_minus_x(ggml_context * ctx, struct ggml_tensor * x) { - return ggml_map_unary_f32(ctx, x, rwkv_1_minus_x_impl); +struct ggml_v3_tensor * rwkv_1_minus_x(ggml_v3_context * ctx, struct ggml_v3_tensor * x) { + return ggml_v3_map_unary_f32(ctx, x, rwkv_1_minus_x_impl); } -struct ggml_tensor * rwkv_sigmoid(ggml_context * ctx, struct ggml_tensor * x) { - return ggml_map_unary_f32(ctx, x, rwkv_sigmoid_impl); +struct ggml_v3_tensor * rwkv_sigmoid(ggml_v3_context * ctx, struct ggml_v3_tensor * x) { + return ggml_v3_map_unary_f32(ctx, x, rwkv_sigmoid_impl); } -struct ggml_tensor * rwkv_max(ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * y) { - return ggml_map_binary_f32(ctx, x, y, rwkv_max_impl); +struct ggml_v3_tensor * rwkv_max(ggml_v3_context * ctx, struct ggml_v3_tensor * x, struct ggml_v3_tensor * y) { + return ggml_v3_map_binary_f32(ctx, x, y, rwkv_max_impl); } -struct ggml_tensor * rwkv_layer_norm(ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * weight, struct ggml_tensor * bias) { +struct ggml_v3_tensor * rwkv_layer_norm(ggml_v3_context * ctx, struct ggml_v3_tensor * x, struct ggml_v3_tensor * weight, struct ggml_v3_tensor * bias) { // LayerNorm in RWKV is `x = (x - mean(x)) / sqrt(variance(x) + 1e-5) * weight + bias` - // Looks like ggml_norm does the first part, we only need to apply weight & bias. 
- return ggml_add_inplace(ctx, ggml_mul_inplace(ctx, ggml_norm(ctx, x, default_norm_eps), weight), bias); + // Looks like ggml_v3_norm does the first part, we only need to apply weight & bias. + return ggml_v3_add_inplace(ctx, ggml_v3_mul_inplace(ctx, ggml_v3_norm(ctx, x, default_norm_eps), weight), bias); } // --- Implementation --- @@ -501,7 +501,7 @@ struct rwkv_future_ctx { size_t memory_size = 0; size_t scratch_size = 0; - // Align to GGML_MEM_ALIGN, which can currently be up to 16 + // Align to GGML_V3_MEM_ALIGN, which can currently be up to 16 static const size_t align(const size_t size) { return ((size + 15) & ~15); } @@ -530,18 +530,18 @@ struct rwkv_future_ctx { } } - struct rwkv_future_tensor declare(const enum ggml_type type, const uint64_t width, const uint64_t height = 1); + struct rwkv_future_tensor declare(const enum ggml_v3_type type, const uint64_t width, const uint64_t height = 1); - struct rwkv_future_tensor alloc(const enum ggml_type type, const uint64_t width, const uint64_t height = 1, const bool use_scratch = true); + struct rwkv_future_tensor alloc(const enum ggml_v3_type type, const uint64_t width, const uint64_t height = 1, const bool use_scratch = true); }; struct rwkv_future_tensor { - enum ggml_type type = GGML_TYPE_COUNT; + enum ggml_v3_type type = GGML_V3_TYPE_COUNT; uint64_t width = 0; uint64_t height = 0; - static const size_t size(const enum ggml_type type, const uint64_t width, const uint64_t height) { - struct ggml_tensor decoy {}; + static const size_t size(const enum ggml_v3_type type, const uint64_t width, const uint64_t height) { + struct ggml_v3_tensor decoy {}; decoy.type = type; decoy.ne[0] = width; decoy.ne[1] = height; @@ -551,22 +551,22 @@ struct rwkv_future_tensor { } rwkv_future_tensor() {} - rwkv_future_tensor(const enum ggml_type type, const uint64_t width, const uint64_t height = 1): type(type), width(width), height(height) {} - rwkv_future_tensor(const struct ggml_tensor * ref): type(ref->type), width(ref->ne[0]), height(ref->ne[1]) {} + rwkv_future_tensor(const enum ggml_v3_type type, const uint64_t width, const uint64_t height = 1): type(type), width(width), height(height) {} + rwkv_future_tensor(const struct ggml_v3_tensor * ref): type(ref->type), width(ref->ne[0]), height(ref->ne[1]) {} struct rwkv_future_tensor alloc(struct rwkv_future_ctx & ctx, const bool use_scratch = true) const { - ctx.add_objects(sizeof(struct ggml_tensor)); + ctx.add_objects(sizeof(struct ggml_v3_tensor)); ctx.add_data(use_scratch, rwkv_future_tensor::size(type, width, height)); return *this; } struct rwkv_future_tensor view(struct rwkv_future_ctx & ctx) const { - ctx.add_objects(sizeof(struct ggml_tensor)); + ctx.add_objects(sizeof(struct ggml_v3_tensor)); return *this; } struct rwkv_future_tensor subview(struct rwkv_future_ctx & ctx, const uint32_t width, const uint32_t height = 1) const { - ctx.add_objects(sizeof(struct ggml_tensor), 2); + ctx.add_objects(sizeof(struct ggml_v3_tensor), 2); ctx.add_memory(sizeof(uint32_t) * 2); return rwkv_future_tensor(type, width, height); } @@ -584,7 +584,7 @@ struct rwkv_future_tensor { } struct rwkv_future_tensor set_inplace(struct rwkv_future_ctx & ctx, const struct rwkv_future_tensor src) { - ctx.add_objects(sizeof(struct ggml_tensor)); + ctx.add_objects(sizeof(struct ggml_v3_tensor)); ctx.add_memory(sizeof(uint32_t) * 5); return this->view(ctx); } @@ -598,17 +598,17 @@ struct rwkv_future_tensor { } struct rwkv_future_tensor fn(struct rwkv_future_ctx & ctx) const { - ctx.add_objects(sizeof(struct ggml_tensor)); + 
ctx.add_objects(sizeof(struct ggml_v3_tensor)); ctx.add_memory(sizeof(void *) / sizeof(uint32_t)); return this->dup(ctx); } struct rwkv_future_tensor mul_mat(struct rwkv_future_ctx & ctx, const struct rwkv_future_tensor & other) const { - return ctx.alloc(GGML_TYPE_F32, this->height, other.height); + return ctx.alloc(GGML_V3_TYPE_F32, this->height, other.height); } struct rwkv_future_tensor get_rows(struct rwkv_future_ctx & ctx, const struct rwkv_future_tensor & other) const { - return ctx.alloc(GGML_TYPE_F32, this->width, other.width); + return ctx.alloc(GGML_V3_TYPE_F32, this->width, other.width); } }; @@ -616,21 +616,21 @@ const size_t rwkv_tensor_header::size() const { return rwkv_future_tensor::size(rwkv_type_to_ggml[this->data_type], this->width, this->height); } -struct rwkv_future_tensor rwkv_future_ctx::declare(const enum ggml_type type, const uint64_t width, const uint64_t height) { +struct rwkv_future_tensor rwkv_future_ctx::declare(const enum ggml_v3_type type, const uint64_t width, const uint64_t height) { return rwkv_future_tensor(type, width, height); } -struct rwkv_future_tensor rwkv_future_ctx::alloc(const enum ggml_type type, const uint64_t width, const uint64_t height, const bool use_scratch) { +struct rwkv_future_tensor rwkv_future_ctx::alloc(const enum ggml_v3_type type, const uint64_t width, const uint64_t height, const bool use_scratch) { return this->declare(type, width, height).alloc(*this, use_scratch); } -struct rwkv_ggml_context { +struct rwkv_ggml_v3_context { std::unique_ptr scratch; - struct ggml_context * ctx; + struct ggml_v3_context * ctx; - rwkv_ggml_context(): ctx(NULL) {} + rwkv_ggml_v3_context(): ctx(NULL) {} - rwkv_ggml_context(const struct rwkv_future_ctx future_ctx): ctx(NULL) { + rwkv_ggml_v3_context(const struct rwkv_future_ctx future_ctx): ctx(NULL) { scratch.reset(new(std::nothrow) uint8_t[future_ctx.scratch_size]); if (!scratch) { @@ -640,24 +640,24 @@ struct rwkv_ggml_context { const size_t memory_required_overhead = size_t(128) * 1024 * 1024; const size_t memory_required_overhead_sc = size_t(64) * 1024 * 1024; - ctx = ggml_init({ future_ctx.objects_count * GGML_OBJECT_SIZE + future_ctx.memory_size + memory_required_overhead, NULL, false}); + ctx = ggml_v3_init({ future_ctx.objects_count * GGML_V3_OBJECT_SIZE + future_ctx.memory_size + memory_required_overhead, NULL, false}); if (!ctx) { return; } - ggml_set_scratch(ctx, { 0, memory_required_overhead_sc + future_ctx.scratch_size, scratch.get() }); + ggml_v3_set_scratch(ctx, { 0, memory_required_overhead_sc + future_ctx.scratch_size, scratch.get() }); } - struct rwkv_ggml_context & operator=(struct rwkv_ggml_context && source) { + struct rwkv_ggml_v3_context & operator=(struct rwkv_ggml_v3_context && source) { scratch.reset(source.scratch.release()); std::swap(ctx, source.ctx); return *this; } - ~rwkv_ggml_context() { + ~rwkv_ggml_v3_context() { if (ctx) { - ggml_free(ctx); + ggml_v3_free(ctx); } } }; @@ -666,11 +666,11 @@ struct rwkv_ggml_context { // Contains all the model weights. // Shared by one or more contexts. struct rwkv_instance { - struct rwkv_ggml_context ctx; + struct rwkv_ggml_v3_context ctx; struct rwkv_model model; // TODO Come up with a better solution to estimate "work tensor" size - // The ggml_cgraph allocates a "work tensor" the first time it is used. + // The ggml_v3_cgraph allocates a "work tensor" the first time it is used. // Currently, the height of blocks.0.ffn.key.weight is the bottleneck in our implementation of RWKV. 
     // Since it is the largest dimension used in any matrix multiply, it is the size used for the "work tensor".
     // However, if ggml changes its implementation, or rwkv.cpp changes its own implementation, at any point,
@@ -684,11 +684,11 @@ struct rwkv_instance {
 // But they're also used in building the computation graphs to represent the operations
 // used from input->output (operating "in place" on a rwkv_layer_state).
 struct rwkv_layer_state {
-    struct ggml_tensor * ffn_xx;
-    struct ggml_tensor * att_xx;
-    struct ggml_tensor * att_aa;
-    struct ggml_tensor * att_bb;
-    struct ggml_tensor * att_pp;
+    struct ggml_v3_tensor * ffn_xx;
+    struct ggml_v3_tensor * att_xx;
+    struct ggml_v3_tensor * att_aa;
+    struct ggml_v3_tensor * att_bb;
+    struct ggml_v3_tensor * att_pp;
 };
 
 // Holds a single computation graph and its ggml context.
@@ -696,11 +696,11 @@ struct rwkv_layer_state {
 // Graphs read hidden state from the rwkv_context and then write it back to the rwkv_context.
 // (see rwkv_context.input_layers and rwkv_context.output_layers)
 struct rwkv_graph {
-    struct rwkv_ggml_context ctx;
-    struct ggml_tensor * tokens;
+    struct rwkv_ggml_v3_context ctx;
+    struct ggml_v3_tensor * tokens;
 
-    // ggml_cgraph is so large that it can cause stack overflows if not stored on the heap
-    ggml_cgraph * cgraph;
+    // ggml_v3_cgraph is so large that it can cause stack overflows if not stored on the heap
+    ggml_v3_cgraph * cgraph;
 
     size_t pre_logits_nodes;
     size_t pre_logits_leafs;
@@ -714,12 +714,12 @@ struct rwkv_context {
     std::shared_ptr instance;
 
     // Reused by all graphs.
-    struct rwkv_ggml_context ctx;
-    struct ggml_tensor * input_state;
+    struct rwkv_ggml_v3_context ctx;
+    struct ggml_v3_tensor * input_state;
     std::unique_ptr input_layers;
-    struct ggml_tensor * output_state;
+    struct ggml_v3_tensor * output_state;
     std::unique_ptr output_layers;
-    struct ggml_tensor * logits;
+    struct ggml_v3_tensor * logits;
 
     uint32_t n_threads;
@@ -810,12 +810,12 @@ void rwkv_future_carry_x(struct rwkv_future_ctx & ctx,
     }
 }
 
-void rwkv_carry_x(struct ggml_context * ctx,
-    struct ggml_tensor * weight,
-    struct ggml_tensor * bias,
-    struct ggml_tensor *& x,
-    struct ggml_tensor *& x_prev,
-    struct ggml_tensor *& carry
+void rwkv_carry_x(struct ggml_v3_context * ctx,
+    struct ggml_v3_tensor * weight,
+    struct ggml_v3_tensor * bias,
+    struct ggml_v3_tensor *& x,
+    struct ggml_v3_tensor *& x_prev,
+    struct ggml_v3_tensor *& carry
 ) {
     const size_t n_embed = x->ne[0];
     const size_t sequence_len = x->ne[1];
@@ -831,15 +831,15 @@ void rwkv_carry_x(struct ggml_context * ctx,
         carry = x;
     } else {
         // self.layer_norm(x, self.w.blocks[i].ln2)
-        x = rwkv_layer_norm(ctx, x, ggml_repeat(ctx, weight, x), ggml_repeat(ctx, bias, x));
+        x = rwkv_layer_norm(ctx, x, ggml_v3_repeat(ctx, weight, x), ggml_v3_repeat(ctx, bias, x));
 
         // xx = torch.cat((state[5*i+0].to(dtype=self.FLOAT_MODE).unsqueeze(0), x[:-1,:]))
-        x_prev = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embed, sequence_len);
-        x_prev = ggml_set_1d_inplace(ctx, x_prev, carry, 0);
-        x_prev = ggml_set_1d_inplace(ctx, x_prev, ggml_view_1d(ctx, x, n_embed * (sequence_len - 1), 0), n_embed * sizeof(float));
+        x_prev = ggml_v3_new_tensor_2d(ctx, GGML_V3_TYPE_F32, n_embed, sequence_len);
+        x_prev = ggml_v3_set_1d_inplace(ctx, x_prev, carry, 0);
+        x_prev = ggml_v3_set_1d_inplace(ctx, x_prev, ggml_v3_view_1d(ctx, x, n_embed * (sequence_len - 1), 0), n_embed * sizeof(float));
 
         // state[5*i+0] = x[-1,:]
-        carry = ggml_view_1d(ctx, x, n_embed, n_embed * (sequence_len - 1) * sizeof(float));
+        carry = ggml_v3_view_1d(ctx, x, n_embed, n_embed * (sequence_len - 1) * sizeof(float));
     }
 }
 
@@ -866,38 +866,38 @@ void rwkv_future_att_rkv(struct rwkv_future_ctx & ctx,
 }
 
 void rwkv_att_rkv(
-    struct ggml_context * ctx,
+    struct ggml_v3_context * ctx,
     struct rwkv_layer layer,
-    struct ggml_tensor * x,
-    struct ggml_tensor * x_prev,
-    struct ggml_tensor *& r,
-    struct ggml_tensor *& k,
-    struct ggml_tensor *& v
+    struct ggml_v3_tensor * x,
+    struct ggml_v3_tensor * x_prev,
+    struct ggml_v3_tensor *& r,
+    struct ggml_v3_tensor *& k,
+    struct ggml_v3_tensor *& v
 ) {
     // xk = x * time_mix_k + state[5 * i + 1] * (1 - time_mix_k)
-    struct ggml_tensor * xk = ggml_add_inplace(ctx,
-        ggml_mul(ctx, x, layer.att_time_mix_k),
-        ggml_mul(ctx, x_prev, rwkv_1_minus_x(ctx, layer.att_time_mix_k))
+    struct ggml_v3_tensor * xk = ggml_v3_add_inplace(ctx,
+        ggml_v3_mul(ctx, x, layer.att_time_mix_k),
+        ggml_v3_mul(ctx, x_prev, rwkv_1_minus_x(ctx, layer.att_time_mix_k))
     );
 
     // xv = x * time_mix_v + state[5 * i + 1] * (1 - time_mix_v)
-    struct ggml_tensor * xv = ggml_add_inplace(ctx,
-        ggml_mul(ctx, x, layer.att_time_mix_v),
-        ggml_mul(ctx, x_prev, rwkv_1_minus_x(ctx, layer.att_time_mix_v))
+    struct ggml_v3_tensor * xv = ggml_v3_add_inplace(ctx,
+        ggml_v3_mul(ctx, x, layer.att_time_mix_v),
+        ggml_v3_mul(ctx, x_prev, rwkv_1_minus_x(ctx, layer.att_time_mix_v))
    );
 
     // xr = x * time_mix_r + state[5 * i + 1] * (1 - time_mix_r)
-    struct ggml_tensor * xr = ggml_add_inplace(ctx,
-        ggml_mul(ctx, x, layer.att_time_mix_r),
-        ggml_mul(ctx, x_prev, rwkv_1_minus_x(ctx, layer.att_time_mix_r))
+    struct ggml_v3_tensor * xr = ggml_v3_add_inplace(ctx,
+        ggml_v3_mul(ctx, x, layer.att_time_mix_r),
+        ggml_v3_mul(ctx, x_prev, rwkv_1_minus_x(ctx, layer.att_time_mix_r))
     );
 
     // r = torch.sigmoid(rw @ xr)
-    r = rwkv_sigmoid(ctx, ggml_mul_mat(ctx, layer.att_receptance, xr));
+    r = rwkv_sigmoid(ctx, ggml_v3_mul_mat(ctx, layer.att_receptance, xr));
     // k = kw @ xk
-    k = ggml_mul_mat(ctx, layer.att_key, xk);
+    k = ggml_v3_mul_mat(ctx, layer.att_key, xk);
     // v = vw @ xv
-    v = ggml_mul_mat(ctx, layer.att_value, xv);
+    v = ggml_v3_mul_mat(ctx, layer.att_value, xv);
 }
 
 struct rwkv_future_tensor rwkv_future_att_wkv(struct rwkv_future_ctx & ctx,
@@ -931,48 +931,48 @@ struct rwkv_future_tensor rwkv_future_att_wkv(struct rwkv_future_ctx & ctx,
     return a.combine(ctx, b);
 }
 
-struct ggml_tensor * rwkv_att_wkv(
-    struct ggml_context * ctx,
-    struct ggml_tensor * att_time_first,
-    struct ggml_tensor * att_time_decay,
-    struct ggml_tensor * k,
-    struct ggml_tensor * v,
-    struct ggml_tensor *& aa,
-    struct ggml_tensor *& bb,
-    struct ggml_tensor *& pp
+struct ggml_v3_tensor * rwkv_att_wkv(
+    struct ggml_v3_context * ctx,
+    struct ggml_v3_tensor * att_time_first,
+    struct ggml_v3_tensor * att_time_decay,
+    struct ggml_v3_tensor * k,
+    struct ggml_v3_tensor * v,
+    struct ggml_v3_tensor *& aa,
+    struct ggml_v3_tensor *& bb,
+    struct ggml_v3_tensor *& pp
 ) {
     // ww = time_first + k
-    struct ggml_tensor * ww = ggml_add(ctx, att_time_first, k);
+    struct ggml_v3_tensor * ww = ggml_v3_add(ctx, att_time_first, k);
     // qq = torch.maximum(pp, ww)
-    struct ggml_tensor * qq = rwkv_max(ctx, pp, ww);
+    struct ggml_v3_tensor * qq = rwkv_max(ctx, pp, ww);
     // e1 = torch.exp(pp - qq)
-    struct ggml_tensor * e1 = rwkv_exp(ctx, ggml_sub(ctx, pp, qq));
+    struct ggml_v3_tensor * e1 = rwkv_exp(ctx, ggml_v3_sub(ctx, pp, qq));
     // e2 = torch.exp(ww - qq)
-    struct ggml_tensor * e2 = rwkv_exp(ctx, ggml_sub(ctx, ww, qq));
+    struct ggml_v3_tensor * e2 = rwkv_exp(ctx, ggml_v3_sub(ctx, ww, qq));
     // a = e1 * aa + e2 * v
-    struct ggml_tensor * a = ggml_add_inplace(ctx, ggml_mul(ctx, e1, aa), ggml_mul(ctx, e2, v));
+    struct ggml_v3_tensor * a = ggml_v3_add_inplace(ctx, ggml_v3_mul(ctx, e1, aa), ggml_v3_mul(ctx, e2, v));
     // b = e1 * bb + e2
-    struct ggml_tensor * b = ggml_add_inplace(ctx, ggml_mul(ctx, e1, bb), e2);
+    struct ggml_v3_tensor * b = ggml_v3_add_inplace(ctx, ggml_v3_mul(ctx, e1, bb), e2);
 
     // ww = pp + time_decay
-    ww = ggml_add(ctx, pp, att_time_decay);
+    ww = ggml_v3_add(ctx, pp, att_time_decay);
     // qq = torch.maximum(ww, k)
     qq = rwkv_max(ctx, ww, k);
     // e1 = torch.exp(ww - qq)
-    e1 = rwkv_exp(ctx, ggml_sub(ctx, ww, qq));
+    e1 = rwkv_exp(ctx, ggml_v3_sub(ctx, ww, qq));
     // e2 = torch.exp(k[t] - qq)
-    e2 = rwkv_exp(ctx, ggml_sub(ctx, k, qq));
+    e2 = rwkv_exp(ctx, ggml_v3_sub(ctx, k, qq));
 
     // state[5 * i + 2] = e1 * aa + e2 * v
     // state[5 * i + 3] = e1 * bb + e2
     // state[5 * i + 4] = qq
-    aa = ggml_add_inplace(ctx, ggml_mul(ctx, e1, aa), ggml_mul(ctx, e2, v));
-    bb = ggml_add_inplace(ctx, ggml_mul(ctx, e1, bb), e2);
+    aa = ggml_v3_add_inplace(ctx, ggml_v3_mul(ctx, e1, aa), ggml_v3_mul(ctx, e2, v));
+    bb = ggml_v3_add_inplace(ctx, ggml_v3_mul(ctx, e1, bb), e2);
     pp = qq;
 
     // wkv = a / b
-    return ggml_div(ctx, a, b);
+    return ggml_v3_div(ctx, a, b);
 }
 
@@ -1005,17 +1005,17 @@ struct rwkv_future_tensor rwkv_future_att(struct rwkv_future_ctx & ctx,
     return att_output.mul_mat(ctx, r.combine(ctx, wkv));
 }
 
-struct ggml_tensor * rwkv_att(struct ggml_context * ctx, struct ggml_tensor * x, struct rwkv_layer layer, struct rwkv_layer_state & state) {
-    struct ggml_tensor * x_prev;
+struct ggml_v3_tensor * rwkv_att(struct ggml_v3_context * ctx, struct ggml_v3_tensor * x, struct rwkv_layer layer, struct rwkv_layer_state & state) {
+    struct ggml_v3_tensor * x_prev;
     rwkv_carry_x(ctx, layer.ln1_weight, layer.ln1_bias, x, x_prev, state.att_xx);
 
-    struct ggml_tensor * r, * k, * v;
+    struct ggml_v3_tensor * r, * k, * v;
     rwkv_att_rkv(ctx, layer, x, x_prev, r, k, v);
 
-    struct ggml_tensor * wkv = rwkv_att_wkv(ctx, layer.att_time_first, layer.att_time_decay, k, v, state.att_aa, state.att_bb, state.att_pp);
+    struct ggml_v3_tensor * wkv = rwkv_att_wkv(ctx, layer.att_time_first, layer.att_time_decay, k, v, state.att_aa, state.att_bb, state.att_pp);
 
     // ow @ (r * xx)
-    return ggml_mul_mat(ctx, layer.att_output, ggml_mul(ctx, r, wkv));
+    return ggml_v3_mul_mat(ctx, layer.att_output, ggml_v3_mul(ctx, r, wkv));
 }
 
 struct rwkv_future_tensor rwkv_future_ffn(struct rwkv_future_ctx & ctx,
@@ -1041,47 +1041,47 @@ struct rwkv_future_tensor rwkv_future_ffn(struct rwkv_future_ctx & ctx,
     return r.consume(ctx, ffn_v.mul_mat(ctx, k));
 }
 
-struct ggml_tensor * rwkv_ffn(struct ggml_context * ctx, struct ggml_tensor * x, struct rwkv_layer layer, struct rwkv_layer_state & state) {
-    struct ggml_tensor * x_prev;
+struct ggml_v3_tensor * rwkv_ffn(struct ggml_v3_context * ctx, struct ggml_v3_tensor * x, struct rwkv_layer layer, struct rwkv_layer_state & state) {
+    struct ggml_v3_tensor * x_prev;
     rwkv_carry_x(ctx, layer.ln2_weight, layer.ln2_bias, x, x_prev, state.ffn_xx);
 
     // xk = x * time_mix_k + state[5 * i + 1] * (1 - time_mix_k)
     // xk = x * time_mix_k + state[5 * i + 0] * (1 - time_mix_k)
-    struct ggml_tensor * xk = ggml_add_inplace(
+    struct ggml_v3_tensor * xk = ggml_v3_add_inplace(
         ctx,
-        ggml_mul(ctx, x, layer.ffn_time_mix_k),
-        ggml_mul(ctx, x_prev, rwkv_1_minus_x(ctx, layer.ffn_time_mix_k))
+        ggml_v3_mul(ctx, x, layer.ffn_time_mix_k),
+        ggml_v3_mul(ctx, x_prev, rwkv_1_minus_x(ctx, layer.ffn_time_mix_k))
     );
 
     // xr = x * time_mix_r + state[5 * i + 0] * (1 - time_mix_r)
-    struct ggml_tensor * xr = ggml_add_inplace(
+    struct ggml_v3_tensor * xr = ggml_v3_add_inplace(
         ctx,
-        ggml_mul(ctx, x, layer.ffn_time_mix_r),
-        ggml_mul(ctx, x_prev, rwkv_1_minus_x(ctx, layer.ffn_time_mix_r))
+        ggml_v3_mul(ctx, x, layer.ffn_time_mix_r),
+        ggml_v3_mul(ctx, x_prev, rwkv_1_minus_x(ctx, layer.ffn_time_mix_r))
     );
 
     // r = torch.sigmoid(rw @ xr)
-    struct ggml_tensor * r = rwkv_sigmoid(ctx, ggml_mul_mat(ctx, layer.ffn_receptance, xr));
+    struct ggml_v3_tensor * r = rwkv_sigmoid(ctx, ggml_v3_mul_mat(ctx, layer.ffn_receptance, xr));
     // k = torch.square(torch.relu(kw @ xk))
-    struct ggml_tensor * k = ggml_sqr_inplace(ctx, ggml_relu_inplace(ctx, ggml_mul_mat(ctx, layer.ffn_key, xk)));
+    struct ggml_v3_tensor * k = ggml_v3_sqr_inplace(ctx, ggml_v3_relu_inplace(ctx, ggml_v3_mul_mat(ctx, layer.ffn_key, xk)));
     // r * (vw @ k)
-    return ggml_mul_inplace(ctx, r, ggml_mul_mat(ctx, layer.ffn_value, k));
+    return ggml_v3_mul_inplace(ctx, r, ggml_v3_mul_mat(ctx, layer.ffn_value, k));
 }
 
 struct rwkv_future_tensor rwkv_future_graph_work(struct rwkv_future_ctx & ctx,
-    const enum ggml_type type,
+    const enum ggml_v3_type type,
     const size_t ffn_key_height,
     const size_t n_threads,
     const size_t sequence_len = 1
 ) {
 #if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
-    enum ggml_type mul_mat_type = type == GGML_TYPE_F32 ? GGML_TYPE_F32 : GGML_TYPE_F16;
+    enum ggml_v3_type mul_mat_type = type == GGML_V3_TYPE_F32 ? GGML_V3_TYPE_F32 : GGML_V3_TYPE_F16;
 #else
-    enum ggml_type mul_mat_type = ggml_is_quantized(type) ? GGML_TYPE_Q8_1 : type;
+    enum ggml_v3_type mul_mat_type = ggml_v3_is_quantized(type) ? GGML_V3_TYPE_Q8_1 : type;
 #endif
-    return ctx.alloc(GGML_TYPE_I8, rwkv_future_tensor::size(mul_mat_type, ffn_key_height, sequence_len) * n_threads + 64 * (n_threads - 1));
+    return ctx.alloc(GGML_V3_TYPE_I8, rwkv_future_tensor::size(mul_mat_type, ffn_key_height, sequence_len) * n_threads + 64 * (n_threads - 1));
 }
 
 struct rwkv_future_tensor rwkv_future_serial_graph(struct rwkv_future_ctx & ctx,
@@ -1148,13 +1148,13 @@ struct rwkv_future_tensor rwkv_future_serial_graph(struct rwkv_future_ctx & ctx,
 }
 
 bool rwkv_build_serial_graph(
-    struct ggml_context * ctx,
+    struct ggml_v3_context * ctx,
     struct rwkv_model & model,
-    struct ggml_tensor * tokens,
+    struct ggml_v3_tensor * tokens,
     struct rwkv_layer_state * inputs,
     struct rwkv_layer_state * outputs,
-    struct ggml_tensor * logits,
-    struct ggml_cgraph * cgraph,
+    struct ggml_v3_tensor * logits,
+    struct ggml_v3_cgraph * cgraph,
 
     size_t * const pre_logits_nodes,
     size_t * const pre_logits_leafs,
@@ -1162,7 +1162,7 @@ bool rwkv_build_serial_graph(
     size_t * const post_logits_leafs
 ) {
     // x = self.w.emb.weight[token]
-    struct ggml_tensor * x = ggml_get_rows(ctx, model.emb, tokens);
+    struct ggml_v3_tensor * x = ggml_v3_get_rows(ctx, model.emb, tokens);
 
     // x = self.layer_norm(x, self.w.blocks[0].ln0)
     x = rwkv_layer_norm(ctx, x, model.ln0_weight, model.ln0_bias);
@@ -1171,15 +1171,15 @@ bool rwkv_build_serial_graph(
         struct rwkv_layer & layer = model.layers[i];
         struct rwkv_layer_state state = inputs[i];
-        x = ggml_add_inplace(ctx, x, rwkv_att(ctx, x, layer, state));
-        x = ggml_add_inplace(ctx, x, rwkv_ffn(ctx, x, layer, state));
+        x = ggml_v3_add_inplace(ctx, x, rwkv_att(ctx, x, layer, state));
+        x = ggml_v3_add_inplace(ctx, x, rwkv_ffn(ctx, x, layer, state));
 
         struct rwkv_layer_state & output = outputs[i];
-        ggml_build_forward_expand(cgraph, ggml_cpy(ctx, state.ffn_xx, output.ffn_xx));
-        ggml_build_forward_expand(cgraph, ggml_cpy(ctx, state.att_xx, output.att_xx));
-        ggml_build_forward_expand(cgraph, ggml_cpy(ctx, state.att_aa, output.att_aa));
-        ggml_build_forward_expand(cgraph, ggml_cpy(ctx, state.att_bb, output.att_bb));
-        ggml_build_forward_expand(cgraph, ggml_cpy(ctx, state.att_pp, output.att_pp));
+        ggml_v3_build_forward_expand(cgraph, ggml_v3_cpy(ctx, state.ffn_xx, output.ffn_xx));
+        ggml_v3_build_forward_expand(cgraph, ggml_v3_cpy(ctx, state.att_xx, output.att_xx));
+        ggml_v3_build_forward_expand(cgraph, ggml_v3_cpy(ctx, state.att_aa, output.att_aa));
+        ggml_v3_build_forward_expand(cgraph, ggml_v3_cpy(ctx, state.att_bb, output.att_bb));
+        ggml_v3_build_forward_expand(cgraph, ggml_v3_cpy(ctx, state.att_pp, output.att_pp));
     }
 
     *pre_logits_nodes = cgraph->n_nodes;
@@ -1189,7 +1189,7 @@ bool rwkv_build_serial_graph(
     x = rwkv_layer_norm(ctx, x, model.ln_out_weight, model.ln_out_bias);
 
     // x = (self.w.head.weight @ x).float()
-    ggml_build_forward_expand(cgraph, ggml_cpy(ctx, ggml_mul_mat(ctx, model.head, x), logits));
+    ggml_v3_build_forward_expand(cgraph, ggml_v3_cpy(ctx, ggml_v3_mul_mat(ctx, model.head, x), logits));
 
     *post_logits_nodes = cgraph->n_nodes;
     *post_logits_leafs = cgraph->n_leafs;
@@ -1272,13 +1272,13 @@ struct rwkv_future_tensor rwkv_future_sequence_graph(struct rwkv_future_ctx & ct
 }
 
 bool rwkv_build_sequence_graph(
-    struct ggml_context * ctx,
+    struct ggml_v3_context * ctx,
     struct rwkv_model & model,
-    struct ggml_tensor * tokens,
+    struct ggml_v3_tensor * tokens,
     struct rwkv_layer_state * inputs,
     struct rwkv_layer_state * outputs,
-    struct ggml_tensor * logits,
-    struct ggml_cgraph * cgraph,
+    struct ggml_v3_tensor * logits,
+    struct ggml_v3_cgraph * cgraph,
 
     size_t * const pre_logits_nodes,
     size_t * const pre_logits_leafs,
@@ -1288,48 +1288,48 @@ bool rwkv_build_sequence_graph(
     const uint32_t n_embed = model.header.n_embed;
     const size_t sequence_len = tokens->ne[0];
 
-    struct ggml_tensor * x = ggml_get_rows(ctx, model.emb, tokens);
-    x = rwkv_layer_norm(ctx, x, ggml_repeat(ctx, model.ln0_weight, x), ggml_repeat(ctx, model.ln0_bias, x));
+    struct ggml_v3_tensor * x = ggml_v3_get_rows(ctx, model.emb, tokens);
+    x = rwkv_layer_norm(ctx, x, ggml_v3_repeat(ctx, model.ln0_weight, x), ggml_v3_repeat(ctx, model.ln0_bias, x));
 
     for (size_t i = 0; i < model.header.n_layer; i++) {
         struct rwkv_layer & layer = model.layers[i];
         struct rwkv_layer_state state = inputs[i];
-        struct ggml_tensor * x0 = x, * x_prev;
+        struct ggml_v3_tensor * x0 = x, * x_prev;
         rwkv_carry_x(ctx, layer.ln1_weight, layer.ln1_bias, x0, x_prev, state.att_xx);
 
-        struct ggml_tensor * r, * k, * v;
+        struct ggml_v3_tensor * r, * k, * v;
         rwkv_att_rkv(ctx, layer, x0, x_prev, r, k, v);
 
-        ggml_build_forward_expand(cgraph, r);
+        ggml_v3_build_forward_expand(cgraph, r);
         for (uint32_t t = 0; t < sequence_len; t++) {
-            struct ggml_tensor * kt = ggml_view_1d(ctx, k, n_embed, n_embed * sizeof(float) * t);
-            struct ggml_tensor * vt = ggml_view_1d(ctx, v, n_embed, n_embed * sizeof(float) * t);
-            struct ggml_tensor * xt = ggml_view_1d(ctx, x_prev, n_embed, n_embed * sizeof(float) * t);
-            struct ggml_tensor * wkv = rwkv_att_wkv(ctx, layer.att_time_first, layer.att_time_decay, kt, vt, state.att_aa, state.att_bb, state.att_pp);
-            ggml_build_forward_expand(cgraph, ggml_cpy(ctx, wkv, xt));
+            struct ggml_v3_tensor * kt = ggml_v3_view_1d(ctx, k, n_embed, n_embed * sizeof(float) * t);
+            struct ggml_v3_tensor * vt = ggml_v3_view_1d(ctx, v, n_embed, n_embed * sizeof(float) * t);
+            struct ggml_v3_tensor * xt = ggml_v3_view_1d(ctx, x_prev, n_embed, n_embed * sizeof(float) * t);
+            struct ggml_v3_tensor * wkv = rwkv_att_wkv(ctx, layer.att_time_first, layer.att_time_decay, kt, vt, state.att_aa, state.att_bb, state.att_pp);
+            ggml_v3_build_forward_expand(cgraph, ggml_v3_cpy(ctx, wkv, xt));
         }
 
-        x = ggml_add_inplace(ctx, x, ggml_mul_mat(ctx, layer.att_output, ggml_mul(ctx, r, x_prev)));
-        x = ggml_add_inplace(ctx, x, rwkv_ffn(ctx, x, layer, state));
+        x = ggml_v3_add_inplace(ctx, x, ggml_v3_mul_mat(ctx, layer.att_output, ggml_v3_mul(ctx, r, x_prev)));
+        x = ggml_v3_add_inplace(ctx, x, rwkv_ffn(ctx, x, layer, state));
 
         struct rwkv_layer_state & output = outputs[i];
-        ggml_build_forward_expand(cgraph, ggml_cpy(ctx, state.ffn_xx, output.ffn_xx));
-        ggml_build_forward_expand(cgraph, ggml_cpy(ctx, state.att_xx, output.att_xx));
-        ggml_build_forward_expand(cgraph, ggml_cpy(ctx, state.att_aa, output.att_aa));
-        ggml_build_forward_expand(cgraph, ggml_cpy(ctx, state.att_bb, output.att_bb));
-        ggml_build_forward_expand(cgraph, ggml_cpy(ctx, state.att_pp, output.att_pp));
+        ggml_v3_build_forward_expand(cgraph, ggml_v3_cpy(ctx, state.ffn_xx, output.ffn_xx));
+        ggml_v3_build_forward_expand(cgraph, ggml_v3_cpy(ctx, state.att_xx, output.att_xx));
+        ggml_v3_build_forward_expand(cgraph, ggml_v3_cpy(ctx, state.att_aa, output.att_aa));
+        ggml_v3_build_forward_expand(cgraph, ggml_v3_cpy(ctx, state.att_bb, output.att_bb));
+        ggml_v3_build_forward_expand(cgraph, ggml_v3_cpy(ctx, state.att_pp, output.att_pp));
     }
 
     *pre_logits_nodes = cgraph->n_nodes;
     *pre_logits_leafs = cgraph->n_leafs;
 
     // x = self.layer_norm(x[-1,:], self.w.ln_out)
-    x = rwkv_layer_norm(ctx, ggml_view_1d(ctx, x, n_embed, n_embed * sizeof(float) * (sequence_len - 1)), model.ln_out_weight, model.ln_out_bias);
+    x = rwkv_layer_norm(ctx, ggml_v3_view_1d(ctx, x, n_embed, n_embed * sizeof(float) * (sequence_len - 1)), model.ln_out_weight, model.ln_out_bias);
 
     // x = (self.w.head.weight @ x).float()
-    ggml_build_forward_expand(cgraph, ggml_cpy(ctx, ggml_mul_mat(ctx, model.head, x), logits));
+    ggml_v3_build_forward_expand(cgraph, ggml_v3_cpy(ctx, ggml_v3_mul_mat(ctx, model.head, x), logits));
 
     *post_logits_nodes = cgraph->n_nodes;
     *post_logits_leafs = cgraph->n_leafs;
@@ -1368,10 +1368,10 @@ struct rwkv_file {
 bool rwkv_instance_from_file(const char * file_path, struct rwkv_instance & instance) {
     struct stat file_stat;
     struct rwkv_model model;
-    struct rwkv_ggml_context ctx;
+    struct rwkv_ggml_v3_context ctx;
     size_t ffn_key_size = 0;
-    std::unordered_map parameters;
+    std::unordered_map parameters;
 
     {
         rwkv_file file(fopen(file_path, "rb"));
@@ -1403,25 +1403,25 @@ bool rwkv_instance_from_file(const char * file_path, struct rwkv_instance & inst
             ctx = future_ctx;
             RWKV_ASSERT_NULL_MSG(RWKV_ERROR_CTX | RWKV_ERROR_ALLOC, ctx.ctx, "Failed to allocate model context");
 
-            struct ggml_tensor * tensor;
+            struct ggml_v3_tensor * tensor;
 
             while ((size_t) ftell(file.file) < (size_t) file_stat.st_size) {
-                RWKV_ASSERT_NULL_MSG(RWKV_ERROR_MODEL_PARAMS, rwkv_fread_ggml_tensor(file.file, ctx.ctx, name, tensor), "Failed to read model params");
+                RWKV_ASSERT_NULL_MSG(RWKV_ERROR_MODEL_PARAMS, rwkv_fread_ggml_v3_tensor(file.file, ctx.ctx, name, tensor), "Failed to read model params");
                 parameters[std::move(name)] = tensor;
             }
        }
    }
 
-    std::unordered_map & parameters_ref = parameters;
-    RWKV_ASSERT_NULL(RWKV_ERROR_MODEL_PARAMS | RWKV_ERROR_PARAM_MISSING, rwkv_set_params(model, [&](const char * key, struct ggml_tensor *& dest) {
-        struct ggml_tensor * tensor = parameters_ref[key];
+    std::unordered_map & parameters_ref = parameters;
+    RWKV_ASSERT_NULL(RWKV_ERROR_MODEL_PARAMS | RWKV_ERROR_PARAM_MISSING, rwkv_set_params(model, [&](const char * key, struct ggml_v3_tensor *& dest) {
+        struct ggml_v3_tensor * tensor = parameters_ref[key];
         RWKV_ENSURE_OR_FALSE_MSG(tensor, "Model parameter %s not found", key);
         dest = tensor;
         return true;
     }));
 
     // Verify order of dimensions
-    struct ggml_tensor * emb = model.emb;
-    RWKV_ASSERT_NULL_MSG(RWKV_ERROR_MODEL_PARAMS | RWKV_ERROR_SHAPE, ggml_n_dims(emb) == 2, "Unexpected dimension count of embedding matrix %d", ggml_n_dims(emb));
+    struct ggml_v3_tensor * emb = model.emb;
+    RWKV_ASSERT_NULL_MSG(RWKV_ERROR_MODEL_PARAMS | RWKV_ERROR_SHAPE, ggml_v3_n_dims(emb) == 2, "Unexpected dimension count of embedding matrix %d", ggml_v3_n_dims(emb));
     RWKV_ASSERT_NULL_MSG(RWKV_ERROR_MODEL_PARAMS | RWKV_ERROR_DIMENSION, emb->ne[0] == model.header.n_embed, "Unexpected dimension of embedding matrix %" PRId64, emb->ne[0]);
     RWKV_ASSERT_NULL_MSG(RWKV_ERROR_MODEL_PARAMS | RWKV_ERROR_DIMENSION, emb->ne[1] == model.header.n_vocab, "Unexpected dimension of embedding matrix %" PRId64, emb->ne[1]);
@@ -1440,9 +1440,9 @@ struct rwkv_context * rwkv_new_context_impl(std::shared_ptr inputs(new(std::nothrow) struct rwkv_layer_state[n_layer]);
@@ -1468,24 +1468,24 @@ struct rwkv_context * rwkv_new_context_impl(std::shared_ptrmodel;
     const struct rwkv_layer & layer = model.layers[0];
@@ -1519,8 +1519,8 @@ struct rwkv_context * rwkv_new_context_impl(std::shared_ptr
-        tensor->backend = GGML_BACKEND_GPU;
+        tensor->backend = GGML_V3_BACKEND_GPU;
 
 #if defined(GGML_USE_CLBLAST)
-        ggml_cl_transform_tensor(tensor->data, tensor);
+        ggml_v3_cl_transform_tensor(tensor->data, tensor);
 #else
-        ggml_cuda_transform_tensor(tensor->data, tensor);
+        ggml_v3_cuda_transform_tensor(tensor->data, tensor);
 #endif
     };
@@ -1584,7 +1584,7 @@ bool rwkv_gpu_offload_layers(struct rwkv_context * ctx, const uint32_t n_layers)
     for (size_t & i = ctx->gpu_layers; i < n_gpu; i++) {
         const struct rwkv_layer & layer = ctx->instance->model.layers[i];
 
-        // TODO also offload other operations to GPU with ggml_cuda_assign_buffers
+        // TODO also offload other operations to GPU with ggml_v3_cuda_assign_buffers
         offload(layer.att_key);
         offload(layer.att_value);
         offload(layer.att_receptance);
@@ -1627,7 +1627,7 @@ bool rwkv_eval(struct rwkv_context * ctx, const int n_threads, const uint32_t to
     RWKV_CTX_ASSERT_FALSE_MSG(ctx, RWKV_ERROR_ARGS, token < n_vocab, "Token (%" PRId32 ") is out of range (0 .. %zu)", token, n_vocab - 1);
%zu)", token, n_vocab - 1); rwkv_set_inputs(ctx, state_in); - ggml_set_i32(ctx->serial_graph.tokens, token); + ggml_v3_set_i32(ctx->serial_graph.tokens, token); // Short circuit computation of logits if nobody actually cares if (!logits_out) { @@ -1663,7 +1663,7 @@ bool rwkv_eval_sequence(struct rwkv_context * ctx, const int n_threads, const ui // Build new sequence graph struct rwkv_future_ctx graph_future_ctx; - const struct rwkv_future_tensor future_tokens = graph_future_ctx.alloc(GGML_TYPE_I32, sequence_len); + const struct rwkv_future_tensor future_tokens = graph_future_ctx.alloc(GGML_V3_TYPE_I32, sequence_len); const struct rwkv_model & model = ctx->instance->model; const struct rwkv_layer & layer = model.layers[0]; @@ -1697,8 +1697,8 @@ bool rwkv_eval_sequence(struct rwkv_context * ctx, const int n_threads, const ui struct rwkv_graph sequence_graph; sequence_graph.ctx = graph_future_ctx; RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_CTX | RWKV_ERROR_ALLOC, sequence_graph.ctx.ctx, "Failed to allocate sequence graph context"); - sequence_graph.tokens = ggml_new_tensor_1d(sequence_graph.ctx.ctx, GGML_TYPE_I32, sequence_len); - sequence_graph.cgraph = ggml_new_graph_custom(sequence_graph.ctx.ctx, GGML_MAX_NODES, false); + sequence_graph.tokens = ggml_v3_new_tensor_1d(sequence_graph.ctx.ctx, GGML_V3_TYPE_I32, sequence_len); + sequence_graph.cgraph = ggml_v3_new_graph_custom(sequence_graph.ctx.ctx, GGML_V3_MAX_NODES, false); RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_ALLOC, sequence_graph.cgraph, "Failed to allocate sequence graph"); RWKV_ASSERT_FALSE(RWKV_ERROR_GRAPH, rwkv_build_sequence_graph( @@ -1788,8 +1788,8 @@ void rwkv_free(struct rwkv_context * ctx) { bool rwkv_quantize_model_file(const char * in_path, const char * out_path, const char * type_name) { global_last_error = RWKV_ERROR_NONE; - enum ggml_type out_type = rwkv_type_to_ggml[rwkv_type_from_string(type_name)]; - RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_ARGS | RWKV_ERROR_DATA_TYPE, ggml_is_quantized(out_type), "Unsupported output data type (%s)", rwkv_type_to_string[rwkv_type_from_ggml[out_type]]); + enum ggml_v3_type out_type = rwkv_type_to_ggml[rwkv_type_from_string(type_name)]; + RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_ARGS | RWKV_ERROR_DATA_TYPE, ggml_v3_is_quantized(out_type), "Unsupported output data type (%s)", rwkv_type_to_string[rwkv_type_from_ggml[out_type]]); RWKV_MSG("Loading model from '%s'\n", in_path); @@ -1807,10 +1807,10 @@ bool rwkv_quantize_model_file(const char * in_path, const char * out_path, const struct rwkv_file_header in_header; RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_FILE, rwkv_fread_file_header(in_file.file, in_header), "Invalid file header"); - enum ggml_type in_type = rwkv_type_to_ggml[in_header.data_type]; + enum ggml_v3_type in_type = rwkv_type_to_ggml[in_header.data_type]; RWKV_ASSERT_FALSE_MSG( RWKV_ERROR_FILE, - in_type == GGML_TYPE_F32 || in_type == GGML_TYPE_F16, + in_type == GGML_V3_TYPE_F32 || in_type == GGML_V3_TYPE_F16, "Unsupported input data type (%s); needs to be FP32 or FP16", rwkv_type_to_string[rwkv_type_from_ggml[in_type]] ); @@ -1825,8 +1825,8 @@ bool rwkv_quantize_model_file(const char * in_path, const char * out_path, const size_t new_total_size = 0; // Required to init the F16 tables - // Doesn't crash if ggml_init fails - ggml_free(ggml_init({ 0, NULL, true })); + // Doesn't crash if ggml_v3_init fails + ggml_v3_free(ggml_v3_init({ 0, NULL, true })); size_t max_in_size = 0; size_t max_out_size = 0; @@ -1848,7 +1848,7 @@ bool rwkv_quantize_model_file(const char * in_path, const char * out_path, const max_out_size = 
         }
 
-        size_t f32_size = rwkv_future_tensor::size(GGML_TYPE_F32, header.width, header.height);
+        size_t f32_size = rwkv_future_tensor::size(GGML_V3_TYPE_F32, header.width, header.height);
 
         if (f32_size > max_in_size) {
             max_in_size = f32_size;
@@ -1902,11 +1902,11 @@ bool rwkv_quantize_model_file(const char * in_path, const char * out_path, const
             size_t nelements = (size_t) header.width * (size_t) header.height;
 
             if (header.data_type == TYPE_FP16) {
-                ggml_fp16_to_fp32_row((const ggml_fp16_t *) out_buf, (float *) in_buf, nelements);
+                ggml_v3_fp16_to_fp32_row((const ggml_v3_fp16_t *) out_buf, (float *) in_buf, nelements);
             }
 
             int64_t hist_cur[16] {};
-            new_size = ggml_quantize_chunk(out_type, (const float *) in_buf, out_buf, 0, nelements, hist_cur);
+            new_size = ggml_v3_quantize_chunk(out_type, (const float *) in_buf, out_buf, 0, nelements, hist_cur);
             header.data_type = rwkv_type_from_ggml[out_type];
             data = out_buf;
@@ -1952,18 +1952,18 @@ const char * rwkv_get_system_info_string(void) {
     static std::string s;
     s = "";
-    s += "AVX=" + std::to_string(ggml_cpu_has_avx()) + " ";
-    s += "AVX2=" + std::to_string(ggml_cpu_has_avx2()) + " ";
-    s += "AVX512=" + std::to_string(ggml_cpu_has_avx512()) + " ";
-    s += "FMA=" + std::to_string(ggml_cpu_has_fma()) + " ";
-    s += "NEON=" + std::to_string(ggml_cpu_has_neon()) + " ";
-    s += "ARM_FMA=" + std::to_string(ggml_cpu_has_arm_fma()) + " ";
-    s += "F16C=" + std::to_string(ggml_cpu_has_f16c()) + " ";
-    s += "FP16_VA=" + std::to_string(ggml_cpu_has_fp16_va()) + " ";
-    s += "WASM_SIMD=" + std::to_string(ggml_cpu_has_wasm_simd()) + " ";
-    s += "BLAS=" + std::to_string(ggml_cpu_has_blas()) + " ";
-    s += "SSE3=" + std::to_string(ggml_cpu_has_sse3()) + " ";
-    s += "VSX=" + std::to_string(ggml_cpu_has_vsx());
+    s += "AVX=" + std::to_string(ggml_v3_cpu_has_avx()) + " ";
+    s += "AVX2=" + std::to_string(ggml_v3_cpu_has_avx2()) + " ";
+    s += "AVX512=" + std::to_string(ggml_v3_cpu_has_avx512()) + " ";
+    s += "FMA=" + std::to_string(ggml_v3_cpu_has_fma()) + " ";
+    s += "NEON=" + std::to_string(ggml_v3_cpu_has_neon()) + " ";
+    s += "ARM_FMA=" + std::to_string(ggml_v3_cpu_has_arm_fma()) + " ";
+    s += "F16C=" + std::to_string(ggml_v3_cpu_has_f16c()) + " ";
+    s += "FP16_VA=" + std::to_string(ggml_v3_cpu_has_fp16_va()) + " ";
+    s += "WASM_SIMD=" + std::to_string(ggml_v3_cpu_has_wasm_simd()) + " ";
+    s += "BLAS=" + std::to_string(ggml_v3_cpu_has_blas()) + " ";
+    s += "SSE3=" + std::to_string(ggml_v3_cpu_has_sse3()) + " ";
+    s += "VSX=" + std::to_string(ggml_v3_cpu_has_vsx());
 
     return s.c_str();
 }
\ No newline at end of file
diff --git a/otherarch/utils.cpp b/otherarch/utils.cpp
index 16e015c84..62df47a8b 100644
--- a/otherarch/utils.cpp
+++ b/otherarch/utils.cpp
@@ -9,7 +9,6 @@
 #include
 
-
 void utreplace(std::string & str, const std::string & needle, const std::string & replacement) {
     size_t pos = 0;
     while ((pos = str.find(needle, pos)) != std::string::npos) {
@@ -224,13 +223,13 @@ bool should_transpose_layer(std::string name)
 }
 
 static std::vector kcpp_compute_buf;
-void kcpp_graph_compute_helper(ggml_cgraph *graph, int n_threads)
+void kcpp_graph_compute_helper(struct ggml_v3_cgraph *graph, int n_threads)
 {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+    struct ggml_v3_cplan plan = ggml_v3_graph_plan(graph, n_threads);
     if (plan.work_size > 0)
     {
         kcpp_compute_buf.resize(plan.work_size);
         plan.work_data = kcpp_compute_buf.data();
     }
-    ggml_graph_compute(graph, &plan);
-}
\ No newline at end of file
+    ggml_v3_graph_compute(graph, &plan);
+}
diff --git a/otherarch/utils.h b/otherarch/utils.h
index cbd7bfb51..efe0dc108 100644
--- a/otherarch/utils.h
+++ b/otherarch/utils.h
@@ -8,6 +8,7 @@
 #include
 #include
 #include "common.h"
+#include "ggml_v3.h"
 
 //
 // CLI argument parsing
 //
@@ -53,7 +54,5 @@ void gpt_split_words(std::string str, std::vector& words);
 std::vector gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
 
-
 bool should_transpose_layer(std::string name);
-
-void kcpp_graph_compute_helper(ggml_cgraph * graph, int n_threads);
\ No newline at end of file
+void kcpp_graph_compute_helper(ggml_v3_cgraph * graph, int n_threads);
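Editor's note (illustrative only, not part of the patch): the sketch below shows how a caller might drive the renamed kcpp_graph_compute_helper() against the ggml_v3 API, assuming the ggml_v3_* calls mirror the legacy ggml ones exactly as this diff suggests. The struct name ggml_v3_init_params, the include paths, and the example sizes are assumptions.

// Minimal usage sketch, assuming ggml_v3_* mirrors the legacy ggml API.
#include "ggml_v3.h"            // assumed include path, matching the header added above
#include "otherarch/utils.h"    // declares kcpp_graph_compute_helper(ggml_v3_cgraph *, int)

static void kcpp_compute_helper_demo() {
    // Assumed: ggml_v3_init_params is the v3 rename of ggml_init_params (mem_size, mem_buffer, no_alloc).
    struct ggml_v3_init_params params = { 16u * 1024u * 1024u, NULL, false };
    struct ggml_v3_context * ctx = ggml_v3_init(params);

    // Build a tiny graph: c = a + b (real code would fill a and b with data first).
    struct ggml_v3_tensor * a = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, 8);
    struct ggml_v3_tensor * b = ggml_v3_new_tensor_1d(ctx, GGML_V3_TYPE_F32, 8);
    struct ggml_v3_tensor * c = ggml_v3_add(ctx, a, b);

    struct ggml_v3_cgraph * graph = ggml_v3_new_graph_custom(ctx, GGML_V3_MAX_NODES, false);
    ggml_v3_build_forward_expand(graph, c);

    // The helper plans the graph, resizes its shared work buffer if needed, and computes it.
    kcpp_graph_compute_helper(graph, /*n_threads=*/4);

    ggml_v3_free(ctx);
}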