From 3fd072a54001a908c54e81fd2e82b682ecfdd475 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 1 Apr 2025 14:57:19 +0300 Subject: [PATCH 01/20] metal : use F32 prec in FA kernels (#12688) * metal : use F32 prec in FA kernels ggml-ci * cont : fix FA vec kernel ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 2 +- ggml/src/ggml-metal/ggml-metal.metal | 94 ++++++++++++++-------------- 2 files changed, 48 insertions(+), 48 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 3942013f4..456e1fd99 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -4179,7 +4179,7 @@ static void ggml_metal_encode_node( // ne00*(nsg) // each simdgroup has a full f16 head vector in shared mem to accumulate results // -#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(GGML_PAD(ne00, 128) + 2*ncpsg*(nsg)) + ne20*(nsg))*(sizeof(float)/2), 16)) +#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(GGML_PAD(ne00, 128) + 4*ncpsg*(nsg)) + ne20*(nsg))*(sizeof(float)/2), 16)) int64_t nsgmax = 2; while (true) { diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 80d0765b4..b08666e27 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -3184,8 +3184,8 @@ kernel void kernel_flash_attn_ext( threadgroup_barrier(mem_flags::mem_threadgroup); { - half S[Q] = { [0 ... Q-1] = 0.0f }; - half M[Q] = { [0 ... Q-1] = -__FLT16_MAX__/2 }; + float S[Q] = { [0 ... Q-1] = 0.0f }; + float M[Q] = { [0 ... Q-1] = -__FLT16_MAX__/2 }; // thread indices inside the simdgroup // TODO: see if we can utilize quad-group functions for better performance @@ -3202,13 +3202,13 @@ kernel void kernel_flash_attn_ext( const bool has_mask = mask != q; - half slope = 1.0f; + float slope = 1.0f; // ALiBi if (args.max_bias > 0.0f) { const short h = iq2; - const half base = h < args.n_head_log2 ? args.m0 : args.m1; + const float base = h < args.n_head_log2 ? args.m0 : args.m1; const short exph = h < args.n_head_log2 ? 
h + 1 : 2*(h - args.n_head_log2) + 1; slope = pow(base, exph); @@ -3224,14 +3224,14 @@ kernel void kernel_flash_attn_ext( if (has_mask) { // used to detect blocks full of -INF - half smax = -INFINITY; + float smax = -INFINITY; // load the mask in shared memory #pragma unroll(Q) for (short j = 0; j < Q; ++j) { device const half * pm = (device const half *) ((device const char *) mask + (iq1 + j)*args.nb31); - const half m = pm[ic + tiisg]; + const float m = pm[ic + tiisg]; ss[j*TS + C + tiisg] = m; smax = max(smax, m); @@ -3327,10 +3327,10 @@ kernel void kernel_flash_attn_ext( // online softmax { for (ushort j = 0; j < Q; ++j) { - const half m = M[j]; + const float m = M[j]; // scale and apply the logitcap / mask - half s = ss[j*TS + tiisg]*args.scale; + float s = ss[j*TS + tiisg]*args.scale; if (args.logit_softcap != 0.0f) { s = args.logit_softcap*precise::tanh(s); @@ -3341,8 +3341,8 @@ kernel void kernel_flash_attn_ext( M[j] = simd_max(max(M[j], s)); - const half ms = exp(m - M[j]); - const half vs = exp(s - M[j]); + const float ms = exp(m - M[j]); + const float vs = exp(s - M[j]); S[j] = S[j]*ms + simd_sum(vs); @@ -3444,8 +3444,8 @@ kernel void kernel_flash_attn_ext( // reduce the warps sequentially for (ushort sg = 1; sg < nsg; ++sg) { - half S = { 0.0f }; - half M = { -__FLT16_MAX__/2 }; + float S = { 0.0f }; + float M = { -__FLT16_MAX__/2 }; threadgroup_barrier(mem_flags::mem_threadgroup); @@ -3461,16 +3461,16 @@ kernel void kernel_flash_attn_ext( // the first simdgroup accumulates the results from the other simdgroups if (sgitg == 0) { for (short j = 0; j < Q; ++j) { - const half S0 = ss[j*TS + 0]; - const half S1 = ss[j*TS + sg*SH + 0]; + const float S0 = ss[j*TS + 0]; + const float S1 = ss[j*TS + sg*SH + 0]; - const half M0 = ss[j*TS + 1]; - const half M1 = ss[j*TS + sg*SH + 1]; + const float M0 = ss[j*TS + 1]; + const float M1 = ss[j*TS + sg*SH + 1]; M = max(M0, M1); - const half ms0 = exp(M0 - M); - const half ms1 = exp(M1 - M); + const float ms0 = exp(M0 - M); + const float ms1 = exp(M1 - M); S = S0*ms0 + S1*ms1; @@ -3646,16 +3646,16 @@ kernel void kernel_flash_attn_ext_vec( constexpr short DV4 = DV/4; constexpr short NW = N_SIMDWIDTH; constexpr short NL = NW/NE; // note: this can be adjusted to support different head sizes and simdgroup work loads - constexpr short SH = 2*C; // shared memory per simdgroup + constexpr short SH = 4*C; // shared memory per simdgroup const short T = DK + nsg*SH; // shared memory size per query in (half) - //threadgroup q_t * sq = (threadgroup q_t *) (shmem_f16 + 0*DK); // holds the query data - threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 + 0*DK); // same as above but in q4_t - threadgroup s_t * ss = (threadgroup s_t *) (shmem_f16 + sgitg*SH + Q*DK); // scratch buffer for attention - threadgroup s4_t * ss4 = (threadgroup s4_t *) (shmem_f16 + sgitg*SH + Q*DK); // same as above but in s4_t - threadgroup half * sm = (threadgroup half *) (shmem_f16 + sgitg*SH + C + Q*DK); // scratch buffer for mask - threadgroup o4_t * sr4 = (threadgroup o4_t *) (shmem_f16 + sgitg*DV + Q*T); // scratch buffer for the results + //threadgroup q_t * sq = (threadgroup q_t *) (shmem_f16 + 0*DK); // holds the query data + threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 + 0*DK); // same as above but in q4_t + threadgroup s_t * ss = (threadgroup s_t *) (shmem_f16 + sgitg*SH + Q*DK); // scratch buffer for attention + threadgroup s4_t * ss4 = (threadgroup s4_t *) (shmem_f16 + sgitg*SH + Q*DK); // same as above but in s4_t + threadgroup float * sm = 
(threadgroup float *) (shmem_f16 + sgitg*SH + 2*C + Q*DK); // scratch buffer for mask + threadgroup o4_t * sr4 = (threadgroup o4_t *) (shmem_f16 + sgitg*DV + Q*T); // scratch buffer for the results // store the result for all queries in local memory (the O matrix from the paper) o4_t lo[DV4/NL]; @@ -3684,8 +3684,8 @@ kernel void kernel_flash_attn_ext_vec( threadgroup_barrier(mem_flags::mem_threadgroup); { - half S = 0.0f; - half M = -__FLT16_MAX__/2; + float S = 0.0f; + float M = -__FLT16_MAX__/2; // thread indices inside the simdgroup const short tx = tiisg%NL; @@ -3703,13 +3703,13 @@ kernel void kernel_flash_attn_ext_vec( // pointer to the mask device const half * pm = (device const half *) (mask + iq1*args.nb31); - half slope = 1.0f; + float slope = 1.0f; // ALiBi if (args.max_bias > 0.0f) { const short h = iq2; - const half base = h < args.n_head_log2 ? args.m0 : args.m1; + const float base = h < args.n_head_log2 ? args.m0 : args.m1; const short exph = h < args.n_head_log2 ? h + 1 : 2*(h - args.n_head_log2) + 1; slope = pow(base, exph); @@ -3799,13 +3799,13 @@ kernel void kernel_flash_attn_ext_vec( // online softmax { - const half m = M; - const half s = ss[tiisg]; + const float m = M; + const float s = ss[tiisg]; M = simd_max(max(M, s)); - const half ms = exp(m - M); - const half vs = exp(s - M); + const float ms = exp(m - M); + const float vs = exp(s - M); S = S*ms + simd_sum(vs); @@ -3836,7 +3836,7 @@ kernel void kernel_flash_attn_ext_vec( v4_t mv; deq_v_t4(pv4 + i/nl_v, i%nl_v, mv); - lo[ii/NL] += mv*ms; + lo[ii/NL] += o4_t(float4(mv)*float4(ms)); } } } @@ -3907,18 +3907,18 @@ kernel void kernel_flash_attn_ext_vec( // parallel reduce for (short r = nsg/2; r > 0; r >>= 1) { if (sgitg < r) { - const half S0 = ss[ 0]; - const half S1 = ss[r*SH + 0]; + const float S0 = ss[ 0]; + const float S1 = ss[r*(SH/2) + 0]; - const half M0 = ss[ 1]; - const half M1 = ss[r*SH + 1]; + const float M0 = ss[ 1]; + const float M1 = ss[r*(SH/2) + 1]; - const half M = max(M0, M1); + const float M = max(M0, M1); - const half ms0 = exp(M0 - M); - const half ms1 = exp(M1 - M); + const float ms0 = exp(M0 - M); + const float ms1 = exp(M1 - M); - const half S = S0*ms0 + S1*ms1; + const float S = S0*ms0 + S1*ms1; if (tiisg == 0) { ss[0] = S; @@ -3950,11 +3950,11 @@ kernel void kernel_flash_attn_ext_vec( // in the other (non-vec) kernel, we need s_t to also be float because we scale during the soft_max // #define FA_TYPES \ - half4, \ - half4, \ - half4, \ - float, \ - half, half4, \ + half4, \ + half4, \ + half4, \ + float, \ + float, float4, \ half4 typedef decltype(kernel_flash_attn_ext_vec) flash_attn_ext_vec_t; From 5936a616e46cffad4579ccaff8f8fa315809da4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Tue, 1 Apr 2025 14:37:13 +0200 Subject: [PATCH 02/20] convert : BailingMoE : fix qkv split when head_dim is 0 (#12687) NOTE: Ling-lite-base is broken, see https://huggingface.co/inclusionAI/Ling-lite-base/discussions/2 --- convert_hf_to_gguf.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 3cf8b5165..cfe94deaf 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5146,10 +5146,7 @@ class BailingMoeModel(Model): def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams - if hparams.get("head_dim"): - rope_dim = hparams["head_dim"] - else: - rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + rope_dim = hparams.get("head_dim") or 
hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) @@ -5175,7 +5172,7 @@ class BailingMoeModel(Model): n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") n_embd = self.hparams["hidden_size"] - head_dim = self.hparams.get("head_dim", n_embd // n_head) + head_dim = self.hparams.get("head_dim") or n_embd // n_head output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) From e39e727e9a3daec7082b9c59540a429ad85914af Mon Sep 17 00:00:00 2001 From: jklincn Date: Tue, 1 Apr 2025 20:54:28 +0800 Subject: [PATCH 03/20] llama : use LLM_KV_GENERAL_FILE_TYPE instead of gguf_find_key (#12672) --- src/llama-arch.cpp | 1 + src/llama-arch.h | 1 + src/llama-model-loader.cpp | 6 +++--- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 954ae65a3..047782e7d 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -75,6 +75,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" }, { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" }, { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" }, + { LLM_KV_GENERAL_FILE_TYPE, "general.file_type" }, { LLM_KV_GENERAL_NAME, "general.name" }, { LLM_KV_GENERAL_AUTHOR, "general.author" }, { LLM_KV_GENERAL_VERSION, "general.version" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index 7e2a3dd29..297cfa4da 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -79,6 +79,7 @@ enum llm_kv { LLM_KV_GENERAL_ARCHITECTURE, LLM_KV_GENERAL_QUANTIZATION_VERSION, LLM_KV_GENERAL_ALIGNMENT, + LLM_KV_GENERAL_FILE_TYPE, LLM_KV_GENERAL_NAME, LLM_KV_GENERAL_AUTHOR, LLM_KV_GENERAL_VERSION, diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 05d58ad90..1be0f2d6d 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -640,9 +640,9 @@ llama_model_loader::llama_model_loader( ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED); { - const int kid = gguf_find_key(meta.get(), "general.file_type"); // TODO: use LLM_KV - if (kid >= 0) { - ftype = (llama_ftype) gguf_get_val_u32(meta.get(), kid); + uint32_t ftype_val = 0; + if (get_key(LLM_KV_GENERAL_FILE_TYPE, ftype_val, false)) { + ftype = (llama_ftype) ftype_val; } } From f423981ac806bf031d83784bcb47d2721bc70f97 Mon Sep 17 00:00:00 2001 From: Junil Kim Date: Wed, 2 Apr 2025 01:54:34 +0900 Subject: [PATCH 04/20] opencl : fix memory allocation size (#12649) issue: https://github.com/CodeLinaro/llama.cpp/pull/17#issuecomment-2760611283 This patch fixes the memory allocation size not exceeding the maximum size of the OpenCL device. --- ggml/src/ggml-opencl/ggml-opencl.cpp | 29 +++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 6c123ddef..723cab8b1 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -921,10 +921,33 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { backend_ctx->program_CL_gemm = build_program_from_source(context, device, kernel_src_CL_gemm.c_str(), compile_opts); CL_CHECK((backend_ctx->CL_mul_mat_Ab_Bi_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mat_Ab_Bi_8x4", &err), err)); + // TODO: fixme: these sizes are hardcoded for now. 
+ // they should be allocated based on the model's size + // and the device's max alloc size + size_t max_alloc_size; + CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_alloc_size, NULL)); + // Allocate intermediate buffers and images - size_t max_A_q_d_bytes = 311164928; - size_t max_A_s_d_bytes = 38895616; - size_t max_B_d_bytes = 45088768; + size_t required_A_q_d_bytes = 311164928; + size_t required_A_s_d_bytes = 38895616; + size_t required_B_d_bytes = 45088768; + + // Ensure buffer sizes do not exceed the maximum allocation size + size_t max_A_q_d_bytes = MIN(required_A_q_d_bytes, max_alloc_size); + size_t max_A_s_d_bytes = MIN(required_A_s_d_bytes, max_alloc_size); + size_t max_B_d_bytes = MIN(required_B_d_bytes, max_alloc_size); + if (required_A_q_d_bytes > max_alloc_size) { + GGML_LOG_WARN("ggml_opencl: A_q_d buffer size reduced from %zu to %zu due to device limitations.\n", + required_A_q_d_bytes, max_A_q_d_bytes); + } + if (required_A_s_d_bytes > max_alloc_size) { + GGML_LOG_WARN("ggml_opencl: A_s_d buffer size reduced from %zu to %zu due to device limitations.\n", + required_A_s_d_bytes, max_A_s_d_bytes); + } + if (required_B_d_bytes > max_alloc_size) { + GGML_LOG_WARN("ggml_opencl: B_d buffer size reduced from %zu to %zu due to device limitations.\n", + required_B_d_bytes, max_B_d_bytes); + } CL_CHECK((backend_ctx->A_q_d_max = clCreateBuffer(context, 0, max_A_q_d_bytes, NULL, &err), err)); CL_CHECK((backend_ctx->A_s_d_max = clCreateBuffer(context, 0, max_A_s_d_bytes, NULL, &err), err)); From 267c1399f15a278ec8c3cdcf9c90dc94151fbc38 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Tue, 1 Apr 2025 23:44:05 +0200 Subject: [PATCH 05/20] common : refactor downloading system, handle mmproj with -hf option (#12694) * (wip) refactor downloading system [no ci] * fix all examples * fix mmproj with -hf * gemma3: update readme * only handle mmproj in llava example * fix multi-shard download * windows: fix problem with std::min and std::max * fix 2 --- common/arg.cpp | 675 ++++++++++++++++-- common/common.cpp | 495 +------------ common/common.h | 44 +- examples/batched-bench/batched-bench.cpp | 2 +- examples/batched/batched.cpp | 2 +- examples/export-lora/export-lora.cpp | 2 +- examples/gritlm/gritlm.cpp | 2 +- examples/llava/README-gemma3.md | 20 + examples/llava/gemma3-cli.cpp | 6 +- examples/llava/llava-cli.cpp | 6 +- examples/llava/minicpmv-cli.cpp | 6 +- examples/llava/qwen2vl-cli.cpp | 6 +- examples/parallel/parallel.cpp | 2 +- examples/passkey/passkey.cpp | 2 +- examples/server/server.cpp | 19 +- .../speculative-simple/speculative-simple.cpp | 2 +- examples/speculative/speculative.cpp | 2 +- examples/tts/tts.cpp | 7 +- tests/test-arg-parser.cpp | 8 +- 19 files changed, 673 insertions(+), 635 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 8292adaac..47c26955e 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1,9 +1,19 @@ +#include "gguf.h" // for reading GGUF splits #include "arg.h" #include "log.h" #include "sampling.h" #include "chat.h" +// fix problem with std::min and std::max +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +# define NOMINMAX +#endif +#include +#endif + #include #include #include @@ -14,6 +24,14 @@ #include #include +//#define LLAMA_USE_CURL + +#if defined(LLAMA_USE_CURL) +#include +#include +#include +#endif + #include "json-schema-to-grammar.h" using json = nlohmann::ordered_json; @@ -125,47 +143,549 @@ std::string common_arg::to_string() { return ss.str(); } +// +// downloader +// + 
+struct common_hf_file_res { + std::string repo; // repo name with ":tag" removed + std::string ggufFile; + std::string mmprojFile; +}; + +#ifdef LLAMA_USE_CURL + +#ifdef __linux__ +#include +#elif defined(_WIN32) +# if !defined(PATH_MAX) +# define PATH_MAX MAX_PATH +# endif +#else +#include +#endif +#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083 + +// +// CURL utils +// + +using curl_ptr = std::unique_ptr; + +// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one +struct curl_slist_ptr { + struct curl_slist * ptr = nullptr; + ~curl_slist_ptr() { + if (ptr) { + curl_slist_free_all(ptr); + } + } +}; + +#define CURL_MAX_RETRY 3 +#define CURL_RETRY_DELAY_SECONDS 2 + +static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) { + int remaining_attempts = max_attempts; + + while (remaining_attempts > 0) { + LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts); + + CURLcode res = curl_easy_perform(curl); + if (res == CURLE_OK) { + return true; + } + + int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000; + LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay); + + remaining_attempts--; + std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay)); + } + + LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts); + + return false; +} + +// download one single file from remote URL to local path +static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token) { + // Initialize libcurl + curl_ptr curl(curl_easy_init(), &curl_easy_cleanup); + curl_slist_ptr http_headers; + if (!curl) { + LOG_ERR("%s: error initializing libcurl\n", __func__); + return false; + } + + bool force_download = false; + + // Set the URL, allow to follow http redirection + curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L); + + // Check if hf-token or bearer-token was specified + if (!bearer_token.empty()) { + std::string auth_header = "Authorization: Bearer " + bearer_token; + http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str()); + curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr); + } + +#if defined(_WIN32) + // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of + // operating system. Currently implemented under MS-Windows. + curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); +#endif + + // Check if the file already exists locally + auto file_exists = std::filesystem::exists(path); + + // If the file exists, check its JSON metadata companion file. + std::string metadata_path = path + ".json"; + nlohmann::json metadata; + std::string etag; + std::string last_modified; + + if (file_exists) { + // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block). 
+ std::ifstream metadata_in(metadata_path); + if (metadata_in.good()) { + try { + metadata_in >> metadata; + LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str()); + if (metadata.contains("url") && metadata.at("url").is_string()) { + auto previous_url = metadata.at("url").get(); + if (previous_url != url) { + LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str()); + return false; + } + } + if (metadata.contains("etag") && metadata.at("etag").is_string()) { + etag = metadata.at("etag"); + } + if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) { + last_modified = metadata.at("lastModified"); + } + } catch (const nlohmann::json::exception & e) { + LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what()); + return false; + } + } + } else { + LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str()); + } + + // Send a HEAD request to retrieve the etag and last-modified headers + struct common_load_model_from_url_headers { + std::string etag; + std::string last_modified; + }; + + common_load_model_from_url_headers headers; + + { + typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *); + auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t { + common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata; + + static std::regex header_regex("([^:]+): (.*)\r\n"); + static std::regex etag_regex("ETag", std::regex_constants::icase); + static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase); + + std::string header(buffer, n_items); + std::smatch match; + if (std::regex_match(header, match, header_regex)) { + const std::string & key = match[1]; + const std::string & value = match[2]; + if (std::regex_match(key, match, etag_regex)) { + headers->etag = value; + } else if (std::regex_match(key, match, last_modified_regex)) { + headers->last_modified = value; + } + } + return n_items; + }; + + curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb + curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress + curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast(header_callback)); + curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers); + + bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS); + if (!was_perform_successful) { + return false; + } + + long http_code = 0; + curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code); + if (http_code != 200) { + // HEAD not supported, we don't know if the file has changed + // force trigger downloading + force_download = true; + LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code); + } + } + + bool should_download = !file_exists || force_download; + if (!should_download) { + if (!etag.empty() && etag != headers.etag) { + LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str()); + should_download = true; + } else if (!last_modified.empty() && last_modified != headers.last_modified) { + LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str()); + should_download = true; + } + } + if (should_download) { + std::string path_temporary = path + 
".downloadInProgress"; + if (file_exists) { + LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str()); + if (remove(path.c_str()) != 0) { + LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str()); + return false; + } + } + + // Set the output file + + struct FILE_deleter { + void operator()(FILE * f) const { + fclose(f); + } + }; + + std::unique_ptr outfile(fopen(path_temporary.c_str(), "wb")); + if (!outfile) { + LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str()); + return false; + } + + typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd); + auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t { + return fwrite(data, size, nmemb, (FILE *)fd); + }; + curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L); + curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast(write_callback)); + curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get()); + + // display download progress + curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L); + + // helper function to hide password in URL + auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string { + std::size_t protocol_pos = url.find("://"); + if (protocol_pos == std::string::npos) { + return url; // Malformed URL + } + + std::size_t at_pos = url.find('@', protocol_pos + 3); + if (at_pos == std::string::npos) { + return url; // No password in URL + } + + return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos); + }; + + // start the download + LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, + llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str()); + bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS); + if (!was_perform_successful) { + return false; + } + + long http_code = 0; + curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code); + if (http_code < 200 || http_code >= 400) { + LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code); + return false; + } + + // Causes file to be closed explicitly here before we rename it. + outfile.reset(); + + // Write the updated JSON metadata file. 
+ metadata.update({ + {"url", url}, + {"etag", headers.etag}, + {"lastModified", headers.last_modified} + }); + std::ofstream(metadata_path) << metadata.dump(4); + LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str()); + + if (rename(path_temporary.c_str(), path.c_str()) != 0) { + LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str()); + return false; + } + } + + return true; +} + +// download multiple files from remote URLs to local paths +// the input is a vector of pairs +static bool common_download_file_multiple(const std::vector> & urls, const std::string & bearer_token) { + // Prepare download in parallel + std::vector> futures_download; + for (auto const & item : urls) { + futures_download.push_back(std::async(std::launch::async, [bearer_token](const std::pair & it) -> bool { + return common_download_file_single(it.first, it.second, bearer_token); + }, item)); + } + + // Wait for all downloads to complete + for (auto & f : futures_download) { + if (!f.get()) { + return false; + } + } + + return true; +} + +static bool common_download_model( + const common_params_model & model, + const std::string & bearer_token) { + // Basic validation of the model.url + if (model.url.empty()) { + LOG_ERR("%s: invalid model url\n", __func__); + return false; + } + + if (!common_download_file_single(model.url, model.path, bearer_token)) { + return false; + } + + // check for additional GGUFs split to download + int n_split = 0; + { + struct gguf_init_params gguf_params = { + /*.no_alloc = */ true, + /*.ctx = */ NULL, + }; + auto * ctx_gguf = gguf_init_from_file(model.path.c_str(), gguf_params); + if (!ctx_gguf) { + LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, model.path.c_str()); + return false; + } + + auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT); + if (key_n_split >= 0) { + n_split = gguf_get_val_u16(ctx_gguf, key_n_split); + } + + gguf_free(ctx_gguf); + } + + if (n_split > 1) { + char split_prefix[PATH_MAX] = {0}; + char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0}; + + // Verify the first split file format + // and extract split URL and PATH prefixes + { + if (!llama_split_prefix(split_prefix, sizeof(split_prefix), model.path.c_str(), 0, n_split)) { + LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, model.path.c_str(), n_split); + return false; + } + + if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model.url.c_str(), 0, n_split)) { + LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model.url.c_str(), n_split); + return false; + } + } + + std::vector> urls; + for (int idx = 1; idx < n_split; idx++) { + char split_path[PATH_MAX] = {0}; + llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split); + + char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0}; + llama_split_path(split_url, sizeof(split_url), split_url_prefix, idx, n_split); + + if (std::string(split_path) == model.path) { + continue; // skip the already downloaded file + } + + urls.push_back({split_url, split_path}); + } + + // Download in parallel + common_download_file_multiple(urls, bearer_token); + } + + return true; +} + +/** + * Allow getting the HF file from the HF repo with tag (like ollama), for example: + * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4 + * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M + * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s + * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return 
the first GGUF file in repo) + * + * Return pair of (with "repo" already having tag removed) + * + * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files. + */ +static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) { + auto parts = string_split(hf_repo_with_tag, ':'); + std::string tag = parts.size() > 1 ? parts.back() : "latest"; + std::string hf_repo = parts[0]; + if (string_split(hf_repo, '/').size() != 2) { + throw std::invalid_argument("error: invalid HF repo format, expected /[:quant]\n"); + } + + // fetch model info from Hugging Face Hub API + curl_ptr curl(curl_easy_init(), &curl_easy_cleanup); + curl_slist_ptr http_headers; + std::string res_str; + std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag; + curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); + typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data); + auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t { + static_cast(data)->append((char * ) ptr, size * nmemb); + return size * nmemb; + }; + curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast(write_callback)); + curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str); +#if defined(_WIN32) + curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); +#endif + if (!bearer_token.empty()) { + std::string auth_header = "Authorization: Bearer " + bearer_token; + http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str()); + } + // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response + http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp"); + http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json"); + curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr); + + CURLcode res = curl_easy_perform(curl.get()); + + if (res != CURLE_OK) { + throw std::runtime_error("error: cannot make GET request to HF API"); + } + + long res_code; + std::string ggufFile = ""; + std::string mmprojFile = ""; + curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code); + if (res_code == 200) { + // extract ggufFile.rfilename in json, using regex + { + std::regex pattern("\"ggufFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\""); + std::smatch match; + if (std::regex_search(res_str, match, pattern)) { + ggufFile = match[1].str(); + } + } + // extract mmprojFile.rfilename in json, using regex + { + std::regex pattern("\"mmprojFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\""); + std::smatch match; + if (std::regex_search(res_str, match, pattern)) { + mmprojFile = match[1].str(); + } + } + } else if (res_code == 401) { + throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token"); + } else { + throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str())); + } + + // check response + if (ggufFile.empty()) { + throw std::runtime_error("error: model does not have ggufFile"); + } + + return { hf_repo, ggufFile, mmprojFile }; +} + +#else + +static bool common_download_file_single(const std::string &, const std::string &, const std::string 
&) { + LOG_ERR("error: built without CURL, cannot download model from internet\n"); + return false; +} + +static bool common_download_file_multiple(const std::vector> &, const std::string &) { + LOG_ERR("error: built without CURL, cannot download model from the internet\n"); + return false; +} + +static bool common_download_model( + const common_params_model &, + const std::string &) { + LOG_ERR("error: built without CURL, cannot download model from the internet\n"); + return false; +} + +static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &) { + LOG_ERR("error: built without CURL, cannot download model from the internet\n"); + return {}; +} + +#endif // LLAMA_USE_CURL + // // utils // -static void common_params_handle_model_default( - std::string & model, - const std::string & model_url, - std::string & hf_repo, - std::string & hf_file, - const std::string & hf_token, - const std::string & model_default) { - if (!hf_repo.empty()) { - // short-hand to avoid specifying --hf-file -> default it to --model - if (hf_file.empty()) { - if (model.empty()) { - auto auto_detected = common_get_hf_file(hf_repo, hf_token); - if (auto_detected.first.empty() || auto_detected.second.empty()) { - exit(1); // built without CURL, error message already printed +static void common_params_handle_model( + struct common_params_model & model, + const std::string & bearer_token, + const std::string & model_path_default, + bool is_mmproj = false) { // TODO: move is_mmproj to an enum when we have more files? + // handle pre-fill default model path and url based on hf_repo and hf_file + { + if (!model.hf_repo.empty()) { + // short-hand to avoid specifying --hf-file -> default it to --model + if (model.hf_file.empty()) { + if (model.path.empty()) { + auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token); + if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) { + exit(1); // built without CURL, error message already printed + } + model.hf_repo = auto_detected.repo; + model.hf_file = is_mmproj ? 
auto_detected.mmprojFile : auto_detected.ggufFile; + } else { + model.hf_file = model.path; } - hf_repo = auto_detected.first; - hf_file = auto_detected.second; - } else { - hf_file = model; } + + // TODO: allow custom host + model.url = "https://huggingface.co/" + model.hf_repo + "/resolve/main/" + model.hf_file; + + // make sure model path is present (for caching purposes) + if (model.path.empty()) { + // this is to avoid different repo having same file name, or same file name in different subdirs + std::string filename = model.hf_repo + "_" + model.hf_file; + // to make sure we don't have any slashes in the filename + string_replace_all(filename, "/", "_"); + model.path = fs_get_cache_file(filename); + } + + } else if (!model.url.empty()) { + if (model.path.empty()) { + auto f = string_split(model.url, '#').front(); + f = string_split(f, '?').front(); + model.path = fs_get_cache_file(string_split(f, '/').back()); + } + + } else if (model.path.empty()) { + model.path = model_path_default; } - // make sure model path is present (for caching purposes) - if (model.empty()) { - // this is to avoid different repo having same file name, or same file name in different subdirs - std::string filename = hf_repo + "_" + hf_file; - // to make sure we don't have any slashes in the filename - string_replace_all(filename, "/", "_"); - model = fs_get_cache_file(filename); + } + + // then, download it if needed + if (!model.url.empty()) { + bool ok = common_download_model(model, bearer_token); + if (!ok) { + LOG_ERR("error: failed to download model from %s\n", model.url.c_str()); + exit(1); } - } else if (!model_url.empty()) { - if (model.empty()) { - auto f = string_split(model_url, '#').front(); - f = string_split(f, '?').front(); - model = fs_get_cache_file(string_split(f, '/').back()); - } - } else if (model.empty()) { - model = model_default; } } @@ -300,10 +820,16 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); } - // TODO: refactor model params in a common struct - common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file, params.hf_token, DEFAULT_MODEL_PATH); - common_params_handle_model_default(params.speculative.model, params.speculative.model_url, params.speculative.hf_repo, params.speculative.hf_file, params.hf_token, ""); - common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file, params.hf_token, ""); + common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH); + common_params_handle_model(params.speculative.model, params.hf_token, ""); + common_params_handle_model(params.vocoder.model, params.hf_token, ""); + + // allow --mmproj to be set from -hf + // assuming that mmproj is always in the same repo as text model + if (!params.model.hf_repo.empty() && ctx_arg.ex == LLAMA_EXAMPLE_LLAVA) { + params.mmproj.hf_repo = params.model.hf_repo; + } + common_params_handle_model(params.mmproj, params.hf_token, "", true); if (params.escape) { string_process_escapes(params.prompt); @@ -1561,7 +2087,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--mmproj"}, "FILE", "path to a multimodal projector file for LLaVA. 
see examples/llava/README.md", [](common_params & params, const std::string & value) { - params.mmproj = value; + params.mmproj.path = value; + } + ).set_examples({LLAMA_EXAMPLE_LLAVA})); + add_opt(common_arg( + {"--mmproj-url"}, "URL", + "URL to a multimodal projector file for LLaVA. see examples/llava/README.md", + [](common_params & params, const std::string & value) { + params.mmproj.url = value; } ).set_examples({LLAMA_EXAMPLE_LLAVA})); add_opt(common_arg( @@ -1790,14 +2323,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH ), [](common_params & params, const std::string & value) { - params.model = value; + params.model.path = value; } ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL")); add_opt(common_arg( {"-mu", "--model-url"}, "MODEL_URL", "model download url (default: unused)", [](common_params & params, const std::string & value) { - params.model_url = value; + params.model.url = value; } ).set_env("LLAMA_ARG_MODEL_URL")); add_opt(common_arg( @@ -1806,35 +2339,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "example: unsloth/phi-4-GGUF:q4_k_m\n" "(default: unused)", [](common_params & params, const std::string & value) { - params.hf_repo = value; + params.model.hf_repo = value; } ).set_env("LLAMA_ARG_HF_REPO")); add_opt(common_arg( {"-hfd", "-hfrd", "--hf-repo-draft"}, "/[:quant]", "Same as --hf-repo, but for the draft model (default: unused)", [](common_params & params, const std::string & value) { - params.speculative.hf_repo = value; + params.speculative.model.hf_repo = value; } ).set_env("LLAMA_ARG_HFD_REPO")); add_opt(common_arg( {"-hff", "--hf-file"}, "FILE", "Hugging Face model file. 
If specified, it will override the quant in --hf-repo (default: unused)", [](common_params & params, const std::string & value) { - params.hf_file = value; + params.model.hf_file = value; } ).set_env("LLAMA_ARG_HF_FILE")); add_opt(common_arg( {"-hfv", "-hfrv", "--hf-repo-v"}, "/[:quant]", "Hugging Face model repository for the vocoder model (default: unused)", [](common_params & params, const std::string & value) { - params.vocoder.hf_repo = value; + params.vocoder.model.hf_repo = value; } ).set_env("LLAMA_ARG_HF_REPO_V")); add_opt(common_arg( {"-hffv", "--hf-file-v"}, "FILE", "Hugging Face model file for the vocoder model (default: unused)", [](common_params & params, const std::string & value) { - params.vocoder.hf_file = value; + params.vocoder.model.hf_file = value; } ).set_env("LLAMA_ARG_HF_FILE_V")); add_opt(common_arg( @@ -2454,7 +2987,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"-md", "--model-draft"}, "FNAME", "draft model for speculative decoding (default: unused)", [](common_params & params, const std::string & value) { - params.speculative.model = value; + params.speculative.model.path = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT")); @@ -2462,7 +2995,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"-mv", "--model-vocoder"}, "FNAME", "vocoder model for audio generation (default: unused)", [](common_params & params, const std::string & value) { - params.vocoder.model = value; + params.vocoder.model.path = value; } ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( @@ -2485,10 +3018,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--tts-oute-default"}, string_format("use default OuteTTS models (note: can download weights from the internet)"), [](common_params & params) { - params.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF"; - params.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf"; - params.vocoder.hf_repo = "ggml-org/WavTokenizer"; - params.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf"; + params.model.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF"; + params.model.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf"; + params.vocoder.model.hf_repo = "ggml-org/WavTokenizer"; + params.vocoder.model.hf_file = "WavTokenizer-Large-75-F16.gguf"; } ).set_examples({LLAMA_EXAMPLE_TTS})); @@ -2496,8 +3029,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--embd-bge-small-en-default"}, string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"), [](common_params & params) { - params.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF"; - params.hf_file = "bge-small-en-v1.5-q8_0.gguf"; + params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF"; + params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf"; params.pooling_type = LLAMA_POOLING_TYPE_NONE; params.embd_normalize = 2; params.n_ctx = 512; @@ -2510,8 +3043,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--embd-e5-small-en-default"}, string_format("use default e5-small-v2 model (note: can download weights from the internet)"), [](common_params & params) { - params.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF"; - params.hf_file = "e5-small-v2-q8_0.gguf"; + params.model.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF"; + params.model.hf_file = "e5-small-v2-q8_0.gguf"; params.pooling_type = LLAMA_POOLING_TYPE_NONE; params.embd_normalize = 2; params.n_ctx 
= 512; @@ -2524,8 +3057,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--embd-gte-small-default"}, string_format("use default gte-small model (note: can download weights from the internet)"), [](common_params & params) { - params.hf_repo = "ggml-org/gte-small-Q8_0-GGUF"; - params.hf_file = "gte-small-q8_0.gguf"; + params.model.hf_repo = "ggml-org/gte-small-Q8_0-GGUF"; + params.model.hf_file = "gte-small-q8_0.gguf"; params.pooling_type = LLAMA_POOLING_TYPE_NONE; params.embd_normalize = 2; params.n_ctx = 512; @@ -2538,8 +3071,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--fim-qwen-1.5b-default"}, string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"), [](common_params & params) { - params.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF"; - params.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf"; + params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF"; + params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf"; params.port = 8012; params.n_gpu_layers = 99; params.flash_attn = true; @@ -2554,8 +3087,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--fim-qwen-3b-default"}, string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"), [](common_params & params) { - params.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF"; - params.hf_file = "qwen2.5-coder-3b-q8_0.gguf"; + params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF"; + params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf"; params.port = 8012; params.n_gpu_layers = 99; params.flash_attn = true; @@ -2570,8 +3103,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--fim-qwen-7b-default"}, string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"), [](common_params & params) { - params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF"; - params.hf_file = "qwen2.5-coder-7b-q8_0.gguf"; + params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF"; + params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf"; params.port = 8012; params.n_gpu_layers = 99; params.flash_attn = true; @@ -2586,10 +3119,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--fim-qwen-7b-spec"}, string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"), [](common_params & params) { - params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF"; - params.hf_file = "qwen2.5-coder-7b-q8_0.gguf"; - params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF"; - params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf"; + params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF"; + params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf"; + params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF"; + params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf"; params.speculative.n_gpu_layers = 99; params.port = 8012; params.n_gpu_layers = 99; @@ -2605,10 +3138,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--fim-qwen-14b-spec"}, string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"), [](common_params & params) { - params.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF"; - params.hf_file = "qwen2.5-coder-14b-q8_0.gguf"; - params.speculative.hf_repo = 
"ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF"; - params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf"; + params.model.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF"; + params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf"; + params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF"; + params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf"; params.speculative.n_gpu_layers = 99; params.port = 8012; params.n_gpu_layers = 99; diff --git a/common/common.cpp b/common/common.cpp index 18ffb4e73..22642c84a 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -51,45 +51,11 @@ #include #include #endif -#if defined(LLAMA_USE_CURL) -#include -#include -#include -#endif #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data #endif -#if defined(LLAMA_USE_CURL) -#ifdef __linux__ -#include -#elif defined(_WIN32) -# if !defined(PATH_MAX) -# define PATH_MAX MAX_PATH -# endif -#else -#include -#endif -#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083 - -// -// CURL utils -// - -using curl_ptr = std::unique_ptr; - -// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one -struct curl_slist_ptr { - struct curl_slist * ptr = nullptr; - ~curl_slist_ptr() { - if (ptr) { - curl_slist_free_all(ptr); - } - } -}; -#endif // LLAMA_USE_CURL - using json = nlohmann::ordered_json; // @@ -900,22 +866,14 @@ std::string fs_get_cache_file(const std::string & filename) { // // Model utils // + struct common_init_result common_init_from_params(common_params & params) { common_init_result iparams; auto mparams = common_model_params_to_llama(params); - llama_model * model = nullptr; - - if (!params.hf_repo.empty() && !params.hf_file.empty()) { - model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams); - } else if (!params.model_url.empty()) { - model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams); - } else { - model = llama_model_load_from_file(params.model.c_str(), mparams); - } - + llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams); if (model == NULL) { - LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str()); + LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str()); return iparams; } @@ -950,7 +908,7 @@ struct common_init_result common_init_from_params(common_params & params) { llama_context * lctx = llama_init_from_model(model, cparams); if (lctx == NULL) { - LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str()); + LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str()); llama_model_free(model); return iparams; } @@ -1164,451 +1122,6 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p return tpp; } -#ifdef LLAMA_USE_CURL - -#define CURL_MAX_RETRY 3 -#define CURL_RETRY_DELAY_SECONDS 2 - -static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) { - int remaining_attempts = max_attempts; - - while (remaining_attempts > 0) { - LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts); - - CURLcode res = curl_easy_perform(curl); - if (res == CURLE_OK) { - return true; - } - - int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000; - 
LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay); - - remaining_attempts--; - std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay)); - } - - LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts); - - return false; -} - -static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) { - // Initialize libcurl - curl_ptr curl(curl_easy_init(), &curl_easy_cleanup); - curl_slist_ptr http_headers; - if (!curl) { - LOG_ERR("%s: error initializing libcurl\n", __func__); - return false; - } - - bool force_download = false; - - // Set the URL, allow to follow http redirection - curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); - curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L); - - // Check if hf-token or bearer-token was specified - if (!hf_token.empty()) { - std::string auth_header = "Authorization: Bearer " + hf_token; - http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str()); - curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr); - } - -#if defined(_WIN32) - // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of - // operating system. Currently implemented under MS-Windows. - curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); -#endif - - // Check if the file already exists locally - auto file_exists = std::filesystem::exists(path); - - // If the file exists, check its JSON metadata companion file. - std::string metadata_path = path + ".json"; - nlohmann::json metadata; - std::string etag; - std::string last_modified; - - if (file_exists) { - // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block). 
- std::ifstream metadata_in(metadata_path); - if (metadata_in.good()) { - try { - metadata_in >> metadata; - LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str()); - if (metadata.contains("url") && metadata.at("url").is_string()) { - auto previous_url = metadata.at("url").get(); - if (previous_url != url) { - LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str()); - return false; - } - } - if (metadata.contains("etag") && metadata.at("etag").is_string()) { - etag = metadata.at("etag"); - } - if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) { - last_modified = metadata.at("lastModified"); - } - } catch (const nlohmann::json::exception & e) { - LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what()); - return false; - } - } - } else { - LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str()); - } - - // Send a HEAD request to retrieve the etag and last-modified headers - struct common_load_model_from_url_headers { - std::string etag; - std::string last_modified; - }; - - common_load_model_from_url_headers headers; - - { - typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *); - auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t { - common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata; - - static std::regex header_regex("([^:]+): (.*)\r\n"); - static std::regex etag_regex("ETag", std::regex_constants::icase); - static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase); - - std::string header(buffer, n_items); - std::smatch match; - if (std::regex_match(header, match, header_regex)) { - const std::string & key = match[1]; - const std::string & value = match[2]; - if (std::regex_match(key, match, etag_regex)) { - headers->etag = value; - } else if (std::regex_match(key, match, last_modified_regex)) { - headers->last_modified = value; - } - } - return n_items; - }; - - curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb - curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress - curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast(header_callback)); - curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers); - - bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS); - if (!was_perform_successful) { - return false; - } - - long http_code = 0; - curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code); - if (http_code != 200) { - // HEAD not supported, we don't know if the file has changed - // force trigger downloading - force_download = true; - LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code); - } - } - - bool should_download = !file_exists || force_download; - if (!should_download) { - if (!etag.empty() && etag != headers.etag) { - LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str()); - should_download = true; - } else if (!last_modified.empty() && last_modified != headers.last_modified) { - LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str()); - should_download = true; - } - } - if (should_download) { - std::string path_temporary = path + 
".downloadInProgress"; - if (file_exists) { - LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str()); - if (remove(path.c_str()) != 0) { - LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str()); - return false; - } - } - - // Set the output file - - struct FILE_deleter { - void operator()(FILE * f) const { - fclose(f); - } - }; - - std::unique_ptr outfile(fopen(path_temporary.c_str(), "wb")); - if (!outfile) { - LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str()); - return false; - } - - typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd); - auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t { - return fwrite(data, size, nmemb, (FILE *)fd); - }; - curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L); - curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast(write_callback)); - curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get()); - - // display download progress - curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L); - - // helper function to hide password in URL - auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string { - std::size_t protocol_pos = url.find("://"); - if (protocol_pos == std::string::npos) { - return url; // Malformed URL - } - - std::size_t at_pos = url.find('@', protocol_pos + 3); - if (at_pos == std::string::npos) { - return url; // No password in URL - } - - return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos); - }; - - // start the download - LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, - llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str()); - bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS); - if (!was_perform_successful) { - return false; - } - - long http_code = 0; - curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code); - if (http_code < 200 || http_code >= 400) { - LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code); - return false; - } - - // Causes file to be closed explicitly here before we rename it. - outfile.reset(); - - // Write the updated JSON metadata file. 
- metadata.update({ - {"url", url}, - {"etag", headers.etag}, - {"lastModified", headers.last_modified} - }); - std::ofstream(metadata_path) << metadata.dump(4); - LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str()); - - if (rename(path_temporary.c_str(), path.c_str()) != 0) { - LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str()); - return false; - } - } - - return true; -} - -struct llama_model * common_load_model_from_url( - const std::string & model_url, - const std::string & local_path, - const std::string & hf_token, - const struct llama_model_params & params) { - // Basic validation of the model_url - if (model_url.empty()) { - LOG_ERR("%s: invalid model_url\n", __func__); - return NULL; - } - - if (!common_download_file(model_url, local_path, hf_token)) { - return NULL; - } - - // check for additional GGUFs split to download - int n_split = 0; - { - struct gguf_init_params gguf_params = { - /*.no_alloc = */ true, - /*.ctx = */ NULL, - }; - auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params); - if (!ctx_gguf) { - LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str()); - return NULL; - } - - auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT); - if (key_n_split >= 0) { - n_split = gguf_get_val_u16(ctx_gguf, key_n_split); - } - - gguf_free(ctx_gguf); - } - - if (n_split > 1) { - char split_prefix[PATH_MAX] = {0}; - char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0}; - - // Verify the first split file format - // and extract split URL and PATH prefixes - { - if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) { - LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split); - return NULL; - } - - if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) { - LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split); - return NULL; - } - } - - // Prepare download in parallel - std::vector> futures_download; - for (int idx = 1; idx < n_split; idx++) { - futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool { - char split_path[PATH_MAX] = {0}; - llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split); - - char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0}; - llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split); - - return common_download_file(split_url, split_path, hf_token); - }, idx)); - } - - // Wait for all downloads to complete - for (auto & f : futures_download) { - if (!f.get()) { - return NULL; - } - } - } - - return llama_model_load_from_file(local_path.c_str(), params); -} - -struct llama_model * common_load_model_from_hf( - const std::string & repo, - const std::string & remote_path, - const std::string & local_path, - const std::string & hf_token, - const struct llama_model_params & params) { - // construct hugging face model url: - // - // --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf - // https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf - // - // --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf - // https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf - // - - std::string model_url = "https://huggingface.co/"; - model_url += 
repo; - model_url += "/resolve/main/"; - model_url += remote_path; - - return common_load_model_from_url(model_url, local_path, hf_token, params); -} - -/** - * Allow getting the HF file from the HF repo with tag (like ollama), for example: - * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4 - * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M - * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s - * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo) - * - * Return pair of <repo, file> (with "repo" already having tag removed) - * - * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files. - */ -std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) { - auto parts = string_split<std::string>(hf_repo_with_tag, ':'); - std::string tag = parts.size() > 1 ? parts.back() : "latest"; - std::string hf_repo = parts[0]; - if (string_split<std::string>(hf_repo, '/').size() != 2) { - throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n"); - } - - // fetch model info from Hugging Face Hub API - json model_info; - curl_ptr curl(curl_easy_init(), &curl_easy_cleanup); - curl_slist_ptr http_headers; - std::string res_str; - std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag; - curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); - curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); - typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data); - auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t { - static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb); - return size * nmemb; - }; - curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback)); - curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str); -#if defined(_WIN32) - curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); -#endif - if (!hf_token.empty()) { - std::string auth_header = "Authorization: Bearer " + hf_token; - http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str()); - } - // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response - http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp"); - http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json"); - curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr); - - CURLcode res = curl_easy_perform(curl.get()); - - if (res != CURLE_OK) { - throw std::runtime_error("error: cannot make GET request to HF API"); - } - - long res_code; - curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code); - if (res_code == 200) { - model_info = json::parse(res_str); - } else if (res_code == 401) { - throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token"); - } else { - throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str())); - } - - // check response - if (!model_info.contains("ggufFile")) { - throw std::runtime_error("error: model does not have ggufFile"); - } - json & gguf_file = model_info.at("ggufFile"); - if (!gguf_file.contains("rfilename")) { - throw std::runtime_error("error: ggufFile does not have rfilename"); - } - - return
std::make_pair(hf_repo, gguf_file.at("rfilename")); -} - -#else - -struct llama_model * common_load_model_from_url( - const std::string & /*model_url*/, - const std::string & /*local_path*/, - const std::string & /*hf_token*/, - const struct llama_model_params & /*params*/) { - LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__); - return nullptr; -} - -struct llama_model * common_load_model_from_hf( - const std::string & /*repo*/, - const std::string & /*remote_path*/, - const std::string & /*local_path*/, - const std::string & /*hf_token*/, - const struct llama_model_params & /*params*/) { - LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__); - return nullptr; -} - -std::pair common_get_hf_file(const std::string &, const std::string &) { - LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__); - return std::make_pair("", ""); -} - -#endif // LLAMA_USE_CURL - // // Batch utils // diff --git a/common/common.h b/common/common.h index 1c0f19977..41ff9905e 100644 --- a/common/common.h +++ b/common/common.h @@ -184,6 +184,13 @@ struct common_params_sampling { std::string print() const; }; +struct common_params_model { + std::string path = ""; // model local path // NOLINT + std::string url = ""; // model url to download // NOLINT + std::string hf_repo = ""; // HF repo // NOLINT + std::string hf_file = ""; // HF file // NOLINT +}; + struct common_params_speculative { std::vector devices; // devices to use for offloading @@ -197,19 +204,11 @@ struct common_params_speculative { struct cpu_params cpuparams; struct cpu_params cpuparams_batch; - std::string hf_repo = ""; // HF repo // NOLINT - std::string hf_file = ""; // HF file // NOLINT - - std::string model = ""; // draft model for speculative decoding // NOLINT - std::string model_url = ""; // model url to download // NOLINT + struct common_params_model model; }; struct common_params_vocoder { - std::string hf_repo = ""; // HF repo // NOLINT - std::string hf_file = ""; // HF file // NOLINT - - std::string model = ""; // model path // NOLINT - std::string model_url = ""; // model url to download // NOLINT + struct common_params_model model; std::string speaker_file = ""; // speaker file path // NOLINT @@ -267,12 +266,10 @@ struct common_params { struct common_params_speculative speculative; struct common_params_vocoder vocoder; - std::string model = ""; // model path // NOLINT + struct common_params_model model; + std::string model_alias = ""; // model alias // NOLINT - std::string model_url = ""; // model url to download // NOLINT std::string hf_token = ""; // HF token // NOLINT - std::string hf_repo = ""; // HF repo // NOLINT - std::string hf_file = ""; // HF file // NOLINT std::string prompt = ""; // NOLINT std::string system_prompt = ""; // NOLINT std::string prompt_file = ""; // store the external prompt file name // NOLINT @@ -347,7 +344,7 @@ struct common_params { common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO; // multimodal models (see examples/llava) - std::string mmproj = ""; // path to multimodal projector // NOLINT + struct common_params_model mmproj; std::vector image; // path to image file(s) // embedding @@ -546,23 +543,6 @@ struct llama_model_params common_model_params_to_llama ( common_params struct llama_context_params common_context_params_to_llama(const common_params & params); struct ggml_threadpool_params 
ggml_threadpool_params_from_cpu_params(const cpu_params & params); -struct llama_model * common_load_model_from_url( - const std::string & model_url, - const std::string & local_path, - const std::string & hf_token, - const struct llama_model_params & params); - -struct llama_model * common_load_model_from_hf( - const std::string & repo, - const std::string & remote_path, - const std::string & local_path, - const std::string & hf_token, - const struct llama_model_params & params); - -std::pair common_get_hf_file( - const std::string & hf_repo_with_tag, - const std::string & hf_token); - // clear LoRA adapters from context, then apply new list of adapters void common_set_adapter_lora(struct llama_context * ctx, std::vector & lora); diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 430e8be51..0f4019293 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -38,7 +38,7 @@ int main(int argc, char ** argv) { llama_model_params model_params = common_model_params_to_llama(params); - llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params); + llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params); if (model == NULL) { fprintf(stderr , "%s: error: unable to load model\n" , __func__); diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 21b95ef5e..1a5de5928 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -41,7 +41,7 @@ int main(int argc, char ** argv) { llama_model_params model_params = common_model_params_to_llama(params); - llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params); + llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params); if (model == NULL) { LOG_ERR("%s: error: unable to load model\n" , __func__); diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index e7d0fbfff..24dc85cf2 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -421,7 +421,7 @@ int main(int argc, char ** argv) { g_verbose = (params.verbosity > 1); try { - lora_merge_ctx ctx(params.model, params.lora_adapters, params.out_file, params.cpuparams.n_threads); + lora_merge_ctx ctx(params.model.path, params.lora_adapters, params.out_file, params.cpuparams.n_threads); ctx.run_merge(); } catch (const std::exception & err) { fprintf(stderr, "%s\n", err.what()); diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index f7db7861c..539bc4d60 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -168,7 +168,7 @@ int main(int argc, char * argv[]) { llama_backend_init(); - llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams); + llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams); // create generation context llama_context * ctx = llama_init_from_model(model, cparams); diff --git a/examples/llava/README-gemma3.md b/examples/llava/README-gemma3.md index 20bf73fb5..3c25ee258 100644 --- a/examples/llava/README-gemma3.md +++ b/examples/llava/README-gemma3.md @@ -4,6 +4,26 @@ > > This is very experimental, only used for demo purpose. 
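The `common.h` hunk above folds the separate `model`, `model_url`, `hf_repo` and `hf_file` strings into a single `common_params_model` struct that is reused for the main model, the draft model, the vocoder and the multimodal projector. A minimal sketch of what call sites look like after this change; the struct mirrors the diff, while the surrounding `main()` and the values are illustrative assumptions.

```cpp
// Sketch only: the struct mirrors the one added in common.h; values are hypothetical.
#include <string>
#include <cstdio>

struct common_params_model {
    std::string path;     // model local path
    std::string url;      // model url to download
    std::string hf_repo;  // HF repo
    std::string hf_file;  // HF file
};

struct common_params_speculative {
    common_params_model model; // draft model, previously a plain std::string
};

int main() {
    common_params_speculative speculative;
    speculative.model.hf_repo = "ggml-org/models";   // hypothetical
    speculative.model.path    = "models/draft.gguf"; // hypothetical

    // call sites that previously read params.speculative.model (a string)
    // now read params.speculative.model.path:
    std::printf("draft model: %s\n", speculative.model.path.c_str());
    return 0;
}
```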
+## Quick start + +You can use the pre-quantized models from [ggml-org](https://huggingface.co/ggml-org)'s Hugging Face account: + +```bash +# build +cmake -B build +cmake --build build --target llama-gemma3-cli + +# alternatively, install from brew (macOS) +brew install llama.cpp + +# run it +llama-gemma3-cli -hf ggml-org/gemma-3-4b-it-GGUF +llama-gemma3-cli -hf ggml-org/gemma-3-12b-it-GGUF +llama-gemma3-cli -hf ggml-org/gemma-3-27b-it-GGUF + +# note: the 1B model does not support vision +``` + ## How to get mmproj.gguf? ```bash diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp index c36bb2eda..7813ac19f 100644 --- a/examples/llava/gemma3-cli.cpp +++ b/examples/llava/gemma3-cli.cpp @@ -78,7 +78,7 @@ struct gemma3_context { } void init_clip_model(common_params & params) { - const char * clip_path = params.mmproj.c_str(); + const char * clip_path = params.mmproj.path.c_str(); ctx_clip = clip_model_load(clip_path, params.verbosity > 1); } @@ -232,13 +232,13 @@ int main(int argc, char ** argv) { common_init(); - if (params.mmproj.empty()) { + if (params.mmproj.path.empty()) { show_additional_info(argc, argv); return 1; } gemma3_context ctx(params); - printf("%s: %s\n", __func__, params.model.c_str()); + printf("%s: %s\n", __func__, params.model.path.c_str()); bool is_single_turn = !params.prompt.empty() && !params.image.empty(); diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 40aa0876f..a15131343 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -225,7 +225,7 @@ static struct llama_model * llava_init(common_params * params) { llama_model_params model_params = common_model_params_to_llama(*params); - llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params); + llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params); if (model == NULL) { LOG_ERR("%s: unable to load model\n" , __func__); return NULL; @@ -234,7 +234,7 @@ static struct llama_model * llava_init(common_params * params) { } static struct llava_context * llava_init_context(common_params * params, llama_model * model) { - const char * clip_path = params->mmproj.c_str(); + const char * clip_path = params->mmproj.path.c_str(); auto prompt = params->prompt; if (prompt.empty()) { @@ -283,7 +283,7 @@ int main(int argc, char ** argv) { common_init(); - if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { + if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { print_usage(argc, argv); return 1; } diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index 12f536cf5..48fddeaa4 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -31,7 +31,7 @@ static struct llama_model * llava_init(common_params * params) { llama_model_params model_params = common_model_params_to_llama(*params); - llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params); + llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params); if (model == NULL) { LOG_ERR("%s: unable to load model\n" , __func__); return NULL; @@ -80,7 +80,7 @@ static void llava_free(struct llava_context * ctx_llava) { } static struct clip_ctx * clip_init_context(common_params * params) { - const char * clip_path = params->mmproj.c_str(); + const char * clip_path = params->mmproj.path.c_str(); auto prompt = params->prompt; if (prompt.empty()) { @@ -290,7 +290,7 
@@ int main(int argc, char ** argv) { common_init(); - if (params.mmproj.empty() || (params.image.empty())) { + if (params.mmproj.path.empty() || (params.image.empty())) { show_additional_info(argc, argv); return 1; } diff --git a/examples/llava/qwen2vl-cli.cpp b/examples/llava/qwen2vl-cli.cpp index 132a7da54..c6481e482 100644 --- a/examples/llava/qwen2vl-cli.cpp +++ b/examples/llava/qwen2vl-cli.cpp @@ -314,7 +314,7 @@ static struct llama_model * llava_init(common_params * params) { llama_model_params model_params = common_model_params_to_llama(*params); - llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params); + llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params); if (model == NULL) { LOG_ERR("%s: unable to load model\n" , __func__); return NULL; @@ -323,7 +323,7 @@ static struct llama_model * llava_init(common_params * params) { } static struct llava_context * llava_init_context(common_params * params, llama_model * model) { - const char * clip_path = params->mmproj.c_str(); + const char * clip_path = params->mmproj.path.c_str(); auto prompt = params->prompt; if (prompt.empty()) { @@ -524,7 +524,7 @@ int main(int argc, char ** argv) { common_init(); - if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { + if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { print_usage(argc, argv); return 1; } diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 588632f04..e0e6da631 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -405,7 +405,7 @@ int main(int argc, char ** argv) { params.prompt_file = "used built-in defaults"; } LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str()); - LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str()); + LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.path.c_str()); LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6); LOG_INF("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6); diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index ea3a6c1fc..347ea4a69 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -64,7 +64,7 @@ int main(int argc, char ** argv) { llama_model_params model_params = common_model_params_to_llama(params); - llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params); + llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params); if (model == NULL) { LOG_ERR("%s: unable to load model\n" , __func__); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 17a292da1..d140f8c44 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1876,7 +1876,7 @@ struct server_context { } bool load_model(const common_params & params) { - SRV_INF("loading model '%s'\n", params.model.c_str()); + SRV_INF("loading model '%s'\n", params.model.path.c_str()); params_base = params; @@ -1886,7 +1886,7 @@ struct server_context { ctx = llama_init.context.get(); if (model == nullptr) { - SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str()); + SRV_ERR("failed to load model, '%s'\n", params_base.model.path.c_str()); return false; } @@ -1897,16 +1897,13 @@ struct server_context { 
add_bos_token = llama_vocab_get_add_bos(vocab); has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL; - if (!params_base.speculative.model.empty() || !params_base.speculative.hf_repo.empty()) { - SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str()); + if (!params_base.speculative.model.path.empty() || !params_base.speculative.model.hf_repo.empty()) { + SRV_INF("loading draft model '%s'\n", params_base.speculative.model.path.c_str()); auto params_dft = params_base; params_dft.devices = params_base.speculative.devices; - params_dft.hf_file = params_base.speculative.hf_file; - params_dft.hf_repo = params_base.speculative.hf_repo; params_dft.model = params_base.speculative.model; - params_dft.model_url = params_base.speculative.model_url; params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx; params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers; params_dft.n_parallel = 1; @@ -1920,12 +1917,12 @@ struct server_context { model_dft = llama_init_dft.model.get(); if (model_dft == nullptr) { - SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str()); + SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.path.c_str()); return false; } if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) { - SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str()); + SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str()); return false; } @@ -3865,7 +3862,7 @@ int main(int argc, char ** argv) { json data = { { "default_generation_settings", ctx_server.default_generation_settings_for_props }, { "total_slots", ctx_server.params_base.n_parallel }, - { "model_path", ctx_server.params_base.model }, + { "model_path", ctx_server.params_base.model.path }, { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) }, { "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)}, { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)}, @@ -4131,7 +4128,7 @@ int main(int argc, char ** argv) { {"object", "list"}, {"data", { { - {"id", params.model_alias.empty() ? params.model : params.model_alias}, + {"id", params.model_alias.empty() ? 
params.model.path : params.model_alias}, {"object", "model"}, {"created", std::time(0)}, {"owned_by", "llamacpp"}, diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp index a5d2bc9d0..0783ed4a4 100644 --- a/examples/speculative-simple/speculative-simple.cpp +++ b/examples/speculative-simple/speculative-simple.cpp @@ -24,7 +24,7 @@ int main(int argc, char ** argv) { common_init(); - if (params.speculative.model.empty()) { + if (params.speculative.model.path.empty()) { LOG_ERR("%s: --model-draft is required\n", __func__); return 1; } diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 627d01bbc..561c30883 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -46,7 +46,7 @@ int main(int argc, char ** argv) { common_init(); - if (params.speculative.model.empty()) { + if (params.speculative.model.path.empty()) { LOG_ERR("%s: --model-draft is required\n", __func__); return 1; } diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index c7ac94cc5..0f0479869 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -577,12 +577,7 @@ int main(int argc, char ** argv) { const llama_vocab * vocab = llama_model_get_vocab(model_ttc); - // TODO: refactor in a common struct - params.model = params.vocoder.model; - params.model_url = params.vocoder.model_url; - params.hf_repo = params.vocoder.hf_repo; - params.hf_file = params.vocoder.hf_file; - + params.model = params.vocoder.model; params.embedding = true; common_init_result llama_init_cts = common_init_from_params(params); diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp index 69604b87c..537fc63a4 100644 --- a/tests/test-arg-parser.cpp +++ b/tests/test-arg-parser.cpp @@ -77,7 +77,7 @@ int main(void) { argv = {"binary_name", "-m", "model_file.gguf"}; assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); - assert(params.model == "model_file.gguf"); + assert(params.model.path == "model_file.gguf"); argv = {"binary_name", "-t", "1234"}; assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); @@ -89,7 +89,7 @@ int main(void) { argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"}; assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); - assert(params.model == "abc.gguf"); + assert(params.model.path == "abc.gguf"); assert(params.n_predict == 6789); assert(params.n_batch == 9090); @@ -112,7 +112,7 @@ int main(void) { setenv("LLAMA_ARG_THREADS", "1010", true); argv = {"binary_name"}; assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); - assert(params.model == "blah.gguf"); + assert(params.model.path == "blah.gguf"); assert(params.cpuparams.n_threads == 1010); @@ -122,7 +122,7 @@ int main(void) { setenv("LLAMA_ARG_THREADS", "1010", true); argv = {"binary_name", "-m", "overwritten.gguf"}; assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); - assert(params.model == "overwritten.gguf"); + assert(params.model.path == "overwritten.gguf"); assert(params.cpuparams.n_threads == 1010); #endif // _WIN32 From 9bacd6b37461608385360fd64326c13247ccf18e Mon Sep 17 00:00:00 2001 From: Chenguang Li <757486878@qq.com> Date: Wed, 2 Apr 2025 15:22:13 +0800 Subject: [PATCH 06/20] [CANN] get_rows 
and dup optimization (#12671) * [CANN]get_rows and dup optimization. Co-authored-by: hipudding Signed-off-by: noemotiovon * [CANN]GET_ROWS and CPY/DUP optimization Co-authored-by: hipudding Signed-off-by: noemotiovon * [CANN]code style adjustment Signed-off-by: noemotiovon * [CANN]code style adjustment Signed-off-by: noemotiovon * [CANN]code style adjustment Signed-off-by: noemotiovon * [CANN]code style adjustment Signed-off-by: noemotiovon --------- Signed-off-by: noemotiovon Co-authored-by: noemotiovon Co-authored-by: hipudding --- ggml/src/ggml-cann/CMakeLists.txt | 2 - ggml/src/ggml-cann/aclnn_ops.cpp | 479 +++++++++--------- ggml/src/ggml-cann/ggml-cann.cpp | 26 +- ggml/src/ggml-cann/kernels/CMakeLists.txt | 30 -- ggml/src/ggml-cann/kernels/ascendc_kernels.h | 19 - ggml/src/ggml-cann/kernels/dup.cpp | 234 --------- ggml/src/ggml-cann/kernels/get_row_f16.cpp | 197 ------- ggml/src/ggml-cann/kernels/get_row_f32.cpp | 190 ------- ggml/src/ggml-cann/kernels/get_row_q4_0.cpp | 204 -------- ggml/src/ggml-cann/kernels/get_row_q8_0.cpp | 191 ------- .../ggml-cann/kernels/quantize_f16_q8_0.cpp | 218 -------- .../ggml-cann/kernels/quantize_f32_q8_0.cpp | 216 -------- .../kernels/quantize_float_to_q4_0.cpp | 295 ----------- 13 files changed, 262 insertions(+), 2039 deletions(-) delete mode 100644 ggml/src/ggml-cann/kernels/CMakeLists.txt delete mode 100644 ggml/src/ggml-cann/kernels/ascendc_kernels.h delete mode 100644 ggml/src/ggml-cann/kernels/dup.cpp delete mode 100644 ggml/src/ggml-cann/kernels/get_row_f16.cpp delete mode 100644 ggml/src/ggml-cann/kernels/get_row_f32.cpp delete mode 100644 ggml/src/ggml-cann/kernels/get_row_q4_0.cpp delete mode 100644 ggml/src/ggml-cann/kernels/get_row_q8_0.cpp delete mode 100644 ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp delete mode 100644 ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp delete mode 100644 ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp diff --git a/ggml/src/ggml-cann/CMakeLists.txt b/ggml/src/ggml-cann/CMakeLists.txt index 05cf06bfa..0d8e483b2 100644 --- a/ggml/src/ggml-cann/CMakeLists.txt +++ b/ggml/src/ggml-cann/CMakeLists.txt @@ -51,13 +51,11 @@ if (CANN_INSTALL_DIR) ${CANN_INSTALL_DIR}/acllib/include ) - add_subdirectory(kernels) list(APPEND CANN_LIBRARIES ascendcl nnopbase opapi acl_op_compiler - ascendc_kernels ) file(GLOB GGML_SOURCES_CANN "*.cpp") diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 6bb5d0834..8482bb537 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -58,7 +59,6 @@ #include #include "ggml-impl.h" -#include "kernels/ascendc_kernels.h" #define GGML_COMMON_DECL_C @@ -99,6 +99,35 @@ static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src, ACL_CHECK(aclDestroyIntArray(repeats)); } +/** + * @brief Casts the elements of a tensor to a specified data type using the CANN backend. + * + * @details This function performs a type conversion on the elements of the input tensor `acl_src` + * and stores the results in the destination tensor `acl_dst`. The conversion type is + * determined based on the `dst` tensor's data type. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor whose elements will be cast. + * @param acl_dst The destination tensor that will store the casted elements. + * @param dst The ggml tensor specifying the target data type. 
+ */ +static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_dst, ggml_tensor* dst) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + ACL_CHECK(aclnnCastGetWorkspaceSize(acl_src, + ggml_cann_type_mapping(dst->type), + acl_dst, &workspaceSize, &executor)); + + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, ctx.stream())); +} + void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_tensor* src = dst->src[0]; GGML_ASSERT(ggml_can_repeat(src, dst)); @@ -889,173 +918,76 @@ static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src, } void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - ggml_tensor* src = dst->src[0]; + ggml_tensor* src0 = dst->src[0]; - aclTensor* acl_src = ggml_cann_create_tensor(src); + aclTensor* acl_src = ggml_cann_create_tensor(src0); aclTensor* acl_dst = ggml_cann_create_tensor(dst); - - ggml_cann_pool_alloc src_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); - ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); - src->extra = src_extra_allocator.get(); - dst->extra = dst_extra_allocator.get(); - ACL_CHECK(aclrtMemcpyAsync(src->extra, sizeof(ggml_tensor), src, - sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, - ctx.stream())); - ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst, - sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, - ctx.stream())); - - if ((dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32) && - ggml_are_same_shape(src, dst)) { - cann_copy(ctx, acl_src, acl_dst); - ACL_CHECK(aclDestroyTensor(acl_src)); - ACL_CHECK(aclDestroyTensor(acl_dst)); - return; - } - // TODO: simplify - if (src->type == GGML_TYPE_F16) { - if (dst->type == GGML_TYPE_Q8_0) { - aclrtlaunch_ascendc_quantize_f16_q8_0( - 24, ctx.stream(), src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne); - return; - } - if (dst->type == GGML_TYPE_Q4_0) { - aclrtlaunch_ascendc_quantize_f16_to_q4_0( - 24, ctx.stream(), src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne); - return; - } - if (dst->type == GGML_TYPE_F16) { - if (ggml_are_same_shape(src, dst)) { - cann_copy(ctx, acl_src, acl_dst); - ACL_CHECK(aclDestroyTensor(acl_src)); - ACL_CHECK(aclDestroyTensor(acl_dst)); - return; - } - if (ggml_is_contiguous(dst)) { - const size_t src_type_size = ggml_type_size(src->type); - if (src->nb[0] == src_type_size) { - // src0 is contigous on first dimension, copy by rows - int64_t rows_num = ggml_nrows(src); - - aclrtlaunch_ascendc_dup_by_rows_fp16( - rows_num, ctx.stream(), src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, - ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); - return; - } - GGML_ABORT("fatal error"); - } - GGML_ABORT("fatal error"); - } - if (dst->type == GGML_TYPE_F32) { - if (ggml_are_same_shape(src, dst)) { - cann_copy(ctx, acl_src, acl_dst); - ACL_CHECK(aclDestroyTensor(acl_src)); - ACL_CHECK(aclDestroyTensor(acl_dst)); - return; - } - if (ggml_is_contiguous(dst)) { - const size_t src_type_size = ggml_type_size(src->type); - if (src->nb[0] == src_type_size) { - // src0 is contigous on first dimension, copy by rows - int64_t rows_num = ggml_nrows(src); - 
aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32( - rows_num, ctx.stream(), src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, - ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); - return; - } - GGML_ABORT("fatal error"); - } - GGML_ABORT("fatal error"); - } - // TODO - GGML_ABORT("fatal error"); - } else if (src->type == GGML_TYPE_F32) { - // TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size - // && nb0 == type_size) - if (dst->type == GGML_TYPE_Q8_0) { - aclrtlaunch_ascendc_quantize_f32_q8_0( - 24, ctx.stream(), src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne); - return; - } - if (dst->type == GGML_TYPE_Q4_0) { - aclrtlaunch_ascendc_quantize_f32_to_q4_0( - 24, ctx.stream(), src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne); - return; - } - if (dst->type == GGML_TYPE_F32) { - if (ggml_are_same_shape(src, dst)) { - cann_copy(ctx, acl_src, acl_dst); - ACL_CHECK(aclDestroyTensor(acl_src)); - ACL_CHECK(aclDestroyTensor(acl_dst)); - return; - } - if (ggml_is_contiguous(dst)) { - const size_t src_type_size = ggml_type_size(src->type); - if (src->nb[0] == src_type_size) { - // src0 is contigous on first dimension, copy by rows - int64_t rows_num = ggml_nrows(src); - aclrtlaunch_ascendc_dup_by_rows_fp32( - rows_num, ctx.stream(), src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, - ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); - return; - } - GGML_ABORT("fatal error"); - } else { - // TODO: dst not contiguous - GGML_ABORT("fatal error"); - } - } - if (dst->type == GGML_TYPE_F16) { - if (ggml_are_same_shape(src, dst)) { - cann_copy(ctx, acl_src, acl_dst); - ACL_CHECK(aclDestroyTensor(acl_src)); - ACL_CHECK(aclDestroyTensor(acl_dst)); - return; - } - if (ggml_is_contiguous(dst)) { - const size_t src_type_size = ggml_type_size(src->type); - if (src->nb[0] == src_type_size) { - // src0 is contigous on first dimension, copy by rows - int64_t rows_num = ggml_nrows(src); - aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16( - rows_num, ctx.stream(), src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, - ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); - return; - } - GGML_ABORT("fatal error"); - } - } - // TODO - GGML_ABORT("fatal error"); - } else { - if (ggml_are_same_shape(src, dst)) { + if (ggml_are_same_shape(src0, dst)) { + if (dst->type == src0->type) { cann_copy(ctx, acl_src, acl_dst); - ACL_CHECK(aclDestroyTensor(acl_src)); - ACL_CHECK(aclDestroyTensor(acl_dst)); - return; + } else { + aclnn_cast(ctx, acl_src, acl_dst, dst); + } + } else { + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) { + if (dst->type == src0->type) { + size_t cpy_size = ggml_nbytes(dst); + ACL_CHECK(aclrtMemcpyAsync( + dst->data, cpy_size, src0->data, cpy_size, + ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream())); + return; + } else { + ggml_cann_pool_alloc src_buffer_allocator( + ctx.pool(), + ggml_nelements(dst) * ggml_type_size(dst->type)); + void* src_trans_buffer = src_buffer_allocator.get(); + size_t src_trans_nb[GGML_MAX_DIMS]; + src_trans_nb[0] = ggml_type_size(dst->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1]; + } + aclTensor* src_trans_tensor = ggml_cann_create_tensor( + src_trans_buffer, ggml_cann_type_mapping(dst->type), + 
ggml_type_size(dst->type), src0->ne, src_trans_nb, + GGML_MAX_DIMS); + + aclnn_cast(ctx, acl_src, src_trans_tensor, dst); + size_t cpy_size = ggml_nbytes(dst); + ACL_CHECK(aclrtMemcpyAsync( + dst->data, cpy_size, src_trans_buffer, cpy_size, + ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream())); + ACL_CHECK(aclDestroyTensor(src_trans_tensor)); + return; + } + } else if (ggml_is_contiguous(dst)) { + ggml_cann_pool_alloc src_buffer_allocator( + ctx.pool(), ggml_nelements(dst) * ggml_type_size(dst->type)); + void* src_trans_buffer = src_buffer_allocator.get(); + size_t src_trans_nb[GGML_MAX_DIMS]; + src_trans_nb[0] = ggml_type_size(dst->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1]; + } + aclTensor* src_trans_tensor = ggml_cann_create_tensor( + src_trans_buffer, ggml_cann_type_mapping(dst->type), + ggml_type_size(dst->type), src0->ne, src_trans_nb, + GGML_MAX_DIMS); + + aclnn_cast(ctx, acl_src, src_trans_tensor, dst); + + size_t cpy_size = ggml_nbytes(dst); + ACL_CHECK(aclrtMemcpyAsync(dst->data, cpy_size, src_trans_buffer, + cpy_size, ACL_MEMCPY_DEVICE_TO_DEVICE, + ctx.stream())); + ACL_CHECK(aclDestroyTensor(src_trans_tensor)); + return; + } else { + GGML_ABORT("Unsupported: dst is not contiguous."); } - GGML_ABORT("fatal error"); } + + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); } #ifdef __cplusplus @@ -2378,85 +2310,168 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(aclDestroyTensor(tmp_mask_tensor)); } -void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - ggml_tensor* src0 = dst->src[0]; - ggml_tensor* src1 = dst->src[1]; +/** + * @brief Performs embedding operation on a 4D tensor using the CANN backend. + * + * This function extracts slices from the source tensor (`src_buffer`), + * index tensor (`index`), and destination tensor (`dst`), and performs an + * embedding operation on them. The embedding operation is applied by iterating + * over the last two dimensions of the source tensor, creating the necessary + * tensors for the source, index, and output, and executing the embedding operation. + * + * @param ctx The context for CANN backend operations. + * @param src_buffer The source buffer holding the data for the source tensor. + * @param src_ne The dimensions of the source tensor. + * @param src_nb The strides (byte offsets) of the source tensor. + * @param index The index tensor used in the embedding operation. + * @param dst The destination tensor where the result will be stored. 
+ */ +static void aclnn_embedding_4d(ggml_backend_cann_context& ctx, void* src_buffer, + int64_t* src_ne, size_t* src_nb, ggml_tensor* index, + ggml_tensor* dst) { + for (int64_t i = 0; i < src_ne[3]; i++) { + for (int64_t j = 0; j < src_ne[2]; j++) { + // src + int64_t acl_src_ne[2] = {src_ne[0], src_ne[1]}; + size_t acl_src_nb[2] = {src_nb[0], src_nb[1]}; + aclTensor* acl_src_tensor = ggml_cann_create_tensor( + (char*)src_buffer + i * src_nb[3] + j * src_nb[2], + ggml_cann_type_mapping(dst->type), ggml_element_size(dst), + acl_src_ne, acl_src_nb, 2); - ggml_cann_pool_alloc src0_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); - ggml_cann_pool_alloc src1_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); - ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); - src0->extra = src0_extra_allocator.get(); - src1->extra = src1_extra_allocator.get(); - dst->extra = dst_extra_allocator.get(); - ACL_CHECK(aclrtMemcpyAsync(src0->extra, sizeof(ggml_tensor), src0, - sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, - ctx.stream())); - ACL_CHECK(aclrtMemcpyAsync(src1->extra, sizeof(ggml_tensor), src1, - sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, - ctx.stream())); - ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst, - sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, - ctx.stream())); + // index + int64_t acl_index_ne[1] = {index->ne[0]}; + size_t acl_index_nb[1] = {index->nb[0]}; + aclTensor* acl_index = ggml_cann_create_tensor( + (char*)index->data + i * index->nb[2] + j * index->nb[1], + ggml_cann_type_mapping(index->type), ggml_element_size(index), + acl_index_ne, acl_index_nb, 1); + + // out + int64_t acl_out_ne[2] = {dst->ne[0], dst->ne[1]}; + size_t acl_out_nb[2] = {dst->nb[0], dst->nb[1]}; + aclTensor* acl_out = ggml_cann_create_tensor( + (char*)dst->data + i * dst->nb[3] + j * dst->nb[2], + ggml_cann_type_mapping(dst->type), ggml_element_size(dst), + acl_out_ne, acl_out_nb, 2); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnEmbeddingGetWorkspaceSize( + acl_src_tensor, acl_index, acl_out, &workspaceSize, &executor)); + + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), + workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK(aclnnEmbedding(workspaceAddr, workspaceSize, executor, + ctx.stream())); + + ACL_CHECK(aclDestroyTensor(acl_src_tensor)); + ACL_CHECK(aclDestroyTensor(acl_index)); + ACL_CHECK(aclDestroyTensor(acl_out)); + } + } +} + +void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src0 = dst->src[0]; // src + ggml_tensor* src1 = dst->src[1]; // index switch (src0->type) { case GGML_TYPE_F32: { -#ifdef ASCEND_310P - // Special operation for get_row_f32 kernel of 310P: clear the - // content of dest data buffer when row is not aligned to 32 bytes - if ((src0->ne[0] % 8) != 0) { - size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * - src0->ne[0] * ggml_type_size(GGML_TYPE_F32); - ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len)); - } -#endif - aclrtlaunch_ascendc_get_row_f32( - 24, ctx.stream(), src0->data, src1->data, dst->data, - ((ggml_tensor*)src0->extra)->ne, - ((ggml_tensor*)src0->extra)->nb, - ((ggml_tensor*)src1->extra)->ne, - ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); + aclnn_embedding_4d(ctx, src0->data, src0->ne, src0->nb, src1, + dst); break; } case GGML_TYPE_F16: { -#ifdef ASCEND_310P - // Special operation 
for get_row_f16 kernel of 310P: clear the - // content of dest data buffer when row is not aligned to 32 bytes - if ((src0->ne[0] % 16) != 0) { - size_t dst_len = - src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * - ggml_type_size( - GGML_TYPE_F32); // out is also f32, even input is f16 - ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len)); + aclTensor* acl_src0 = ggml_cann_create_tensor(src0); + ggml_cann_pool_alloc src_buffer_allocator( + ctx.pool(), ggml_nelements(src0) * sizeof(float_t)); + void* src_trans_buffer = src_buffer_allocator.get(); + size_t src_trans_nb[GGML_MAX_DIMS]; + src_trans_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1]; } -#endif - aclrtlaunch_ascendc_get_row_f16( - 24, ctx.stream(), src0->data, src1->data, dst->data, - ((ggml_tensor*)src0->extra)->ne, - ((ggml_tensor*)src0->extra)->nb, - ((ggml_tensor*)src1->extra)->ne, - ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); + aclTensor* src_trans_tensor = ggml_cann_create_tensor( + src_trans_buffer, ACL_FLOAT, ggml_type_size(dst->type), + src0->ne, src_trans_nb, GGML_MAX_DIMS); + aclnn_cast(ctx, acl_src0, src_trans_tensor, dst); + aclnn_embedding_4d(ctx, src_trans_buffer, src0->ne, + src_trans_nb, src1, dst); + ACL_CHECK(aclDestroyTensor(acl_src0)); + ACL_CHECK(aclDestroyTensor(src_trans_tensor)); break; } - case GGML_TYPE_Q4_0: - aclrtlaunch_ascendc_get_row_q4_0( - 24, ctx.stream(), src0->data, src1->data, dst->data, - ((ggml_tensor*)src0->extra)->ne, - ((ggml_tensor*)src1->extra)->ne, - ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); - break; - case GGML_TYPE_Q8_0: - aclrtlaunch_ascendc_get_row_q8_0( - 24, ctx.stream(), src0->data, src1->data, dst->data, - ((ggml_tensor*)src0->extra)->ne, - ((ggml_tensor*)src1->extra)->ne, - ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); + case GGML_TYPE_Q8_0: { + // add 1 dim for bcast mul. 
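For the Q8_0 branch below: a Q8_0 tensor stores groups of QK8_0 = 32 int8 weights that share one fp16 scale, so dequantization is simply `x[i] = d * q[i]` per group, which is exactly what the broadcast multiply of the `[..., n_blocks, 32]` weight view against the `[..., n_blocks, 1]` scale view computes. A scalar sketch of that arithmetic follows; the scale is kept as `float` for brevity and the layout of the scales inside the CANN buffer is not modeled.

```cpp
// Scalar sketch of Q8_0 dequantization, the arithmetic the broadcast mul performs.
#include <cstdint>
#include <cstdio>

constexpr int QK8_0 = 32;

// simplified block: one scale per QK8_0 quantized values
// (ggml stores the scale as fp16; float is used here for brevity)
struct block_q8_0_sketch {
    float  d;
    int8_t qs[QK8_0];
};

static void dequantize_row(const block_q8_0_sketch * blocks, int n_blocks, float * out) {
    for (int b = 0; b < n_blocks; ++b) {
        for (int i = 0; i < QK8_0; ++i) {
            out[b*QK8_0 + i] = blocks[b].d * blocks[b].qs[i]; // x = d * q
        }
    }
}

int main() {
    block_q8_0_sketch blk = { 0.05f, {} };
    for (int i = 0; i < QK8_0; ++i) {
        blk.qs[i] = (int8_t)(i - 16);
    }
    float out[QK8_0];
    dequantize_row(&blk, 1, out);
    std::printf("%f .. %f\n", out[0], out[QK8_0 - 1]);
    return 0;
}
```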
+ size_t weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1], + dequant_nb[GGML_MAX_DIMS + 1]; + int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1], + *dequant_ne; + int64_t scale_offset = 0; + + // [3,4,5,64] -> [3,4,5,2,32] + weight_ne[0] = QK8_0; + weight_ne[1] = src0->ne[0] / QK8_0; + weight_nb[0] = sizeof(int8_t); + weight_nb[1] = weight_nb[0] * weight_ne[0]; + for (int i = 2; i < GGML_MAX_DIMS + 1; i++) { + weight_ne[i] = src0->ne[i - 1]; + weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1]; + } + + // [3,4,5,64] -> [3,4,5,2,1] + scale_ne[0] = 1; + scale_ne[1] = src0->ne[0] / QK8_0; + scale_nb[0] = sizeof(uint16_t); + scale_nb[1] = scale_nb[0] * scale_ne[0]; + for (int i = 2; i < GGML_MAX_DIMS + 1; i++) { + scale_ne[i] = src0->ne[i - 1]; + scale_nb[i] = scale_nb[i - 1] * scale_ne[i - 1]; + } + + // [3,4,5,64] -> [3,4,5,2,32] + dequant_ne = weight_ne; + dequant_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS + 1; i++) { + dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1]; + } + + scale_offset = ggml_nelements(src0) * sizeof(int8_t); + ggml_cann_pool_alloc dequant_buffer_allocator( + ctx.pool(), ggml_nelements(src0) * sizeof(float_t)); + + aclTensor* acl_weight_tensor = ggml_cann_create_tensor( + src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb, + GGML_MAX_DIMS + 1); + aclTensor* acl_scale_tensor = ggml_cann_create_tensor( + src0->data, ACL_FLOAT16, sizeof(float16_t), scale_ne, scale_nb, + GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset); + aclTensor* dequant_tensor = ggml_cann_create_tensor( + dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float_t), + dequant_ne, dequant_nb, GGML_MAX_DIMS + 1); + + aclnn_mul(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor); + dequant_nb[0] = sizeof(float_t); + dequant_ne = src0->ne; + for (int i = 1; i < GGML_MAX_DIMS; i++) { + dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1]; + } + + aclnn_embedding_4d(ctx, dequant_buffer_allocator.get(), + dequant_ne, dequant_nb, src1, dst); + + ACL_CHECK(aclDestroyTensor(dequant_tensor)); break; + } default: - GGML_ABORT("fatal error"); + GGML_ABORT("Unsupported tensor type for GGML_OP_GET_ROWS"); break; } } @@ -2797,8 +2812,8 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize( acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr, - nullptr, nullptr, nullptr, antiquantGroupSize, acl_output_tensor, - &workspaceSize, &executor)); + nullptr, nullptr, nullptr, antiquantGroupSize, + acl_output_tensor, &workspaceSize, &executor)); if (workspaceAddr == nullptr) { workspaceAddr = workspace_allocator.alloc(workspaceSize); } diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 68cd9920d..da75f77f5 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -1704,7 +1704,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, switch (op->src[0]->type) { case GGML_TYPE_F32: case GGML_TYPE_F16: - case GGML_TYPE_Q4_0: case GGML_TYPE_Q8_0: return true; default: @@ -1712,16 +1711,21 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, } } break; case GGML_OP_CPY: { - switch (op->type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q4_0: - return true; - default: - return false; + ggml_tensor *src = op->src[0]; + if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) || + (src->type != GGML_TYPE_F32 && + src->type != GGML_TYPE_F16)) { + // only 
support F32 and F16. + return false; } - } + + if (!ggml_are_same_shape(op, src) && !ggml_is_contiguous(op)) { + // unsupport dst is not contiguous. + return false; + } + + return true; + } break; case GGML_OP_CONT: { // TODO: support GGML_TYPE_BF16 switch (op->src[0]->type) { @@ -1762,9 +1766,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, } return true; } + case GGML_OP_DUP: case GGML_OP_IM2COL: case GGML_OP_CONCAT: - case GGML_OP_DUP: case GGML_OP_REPEAT: case GGML_OP_NONE: case GGML_OP_RESHAPE: diff --git a/ggml/src/ggml-cann/kernels/CMakeLists.txt b/ggml/src/ggml-cann/kernels/CMakeLists.txt deleted file mode 100644 index d687220c3..000000000 --- a/ggml/src/ggml-cann/kernels/CMakeLists.txt +++ /dev/null @@ -1,30 +0,0 @@ -file(GLOB SRC_FILES - get_row_f32.cpp - get_row_f16.cpp - get_row_q4_0.cpp - get_row_q8_0.cpp - quantize_f32_q8_0.cpp - quantize_f16_q8_0.cpp - quantize_float_to_q4_0.cpp - dup.cpp -) - -set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR}) -set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim") - -if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) - set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) -elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake) - set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake) -else() - message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.") -endif() -include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) - -ascendc_library(ascendc_kernels STATIC - ${SRC_FILES} -) - -message(STATUS "CANN: compile ascend kernels witch SOC_TYPE:${SOC_TYPE}, SOC_VERSION:${SOC_VERSION}, compile macro:-D${SOC_TYPE_COMPILE_OPTION}.") -ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}") -# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP) diff --git a/ggml/src/ggml-cann/kernels/ascendc_kernels.h b/ggml/src/ggml-cann/kernels/ascendc_kernels.h deleted file mode 100644 index 7e153208c..000000000 --- a/ggml/src/ggml-cann/kernels/ascendc_kernels.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef ASCENDC_KERNELS_H -#define ASCENDC_KERNELS_H - -#include "aclrtlaunch_ascendc_get_row_f32.h" -#include "aclrtlaunch_ascendc_get_row_f16.h" -#include "aclrtlaunch_ascendc_get_row_q8_0.h" -#include "aclrtlaunch_ascendc_get_row_q4_0.h" - -#include "aclrtlaunch_ascendc_quantize_f32_q8_0.h" -#include "aclrtlaunch_ascendc_quantize_f16_q8_0.h" -#include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h" -#include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h" - -#include "aclrtlaunch_ascendc_dup_by_rows_fp16.h" -#include "aclrtlaunch_ascendc_dup_by_rows_fp32.h" -#include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h" -#include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h" - -#endif // ASCENDC_KERNELS_H diff --git a/ggml/src/ggml-cann/kernels/dup.cpp b/ggml/src/ggml-cann/kernels/dup.cpp deleted file mode 100644 index d9b957449..000000000 --- a/ggml/src/ggml-cann/kernels/dup.cpp +++ /dev/null @@ -1,234 +0,0 @@ -#include "kernel_operator.h" - -using namespace AscendC; - -#define BUFFER_NUM 2 -const int64_t SUPPORTED_MAX_DIM = 65535; // currently the limit of max block dim supportted by dup kernel is 65535template - -template -class DupByRows { - public: - __aicore__ inline DupByRows() {} - __aicore__ inline void init(GM_ADDR src, GM_ADDR dst, int64_t *input_ne_ub, - size_t *input_nb_ub) { - /* Dup by rows when src is contigous on first dimension and dst is - 
contiguous, each kernel process one row. - */ - - // Input has four dims. - int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); - - // param - num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3]; - num_elem = input_ne_ub[0]; - - // index for (ne[1], ne[2], ne[3]): (idx_ne1, idx_ne2, idx_ne3) - idx_ne3 = op_block_idx / (input_ne_ub[1] * input_ne_ub[2]); - idx_ne2 = (op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2])) - / (input_ne_ub[1]); - idx_ne1 = op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]) - - idx_ne2 * input_ne_ub[1]; - - // src may not contiguous in dim [1,2,3], so stride decited by ne&nb - src_stride = input_nb_ub[3] * idx_ne3 + input_nb_ub[2] * idx_ne2 - + input_nb_ub[1] * idx_ne1; - - // dst is contiguous - dst_stride = op_block_idx * (input_ne_ub[0] * sizeof(DST_T)); - - src_gm.SetGlobalBuffer(reinterpret_cast<__gm__ SRC_T *>(src + - src_stride)); - dst_gm.SetGlobalBuffer(reinterpret_cast<__gm__ DST_T *>(dst + - dst_stride)); - - pipe.InitBuffer(src_queue, BUFFER_NUM, (sizeof(SRC_T) * num_elem + - 32 - 1) / 32 * 32); - pipe.InitBuffer(dst_queue, BUFFER_NUM, (sizeof(DST_T) * num_elem + - 32 - 1) / 32 * 32); - } - - __aicore__ inline void copy_in() { - LocalTensor src_local = src_queue.AllocTensor(); - const size_t elem_per_block = 32 / sizeof(SRC_T); - size_t tail = num_elem % elem_per_block; - size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem; - DataCopy(src_local, src_gm, cpy_elements_len); - src_queue.EnQue(src_local); - } - - __aicore__ inline void copy_out() { - LocalTensor dst_local = dst_queue.DeQue(); -#ifdef ASCEND_310P - const size_t elem_per_block = 32 / sizeof(DST_T); - size_t tail = num_elem % elem_per_block; - size_t len = num_elem & ~(elem_per_block - 1); - if (len > 0) { - DataCopy(dst_gm, dst_local, len); - } - if(tail != 0) { - for (size_t i = tail; i < elem_per_block; i++) { - dst_local[len + i].SetValue(0, 0); - } - SetAtomicAdd(); - DataCopy(dst_gm[len], dst_local[len], elem_per_block); - SetAtomicNone(); - } -#else - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = num_elem * sizeof(DST_T); - DataCopyPad(dst_gm, dst_local, dataCopyParams); -#endif - dst_queue.FreeTensor(dst_local); - } - - __aicore__ inline void dup() { - // main process, copy one row data from src to dst. - copy_in(); - - LocalTensor src_local = src_queue.DeQue(); - LocalTensor dst_local = dst_queue.AllocTensor(); - - int32_t BLOCK_NUM = 32 / sizeof(DST_T); - DataCopy(dst_local, src_local, (num_elem + BLOCK_NUM - 1) - / BLOCK_NUM * BLOCK_NUM); - dst_queue.EnQue(dst_local); - - src_queue.FreeTensor(src_local); - copy_out(); - } - - __aicore__ inline void dup_with_cast() { - // main process, copy one row data from src to dst. - // cast dtype from src to dst. 
- copy_in(); - - LocalTensor src_local = src_queue.DeQue(); - LocalTensor dst_local = dst_queue.AllocTensor(); - - Cast(dst_local, src_local, RoundMode::CAST_NONE, num_elem); - dst_queue.EnQue(dst_local); - - src_queue.FreeTensor(src_local); - copy_out(); - } - - private: - - TPipe pipe; - GlobalTensor src_gm; - GlobalTensor dst_gm; - - int64_t num_rows; - int64_t num_elem; - int64_t idx_ne3; - int64_t idx_ne2; - int64_t idx_ne1; - int64_t src_stride; - int64_t dst_stride; - - TQue src_queue; - TQue dst_queue; -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16( - GM_ADDR src_gm, - GM_ADDR dst_gm, - GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, - GM_ADDR output_ne_gm, - GM_ADDR output_nb_gm) { - - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - DupByRows op; - op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub); - op.dup(); -} - -extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32( - GM_ADDR src_gm, - GM_ADDR dst_gm, - GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, - GM_ADDR output_ne_gm, - GM_ADDR output_nb_gm) { - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - DupByRows op; - op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub); - op.dup(); -} - -extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32_to_fp16( - GM_ADDR src_gm, - GM_ADDR dst_gm, - GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, - GM_ADDR output_ne_gm, - GM_ADDR output_nb_gm) { - - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - DupByRows op; - op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub); - op.dup_with_cast(); -} - -extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16_to_fp32( - GM_ADDR src_gm, - GM_ADDR dst_gm, - GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, - GM_ADDR output_ne_gm, - GM_ADDR output_nb_gm) { - - // copy params from gm to ub. - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - DupByRows op; - op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub); - op.dup_with_cast(); -} diff --git a/ggml/src/ggml-cann/kernels/get_row_f16.cpp b/ggml/src/ggml-cann/kernels/get_row_f16.cpp deleted file mode 100644 index 416b45104..000000000 --- a/ggml/src/ggml-cann/kernels/get_row_f16.cpp +++ /dev/null @@ -1,197 +0,0 @@ -#include "kernel_operator.h" - -// optimize me. Use template to avoid copy code. 
-using namespace AscendC; - -#define BUFFER_NUM 2 - -class GET_ROW_F16 { - public: - __aicore__ inline GET_ROW_F16() {} - __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output, - int64_t *input_ne_ub, size_t *input_nb_ub, - int64_t *indices_ne_ub, size_t *indices_nb_ub, - int64_t *output_ne_ub, size_t *output_nb_ub) { - // TODO, use template for F16/f32 - int64_t op_block_num = GetBlockNum(); - op_block_idx = GetBlockIdx(); - - for (int i = 0; i < 4; i++) { - input_ne[i] = input_ne_ub[i]; - input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; - - indices_ne[i] = indices_ne_ub[i]; - indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0]; - - output_ne[i] = output_ne_ub[i]; - output_stride[i] = output_nb_ub[i] / output_nb_ub[0]; - } - - // Indices has two dims. n_elements = all rows should get. - // dr, all rows should this thread get. - uint64_t n_elements = - indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3]; - dr = n_elements / op_block_num; - - uint64_t tails = n_elements % op_block_num; - if (op_block_idx < tails) { - dr += 1; - ir = dr * op_block_idx; - } else { - ir = dr * op_block_idx + tails; - } - - input_gm.SetGlobalBuffer((__gm__ half *)input); - indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices); - output_gm.SetGlobalBuffer((__gm__ float *)output); - - uint64_t input_local_buffer_size = ((input_ne[0] * sizeof(half) + 31) - & ~31); - uint64_t output_local_buffer_size = ((input_ne[0] * sizeof(float) + 31) - & ~31); - - local_buffer_elems = input_local_buffer_size / sizeof(half); - - // TODO, consider long row that can't put in UB. - // All data should asign to 32. It's ok because all data is align to 32. - pipe.InitBuffer(input_queue, BUFFER_NUM, input_local_buffer_size); - pipe.InitBuffer(output_queue, BUFFER_NUM, output_local_buffer_size); - } - - __aicore__ inline void copy_in(uint32_t offset, size_t len) { - size_t origin_len = len; - LocalTensor input_local = input_queue.AllocTensor(); - const size_t elem_per_block = 32 / sizeof(half); - size_t tail = len % elem_per_block; - len = len & ~(elem_per_block - 1); - if(tail != 0) { - len += elem_per_block; - } - DataCopy(input_local, input_gm[offset], len); - input_queue.EnQue(input_local); - } - - __aicore__ inline void copy_out(uint32_t offset, size_t len) { - LocalTensor output_local = output_queue.DeQue(); - const size_t elem_per_block = 32 / sizeof(float); - size_t tail = len % elem_per_block; - len = len & ~(elem_per_block - 1); - if (len > 0) { - DataCopy(output_gm[offset], output_local, len); - } - - if(tail != 0) { -#ifdef ASCEND_310P - for (size_t i = tail; i < elem_per_block; i++) { - output_local[len + i].SetValue(0, 0); - } - SetAtomicAdd(); - DataCopy(output_gm[offset + len], output_local[len], elem_per_block); - SetAtomicNone(); -#else - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = tail * sizeof(float); - DataCopyPad(output_gm[offset + len], output_local[len], - dataCopyParams); -#endif - } - output_queue.FreeTensor(output_local); - } - - __aicore__ inline void calculate_row(int64_t idx) { - const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]); - const int64_t indices_ne1_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) / - indices_ne[0]; - const int64_t indices_ne0_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] - - indices_ne1_idx * indices_ne[0]); - - const int64_t indices_offset = indices_ne0_idx * indices_stride[0] + - indices_ne1_idx * indices_stride[1] + - indices_ne2_idx * 
indices_stride[2]; - const int32_t selected_row_idx = indices_gm.GetValue(indices_offset); - - const int64_t input_offset = selected_row_idx * input_stride[1] + - indices_ne1_idx * input_stride[2] + - indices_ne2_idx * input_stride[3]; - - const int64_t output_offset = indices_ne0_idx * output_stride[1] + - indices_ne1_idx * output_stride[2] + - indices_ne2_idx * output_stride[3]; - - copy_in(input_offset, input_ne[0]); - LocalTensor input_local = input_queue.DeQue(); - LocalTensor output_local = output_queue.AllocTensor(); - - Cast(output_local, input_local, RoundMode::CAST_NONE, - local_buffer_elems); - output_queue.EnQue(output_local); - copy_out(output_offset, input_ne[0]); - - input_queue.FreeTensor(input_local); - } - - __aicore__ inline void calculate() { - for (int64_t i = ir; i < ir + dr; i++) { - calculate_row(i); - } - } - - private: - int64_t input_ne[4]; - size_t input_stride[4]; - - int64_t indices_ne[4]; - size_t indices_stride[4]; - - int64_t output_ne[4]; - size_t output_stride[4]; - - size_t local_buffer_elems; - - int64_t ir; - int64_t dr; - - TPipe pipe; - GlobalTensor input_gm; - GlobalTensor indices_gm; - GlobalTensor output_gm; - TQue input_queue; - TQue output_queue; - int64_t op_block_idx; -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_get_row_f16( - GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, - GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm, - GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t indices_ne_ub[4]; - size_t indices_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(indices_ne_gm, indices_ne_ub, 32); - copy_to_ub(indices_nb_gm, indices_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - GET_ROW_F16 op; - op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub, - indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub); - op.calculate(); -} diff --git a/ggml/src/ggml-cann/kernels/get_row_f32.cpp b/ggml/src/ggml-cann/kernels/get_row_f32.cpp deleted file mode 100644 index 02116905b..000000000 --- a/ggml/src/ggml-cann/kernels/get_row_f32.cpp +++ /dev/null @@ -1,190 +0,0 @@ -#include "kernel_operator.h" - -// optimize me. Use template to avoid copy code. -using namespace AscendC; - -#define BUFFER_NUM 2 - -class GET_ROW_F32 { - public: - __aicore__ inline GET_ROW_F32() {} - __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output, - int64_t *input_ne_ub, size_t *input_nb_ub, - int64_t *indices_ne_ub, size_t *indices_nb_ub, - int64_t *output_ne_ub, size_t *output_nb_ub) { - int64_t op_block_num = GetBlockNum(); - op_block_idx = GetBlockIdx(); - - for (int i = 0; i < 4; i++) { - input_ne[i] = input_ne_ub[i]; - input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; - - indices_ne[i] = indices_ne_ub[i]; - indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0]; - - output_ne[i] = output_ne_ub[i]; - output_stride[i] = output_nb_ub[i] / output_nb_ub[0]; - } - - // Indices has two dims. n_elements = all rows should get. - // dr, all rows should this thread get. 
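This split (the same pattern repeated in every kernel in these files) hands each AI core an even share of the rows, with the first n_elements % op_block_num cores taking one extra row. A standalone host-side sketch of the arithmetic in plain C++, with illustrative names that are not part of the kernel API:

    #include <cstdint>
    #include <cstdio>

    // Block `idx` out of `num` handles `dr` rows starting at row `ir`,
    // mirroring the dr/ir computation in the init() routines.
    static void split_rows(int64_t n_rows, int64_t num, int64_t idx,
                           int64_t & dr, int64_t & ir) {
        dr = n_rows / num;
        const int64_t tails = n_rows % num;
        if (idx < tails) {
            dr += 1;                // this block takes one of the leftover rows
            ir  = dr * idx;         // earlier blocks were also one row larger
        } else {
            ir  = dr * idx + tails; // earlier blocks absorbed all `tails` extras
        }
    }

    int main() {
        // e.g. 10 rows over 4 blocks -> sizes 3,3,2,2 starting at 0,3,6,8
        for (int64_t idx = 0; idx < 4; ++idx) {
            int64_t dr = 0, ir = 0;
            split_rows(10, 4, idx, dr, ir);
            std::printf("block %lld: ir=%lld dr=%lld\n",
                        (long long) idx, (long long) ir, (long long) dr);
        }
    }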
- uint64_t n_elements = - indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3]; - dr = n_elements / op_block_num; - - uint64_t tails = n_elements % op_block_num; - if (op_block_idx < tails) { - dr += 1; - ir = dr * op_block_idx; - } else { - ir = dr * op_block_idx + tails; - } - - input_gm.SetGlobalBuffer((__gm__ float *)input); - indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices); - output_gm.SetGlobalBuffer((__gm__ float *)output); - - uint64_t local_buffer_size = ((input_ne[0] * sizeof(float) + 31) & ~31); - local_buffer_elems = local_buffer_size / sizeof(float); - - // TODO, consider long row that can't put in UB. - // All data should asign to 32. It's ok because all data is align to 32. - pipe.InitBuffer(input_queue, BUFFER_NUM, local_buffer_size); - pipe.InitBuffer(output_queue, BUFFER_NUM, local_buffer_size); - } - - __aicore__ inline void copy_in(uint32_t offset, size_t len) { - LocalTensor input_local = input_queue.AllocTensor(); - const size_t elem_per_block = 32 / sizeof(float); - size_t tail = len % elem_per_block; - len = len & ~(elem_per_block - 1); - if(tail != 0) { - len += elem_per_block; - } - DataCopy(input_local, input_gm[offset], len); - input_queue.EnQue(input_local); - } - - __aicore__ inline void copy_out(uint32_t offset, size_t len) { - LocalTensor output_local = output_queue.DeQue(); - const size_t elem_per_block = 32 / sizeof(float); - size_t tail = len % elem_per_block; - len = len & ~(elem_per_block - 1); - if (len > 0) { - DataCopy(output_gm[offset], output_local, len); - } - - if(tail != 0) { -#ifdef ASCEND_310P - for (size_t i = tail; i < elem_per_block; i++) { - output_local[len + i].SetValue(0, 0); - } - SetAtomicAdd(); - DataCopy(output_gm[offset + len], output_local[len], elem_per_block); - SetAtomicNone(); -#else - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = tail * sizeof(float); - DataCopyPad(output_gm[offset + len], output_local[len], - dataCopyParams); -#endif - } - output_queue.FreeTensor(output_local); - } - - __aicore__ inline void calculate_row(int64_t idx) { - const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]); - const int64_t indices_ne1_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) / - indices_ne[0]; - const int64_t indices_ne0_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] - - indices_ne1_idx * indices_ne[0]); - - const int64_t indices_offset = indices_ne0_idx * indices_stride[0] + - indices_ne1_idx * indices_stride[1] + - indices_ne2_idx * indices_stride[2]; - const int32_t selected_row_idx = indices_gm.GetValue(indices_offset); - - const int64_t input_offset = selected_row_idx * input_stride[1] + - indices_ne1_idx * input_stride[2] + - indices_ne2_idx * input_stride[3]; - - const int64_t output_offset = indices_ne0_idx * output_stride[1] + - indices_ne1_idx * output_stride[2] + - indices_ne2_idx * output_stride[3]; - - copy_in(input_offset, input_ne[0]); - LocalTensor input_local = input_queue.DeQue(); - LocalTensor output_local = output_queue.AllocTensor(); - - DataCopy(output_local, input_local, local_buffer_elems); - output_queue.EnQue(output_local); - copy_out(output_offset, input_ne[0]); - - input_queue.FreeTensor(input_local); - } - - __aicore__ inline void calculate() { - for (int64_t i = ir; i < ir + dr; i++) { - calculate_row(i); - } - } - - private: - int64_t input_ne[4]; - size_t input_stride[4]; - - int64_t indices_ne[4]; - size_t indices_stride[4]; - - int64_t output_ne[4]; - size_t output_stride[4]; - - size_t 
local_buffer_elems; - - int64_t ir; - int64_t dr; - - TPipe pipe; - GlobalTensor input_gm; - GlobalTensor indices_gm; - GlobalTensor output_gm; - TQue input_queue; - TQue output_queue; - int64_t op_block_idx; -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_get_row_f32( - GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, - GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm, - GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t indices_ne_ub[4]; - size_t indices_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(indices_ne_gm, indices_ne_ub, 32); - copy_to_ub(indices_nb_gm, indices_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - GET_ROW_F32 op; - op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub, - indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub); - op.calculate(); -} diff --git a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp deleted file mode 100644 index 4fbe72208..000000000 --- a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +++ /dev/null @@ -1,204 +0,0 @@ -#include "kernel_operator.h" - -// optimize me. Use template to avoid copy code. -using namespace AscendC; -#ifdef ASCEND_310P // 310P not support 4bit get row - extern "C" __global__ __aicore__ void ascendc_get_row_q4_0( - GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, - GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm, - GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { - // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. - printf("Ascend310P not support 4bit get row.\n"); - } -#else - -#define BUFFER_NUM 2 - -#define QK4_0 32 - -class GET_ROW_Q4_0 { - public: - __aicore__ inline GET_ROW_Q4_0() {} - __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output, - int64_t *input_ne_ub, int64_t *indices_ne_ub, - size_t *indices_nb_ub, int64_t *output_ne_ub, - size_t *output_nb_ub) { - int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); - - for (int i = 0; i < 4; i++) { - input_ne[i] = input_ne_ub[i]; - indices_ne[i] = indices_ne_ub[i]; - indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0]; - scale_ne[i] = input_ne_ub[i]; - output_ne[i] = output_ne_ub[i]; - output_stride[i] = output_nb_ub[i] / output_nb_ub[0]; - } - - // one scale for a group. - scale_ne[0] /= QK4_0; - - input_stride[0] = 1; - scale_stride[0] = 1; - output_stride[0] = 1; - for (int i = 1; i < 4; i++) { - input_stride[i] = input_stride[i - 1] * input_ne[i - 1]; - scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; - } - - group_size_in_row = input_ne[0] / QK4_0; - int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] * - input_ne[3] / 2; - - // Indices has two dims. n_elements = all rows should get. - // dr, all rows should this thread get. 
- uint64_t n_elements = - indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3]; - dr = n_elements / op_block_num; - - uint64_t tails = n_elements % op_block_num; - if (op_block_idx < tails) { - dr += 1; - ir = dr * op_block_idx; - } else { - ir = dr * op_block_idx + tails; - } - - input_gm.SetGlobalBuffer((__gm__ int4b_t *)input); - scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset)); - indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices); - output_gm.SetGlobalBuffer((__gm__ float *)output); - - pipe.InitBuffer(input_queue, BUFFER_NUM, QK4_0 * sizeof(int4b_t)); - pipe.InitBuffer(cast_queue, BUFFER_NUM, QK4_0 * sizeof(half)); - pipe.InitBuffer(output_queue, BUFFER_NUM, QK4_0 * sizeof(float)); - } - - __aicore__ inline void copy_in(uint32_t offset) { - LocalTensor input_local = input_queue.AllocTensor(); - // 32 * sizeof(int4b_t) = 16, which is not aligned to 32, why no error? - DataCopy(input_local, input_gm[offset], QK4_0); - input_queue.EnQue(input_local); - } - - __aicore__ inline void copy_out(uint32_t offset) { - LocalTensor output_local = output_queue.DeQue(); - DataCopy(output_gm[offset], output_local, QK4_0); - output_queue.FreeTensor(output_local); - } - - __aicore__ inline void calculate_group(int64_t idx, int64_t group) { - const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]); - const int64_t indices_ne1_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) / - indices_ne[0]; - const int64_t indices_ne0_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] - - indices_ne1_idx * indices_ne[0]); - - const int64_t indices_offset = indices_ne0_idx * indices_stride[0] + - indices_ne1_idx * indices_stride[1] + - indices_ne2_idx * indices_stride[2]; - const int32_t selected_row_idx = indices_gm.GetValue(indices_offset); - - const int64_t input_offset = selected_row_idx * input_stride[1] + - indices_ne1_idx * input_stride[2] + - indices_ne2_idx * input_stride[3] + - group * QK4_0; - const int64_t scale_offset = selected_row_idx * scale_stride[1] + - indices_ne1_idx * scale_stride[2] + - indices_ne2_idx * scale_stride[3] + group; - const int64_t output_offset = indices_ne0_idx * output_stride[1] + - indices_ne1_idx * output_stride[2] + - indices_ne2_idx * output_stride[3] + - group * QK4_0; - - copy_in(input_offset); - LocalTensor input_local = input_queue.DeQue(); - LocalTensor cast_local = cast_queue.AllocTensor(); - LocalTensor output_local = output_queue.AllocTensor(); - - // TODO: cast more data to speed up. - Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0); - Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0); - - // Only mul need compile by group. 
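The two casts above and the scale multiply below amount to a per-group dequantization. A plain C++ reference for a single Q4_0 group, assuming the 4-bit values have already been unpacked to signed integers in [-8, 7] (this is a sketch, not the device code path):

    #include <cstdint>

    // Reference dequantization of one Q4_0 group: 32 signed 4-bit values
    // (unpacked here to int8_t) scaled by one per-group scale, which the
    // kernel stores as half but is passed as float for simplicity.
    static void dequant_q4_0_group(const int8_t q[32], float scale, float out[32]) {
        for (int i = 0; i < 32; ++i) {
            out[i] = scale * (float) q[i];
        }
    }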
- half scale = scale_gm.GetValue(scale_offset); - - Muls(output_local, output_local, (float)scale, QK4_0); - - input_queue.FreeTensor(input_local); - cast_queue.FreeTensor(cast_local); - output_queue.EnQue(output_local); - - copy_out(output_offset); - } - - __aicore__ inline void calculate() { - for (int64_t i = ir; i < ir + dr; i++) { - for (int64_t j = 0; j < group_size_in_row; j++) { - calculate_group(i, j); - } - } - } - - private: - int64_t input_ne[4]; - size_t input_stride[4]; - - int64_t scale_ne[4]; - size_t scale_stride[4]; - - int64_t indices_ne[4]; - size_t indices_stride[4]; - - int64_t output_ne[4]; - size_t output_stride[4]; - - int64_t ir; - int64_t dr; - - int64_t group_size_in_row; - - TPipe pipe; - GlobalTensor input_gm; - GlobalTensor scale_gm; - GlobalTensor indices_gm; - GlobalTensor output_gm; - TQue input_queue; - TQue output_queue; - TQue cast_queue; -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_get_row_q4_0( - GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, - GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm, - GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { - int64_t input_ne_ub[4]; - int64_t indices_ne_ub[4]; - size_t indices_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(indices_ne_gm, indices_ne_ub, 32); - copy_to_ub(indices_nb_gm, indices_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - GET_ROW_Q4_0 op; - op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub, - indices_nb_ub, output_ne_ub, output_nb_ub); - op.calculate(); -} - -#endif // #ifdef ASCEND_310P diff --git a/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp b/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp deleted file mode 100644 index ba9ab3c04..000000000 --- a/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +++ /dev/null @@ -1,191 +0,0 @@ -#include "kernel_operator.h" - -// optimize me. Use template to avoid copy code. -using namespace AscendC; - -#define BUFFER_NUM 2 - -#define QK8_0 32 - -class GET_ROW_Q8_0 { - public: - __aicore__ inline GET_ROW_Q8_0() {} - __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output, - int64_t *input_ne_ub, int64_t *indices_ne_ub, - size_t *indices_nb_ub, int64_t *output_ne_ub, - size_t *output_nb_ub) { - int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); - - for (int i = 0; i < 4; i++) { - input_ne[i] = input_ne_ub[i]; - indices_ne[i] = indices_ne_ub[i]; - indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0]; - scale_ne[i] = input_ne_ub[i]; - output_ne[i] = output_ne_ub[i]; - output_stride[i] = output_nb_ub[i] / output_nb_ub[0]; - } - - // one scale for a group. - scale_ne[0] /= QK8_0; - - input_stride[0] = 1; - scale_stride[0] = 1; - output_stride[0] = 1; - for (int i = 1; i < 4; i++) { - input_stride[i] = input_stride[i - 1] * input_ne[i - 1]; - scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; - } - - group_size_in_row = input_ne[0] / QK8_0; - int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] * - input_ne[3] * sizeof(int8_t); - - // Indices has two dims. n_elements = all rows should get. - // dr, all rows should this thread get. 
- uint64_t n_elements = - indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3]; - dr = n_elements / op_block_num; - - uint64_t tails = n_elements % op_block_num; - if (op_block_idx < tails) { - dr += 1; - ir = dr * op_block_idx; - } else { - ir = dr * op_block_idx + tails; - } - - input_gm.SetGlobalBuffer((__gm__ int8_t *)input); - scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset)); - indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices); - output_gm.SetGlobalBuffer((__gm__ float *)output); - - pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t)); - pipe.InitBuffer(cast_queue, BUFFER_NUM, QK8_0 * sizeof(half)); - pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(float)); - } - - __aicore__ inline void copy_in(uint32_t offset) { - LocalTensor input_local = input_queue.AllocTensor(); - DataCopy(input_local, input_gm[offset], QK8_0); - input_queue.EnQue(input_local); - } - - __aicore__ inline void copy_out(uint32_t offset) { - LocalTensor output_local = output_queue.DeQue(); - DataCopy(output_gm[offset], output_local, QK8_0); - output_queue.FreeTensor(output_local); - } - - __aicore__ inline void calculate_group(int64_t idx, int64_t group) { - const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]); - const int64_t indices_ne1_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) / - indices_ne[0]; - const int64_t indices_ne0_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] - - indices_ne1_idx * indices_ne[0]); - - const int64_t indices_offset = indices_ne0_idx * indices_stride[0] + - indices_ne1_idx * indices_stride[1] + - indices_ne2_idx * indices_stride[2]; - const int32_t selected_row_idx = indices_gm.GetValue(indices_offset); - - const int64_t input_offset = selected_row_idx * input_stride[1] + - indices_ne1_idx * input_stride[2] + - indices_ne2_idx * input_stride[3] + - group * QK8_0; - const int64_t scale_offset = selected_row_idx * scale_stride[1] + - indices_ne1_idx * scale_stride[2] + - indices_ne2_idx * scale_stride[3] + group; - const int64_t output_offset = indices_ne0_idx * output_stride[1] + - indices_ne1_idx * output_stride[2] + - indices_ne2_idx * output_stride[3] + - group * QK8_0; - - copy_in(input_offset); - LocalTensor input_local = input_queue.DeQue(); - LocalTensor cast_local = cast_queue.AllocTensor(); - LocalTensor output_local = output_queue.AllocTensor(); - - // TODO: cast more data to speed up. - Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0); - Cast(output_local, cast_local, RoundMode::CAST_NONE, QK8_0); - - // Only mul need compile by group. 
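The scale fetched just below comes from a region that both quantized get-row kernels place immediately after the packed values, as computed by scale_offset in their init() routines. The assumed byte offsets, stated as small C++ helpers for illustration:

    #include <cstddef>
    #include <cstdint>

    // Byte offset of the first per-group half scale for a tensor of n quantized
    // elements, matching the scale_offset arithmetic in the init() routines above.
    static size_t q4_0_scale_offset(size_t n_elems) { return n_elems / 2; }              // 4 bits per value
    static size_t q8_0_scale_offset(size_t n_elems) { return n_elems * sizeof(int8_t); } // 1 byte per value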
- half scale = scale_gm.GetValue(scale_offset); - Muls(output_local, output_local, (float)scale, QK8_0); - - input_queue.FreeTensor(input_local); - cast_queue.FreeTensor(cast_local); - output_queue.EnQue(output_local); - - copy_out(output_offset); - } - - __aicore__ inline void calculate() { - for (int64_t i = ir; i < ir + dr; i++) { - for (int64_t j = 0; j < group_size_in_row; j++) { - calculate_group(i, j); - } - } - } - - private: - int64_t input_ne[4]; - size_t input_stride[4]; - - int64_t scale_ne[4]; - size_t scale_stride[4]; - - int64_t indices_ne[4]; - size_t indices_stride[4]; - - int64_t output_ne[4]; - size_t output_stride[4]; - - int64_t ir; - int64_t dr; - - int64_t group_size_in_row; - - TPipe pipe; - GlobalTensor input_gm; - GlobalTensor scale_gm; - GlobalTensor indices_gm; - GlobalTensor output_gm; - TQue input_queue; - TQue output_queue; - TQue cast_queue; -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_get_row_q8_0( - GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, - GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm, - GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { - int64_t input_ne_ub[4]; - int64_t indices_ne_ub[4]; - size_t indices_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(indices_ne_gm, indices_ne_ub, 32); - copy_to_ub(indices_nb_gm, indices_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - GET_ROW_Q8_0 op; - op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub, - indices_nb_ub, output_ne_ub, output_nb_ub); - op.calculate(); -} diff --git a/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp b/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp deleted file mode 100644 index 504b43afa..000000000 --- a/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +++ /dev/null @@ -1,218 +0,0 @@ -#include "kernel_operator.h" - -using namespace AscendC; -#ifdef ASCEND_310P - extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. - printf("Ascend310P not support f16->8bit quantization.\n"); - } -#else - -#define BUFFER_NUM 2 -#define QK8_0 32 - -class QUANTIZE_F16_Q8_0 { - public: - __aicore__ inline QUANTIZE_F16_Q8_0() {} - __aicore__ inline void init(GM_ADDR input, GM_ADDR output, - int64_t *input_ne_ub, size_t *input_nb_ub, - int64_t *output_ne_ub) { - int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); - - for (int i = 0; i < 4; i++) { - input_ne[i] = input_ne_ub[i]; - input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; - - output_ne[i] = output_ne_ub[i]; - } - - output_stride[0] = 1; - for (int i = 1; i < 4; i++) { - output_stride[i] = output_stride[i - 1] * output_ne[i - 1]; - } - - scale_ne = input_ne; - scale_stride[0] = 1; - scale_stride[1] = input_ne[0] / QK8_0; - for (int i = 2; i < 4; i++) { - scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; - } - - // split input tensor by rows. 
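The quantization kernels split rows over cores the same way as the get-row kernels above, and each flat row index is later decoded back into (i1, i2, i3) coordinates inside calculate_group. A standalone C++ version of that decode, kept separate here purely as a sketch:

    #include <cstdint>

    // Decode a flat row index into (i1, i2, i3) for a tensor whose rows span
    // ne[1] x ne[2] x ne[3], matching the divisions used in calculate_group().
    static void unflatten_row(int64_t row, const int64_t ne[4],
                              int64_t & i1, int64_t & i2, int64_t & i3) {
        i3 = row / (ne[1] * ne[2]);
        i2 = (row - i3 * ne[1] * ne[2]) / ne[1];
        i1 =  row - i3 * ne[1] * ne[2] - i2 * ne[1];
    }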
- uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3]; - dr = nr / op_block_num; - - uint64_t tails = nr % op_block_num; - if (op_block_idx < tails) { - dr += 1; - ir = dr * op_block_idx; - } else { - ir = dr * op_block_idx + tails; - } - - group_size_in_row = scale_stride[1]; - int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] * - output_ne[3] * sizeof(uint8_t); - - input_gm.SetGlobalBuffer((__gm__ half *)input); - output_gm.SetGlobalBuffer((__gm__ int8_t *)output); - scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size + ir * - group_size_in_row * - sizeof(half))); - - pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(half)); - pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t)); - pipe.InitBuffer(work_queue, 1, 32); - pipe.InitBuffer(max_queue, 1, 32); - pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float)); - pipe.InitBuffer(scale_queue, 1, 32); - pipe.InitBuffer(cast_queue ,1 ,QK8_0 * sizeof(float)); - } - - __aicore__ inline void copy_in(uint32_t offset) { - LocalTensor input_local = input_queue.AllocTensor(); - DataCopy(input_local, input_gm[offset], QK8_0); - input_queue.EnQue(input_local); - } - - __aicore__ inline void copy_out(uint32_t offset) { - LocalTensor output_local = output_queue.DeQue(); - DataCopy(output_gm[offset], output_local, QK8_0); - output_queue.FreeTensor(output_local); - } - - __aicore__ inline half calculate_group(int64_t row, int64_t group) { - const int64_t i3 = row / (input_ne[1] * input_ne[2]); - const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1]; - const int64_t i1 = - row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1]; - - const int64_t input_offset = i1 * input_stride[1] + - i2 * input_stride[2] + - i3 * input_stride[3] + QK8_0 * group; - - const int64_t output_offset = i1 * output_stride[1] + - i2 * output_stride[2] + - i3 * output_stride[3] + QK8_0 * group; - - copy_in(input_offset); - LocalTensor input_local = input_queue.DeQue(); - LocalTensor output_local = output_queue.AllocTensor(); - LocalTensor work_local = work_queue.AllocTensor(); - LocalTensor abs_local = abs_queue.AllocTensor(); - LocalTensor max_local = max_queue.AllocTensor(); - LocalTensor cast_local = cast_queue.AllocTensor(); - - Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0); - Abs(abs_local, cast_local, QK8_0); - ReduceMax(max_local, abs_local, work_local, QK8_0); - - pipe_barrier(PIPE_ALL); - float d = max_local.GetValue(0); - d = d / ((1 << 7) - 1); - if (d != 0) { - Muls(cast_local, cast_local, 1.0f / d, QK8_0); - } - - Cast(cast_local, cast_local, RoundMode::CAST_ROUND, QK8_0); - Cast(input_local, cast_local, RoundMode::CAST_ROUND, QK8_0); - Cast(output_local, input_local, RoundMode::CAST_ROUND, QK8_0); - output_queue.EnQue(output_local); - copy_out(output_offset); - - input_queue.FreeTensor(input_local); - work_queue.FreeTensor(work_local); - abs_queue.FreeTensor(abs_local); - max_queue.FreeTensor(max_local); - cast_queue.FreeTensor(cast_local); - return (half)d; - } - - __aicore__ inline void calculate() { - LocalTensor scale_local = scale_queue.AllocTensor(); - uint32_t scale_local_offset = 0; - uint32_t scale_global_offset = 0; - for (int64_t i = ir; i < ir + dr; i++) { - for (int64_t j = 0; j < group_size_in_row; j++) { - half scale = calculate_group(i, j); - scale_local.SetValue(scale_local_offset++, scale); - if (scale_local_offset == 16) { - scale_local_offset = 0; - // TODO: OPTIMIZE ME - pipe_barrier(PIPE_ALL); - DataCopy(scale_gm[scale_global_offset], scale_local, 16); - 
pipe_barrier(PIPE_ALL); - scale_global_offset += 16; - } - } - } - - if (scale_local_offset != 0) { - pipe_barrier(PIPE_ALL); - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = scale_local_offset * sizeof(half); - DataCopyPad(scale_gm[scale_global_offset], scale_local, - dataCopyParams); - pipe_barrier(PIPE_ALL); - } - } - - private: - int64_t input_ne[4]; - size_t input_stride[4]; - - int64_t *scale_ne; - size_t scale_stride[4]; - - int64_t output_ne[4]; - size_t output_stride[4]; - - int64_t group_size_in_row; - - int64_t ir; - int64_t dr; - - TPipe pipe; - GlobalTensor input_gm; - GlobalTensor scale_gm; - GlobalTensor output_gm; - TQue input_queue; - TQue output_queue; - TQue work_queue; - TQue max_queue; - TQue abs_queue; - TQue scale_queue; - TQue cast_queue; - -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - - QUANTIZE_F16_Q8_0 op; - op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); - op.calculate(); -} - -#endif // #ifdef ASCEND_310P diff --git a/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp b/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp deleted file mode 100644 index 05b0bc1df..000000000 --- a/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +++ /dev/null @@ -1,216 +0,0 @@ -#include "kernel_operator.h" - -using namespace AscendC; -#ifdef ASCEND_310P // 310P not support f32->8bit quantization - extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. - printf("Ascend310P not support f32->8bit quantization.\n"); - } -#else - -#define BUFFER_NUM 2 -#define QK8_0 32 - -class QUANTIZE_F32_Q8_0 { - public: - __aicore__ inline QUANTIZE_F32_Q8_0() {} - __aicore__ inline void init(GM_ADDR input, GM_ADDR output, - int64_t *input_ne_ub, size_t *input_nb_ub, - int64_t *output_ne_ub) { - int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); - - for (int i = 0; i < 4; i++) { - input_ne[i] = input_ne_ub[i]; - input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; - - output_ne[i] = output_ne_ub[i]; - } - - output_stride[0] = 1; - for (int i = 1; i < 4; i++) { - output_stride[i] = output_stride[i - 1] * output_ne[i - 1]; - } - - scale_ne = input_ne; - scale_stride[0] = 1; - scale_stride[1] = input_ne[0] / QK8_0; - for (int i = 2; i < 4; i++) { - scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; - } - - // split input tensor by rows. 
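The per-group quantization used by both Q8_0 kernels (the f16 variant above, the f32 variant continuing below) reduces each 32-value group to int8 plus one half scale. A plain C++ reference of that math, offered as a host-side sketch rather than the device implementation:

    #include <cmath>
    #include <cstdint>

    // Reference Q8_0 quantization of one 32-element group:
    // d = max(|x|)/127, q[i] = round(x[i]/d); a zero group yields d = 0 and all-zero q.
    static float quantize_q8_0_group(const float x[32], int8_t q[32]) {
        float amax = 0.0f;
        for (int i = 0; i < 32; ++i) {
            amax = std::fmax(amax, std::fabs(x[i]));
        }
        const float d  = amax / 127.0f;
        const float id = d != 0.0f ? 1.0f / d : 0.0f;
        for (int i = 0; i < 32; ++i) {
            q[i] = (int8_t) std::lround(x[i] * id);
        }
        return d; // stored as a half next to the quantized data
    }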
- uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3]; - dr = nr / op_block_num; - - uint64_t tails = nr % op_block_num; - if (op_block_idx < tails) { - dr += 1; - ir = dr * op_block_idx; - } else { - ir = dr * op_block_idx + tails; - } - - group_size_in_row = scale_stride[1]; - int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] * - output_ne[3] * sizeof(uint8_t); - - input_gm.SetGlobalBuffer((__gm__ float *)input); - output_gm.SetGlobalBuffer((__gm__ int8_t *)output); - scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size + - ir * group_size_in_row * - sizeof(half))); - - pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(float)); - pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t)); - pipe.InitBuffer(work_queue, 1, 32); - pipe.InitBuffer(max_queue, 1, 32); - pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float)); - pipe.InitBuffer(cast_queue, 1, QK8_0 * sizeof(half)); - pipe.InitBuffer(scale_queue, 1, 32); - } - - __aicore__ inline void copy_in(uint32_t offset) { - LocalTensor input_local = input_queue.AllocTensor(); - DataCopy(input_local, input_gm[offset], QK8_0); - input_queue.EnQue(input_local); - } - - __aicore__ inline void copy_out(uint32_t offset) { - LocalTensor output_local = output_queue.DeQue(); - DataCopy(output_gm[offset], output_local, QK8_0); - output_queue.FreeTensor(output_local); - } - - __aicore__ inline half calculate_group(int64_t row, int64_t group) { - const int64_t i3 = row / (input_ne[1] * input_ne[2]); - const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1]; - const int64_t i1 = - row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1]; - - const int64_t input_offset = i1 * input_stride[1] + - i2 * input_stride[2] + - i3 * input_stride[3] + QK8_0 * group; - - const int64_t output_offset = i1 * output_stride[1] + - i2 * output_stride[2] + - i3 * output_stride[3] + QK8_0 * group; - - copy_in(input_offset); - LocalTensor input_local = input_queue.DeQue(); - LocalTensor output_local = output_queue.AllocTensor(); - LocalTensor work_local = work_queue.AllocTensor(); - LocalTensor abs_local = abs_queue.AllocTensor(); - LocalTensor max_local = max_queue.AllocTensor(); - LocalTensor cast_local = cast_queue.AllocTensor(); - - Abs(abs_local, input_local, QK8_0); - ReduceMax(max_local, abs_local, work_local, QK8_0); - pipe_barrier(PIPE_ALL); - float d = max_local.GetValue(0); - d = d / ((1 << 7) - 1); - if (d != 0) { - Muls(input_local, input_local, 1.0f / d, QK8_0); - } - - Cast(input_local, input_local, RoundMode::CAST_ROUND, QK8_0); - Cast(cast_local, input_local, RoundMode::CAST_ROUND, QK8_0); - Cast(output_local, cast_local, RoundMode::CAST_ROUND, QK8_0); - output_queue.EnQue(output_local); - copy_out(output_offset); - - input_queue.FreeTensor(input_local); - work_queue.FreeTensor(work_local); - abs_queue.FreeTensor(abs_local); - max_queue.FreeTensor(max_local); - cast_queue.FreeTensor(cast_local); - - return (half)d; - } - - __aicore__ inline void calculate() { - LocalTensor scale_local = scale_queue.AllocTensor(); - uint32_t scale_local_offset = 0; - uint32_t scale_global_offset = 0; - for (int64_t i = ir; i < ir + dr; i++) { - for (int64_t j = 0; j < group_size_in_row; j++) { - half scale = calculate_group(i, j); - scale_local.SetValue(scale_local_offset++, scale); - if (scale_local_offset == 16) { - scale_local_offset = 0; - // TODO: OPTIMIZE ME - pipe_barrier(PIPE_ALL); - DataCopy(scale_gm[scale_global_offset], scale_local, 16); - pipe_barrier(PIPE_ALL); - scale_global_offset += 16; - } - } - } - - if 
(scale_local_offset != 0) { - pipe_barrier(PIPE_ALL); - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = scale_local_offset * sizeof(half); - DataCopyPad(scale_gm[scale_global_offset], scale_local, - dataCopyParams); - pipe_barrier(PIPE_ALL); - } - } - - private: - int64_t input_ne[4]; - size_t input_stride[4]; - - int64_t *scale_ne; - size_t scale_stride[4]; - - int64_t output_ne[4]; - size_t output_stride[4]; - - int64_t group_size_in_row; - - int64_t ir; - int64_t dr; - - TPipe pipe; - GlobalTensor input_gm; - GlobalTensor scale_gm; - GlobalTensor output_gm; - TQue input_queue; - TQue output_queue; - TQue work_queue; - TQue max_queue; - TQue abs_queue; - TQue cast_queue; - TQue scale_queue; -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - - QUANTIZE_F32_Q8_0 op; - op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); - op.calculate(); -} - -#endif // #ifdef ASCEND_310P diff --git a/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp b/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp deleted file mode 100644 index 1188937b7..000000000 --- a/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +++ /dev/null @@ -1,295 +0,0 @@ -#include "kernel_operator.h" - -using namespace AscendC; -#ifdef ASCEND_310P // 310P not support float->4bit quantization - extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. - printf("Ascend310P not support f32->4bit quantization.\n"); - } - - extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. 
- printf("Ascend310P not support f16->4bit quantization.\n"); - } -#else - -#define BUFFER_NUM 2 -#define Group_Size 32 - -template -class QUANTIZE_FLOAT_TO_Q4_0 { - public: - __aicore__ inline QUANTIZE_FLOAT_TO_Q4_0() {} - __aicore__ inline void init(GM_ADDR input, GM_ADDR output, - int64_t *input_ne_ub, size_t *input_nb_ub, - int64_t *output_ne_ub) { - // TODO: fix test_case CPY(type_src=f16,type_dst=q4_0,ne=[256,4,4,4], - // permute=[0,0,0,0]): - // [CPY] NMSE = 0.000008343 > 0.000001000 FAIL - int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); - - // input stride of data elements - for (int i = 0; i < 4; i++) { - input_ne[i] = input_ne_ub[i]; - input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; - output_ne[i] = output_ne_ub[i]; - } - - // output stride of data elements - output_stride[0] = 1; - for (int i = 1; i < 4; i++) { - output_stride[i] = output_stride[i - 1] * output_ne[i - 1]; - } - - // scale saved one by one after data:. [group1_scale, group2_scale, ...] - scale_ne = input_ne; - scale_stride[0] = 1; - scale_stride[1] = input_ne[0] / Group_Size; - for (int i = 2; i < 4; i++) { - scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; - } - - // split input tensor by rows. - uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3]; - dr = nr / op_block_num; - - uint64_t tails = nr % op_block_num; - if (op_block_idx < tails) { - dr += 1; - ir = dr * op_block_idx; - } else { - ir = dr * op_block_idx + tails; - } - - group_size_in_row = scale_stride[1]; - int64_t scale_offset = output_ne[0] * output_ne[1] * output_ne[2] * - output_ne[3] * sizeof(uint8_t) / 2; - - input_gm.SetGlobalBuffer((__gm__ SRC_T *)input); - output_gm.SetGlobalBuffer((__gm__ int8_t *)output); - scale_gm.SetGlobalBuffer((__gm__ half *)(output + scale_offset + ir * - group_size_in_row * - sizeof(half))); - - pipe.InitBuffer(input_queue, BUFFER_NUM, Group_Size * sizeof(SRC_T)); - pipe.InitBuffer(output_queue, BUFFER_NUM, - Group_Size * sizeof(int8_t) / 2); - pipe.InitBuffer(cast_queue , 1, Group_Size * sizeof(float)); - pipe.InitBuffer(work_queue, 1, Group_Size * sizeof(float)); - pipe.InitBuffer(max_queue, 1, Group_Size * sizeof(float)); - pipe.InitBuffer(min_queue, 1, Group_Size * sizeof(float)); - pipe.InitBuffer(scale_queue, 1, Group_Size / 2 * sizeof(half)); - pipe.InitBuffer(int8_queue, 1, Group_Size * sizeof(int8_t)); - pipe.InitBuffer(half_queue, 1, Group_Size * sizeof(half)); - } - - __aicore__ inline void copy_in(uint32_t offset) { - LocalTensor input_local = input_queue.AllocTensor(); - DataCopy(input_local, input_gm[offset], Group_Size); - input_queue.EnQue(input_local); - } - - __aicore__ inline void copy_out(uint32_t offset) { - // reinterpretcast Group_Size(32) * int4b_t to Group_Size / 2 * int8_t, - // and using DataCopyPad to avoid 32 bits align. 
- LocalTensor output_local = output_queue.DeQue(); - LocalTensor output_int8_local = - output_local.ReinterpretCast(); - - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = Group_Size / 2 * sizeof(int8_t); - DataCopyPad(output_gm[offset], output_int8_local, dataCopyParams); - - output_queue.FreeTensor(output_local); - } - - __aicore__ inline void input_to_cast(LocalTensor cast_local, - LocalTensor input_local) { - DataCopy(cast_local, input_local, Group_Size); - } - - __aicore__ inline void input_to_cast(LocalTensor cast_local, - LocalTensor input_local) { - Cast(cast_local, input_local, RoundMode::CAST_NONE, Group_Size); - } - - __aicore__ inline half calculate_group(int64_t row, int64_t group) { - const int64_t i3 = row / (input_ne[1] * input_ne[2]); - const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1]; - const int64_t i1 = - row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1]; - - const int64_t input_offset = i1 * input_stride[1] + - i2 * input_stride[2] + - i3 * input_stride[3] + Group_Size * group; - - // output_offset is stride for output_gm which datatype is int8_t and - // divided by 2 is needed for int4b_t. - const int64_t output_offset = (i1 * output_stride[1] + - i2 * output_stride[2] + - i3 * output_stride[3] + - Group_Size * group) / 2; - copy_in(input_offset); - - LocalTensor input_local = input_queue.DeQue(); - LocalTensor output_local = output_queue.AllocTensor(); - LocalTensor cast_local = cast_queue.AllocTensor(); - LocalTensor work_local = work_queue.AllocTensor(); - LocalTensor max_local = max_queue.AllocTensor(); - LocalTensor min_local = min_queue.AllocTensor(); - LocalTensor int8_local = int8_queue.AllocTensor(); - LocalTensor half_local = half_queue.AllocTensor(); - - input_to_cast(cast_local, input_local); - - ReduceMax(max_local, cast_local, work_local, Group_Size); - ReduceMin(min_local, cast_local, work_local, Group_Size); - const float max_value = max_local.GetValue(0); - const float min_value = min_local.GetValue(0); - float d = max_value; - if (min_value < 0 && (-1 * min_value) > max_value) { - d = min_value; - } - - d = d / (-8); - if (d != 0) { - Muls(cast_local, cast_local, 1.0f / d, Group_Size); - } - - // range: [-8,8] -> [0.5,16.5] -> [0,16] -> [0,15] -> [-8,7] - float scalar = 8.5f; - Adds(cast_local, cast_local, scalar, Group_Size); - Cast(cast_local, cast_local, RoundMode::CAST_FLOOR, Group_Size); - scalar = 15.0f; - Mins(cast_local, cast_local, scalar, Group_Size); - scalar = -8.0f; - Adds(cast_local, cast_local, scalar, Group_Size); - - // float->half->int4b - Cast(half_local, cast_local, RoundMode::CAST_NONE, Group_Size); - Cast(output_local, half_local, RoundMode::CAST_NONE, Group_Size); - - output_queue.EnQue(output_local); - copy_out(output_offset); - - input_queue.FreeTensor(input_local); - work_queue.FreeTensor(work_local); - max_queue.FreeTensor(max_local); - min_queue.FreeTensor(min_local); - int8_queue.FreeTensor(int8_local); - half_queue.FreeTensor(half_local); - cast_queue.FreeTensor(cast_local); - return (half)d; - } - - __aicore__ inline void calculate() { - LocalTensor scale_local = scale_queue.AllocTensor(); - uint32_t scale_local_offset = 0; - uint32_t scale_global_offset = 0; - for (int64_t i = ir; i < ir + dr; i++) { - for (int64_t j = 0; j < group_size_in_row; j++) { - half scale = calculate_group(i, j); - scale_local.SetValue(scale_local_offset++, scale); - // Copy Group_Size/2 length data each time. 
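Before the per-group scales are flushed by the copy just below, each group has been quantized by the range mapping shown in calculate_group above ([-8,8] -> [0.5,16.5] -> [0,16] -> [0,15] -> [-8,7]). A host-side C++ reference of that mapping, using the same signed-maximum choice for the scale; this is a sketch, not the device code:

    #include <cmath>
    #include <cstdint>

    // Reference Q4_0 quantization of one 32-element group, mirroring the steps above:
    // pick the extreme value by magnitude (keeping its sign), d = extreme / -8,
    // then x/d in [-8,8] -> +8.5 -> floor -> clamp at 15 -> -8 gives nibbles in [-8,7].
    static float quantize_q4_0_group(const float x[32], int8_t q[32]) {
        float max_v = x[0], min_v = x[0];
        for (int i = 1; i < 32; ++i) {
            max_v = std::fmax(max_v, x[i]);
            min_v = std::fmin(min_v, x[i]);
        }
        float d = (min_v < 0.0f && -min_v > max_v) ? min_v : max_v;
        d /= -8.0f;
        const float id = d != 0.0f ? 1.0f / d : 0.0f;
        for (int i = 0; i < 32; ++i) {
            float v = std::floor(x[i] * id + 8.5f);
            if (v > 15.0f) v = 15.0f;
            q[i] = (int8_t) (v - 8.0f);
        }
        return d; // per-group scale, stored as half after the packed nibbles
    }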
- if (scale_local_offset == Group_Size / 2) { - scale_local_offset = 0; - // TODO: OPTIMIZE ME - pipe_barrier(PIPE_ALL); - DataCopy(scale_gm[scale_global_offset], scale_local, - Group_Size / 2); - pipe_barrier(PIPE_ALL); - scale_global_offset += Group_Size / 2; - } - } - } - - if (scale_local_offset != 0) { - pipe_barrier(PIPE_ALL); - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = scale_local_offset * sizeof(half); - DataCopyPad(scale_gm[scale_global_offset], scale_local, - dataCopyParams); - pipe_barrier(PIPE_ALL); - } - scale_queue.FreeTensor(scale_local); - } - - private: - int64_t input_ne[4]; - size_t input_stride[4]; - - int64_t *scale_ne; - size_t scale_stride[4]; - - int64_t output_ne[4]; - size_t output_stride[4]; - - int64_t group_size_in_row; - - int64_t ir; - int64_t dr; - - TPipe pipe; - GlobalTensor input_gm; - GlobalTensor scale_gm; - GlobalTensor output_gm; - TQue input_queue; - TQue output_queue; - TQue work_queue; - TQue max_queue; - TQue min_queue; - TQue scale_queue; - TQue cast_queue; - TQue int8_queue; - TQue half_queue; -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - - QUANTIZE_FLOAT_TO_Q4_0 op; - op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); - op.calculate(); -} - -extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - - QUANTIZE_FLOAT_TO_Q4_0 op; - op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); - op.calculate(); -} - -#endif // #ifdef ASCEND_310P From 42eb248f46e1175349e553b6eda6cb63027d74d1 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Wed, 2 Apr 2025 09:58:34 +0200 Subject: [PATCH 07/20] common : remove json.hpp from common.cpp (#12697) * common : remove json.hpp from common.cpp * fix comment --- common/common.cpp | 28 ---------------------------- common/common.h | 4 ---- examples/server/server.cpp | 11 ++++++----- examples/server/utils.hpp | 29 ++++++++++++++++++++++++++++- 4 files changed, 34 insertions(+), 38 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 22642c84a..e7269ead4 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -7,9 +7,6 @@ #include "common.h" #include "log.h" -// Change JSON_ASSERT from assert() to GGML_ASSERT: -#define JSON_ASSERT GGML_ASSERT -#include "json.hpp" #include "llama.h" #include @@ -56,8 +53,6 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -using json = nlohmann::ordered_json; - // // CPU utils // @@ -1545,26 +1540,3 @@ common_control_vector_data common_control_vector_load(const std::vector -json common_grammar_trigger::to_json() const { - json out { - {"type", (int) 
type}, - {"value", value}, - }; - if (type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) { - out["token"] = (int) token; - } - return out; -} - -template <> -common_grammar_trigger common_grammar_trigger::from_json(const json & in) { - common_grammar_trigger out; - out.type = (common_grammar_trigger_type) in.at("type").get(); - out.value = in.at("value").get(); - if (out.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) { - out.token = (llama_token) in.at("token").get(); - } - return out; -} diff --git a/common/common.h b/common/common.h index 41ff9905e..ea7aef99d 100644 --- a/common/common.h +++ b/common/common.h @@ -121,10 +121,6 @@ struct common_grammar_trigger { common_grammar_trigger_type type; std::string value; llama_token token = LLAMA_TOKEN_NULL; - - // T can only be nlohmann::ordered_json - template T to_json() const; - template static common_grammar_trigger from_json(const T & in); }; // sampling parameters diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d140f8c44..760c36464 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -133,7 +133,8 @@ struct slot_params { auto grammar_triggers = json::array(); for (const auto & trigger : sampling.grammar_triggers) { - grammar_triggers.push_back(trigger.to_json()); + server_grammar_trigger ct(std::move(trigger)); + grammar_triggers.push_back(ct.to_json()); } return json { @@ -372,9 +373,9 @@ struct server_task { const auto grammar_triggers = data.find("grammar_triggers"); if (grammar_triggers != data.end()) { for (const auto & t : *grammar_triggers) { - auto ct = common_grammar_trigger::from_json(t); - if (ct.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) { - const auto & word = ct.value; + server_grammar_trigger ct(t); + if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) { + const auto & word = ct.value.value; auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true); if (ids.size() == 1) { auto token = ids[0]; @@ -392,7 +393,7 @@ struct server_task { params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word}); } } else { - params.sampling.grammar_triggers.push_back(ct); + params.sampling.grammar_triggers.push_back(std::move(ct.value)); } } } diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 58cdd6af9..55cf3230d 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -58,6 +58,32 @@ static T json_value(const json & body, const std::string & key, const T & defaul const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT); +// thin wrapper around common_grammar_trigger with (de)serialization functions +struct server_grammar_trigger { + common_grammar_trigger value; + + server_grammar_trigger() = default; + server_grammar_trigger(const common_grammar_trigger & value) : value(value) {} + server_grammar_trigger(const json & in) { + value.type = (common_grammar_trigger_type) in.at("type").get(); + value.value = in.at("value").get(); + if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) { + value.token = (llama_token) in.at("token").get(); + } + } + + json to_json() const { + json out { + {"type", (int) value.type}, + {"value", value.value}, + }; + if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) { + out["token"] = (int) value.token; + } + return out; + } +}; + // // tokenizer and input processing utils // @@ -627,7 +653,8 @@ static json oaicompat_completion_params_parse( llama_params["grammar_lazy"] = chat_params.grammar_lazy; auto grammar_triggers = json::array(); 
for (const auto & trigger : chat_params.grammar_triggers) { - grammar_triggers.push_back(trigger.to_json()); + server_grammar_trigger ct(trigger); + grammar_triggers.push_back(ct.to_json()); } llama_params["grammar_triggers"] = grammar_triggers; llama_params["preserved_tokens"] = chat_params.preserved_tokens; From 83a88bd6affbe148a622ac730952ac5b8b585979 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Wed, 2 Apr 2025 11:21:48 +0200 Subject: [PATCH 08/20] vocab : BailingMoE : change possessive quantifiers to greedy (#12677) --- src/llama-vocab.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 31e2055f8..521a6ec5e 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -411,7 +411,8 @@ struct llm_tokenizer_bpe : llm_tokenizer { regex_exprs = { // original regex from tokenizer.json // "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+" - "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+", + // FIXME? Changed possessive quantifiers (?+ and ++) to greedy to avoid errors and imatrix hanging (tried atomic grouping but it's not supported?) + "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+", }; break; default: From a10b36c91a091f4606710fba4e9327fd71e0e738 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 2 Apr 2025 14:32:59 +0300 Subject: [PATCH 09/20] llama : refactor kv cache guard (#12695) * llama : refactor kv cache guard ggml-ci * cont : fix comment [no ci] * llama : fix kv_cache restore logic ggml-ci * context : simplify kv cache updates ggml-ci * cont : better name [no ci] * llama : fix llama_decode return code when could not find KV slot ggml-ci * context : change log err -> warn [no ci] * kv-cache : add comment + warning --- examples/parallel/parallel.cpp | 2 + src/llama-context.cpp | 59 +++---------------- src/llama-kv-cache.cpp | 69 +++++++++++++++++++--- src/llama-kv-cache.h | 104 ++++++++++++--------------------- 4 files changed, 107 insertions(+), 127 deletions(-) diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index e0e6da631..80698518e 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -106,6 +106,8 @@ int main(int argc, char ** argv) { common_params params; + params.n_predict = 128; + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) { return 1; } diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 3479a8cca..7d067afbe 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1201,33 +1201,7 @@ int llama_context::decode(llama_batch & inp_batch) { const int64_t n_tokens_all = batch.n_tokens; const int64_t n_embd = hparams.n_embd; - // TODO: remove this stuff - class batch_guard { - public: - batch_guard(llama_kv_cache_unified & kv_self) : kv_slot_restorer(kv_self) { - } - - ~batch_guard() { - if (!is_done) { - kv_slot_restorer.restore(); - } - } - - void done() { - is_done = true; - } - - void save(const llama_kv_cache_slot_info & slot_info) { - kv_slot_restorer.save(slot_info); - } - - private: - bool is_done = false; - - llama_kv_slot_restorer kv_slot_restorer; - }; - - batch_guard bg(*kv_self); + llama_kv_cache_guard kv_guard(kv_self.get()); GGML_ASSERT((!batch.token && batch.embd) || (batch.token && 
!batch.embd)); // NOLINT @@ -1280,6 +1254,9 @@ int llama_context::decode(llama_batch & inp_batch) { return -2; }; + // handle any pending defrags/shifts + kv_self_update(); + int64_t n_outputs_prev = 0; while (sbatch.n_tokens > 0) { @@ -1319,22 +1296,12 @@ int llama_context::decode(llama_batch & inp_batch) { // find KV slot { - kv_self_update(); + if (!kv_self->find_slot(ubatch)) { + LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens); - // if we have enough unused cells before the current head -> - // better to start searching from the beginning of the cache, hoping to fill it - if (kv_self->head > kv_self->used + 2*ubatch.n_tokens) { - kv_self->head = 0; + return 1; } - const auto slot_info = kv_self->find_slot(ubatch); - if (!slot_info) { - LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); - return -3; - } - - bg.save(slot_info); - if (!kv_self->recurrent) { // a heuristic, to avoid attending the full cache if it is not yet utilized // after enough generations, the benefit from this heuristic disappears @@ -1371,16 +1338,6 @@ int llama_context::decode(llama_batch & inp_batch) { } } - // update the kv ring buffer - { - kv_self->head += ubatch.n_tokens; - - // Ensure kv cache head points to a valid index. - if (kv_self->head >= kv_self->size) { - kv_self->head = 0; - } - } - // plot the computation graph in dot format (for debugging purposes) //if (n_past%100 == 0) { // ggml_graph_dump_dot(gf, NULL, "llama.dot"); @@ -1467,7 +1424,7 @@ int llama_context::decode(llama_batch & inp_batch) { } // finalize the batch processing - bg.done(); + kv_guard.commit(); // set output mappings { diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 14c8933b4..7ba546c10 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -11,8 +11,6 @@ #include #include -static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false}; - llama_kv_cache_unified::llama_kv_cache_unified(const llama_hparams & hparams, callbacks cbs) : hparams(hparams), cbs(std::move(cbs)) { } @@ -206,6 +204,8 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos return false; } } + + return true; } for (uint32_t i = 0; i < size; ++i) { @@ -446,16 +446,66 @@ void llama_kv_cache_unified::defrag() { } } +void llama_kv_cache_unified::restore() { + if (pending.ranges.empty()) { + return; + } + + // TODO: tmp - move to llama_kv_cache_recurrent + if (recurrent) { + seq_rm(-1, -1, -1); + return; + } + + uint32_t new_head = size; + + for (auto & range : pending.ranges) { + for (uint32_t i = range.c0; i < range.c1; ++i) { + cells[i].seq_id.clear(); + + // keep count of the number of used cells + if (cells[i].pos >= 0) { + used--; + } + + cells[i].pos = -1; + cells[i].src = -1; + } + + new_head = std::min(new_head, range.c0); + } + + if (new_head != size && new_head < head) { + head = new_head; + } +} + +void llama_kv_cache_unified::commit() { + if (pending.ranges.empty()) { + LLAMA_LOG_WARN("%s: no pending KV cache updates to commit - might indicate a bug (ref: %s)\n", + __func__, "https://github.com/ggml-org/llama.cpp/pull/12695"); + return; + } + + pending.ranges.clear(); +} + bool llama_kv_cache_unified::get_can_shift() const { return can_shift; } -llama_kv_cache_slot_info llama_kv_cache_unified::find_slot( +bool llama_kv_cache_unified::find_slot( const llama_ubatch & ubatch) { const uint32_t n_tokens = ubatch.n_tokens; const uint32_t n_seqs = ubatch.n_seqs; const uint32_t n_seq_tokens = ubatch.n_seq_tokens; + // if we 
have enough unused cells before the current head -> + // better to start searching from the beginning of the cache, hoping to fill it + if (head > used + 2*ubatch.n_tokens) { + head = 0; + } + if (recurrent) { // For recurrent state architectures (like Mamba or RWKV), // each cache cell can store the state for a whole sequence. @@ -477,7 +527,7 @@ llama_kv_cache_slot_info llama_kv_cache_unified::find_slot( // too big seq_id // TODO: would it be possible to resize the cache instead? LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, size); - return llama_kv_cache_slot_info_failed; + return false; } if (j > 0) { llama_kv_cell & seq = cells[seq_id]; @@ -616,14 +666,14 @@ llama_kv_cache_slot_info llama_kv_cache_unified::find_slot( [](const llama_kv_cell& cell){ return !cell.is_empty(); }); // sanity check - return llama_kv_cache_slot_info(n >= n_seqs); + return n >= n_seqs; } // otherwise, one cell per token. if (n_tokens > size) { LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %d\n", __func__, n_tokens, size); - return llama_kv_cache_slot_info_failed; + return false; } uint32_t n_tested = 0; @@ -651,7 +701,7 @@ llama_kv_cache_slot_info llama_kv_cache_unified::find_slot( if (n_tested >= size) { //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens); - return llama_kv_cache_slot_info_failed; + return false; } } @@ -668,7 +718,9 @@ llama_kv_cache_slot_info llama_kv_cache_unified::find_slot( used += n_tokens; - return llama_kv_cache_slot_info(head, head + n_tokens); + pending.ranges.push_back({head, head + n_tokens}); + + return true; } uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) const { @@ -1033,6 +1085,7 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); return false; } + commit(); // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values) // Assume that this is one contiguous block of cells diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 0a7ff8a4e..ff0ba3540 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -17,6 +17,9 @@ struct llama_ubatch; struct llama_kv_cache : public llama_memory_i { using llama_memory_i::llama_memory_i; + virtual void restore() = 0; // call if batch processing fails - restores the cache state + virtual void commit() = 0; // call after successful batch processing - clears any pending state + virtual int32_t get_n_tokens() const = 0; virtual uint32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache @@ -25,9 +28,24 @@ struct llama_kv_cache : public llama_memory_i { bool get_can_edit() const override { return get_can_shift(); } }; +struct llama_kv_cache_guard { + llama_kv_cache_guard(llama_kv_cache * kv) : kv(kv) {} + + ~llama_kv_cache_guard() { + kv->restore(); + } + + void commit() { + kv->commit(); + } + +private: + llama_kv_cache * kv; +}; + struct llama_kv_cell { llama_pos pos = -1; - llama_pos delta = 0; + llama_pos delta = 0; int32_t src = -1; // used by recurrent state models to copy states int32_t tail = -1; @@ -46,17 +64,6 @@ struct llama_kv_cell { } }; -// a structure holds information about the slot found in llama_kv_cache_find_slot -struct llama_kv_cache_slot_info { - std::pair boundaries; // slot boundaries [begin, end) - bool found = false; // the slot was found - - explicit llama_kv_cache_slot_info(bool 
found_) : found{found_} {} - llama_kv_cache_slot_info(uint32_t begin, uint32_t end) : boundaries{begin, end}, found{true} {} - - operator bool() const { return found; } -}; - // ring-buffer of cached KV data // TODO: pimpl // TODO: add notion of max sequences @@ -93,6 +100,9 @@ public: void clear() override; void defrag() override; + virtual void restore() override; + virtual void commit() override; + bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override; void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override; void seq_keep(llama_seq_id seq_id) override; @@ -105,10 +115,9 @@ public: // find an empty slot of size "n_tokens" in the cache // updates the cache head - // returns a structure holding information about the slot found // Note: On success, it's important that cache.head points // to the first cell of the slot. - llama_kv_cache_slot_info find_slot(const llama_ubatch & batch); + bool find_slot(const llama_ubatch & batch); // TODO: maybe not needed uint32_t get_padding(const llama_cparams & cparams) const; @@ -128,7 +137,19 @@ public: // return true if cells have been moved bool defrag_prepare(int32_t n_max_nodes); - // state save/load + // commit/restore cache + + struct slot_range { + uint32_t c0 = 0; // note: these are cell indices, not sequence positions + uint32_t c1 = 0; + }; + + // pending cell updates that are not yet committed + struct { + std::vector ranges; + } pending; + + // state write/load void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const; void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1); @@ -183,59 +204,6 @@ private: // using llama_kv_cache_unified::llama_kv_cache_unified; //}; -// -// kv cache restore -// - -// saves the kv_cache state for future recovery. -// used to rollback llama_kv_cache_find_slot changes. 
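
The guard added above replaces the llama_kv_slot_restorer removed just below. As a rough, standalone sketch of the same commit/restore idea (cache_like is a hypothetical stand-in for llama_kv_cache, not code from this patch):

    #include <cstdio>
    #include <utility>
    #include <vector>

    struct cache_like {                               // hypothetical stand-in for llama_kv_cache
        std::vector<std::pair<int, int>> pending;     // cell ranges written since the last commit
        void restore() {                              // roll back if the batch was never committed
            if (pending.empty()) {
                return;                               // nothing pending -> no-op, like the real restore()
            }
            std::puts("rolling back pending KV cells");
            pending.clear();
        }
        void commit() { pending.clear(); }            // success: keep the cells, drop the bookkeeping
    };

    struct cache_guard {
        explicit cache_guard(cache_like * kv) : kv(kv) {}
        ~cache_guard() { kv->restore(); }             // runs on every exit path, incl. early returns
        void commit()  { kv->commit();  }             // call only after the whole batch succeeded
    private:
        cache_like * kv;
    };

    int main() {
        cache_like kv;
        {
            cache_guard guard(&kv);
            kv.pending.push_back({0, 32});            // pretend find_slot() reserved cells [0, 32)
            guard.commit();                           // comment this line out to see the rollback path
        }
    }
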
-struct llama_kv_slot_restorer { - struct llama_kv_cache_state { - uint32_t head = 0; - uint32_t n = 0; - } old_state; - - // for non-recurrent models only - // list of slots to restore - std::vector> slot_boundaries; - - bool do_restore = false; - - llama_kv_cache_unified & cache; - - explicit llama_kv_slot_restorer(llama_kv_cache_unified & cache) : cache(cache) { - old_state.head = cache.head; - old_state.n = cache.n; - } - - // saves a slot information for future restoration - void save(const llama_kv_cache_slot_info & slot) { - if (slot) { - do_restore = true; - if (slot.boundaries.first != slot.boundaries.second) { - slot_boundaries.push_back(slot.boundaries); - } - } - } - - // must be explicitly called to restore the kv_cache state - // and rollback changes from all llama_kv_cache_find_slot calls - void restore() { - if (do_restore) { - cache.head = old_state.head; - cache.n = old_state.n; - - if (cache.recurrent) { // recurrent models like Mamba or RWKV can't have a state partially erased - cache.seq_rm(-1, -1, -1); - } else { - for (auto & slot : slot_boundaries) { - cache.seq_rm(-1, slot.first, slot.second); - } - } - } - } -}; - // TODO: maybe become part of the public llama_kv_cache in the future int32_t llama_kv_cache_n_tokens(const llama_kv_cache * kv); From e0e912f49b3195ef9d0c51378629ba03c9b972da Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Wed, 2 Apr 2025 14:52:01 +0200 Subject: [PATCH 10/20] llama : add option to override model tensor buffers (#11397) * llama : add option to override tensor buffers * ggml : fix possible underflow in ggml_nbytes --- common/arg.cpp | 40 ++++++++++++++++++++++++++++++++++++++ common/common.cpp | 10 ++++++++++ common/common.h | 1 + ggml/src/ggml.c | 6 ++++++ include/llama.h | 8 ++++++++ src/llama-context.cpp | 3 ++- src/llama-model-loader.cpp | 5 ++++- src/llama-model-loader.h | 8 +++++--- src/llama-model.cpp | 30 ++++++++++++++++++++++++++-- src/llama-model.h | 2 ++ src/llama-quant.cpp | 2 +- src/llama.cpp | 2 +- 12 files changed, 108 insertions(+), 9 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 47c26955e..fa22e86cd 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1,6 +1,7 @@ #include "gguf.h" // for reading GGUF splits #include "arg.h" +#include "common.h" #include "log.h" #include "sampling.h" #include "chat.h" @@ -848,6 +849,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context params.kv_overrides.back().key[0] = 0; } + if (!params.tensor_buft_overrides.empty()) { + params.tensor_buft_overrides.push_back({nullptr, nullptr}); + } + if (params.reranking && params.embedding) { throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both"); } @@ -2180,6 +2185,41 @@ common_params_context common_params_parser_init(common_params & params, llama_ex exit(0); } )); + add_opt(common_arg( + {"--override-tensor", "-ot"}, "=,...", + "override tensor buffer type", [](common_params & params, const std::string & value) { + /* static */ std::map buft_list; + if (buft_list.empty()) { + // enumerate all the devices and add their buffer types to the list + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + auto * dev = ggml_backend_dev_get(i); + auto * buft = ggml_backend_dev_buffer_type(dev); + if (buft) { + buft_list[ggml_backend_buft_name(buft)] = buft; + } + } + } + + for (const auto & override : string_split(value, ',')) { + std::string::size_type pos = override.find('='); + if (pos == std::string::npos) { + throw std::invalid_argument("invalid 
value"); + } + std::string tensor_name = override.substr(0, pos); + std::string buffer_type = override.substr(pos + 1); + + if (buft_list.find(buffer_type) == buft_list.end()) { + printf("Available buffer types:\n"); + for (const auto & it : buft_list) { + printf(" %s\n", ggml_backend_buft_name(it.second)); + } + throw std::invalid_argument("unknown buffer type"); + } + // FIXME: this leaks memory + params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)}); + } + } + )); add_opt(common_arg( {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N", "number of layers to store in VRAM", diff --git a/common/common.cpp b/common/common.cpp index e7269ead4..d4882c512 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1042,15 +1042,18 @@ struct llama_model_params common_model_params_to_llama(common_params & params) { if (!params.devices.empty()) { mparams.devices = params.devices.data(); } + if (params.n_gpu_layers != -1) { mparams.n_gpu_layers = params.n_gpu_layers; } + mparams.main_gpu = params.main_gpu; mparams.split_mode = params.split_mode; mparams.tensor_split = params.tensor_split; mparams.use_mmap = params.use_mmap; mparams.use_mlock = params.use_mlock; mparams.check_tensors = params.check_tensors; + if (params.kv_overrides.empty()) { mparams.kv_overrides = NULL; } else { @@ -1058,6 +1061,13 @@ struct llama_model_params common_model_params_to_llama(common_params & params) { mparams.kv_overrides = params.kv_overrides.data(); } + if (params.tensor_buft_overrides.empty()) { + mparams.tensor_buft_overrides = NULL; + } else { + GGML_ASSERT(params.tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern"); + mparams.tensor_buft_overrides = params.tensor_buft_overrides.data(); + } + return mparams; } diff --git a/common/common.h b/common/common.h index ea7aef99d..725b5123d 100644 --- a/common/common.h +++ b/common/common.h @@ -279,6 +279,7 @@ struct common_params { std::vector in_files; // all input files std::vector antiprompt; // strings upon which more user input is prompted (a.k.a. 
reverse prompts) std::vector kv_overrides; + std::vector tensor_buft_overrides; bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply) std::vector lora_adapters; // lora adapter path with user defined scale diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 161dd3fa9..3e274d6ae 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1159,6 +1159,12 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) { } size_t ggml_nbytes(const struct ggml_tensor * tensor) { + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + if (tensor->ne[i] <= 0) { + return 0; + } + } + size_t nbytes; const size_t blck_size = ggml_blck_size(tensor->type); if (blck_size == 1) { diff --git a/include/llama.h b/include/llama.h index 468ab1fa4..fca2b034b 100644 --- a/include/llama.h +++ b/include/llama.h @@ -280,10 +280,18 @@ extern "C" { }; }; + struct llama_model_tensor_buft_override { + const char * pattern; + ggml_backend_buffer_type_t buft; + }; + struct llama_model_params { // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used) ggml_backend_dev_t * devices; + // NULL-terminated list of buffer types to use for tensors that match a pattern + const struct llama_model_tensor_buft_override * tensor_buft_overrides; + int32_t n_gpu_layers; // number of layers to store in VRAM enum llama_split_mode split_mode; // how to split the model across multiple GPUs diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 7d067afbe..392707943 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -255,7 +255,8 @@ llama_context::llama_context( model.n_devices() > 1 && model.params.n_gpu_layers > (int) model.hparams.n_layer && model.params.split_mode == LLAMA_SPLIT_MODE_LAYER && - cparams.offload_kqv; + cparams.offload_kqv && + !model.has_tensor_overrides(); // pipeline parallelism requires support for async compute and events in all devices if (pipeline_parallel) { diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 1be0f2d6d..ec1d78e31 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -445,7 +445,8 @@ llama_model_loader::llama_model_loader( std::vector & splits, bool use_mmap, bool check_tensors, - const struct llama_model_kv_override * param_overrides_p) { + const llama_model_kv_override * param_overrides_p, + const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) { int trace = 0; if (getenv("LLAMA_TRACE")) { trace = atoi(getenv("LLAMA_TRACE")); @@ -457,6 +458,8 @@ llama_model_loader::llama_model_loader( } } + tensor_buft_overrides = param_tensor_buft_overrides_p; + // Load the main GGUF struct ggml_context * ctx = NULL; struct gguf_init_params params = { diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index fe35404b2..0f52b011b 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -77,8 +77,9 @@ struct llama_model_loader { llama_mmaps mappings; - std::map weights_map; - std::unordered_map kv_overrides; + std::map weights_map; + std::unordered_map kv_overrides; + const llama_model_tensor_buft_override * tensor_buft_overrides; gguf_context_ptr meta; std::vector contexts; @@ -95,7 +96,8 @@ struct llama_model_loader { std::vector & splits, // optional, only need if the split does not follow naming scheme bool use_mmap, bool check_tensors, - const struct llama_model_kv_override * param_overrides_p); + const llama_model_kv_override * param_overrides_p, + const 
llama_model_tensor_buft_override * param_tensor_buft_overrides_p); template typename std::enable_if::value, bool>::type diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 8d525e1be..ca6e3ab2c 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -378,9 +379,12 @@ struct llama_model::impl { layer_dev dev_input = {}; layer_dev dev_output = {}; std::vector dev_layer; + + bool has_tensor_overrides; }; llama_model::llama_model(const llama_model_params & params) : params(params), pimpl(std::make_unique()) { + pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern; } llama_model::~llama_model() {} @@ -1571,9 +1575,26 @@ bool llama_model::load_tensors(llama_model_loader & ml) { GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str()); } - ggml_backend_buffer_type_t buft = select_weight_buft(hparams, t_meta, op, *buft_list); + ggml_backend_buffer_type_t buft = nullptr; + + // check overrides + if (ml.tensor_buft_overrides) { + std::string tensor_name = tn.str(); + for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) { + std::regex pattern(overrides->pattern); + if (std::regex_search(tensor_name, pattern)) { + LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft)); + buft = overrides->buft; + break; + } + } + } + if (!buft) { - throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str())); + buft = select_weight_buft(hparams, t_meta, op, *buft_list); + if (!buft) { + throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str())); + } } // avoid using a host buffer when using mmap @@ -4151,6 +4172,10 @@ ggml_backend_buffer_type_t llama_model::select_buft(int il) const { }); } +bool llama_model::has_tensor_overrides() const { + return pimpl->has_tensor_overrides; +} + const ggml_tensor * llama_model::get_tensor(const char * name) const { auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(), [name](const std::pair & it) { @@ -12319,6 +12344,7 @@ llm_graph_result_ptr llama_model::build_graph( llama_model_params llama_model_default_params() { llama_model_params result = { /*.devices =*/ nullptr, + /*.tensor_buft_overrides =*/ nullptr, /*.n_gpu_layers =*/ 0, /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER, /*.main_gpu =*/ 0, diff --git a/src/llama-model.h b/src/llama-model.h index f1bf0df3a..91e6e8725 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -382,6 +382,8 @@ struct llama_model { ggml_backend_buffer_type_t select_buft(int il) const; + bool has_tensor_overrides() const; + const struct ggml_tensor * get_tensor(const char * name) const; // TODO: move this to new llm_arch_model_i interface diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 09eb57077..e3e10fa6c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -527,7 +527,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } std::vector splits = {}; - llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides); + llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr); ml.init_mappings(false); // no prefetching llama_model model(llama_model_default_params()); diff --git a/src/llama.cpp b/src/llama.cpp index 81e1dd1d0..d5164720b 100644 --- a/src/llama.cpp 
+++ b/src/llama.cpp @@ -92,7 +92,7 @@ static int llama_model_load(const std::string & fname, std::vector model.t_start_us = tm.t_start_us; try { - llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides); + llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides); ml.print_info(); From 833e2b7409211a07df97716998c5002526642652 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 2 Apr 2025 16:38:54 +0300 Subject: [PATCH 11/20] model : print tensor size during load (#12711) * model : print tensor size during load * cont : fix units MB -> MiB Co-authored-by: Diego Devesa --------- Co-authored-by: Diego Devesa --- src/llama-model-loader.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index ec1d78e31..ea73a8a7b 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -603,7 +603,9 @@ llama_model_loader::llama_model_loader( if (trace > 0) { const uint16_t sid = w.idx; - LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ]\n", __func__, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str()); + LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ] %8.2f MiB\n", __func__, + sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str(), + ggml_nbytes(tensor)/1024.0f/1024.0f); } } From 92e3006bb69dfeb656ccf5c7c1c1efadb03c88c2 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 2 Apr 2025 19:12:30 +0200 Subject: [PATCH 12/20] Vulkan: Fix mmq int dot float cache size (#12722) --- ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp | 6 ++---- ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp | 4 ++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp index 42f81356e..284a35caa 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp @@ -234,9 +234,9 @@ void main() { #endif #if QUANT_AUXF == 1 - FLOAT_TYPE cache_a_dm[TM]; + FLOAT_TYPE cache_a_dm[WMITER * TM]; #else - FLOAT_TYPE_VEC2 cache_a_dm[TM]; + FLOAT_TYPE_VEC2 cache_a_dm[WMITER * TM]; #endif FLOAT_TYPE_VEC2 cache_b_ds[TN]; @@ -247,7 +247,6 @@ void main() { const uint iqs = loadr_a; const uint buf_ib = loadc_a + l; - // Should ds be gated to a single thread? if (iqs == 0) { #if QUANT_AUXF == 1 buf_a_dm[buf_ib] = get_d(ib); @@ -276,7 +275,6 @@ void main() { const uint buf_ib = loadc_b + l; - // Should ds be gated to a single thread? 
if (iqs == 0) { buf_b_ds[buf_ib] = FLOAT_TYPE_VEC2(data_b[ib].ds); } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp index c4c35e105..63b15471b 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp @@ -17,7 +17,7 @@ i32vec2 repack(uint ib, uint iqs) { } ACC_TYPE mul_q8_1(int32_t q_sum, float da, vec2 dsb) { - return ACC_TYPE(da * (float(q_sum) * dsb.x - 8.0 * dsb.y)); + return ACC_TYPE(da * (float(q_sum) * dsb.x - 8.0f * dsb.y)); } #endif @@ -51,7 +51,7 @@ i32vec2 repack(uint ib, uint iqs) { } ACC_TYPE mul_q8_1(int32_t q_sum, float da, vec2 dsb) { - return ACC_TYPE(da * (float(q_sum) * dsb.x - 16.0 * dsb.y)); + return ACC_TYPE(da * (float(q_sum) * dsb.x - 16.0f * dsb.y)); } #endif From be0a0f8cae039e2286f757612accebfb8f21b36e Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Wed, 2 Apr 2025 12:40:32 -0500 Subject: [PATCH 13/20] vulkan: Implement grouped query attention in the coopmat2 FA shader (#12559) When adjacent batches of Q share the same batches of K/V, batch them into the same workgroup. For example, when: dst(128,32,1,1) = FA(q(128,1,32,1), k(128,16640,8,1), v(128,16640,8,1)) previously we would run 32 workgroups computing 1 result each, now we will run 8 workgroups computing 4 results each. This doesn't directly translate to better performance (at least when you have >=32 SMs), but in a subsequent change I'll enable split_k which will scale much better with 4x fewer workgroups. --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 25 ++++++- .../vulkan-shaders/flash_attn_cm2.comp | 66 ++++++++++++++----- 2 files changed, 71 insertions(+), 20 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index ee0969fe1..f60fe33aa 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -31,6 +31,7 @@ #define ROUNDUP_POW2(M, N) (((M) + (N) - 1) & ~((N) - 1)) #define CEIL_DIV(M, N) (((M) + (N)-1) / (N)) +static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; } #define VK_VENDOR_ID_AMD 0x1002 #define VK_VENDOR_ID_APPLE 0x106b @@ -501,6 +502,8 @@ struct vk_flash_attn_push_constants { uint32_t n_head_log2; float m0; float m1; + + uint32_t gqa_ratio; }; struct vk_op_push_constants { @@ -5402,7 +5405,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx const uint32_t nbm1 = mask ? mask->nb[1] : 0; const uint32_t D = neq0; - const uint32_t N = neq1; + uint32_t N = neq1; const uint32_t KV = nek1; GGML_ASSERT(ne0 == D); @@ -5460,6 +5463,22 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx vk_pipeline pipeline = pipelines[aligned]; assert(pipeline); + uint32_t gqa_ratio = 1; + uint32_t qk_ratio = neq2 / nek2; + uint32_t workgroups_x = (uint32_t)neq1; + uint32_t workgroups_y = (uint32_t)neq2; + uint32_t workgroups_z = (uint32_t)neq3; + + if (N == 1 && qk_ratio > 1 && is_pow2(qk_ratio) && gqa_ratio <= flash_attention_num_small_rows && + qk_ratio * nek2 == neq2 && nek2 == nev2 && neq3 == 1 && nek3 == 1 && nev3 == 1) { + // grouped query attention - make the N dimension equal to gqa_ratio, reduce + // workgroups proportionally in y dimension. The shader will detect gqa_ratio > 1 + // and change addressing calculations to index Q's dimension 2. 
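
For the q(128,1,32,1) / k,v(128,16640,8,1) case quoted in the commit message above, the new path works out as follows. This is a small standalone check, not part of the patch, and the gating condition is simplified relative to the real one (which also checks power-of-two ratios, head-count limits and the batch dimensions):

    #include <cassert>
    #include <cstdio>

    int main() {
        // shapes from the commit message: q(128,1,32,1), k/v(128,16640,8,1)
        const unsigned neq1 = 1;                // one query row per head
        const unsigned neq2 = 32;               // 32 Q heads
        const unsigned nek2 = 8;                // 8 K/V heads

        unsigned N            = neq1;
        unsigned workgroups_y = neq2;           // previously: one workgroup per Q head
        const unsigned qk_ratio = neq2 / nek2;  // 4 Q heads share each K/V head

        if (N == 1 && qk_ratio > 1) {           // simplified gating condition
            const unsigned gqa_ratio = qk_ratio;
            N = gqa_ratio;                      // each workgroup now produces 4 result rows
            workgroups_y /= N;                  // 8 workgroups instead of 32
        }

        std::printf("N=%u workgroups_y=%u\n", N, workgroups_y);
        assert(N == 4 && workgroups_y == 8);
    }
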
+ gqa_ratio = qk_ratio; + N = gqa_ratio; + workgroups_y /= N; + } + if (dryrun) { // Request descriptor sets ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1); @@ -5549,7 +5568,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx v_stride, (uint32_t)nbv2, (uint32_t)nbv3, nbm1, scale, max_bias, logit_softcap, - mask != nullptr, n_head_log2, m0, m1 }; + mask != nullptr, n_head_log2, m0, m1, gqa_ratio }; ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE}, @@ -5558,7 +5577,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE}, vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE}, }, - sizeof(vk_flash_attn_push_constants), &pc, { (uint32_t)neq1, (uint32_t)neq2, (uint32_t)neq3 }); + sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x, workgroups_y, workgroups_z }); } static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp index df30355f6..cac8f107b 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp @@ -61,6 +61,8 @@ layout (push_constant) uniform parameter { uint32_t n_head_log2; float m0; float m1; + + uint32_t gqa_ratio; } p; layout (binding = 0) readonly buffer Q {uint8_t data_q[];}; @@ -103,6 +105,28 @@ ACC_TYPE Max(const in uint32_t row, const in uint32_t col, const in ACC_TYPE ele #define DECODEFUNC #endif +// Store the output when doing grouped query attention. +// Rows index by Q's dimension 2, and the first N rows are valid. +D_TYPE perElemOpGqaStore(const in uint32_t r, const in uint32_t c, const in D_TYPE elem, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N) +{ + if (r < N && c < D) { + uint32_t offset = (iq2 + r) * D + c; + data_o[o_offset + offset] = D_TYPE(elem); + } + return elem; +} + +// Load the slope matrix, indexed by Q's dimension 2. +ACC_TYPE perElemOpComputeSlope(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t iq2) +{ + const uint32_t h = iq2 + (r & (p.gqa_ratio - 1)); + + const ACC_TYPE base = ACC_TYPE(h < p.n_head_log2 ? p.m0 : p.m1); + const int exph = int(h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1); + + return ACC_TYPE(pow(base, ACC_TYPE(exph))); +} + void main() { #ifdef NEEDS_INIT_IQ_SHMEM init_iq_shmem(gl_WorkGroupSize); @@ -116,7 +140,9 @@ void main() { const uint32_t i = gl_WorkGroupID.x; - const uint32_t iq2 = gl_WorkGroupID.y; + // When not using grouped query attention, all rows share the same iq2, equal to gl_WorkGroupID.y. + // When using grouped query attention, each workgroup does gqa_ratio consecutive values of iq2. + const uint32_t iq2 = gl_WorkGroupID.y * p.gqa_ratio; const uint32_t iq3 = gl_WorkGroupID.z; // broadcast factors @@ -149,8 +175,10 @@ void main() { tensorLayoutK = setTensorLayoutDimensionNV(tensorLayoutK, KV, D); tensorLayoutV = setTensorLayoutDimensionNV(tensorLayoutV, KV, D); - // nb?1 are already divided by the type size and are in units of elements - uint32_t q_stride = p.nb01; + // nb?1 are already divided by the type size and are in units of elements. + // When using grouped query attention, Q is indexed by iq2, so the stride + // should be nb02 (which is in bytes). 
+ uint32_t q_stride = p.gqa_ratio > 1 ? (p.nb02 / 4) : p.nb01; uint32_t k_stride = p.nb11; uint32_t v_stride = p.nb21; // hint to the compiler that strides are aligned for the aligned variant of the shader @@ -182,16 +210,11 @@ void main() { L = coopmat(0); M = coopmat(-1.0/0.0); - ACC_TYPE slope = ACC_TYPE(1.0); + coopmat slopeMat = coopmat(1.0); // ALiBi if (p.max_bias > 0.0f) { - const uint32_t h = iq2; - - const ACC_TYPE base = ACC_TYPE(h < p.n_head_log2 ? p.m0 : p.m1); - const int exph = int(h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1); - - slope = pow(base, ACC_TYPE(exph)); + coopMatPerElementNV(slopeMat, slopeMat, perElemOpComputeSlope, iq2); } [[dont_unroll]] @@ -215,12 +238,16 @@ void main() { if (p.mask != 0) { tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutM = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV); tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, p.nem1, KV); + // When using grouped query attention, all rows use the same mask. + if (p.gqa_ratio > 1) { + tensorLayoutM = setTensorLayoutStrideNV(tensorLayoutM, 0, 1); + } coopmat mv; coopMatLoadTensorNV(mv, data_m, 0, sliceTensorLayoutNV(tensorLayoutM, i * Br, Br, j * Bc, Bc)); - S += slope*coopmat(mv); + S += slopeMat*coopmat(mv); } // Clear padding elements to -inf, so they don't contribute to rowmax @@ -297,13 +324,18 @@ void main() { O = Ldiag*O; - tensorLayoutNV<3, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutD = createTensorLayoutNV(3, gl_CooperativeMatrixClampModeConstantNV); - tensorLayoutD = setTensorLayoutDimensionNV(tensorLayoutD, p.ne2, p.ne1, D); - - // permute dimensions - tensorViewNV<3, false, 1, 0, 2> tensorViewPermute = createTensorViewNV(3, false, 1, 0, 2); uint32_t o_offset = iq3*p.ne2*p.ne1; coopmat O_D = coopmat(O); - coopMatStoreTensorNV(O_D, data_o, o_offset, sliceTensorLayoutNV(tensorLayoutD, i * Br, Br, iq2, 1, 0, D), tensorViewPermute); + if (p.gqa_ratio > 1) { + coopMatPerElementNV(O_D, O_D, perElemOpGqaStore, o_offset, iq2, N); + } else { + tensorLayoutNV<3, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutD = createTensorLayoutNV(3, gl_CooperativeMatrixClampModeConstantNV); + tensorLayoutD = setTensorLayoutDimensionNV(tensorLayoutD, p.ne2, p.ne1, D); + + // permute dimensions + tensorViewNV<3, false, 1, 0, 2> tensorViewPermute = createTensorViewNV(3, false, 1, 0, 2); + + coopMatStoreTensorNV(O_D, data_o, o_offset, sliceTensorLayoutNV(tensorLayoutD, i * Br, Br, iq2, N, 0, D), tensorViewPermute); + } } From 6f3bd38640f07e4dec7f145d2fbf093ce48c9544 Mon Sep 17 00:00:00 2001 From: bandoti <141645996+bandoti@users.noreply.github.com> Date: Wed, 2 Apr 2025 14:56:26 -0300 Subject: [PATCH 14/20] cmake: remove caching from vulkan coopmat checks (#12719) --- ggml/src/ggml-vulkan/CMakeLists.txt | 58 +++++++++++------------------ 1 file changed, 22 insertions(+), 36 deletions(-) diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt index e3c59b75f..51e8301ce 100644 --- a/ggml/src/ggml-vulkan/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/CMakeLists.txt @@ -23,49 +23,35 @@ if (Vulkan_FOUND) ../../include/ggml-vulkan.h ) - if(NOT DEFINED GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) - # Compile a test shader to determine whether GL_KHR_cooperative_matrix is supported. - # If it's not, there will be an error to stderr. 
- # If it's supported, set a define to indicate that we should compile those shaders - execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat_support.comp" - OUTPUT_VARIABLE glslc_output - ERROR_VARIABLE glslc_error) + # Compile a test shader to determine whether GL_KHR_cooperative_matrix is supported. + # If it's not, there will be an error to stderr. + # If it's supported, set a define to indicate that we should compile those shaders + execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat_support.comp" + OUTPUT_VARIABLE glslc_output + ERROR_VARIABLE glslc_error) - if (${glslc_error} MATCHES ".*extension not supported: GL_KHR_cooperative_matrix.*") - message(STATUS "GL_KHR_cooperative_matrix not supported by glslc") - set(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT OFF CACHE INTERNAL "Whether coopmat is supported by glslc") - else() - message(STATUS "GL_KHR_cooperative_matrix supported by glslc") - set(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT ON CACHE INTERNAL "Whether coopmat is supported by glslc") - endif() + if (${glslc_error} MATCHES ".*extension not supported: GL_KHR_cooperative_matrix.*") + message(STATUS "GL_KHR_cooperative_matrix not supported by glslc") + set(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT OFF) else() - message(STATUS "GL_KHR_cooperative_matrix support already defined: ${GGML_VULKAN_COOPMAT_GLSLC_SUPPORT}") - endif() - - if(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) + message(STATUS "GL_KHR_cooperative_matrix supported by glslc") + set(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT ON) add_compile_definitions(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) endif() - if(NOT DEFINED GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) - # Compile a test shader to determine whether GL_NV_cooperative_matrix2 is supported. - # If it's not, there will be an error to stderr. - # If it's supported, set a define to indicate that we should compile those shaders - execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat2_support.comp" - OUTPUT_VARIABLE glslc_output - ERROR_VARIABLE glslc_error) + # Compile a test shader to determine whether GL_NV_cooperative_matrix2 is supported. + # If it's not, there will be an error to stderr. 
+ # If it's supported, set a define to indicate that we should compile those shaders + execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat2_support.comp" + OUTPUT_VARIABLE glslc_output + ERROR_VARIABLE glslc_error) - if (${glslc_error} MATCHES ".*extension not supported: GL_NV_cooperative_matrix2.*") - message(STATUS "GL_NV_cooperative_matrix2 not supported by glslc") - set(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT OFF CACHE INTERNAL "Whether coopmat2 is supported by glslc") - else() - message(STATUS "GL_NV_cooperative_matrix2 supported by glslc") - set(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT ON CACHE INTERNAL "Whether coopmat2 is supported by glslc") - endif() + if (${glslc_error} MATCHES ".*extension not supported: GL_NV_cooperative_matrix2.*") + message(STATUS "GL_NV_cooperative_matrix2 not supported by glslc") + set(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT OFF) else() - message(STATUS "GL_NV_cooperative_matrix2 support already defined: ${GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT}") - endif() - - if(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) + message(STATUS "GL_NV_cooperative_matrix2 supported by glslc") + set(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT ON) add_compile_definitions(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) endif() From f01bd02376f919b05ee635f438311be8dfc91d7c Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Wed, 2 Apr 2025 14:25:08 -0500 Subject: [PATCH 15/20] vulkan: Implement split_k for coopmat2 flash attention. (#12627) When using group query attention, we have one workgroup per KV batch and this can be very few workgroups (e.g. just 8 in some models). Enable split_k to spread the work across SMs. This helps a lot when the KV cache is large. --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 86 ++++++++++++++++--- .../vulkan-shaders/flash_attn_cm2.comp | 42 ++++++++- .../flash_attn_split_k_reduce.comp | 59 +++++++++++++ .../vulkan-shaders/vulkan-shaders-gen.cpp | 1 + tests/test-backend-ops.cpp | 6 ++ 5 files changed, 177 insertions(+), 17 deletions(-) create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index f60fe33aa..f6cc28603 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -353,6 +353,7 @@ struct vk_device_struct { vk_pipeline pipeline_flash_attn_f32_f16_D112[GGML_TYPE_COUNT][2][2][2]; vk_pipeline pipeline_flash_attn_f32_f16_D128[GGML_TYPE_COUNT][2][2][2]; vk_pipeline pipeline_flash_attn_f32_f16_D256[GGML_TYPE_COUNT][2][2][2]; + vk_pipeline pipeline_flash_attn_split_k_reduce; std::unordered_map pipelines; std::unordered_map pipeline_descriptor_set_requirements; @@ -504,6 +505,8 @@ struct vk_flash_attn_push_constants { float m1; uint32_t gqa_ratio; + uint32_t split_kv; + uint32_t k_num; }; struct vk_op_push_constants { @@ -1476,7 +1479,7 @@ static std::array fa_rows_cols(uint32_t D, uint32_t clamp, ggml_typ // small rows, large cols if (small_rows) { - return {flash_attention_num_small_rows, 128}; + return {flash_attention_num_small_rows, 64}; } // small cols to reduce register count if (ggml_is_quantized(type) || D == 256) { @@ -2332,6 +2335,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl_f32", get_rows_iq4_nl_f32_len, get_rows_iq4_nl_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); 
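
The split_k sizing added further down in this diff picks the number of K/V chunks from the SM count. With some assumed numbers (an 80-SM GPU, the 8 GQA workgroups from the commit message, a 16k KV cache and an alignment of 64 rows; none of these values come from the patch), the arithmetic looks like this:

    #include <cstdio>

    // same helpers as CEIL_DIV / ROUNDUP_POW2 in the Vulkan backend (n must be a power of two)
    static unsigned ceil_div(unsigned m, unsigned n)     { return (m + n - 1) / n; }
    static unsigned roundup_pow2(unsigned m, unsigned n) { return (m + n - 1) & ~(n - 1); }

    int main() {
        const unsigned shader_core_count = 80;      // assumed
        const unsigned workgroups_y      = 8;       // one per K/V head, from the GQA path
        const unsigned KV                = 16384;   // assumed KV cache length
        const unsigned align             = 64;      // assumed pipeline alignment

        unsigned split_kv = KV;
        unsigned split_k  = shader_core_count * 2 / workgroups_y;  // aim for ~2 workgroups per SM
        if (split_k > 1) {
            split_kv = roundup_pow2(KV / split_k, align);          // chunk size, multiple of align
            split_k  = ceil_div(KV, split_kv);                     // chunks actually needed
        }

        std::printf("split_k=%u split_kv=%u\n", split_k, split_kv); // prints: split_k=20 split_kv=832
    }
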
ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 2, 3 * sizeof(uint32_t), {1, 1, 1}, {}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1, "quantize_q8_1", quantize_q8_1_len, quantize_q8_1_data, "main", 2, 1 * sizeof(uint32_t), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1); for (uint32_t i = 0; i < p021_max_gqa_ratio; ++i) { @@ -5479,9 +5483,38 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx workgroups_y /= N; } + uint32_t split_kv = KV; + uint32_t split_k = 1; + + if (gqa_ratio > 1 && ctx->device->shader_core_count > 0) { + GGML_ASSERT(workgroups_x == 1); + // Try to run two workgroups per SM. + split_k = ctx->device->shader_core_count * 2 / workgroups_y; + if (split_k > 1) { + // Try to evenly split KV into split_k chunks, but it needs to be a multiple + // of "align", so recompute split_k based on that. + split_kv = ROUNDUP_POW2(KV / split_k, pipelines[1]->align); + split_k = CEIL_DIV(KV, split_kv); + workgroups_x = split_k; + } + } + + // Reserve space for split_k temporaries. For each split, we need to store the O matrix (D x ne1) + // and the per-row m and L values (ne1 rows). + const uint64_t split_k_size = split_k > 1 ? (D * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k : 0; + if (split_k_size > ctx->device->max_memory_allocation_size) { + GGML_ABORT("Requested preallocation size is too large"); + } + if (ctx->prealloc_size_split_k < split_k_size) { + ctx->prealloc_size_split_k = split_k_size; + } + if (dryrun) { // Request descriptor sets ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1); + if (split_k > 1) { + ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_flash_attn_split_k_reduce, 1); + } return; } @@ -5502,8 +5535,6 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - ggml_vk_sync_buffers(subctx); - vk_buffer d_Q = nullptr, d_K = nullptr, d_V = nullptr, d_D = nullptr, d_M = nullptr; size_t q_buf_offset = 0, k_buf_offset = 0, v_buf_offset = 0, d_buf_offset = 0, m_buf_offset = 0; @@ -5568,16 +5599,45 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx v_stride, (uint32_t)nbv2, (uint32_t)nbv3, nbm1, scale, max_bias, logit_softcap, - mask != nullptr, n_head_log2, m0, m1, gqa_ratio }; - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, - { - vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE}, - vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE}, - vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE}, - vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE}, - vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE}, - }, - sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x, workgroups_y, workgroups_z }); + mask != nullptr, n_head_log2, m0, m1, + gqa_ratio, split_kv, split_k }; + + ggml_vk_sync_buffers(subctx); + + if (split_k > 1) { + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, + { + vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE}, + vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE}, + vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE}, + vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE}, + 
vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE}, + }, + // We only use split_k when group query attention is enabled, which means + // there's no more than one tile of rows (i.e. workgroups_x would have been + // one). We reuse workgroups_x to mean the number of splits, so we need to + // cancel out the divide by wg_denoms[0]. + sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z }); + + ggml_vk_sync_buffers(subctx); + const std::array pc2 = { D, (uint32_t)ne1, split_k }; + ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_flash_attn_split_k_reduce, + { + vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE}, + vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE}, + }, + pc2.size() * uint32_t{sizeof(uint32_t)}, pc2.data(), { (uint32_t)ne1, 1, 1 }); + } else { + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, + { + vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE}, + vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE}, + vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE}, + vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE}, + vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE}, + }, + sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x, workgroups_y, workgroups_z }); + } } static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp index cac8f107b..d78092000 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp @@ -63,6 +63,8 @@ layout (push_constant) uniform parameter { float m1; uint32_t gqa_ratio; + uint32_t split_kv; + uint32_t k_num; } p; layout (binding = 0) readonly buffer Q {uint8_t data_q[];}; @@ -116,6 +118,16 @@ D_TYPE perElemOpGqaStore(const in uint32_t r, const in uint32_t c, const in D_TY return elem; } +// Store column zero. This is used to save per-row m and L values for split_k. +ACC_TYPE perElemOpStoreCol0(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N) +{ + if (r < N && c == 0) { + uint32_t offset = iq2 + r; + data_o[o_offset + offset] = D_TYPE(elem); + } + return elem; +} + // Load the slope matrix, indexed by Q's dimension 2. ACC_TYPE perElemOpComputeSlope(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t iq2) { @@ -135,10 +147,18 @@ void main() { const uint32_t N = p.N; const uint32_t KV = p.KV; - const uint32_t Tr = CEIL_DIV(N, Br); - const uint32_t Tc = CEIL_DIV(KV, Bc); + uint32_t i = gl_WorkGroupID.x; + uint32_t split_k_index = 0; - const uint32_t i = gl_WorkGroupID.x; + if (p.k_num > 1) { + i = 0; + split_k_index = gl_WorkGroupID.x; + } + + const uint32_t Tr = CEIL_DIV(N, Br); + + const uint32_t start_j = split_k_index * p.split_kv / Bc; + const uint32_t end_j = CEIL_DIV(min(KV, (split_k_index + 1) * p.split_kv), Bc); // When not using grouped query attention, all rows share the same iq2, equal to gl_WorkGroupID.y. // When using grouped query attention, each workgroup does gqa_ratio consecutive values of iq2. 
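
The flash_attn_split_k_reduce.comp shader added by this patch combines the per-split partial results with the usual running-max rescaling. A host-side C++ sketch of the same math on toy data (reduce_splits and all values are illustrative only):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Combine k_num partial flash-attention results for one output row.
    // Each split contributes an un-normalized accumulator O[k], a row sum L[k] and a row max m[k].
    static std::vector<float> reduce_splits(const std::vector<std::vector<float>> & O,
                                            const std::vector<float> & L,
                                            const std::vector<float> & m) {
        const size_t k_num = O.size();
        const size_t D     = O[0].size();

        float m_max = -INFINITY;
        for (size_t k = 0; k < k_num; ++k) {
            m_max = std::max(m_max, m[k]);
        }

        float L_sum = 0.0f;
        for (size_t k = 0; k < k_num; ++k) {
            L_sum += std::exp(m[k] - m_max) * L[k];      // rescale each split's softmax denominator
        }

        std::vector<float> out(D, 0.0f);
        for (size_t k = 0; k < k_num; ++k) {
            const float scale = std::exp(m[k] - m_max) / L_sum;
            for (size_t d = 0; d < D; ++d) {
                out[d] += scale * O[k][d];               // rescale and accumulate the partial outputs
            }
        }
        return out;
    }

    int main() {
        // two splits of a toy 2-wide head
        const auto out = reduce_splits({{1.0f, 2.0f}, {3.0f, 4.0f}}, {1.5f, 2.5f}, {0.0f, 1.0f});
        std::printf("%f %f\n", out[0], out[1]);
    }
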
@@ -218,7 +238,7 @@ void main() { } [[dont_unroll]] - for (uint32_t j = 0; j < Tc; ++j) { + for (uint32_t j = start_j; j < end_j; ++j) { coopmat S = coopmat(0); @@ -312,6 +332,20 @@ void main() { O = coopMatMulAdd(P_A, V, O); } + // If there is split_k, then the split_k resolve shader does the final + // division by L. Store the intermediate O value and per-row m and L values. + if (p.k_num > 1) { + coopmat O_D = coopmat(O); + + uint32_t o_offset = D * p.ne1 * split_k_index; + coopMatPerElementNV(O_D, O_D, perElemOpGqaStore, o_offset, iq2, N); + + o_offset = D * p.ne1 * p.k_num + p.ne1 * split_k_index * 2; + coopMatPerElementNV(L, L, perElemOpStoreCol0, o_offset, iq2, N); + coopMatPerElementNV(M, M, perElemOpStoreCol0, o_offset + p.ne1, iq2, N); + return; + } + coopmat Ldiag; // resize L by using smear/reduce diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp new file mode 100644 index 000000000..a7e395685 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp @@ -0,0 +1,59 @@ +#version 450 + +#extension GL_EXT_control_flow_attributes : enable + +#define BLOCK_SIZE 32 + +layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; + +layout (binding = 0) readonly buffer A {float data_a[];}; +layout (binding = 1) writeonly buffer D {float data_d[];}; + +layout (push_constant) uniform parameter { + uint D; + uint N; + uint k_num; +} p; + +void main() { + // Each workgroup handles a row + const uint n = gl_WorkGroupID.x; + const uint tid = gl_LocalInvocationID.x; + + uint D = p.D; + uint N = p.N; + uint k_num = p.k_num; + + uint l_offset = D * N * k_num + n; + uint m_offset = D * N * k_num + N + n; + uint lm_stride = N * 2; + + // Compute the max m value for the row + float m_max = -1.0/0.0; + [[unroll]] for (uint k = 0; k < k_num; ++k) { + float m = data_a[m_offset + k * lm_stride]; + m_max = max(m_max, m); + } + + // Compute L based on m_max + float L = 0; + [[unroll]] for (uint k = 0; k < k_num; ++k) { + float l = data_a[l_offset + k * lm_stride]; + float m = data_a[m_offset + k * lm_stride]; + L += exp(m - m_max) * l; + } + + L = 1.0 / L; + + // Scale and sum the O contributions based on m_max and store the result to memory + for (uint d = tid; d < D; d += BLOCK_SIZE) { + float O = 0.0; + [[unroll]] for (uint k = 0; k < k_num; ++k) { + uint o_offset = D * N * k + D * n + d; + float m = data_a[m_offset + k * lm_stride]; + O += exp(m - m_max) * data_a[o_offset]; + } + O *= L; + data_d[D * n + d] = O; + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 2ac4caee7..cf74625cc 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -465,6 +465,7 @@ void process_shaders() { string_to_spv("acc_f32", "acc.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {}); + string_to_spv("fa_split_k_reduce", "flash_attn_split_k_reduce.comp", {}); string_to_spv("quantize_q8_1", "quantize_q8_1.comp", {}); string_to_spv("mul_f32", "mul.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 426a9557c..e61a126cf 100644 --- a/tests/test-backend-ops.cpp +++ 
b/tests/test-backend-ops.cpp @@ -4516,6 +4516,12 @@ static std::vector> make_test_cases_perf() { } } + for (int kv : { 4096, 8192, 16384, }) { + for (int hs : { 64, 128, }) { + test_cases.emplace_back(new test_flash_attn_ext(hs, hs, 8, 4, kv, 1, true, 0, 0, GGML_PREC_F32, GGML_TYPE_F16)); + } + } + return test_cases; } From 97a20c012be2f9bfbeee0209405a31001f93ccf9 Mon Sep 17 00:00:00 2001 From: lhez Date: Wed, 2 Apr 2025 17:01:42 -0700 Subject: [PATCH 16/20] opencl: use `max_alloc_size` in backend ctx instead of querying again (#12705) --- ggml/src/ggml-opencl/ggml-opencl.cpp | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 723cab8b1..6806c1397 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -924,27 +924,24 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { // TODO: fixme: these sizes are hardcoded for now. // they should be allocated based on the model's size // and the device's max alloc size - size_t max_alloc_size; - CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_alloc_size, NULL)); - // Allocate intermediate buffers and images size_t required_A_q_d_bytes = 311164928; size_t required_A_s_d_bytes = 38895616; size_t required_B_d_bytes = 45088768; // Ensure buffer sizes do not exceed the maximum allocation size - size_t max_A_q_d_bytes = MIN(required_A_q_d_bytes, max_alloc_size); - size_t max_A_s_d_bytes = MIN(required_A_s_d_bytes, max_alloc_size); - size_t max_B_d_bytes = MIN(required_B_d_bytes, max_alloc_size); - if (required_A_q_d_bytes > max_alloc_size) { + size_t max_A_q_d_bytes = MIN(required_A_q_d_bytes, backend_ctx->max_alloc_size); + size_t max_A_s_d_bytes = MIN(required_A_s_d_bytes, backend_ctx->max_alloc_size); + size_t max_B_d_bytes = MIN(required_B_d_bytes, backend_ctx->max_alloc_size); + if (required_A_q_d_bytes > backend_ctx->max_alloc_size) { GGML_LOG_WARN("ggml_opencl: A_q_d buffer size reduced from %zu to %zu due to device limitations.\n", required_A_q_d_bytes, max_A_q_d_bytes); } - if (required_A_s_d_bytes > max_alloc_size) { + if (required_A_s_d_bytes > backend_ctx->max_alloc_size) { GGML_LOG_WARN("ggml_opencl: A_s_d buffer size reduced from %zu to %zu due to device limitations.\n", required_A_s_d_bytes, max_A_s_d_bytes); } - if (required_B_d_bytes > max_alloc_size) { + if (required_B_d_bytes > backend_ctx->max_alloc_size) { GGML_LOG_WARN("ggml_opencl: B_d buffer size reduced from %zu to %zu due to device limitations.\n", required_B_d_bytes, max_B_d_bytes); } From 2a0dc97e56eac6db0a4016f0b45da6d0a0055ef2 Mon Sep 17 00:00:00 2001 From: hipudding Date: Thu, 3 Apr 2025 08:49:51 +0800 Subject: [PATCH 17/20] CANN: Fix failed test cases (#12708) * CANN: Fix memory waste in aclnn_tensor * CANN: fix backend ops fail * CANN: fix acl_tensor memory alloc. 
* CANN: format * CANN: remove trailing whitespace --- ggml/src/ggml-cann/acl_tensor.cpp | 10 ++++++---- ggml/src/ggml-cann/acl_tensor.h | 10 +++++----- ggml/src/ggml-cann/aclnn_ops.cpp | 6 +----- ggml/src/ggml-cann/aclnn_ops.h | 6 ------ ggml/src/ggml-cann/ggml-cann.cpp | 28 ++++++++++++++++++---------- 5 files changed, 30 insertions(+), 30 deletions(-) diff --git a/ggml/src/ggml-cann/acl_tensor.cpp b/ggml/src/ggml-cann/acl_tensor.cpp index d120ce6ac..9b6553c50 100644 --- a/ggml/src/ggml-cann/acl_tensor.cpp +++ b/ggml/src/ggml-cann/acl_tensor.cpp @@ -54,9 +54,7 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne, // added. int64_t acl_ne[GGML_MAX_DIMS * 2], acl_stride[GGML_MAX_DIMS * 2]; - int64_t acl_storage_len = 0; if (ne == nullptr) { - acl_storage_len = ggml_nbytes(tensor); for (int i = 0; i < GGML_MAX_DIMS; i++) { acl_ne[i] = tensor->ne[i]; // The step size of acl is in elements. @@ -65,14 +63,18 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne, } else { // With bcast for (int i = 0; i < dims; i++) { - acl_storage_len += (ne[i] - 1) * nb[i]; acl_ne[i] = ne[i]; acl_stride[i] = nb[i] / ggml_element_size(tensor); } } - // Reverse ne and stride. int64_t final_dims = (dims == 0 ? GGML_MAX_DIMS : dims); + int64_t acl_storage_len = 1; + for (int i = 0; i < final_dims; i++) { + acl_storage_len += (acl_ne[i] - 1) * acl_stride[i]; + } + + // Reverse ne and stride. std::reverse(acl_ne, acl_ne + final_dims); std::reverse(acl_stride, acl_stride + final_dims); diff --git a/ggml/src/ggml-cann/acl_tensor.h b/ggml/src/ggml-cann/acl_tensor.h index 4734a9cb8..93f09937e 100644 --- a/ggml/src/ggml-cann/acl_tensor.h +++ b/ggml/src/ggml-cann/acl_tensor.h @@ -101,14 +101,14 @@ aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype, tmp_stride[i] = nb[i] / type_size; } + int64_t acl_storage_len = 1; + for (int i = 0; i < dims; i++) { + acl_storage_len += (tmp_ne[i] - 1) * tmp_stride[i]; + } + std::reverse(tmp_ne, tmp_ne + dims); std::reverse(tmp_stride, tmp_stride + dims); - int64_t acl_storage_len = 0; - for (int i = 0; i < dims; i++) { - acl_storage_len += (ne[i] - 1) * nb[i]; - } - aclTensor* acl_tensor = aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size, format, &acl_storage_len, 1, data_ptr); diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 8482bb537..ae13730c0 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -358,8 +358,6 @@ void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst) { void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_tensor* src = dst->src[0]; - GGML_ASSERT(src->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); float min; float max; @@ -1090,8 +1088,6 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { float eps; memcpy(&eps, dst->op_params, sizeof(float)); - GGML_ASSERT(eps > 0.0f); - uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; @@ -3152,7 +3148,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { // TODO: use ascendc // Only test with LLAMA model. ggml_tensor* src0 = dst->src[0]; // input - ggml_tensor* src2 = dst->src[2]; // freq_factors + // ggml_tensor* src2 = dst->src[2]; // freq_factors, not used now. 
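
The acl_tensor changes earlier in this patch size the buffer as the span of the strided view: one plus the flat index of the last reachable element. A standalone check of that arithmetic (storage_span and the shapes are illustrative, not taken from the CANN code):

    #include <cstdint>
    #include <cstdio>

    // Span of a strided view, in elements: the last reachable element sits at
    // flat index sum((ne[i] - 1) * stride[i]), so the required storage is that plus one.
    static int64_t storage_span(const int64_t * ne, const int64_t * stride, int dims) {
        int64_t span = 1;
        for (int i = 0; i < dims; ++i) {
            span += (ne[i] - 1) * stride[i];
        }
        return span;
    }

    int main() {
        // made-up example: a 4x3 view over rows padded to 8 elements (strides in elements)
        const int64_t ne[2]     = {4, 3};
        const int64_t stride[2] = {1, 8};
        std::printf("%lld\n", (long long) storage_span(ne, stride, 2)); // (4-1)*1 + (3-1)*8 + 1 = 20
    }
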
// param float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; diff --git a/ggml/src/ggml-cann/aclnn_ops.h b/ggml/src/ggml-cann/aclnn_ops.h index 680129c76..51a5cf92f 100644 --- a/ggml/src/ggml-cann/aclnn_ops.h +++ b/ggml/src/ggml-cann/aclnn_ops.h @@ -535,9 +535,6 @@ template src[0]; - GGML_ASSERT(src->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - aclTensor* acl_src = ggml_cann_create_tensor(src); aclTensor* acl_dst = ggml_cann_create_tensor(dst); @@ -566,9 +563,6 @@ template src[0]; - GGML_ASSERT(src->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - aclTensor* acl_src = ggml_cann_create_tensor(src); aclTensor* acl_dst = ggml_cann_create_tensor(dst); diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index da75f77f5..3527bd298 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -1458,11 +1458,6 @@ static void ggml_backend_cann_free(ggml_backend_t backend) { ACL_CHECK(aclrtSynchronizeDevice()); ACL_CHECK(aclrtResetDevice(cann_ctx->device)); - // finalize when last backend freed. - if (cann_ctx->device == ggml_backend_cann_get_device_count() - 1) { - ACL_CHECK(aclFinalize()); - } - delete cann_ctx; delete backend; } @@ -1688,11 +1683,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, } case GGML_OP_MUL_MAT: { switch (op->src[0]->type) { - case GGML_TYPE_Q8_0: case GGML_TYPE_F16: case GGML_TYPE_F32: - case GGML_TYPE_Q4_0: return true; + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q4_0: + // only support contiguous for quantized types. + return ggml_is_contiguous(op->src[0]) && + ggml_is_contiguous(op->src[1]); default: return false; } @@ -1738,13 +1736,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, } case GGML_OP_ROPE: { // TODO: with ops-test v == 1 - float * ext_factor = (float*)((int32_t*)op->op_params + 7); + float ext_factor = 0.0f; + memcpy(&ext_factor, (const float *) op->op_params + 7, sizeof(float)); // TODO: n_dims <= ne0 if (op->src[0]->ne[0] != op->op_params[1]) { return false; } // TODO: ext_factor != 0 - if (*ext_factor != 0) { + if (ext_factor != 0) { return false; } @@ -1766,6 +1765,16 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, } return true; } + case GGML_OP_POOL_2D: { + const int32_t * opts = (const int32_t *) op->op_params; + const int k0 = opts[1]; + const int k1 = opts[2]; + const int p0 = opts[5]; + const int p1 = opts[6]; + // value of paddingH should be at most half of kernelH + // value of paddingW should be at most half of kernelW + return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2)); + } case GGML_OP_DUP: case GGML_OP_IM2COL: case GGML_OP_CONCAT: @@ -1785,7 +1794,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, case GGML_OP_CLAMP: case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: - case GGML_OP_POOL_2D: case GGML_OP_SUM_ROWS: case GGML_OP_ARGSORT: case GGML_OP_ACC: From 3f9da22c2b21a2cef216de50006436ef1cab8764 Mon Sep 17 00:00:00 2001 From: Alan Gray Date: Thu, 3 Apr 2025 02:31:15 +0100 Subject: [PATCH 18/20] Simplify and improve CUDA graphs through use of indirect copy pointers (#9017) * CUDA: Simplify and improve CUDA graphs through use of indirect copy pointers Previously there was complexity in the CUDA graphs implementation due frequently changing parameters to copy kernels associated with K and V cache pointers. This patch simplifies by using indirection to avoid such parameters frequently changing, avoiding the need for frequent graph updates. 
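
A rough plain-C++ model of the indirection idea described above (no real CUDA API calls; dest_ptrs stands in for the device-side pointer table and copy_kernel for the cpy kernels, so only the table contents change while the captured argument list stays fixed):

    #include <cstdio>
    #include <vector>

    // The "kernel" keeps the same parameters across launches; when indirection is in
    // use it reads its destination from the table instead of a baked-in pointer.
    static void copy_kernel(float value, float * dst_direct,
                            float * const * dst_indirect, int node_index) {
        float * dst = dst_indirect ? dst_indirect[node_index] : dst_direct;
        *dst = value;
    }

    int main() {
        float kv_a = 0.0f;
        float kv_b = 0.0f;

        // "graph capture": the argument list below never changes between launches
        std::vector<float *> dest_ptrs = {&kv_a};
        copy_kernel(1.0f, /*dst_direct=*/nullptr, dest_ptrs.data(), /*node_index=*/0);

        // the KV cache pointer moves: rewrite only the table, not the captured kernel params
        dest_ptrs[0] = &kv_b;
        copy_kernel(2.0f, /*dst_direct=*/nullptr, dest_ptrs.data(), /*node_index=*/0);

        std::printf("kv_a=%.1f kv_b=%.1f\n", kv_a, kv_b); // kv_a=1.0 kv_b=2.0
    }
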
Fixes #12152 * Addressed comments * fix HIP builds * properly sync to stream * removed ggml_cuda_cpy_fn_ptrs * move stream sync before free * guard to only use indirection with graphs * style fixes * check for errors --------- Co-authored-by: slaren --- ggml/src/ggml-cuda/common.cuh | 8 +- ggml/src/ggml-cuda/cpy.cu | 140 ++++++++++++++++++++------------ ggml/src/ggml-cuda/cpy.cuh | 2 + ggml/src/ggml-cuda/ggml-cuda.cu | 93 ++++++--------------- 4 files changed, 122 insertions(+), 121 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index a718b6a12..8284a0017 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -729,7 +729,13 @@ struct ggml_cuda_graph { bool disable_due_to_failed_graph_capture = false; int number_consecutive_updates = 0; std::vector ggml_graph_properties; - std::vector updated_kernel_arg; + bool use_cpy_indirection = false; + std::vector cpy_dest_ptrs; + char ** dest_ptrs_d; + int dest_ptrs_size = 0; + // Index to allow each cpy kernel to be aware of it's position within the graph + // relative to other cpy nodes. + int graph_cpynode_index = -1; #endif }; diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu index cca2bee0b..1f3151e75 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -32,16 +32,18 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) { } template -static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne, +static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13) { + const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) { const int64_t i = blockDim.x*blockIdx.x + threadIdx.x; if (i >= ne) { return; } + char * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index]: cdst_direct; + // determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor // then combine those indices with the corresponding byte offsets to get the total offsets const int64_t i03 = i/(ne00 * ne01 * ne02); @@ -288,16 +290,18 @@ static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) { } template -static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne, +static __global__ void cpy_f32_q(const char * cx, char * cdst_direct, const int ne, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13) { + const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) { const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk; if (i >= ne) { return; } + char * cdst = (cdst_indirect != nullptr) ? 
cdst_indirect[graph_cpynode_index]: cdst_direct; + const int i03 = i/(ne00 * ne01 * ne02); const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01); const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00; @@ -314,16 +318,18 @@ static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne, } template -static __global__ void cpy_q_f32(const char * cx, char * cdst, const int ne, +static __global__ void cpy_q_f32(const char * cx, char * cdst_direct, const int ne, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13) { + const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) { const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk; if (i >= ne) { return; } + char * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index]: cdst_direct; + const int i03 = i/(ne00 * ne01 * ne02); const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01); const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00; @@ -339,66 +345,84 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst, const int ne, cpy_blck(cx + x_offset, cdst + dst_offset); } +// Copy destination pointers to GPU to be available when pointer indirection is in use + +void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream) { +#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) + if (cuda_graph->dest_ptrs_size < host_dest_ptrs_size) { // (re-)allocate GPU memory for destination pointers + CUDA_CHECK(cudaStreamSynchronize(stream)); + if (cuda_graph->dest_ptrs_d != nullptr) { + CUDA_CHECK(cudaFree(cuda_graph->dest_ptrs_d)); + } + CUDA_CHECK(cudaMalloc(&cuda_graph->dest_ptrs_d, host_dest_ptrs_size*sizeof(char *))); + cuda_graph->dest_ptrs_size = host_dest_ptrs_size; + } + // copy destination pointers to GPU + CUDA_CHECK(cudaMemcpyAsync(cuda_graph->dest_ptrs_d, host_dest_ptrs, host_dest_ptrs_size*sizeof(char *), cudaMemcpyHostToDevice, stream)); + cuda_graph->graph_cpynode_index = 0; // reset index +#endif +} + static void ggml_cpy_f16_f32_cuda( const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) { const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; cpy_f32_f16<<>> - (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++); } static void ggml_cpy_f32_f32_cuda( const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, 
cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) { const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; cpy_f32_f16<<>> - (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++); } static void ggml_cpy_f32_f16_cuda( const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) { const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; cpy_f32_f16<<>> - (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++); } static void ggml_cpy_f32_q8_0_cuda( const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) { GGML_ASSERT(ne % QK8_0 == 0); const int num_blocks = ne / QK8_0; cpy_f32_q<<>> - (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++); } static void ggml_cpy_q8_0_f32_cuda( const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) { const int num_blocks = ne; cpy_q_f32<<>> - (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++); } static void ggml_cpy_f32_q4_0_cuda( const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) { GGML_ASSERT(ne % QK4_0 == 0); const int num_blocks = ne / QK4_0; 
cpy_f32_q<<>> - (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++); } static void ggml_cpy_q4_0_f32_cuda( @@ -407,22 +431,22 @@ static void ggml_cpy_q4_0_f32_cuda( const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, - cudaStream_t stream) { + cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) { const int num_blocks = ne; cpy_q_f32, QK4_0><<>>( cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, - ne10, ne11, ne12, nb10, nb11, nb12, nb13); + ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++); } static void ggml_cpy_f32_q4_1_cuda( const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) { GGML_ASSERT(ne % QK4_1 == 0); const int num_blocks = ne / QK4_1; cpy_f32_q<<>> - (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++); } static void ggml_cpy_q4_1_f32_cuda( @@ -431,22 +455,22 @@ static void ggml_cpy_q4_1_f32_cuda( const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, - cudaStream_t stream) { + cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) { const int num_blocks = ne; cpy_q_f32, QK4_1><<>>( cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, - ne10, ne11, ne12, nb10, nb11, nb12, nb13); + ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++); } static void ggml_cpy_f32_q5_0_cuda( const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) { GGML_ASSERT(ne % QK5_0 == 0); const int num_blocks = ne / QK5_0; cpy_f32_q<<>> - (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++); } static void ggml_cpy_q5_0_f32_cuda( @@ -455,22 +479,22 @@ static void ggml_cpy_q5_0_f32_cuda( const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, - cudaStream_t stream) { + cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) { 
const int num_blocks = ne; cpy_q_f32, QK5_0><<>>( cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, - ne10, ne11, ne12, nb10, nb11, nb12, nb13); + ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++); } static void ggml_cpy_f32_q5_1_cuda( const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) { GGML_ASSERT(ne % QK5_1 == 0); const int num_blocks = ne / QK5_1; cpy_f32_q<<>> - (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++); } static void ggml_cpy_q5_1_f32_cuda( @@ -479,32 +503,32 @@ static void ggml_cpy_q5_1_f32_cuda( const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, - cudaStream_t stream) { + cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) { const int num_blocks = ne; cpy_q_f32, QK5_1><<>>( cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, - ne10, ne11, ne12, nb10, nb11, nb12, nb13); + ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++); } static void ggml_cpy_f32_iq4_nl_cuda( const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) { GGML_ASSERT(ne % QK4_NL == 0); const int num_blocks = ne / QK4_NL; cpy_f32_q<<>> - (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++); } static void ggml_cpy_f16_f16_cuda( const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) { const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; cpy_f32_f16<<>> - (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++); } void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) { @@ -541,46 +565,60 @@ void 
ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg char * src0_ddc = (char *) src0->data; char * src1_ddc = (char *) src1->data; + char ** dest_ptrs_d = nullptr; + int graph_cpynode_index = -1; +#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) + if(ctx.cuda_graph->use_cpy_indirection) { + dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d; + graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index; + } +#endif if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) { GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1)); CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream)); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { - ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { - ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) { - ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) { - ggml_cpy_q8_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + ggml_cpy_q8_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) { - ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); } else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_F32) { ggml_cpy_q4_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, - nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) { - ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); } else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_F32) { ggml_cpy_q4_1_f32_cuda(src0_ddc, 
src1_ddc, ne, ne00, ne01, ne02, - nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) { - ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); } else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_F32) { ggml_cpy_q5_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, - nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) { - ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) { - ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) { - ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) { - ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { - ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); } else { GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), ggml_type_name(src1->type)); } +#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) + if(ctx.cuda_graph->use_cpy_indirection) { + ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index; + } +#endif + } void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { diff --git a/ggml/src/ggml-cuda/cpy.cuh b/ggml/src/ggml-cuda/cpy.cuh index 28b06cdda..6bed0564d 100644 --- a/ggml/src/ggml-cuda/cpy.cuh +++ b/ggml/src/ggml-cuda/cpy.cuh @@ -7,3 
+7,5 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1); + +void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream); diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 861927654..57319bafd 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2441,10 +2441,11 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { #ifdef USE_CUDA_GRAPH static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, - std::vector & ggml_cuda_cpy_fn_ptrs, bool use_cuda_graph) { + bool use_cuda_graph) { // Loop over nodes in GGML graph to obtain info needed for CUDA graph - cuda_ctx->cuda_graph->updated_kernel_arg.clear(); + cuda_ctx->cuda_graph->cpy_dest_ptrs.clear(); + for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -2476,8 +2477,11 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud } if (node->op == GGML_OP_CPY) { - // store the copy op parameter which changes with each token. - cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data)); + + // Store the pointers which are updated for each token, such that these can be sent + // to the device and accessed using indirection from CUDA graph + cuda_ctx->cuda_graph->cpy_dest_ptrs.push_back((char *) node->src[1]->data); + // store a pointer to each copy op CUDA kernel to identify it later void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]); if (!ptr) { @@ -2485,10 +2489,6 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud #ifndef NDEBUG GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported copy op\n", __func__); #endif - } else { - if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) { - ggml_cuda_cpy_fn_ptrs.push_back(ptr); - } } } @@ -2497,6 +2497,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud } } + if (use_cuda_graph) { + cuda_ctx->cuda_graph->use_cpy_indirection = true; + // copy pointers to GPU so they can be accessed via indirection within CUDA graph + ggml_cuda_cpy_dest_ptrs_copy(cuda_ctx->cuda_graph.get(), cuda_ctx->cuda_graph->cpy_dest_ptrs.data(), cuda_ctx->cuda_graph->cpy_dest_ptrs.size(), cuda_ctx->stream()); + } + return use_cuda_graph; } @@ -2551,51 +2557,6 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra return true; } -static void maintain_cuda_graph(ggml_backend_cuda_context * cuda_ctx, std::vector & ggml_cuda_cpy_fn_ptrs, bool cuda_graph_update_required) { - - if (cuda_graph_update_required) { - // Extract nodes from graph - // First call with null argument gets number of nodes in graph - CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes)); - // Subsequent call with non-null argument gets nodes - cuda_ctx->cuda_graph->nodes.clear(); - cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes); - cuda_ctx->cuda_graph->params.clear(); - cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes); - if (cuda_ctx->cuda_graph->num_nodes > 0) { - CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, 
cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes)); - - // Loop over nodes, and extract kernel parameters from each node - for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) { - cudaGraphNodeType node_type; - CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type)); - if (node_type == cudaGraphNodeTypeKernel) { - cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime - if (stat == cudaErrorInvalidDeviceFunction) { - // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node. - // We don't need to update blas nodes, so clear error and move on. - (void)cudaGetLastError(); - } else { - GGML_ASSERT(stat == cudaSuccess); - } - } - } - } - } else { - // One of the arguments to the copy kernel is updated for each token, hence we need to - // replace that argument with the updated value in the CUDA graph - // on update steps, the live parameters will already be captured - int k = 0; - for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) { - if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) { - char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++); - *(void**)cuda_ctx->cuda_graph->params[i].kernelParams[1] = *(void**)updated_kernel_arg_ptr; - CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i])); - } - } - } -} - static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) { bool cuda_graph_update_required = false; @@ -2655,8 +2616,7 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) { #endif static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, - [[maybe_unused]] std::vector & ggml_cuda_cpy_fn_ptrs, bool & graph_evaluated_or_captured, bool & use_cuda_graph, - bool & cuda_graph_update_required) { + bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) { while (!graph_evaluated_or_captured) { // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph. @@ -2706,13 +2666,9 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph. 
CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0)); } - - // Perform update to graph (if required for this token), and change copy parameter (required for every token) - maintain_cuda_graph(cuda_ctx, ggml_cuda_cpy_fn_ptrs, cuda_graph_update_required); - - // Update graph executable - update_cuda_graph_executable(cuda_ctx); - + if (cuda_graph_update_required) { // Update graph executable + update_cuda_graph_executable(cuda_ctx); + } // Launch graph CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream())); #else @@ -2726,10 +2682,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cuda_set_device(cuda_ctx->device); - // vector of pointers to CUDA cpy kernels, which are required to identify - // kernel parameters which need updated in the graph for each token - std::vector ggml_cuda_cpy_fn_ptrs; - #ifdef USE_CUDA_GRAPH static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr); @@ -2763,8 +2715,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, if (use_cuda_graph) { cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph); - use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, - ggml_cuda_cpy_fn_ptrs, use_cuda_graph); + use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, use_cuda_graph); // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates. if (use_cuda_graph && cuda_graph_update_required) { @@ -2785,6 +2736,10 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed)); } + if (!use_cuda_graph) { + cuda_ctx->cuda_graph->use_cpy_indirection = false; + } + #else bool use_cuda_graph = false; bool cuda_graph_update_required = false; @@ -2792,7 +2747,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, bool graph_evaluated_or_captured = false; - evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, ggml_cuda_cpy_fn_ptrs, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required); + evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required); return GGML_STATUS_SUCCESS; } From 65cfe136a0793b2fdf5af8bdb1ab2cf10c3e1d2e Mon Sep 17 00:00:00 2001 From: Chenguang Li <757486878@qq.com> Date: Thu, 3 Apr 2025 15:18:08 +0800 Subject: [PATCH 19/20] CANN: Support operator SIN COS ARGMAX (#12709) * [CANN]support sin cos argmax Signed-off-by: noemotiovon * [CANN]codestyle adjustment Signed-off-by: noemotiovon * [CANN]Remove redundant code Signed-off-by: noemotiovon --------- Signed-off-by: noemotiovon Co-authored-by: noemotiovon --- ggml/src/ggml-cann/aclnn_ops.cpp | 44 ++++++++++++++++++++++++++++++++ ggml/src/ggml-cann/aclnn_ops.h | 41 +++++++++++++++++++++++++++++ ggml/src/ggml-cann/ggml-cann.cpp | 12 +++++++++ 3 files changed, 97 insertions(+) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index ae13730c0..f5734cbab 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -3440,3 +3441,46 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor)); 
ACL_CHECK(aclDestroyTensor(acl_dst)); } + + + void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst){ + ggml_tensor * src0 = dst->src[0]; + + aclTensor* acl_src = ggml_cann_create_tensor(src0); + aclTensor* acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnArgMaxGetWorkspaceSize(acl_src, 3, false, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + ACL_CHECK(aclnnArgMax(workspaceAddr, workspaceSize, executor, ctx.stream())); + + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +void ggml_cann_cos(ggml_backend_cann_context& ctx, ggml_tensor* dst){ + ggml_tensor * src0 = dst->src[0]; + + aclTensor* acl_src = ggml_cann_create_tensor(src0); + aclTensor* acl_dst = ggml_cann_create_tensor(dst); + aclnn_cos(ctx, acl_src, acl_dst); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +void ggml_cann_sin(ggml_backend_cann_context& ctx, ggml_tensor* dst){ + ggml_tensor * src0 = dst->src[0]; + + aclTensor* acl_src = ggml_cann_create_tensor(src0); + aclTensor* acl_dst = ggml_cann_create_tensor(dst); + aclnn_sin(ctx, acl_src, acl_dst); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} diff --git a/ggml/src/ggml-cann/aclnn_ops.h b/ggml/src/ggml-cann/aclnn_ops.h index 51a5cf92f..132790503 100644 --- a/ggml/src/ggml-cann/aclnn_ops.h +++ b/ggml/src/ggml-cann/aclnn_ops.h @@ -484,6 +484,47 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst); */ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst); +/** + * @brief Computes the index of the maximum value along the specified dimension + * of a ggml tensor using the CANN backend. + * + * @details This function performs an argmax operation on the input tensor. + * It finds the index of the maximum value along the specified axis + * and stores these indices in the destination tensor `dst`. The + * operation is executed using the CANN backend for optimized performance. + * + * @param ctx The CANN context used for operations. + * @param dst The destination tensor where the indices of the maximum values will be stored. + * dst->op is `GGML_OP_ARGMAX`. + */ +void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +/** + * @brief Computes the cosine of each element in a ggml tensor using the CANN backend. + * + * @details This function applies the cosine function element-wise to the input tensor. + * The computed cosine values are stored in the destination tensor `dst`. + * The operation is optimized using the CANN backend for improved performance. + * + * @param ctx The CANN context used for operations. + * @param dst The destination tensor where the cosine values will be stored. + * dst->op is `GGML_OP_COS`. + */ +void ggml_cann_cos(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +/** + * @brief Computes the sine of each element in a ggml tensor using the CANN backend. + * + * @details This function applies the sine function element-wise to the input tensor. + * The computed sine values are stored in the destination tensor `dst`. + * The operation is optimized using the CANN backend for improved performance. + * + * @param ctx The CANN context used for operations. 
+ * @param dst The destination tensor where the sine values will be stored. + * dst->op is `GGML_OP_SIN`. + */ +void ggml_cann_sin(ggml_backend_cann_context& ctx, ggml_tensor* dst); + template diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 3527bd298..5e790f05f 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -1420,6 +1420,15 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx, case GGML_OP_ARGSORT: ggml_cann_argsort(ctx, dst); break; + case GGML_OP_ARGMAX: + ggml_cann_argmax(ctx, dst); + break; + case GGML_OP_COS: + ggml_cann_cos(ctx, dst); + break; + case GGML_OP_SIN: + ggml_cann_sin(ctx, dst); + break; default: return false; } @@ -1802,6 +1811,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, case GGML_OP_ARANGE: case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_LEAKY_RELU: + case GGML_OP_ARGMAX: + case GGML_OP_COS: + case GGML_OP_SIN: return true; default: return false; From 193c3e03a63ccda3ac3d6a2999e41e6d1414fe23 Mon Sep 17 00:00:00 2001 From: a3sh <38979186+A3shTnT@users.noreply.github.com> Date: Thu, 3 Apr 2025 15:32:55 +0800 Subject: [PATCH 20/20] fix MUSA compiler warning (#12704) * fix MUSA compiler warning * replace (void) with GGML_UNUSED --- ggml/src/ggml-cuda/ssm-conv.cu | 59 ++++++++++++++++------------------ ggml/src/ggml-cuda/ssm-scan.cu | 34 +++++++++----------- 2 files changed, 44 insertions(+), 49 deletions(-) diff --git a/ggml/src/ggml-cuda/ssm-conv.cu b/ggml/src/ggml-cuda/ssm-conv.cu index cfe03d68f..f63757196 100644 --- a/ggml/src/ggml-cuda/ssm-conv.cu +++ b/ggml/src/ggml-cuda/ssm-conv.cu @@ -4,13 +4,14 @@ template static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float * __restrict__ src1, const int src0_nb0, const int src0_nb1, const int src0_nb2, const int src1_nb1, float * __restrict__ dst, const int dst_nb0, const int dst_nb1, const int dst_nb2, - const int nc, const int ncs, const int nr, const int n_t, const int n_s) { + const int64_t n_t) { + GGML_UNUSED(src0_nb0); const int tid = threadIdx.x; const int bidx = blockIdx.x; const int bidy = blockIdx.y; - const float * x_block = (const float *) ((char *) src0 + bidx * src0_nb2 + bidy * split_d_inner * src0_nb1); - const float * w_block = (const float *) ((char *) src1 + bidy * split_d_inner * src1_nb1); + const float * x_block = (const float *) ((const char *) src0 + bidx * src0_nb2 + bidy * split_d_inner * src0_nb1); + const float * w_block = (const float *) ((const char *) src1 + bidy * split_d_inner * src1_nb1); float * y_block = (float *) ((char *) dst + bidx * dst_nb2 + bidy * split_d_inner * dst_nb0); const int stride_x = src0_nb1 / sizeof(float); @@ -21,15 +22,15 @@ static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float float w[d_conv] = { 0.0f }; #pragma unroll - for (int j = 0; j < d_conv; j++) { + for (size_t j = 0; j < d_conv; j++) { w[j] = w_block[tid * stride_w + j]; } - for (int i = 0; i < n_t; i++) { + for (int64_t i = 0; i < n_t; i++) { float sumf = 0.0f; if (i == 0) { - for (int j = 0; j < d_conv; j++) { + for (size_t j = 0; j < d_conv; j++) { x[j] = x_block[tid * stride_x + j]; } } else { @@ -37,27 +38,26 @@ static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float } #pragma unroll - for (int j = 0; j < d_conv; j++) { + for (size_t j = 0; j < d_conv; j++) { sumf += x[(i + j) % d_conv] * w[j]; } y_block[i * stride_y + tid] = sumf; } } -template +template static __global__ void ssm_conv_long_token_f32(const 
float * __restrict__ src0, const float * __restrict__ src1, const int src0_nb0, const int src0_nb1, const int src0_nb2, const int src1_nb1, float * __restrict__ dst, const int dst_nb0, - const int dst_nb1, const int dst_nb2, const int nc, const int ncs, - const int nr, const int n_t, const int n_s) { + const int dst_nb1, const int dst_nb2, const int64_t n_t) { const int tid = threadIdx.x; const int bidx = blockIdx.x; const int bidy = blockIdx.y; const int bidz = blockIdx.z; - const float * x_block = (const float *) ((char *) src0 + bidx * src0_nb2 + bidy * split_d_inner * src0_nb1 + + const float * x_block = (const float *) ((const char *) src0 + bidx * src0_nb2 + bidy * split_d_inner * src0_nb1 + bidz * split_n_t * src0_nb0); - const float * w_block = (const float *) ((char *) src1 + bidy * split_d_inner * src1_nb1); + const float * w_block = (const float *) ((const char *) src1 + bidy * split_d_inner * src1_nb1); float * y_block = (float *) ((char *) dst + bidx * dst_nb2 + bidz * split_n_t * dst_nb1 + bidy * split_d_inner * dst_nb0); @@ -69,17 +69,17 @@ static __global__ void ssm_conv_long_token_f32(const float * __restrict__ src0, float w[d_conv] = { 0.0f }; #pragma unroll - for (int j = 0; j < d_conv; j++) { + for (size_t j = 0; j < d_conv; j++) { w[j] = w_block[tid * stride_w + j]; } #pragma unroll - for (int i = 0; i < split_n_t; i++) { + for (int64_t i = 0; i < split_n_t; i++) { if (bidz * split_n_t + i < n_t) { float sumf = 0.0f; if (i == 0) { - for (int j = 0; j < d_conv; j++) { + for (size_t j = 0; j < d_conv; j++) { x[j] = x_block[tid * stride_x + j]; } } else { @@ -87,7 +87,7 @@ static __global__ void ssm_conv_long_token_f32(const float * __restrict__ src0, } #pragma unroll - for (int j = 0; j < d_conv; j++) { + for (size_t j = 0; j < d_conv; j++) { sumf += x[(i + j) % d_conv] * w[j]; } y_block[i * stride_y + tid] = sumf; @@ -97,8 +97,8 @@ static __global__ void ssm_conv_long_token_f32(const float * __restrict__ src0, static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int src0_nb0, const int src0_nb1, const int src0_nb2, const int src1_nb1, float * dst, const int dst_nb0, const int dst_nb1, - const int dst_nb2, const int nc, const int ncs, const int nr, const int n_t, - const int n_s, cudaStream_t stream) { + const int dst_nb2, const int64_t nc, const int64_t nr, const int64_t n_t, + const int64_t n_s, cudaStream_t stream) { const int threads = 128; GGML_ASSERT(nr % threads == 0); @@ -106,18 +106,16 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int const dim3 blocks(n_s, (nr + threads - 1) / threads, 1); if (nc == 4) { ssm_conv_f32<<>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, - dst, dst_nb0, dst_nb1, dst_nb2, nc, ncs, nr, n_t, - n_s); + dst, dst_nb0, dst_nb1, dst_nb2, n_t); } else { GGML_ABORT("Only support kernel size = 4 now."); } } else { if (nc == 4) { - const int split_n_t = 32; - dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t); - ssm_conv_long_token_f32 - <<>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, - dst_nb1, dst_nb2, nc, ncs, nr, n_t, n_s); + const int64_t split_n_t = 32; + dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t); + ssm_conv_long_token_f32<<>>( + src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t); } else { GGML_ABORT("Only support kernel size = 4 right now."); } @@ -128,11 +126,10 @@ void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { 
const struct ggml_tensor * src0 = dst->src[0]; // conv_x const struct ggml_tensor * src1 = dst->src[1]; // conv1d.weight - const int nc = src1->ne[0]; // d_conv - const int ncs = src0->ne[0]; // d_conv - 1 + n_t - const int nr = src0->ne[1]; // d_inner - const int n_t = dst->ne[1]; // tokens per sequence - const int n_s = dst->ne[2]; // number of sequences in the batch + const int64_t nc = src1->ne[0]; // d_conv + const int64_t nr = src0->ne[1]; // d_inner + const int64_t n_t = dst->ne[1]; // tokens per sequence + const int64_t n_s = dst->ne[2]; // number of sequences in the batch GGML_ASSERT(dst->ne[0] == nr); GGML_ASSERT(src0->nb[0] == sizeof(float)); @@ -147,5 +144,5 @@ void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); ssm_conv_f32_cuda(src0_d, src1_d, src0->nb[0], src0->nb[1], src0->nb[2], src1->nb[1], dst_d, dst->nb[0], dst->nb[1], - dst->nb[2], nc, ncs, nr, n_t, n_s, stream); + dst->nb[2], nc, nr, n_t, n_s, stream); } diff --git a/ggml/src/ggml-cuda/ssm-scan.cu b/ggml/src/ggml-cuda/ssm-scan.cu index 52db17cd9..37ee208c0 100644 --- a/ggml/src/ggml-cuda/ssm-scan.cu +++ b/ggml/src/ggml-cuda/ssm-scan.cu @@ -1,10 +1,5 @@ #include "ssm-scan.cuh" -// #include -// static __device__ void global_to_shared(const float *src, float *dst) { -// asm volatile("cp.async."); -// } - template __global__ void __launch_bounds__(splitD, 2) ssm_scan_f32(const float * __restrict__ src0, const float * __restrict__ src1, const float * __restrict__ src2, @@ -12,7 +7,9 @@ __global__ void __launch_bounds__(splitD, 2) const int src0_nb1, const int src0_nb2, const int src1_nb0, const int src1_nb1, const int src1_nb2, const int src1_nb3, const int src2_nb0, const int src2_nb1, const int src2_nb2, const int src3_nb1, const int src4_nb1, const int src4_nb2, const int src5_nb1, const int src5_nb2, - float * __restrict__ dst, const int D, const int L, const int B) { + float * __restrict__ dst, const int64_t L) { + GGML_UNUSED(src1_nb0); + GGML_UNUSED(src2_nb0); const int bidx = blockIdx.x; // split along B const int bidy = blockIdx.y; // split along D const int tid = threadIdx.x; @@ -25,12 +22,12 @@ __global__ void __launch_bounds__(splitD, 2) float * smem_A = smem; float * smem_s0 = smem_A + splitD * stride_sA; - const float * s0_block = (const float *) ((char *) src0 + bidx * src0_nb2 + bidy * splitD * src0_nb1); - const float * x_block = (const float *) ((char *) src1 + (bidx * src1_nb2) + bidy * splitD * sizeof(float)); - const float * dt_block = (const float *) ((char *) src2 + (bidx * src2_nb2) + bidy * splitD * sizeof(float)); - const float * A_block = (const float *) ((char *) src3 + bidy * splitD * src3_nb1); - const float * B_block = (const float *) ((char *) src4 + (bidx * src4_nb2)); - const float * C_block = (const float *) ((char *) src5 + (bidx * src5_nb2)); + const float * s0_block = (const float *) ((const char *) src0 + bidx * src0_nb2 + bidy * splitD * src0_nb1); + const float * x_block = (const float *) ((const char *) src1 + (bidx * src1_nb2) + bidy * splitD * sizeof(float)); + const float * dt_block = (const float *) ((const char *) src2 + (bidx * src2_nb2) + bidy * splitD * sizeof(float)); + const float * A_block = (const float *) ((const char *) src3 + bidy * splitD * src3_nb1); + const float * B_block = (const float *) ((const char *) src4 + (bidx * src4_nb2)); + const float * C_block = (const float *) ((const char *) src5 + (bidx * src5_nb2)); float * y_block = (float *) ((char *) 
dst + (bidx * src1_nb2) + bidy * splitD * sizeof(float)); float * s_block = (float *) ((char *) dst + src1_nb3 + bidx * src0_nb2 + bidy * splitD * src0_nb1); @@ -46,7 +43,7 @@ __global__ void __launch_bounds__(splitD, 2) // can N not be 16? for example 32? if (N == 16) { #pragma unroll - for (int i = 0; i < splitD / 4; i += 2) { + for (size_t i = 0; i < splitD / 4; i += 2) { float value = A_block[(wid * warpSize + i) * stride_A + wtid]; // todo: bank conflict // I am always confused with how to use the swizzling method to solve @@ -54,7 +51,7 @@ __global__ void __launch_bounds__(splitD, 2) smem_A[(wid * warpSize + i) * stride_sA + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value; } #pragma unroll - for (int i = 0; i < splitD / 4; i += 2) { + for (size_t i = 0; i < splitD / 4; i += 2) { float value = s0_block[(wid * warpSize + i) * stride_s0 + wtid]; smem_s0[(wid * warpSize + i) * stride_ss0 + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value; } @@ -62,7 +59,7 @@ __global__ void __launch_bounds__(splitD, 2) __syncthreads(); - for (int i = 0; i < L; i++) { + for (int64_t i = 0; i < L; i++) { float dt_soft_plus = dt_block[i * stride_dt + tid]; if (dt_soft_plus <= 20.0f) { dt_soft_plus = log1pf(exp(dt_soft_plus)); @@ -70,7 +67,7 @@ __global__ void __launch_bounds__(splitD, 2) float x_dt = x_block[i * stride_x + tid] * dt_soft_plus; float sumf = 0.0f; #pragma unroll - for (int j = 0; j < N; j++) { + for (size_t j = 0; j < N; j++) { float state = (smem_s0[tid * stride_ss0 + j] * expf(dt_soft_plus * smem_A[tid * stride_sA + j])) + (B_block[i * stride_B + j] * x_dt); sumf += state * C_block[i * stride_C + j]; @@ -90,7 +87,8 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa const int src1_nb0, const int src1_nb1, const int src1_nb2, const int src1_nb3, const int src2_nb0, const int src2_nb1, const int src2_nb2, const int src3_nb1, const int src4_nb1, const int src4_nb2, const int src5_nb1, const int src5_nb2, - float * dst, const int N, const int D, const int L, const int B, cudaStream_t stream) { + float * dst, const int64_t N, const int64_t D, const int64_t L, const int64_t B, + cudaStream_t stream) { const int threads = 128; // todo: consider D cannot be divided,does this situation exist? GGML_ASSERT(D % threads == 0); @@ -99,7 +97,7 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa if (N == 16) { ssm_scan_f32<128, 16><<>>( src0, src1, src2, src3, src4, src5, src0_nb1, src0_nb2, src1_nb0, src1_nb1, src1_nb2, src1_nb3, src2_nb0, - src2_nb1, src2_nb2, src3_nb1, src4_nb1, src4_nb2, src5_nb1, src5_nb2, dst, D, L, B); + src2_nb1, src2_nb2, src3_nb1, src4_nb1, src4_nb2, src5_nb1, src5_nb2, dst, L); } else { GGML_ABORT("doesn't support N!=16."); }
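For reference, a small self-contained sketch of the warning-suppression patterns applied in this patch (assumptions: GGML_UNUSED expands to a (void) cast, as the commit message implies; GGML_UNUSED_SKETCH and scale_rows below are invented names, not ggml code). Unused parameters are consumed through the macro and indices are sized to the data they walk, which silences the MUSA compiler's unused-parameter and sign-comparison warnings without changing any kernel's interface:

    #include <cuda_runtime.h>
    #include <cstdint>
    #include <cstdio>

    // Hypothetical stand-in for ggml's GGML_UNUSED macro.
    #define GGML_UNUSED_SKETCH(x) (void)(x)

    __global__ void scale_rows(const float * src, float * dst, const int64_t n, const int nb_unused) {
        GGML_UNUSED_SKETCH(nb_unused);   // keeps the signature, consumes the unused parameter
        const int64_t i = (int64_t) blockIdx.x*blockDim.x + threadIdx.x;
        if (i < n) {                     // int64_t index: no signed/unsigned mismatch in the comparison
            dst[i] = 2.0f*src[i];
        }
    }

    int main() {
        const int64_t n = 256;
        float * src;
        float * dst;
        cudaMalloc(&src, n*sizeof(float));
        cudaMalloc(&dst, n*sizeof(float));
        cudaMemset(src, 0, n*sizeof(float));

        scale_rows<<<(unsigned)((n + 127)/128), 128>>>(src, dst, n, /*nb_unused =*/ 0);
        cudaDeviceSynchronize();
        printf("ok\n");

        cudaFree(dst);
        cudaFree(src);
        return 0;
    }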