Merge commit '8c570c9496' into concedo_experimental

# Conflicts: # README.md # tests/test-backend-ops.cpp
2025-09-11 01:24:36 +00:00 · 2024-05-10 16:55:26 +08:00 · 2024-05-10 16:55:26 +08:00 · db82bad6f2
commit db82bad6f2
parent dbe72b959e 8c570c9496
18 changed files with 45581 additions and 35766 deletions
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@ -49,6 +49,10 @@ chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍

 if len(sys.argv) == 2:
    token = sys.argv[1]
+    if not token.startswith("hf_"):
+        logger.info("Huggingface token seems invalid")
+        logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
+        sys.exit(1)
 else:
    logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
    sys.exit(1)
@ -257,6 +261,7 @@ tests = [
    "3333333",
    "33333333",
    "333333333",
+    # "Cửa Việt", # llama-bpe fails on this
    chktxt,
 ]

--- a/examples/convert-llama2c-to-ggml/README.md
+++ b/examples/convert-llama2c-to-ggml/README.md
@ -2,7 +2,7 @@

 This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default.

-To convert the model first download the models from the [llma2.c](https://github.com/karpathy/llama2.c) repository:
+To convert the model first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository:

 `$ make -j`

--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@ -52,15 +52,15 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
                    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
                    float v;
                    if (type == GGML_TYPE_F16) {
-                        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) data + i);
+                        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
                    } else if (type == GGML_TYPE_F32) {
-                        v = *(float *) data + i;
+                        v = *(float *) &data[i];
                    } else if (type == GGML_TYPE_I32) {
-                        v = (float) *(int32_t *) data + i;
+                        v = (float) *(int32_t *) &data[i];
                    } else if (type == GGML_TYPE_I16) {
-                        v = (float) *(int16_t *) data + i;
+                        v = (float) *(int16_t *) &data[i];
                    } else if (type == GGML_TYPE_I8) {
-                        v = (float) *(int8_t *) data + i;
+                        v = (float) *(int8_t *) &data[i];
                    } else {
                        GGML_ASSERT(false);
                    }
--- a/ggml-cuda/common.cuh
+++ b/ggml-cuda/common.cuh
@ -234,122 +234,6 @@ typedef float dfloat; // dequantize float
 typedef float2 dfloat2;
 #endif //GGML_CUDA_F16

-[[noreturn]]
-static __device__ void no_device_code(
-    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
-           file_name, line, function_name, arch);
-    GGML_UNUSED(arch_list);
-#else
-    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
-           file_name, line, function_name, arch, arch_list);
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    __trap();
-
-    GGML_UNUSED(no_device_code); // suppress unused function warning
-}
-
-#ifdef __CUDA_ARCH__
-#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
-#else
-#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
-#endif // __CUDA_ARCH__
-
-static __device__ __forceinline__ float warp_reduce_sum(float x) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
-    }
-    return x;
-}
-
-static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
-        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
-    }
-    return a;
-}
-
-static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-#pragma unroll
-   for (int mask = 16; mask > 0; mask >>= 1) {
-       a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
-   }
-   return a;
-#else
-   GGML_UNUSED(a);
-   NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-}
-
-static __device__ __forceinline__ float warp_reduce_max(float x) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
-    }
-    return x;
-}
-
-static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
-
-#if CUDART_VERSION >= CUDART_HMAX
-    return __hmax(a, b);
-#else
-    return __half2float(a) > __half2float(b) ? a : b;
-#endif // CUDART_VERSION >= CUDART_HMAX
-
-#else
-    GGML_UNUSED(a);
-    GGML_UNUSED(b);
-    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
-}
-static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
-
-#if CUDART_VERSION >= CUDART_HMAX
-    return __hmax2(a, b);
-#else
-    half2 ret;
-    reinterpret_cast<half&>(ret.x) =  __low2float(a) >  __low2float(b) ?  __low2half(a) :  __low2half(b);
-    reinterpret_cast<half&>(ret.y) = __high2float(a) > __high2float(b) ? __high2half(a) : __high2half(b);
-    return ret;
-#endif // CUDART_VERSION >= CUDART_HMAX
-
-#else
-    GGML_UNUSED(a);
-    GGML_UNUSED(b);
-    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
-}
-
-static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-#pragma unroll
-   for (int mask = 16; mask > 0; mask >>= 1) {
-       x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
-   }
-   return x;
-#else
-   GGML_UNUSED(x);
-   NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-}
-
-#if CUDART_VERSION < CUDART_HMASK
-static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
-    const uint32_t mask_low  = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b)));
-    const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
-    return mask_low | mask_high;
-}
-#endif // CUDART_VERSION < 12000
-
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300

@ -433,11 +317,143 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 }
 #endif // defined(GGML_USE_HIPBLAS)

-#define FP16_AVAILABLE     defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) ? \
-    defined(RDNA1) || defined(RDNA2) || defined(RDNA3) : __CUDA_ARCH__ >= CC_PASCAL
+#define FP16_AVAILABLE (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL

 #define FP16_MMA_AVAILABLE !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA

+static bool fp16_mma_available(const int cc) {
+    return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
+}
+
+[[noreturn]]
+static __device__ void no_device_code(
+    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
+           file_name, line, function_name, arch);
+    GGML_UNUSED(arch_list);
+#else
+    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
+           file_name, line, function_name, arch, arch_list);
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    __trap();
+
+    GGML_UNUSED(no_device_code); // suppress unused function warning
+}
+
+#ifdef __CUDA_ARCH__
+#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
+#else
+#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
+#endif // __CUDA_ARCH__
+
+static __device__ __forceinline__ float warp_reduce_sum(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
+    }
+    return x;
+}
+
+static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+    }
+    return a;
+}
+
+static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
+#if FP16_AVAILABLE
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32);
+        reinterpret_cast<half&>(a.x) +=  __low2half(a_other);
+        reinterpret_cast<half&>(a.y) += __high2half(a_other);
+    }
+    return a;
+#else
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
+    }
+    return a;
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+
+#else
+    NO_DEVICE_CODE;
+    return a;
+#endif // FP16_AVAILABLE
+}
+
+static __device__ __forceinline__ float warp_reduce_max(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+    }
+    return x;
+}
+
+static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
+#if FP16_AVAILABLE
+
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
+    return __float2half(fmaxf(__half2float(a), __half2float(b)));
+#else
+    return __hmax(a, b);
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
+
+#else
+   NO_DEVICE_CODE;
+   GGML_UNUSED(b);
+   return a;
+#endif // FP16_AVAILABLE
+}
+
+static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+
+#if CUDART_VERSION >= CUDART_HMAX
+    return __hmax2(a, b);
+#else
+    half2 ret;
+    reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a),  __low2float(b)));
+    reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
+    return ret;
+#endif // CUDART_VERSION >= CUDART_HMAX
+
+#else
+    GGML_UNUSED(a);
+    GGML_UNUSED(b);
+    NO_DEVICE_CODE;
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+}
+
+static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#pragma unroll
+   for (int mask = 16; mask > 0; mask >>= 1) {
+       x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+   }
+   return x;
+#else
+   GGML_UNUSED(x);
+   NO_DEVICE_CODE;
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+}
+
+#if CUDART_VERSION < CUDART_HMASK
+static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
+    const uint32_t mask_low  = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b)));
+    const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
+    return mask_low | mask_high;
+}
+#endif // CUDART_VERSION < 12000
+
 // TODO: move to ggml-common.h
 static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

--- a/ggml-cuda/fattn.cu
+++ b/ggml-cuda/fattn.cu
@ -11,8 +11,10 @@
 #define HALF_MAX_HALF         __float2half(65504.0f/2) // Use neg. of this instead of -INFINITY to initialize KQ max vals to avoid NaN upon subtraction.
 #define SOFTMAX_FTZ_THRESHOLD -20.0f                   // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs.

-template<int D, int parallel_blocks> // D == head size
-__launch_bounds__(((D + WARP_SIZE - 1) / WARP_SIZE)*WARP_SIZE, 1)
+template<int D, int ncols, int parallel_blocks> // D == head size
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+__launch_bounds__(D, 1)
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_vec_ext_f16(
        const char * __restrict__ Q,
        const char * __restrict__ K,
@ -44,55 +46,77 @@ static __global__ void flash_attn_vec_ext_f16(
 #if FP16_AVAILABLE
    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.

-    const int ic = blockIdx.x / parallel_blocks; // Index of the Q/QKV column to work on.
-    const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
+    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
+    const int ip  =  blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.

    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    const float2 * Q_f2  = (const float2 *) (Q    + nb02* blockIdx.y              + nb01*ic);
+    const float2 * Q_f2  = (const float2 *) (Q    + nb02* blockIdx.y              + nb01*ic0);
    const half2  * K_h2  = (const half2  *) (K    + nb12*(blockIdx.y / gqa_ratio));
    const half   * V_h   = (const half   *) (V    + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
-    const half   * maskh = (const half   *)  mask + ne11*ic;
+    const half   * maskh = (const half   *)  mask + ne11*ic0;

    const int stride_KV  = nb11 / sizeof(half);
    const int stride_KV2 = nb11 / sizeof(half2);

-    constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
+    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
+    constexpr int nwarps = D / WARP_SIZE;
    const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
-    __builtin_assume(tid < nwarps*WARP_SIZE);
+    __builtin_assume(tid < D);

-    __shared__ half KQ[nwarps*WARP_SIZE];
-    KQ[tid] = -INFINITY;
+    __shared__ half KQ[ncols*D];
+#pragma unroll
+    for (int j = 0; j < ncols; ++j) {
+        KQ[j*D + tid] = -HALF_MAX_HALF;
+    }
    half2 * KQ2 = (half2 *) KQ;

-    half kqmax = -HALF_MAX_HALF;
-    half kqsum = 0.0f;
+    half kqmax[ncols];
+#pragma unroll
+    for (int j = 0; j < ncols; ++j) {
+        kqmax[j] = -HALF_MAX_HALF;
+    }
+    half kqsum[ncols] = {0.0f};

-    __shared__ half kqmax_shared[WARP_SIZE];
-    __shared__ half kqsum_shared[WARP_SIZE];
-    if (threadIdx.y == 0) {
-        kqmax_shared[threadIdx.x] = -HALF_MAX_HALF;
-        kqsum_shared[threadIdx.x] = 0.0f;
+    __shared__ half kqmax_shared[ncols][WARP_SIZE];
+    __shared__ half kqsum_shared[ncols][WARP_SIZE];
+#pragma unroll
+    for (int j = 0; j < ncols; ++j) {
+        if (threadIdx.y == 0) {
+            kqmax_shared[j][threadIdx.x] = -HALF_MAX_HALF;
+            kqsum_shared[j][threadIdx.x] = 0.0f;
+        }
    }
    __syncthreads();

    // Convert Q to half2 and store in registers:
-    half2 Q_h2[(D/2 + WARP_SIZE - 1) / WARP_SIZE];
+    half2 Q_h2[ncols][D/(2*WARP_SIZE)];
 #pragma unroll
-    for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
-        const int i = i0 + threadIdx.x;
-        if (i0 + WARP_SIZE > D/2 && i >= D/2) {
-            break;
-        }
+    for (int j = 0; j < ncols; ++j) {
+#pragma unroll
+        for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
+            const int i = i0 + threadIdx.x;

-        Q_h2[i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(Q_f2[i].x, Q_f2[i].y);
+            const float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i];
+            Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
+        }
    }

-    half2 VKQ = make_half2(0.0f, 0.0f); // Each thread calculates a single VKQ value.
+    half2 VKQ[ncols] = {{0.0f, 0.0f}};

-    const int k_start  = parallel_blocks == 1 ? 0 : ip*D;
+    const int k_start = parallel_blocks == 1 ? 0 : ip*D;
    for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
        // Calculate KQ tile and keep track of new maximum KQ values:
-        half kqmax_new = kqmax;
+
+        // For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
+        // see https://github.com/ggerganov/llama.cpp/pull/7061 .
+        // Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
+        half kqmax_new = kqmax[0];
+        half kqmax_new_arr[ncols];
+#pragma unroll
+        for (int j = 0; j < ncols; ++j) {
+            kqmax_new_arr[j] = kqmax[j];
+        }
+
 #pragma unroll
        for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
            const int i_KQ = i_KQ_0 + threadIdx.y;
@ -101,89 +125,112 @@ static __global__ void flash_attn_vec_ext_f16(
                break;
            }

-            half2 sum2 = make_half2(0.0f, 0.0f);
+            half2 sum2[ncols] = {{0.0f, 0.0f}};
 #pragma unroll
            for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
                const int k_KQ = k_KQ_0 + threadIdx.x;
-                if (k_KQ_0 + WARP_SIZE > D/2 && k_KQ >= D/2) {
-                    break;
-                }

                const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
-                sum2 += K_ik * Q_h2[k_KQ_0/WARP_SIZE];
-            }
-
-            sum2 = warp_reduce_sum(sum2);
-            half sum = __low2half(sum2) + __high2half(sum2);
-            sum += mask ? maskh[k_VKQ_0 + i_KQ] : __float2half(0.0f);
-            kqmax_new = ggml_cuda_hmax(kqmax_new, sum);
-            if (threadIdx.x == 0) {
-                KQ[i_KQ] = sum;
-            }
-        }
-
-        kqmax_new = warp_reduce_max(kqmax_new);
-        if (threadIdx.x == 0) {
-            kqmax_shared[threadIdx.y] = kqmax_new;
-        }
-        __syncthreads();
-        kqmax_new = kqmax_shared[threadIdx.x];
-        kqmax_new = warp_reduce_max(kqmax_new);
-
-        const half KQ_max_scale = hexp(kqmax - kqmax_new);
-        kqmax = kqmax_new;
-
-        const half val = hexp(KQ[tid] - kqmax);
-        kqsum = kqsum*KQ_max_scale + val;
-        KQ[tid] = val;
-
-        VKQ *= __half2half2(KQ_max_scale);
-
-        __syncthreads();
-
-        if (tid < D) {
 #pragma unroll
-            for (int k0 = 0; k0 < D; k0 += 2) {
-                if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
-                    break;
+                for (int j = 0; j < ncols; ++j) {
+                    sum2[j] += K_ik * Q_h2[j][k_KQ_0/WARP_SIZE];
+                }
+            }
+
+#pragma unroll
+            for (int j = 0; j < ncols; ++j) {
+                sum2[j] = warp_reduce_sum(sum2[j]);
+                half sum = __low2half(sum2[j]) + __high2half(sum2[j]);
+                sum += mask ? maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
+
+                if (ncols == 1) {
+                    kqmax_new        = ggml_cuda_hmax(kqmax_new,        sum);
+                } else {
+                    kqmax_new_arr[j] = ggml_cuda_hmax(kqmax_new_arr[j], sum);
                }

-                half2 V_k;
-                reinterpret_cast<half&>(V_k.x) = V_h[(k_VKQ_0 + k0 + 0)*stride_KV + tid];
-                reinterpret_cast<half&>(V_k.y) = V_h[(k_VKQ_0 + k0 + 1)*stride_KV + tid];
-                VKQ += V_k*KQ2[k0/2];
+                if (threadIdx.x == 0) {
+                    KQ[j*D + i_KQ] = sum;
+                }
+            }
+        }
+
+#pragma unroll
+        for (int j = 0; j < ncols; ++j) {
+            half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j];
+
+            kqmax_new_j = warp_reduce_max(kqmax_new_j);
+            if (threadIdx.x == 0) {
+                kqmax_shared[j][threadIdx.y] = kqmax_new_j;
+            }
+        }
+
+        __syncthreads();
+
+#pragma unroll
+        for (int j = 0; j < ncols; ++j) {
+            half kqmax_new_j = kqmax_shared[j][threadIdx.x];
+            kqmax_new_j = warp_reduce_max(kqmax_new_j);
+
+            const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j);
+            kqmax[j] = kqmax_new_j;
+
+            const half val = hexp(KQ[j*D + tid] - kqmax[j]);
+            kqsum[j] = kqsum[j]*KQ_max_scale + val;
+            KQ[j*D + tid] = val;
+
+            VKQ[j] *= __half2half2(KQ_max_scale);
+        }
+
+        __syncthreads();
+
+#pragma unroll
+        for (int k0 = 0; k0 < D; k0 += 2) {
+            if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
+                break;
+            }
+
+            half2 V_k;
+            reinterpret_cast<half&>(V_k.x) = V_h[(k_VKQ_0 + k0 + 0)*stride_KV + tid];
+            reinterpret_cast<half&>(V_k.y) = V_h[(k_VKQ_0 + k0 + 1)*stride_KV + tid];
+#pragma unroll
+            for (int j = 0; j < ncols; ++j) {
+                VKQ[j] += V_k*KQ2[j*(D/2) + k0/2];
            }
        }

        __syncthreads();
    }

-    if (tid >= D) {
-        kqsum = 0.0f;
+#pragma unroll
+    for (int j = 0; j < ncols; ++j) {
+        kqsum[j] = warp_reduce_sum(kqsum[j]);
+        if (threadIdx.x == 0) {
+            kqsum_shared[j][threadIdx.y] = kqsum[j];
+        }
    }

-    kqsum = warp_reduce_sum(kqsum);
-    if (threadIdx.x == 0) {
-        kqsum_shared[threadIdx.y] = kqsum;
-    }
    __syncthreads();
-    kqsum = kqsum_shared[threadIdx.x];
-    kqsum = warp_reduce_sum(kqsum);

-    if (tid >= D) {
-        return;
+#pragma unroll
+    for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
+        kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
+        kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
+
+        half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
+        if (parallel_blocks == 1) {
+            dst_val /= kqsum[j_VKQ];
+        }
+        const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
+        dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
    }

-    half dst_val = (__low2half(VKQ) + __high2half(VKQ));
-    if (parallel_blocks == 1) {
-        dst_val /= kqsum;
+    if (parallel_blocks != 1 && tid != 0) {
+#pragma unroll
+        for (int j = 0; j < ncols; ++j) {
+            dst_meta[(ic0 + j)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j], kqsum[j]);
+        }
    }
-    dst[D*gridDim.y*blockIdx.x + D*blockIdx.y + tid] = dst_val;
-
-    if (parallel_blocks == 1 || tid != 0) {
-        return;
-    }
-    dst_meta[ic*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax, kqsum);
 #else
   NO_DEVICE_CODE;
 #endif // FP16_AVAILABLE
@ -191,7 +238,9 @@ static __global__ void flash_attn_vec_ext_f16(

 // D == head size, VKQ_stride == num VKQ rows calculated in parallel:
 template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t>
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(nwarps*WARP_SIZE, 1)
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_ext_f16(
        const char * __restrict__ Q,
        const char * __restrict__ K,
@ -573,7 +622,9 @@ static __global__ void flash_attn_ext_f16(
 }

 template<int D, int parallel_blocks> // D == head size
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(D, 1)
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_combine_results(
        const float  * __restrict__ VKQ_parts,
        const float2 * __restrict__ VKQ_meta,
@ -642,7 +693,7 @@ static_assert(get_VKQ_stride( 80, 1, 16) ==  16, "Test failed.");
 static_assert(get_VKQ_stride( 80, 2, 16) ==  16, "Test failed.");
 static_assert(get_VKQ_stride( 80, 4, 16) ==  16, "Test failed.");

-template <int D, int parallel_blocks> void launch_fattn_vec_f16(
+template <int D, int cols_per_block, int parallel_blocks> void launch_fattn_vec_f16(
        const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
        ggml_cuda_pool & pool, cudaStream_t main_stream
 ) {
@ -656,13 +707,13 @@ template <int D, int parallel_blocks> void launch_fattn_vec_f16(

    constexpr int  nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
    const     dim3 block_dim(WARP_SIZE, nwarps, 1);
-    const     dim3 blocks_num(parallel_blocks*Q->ne[1], Q->ne[2], Q->ne[3]);
+    const     dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]);
    const     int  shmem = 0;

    float scale;
    memcpy(&scale, KQV->op_params, sizeof(float));

-    flash_attn_vec_ext_f16<D, parallel_blocks>
+    flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>
        <<<blocks_num, block_dim, shmem, main_stream>>> (
                (const char *) Q->data,
                (const char *) K->data,
@ -783,10 +834,99 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst

    ggml_cuda_set_device(ctx.device);

+    const int cc  = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
    const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;

    const int32_t precision = KQV->op_params[1];

+    if (!fp16_mma_available(cc)) {
+        GGML_ASSERT(precision == GGML_PREC_DEFAULT);
+        GGML_ASSERT(Q->ne[0] == 64 || Q->ne[0] == 128 && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
+
+        if (Q->ne[1] == 1) {
+            constexpr int cols_per_block = 1;
+            constexpr int parallel_blocks = 4;
+            switch (Q->ne[0]) {
+                case 64:
+                    launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                    break;
+                case 128:
+                    launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                    break;
+                default:
+                    GGML_ASSERT(false);
+                    break;
+            }
+            return;
+        }
+
+        if (Q->ne[1] == 2) {
+            constexpr int cols_per_block = 2;
+            constexpr int parallel_blocks = 4;
+            switch (Q->ne[0]) {
+                case 64:
+                    launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                    break;
+                case 128:
+                    launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                    break;
+                default:
+                    GGML_ASSERT(false);
+                    break;
+            }
+            return;
+        }
+
+        if (Q->ne[1] <= 4) {
+            constexpr int cols_per_block = 4;
+            constexpr int parallel_blocks = 4;
+            switch (Q->ne[0]) {
+                case 64:
+                    launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                    break;
+                case 128:
+                    launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                    break;
+                default:
+                    GGML_ASSERT(false);
+                    break;
+            }
+            return;
+        }
+
+        if (Q->ne[1] <= 8) {
+            constexpr int cols_per_block = 8;
+            constexpr int parallel_blocks = 4;
+            switch (Q->ne[0]) {
+                case 64:
+                    launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                    break;
+                case 128:
+                    launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                    break;
+                default:
+                    GGML_ASSERT(false);
+                    break;
+            }
+            return;
+        }
+
+        constexpr int cols_per_block = 8;
+        constexpr int parallel_blocks = 1;
+        switch (Q->ne[0]) {
+            case 64:
+                launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                break;
+            case 128:
+                launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                break;
+            default:
+                GGML_ASSERT(false);
+                break;
+        }
+        return;
+    }
+
    if (precision != GGML_PREC_DEFAULT) {
        if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
            constexpr int cols_per_block = 16;
@ -845,16 +985,17 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
    }

    if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) {
+        constexpr int cols_per_block = 1;
        constexpr int parallel_blocks = 4;
        switch (Q->ne[0]) {
            case 64:
-                launch_fattn_vec_f16< 64, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                break;
            case 128:
-                launch_fattn_vec_f16<128, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                break;
            case 256:
-                launch_fattn_vec_f16<256, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                launch_fattn_vec_f16<256, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                break;
            default:
                GGML_ASSERT(false);
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@ -2128,6 +2128,7 @@ static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_
    if (alignment == (cl_uint)-1) {
        ggml_cl_init();
        clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL);
+        alignment /= 8; // bits to bytes
    }
    return alignment;

--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@ -8330,24 +8330,26 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
    const int blocks_per_row = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;

-// partial sum for each thread
+    const int qi_vdr = (qi / vdr); // N_threads processing 1 qk block
+
+    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

-    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
+    for (int i = item_ct1.get_local_id(2) / qi_vdr; i < blocks_per_row;
         i += blocks_per_warp) {
-        const int ibx = row*blocks_per_row + i; // x block index
+      const int ibx = row * blocks_per_row + i; // x block index

-        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
+      const int iby = i * (qk / QK8_1); // y block index that aligns with ibx

-        const int iqs =
-            vdr *
-            (item_ct1.get_local_id(2) %
-             (qi / vdr)); // x block quant index when casting the quants to int
+      const int iqs =
+          vdr *
+          (item_ct1.get_local_id(2) -
+           i * qi_vdr); // x block quant index when casting the quants to int

-        tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
+      tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
    }

    // sum up partial sums and write back result
--- a/ggml-vulkan-shaders.hpp
+++ b/ggml-vulkan-shaders.hpp
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
--- a/ggml_vk_generate_shaders.py
+++ b/ggml_vk_generate_shaders.py
--- a/gguf-py/scripts/gguf-new-metadata.py
+++ b/gguf-py/scripts/gguf-new-metadata.py
@ -7,7 +7,8 @@ import json
 from pathlib import Path

 import numpy as np
-from typing import Any, Sequence
+from tqdm import tqdm
+from typing import Any, Sequence, NamedTuple

 # Necessary to load the local gguf package
 if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
@ -18,6 +19,12 @@ import gguf
 logger = logging.getLogger("gguf-new-metadata")


+class MetadataDetails(NamedTuple):
+    type: gguf.GGUFValueType
+    value: Any
+    description: str = ''
+
+
 def get_byteorder(reader: gguf.GGUFReader) -> gguf.GGUFEndian:
    if np.uint32(1) == np.uint32(1).newbyteorder("<"):
        # Host is little endian
@ -59,7 +66,16 @@ def get_field_data(reader: gguf.GGUFReader, key: str) -> Any:
    return decode_field(field)


-def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new_metadata: dict[str, str], remove_metadata: Sequence[str]) -> None:
+def find_token(token_list: Sequence[int], token: str) -> Sequence[int]:
+    token_ids = [index for index, value in enumerate(token_list) if value == token]
+
+    if len(token_ids) == 0:
+        raise LookupError(f'Unable to find "{token}" in token list!')
+
+    return token_ids
+
+
+def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new_metadata: dict[str, MetadataDetails], remove_metadata: Sequence[str]) -> None:
    for field in reader.fields.values():
        # Suppress virtual fields and fields written by GGUFWriter
        if field.name == gguf.Keys.General.ARCHITECTURE or field.name.startswith('GGUF.'):
@ -75,54 +91,64 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new
            logger.debug(f'Removing {field.name}')
            continue

-        old_val = decode_field(field)
+        old_val = MetadataDetails(field.types[0], decode_field(field))
        val = new_metadata.get(field.name, old_val)

        if field.name in new_metadata:
-            logger.debug(f'Modifying {field.name}: "{old_val}" -> "{val}"')
+            logger.debug(f'Modifying {field.name}: "{old_val.value}" -> "{val.value}" {val.description}')
            del new_metadata[field.name]
-        elif val is not None:
+        elif val.value is not None:
            logger.debug(f'Copying {field.name}')

-        if val is not None:
+        if val.value is not None:
            writer.add_key(field.name)
-            writer.add_val(val, field.types[0])
+            writer.add_val(val.value, val.type)

    if gguf.Keys.Tokenizer.CHAT_TEMPLATE in new_metadata:
        logger.debug('Adding chat template(s)')
-        writer.add_chat_template(new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE])
+        writer.add_chat_template(new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE].value)
        del new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE]

-    # TODO: Support other types than string?
    for key, val in new_metadata.items():
-        logger.debug(f'Adding {key}: {val}')
+        logger.debug(f'Adding {key}: "{val.value}" {val.description}')
        writer.add_key(key)
-        writer.add_val(val, gguf.GGUFValueType.STRING)
+        writer.add_val(val.value, val.type)
+
+    total_bytes = 0

    for tensor in reader.tensors:
+        total_bytes += tensor.n_bytes
        # Dimensions are written in reverse order, so flip them first
        shape = np.flipud(tensor.shape).tolist()
        writer.add_tensor_info(tensor.name, shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)

+    bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
+
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_ti_data_to_file()

    for tensor in reader.tensors:
        writer.write_tensor_data(tensor.data)
+        bar.update(tensor.n_bytes)

    writer.close()


 def main() -> None:
+    tokenizer_metadata = (getattr(gguf.Keys.Tokenizer, n) for n in gguf.Keys.Tokenizer.__dict__.keys() if not n.startswith('_'))
+    token_names = dict((n.split('.')[-1][:-len('_token_id')], n) for n in tokenizer_metadata if n.endswith('_token_id'))
+
    parser = argparse.ArgumentParser(description="Make a copy of a GGUF file with new metadata")
    parser.add_argument("input",                                       type=Path, help="GGUF format model input filename")
    parser.add_argument("output",                                      type=Path, help="GGUF format model output filename")
-    parser.add_argument("--general-name",                              type=str,  help="The models general.name")
-    parser.add_argument("--general-description",                       type=str,  help="The models general.description")
-    parser.add_argument("--chat-template",                             type=str,  help="Chat template string (or JSON string containing templates)")
-    parser.add_argument("--chat-template-config",                      type=Path, help="Config file (tokenizer_config.json) containing chat template(s)")
-    parser.add_argument("--remove-metadata",      action="append",     type=str,  help="Remove metadata (by key name) from output model")
+    parser.add_argument("--general-name",                              type=str,  help="The models general.name", metavar='"name"')
+    parser.add_argument("--general-description",                       type=str,  help="The models general.description", metavar='"Description ..."')
+    parser.add_argument("--chat-template",                             type=str,  help="Chat template string (or JSON string containing templates)", metavar='"{% ... %} ..."')
+    parser.add_argument("--chat-template-config",                      type=Path, help="Config file containing chat template(s)", metavar='tokenizer_config.json')
+    parser.add_argument("--remove-metadata",      action="append",     type=str,  help="Remove metadata (by key name) from output model", metavar='general.url')
+    parser.add_argument("--special-token",        action="append",     type=str,  help="Special token by value", nargs=2, metavar=(' | '.join(token_names.keys()), '"<token>"'))
+    parser.add_argument("--special-token-by-id",  action="append",     type=str,  help="Special token by id", nargs=2, metavar=(' | '.join(token_names.keys()), '0'))
    parser.add_argument("--force",                action="store_true",            help="Bypass warnings without confirmation")
    parser.add_argument("--verbose",              action="store_true",            help="Increase output verbosity")
    args = parser.parse_args(None if len(sys.argv) > 2 else ["--help"])
@ -133,20 +159,20 @@ def main() -> None:
    remove_metadata = args.remove_metadata or []

    if args.general_name:
-        new_metadata[gguf.Keys.General.NAME] = args.general_name
+        new_metadata[gguf.Keys.General.NAME] = MetadataDetails(gguf.GGUFValueType.STRING, args.general_name)

    if args.general_description:
-        new_metadata[gguf.Keys.General.DESCRIPTION] = args.general_description
+        new_metadata[gguf.Keys.General.DESCRIPTION] = MetadataDetails(gguf.GGUFValueType.STRING, args.general_description)

    if args.chat_template:
-        new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = json.loads(args.chat_template) if args.chat_template.startswith('[') else args.chat_template
+        new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, json.loads(args.chat_template) if args.chat_template.startswith('[') else args.chat_template)

    if args.chat_template_config:
        with open(args.chat_template_config, 'r') as fp:
            config = json.load(fp)
            template = config.get('chat_template')
            if template:
-                new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = template
+                new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, template)

    if remove_metadata:
        logger.warning('*** Warning *** Warning *** Warning **')
@ -166,6 +192,32 @@ def main() -> None:
    arch = get_field_data(reader, gguf.Keys.General.ARCHITECTURE)
    endianess = get_byteorder(reader)

+    token_list = get_field_data(reader, gguf.Keys.Tokenizer.LIST) or []
+
+    for name, token in args.special_token or []:
+        if name not in token_names:
+            logger.warning(f'Unknown special token "{name}", ignoring...')
+        else:
+            ids = find_token(token_list, token)
+            new_metadata[token_names[name]] = MetadataDetails(gguf.GGUFValueType.UINT32, ids[0], f'= {token}')
+
+            if len(ids) > 1:
+                logger.warning(f'Multiple "{token}" tokens found, choosing ID {ids[0]}, use --special-token-by-id if you want another:')
+                logger.warning(', '.join(str(i) for i in ids))
+
+    for name, id_string in args.special_token_by_id or []:
+        if name not in token_names:
+            logger.warning(f'Unknown special token "{name}", ignoring...')
+        elif not id_string.isdecimal():
+            raise LookupError(f'Token ID "{id_string}" is not a valid ID!')
+        else:
+            id_int = int(id_string)
+
+            if id_int >= 0 and id_int < len(token_list):
+                new_metadata[token_names[name]] = MetadataDetails(gguf.GGUFValueType.UINT32, id_int, f'= {token_list[id_int]}')
+            else:
+                raise LookupError(f'Token ID {id_int} is not within token list!')
+
    if os.path.isfile(args.output) and not args.force:
        logger.warning('*** Warning *** Warning *** Warning **')
        logger.warning(f'* The "{args.output}" GGUF file already exists, it will be overwritten!')
--- a/llama.cpp
+++ b/llama.cpp
@ -12779,7 +12779,7 @@ struct llm_tokenizer_wpm {
                continue;
            }
            code = unicode_tolower(code);
-            if (type == CODEPOINT_TYPE_WHITESPACE) {
+            if (type == CODEPOINT_TYPE_SEPARATOR) {
                code = ' ';
            }
            std::string s = unicode_cpt_to_utf8(code);
@ -15824,13 +15824,6 @@ struct llama_context * llama_new_context_with_model(
        cparams.flash_attn = false;
    }

-#ifdef GGML_USE_HIPBLAS
-    if (cparams.flash_attn) {
-        LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with HIPBLAS builds - forcing off\n", __func__);
-        cparams.flash_attn = false;
-    }
-#endif
-
    if (params.seed == LLAMA_DEFAULT_SEED) {
        params.seed = time(NULL);
    }
@ -18199,7 +18192,7 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
        /*.t_eval_ms   =*/ 1e-3 * ctx->t_eval_us,

        /*.n_sample =*/ std::max(1, ctx->n_sample),
-        /*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
+        /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
        /*.n_eval   =*/ std::max(1, ctx->n_eval),
    };

--- a/scripts/gen-unicode-data.py
+++ b/scripts/gen-unicode-data.py
@ -1,31 +1,14 @@
 import regex


-def cpt_to_utf8_str(cpt):
-    if cpt <= 0xFF:
-        return bytes([cpt, 0, 0, 0])
-    elif cpt <= 0xFFFF:
-        return bytes([cpt & 0xFF, cpt >> 8, 0, 0])
-    elif cpt <= 0xFFFFFF:
-        return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, 0])
-    else:
-        return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, cpt >> 24])
-
-
-def is_match(codepoint, regex_expr):
-    try:
-        res = regex.match(regex_expr, cpt_to_utf8_str(codepoint).decode('utf-32'))
-        return res is not None
-    except Exception:
-        return False
-
-
 def get_matches(regex_expr):
+    regex_expr_compiled = regex.compile(regex_expr)
    unicode_ranges = []
    current_range = None

    for codepoint in range(0x110000):
-        if is_match(codepoint, regex_expr):
+        char = chr(codepoint)
+        if regex_expr_compiled.match(char):
            if current_range is None:
                current_range = [codepoint, codepoint]
            else:
@ -40,27 +23,42 @@ def get_matches(regex_expr):
    return unicode_ranges


-def print_cat(cat, ranges):
-    print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat)) # noqa: NP100
-    cnt = 0
-    for start, end in ranges:
-        if cnt % 4 != 0:
-            print(" ", end="") # noqa: NP100
-        print("{{0x{:08X}, 0x{:08X}}},".format(start, end), end="") # noqa: NP100
-        if cnt % 4 == 3:
-            print("") # noqa: NP100
-        cnt += 1
-
-    if cnt % 4 != 0:
-        print("") # noqa: NP100
+def print_cat(mode, cat, ranges):
+    if mode == "range":
+        print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat)) # noqa: NP100
+    if mode == "map":
+        print("const std::map<uint32_t, uint32_t> unicode_map_{} = {{".format(cat)) # noqa: NP100
+    for i, values in enumerate(ranges):
+        end = ",\n" if (i % 4 == 3 or i + 1 == len(ranges)) else ", "
+        values = ["0x%08X" % value for value in values]
+        print("{" + ", ".join(values) + "}", end=end) # noqa: NP100
    print("};") # noqa: NP100
    print("") # noqa: NP100


-print_cat("number",      get_matches(r'\p{N}'))
-print_cat("letter",      get_matches(r'\p{L}'))
-print_cat("whitespace",  get_matches(r'\p{Z}'))
-print_cat("accent_mark", get_matches(r'\p{M}'))
-print_cat("punctuation", get_matches(r'\p{P}'))
-print_cat("symbol",      get_matches(r'\p{S}'))
-print_cat("control",     get_matches(r'\p{C}'))
+print_cat("range", "number",      get_matches(r'\p{N}'))
+print_cat("range", "letter",      get_matches(r'\p{L}'))
+print_cat("range", "separator",   get_matches(r'\p{Z}'))
+print_cat("range", "accent_mark", get_matches(r'\p{M}'))
+print_cat("range", "punctuation", get_matches(r'\p{P}'))
+print_cat("range", "symbol",      get_matches(r'\p{S}'))
+print_cat("range", "control",     get_matches(r'\p{C}'))
+
+print_cat("range", "whitespace",  get_matches(r'\s'))
+
+
+map_lowercase = []
+map_uppercase = []
+for codepoint in range(0x110000):
+    char = chr(codepoint)
+    lower = ord(char.lower()[0])
+    upper = ord(char.upper()[0])
+    if codepoint != lower:
+        map_lowercase.append((codepoint, lower))
+    if codepoint != upper:
+        map_uppercase.append((codepoint, upper))
+print_cat("map", "lowercase", map_lowercase)
+print_cat("map", "uppercase", map_uppercase)
+
+
+# TODO: generate unicode_map_nfd
--- a/tests/test-tokenizer-random.py
+++ b/tests/test-tokenizer-random.py
@ -0,0 +1,295 @@
+# Test libllama tokenizer == AutoTokenizer.
+# Brute force random tokens/text generation.
+#
+# Sample usage:
+#
+#   python3 tests/test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe
+#
+
+import time
+import logging
+import argparse
+import subprocess
+import random
+
+from typing import Iterator
+
+import cffi
+from transformers import AutoTokenizer, PreTrainedTokenizerBase
+
+logger = logging.getLogger("test-tokenizer-random-bpe")
+
+
+class LibLlama:
+
+    DEFAULT_PATH_LLAMA_H = "./llama.h"
+    DEFAULT_PATH_LIBLLAMA = "./build/libllama.so"  # CMakeLists.txt: BUILD_SHARED_LIBS ON
+
+    def __init__(self, path_llama_h: str = None, path_libllama: str = None):
+        path_llama_h = path_llama_h or self.DEFAULT_PATH_LLAMA_H
+        path_libllama = path_libllama or self.DEFAULT_PATH_LIBLLAMA
+        (self.ffi, self.lib) = self._load_libllama_cffi(path_llama_h, path_libllama)
+        self.lib.llama_backend_init()
+
+    def _load_libllama_cffi(self, path_llama_h: str, path_libllama: str):
+        cmd = ["gcc", "-E", "-P", "-D__restrict=", "-D__attribute__(x)=", "-D__asm__(x)=", path_llama_h]
+        res = subprocess.run(cmd, stdout=subprocess.PIPE)
+        assert (res.returncode == 0)
+        source = res.stdout.decode()
+        ffi = cffi.FFI()
+        if True:  # workarounds for pycparser
+            source = "typedef struct { } __builtin_va_list;" + "\n" + source
+            source = source.replace("sizeof (int)",    str(ffi.sizeof("int")))
+            source = source.replace("sizeof (void *)", str(ffi.sizeof("void*")))
+            source = source.replace("sizeof (size_t)", str(ffi.sizeof("size_t")))
+            source = source.replace("sizeof(int32_t)", str(ffi.sizeof("int32_t")))
+        ffi.cdef(source, override=True)
+        lib = ffi.dlopen(path_libllama)
+        return (ffi, lib)
+
+    def model_default_params(self, **kwargs):
+        mparams = self.lib.llama_model_default_params()
+        for k, v in kwargs.items():
+            setattr(mparams, k, v)
+        return mparams
+
+    def context_default_params(self, **kwargs):
+        cparams = self.lib.llama_context_default_params()
+        for k, v in kwargs.items():
+            setattr(cparams, k, v)
+        return cparams
+
+
+class LibLlamaModel:
+
+    def __init__(self, libllama: LibLlama, path_model: str, mparams={}, cparams={}):
+        self.lib = libllama.lib
+        self.ffi = libllama.ffi
+        if isinstance(mparams, dict):
+            mparams = libllama.model_default_params(**mparams)
+        self.model = self.lib.llama_load_model_from_file(path_model.encode(), mparams)
+        if not self.model:
+            raise RuntimeError("error: failed to load model '%s'" % path_model)
+        if isinstance(cparams, dict):
+            cparams = libllama.context_default_params(**cparams)
+        self.ctx = self.lib.llama_new_context_with_model(self.model, cparams)
+        if not self.ctx:
+            raise RuntimeError("error: failed to create context for model '%s'" % path_model)
+        n_tokens_max = self.lib.llama_n_ctx(self.ctx)
+        self.token_ids = self.ffi.new("llama_token[]", n_tokens_max)
+
+    def free(self):
+        if self.ctx:
+            self.lib.llama_free(self.ctx)
+        if self.model:
+            self.lib.llama_free_model(self.model)
+        self.ctx = None
+        self.model = None
+        self.lib = None
+
+    def tokenize(self, text: str, n_tokens_max: int = 0, add_special: bool = False, parse_special: bool = False) -> list[int]:
+        n_tokens_max = n_tokens_max if n_tokens_max > 0 else len(self.token_ids)
+        text = text.encode("utf-8")
+        num = self.lib.llama_tokenize(self.model, text, len(text), self.token_ids, n_tokens_max, add_special, parse_special)
+        if num < 0:
+            return []
+        return list(self.token_ids[0:num])
+
+
+def generator_custom_text() -> Iterator[str]:
+    """General tests"""
+    yield from [
+        "",
+        " ",
+        "  ",
+        "   ",
+        "\t",
+        "\n",
+        "\n\n",
+        "\n\n\n",
+        "\t\n",
+        "Hello world",
+        " Hello world",
+        "Hello World",
+        " Hello World",
+        " Hello World!",
+        "Hello, world!",
+        " Hello, world!",
+        " this is 🦙.cpp",
+        "w048 7tuijk dsdfhu",
+        "нещо на Български",
+        "កាន់តែពិសេសអាចខលចេញ",
+        "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
+        "Hello",
+        " Hello",
+        "  Hello",
+        "   Hello",
+        "    Hello",
+        "    Hello\n    Hello",
+        " (",
+        "\n =",
+        "' era",
+        "Hello, y'all! How are you 😁 ?我想在apple工作1314151天～",
+        "3",
+        "33",
+        "333",
+        "3333",
+        "33333",
+        "333333",
+        "3333333",
+        "33333333",
+        "333333333",
+    ]
+
+
+def generator_custom_text_edge_cases() -> Iterator[str]:
+    """Edge cases found while debugging"""
+    yield from [
+        '\x1f-a',   # unicode_ranges_control, {0x00001C, 0x00001F}
+        '¼-a',      # unicode_ranges_digit, 0x00BC
+        '½-a',      # unicode_ranges_digit, 0x00BD
+        '¾-a',      # unicode_ranges_digit, 0x00BE
+        'a 〇b',    # unicode_ranges_digit, 0x3007
+        'Ⅵ-a',     # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
+        '\uFEFF//', # unicode_ranges_control, 0xFEFF (BOM)
+        '<s>a'      # TODO: Phi-3 fail
+    ]
+
+
+def generator_random_chars(iterations = 100) -> Iterator[str]:
+    """Brute force random text with simple characters"""
+
+    WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
+    CHARS = list(set("""
+        ABCDEFGHIJKLMNOPQRSTUVWXYZ
+        abcdefghijklmnopqrstuvwxyz
+        ÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÄËÏÖÜ
+        áéíóúàèìòùâêîôûäëïöü
+        .-,*/-+ª!"·$%&/()=?¿[]{}<>\\|@#~½¬~;:_
+    """))
+
+    rand = random.Random()
+    for m in range(iterations):
+        rand.seed(m)
+        text = []
+        num_words = rand.randint(300, 400)
+        for i in range(num_words):
+            k = rand.randint(1, 7)
+            word = rand.choices(CHARS, k=k)
+            space = rand.choice(WHITESPACES)
+            text.append("".join(word) + space)
+        yield "".join(text)
+
+
+def generator_random_vocab_chars(tokenizer: PreTrainedTokenizerBase, iterations = 100) -> Iterator[str]:
+    """Brute force random text with vocab characters"""
+
+    vocab_ids = list(tokenizer.vocab.values())
+    vocab_text = tokenizer.decode(vocab_ids, skip_special_tokens=True)
+    vocab_chars = list(set(vocab_text))
+    del vocab_ids, vocab_text
+
+    rand = random.Random()
+    for m in range(iterations):
+        rand.seed(m)
+        text = rand.choices(vocab_chars, k=1024)
+        yield "".join(text)
+
+
+def generator_random_vocab_tokens(tokenizer: PreTrainedTokenizerBase, iterations = 100) -> Iterator[str]:
+    """Brute force random text from vocab tokens"""
+
+    space_id = tokenizer.encode(" ", add_special_tokens=False)[0]
+    vocab_ids = list(tokenizer.vocab.values())
+    vocab_ids = list(sorted(vocab_ids + vocab_ids))
+    for i in range(1, len(vocab_ids), 2):
+        vocab_ids[i] = space_id
+    vocab_tokens = tokenizer.decode(vocab_ids, skip_special_tokens=True)
+    vocab_tokens = vocab_tokens.split(" ")
+    del vocab_ids
+
+    yield from vocab_tokens
+
+    rand = random.Random()
+    for m in range(iterations):
+        rand.seed(m)
+        text = []
+        num_words = rand.randint(300, 400)
+        for i in range(num_words):
+            k = rand.randint(1, 3)
+            tokens = rand.choices(vocab_tokens, k=k)
+            tokens = [t.strip(" \n\r\t") for t in tokens]
+            sep = rand.choice("     \n\r\t")
+            text.append("".join(tokens) + sep)
+        yield "".join(text)
+
+
+def generator_random_bytes(iterations = 100) -> Iterator[str]:
+    """Brute force random bytes"""
+
+    WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
+
+    rand = random.Random()
+    for m in range(iterations):
+        rand.seed(m)
+        text = []
+        num_words = rand.randint(300, 400)
+        for i in range(num_words):
+            k = rand.randint(1, 8)
+            word = [chr(r) for r in rand.randbytes(k) if r]
+            word.append(rand.choice(WHITESPACES))
+            text.append("".join(word))
+        yield "".join(text)
+
+
+def test_compare_tokenizer(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase, generator: Iterator[str]):
+
+    def find_first_mismatch(ids1: list[int], ids2: list[int]):
+        for i, (a,b) in enumerate(zip(ids1, ids2)):
+            if a != b:
+                return i
+        if len(ids1) == len(ids2):
+            return -1
+        return min(len(ids1), len(ids2))
+
+    t0 = time.perf_counter()
+    logger.info("%s: %s" % (generator.__name__, "ini"))
+    for text in generator:
+        ids1 = model.tokenize(text, add_special=False, parse_special=False)
+        ids2 = tokenizer.encode(text, add_special_tokens=False)
+        if ids1 != ids2:
+            i = find_first_mismatch(ids1, ids2)
+            ids1 = list(ids1)[max(0, i - 2) : i + 2 + 1]
+            ids2 = list(ids2)[max(0, i - 2) : i + 2 + 1]
+            text2 = tokenizer.decode(ids2, skip_special_tokens=True)
+            assert (text2 in text)
+            logger.info(" Text:     " + repr(text2))
+            logger.info(" TokenIDs: " + str(ids1))
+            logger.info(" Expected: " + str(ids2))
+            raise Exception()
+    t1 = time.perf_counter()
+    logger.info("%s: end, time: %.3f secs" % (generator.__name__, t1 - t0))
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("vocab_file", help="path to vocab 'gguf' file")
+    parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
+    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
+    args = parser.parse_args()
+
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+
+    model = LibLlamaModel(LibLlama(), args.vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=2048))
+
+    tokenizer = AutoTokenizer.from_pretrained(args.dir_tokenizer)
+
+    test_compare_tokenizer(model, tokenizer, generator_custom_text())
+    test_compare_tokenizer(model, tokenizer, generator_custom_text_edge_cases())
+    test_compare_tokenizer(model, tokenizer, generator_random_chars(10_000))
+    test_compare_tokenizer(model, tokenizer, generator_random_vocab_chars(tokenizer, 10_000))
+    test_compare_tokenizer(model, tokenizer, generator_random_vocab_tokens(tokenizer, 10_000))
+    # test_compare_tokenizer(model, tokenizer, generator_random_bytes(10_000)) # FAIL
+
+    model.free()
--- a/unicode-data.cpp
+++ b/unicode-data.cpp
--- a/unicode-data.h
+++ b/unicode-data.h
@ -7,6 +7,7 @@

 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_number;
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
+extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_separator;
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark;
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation;
--- a/unicode.cpp
+++ b/unicode.cpp
@ -9,6 +9,7 @@
 #include <stdexcept>
 #include <string>
 #include <unordered_map>
+#include <unordered_set>
 #include <utility>
 #include <vector>
 #include <locale>
@ -111,27 +112,27 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset)
 static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
    std::unordered_map<uint32_t, int> cpt_types;
    for (auto p : unicode_ranges_number) {
-        for (auto i = p.first; i <= p.second; ++ i) {
+        for (auto i = p.first; i <= p.second; ++i) {
            cpt_types[i] = CODEPOINT_TYPE_NUMBER;
        }
    }
    for (auto p : unicode_ranges_letter) {
-        for (auto i = p.first; i <= p.second; ++ i) {
+        for (auto i = p.first; i <= p.second; ++i) {
            cpt_types[i] = CODEPOINT_TYPE_LETTER;
        }
    }
-    for (auto p : unicode_ranges_whitespace) {
-        for (auto i = p.first; i <= p.second; ++ i) {
-            cpt_types[i] = CODEPOINT_TYPE_WHITESPACE;
+    for (auto p : unicode_ranges_separator) {
+        for (auto i = p.first; i <= p.second; ++i) {
+            cpt_types[i] = CODEPOINT_TYPE_SEPARATOR;
        }
    }
    for (auto p : unicode_ranges_accent_mark) {
-        for (auto i = p.first; i <= p.second; ++ i) {
+        for (auto i = p.first; i <= p.second; ++i) {
            cpt_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
        }
    }
    for (auto p : unicode_ranges_punctuation) {
-        for (auto i = p.first; i <= p.second; ++ i) {
+        for (auto i = p.first; i <= p.second; ++i) {
            cpt_types[i] = CODEPOINT_TYPE_PUNCTUATION;
        }
    }
@ -141,7 +142,7 @@ static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
        }
    }
    for (auto p : unicode_ranges_control) {
-        for (auto i = p.first; i <= p.second; ++ i) {
+        for (auto i = p.first; i <= p.second; ++i) {
            cpt_types[i] = CODEPOINT_TYPE_CONTROL;
        }
    }
@ -224,138 +225,256 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
    std::vector<size_t> bpe_offsets; // store the offset of each word
    bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size

+    const auto cpts = unicode_cpts_from_utf8(text);
+
    size_t start = 0;
+    for (auto offset : offsets) {
+        const size_t offset_ini = start;
+        const size_t offset_end = start + offset;
+        assert(offset_end <= cpts.size());
+        start = offset_end;
+
+        auto _get_cpt = [&] (const size_t pos) -> char32_t {
+            return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
+        };
+
+        auto _get_cpt_type = [&] (const size_t pos) -> int {
+            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
+        };
+
+        size_t _prev_end = offset_ini;
+        auto _add_token = [&] (const size_t end) -> size_t {
+            assert(_prev_end <= end && end <= offset_end);
+            size_t len = end - _prev_end;
+            if (len > 0) {
+                bpe_offsets.push_back(len);
+            }
+            _prev_end = end;
+            //if (len > 0) {
+            //    std::string s = "";
+            //    for(size_t p = end-len; p < end; p++)
+            //        s += unicode_cpt_to_utf8(cpts[p]);
+            //    printf(">>> '%s'\n", s.c_str());
+            //}
+            return len;
+        };
+
+        for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
+            const char32_t cpt = _get_cpt(pos);
+            const int cpt_type = _get_cpt_type(pos);
+
+            // regex: 's|'t|'re|'ve|'m|'ll|'d
+            if (cpt == '\'' && pos+1 < offset_end) {
+                char32_t cpt_next = _get_cpt(pos+1);
+                if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
+                    pos += _add_token(pos+2);
+                    continue;
+                }
+                if (pos+2 < offset_end) {
+                    char32_t cpt_next_next = _get_cpt(pos+2);
+                    if ((cpt_next == 'r' && cpt_next_next == 'e') ||
+                        (cpt_next == 'v' && cpt_next_next == 'e') ||
+                        (cpt_next == 'l' && cpt_next_next == 'l')) {
+                        pos += _add_token(pos+3);
+                        continue;
+                    }
+                }
+            }
+
+            char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
+            int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
+            // regex: <space>?\p{L}+
+            if (cpt2_type == CODEPOINT_TYPE_LETTER) {
+                pos += (cpt == ' ');
+                while (cpt2_type == CODEPOINT_TYPE_LETTER) {
+                    cpt2_type = _get_cpt_type(++pos);
+                }
+                _add_token(pos);
+                continue;
+            }
+            // regex: <space>?\p{N}+
+            if (cpt2_type == CODEPOINT_TYPE_NUMBER) {
+                pos += (cpt == ' ');
+                while (cpt2_type == CODEPOINT_TYPE_NUMBER) {
+                    cpt2_type = _get_cpt_type(++pos);
+                }
+                _add_token(pos);
+                continue;
+            }
+            // regex: <space>?[^\s\p{L}\p{N}]+
+            if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
+                pos += (cpt == ' ');
+                while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
+                    cpt2_type = _get_cpt_type(++pos);
+                    cpt2 = _get_cpt(pos);
+                }
+                _add_token(pos);
+                continue;
+            }
+
+            size_t num_whitespaces = 0;
+            while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
+                num_whitespaces++;
+            }
+
+            // regex: \s+(?!\S)
+            if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
+                pos += num_whitespaces - 1;
+                _add_token(pos);
+                continue;
+            }
+
+            // regex: \s+
+            if (num_whitespaces > 0) {
+                pos += num_whitespaces;
+                _add_token(pos);
+                continue;
+            }
+
+            // no matches
+            _add_token(++pos);
+        }
+    }
+
+    return bpe_offsets;
+}
+
+// LLAMA3 system regex: "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
+static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string & text, const std::vector<size_t> & offsets) {
+    std::vector<size_t> bpe_offsets; // store the offset of each word
+    bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size

    const auto cpts = unicode_cpts_from_utf8(text);

+    size_t start = 0;
    for (auto offset : offsets) {
-        std::string token;
+        const size_t offset_ini = start;
+        const size_t offset_end = start + offset;
+        assert(offset_end <= cpts.size());
+        start = offset_end;

-        bool collecting_numeric = false;
-        bool collecting_letter = false;
-        bool collecting_special = false;
-        bool collecting_whitespace_lookahead = false;
-        bool collecting = false;
+        auto _get_cpt = [&] (const size_t pos) -> char32_t {
+            return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
+        };

-        std::vector<std::string> text_utf;
-        text_utf.reserve(offset);
+        auto _get_cpt_type = [&] (const size_t pos) -> int {
+            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
+        };

-        for (size_t i = start; i < start + offset; ++i) {
-            text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
-        }
+        size_t _prev_end = offset_ini;
+        auto _add_token = [&] (const size_t end) -> size_t {
+            assert(_prev_end <= end && end <= offset_end);
+            size_t len = end - _prev_end;
+            if (len > 0) {
+                bpe_offsets.push_back(len);
+            }
+            _prev_end = end;
+            //if (len > 0) {
+            //    std::string s = "";
+            //    for(size_t p = end-len; p < end; p++)
+            //        s += unicode_cpt_to_utf8(cpts[p]);
+            //    printf(">>> '%s'\n", s.c_str());
+            //}
+            return len;
+        };

-        for (int i = 0; i < (int)text_utf.size(); i++) {
-            const std::string & utf_char = text_utf[i];
-            bool split_condition = false;
-            int bytes_remain = text_utf.size() - i;
+        for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
+            const char32_t cpt = _get_cpt(pos);
+            const int cpt_type = _get_cpt_type(pos);

-            // forward backward lookups
-            const std::string & utf_char_next      = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
-            const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
-
-            // handling contractions
-            if (!split_condition && bytes_remain >= 2) {
-                // 's|'t|'m|'d
-                if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
-                    split_condition = true;
-                }
-                if (split_condition) {
-                    if (token.size()) {
-                        bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
-                    }
-                    token = utf_char + utf_char_next;
-                    bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
-                    token = "";
-                    i++;
+            // regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
+            if (cpt == '\'' && pos+1 < offset_end) {
+                char32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
+                if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
+                    pos += _add_token(pos+2);
                    continue;
                }
-            }
-            if (!split_condition && bytes_remain >= 3) {
-                // 're|'ve|'ll
-                if (utf_char == "\'" && (
-                    (utf_char_next == "r" && utf_char_next_next == "e") ||
-                    (utf_char_next == "v" && utf_char_next_next == "e") ||
-                    (utf_char_next == "l" && utf_char_next_next == "l"))
-                    ) {
-                    split_condition = true;
-                }
-                if (split_condition) {
-                    // current token + next token can be defined
-                    if (token.size()) {
-                        bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
+                if (pos+2 < offset_end) {
+                    char32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
+                    if ((cpt_next == 'r' && cpt_next_next == 'e') ||
+                        (cpt_next == 'v' && cpt_next_next == 'e') ||
+                        (cpt_next == 'l' && cpt_next_next == 'l')) {
+                        pos += _add_token(pos+3);
+                        continue;
                    }
-                    token =  utf_char;
-                    token += utf_char_next;
-                    token += utf_char_next_next;
+                }
+            }

-                    bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
-                    token = "";
-                    i += 2;
+            // regex: [^\r\n\p{L}\p{N}]?\p{L}+  //####FIXME: the first \p{L} is correct?
+            if (cpt != '\r' && cpt != '\n' && /*cpt_type != CODEPOINT_TYPE_LETTER &&*/ cpt_type != CODEPOINT_TYPE_NUMBER) {
+                if (cpt_type == CODEPOINT_TYPE_LETTER || _get_cpt_type(pos+1) == CODEPOINT_TYPE_LETTER) {  // one or more letters
+                    pos++;
+                    while (_get_cpt_type(pos) == CODEPOINT_TYPE_LETTER) {
+                        pos++;
+                    }
+                    _add_token(pos);
                    continue;
                }
            }

-            if (!split_condition && !collecting) {
-                if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
-                    collecting_letter = true;
-                    collecting = true;
-                }
-                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_NUMBER || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_NUMBER)) {
-                    collecting_numeric = true;
-                    collecting = true;
-                }
-                else if (
-                    ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_NUMBER) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
-                    (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_NUMBER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
-                    ) {
-                    collecting_special = true;
-                    collecting = true;
-                }
-                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
-                    collecting_whitespace_lookahead = true;
-                    collecting = true;
-                }
-                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
-                    split_condition = true;
-                }
-            }
-            else if (!split_condition && collecting) {
-                if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
-                    split_condition = true;
-                }
-                else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_NUMBER) {
-                    split_condition = true;
-                }
-                else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_NUMBER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
-                    split_condition = true;
-                }
-                else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_NUMBER)) {
-                    split_condition = true;
+            // regex: \p{N}{1,3}
+            if (cpt_type == CODEPOINT_TYPE_NUMBER) {
+                size_t ini = pos;
+                while (_get_cpt_type(pos) == CODEPOINT_TYPE_NUMBER) {
+                    if (++pos - ini >= 3 ) {
+                        _add_token(pos);
+                        ini = pos;
+                    }
                }
+                _add_token(pos);
+                continue;
            }

-            if (utf_char_next == "") {
-                split_condition = true; // final
-                token += utf_char;
+            // regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
+            char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
+            int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
+            if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
+                pos += (cpt == ' ');
+                while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
+                    cpt2_type = _get_cpt_type(++pos);
+                    cpt2 = _get_cpt(pos);
+                }
+                while (cpt2 == '\r' || cpt2 == '\n') {
+                    cpt2 = _get_cpt(++pos);
+                }
+                _add_token(pos);
+                continue;
            }

-            if (split_condition) {
-                if (token.size()) {
-                    bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
+            size_t num_whitespaces = 0;
+            size_t last_end_r_or_n = 0;
+            while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
+                char32_t cpt2 = _get_cpt(pos+num_whitespaces);
+                if (cpt2 == '\r' || cpt2 == '\n') {
+                    last_end_r_or_n = pos + num_whitespaces + 1;
                }
-                token = utf_char;
-                collecting = false;
-                collecting_letter = false;
-                collecting_numeric = false;
-                collecting_special = false;
-                collecting_whitespace_lookahead = false;
+                num_whitespaces++;
            }
-            else {
-                token += utf_char;
+
+            // regex: \s*[\r\n]+
+            if (last_end_r_or_n > 0) {
+                pos = last_end_r_or_n;
+                _add_token(pos);
+                continue;
            }
+
+            // regex: \s+(?!\S)
+            if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
+                pos += num_whitespaces - 1;
+                _add_token(pos);
+                continue;
+            }
+
+            // regex: \s+
+            if (num_whitespaces > 0) {
+                pos += num_whitespaces;
+                _add_token(pos);
+                continue;
+            }
+
+            // no matches
+            _add_token(++pos);
        }
-
-        start += offset;
    }

    return bpe_offsets;
@ -424,14 +543,14 @@ static std::vector<size_t> unicode_regex_split_stl(const std::string & text, con
 static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
    std::vector<size_t> bpe_offsets;

-    (void)(text);
-    (void)(regex_expr);
-    (void)(offsets);
-    // TODO: this implementation is actually wrong, uncomment and run:
-    //       make -j && ./bin/test-tokenizer-0 ../models/ggml-vocab-gpt-2.gguf
-    //if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
-    //    bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
-    //}
+    if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
+        bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
+    } else if (
+            regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" ||
+            regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
+
+        bpe_offsets = unicode_regex_split_custom_llama3(text, offsets);
+    }

    return bpe_offsets;
 }
@ -506,6 +625,19 @@ int unicode_cpt_type(const std::string & utf8) {
    return unicode_cpt_type(unicode_cpt_from_utf8(utf8, offset));
 }

+bool unicode_cpt_is_whitespace(uint32_t cp) {
+    static const std::unordered_set<uint32_t> is_whitespace = [] {
+        std::unordered_set<uint32_t> is_whitespace;
+        for (auto p : unicode_ranges_whitespace) {
+            for (auto i = p.first; i <= p.second; ++i) {
+                is_whitespace.insert(i);
+            }
+        }
+        return is_whitespace;
+    }();
+    return (bool)is_whitespace.count(cp);
+}
+
 std::string unicode_byte_to_utf8(uint8_t byte) {
    static std::unordered_map<uint8_t, std::string> map = unicode_byte_to_utf8_map();
    return map.at(byte);
--- a/unicode.h
+++ b/unicode.h
@ -7,7 +7,7 @@
 #define CODEPOINT_TYPE_UNIDENTIFIED 0
 #define CODEPOINT_TYPE_NUMBER       1
 #define CODEPOINT_TYPE_LETTER       2
-#define CODEPOINT_TYPE_WHITESPACE   3
+#define CODEPOINT_TYPE_SEPARATOR    3
 #define CODEPOINT_TYPE_ACCENT_MARK  4
 #define CODEPOINT_TYPE_PUNCTUATION  5
 #define CODEPOINT_TYPE_SYMBOL       6
@ -21,6 +21,8 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
 int unicode_cpt_type(uint32_t cp);
 int unicode_cpt_type(const std::string & utf8);

+bool unicode_cpt_is_whitespace(uint32_t cp);
+
 std::string unicode_byte_to_utf8(uint8_t byte);
 uint8_t unicode_utf8_to_byte(const std::string & utf8);