Merge commit '8c570c9496' into concedo_experimental

# Conflicts: # README.md # tests/test-backend-ops.cpp
2025-09-11 09:34:37 +00:00 · 2024-05-10 16:55:26 +08:00 · 2024-05-10 16:55:26 +08:00 · db82bad6f2
commit db82bad6f2
parent dbe72b959e 8c570c9496
18 changed files with 45581 additions and 35766 deletions
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@ -49,6 +49,10 @@ chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍
 if len(sys.argv) == 2:
    token = sys.argv[1]
    if not token.startswith("hf_"):
        logger.info("Huggingface token seems invalid")
        logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
        sys.exit(1)
 else:
    logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
    sys.exit(1)
@ -257,6 +261,7 @@ tests = [
    "3333333",
    "33333333",
    "333333333",
    # "Cửa Việt", # llama-bpe fails on this
    chktxt,
 ]
--- a/examples/convert-llama2c-to-ggml/README.md
+++ b/examples/convert-llama2c-to-ggml/README.md
@ -2,7 +2,7 @@
 This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default.
-To convert the model first download the models from the [llma2.c](https://github.com/karpathy/llama2.c) repository:
+To convert the model first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository:
 `$ make -j`
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@ -52,15 +52,15 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
                    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
                    float v;
                    if (type == GGML_TYPE_F16) {
-                        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) data + i);
+                        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
                    } else if (type == GGML_TYPE_F32) {
-                        v = *(float *) data + i;
+                        v = *(float *) &data[i];
                    } else if (type == GGML_TYPE_I32) {
-                        v = (float) *(int32_t *) data + i;
+                        v = (float) *(int32_t *) &data[i];
                    } else if (type == GGML_TYPE_I16) {
-                        v = (float) *(int16_t *) data + i;
+                        v = (float) *(int16_t *) &data[i];
                    } else if (type == GGML_TYPE_I8) {
-                        v = (float) *(int8_t *) data + i;
+                        v = (float) *(int8_t *) &data[i];
                    } else {
                        GGML_ASSERT(false);
                    }
--- a/ggml-cuda/common.cuh
+++ b/ggml-cuda/common.cuh
@ -234,122 +234,6 @@ typedef float dfloat; // dequantize float
 typedef float2 dfloat2;
 #endif //GGML_CUDA_F16
 [[noreturn]]
 static __device__ void no_device_code(
    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
           file_name, line, function_name, arch);
    GGML_UNUSED(arch_list);
 #else
    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
           file_name, line, function_name, arch, arch_list);
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    __trap();
    GGML_UNUSED(no_device_code); // suppress unused function warning
 }
 #ifdef __CUDA_ARCH__
 #define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
 #else
 #define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
 #endif // __CUDA_ARCH__
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
    }
    return x;
 }
 static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 #pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
    }
    return a;
 }
 static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 #pragma unroll
   for (int mask = 16; mask > 0; mask >>= 1) {
       a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
   }
   return a;
 #else
   GGML_UNUSED(a);
   NO_DEVICE_CODE;
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 }
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
    }
    return x;
 }
 static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 #if CUDART_VERSION >= CUDART_HMAX
    return __hmax(a, b);
 #else
    return __half2float(a) > __half2float(b) ? a : b;
 #endif // CUDART_VERSION >= CUDART_HMAX
 #else
    GGML_UNUSED(a);
    GGML_UNUSED(b);
    NO_DEVICE_CODE;
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
 }
 static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 #if CUDART_VERSION >= CUDART_HMAX
    return __hmax2(a, b);
 #else
    half2 ret;
    reinterpret_cast<half&>(ret.x) =  __low2float(a) >  __low2float(b) ?  __low2half(a) :  __low2half(b);
    reinterpret_cast<half&>(ret.y) = __high2float(a) > __high2float(b) ? __high2half(a) : __high2half(b);
    return ret;
 #endif // CUDART_VERSION >= CUDART_HMAX
 #else
    GGML_UNUSED(a);
    GGML_UNUSED(b);
    NO_DEVICE_CODE;
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
 }
 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 #pragma unroll
   for (int mask = 16; mask > 0; mask >>= 1) {
       x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
   }
   return x;
 #else
   GGML_UNUSED(x);
   NO_DEVICE_CODE;
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 }
 #if CUDART_VERSION < CUDART_HMASK
 static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
    const uint32_t mask_low  = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b)));
    const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
    return mask_low | mask_high;
 }
 #endif // CUDART_VERSION < 12000
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
@ -433,11 +317,143 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 }
 #endif // defined(GGML_USE_HIPBLAS)
-#define FP16_AVAILABLE     defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) ? \
+#define FP16_AVAILABLE (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
    defined(RDNA1) || defined(RDNA2) || defined(RDNA3) : __CUDA_ARCH__ >= CC_PASCAL
 #define FP16_MMA_AVAILABLE !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
 // Host-side predicate on a compute-capability value.
 // Returns true when `cc` is below CC_OFFSET_AMD and at least CC_VOLTA, false otherwise.
 // NOTE(review): presumably the runtime counterpart of the compile-time FP16_MMA_AVAILABLE macro,
 // with values >= CC_OFFSET_AMD denoting AMD devices — confirm against the CC_* definitions.
 static bool fp16_mma_available(const int cc) {
    return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
 }
 // Device-side fatal error handler: prints a diagnostic naming the source location, the kernel and
 // the architecture it was compiled for, then aborts the kernel via __trap(). Never returns.
 //
 //   file_name, line - source location to report
 //   function_name   - name of the kernel/function that has no usable device code
 //   arch            - architecture value to report
 //   arch_list       - list of architectures the binary was compiled for; only printed on the
 //                     non-HIP path, explicitly marked unused on the HIP/AMD path
 [[noreturn]]
 static __device__ void no_device_code(
    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    // HIP/AMD build: the message omits the compiled-arch list.
    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
           file_name, line, function_name, arch);
    GGML_UNUSED(arch_list);
 #else
    // CUDA build: additionally report which architectures were compiled in.
    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
           file_name, line, function_name, arch, arch_list);
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    __trap();
    GGML_UNUSED(no_device_code); // suppress unused function warning
 }
 // NO_DEVICE_CODE: statement-like macro placed in code paths that have no implementation for the
 // architecture being compiled.
 //  - When __CUDA_ARCH__ is defined it expands to a call to no_device_code() with the current
 //    file, line, function, __CUDA_ARCH__ and the stringized __CUDA_ARCH_LIST__.
 //  - Otherwise it expands to nothing; the text after the macro name is a commented-out assertion.
 #ifdef __CUDA_ARCH__
 #define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
 #else
 #define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
 #endif // __CUDA_ARCH__
 // Sums `x` across a 32-lane warp using XOR shuffles (lane distances 16, 8, 4, 2, 1).
 // The shuffle uses the full participation mask 0xffffffff, so all 32 lanes must call this together.
 // Each step adds the value held by the lane whose index differs in one bit, so after the loop
 // every lane returns the same total.
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
    }
    return x;
 }
 // float2 overload: sums the .x and .y components independently across a 32-lane warp using
 // XOR shuffles (lane distances 16, 8, 4, 2, 1) with the full participation mask 0xffffffff.
 // Returns the component-wise totals; all 32 lanes must call this together.
 static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 #pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
    }
    return a;
 }
 // half2 overload: sums both packed half values across a 32-lane warp using XOR shuffles
 // (lane distances 16, 8, 4, 2, 1) with the full participation mask 0xffffffff.
 // Only implemented when FP16_AVAILABLE evaluates true; otherwise NO_DEVICE_CODE is invoked and
 // the input is returned unchanged.
 static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 #if FP16_AVAILABLE
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    // HIP/AMD path: accumulate the low and high halves separately through half& views of a.x / a.y
    // instead of calling __hadd2.
 #pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32);
        reinterpret_cast<half&>(a.x) +=  __low2half(a_other);
        reinterpret_cast<half&>(a.y) += __high2half(a_other);
    }
    return a;
 #else
    // CUDA path: packed half2 addition of the shuffled value.
 #pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
    }
    return a;
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #else
    NO_DEVICE_CODE;
    return a;
 #endif // FP16_AVAILABLE
 }
 // Computes the maximum of `x` across a 32-lane warp using XOR shuffles (lane distances
 // 16, 8, 4, 2, 1) combined with fmaxf. Uses the full participation mask 0xffffffff, so all
 // 32 lanes must call this together; every lane returns the same maximum.
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
    }
    return x;
 }
 // Returns the larger of two half values.
 //  - FP16_AVAILABLE, non-HIP build with CUDART_VERSION < CUDART_HMAX: converts both operands to
 //    float, takes fmaxf and converts back to half.
 //  - FP16_AVAILABLE otherwise: calls __hmax directly.
 //  - FP16_AVAILABLE false: invokes NO_DEVICE_CODE and returns `a` (with `b` marked unused).
 // NOTE(review): CUDART_HMAX is presumably the first toolkit version that provides __hmax — confirm.
 static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
 #if FP16_AVAILABLE
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
    return __float2half(fmaxf(__half2float(a), __half2float(b)));
 #else
    return __hmax(a, b);
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
 #else
   NO_DEVICE_CODE;
   GGML_UNUSED(b);
   return a;
 #endif // FP16_AVAILABLE
 }
 // Returns the element-wise maximum of two half2 values.
 //  - Non-HIP build with CUDART_VERSION >= CUDART_HMAX: calls __hmax2.
 //  - Non-HIP build with an older toolkit: computes each half via float (fmaxf) and writes the
 //    results through half& views of ret.x / ret.y.
 //  - HIP/AMD build: no implementation; invokes NO_DEVICE_CODE.
 // Unlike ggml_cuda_hmax, the outer guard here tests the HIP macros rather than FP16_AVAILABLE.
 static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 #if CUDART_VERSION >= CUDART_HMAX
    return __hmax2(a, b);
 #else
    half2 ret;
    reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a),  __low2float(b)));
    reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
    return ret;
 #endif // CUDART_VERSION >= CUDART_HMAX
 #else
    GGML_UNUSED(a);
    GGML_UNUSED(b);
    NO_DEVICE_CODE;
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 }
 // half2 overload: element-wise maximum across a 32-lane warp using XOR shuffles (lane distances
 // 16, 8, 4, 2, 1) combined with ggml_cuda_hmax2. Uses the full participation mask 0xffffffff.
 // Only implemented for non-HIP builds with __CUDA_ARCH__ >= CC_PASCAL; every other configuration
 // invokes NO_DEVICE_CODE.
 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 #pragma unroll
   for (int mask = 16; mask > 0; mask >>= 1) {
       x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
   }
   return x;
 #else
   GGML_UNUSED(x);
   NO_DEVICE_CODE;
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 }
 // Fallback definition, compiled only when CUDART_VERSION < CUDART_HMASK.
 // Compares the two packed halves of `a` and `b` independently (after conversion to float) and
 // returns a 32-bit mask: bits 0-15 are all ones if low(a) > low(b), bits 16-31 are all ones if
 // high(a) > high(b); the respective bits are zero otherwise.
 // NOTE(review): presumably stands in for a toolkit-provided __hgt2_mask on older CUDA versions — confirm.
 #if CUDART_VERSION < CUDART_HMASK
 static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
    // The comparison yields 0 or 1; multiplying by the 16-bit pattern turns it into a lane mask.
    const uint32_t mask_low  = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b)));
    const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
    return mask_low | mask_high;
 }
 #endif // CUDART_VERSION < CUDART_HMASK
 // TODO: move to ggml-common.h
 static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
--- a/ggml-cuda/fattn.cu
+++ b/ggml-cuda/fattn.cu
@ -11,8 +11,10 @@
 #define HALF_MAX_HALF         __float2half(65504.0f/2) // Use neg. of this instead of -INFINITY to initialize KQ max vals to avoid NaN upon subtraction.
 #define SOFTMAX_FTZ_THRESHOLD -20.0f                   // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs.
-template<int D, int parallel_blocks> // D == head size
+template<int D, int ncols, int parallel_blocks> // D == head size
-__launch_bounds__(((D + WARP_SIZE - 1) / WARP_SIZE)*WARP_SIZE, 1)
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(D, 1)
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_vec_ext_f16(
        const char * __restrict__ Q,
        const char * __restrict__ K,
@ -44,55 +46,77 @@ static __global__ void flash_attn_vec_ext_f16(
 #if FP16_AVAILABLE
    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
-    const int ic = blockIdx.x / parallel_blocks; // Index of the Q/QKV column to work on.
+    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
    const int ip  =  blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    const float2 * Q_f2  = (const float2 *) (Q    + nb02* blockIdx.y              + nb01*ic);
+    const float2 * Q_f2  = (const float2 *) (Q    + nb02* blockIdx.y              + nb01*ic0);
    const half2  * K_h2  = (const half2  *) (K    + nb12*(blockIdx.y / gqa_ratio));
    const half   * V_h   = (const half   *) (V    + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
-    const half   * maskh = (const half   *)  mask + ne11*ic;
+    const half   * maskh = (const half   *)  mask + ne11*ic0;
    const int stride_KV  = nb11 / sizeof(half);
    const int stride_KV2 = nb11 / sizeof(half2);
-    constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
+    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
    constexpr int nwarps = D / WARP_SIZE;
    const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
-    __builtin_assume(tid < nwarps*WARP_SIZE);
+    __builtin_assume(tid < D);
-    __shared__ half KQ[nwarps*WARP_SIZE];
+    __shared__ half KQ[ncols*D];
-    KQ[tid] = -INFINITY;
+#pragma unroll
    for (int j = 0; j < ncols; ++j) {
        KQ[j*D + tid] = -HALF_MAX_HALF;
    }
    half2 * KQ2 = (half2 *) KQ;
-    half kqmax = -HALF_MAX_HALF;
+    half kqmax[ncols];
-    half kqsum = 0.0f;
+#pragma unroll
    for (int j = 0; j < ncols; ++j) {
        kqmax[j] = -HALF_MAX_HALF;
    }
    half kqsum[ncols] = {0.0f};
-    __shared__ half kqmax_shared[WARP_SIZE];
+    __shared__ half kqmax_shared[ncols][WARP_SIZE];
-    __shared__ half kqsum_shared[WARP_SIZE];
+    __shared__ half kqsum_shared[ncols][WARP_SIZE];
 #pragma unroll
    for (int j = 0; j < ncols; ++j) {
        if (threadIdx.y == 0) {
-        kqmax_shared[threadIdx.x] = -HALF_MAX_HALF;
+            kqmax_shared[j][threadIdx.x] = -HALF_MAX_HALF;
-        kqsum_shared[threadIdx.x] = 0.0f;
+            kqsum_shared[j][threadIdx.x] = 0.0f;
        }
    }
    __syncthreads();
    // Convert Q to half2 and store in registers:
-    half2 Q_h2[(D/2 + WARP_SIZE - 1) / WARP_SIZE];
+    half2 Q_h2[ncols][D/(2*WARP_SIZE)];
 #pragma unroll
    for (int j = 0; j < ncols; ++j) {
 #pragma unroll
        for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
            const int i = i0 + threadIdx.x;
-        if (i0 + WARP_SIZE > D/2 && i >= D/2) {
+
-            break;
+            const float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i];
            Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
        }
    }
-        Q_h2[i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(Q_f2[i].x, Q_f2[i].y);
+    half2 VKQ[ncols] = {{0.0f, 0.0f}};
    }
    half2 VKQ = make_half2(0.0f, 0.0f); // Each thread calculates a single VKQ value.
    const int k_start = parallel_blocks == 1 ? 0 : ip*D;
    for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
        // Calculate KQ tile and keep track of new maximum KQ values:
-        half kqmax_new = kqmax;
+
        // For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
        // see https://github.com/ggerganov/llama.cpp/pull/7061 .
        // Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
        half kqmax_new = kqmax[0];
        half kqmax_new_arr[ncols];
 #pragma unroll
        for (int j = 0; j < ncols; ++j) {
            kqmax_new_arr[j] = kqmax[j];
        }
 #pragma unroll
        for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
            const int i_KQ = i_KQ_0 + threadIdx.y;
@ -101,47 +125,65 @@ static __global__ void flash_attn_vec_ext_f16(
                break;
            }
-            half2 sum2 = make_half2(0.0f, 0.0f);
+            half2 sum2[ncols] = {{0.0f, 0.0f}};
 #pragma unroll
            for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
                const int k_KQ = k_KQ_0 + threadIdx.x;
                if (k_KQ_0 + WARP_SIZE > D/2 && k_KQ >= D/2) {
                    break;
                }
                const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
-                sum2 += K_ik * Q_h2[k_KQ_0/WARP_SIZE];
+#pragma unroll
                for (int j = 0; j < ncols; ++j) {
                    sum2[j] += K_ik * Q_h2[j][k_KQ_0/WARP_SIZE];
                }
            }
-            sum2 = warp_reduce_sum(sum2);
+#pragma unroll
-            half sum = __low2half(sum2) + __high2half(sum2);
+            for (int j = 0; j < ncols; ++j) {
-            sum += mask ? maskh[k_VKQ_0 + i_KQ] : __float2half(0.0f);
+                sum2[j] = warp_reduce_sum(sum2[j]);
                half sum = __low2half(sum2[j]) + __high2half(sum2[j]);
                sum += mask ? maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
                if (ncols == 1) {
                    kqmax_new        = ggml_cuda_hmax(kqmax_new,        sum);
                } else {
                    kqmax_new_arr[j] = ggml_cuda_hmax(kqmax_new_arr[j], sum);
                }
                if (threadIdx.x == 0) {
-                KQ[i_KQ] = sum;
+                    KQ[j*D + i_KQ] = sum;
                }
            }
        }
-        kqmax_new = warp_reduce_max(kqmax_new);
+#pragma unroll
        for (int j = 0; j < ncols; ++j) {
            half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j];
            kqmax_new_j = warp_reduce_max(kqmax_new_j);
            if (threadIdx.x == 0) {
-            kqmax_shared[threadIdx.y] = kqmax_new;
+                kqmax_shared[j][threadIdx.y] = kqmax_new_j;
            }
        }
        __syncthreads();
 #pragma unroll
        for (int j = 0; j < ncols; ++j) {
            half kqmax_new_j = kqmax_shared[j][threadIdx.x];
            kqmax_new_j = warp_reduce_max(kqmax_new_j);
            const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j);
            kqmax[j] = kqmax_new_j;
            const half val = hexp(KQ[j*D + tid] - kqmax[j]);
            kqsum[j] = kqsum[j]*KQ_max_scale + val;
            KQ[j*D + tid] = val;
            VKQ[j] *= __half2half2(KQ_max_scale);
        }
        __syncthreads();
        kqmax_new = kqmax_shared[threadIdx.x];
        kqmax_new = warp_reduce_max(kqmax_new);
        const half KQ_max_scale = hexp(kqmax - kqmax_new);
        kqmax = kqmax_new;
        const half val = hexp(KQ[tid] - kqmax);
        kqsum = kqsum*KQ_max_scale + val;
        KQ[tid] = val;
        VKQ *= __half2half2(KQ_max_scale);
        __syncthreads();
        if (tid < D) {
 #pragma unroll
        for (int k0 = 0; k0 < D; k0 += 2) {
            if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
@ -151,39 +193,44 @@ static __global__ void flash_attn_vec_ext_f16(
            half2 V_k;
            reinterpret_cast<half&>(V_k.x) = V_h[(k_VKQ_0 + k0 + 0)*stride_KV + tid];
            reinterpret_cast<half&>(V_k.y) = V_h[(k_VKQ_0 + k0 + 1)*stride_KV + tid];
-                VKQ += V_k*KQ2[k0/2];
+#pragma unroll
            for (int j = 0; j < ncols; ++j) {
                VKQ[j] += V_k*KQ2[j*(D/2) + k0/2];
            }
        }
        __syncthreads();
    }
-    if (tid >= D) {
+#pragma unroll
-        kqsum = 0.0f;
+    for (int j = 0; j < ncols; ++j) {
-    }
+        kqsum[j] = warp_reduce_sum(kqsum[j]);
    kqsum = warp_reduce_sum(kqsum);
        if (threadIdx.x == 0) {
-        kqsum_shared[threadIdx.y] = kqsum;
+            kqsum_shared[j][threadIdx.y] = kqsum[j];
        }
    }
    __syncthreads();
    kqsum = kqsum_shared[threadIdx.x];
    kqsum = warp_reduce_sum(kqsum);
-    if (tid >= D) {
+#pragma unroll
-        return;
+    for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
-    }
+        kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
        kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
-    half dst_val = (__low2half(VKQ) + __high2half(VKQ));
+        half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
        if (parallel_blocks == 1) {
-        dst_val /= kqsum;
+            dst_val /= kqsum[j_VKQ];
        }
        const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
        dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
    }
    dst[D*gridDim.y*blockIdx.x + D*blockIdx.y + tid] = dst_val;
-    if (parallel_blocks == 1 || tid != 0) {
+    if (parallel_blocks != 1 && tid != 0) {
-        return;
+#pragma unroll
        for (int j = 0; j < ncols; ++j) {
            dst_meta[(ic0 + j)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j], kqsum[j]);
        }
    }
    dst_meta[ic*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax, kqsum);
 #else
   NO_DEVICE_CODE;
 #endif // FP16_AVAILABLE
@ -191,7 +238,9 @@ static __global__ void flash_attn_vec_ext_f16(
 // D == head size, VKQ_stride == num VKQ rows calculated in parallel:
 template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t>
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(nwarps*WARP_SIZE, 1)
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_ext_f16(
        const char * __restrict__ Q,
        const char * __restrict__ K,
@ -573,7 +622,9 @@ static __global__ void flash_attn_ext_f16(
 }
 template<int D, int parallel_blocks> // D == head size
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(D, 1)
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_combine_results(
        const float  * __restrict__ VKQ_parts,
        const float2 * __restrict__ VKQ_meta,
@ -642,7 +693,7 @@ static_assert(get_VKQ_stride( 80, 1, 16) ==  16, "Test failed.");
 static_assert(get_VKQ_stride( 80, 2, 16) ==  16, "Test failed.");
 static_assert(get_VKQ_stride( 80, 4, 16) ==  16, "Test failed.");
-template <int D, int parallel_blocks> void launch_fattn_vec_f16(
+template <int D, int cols_per_block, int parallel_blocks> void launch_fattn_vec_f16(
        const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
        ggml_cuda_pool & pool, cudaStream_t main_stream
 ) {
@ -656,13 +707,13 @@ template <int D, int parallel_blocks> void launch_fattn_vec_f16(
    constexpr int  nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
    const     dim3 block_dim(WARP_SIZE, nwarps, 1);
-    const     dim3 blocks_num(parallel_blocks*Q->ne[1], Q->ne[2], Q->ne[3]);
+    const     dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]);
    const     int  shmem = 0;
    float scale;
    memcpy(&scale, KQV->op_params, sizeof(float));
-    flash_attn_vec_ext_f16<D, parallel_blocks>
+    flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>
        <<<blocks_num, block_dim, shmem, main_stream>>> (
                (const char *) Q->data,
                (const char *) K->data,
@ -783,10 +834,99 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
    ggml_cuda_set_device(ctx.device);
    const int cc  = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
    const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
    const int32_t precision = KQV->op_params[1];
    if (!fp16_mma_available(cc)) {
        GGML_ASSERT(precision == GGML_PREC_DEFAULT);
        GGML_ASSERT(Q->ne[0] == 64 || Q->ne[0] == 128 && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
        if (Q->ne[1] == 1) {
            constexpr int cols_per_block = 1;
            constexpr int parallel_blocks = 4;
            switch (Q->ne[0]) {
                case 64:
                    launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                    break;
                case 128:
                    launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                    break;
                default:
                    GGML_ASSERT(false);
                    break;
            }
            return;
        }
        if (Q->ne[1] == 2) {
            constexpr int cols_per_block = 2;
            constexpr int parallel_blocks = 4;
            switch (Q->ne[0]) {
                case 64:
                    launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                    break;
                case 128:
                    launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                    break;
                default:
                    GGML_ASSERT(false);
                    break;
            }
            return;
        }
        if (Q->ne[1] <= 4) {
            constexpr int cols_per_block = 4;
            constexpr int parallel_blocks = 4;
            switch (Q->ne[0]) {
                case 64:
                    launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                    break;
                case 128:
                    launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                    break;
                default:
                    GGML_ASSERT(false);
                    break;
            }
            return;
        }
        if (Q->ne[1] <= 8) {
            constexpr int cols_per_block = 8;
            constexpr int parallel_blocks = 4;
            switch (Q->ne[0]) {
                case 64:
                    launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                    break;
                case 128:
                    launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                    break;
                default:
                    GGML_ASSERT(false);
                    break;
            }
            return;
        }
        constexpr int cols_per_block = 8;
        constexpr int parallel_blocks = 1;
        switch (Q->ne[0]) {
            case 64:
                launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                break;
            case 128:
                launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                break;
            default:
                GGML_ASSERT(false);
                break;
        }
        return;
    }
    if (precision != GGML_PREC_DEFAULT) {
        if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
            constexpr int cols_per_block = 16;
@ -845,16 +985,17 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
    }
    if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) {
        constexpr int cols_per_block = 1;
        constexpr int parallel_blocks = 4;
        switch (Q->ne[0]) {
            case 64:
-                launch_fattn_vec_f16< 64, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                break;
            case 128:
-                launch_fattn_vec_f16<128, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                break;
            case 256:
-                launch_fattn_vec_f16<256, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                launch_fattn_vec_f16<256, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                break;
            default:
                GGML_ASSERT(false);
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@ -2128,6 +2128,7 @@ static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_
    if (alignment == (cl_uint)-1) {
        ggml_cl_init();
        clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL);
        alignment /= 8; // bits to bytes
    }
    return alignment;
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@ -8330,22 +8330,24 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
    const int blocks_per_row = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
-// partial sum for each thread
+    const int qi_vdr = (qi / vdr); // N_threads processing 1 qk block
    // partial sum for each thread
    float tmp = 0.0f;
    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;
-    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
+    for (int i = item_ct1.get_local_id(2) / qi_vdr; i < blocks_per_row;
         i += blocks_per_warp) {
-        const int ibx = row*blocks_per_row + i; // x block index
+      const int ibx = row * blocks_per_row + i; // x block index
-        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
+      const int iby = i * (qk / QK8_1); // y block index that aligns with ibx
      const int iqs =
          vdr *
-            (item_ct1.get_local_id(2) %
+          (item_ct1.get_local_id(2) -
-             (qi / vdr)); // x block quant index when casting the quants to int
+           i * qi_vdr); // x block quant index when casting the quants to int
      tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
    }
--- a/ggml-vulkan-shaders.hpp
+++ b/ggml-vulkan-shaders.hpp
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
--- a/ggml_vk_generate_shaders.py
+++ b/ggml_vk_generate_shaders.py
--- a/gguf-py/scripts/gguf-new-metadata.py
+++ b/gguf-py/scripts/gguf-new-metadata.py
@ -7,7 +7,8 @@ import json
 from pathlib import Path
 import numpy as np
-from typing import Any, Sequence
+from tqdm import tqdm
 from typing import Any, Sequence, NamedTuple
 # Necessary to load the local gguf package
 if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
@ -18,6 +19,12 @@ import gguf
 logger = logging.getLogger("gguf-new-metadata")
 class MetadataDetails(NamedTuple):
    type: gguf.GGUFValueType
    value: Any
    description: str = ''
 def get_byteorder(reader: gguf.GGUFReader) -> gguf.GGUFEndian:
    if np.uint32(1) == np.uint32(1).newbyteorder("<"):
        # Host is little endian
@ -59,7 +66,16 @@ def get_field_data(reader: gguf.GGUFReader, key: str) -> Any:
    return decode_field(field)
-def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new_metadata: dict[str, str], remove_metadata: Sequence[str]) -> None:
+def find_token(token_list: Sequence[int], token: str) -> Sequence[int]:
    token_ids = [index for index, value in enumerate(token_list) if value == token]
    if len(token_ids) == 0:
        raise LookupError(f'Unable to find "{token}" in token list!')
    return token_ids
 def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new_metadata: dict[str, MetadataDetails], remove_metadata: Sequence[str]) -> None:
    for field in reader.fields.values():
        # Suppress virtual fields and fields written by GGUFWriter
        if field.name == gguf.Keys.General.ARCHITECTURE or field.name.startswith('GGUF.'):
@ -75,54 +91,64 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new
            logger.debug(f'Removing {field.name}')
            continue
-        old_val = decode_field(field)
+        old_val = MetadataDetails(field.types[0], decode_field(field))
        val = new_metadata.get(field.name, old_val)
        if field.name in new_metadata:
-            logger.debug(f'Modifying {field.name}: "{old_val}" -> "{val}"')
+            logger.debug(f'Modifying {field.name}: "{old_val.value}" -> "{val.value}" {val.description}')
            del new_metadata[field.name]
-        elif val is not None:
+        elif val.value is not None:
            logger.debug(f'Copying {field.name}')
-        if val is not None:
+        if val.value is not None:
            writer.add_key(field.name)
-            writer.add_val(val, field.types[0])
+            writer.add_val(val.value, val.type)
    if gguf.Keys.Tokenizer.CHAT_TEMPLATE in new_metadata:
        logger.debug('Adding chat template(s)')
-        writer.add_chat_template(new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE])
+        writer.add_chat_template(new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE].value)
        del new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE]
    # TODO: Support other types than string?
    for key, val in new_metadata.items():
-        logger.debug(f'Adding {key}: {val}')
+        logger.debug(f'Adding {key}: "{val.value}" {val.description}')
        writer.add_key(key)
-        writer.add_val(val, gguf.GGUFValueType.STRING)
+        writer.add_val(val.value, val.type)
    total_bytes = 0
    for tensor in reader.tensors:
        total_bytes += tensor.n_bytes
        # Dimensions are written in reverse order, so flip them first
        shape = np.flipud(tensor.shape).tolist()
        writer.add_tensor_info(tensor.name, shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
    bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_ti_data_to_file()
    for tensor in reader.tensors:
        writer.write_tensor_data(tensor.data)
        bar.update(tensor.n_bytes)
    writer.close()
 def main() -> None:
    tokenizer_metadata = (getattr(gguf.Keys.Tokenizer, n) for n in gguf.Keys.Tokenizer.__dict__.keys() if not n.startswith('_'))
    token_names = dict((n.split('.')[-1][:-len('_token_id')], n) for n in tokenizer_metadata if n.endswith('_token_id'))
    parser = argparse.ArgumentParser(description="Make a copy of a GGUF file with new metadata")
    parser.add_argument("input",                                       type=Path, help="GGUF format model input filename")
    parser.add_argument("output",                                      type=Path, help="GGUF format model output filename")
-    parser.add_argument("--general-name",                              type=str,  help="The models general.name")
+    parser.add_argument("--general-name",                              type=str,  help="The models general.name", metavar='"name"')
-    parser.add_argument("--general-description",                       type=str,  help="The models general.description")
+    parser.add_argument("--general-description",                       type=str,  help="The models general.description", metavar='"Description ..."')
-    parser.add_argument("--chat-template",                             type=str,  help="Chat template string (or JSON string containing templates)")
+    parser.add_argument("--chat-template",                             type=str,  help="Chat template string (or JSON string containing templates)", metavar='"{% ... %} ..."')
-    parser.add_argument("--chat-template-config",                      type=Path, help="Config file (tokenizer_config.json) containing chat template(s)")
+    parser.add_argument("--chat-template-config",                      type=Path, help="Config file containing chat template(s)", metavar='tokenizer_config.json')
-    parser.add_argument("--remove-metadata",      action="append",     type=str,  help="Remove metadata (by key name) from output model")
+    parser.add_argument("--remove-metadata",      action="append",     type=str,  help="Remove metadata (by key name) from output model", metavar='general.url')
    parser.add_argument("--special-token",        action="append",     type=str,  help="Special token by value", nargs=2, metavar=(' | '.join(token_names.keys()), '"<token>"'))
    parser.add_argument("--special-token-by-id",  action="append",     type=str,  help="Special token by id", nargs=2, metavar=(' | '.join(token_names.keys()), '0'))
    parser.add_argument("--force",                action="store_true",            help="Bypass warnings without confirmation")
    parser.add_argument("--verbose",              action="store_true",            help="Increase output verbosity")
    args = parser.parse_args(None if len(sys.argv) > 2 else ["--help"])
@ -133,20 +159,20 @@ def main() -> None:
    remove_metadata = args.remove_metadata or []
    if args.general_name:
-        new_metadata[gguf.Keys.General.NAME] = args.general_name
+        new_metadata[gguf.Keys.General.NAME] = MetadataDetails(gguf.GGUFValueType.STRING, args.general_name)
    if args.general_description:
-        new_metadata[gguf.Keys.General.DESCRIPTION] = args.general_description
+        new_metadata[gguf.Keys.General.DESCRIPTION] = MetadataDetails(gguf.GGUFValueType.STRING, args.general_description)
    if args.chat_template:
-        new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = json.loads(args.chat_template) if args.chat_template.startswith('[') else args.chat_template
+        new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, json.loads(args.chat_template) if args.chat_template.startswith('[') else args.chat_template)
    if args.chat_template_config:
        with open(args.chat_template_config, 'r') as fp:
            config = json.load(fp)
            template = config.get('chat_template')
            if template:
-                new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = template
+                new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, template)
    if remove_metadata:
        logger.warning('*** Warning *** Warning *** Warning **')
@ -166,6 +192,32 @@ def main() -> None:
    arch = get_field_data(reader, gguf.Keys.General.ARCHITECTURE)
    endianess = get_byteorder(reader)
    token_list = get_field_data(reader, gguf.Keys.Tokenizer.LIST) or []
    for name, token in args.special_token or []:
        if name not in token_names:
            logger.warning(f'Unknown special token "{name}", ignoring...')
        else:
            ids = find_token(token_list, token)
            new_metadata[token_names[name]] = MetadataDetails(gguf.GGUFValueType.UINT32, ids[0], f'= {token}')
            if len(ids) > 1:
                logger.warning(f'Multiple "{token}" tokens found, choosing ID {ids[0]}, use --special-token-by-id if you want another:')
                logger.warning(', '.join(str(i) for i in ids))
    for name, id_string in args.special_token_by_id or []:
        if name not in token_names:
            logger.warning(f'Unknown special token "{name}", ignoring...')
        elif not id_string.isdecimal():
            raise LookupError(f'Token ID "{id_string}" is not a valid ID!')
        else:
            id_int = int(id_string)
            if id_int >= 0 and id_int < len(token_list):
                new_metadata[token_names[name]] = MetadataDetails(gguf.GGUFValueType.UINT32, id_int, f'= {token_list[id_int]}')
            else:
                raise LookupError(f'Token ID {id_int} is not within token list!')
    if os.path.isfile(args.output) and not args.force:
        logger.warning('*** Warning *** Warning *** Warning **')
        logger.warning(f'* The "{args.output}" GGUF file already exists, it will be overwritten!')
--- a/llama.cpp
+++ b/llama.cpp
@ -12779,7 +12779,7 @@ struct llm_tokenizer_wpm {
                continue;
            }
            code = unicode_tolower(code);
-            if (type == CODEPOINT_TYPE_WHITESPACE) {
+            if (type == CODEPOINT_TYPE_SEPARATOR) {
                code = ' ';
            }
            std::string s = unicode_cpt_to_utf8(code);
@ -15824,13 +15824,6 @@ struct llama_context * llama_new_context_with_model(
        cparams.flash_attn = false;
    }
 #ifdef GGML_USE_HIPBLAS
    if (cparams.flash_attn) {
        LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with HIPBLAS builds - forcing off\n", __func__);
        cparams.flash_attn = false;
    }
 #endif
    if (params.seed == LLAMA_DEFAULT_SEED) {
        params.seed = time(NULL);
    }
@ -18199,7 +18192,7 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
        /*.t_eval_ms   =*/ 1e-3 * ctx->t_eval_us,
        /*.n_sample =*/ std::max(1, ctx->n_sample),
-        /*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
+        /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
        /*.n_eval   =*/ std::max(1, ctx->n_eval),
    };
--- a/scripts/gen-unicode-data.py
+++ b/scripts/gen-unicode-data.py
@ -1,31 +1,14 @@
 import regex
 def cpt_to_utf8_str(cpt):
    # Despite the name, the result is not UTF-8: the codepoint is packed into
    # four bytes, least significant byte first (is_match decodes it as 'utf-32').
    # The top byte is intentionally left unmasked so that negative values and
    # values that do not fit in 32 bits still make bytes() raise ValueError.
    return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, cpt >> 24])
 def is_match(codepoint, regex_expr):
    # True when regex_expr matches at the start of the one-character string
    # obtained by decoding the codepoint's 4-byte form as UTF-32.
    # Any exception raised along the way is treated as "no match".
    try:
        matched = regex.match(regex_expr, cpt_to_utf8_str(codepoint).decode('utf-32')) is not None
    except Exception:
        matched = False
    return matched
 def get_matches(regex_expr):
    regex_expr_compiled = regex.compile(regex_expr)
    unicode_ranges = []
    current_range = None
    for codepoint in range(0x110000):
-        if is_match(codepoint, regex_expr):
+        char = chr(codepoint)
        if regex_expr_compiled.match(char):
            if current_range is None:
                current_range = [codepoint, codepoint]
            else:
@ -40,27 +23,42 @@ def get_matches(regex_expr):
    return unicode_ranges
-def print_cat(cat, ranges):
+def print_cat(mode, cat, ranges):
    if mode == "range":
        print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat)) # noqa: NP100
-    cnt = 0
+    if mode == "map":
-    for start, end in ranges:
+        print("const std::map<uint32_t, uint32_t> unicode_map_{} = {{".format(cat)) # noqa: NP100
-        if cnt % 4 != 0:
+    for i, values in enumerate(ranges):
-            print(" ", end="") # noqa: NP100
+        end = ",\n" if (i % 4 == 3 or i + 1 == len(ranges)) else ", "
-        print("{{0x{:08X}, 0x{:08X}}},".format(start, end), end="") # noqa: NP100
+        values = ["0x%08X" % value for value in values]
-        if cnt % 4 == 3:
+        print("{" + ", ".join(values) + "}", end=end) # noqa: NP100
            print("") # noqa: NP100
        cnt += 1
    if cnt % 4 != 0:
        print("") # noqa: NP100
    print("};") # noqa: NP100
    print("") # noqa: NP100
-print_cat("number",      get_matches(r'\p{N}'))
+print_cat("range", "number",      get_matches(r'\p{N}'))
-print_cat("letter",      get_matches(r'\p{L}'))
+print_cat("range", "letter",      get_matches(r'\p{L}'))
-print_cat("whitespace",  get_matches(r'\p{Z}'))
+print_cat("range", "separator",   get_matches(r'\p{Z}'))
-print_cat("accent_mark", get_matches(r'\p{M}'))
+print_cat("range", "accent_mark", get_matches(r'\p{M}'))
-print_cat("punctuation", get_matches(r'\p{P}'))
+print_cat("range", "punctuation", get_matches(r'\p{P}'))
-print_cat("symbol",      get_matches(r'\p{S}'))
+print_cat("range", "symbol",      get_matches(r'\p{S}'))
-print_cat("control",     get_matches(r'\p{C}'))
+print_cat("range", "control",     get_matches(r'\p{C}'))
 print_cat("range", "whitespace",  get_matches(r'\s'))
 map_lowercase = []
 map_uppercase = []
 for codepoint in range(0x110000):
    char = chr(codepoint)
    lower = ord(char.lower()[0])
    upper = ord(char.upper()[0])
    if codepoint != lower:
        map_lowercase.append((codepoint, lower))
    if codepoint != upper:
        map_uppercase.append((codepoint, upper))
 print_cat("map", "lowercase", map_lowercase)
 print_cat("map", "uppercase", map_uppercase)
 # TODO: generate unicode_map_nfd
--- a/tests/test-tokenizer-random.py
+++ b/tests/test-tokenizer-random.py
@ -0,0 +1,295 @@
 # Test libllama tokenizer == AutoTokenizer.
 # Brute force random tokens/text generation.
 #
 # Sample usage:
 #
 #   python3 tests/test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe
 #
 import time
 import logging
 import argparse
 import subprocess
 import random
 from typing import Iterator
 import cffi
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
 logger = logging.getLogger("test-tokenizer-random-bpe")
 class LibLlama:
    """Loads libllama through cffi and keeps the resulting (ffi, lib) pair."""
    DEFAULT_PATH_LLAMA_H = "./llama.h"
    DEFAULT_PATH_LIBLLAMA = "./build/libllama.so"  # CMakeLists.txt: BUILD_SHARED_LIBS ON
    def __init__(self, path_llama_h: str = None, path_libllama: str = None):
        """Load the library and call llama_backend_init().

        Missing (falsy) paths fall back to DEFAULT_PATH_LLAMA_H / DEFAULT_PATH_LIBLLAMA.
        """
        path_llama_h = path_llama_h or self.DEFAULT_PATH_LLAMA_H
        path_libllama = path_libllama or self.DEFAULT_PATH_LIBLLAMA
        (self.ffi, self.lib) = self._load_libllama_cffi(path_llama_h, path_libllama)
        self.lib.llama_backend_init()
    def _load_libllama_cffi(self, path_llama_h: str, path_libllama: str):
        """Preprocess the header with gcc, declare it to cffi and dlopen the library.

        Returns the (ffi, lib) tuple.
        Raises RuntimeError when gcc exits with a non-zero status.
        """
        cmd = ["gcc", "-E", "-P", "-D__restrict=", "-D__attribute__(x)=", "-D__asm__(x)=", path_llama_h]
        res = subprocess.run(cmd, stdout=subprocess.PIPE)
        # Explicit check rather than `assert`: asserts are removed under `python -O`
        # and give no hint about what went wrong.
        if res.returncode != 0:
            raise RuntimeError("error: failed to preprocess '%s' (gcc exit code %d)" % (path_llama_h, res.returncode))
        source = res.stdout.decode()
        ffi = cffi.FFI()
        # workarounds for pycparser
        source = "typedef struct { } __builtin_va_list;" + "\n" + source
        source = source.replace("sizeof (int)",    str(ffi.sizeof("int")))
        source = source.replace("sizeof (void *)", str(ffi.sizeof("void*")))
        source = source.replace("sizeof (size_t)", str(ffi.sizeof("size_t")))
        source = source.replace("sizeof(int32_t)", str(ffi.sizeof("int32_t")))
        ffi.cdef(source, override=True)
        lib = ffi.dlopen(path_libllama)
        return (ffi, lib)
    def model_default_params(self, **kwargs):
        """Return llama_model_default_params() with each keyword set as a field override."""
        mparams = self.lib.llama_model_default_params()
        for k, v in kwargs.items():
            setattr(mparams, k, v)
        return mparams
    def context_default_params(self, **kwargs):
        """Return llama_context_default_params() with each keyword set as a field override."""
        cparams = self.lib.llama_context_default_params()
        for k, v in kwargs.items():
            setattr(cparams, k, v)
        return cparams
 class LibLlamaModel:
    """A model and context created through a LibLlama instance, plus a reusable token-id buffer."""
    def __init__(self, libllama: LibLlama, path_model: str, mparams={}, cparams={}):
        """Load path_model and create a context for it.

        mparams / cparams: a dict is turned into the library defaults with the
        dict entries applied as overrides; any other value is passed through as is.
        The token buffer is sized with llama_n_ctx() of the new context.
        Raises RuntimeError when the model or the context cannot be created.
        """
        self.lib = libllama.lib
        self.ffi = libllama.ffi
        if isinstance(mparams, dict):
            mparams = libllama.model_default_params(**mparams)
        self.model = self.lib.llama_load_model_from_file(path_model.encode(), mparams)
        if not self.model:
            raise RuntimeError("error: failed to load model '%s'" % path_model)
        if isinstance(cparams, dict):
            cparams = libllama.context_default_params(**cparams)
        self.ctx = self.lib.llama_new_context_with_model(self.model, cparams)
        if not self.ctx:
            # Release the model that was already loaded instead of leaking it.
            self.lib.llama_free_model(self.model)
            self.model = None
            raise RuntimeError("error: failed to create context for model '%s'" % path_model)
        n_tokens_max = self.lib.llama_n_ctx(self.ctx)
        self.token_ids = self.ffi.new("llama_token[]", n_tokens_max)
    def free(self):
        """Free the context and the model (if still held) and drop the library handle."""
        if self.ctx:
            self.lib.llama_free(self.ctx)
        if self.model:
            self.lib.llama_free_model(self.model)
        self.ctx = None
        self.model = None
        self.lib = None
    def tokenize(self, text: str, n_tokens_max: int = 0, add_special: bool = False, parse_special: bool = False) -> list[int]:
        """Tokenize text with llama_tokenize and return the ids as a list.

        n_tokens_max <= 0 selects the full size of the internal buffer; larger
        values are clamped to that size.
        Returns [] when llama_tokenize returns a negative count.
        """
        capacity = len(self.token_ids)
        # Never announce more room than self.token_ids really has; the limit is
        # handed to the C function together with that buffer.
        n_tokens_max = min(n_tokens_max, capacity) if n_tokens_max > 0 else capacity
        data = text.encode("utf-8")
        num = self.lib.llama_tokenize(self.model, data, len(data), self.token_ids, n_tokens_max, add_special, parse_special)
        if num < 0:
            return []
        return list(self.token_ids[0:num])
 def generator_custom_text() -> Iterator[str]:
    """Yield the fixed set of general tokenizer test strings, in order."""
    samples = (
        "",
        " ",
        "  ",
        "   ",
        "\t",
        "\n",
        "\n\n",
        "\n\n\n",
        "\t\n",
        "Hello world",
        " Hello world",
        "Hello World",
        " Hello World",
        " Hello World!",
        "Hello, world!",
        " Hello, world!",
        " this is 🦙.cpp",
        "w048 7tuijk dsdfhu",
        "нещо на Български",
        "កាន់តែពិសេសអាចខលចេញ",
        "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
        "Hello",
        " Hello",
        "  Hello",
        "   Hello",
        "    Hello",
        "    Hello\n    Hello",
        " (",
        "\n =",
        "' era",
        "Hello, y'all! How are you 😁 ?我想在apple工作1314151天～",
        "3",
        "33",
        "333",
        "3333",
        "33333",
        "333333",
        "3333333",
        "33333333",
        "333333333",
    )
    for sample in samples:
        yield sample
 def generator_custom_text_edge_cases() -> Iterator[str]:
    """Yield the strings that exposed tokenizer problems while debugging, in order."""
    cases = (
        '\x1f-a',   # unicode_ranges_control, {0x00001C, 0x00001F}
        '¼-a',      # unicode_ranges_digit, 0x00BC
        '½-a',      # unicode_ranges_digit, 0x00BD
        '¾-a',      # unicode_ranges_digit, 0x00BE
        'a 〇b',    # unicode_ranges_digit, 0x3007
        'Ⅵ-a',     # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
        '\uFEFF//', # unicode_ranges_control, 0xFEFF (BOM)
        '<s>a',     # TODO: Phi-3 fail
    )
    for case in cases:
        yield case
 def generator_random_chars(iterations = 100) -> Iterator[str]:
    """Brute force random text with simple characters.

    Yields `iterations` texts. Text number m is built after seeding the random
    generator with m: 300-400 words of 1-7 characters, each followed by one
    whitespace character.
    """
    WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
    # sorted(): the iteration order of a set of str follows the string hashes,
    # which are randomized per process by default. Without a fixed order the
    # same seed would produce different texts from one run to the next.
    CHARS = sorted(set("""
        ABCDEFGHIJKLMNOPQRSTUVWXYZ
        abcdefghijklmnopqrstuvwxyz
        ÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÄËÏÖÜ
        áéíóúàèìòùâêîôûäëïöü
        .-,*/-+ª!"·$%&/()=?¿[]{}<>\\|@#~½¬~;:_
    """))
    rand = random.Random()
    for m in range(iterations):
        rand.seed(m)
        text = []
        num_words = rand.randint(300, 400)
        for i in range(num_words):
            k = rand.randint(1, 7)
            word = rand.choices(CHARS, k=k)
            space = rand.choice(WHITESPACES)
            text.append("".join(word) + space)
        yield "".join(text)
 def generator_random_vocab_chars(tokenizer: PreTrainedTokenizerBase, iterations = 100) -> Iterator[str]:
    """Brute force random text with vocab characters.

    The alphabet is every distinct character of the decoded vocabulary.
    Yields `iterations` texts of 1024 characters; text number m is drawn after
    seeding the random generator with m.
    """
    vocab_ids = list(tokenizer.vocab.values())
    vocab_text = tokenizer.decode(vocab_ids, skip_special_tokens=True)
    # sorted(): set iteration order follows the per-process randomized string
    # hashes, so an unsorted alphabet would make seed m yield a different text
    # on every run.
    vocab_chars = sorted(set(vocab_text))
    del vocab_ids, vocab_text
    rand = random.Random()
    for m in range(iterations):
        rand.seed(m)
        text = rand.choices(vocab_chars, k=1024)
        yield "".join(text)
 def generator_random_vocab_tokens(tokenizer: PreTrainedTokenizerBase, iterations = 100) -> Iterator[str]:
    """Yield every vocab token once, then `iterations` random texts built from vocab tokens."""
    separator_id = tokenizer.encode(" ", add_special_tokens=False)[0]
    # Ascending ids, each one followed by the id of a single space, so that the
    # decoded text can be cut back into tokens on " ".
    interleaved = []
    for token_id in sorted(tokenizer.vocab.values()):
        interleaved += [token_id, separator_id]
    vocab_tokens = tokenizer.decode(interleaved, skip_special_tokens=True).split(" ")
    del interleaved
    for token in vocab_tokens:
        yield token
    rand = random.Random()
    for seed in range(iterations):
        rand.seed(seed)
        pieces = []
        for _ in range(rand.randint(300, 400)):
            # Draw order per word: token count, the tokens, then the separator.
            picked = rand.choices(vocab_tokens, k=rand.randint(1, 3))
            word = "".join(t.strip(" \n\r\t") for t in picked)
            pieces.append(word + rand.choice("     \n\r\t"))
        yield "".join(pieces)
 def generator_random_bytes(iterations = 100) -> Iterator[str]:
    """Yield `iterations` texts made of random byte values (zero bytes dropped), one seed per text."""
    WHITESPACES = [" "] * 20 + ["\n"] * 5 + ["\r", "\n"] * 5 + ["\t"] * 5
    rand = random.Random()
    for seed in range(iterations):
        rand.seed(seed)
        pieces = []
        for _ in range(rand.randint(300, 400)):
            # Draw order per word: length, the bytes, then the trailing whitespace.
            count = rand.randint(1, 8)
            payload = "".join(chr(value) for value in rand.randbytes(count) if value)
            pieces.append(payload + rand.choice(WHITESPACES))
        yield "".join(pieces)
 def test_compare_tokenizer(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase, generator: Iterator[str]):
    """Tokenize every text from generator with both tokenizers and stop at the first difference.

    For each text, model.tokenize(...) is compared with tokenizer.encode(...).
    On a mismatch, the ids around the first differing position (two before, two
    after) are logged for both sides and an Exception is raised.
    The elapsed time is logged when all texts match.
    """
    def find_first_mismatch(ids1: list[int], ids2: list[int]):
        # Index of the first differing element, the shorter length when one list
        # is a prefix of the other, or -1 when both lists are equal.
        for i, (a,b) in enumerate(zip(ids1, ids2)):
            if a != b:
                return i
        if len(ids1) == len(ids2):
            return -1
        return min(len(ids1), len(ids2))
    # Plain iterators (e.g. iter([...])) have no __name__; fall back to the type name.
    name = getattr(generator, "__name__", type(generator).__name__)
    t0 = time.perf_counter()
    logger.info("%s: %s" % (name, "ini"))
    for text in generator:
        ids1 = model.tokenize(text, add_special=False, parse_special=False)
        ids2 = tokenizer.encode(text, add_special_tokens=False)
        if ids1 != ids2:
            i = find_first_mismatch(ids1, ids2)
            ids1 = list(ids1)[max(0, i - 2) : i + 2 + 1]
            ids2 = list(ids2)[max(0, i - 2) : i + 2 + 1]
            text2 = tokenizer.decode(ids2, skip_special_tokens=True)
            # Log the details first so they are not lost if the sanity check below fails.
            logger.info(" Text:     " + repr(text2))
            logger.info(" TokenIDs: " + str(ids1))
            logger.info(" Expected: " + str(ids2))
            assert (text2 in text)
            raise Exception("%s: tokenizer mismatch at token index %d" % (name, i))
    t1 = time.perf_counter()
    logger.info("%s: end, time: %.3f secs" % (name, t1 - t0))
 if __name__ == "__main__":
    # Command line: <vocab gguf file> <tokenizer directory> [--verbose]
    cli = argparse.ArgumentParser()
    cli.add_argument("vocab_file", help="path to vocab 'gguf' file")
    cli.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
    cli.add_argument("--verbose", action="store_true", help="increase output verbosity")
    options = cli.parse_args()
    log_level = logging.DEBUG if options.verbose else logging.INFO
    logging.basicConfig(level=log_level)
    llama_model = LibLlamaModel(LibLlama(), options.vocab_file, mparams={"vocab_only": True}, cparams={"n_ctx": 2048})
    hf_tokenizer = AutoTokenizer.from_pretrained(options.dir_tokenizer)
    # Generator objects are lazy, so nothing is produced until the comparison
    # loop below consumes them one after the other.
    suites = (
        generator_custom_text(),
        generator_custom_text_edge_cases(),
        generator_random_chars(10_000),
        generator_random_vocab_chars(hf_tokenizer, 10_000),
        generator_random_vocab_tokens(hf_tokenizer, 10_000),
        # generator_random_bytes(10_000),  # FAIL
    )
    for suite in suites:
        test_compare_tokenizer(llama_model, hf_tokenizer, suite)
    llama_model.free()
--- a/unicode-data.cpp
+++ b/unicode-data.cpp
--- a/unicode-data.h
+++ b/unicode-data.h
@ -7,6 +7,7 @@
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_number;
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_separator;
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark;
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation;
--- a/unicode.cpp
+++ b/unicode.cpp
@ -9,6 +9,7 @@
 #include <stdexcept>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include <utility>
 #include <vector>
 #include <locale>
@ -111,27 +112,27 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset)
 static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
    std::unordered_map<uint32_t, int> cpt_types;
    for (auto p : unicode_ranges_number) {
-        for (auto i = p.first; i <= p.second; ++ i) {
+        for (auto i = p.first; i <= p.second; ++i) {
            cpt_types[i] = CODEPOINT_TYPE_NUMBER;
        }
    }
    for (auto p : unicode_ranges_letter) {
-        for (auto i = p.first; i <= p.second; ++ i) {
+        for (auto i = p.first; i <= p.second; ++i) {
            cpt_types[i] = CODEPOINT_TYPE_LETTER;
        }
    }
-    for (auto p : unicode_ranges_whitespace) {
+    for (auto p : unicode_ranges_separator) {
-        for (auto i = p.first; i <= p.second; ++ i) {
+        for (auto i = p.first; i <= p.second; ++i) {
-            cpt_types[i] = CODEPOINT_TYPE_WHITESPACE;
+            cpt_types[i] = CODEPOINT_TYPE_SEPARATOR;
        }
    }
    for (auto p : unicode_ranges_accent_mark) {
-        for (auto i = p.first; i <= p.second; ++ i) {
+        for (auto i = p.first; i <= p.second; ++i) {
            cpt_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
        }
    }
    for (auto p : unicode_ranges_punctuation) {
-        for (auto i = p.first; i <= p.second; ++ i) {
+        for (auto i = p.first; i <= p.second; ++i) {
            cpt_types[i] = CODEPOINT_TYPE_PUNCTUATION;
        }
    }
@ -141,7 +142,7 @@ static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
        }
    }
    for (auto p : unicode_ranges_control) {
-        for (auto i = p.first; i <= p.second; ++ i) {
+        for (auto i = p.first; i <= p.second; ++i) {
            cpt_types[i] = CODEPOINT_TYPE_CONTROL;
        }
    }
@ -224,138 +225,256 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
    // Hand-written splitter for the GPT-2 pattern; every branch in the main loop is labelled with
    // the regex alternative it implements. `offsets` holds consecutive chunk lengths measured in
    // codepoints of `text`; the returned vector holds, in order, the lengths (also in codepoints)
    // of the pieces each chunk is cut into.
    std::vector<size_t> bpe_offsets; // store the offset of each word
    bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
    const auto cpts = unicode_cpts_from_utf8(text);
    size_t start = 0;
    for (auto offset : offsets) {
        // The current chunk covers codepoint indices [offset_ini, offset_end).
        const size_t offset_ini = start;
        const size_t offset_end = start + offset;
        assert(offset_end <= cpts.size());
        start = offset_end;
        // Codepoint at `pos`, or 0 when `pos` lies outside the current chunk.
        auto _get_cpt = [&] (const size_t pos) -> char32_t {
            return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
        };
        // Type of the codepoint at `pos`, or CODEPOINT_TYPE_UNIDENTIFIED outside the current chunk.
        auto _get_cpt_type = [&] (const size_t pos) -> int {
            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
        };
        size_t _prev_end = offset_ini;
        // Records the span [_prev_end, end) as one piece (nothing is pushed when it is empty),
        // moves _prev_end to `end`, and returns the span length.
        auto _add_token = [&] (const size_t end) -> size_t {
            assert(_prev_end <= end && end <= offset_end);
            size_t len = end - _prev_end;
            if (len > 0) {
                bpe_offsets.push_back(len);
            }
            _prev_end = end;
            //if (len > 0) {
            //    std::string s = "";
            //    for(size_t p = end-len; p < end; p++)
            //        s += unicode_cpt_to_utf8(cpts[p]);
            //    printf(">>> '%s'\n", s.c_str());
            //}
            return len;
        };
        // `pos` is advanced inside each branch rather than in the loop header.
        for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
            const char32_t cpt = _get_cpt(pos);
            const int cpt_type = _get_cpt_type(pos);
            // regex: 's|'t|'re|'ve|'m|'ll|'d
            if (cpt == '\'' && pos+1 < offset_end) {
                char32_t cpt_next = _get_cpt(pos+1);
                if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
                    // pos advances by the piece length that _add_token reports
                    pos += _add_token(pos+2);
                    continue;
                }
                if (pos+2 < offset_end) {
                    char32_t cpt_next_next = _get_cpt(pos+2);
                    if ((cpt_next == 'r' && cpt_next_next == 'e') ||
                        (cpt_next == 'v' && cpt_next_next == 'e') ||
                        (cpt_next == 'l' && cpt_next_next == 'l')) {
                        pos += _add_token(pos+3);
                        continue;
                    }
                }
            }
            // When the current codepoint is a space, classify by the codepoint that follows it, so
            // the space can be absorbed as the optional leading ' ' of the next three alternatives.
            char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
            int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
            // regex: <space>?\p{L}+
            if (cpt2_type == CODEPOINT_TYPE_LETTER) {
                pos += (cpt == ' '); // step over the leading space, if any
                while (cpt2_type == CODEPOINT_TYPE_LETTER) {
                    cpt2_type = _get_cpt_type(++pos);
                }
                _add_token(pos);
                continue;
            }
            // regex: <space>?\p{N}+
            if (cpt2_type == CODEPOINT_TYPE_NUMBER) {
                pos += (cpt == ' '); // step over the leading space, if any
                while (cpt2_type == CODEPOINT_TYPE_NUMBER) {
                    cpt2_type = _get_cpt_type(++pos);
                }
                _add_token(pos);
                continue;
            }
            // regex: <space>?[^\s\p{L}\p{N}]+
            // CODEPOINT_TYPE_UNIDENTIFIED is excluded as well; it is what _get_cpt_type yields past
            // the chunk end, so the inner loop also stops there.
            if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
                pos += (cpt == ' ');
                while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
                    cpt2_type = _get_cpt_type(++pos);
                    cpt2 = _get_cpt(pos);
                }
                _add_token(pos);
                continue;
            }
            // Count the whitespace run starting at pos. _get_cpt returns 0 past the chunk end;
            // presumably 0 is not classified as whitespace, which is what ends the run there -- verify.
            size_t num_whitespaces = 0;
            while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
                num_whitespaces++;
            }
            // regex: \s+(?!\S)
            // A run of two or more whitespaces followed by a non-zero codepoint inside the chunk:
            // emit all but the last whitespace and leave that one for the next iteration.
            if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
                pos += num_whitespaces - 1;
                _add_token(pos);
                continue;
            }
            // regex: \s+
            if (num_whitespaces > 0) {
                pos += num_whitespaces;
                _add_token(pos);
                continue;
            }
            // no matches
            // fall back to emitting a single codepoint so the loop always makes progress
            _add_token(++pos);
        }
    }
    return bpe_offsets;
 }
 // LLAMA3 system regex: "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
 static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string & text, const std::vector<size_t> & offsets) {
    std::vector<size_t> bpe_offsets; // store the offset of each word
    bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
    const auto cpts = unicode_cpts_from_utf8(text);
    size_t start = 0;
    for (auto offset : offsets) {
-        std::string token;
+        const size_t offset_ini = start;
        const size_t offset_end = start + offset;
        assert(offset_end <= cpts.size());
        start = offset_end;
-        bool collecting_numeric = false;
+        auto _get_cpt = [&] (const size_t pos) -> char32_t {
-        bool collecting_letter = false;
+            return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
-        bool collecting_special = false;
+        };
        bool collecting_whitespace_lookahead = false;
        bool collecting = false;
-        std::vector<std::string> text_utf;
+        auto _get_cpt_type = [&] (const size_t pos) -> int {
-        text_utf.reserve(offset);
+            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
        };
-        for (size_t i = start; i < start + offset; ++i) {
+        size_t _prev_end = offset_ini;
-            text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
+        auto _add_token = [&] (const size_t end) -> size_t {
            assert(_prev_end <= end && end <= offset_end);
            size_t len = end - _prev_end;
            if (len > 0) {
                bpe_offsets.push_back(len);
            }
            _prev_end = end;
            //if (len > 0) {
            //    std::string s = "";
            //    for(size_t p = end-len; p < end; p++)
            //        s += unicode_cpt_to_utf8(cpts[p]);
            //    printf(">>> '%s'\n", s.c_str());
            //}
            return len;
        };
-        for (int i = 0; i < (int)text_utf.size(); i++) {
+        for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
-            const std::string & utf_char = text_utf[i];
+            const char32_t cpt = _get_cpt(pos);
-            bool split_condition = false;
+            const int cpt_type = _get_cpt_type(pos);
            int bytes_remain = text_utf.size() - i;
-            // forward backward lookups
+            // regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
-            const std::string & utf_char_next      = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
+            if (cpt == '\'' && pos+1 < offset_end) {
-            const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
+                char32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
-
+                if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
-            // handling contractions
+                    pos += _add_token(pos+2);
-            if (!split_condition && bytes_remain >= 2) {
+                    continue;
                // 's|'t|'m|'d
                if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
                    split_condition = true;
                }
-                if (split_condition) {
+                if (pos+2 < offset_end) {
-                    if (token.size()) {
+                    char32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
-                        bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
+                    if ((cpt_next == 'r' && cpt_next_next == 'e') ||
-                    }
+                        (cpt_next == 'v' && cpt_next_next == 'e') ||
-                    token = utf_char + utf_char_next;
+                        (cpt_next == 'l' && cpt_next_next == 'l')) {
-                    bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
+                        pos += _add_token(pos+3);
                    token = "";
                    i++;
                        continue;
                    }
                }
            if (!split_condition && bytes_remain >= 3) {
                // 're|'ve|'ll
                if (utf_char == "\'" && (
                    (utf_char_next == "r" && utf_char_next_next == "e") ||
                    (utf_char_next == "v" && utf_char_next_next == "e") ||
                    (utf_char_next == "l" && utf_char_next_next == "l"))
                    ) {
                    split_condition = true;
            }
                if (split_condition) {
                    // current token + next token can be defined
                    if (token.size()) {
                        bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
                    }
                    token =  utf_char;
                    token += utf_char_next;
                    token += utf_char_next_next;
-                    bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
+            // regex: [^\r\n\p{L}\p{N}]?\p{L}+  //####FIXME: the first \p{L} is correct?
-                    token = "";
+            if (cpt != '\r' && cpt != '\n' && /*cpt_type != CODEPOINT_TYPE_LETTER &&*/ cpt_type != CODEPOINT_TYPE_NUMBER) {
-                    i += 2;
+                if (cpt_type == CODEPOINT_TYPE_LETTER || _get_cpt_type(pos+1) == CODEPOINT_TYPE_LETTER) {  // one or more letters
                    pos++;
                    while (_get_cpt_type(pos) == CODEPOINT_TYPE_LETTER) {
                        pos++;
                    }
                    _add_token(pos);
                    continue;
                }
            }
-            if (!split_condition && !collecting) {
+            // regex: \p{N}{1,3}
-                if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
+            if (cpt_type == CODEPOINT_TYPE_NUMBER) {
-                    collecting_letter = true;
+                size_t ini = pos;
-                    collecting = true;
+                while (_get_cpt_type(pos) == CODEPOINT_TYPE_NUMBER) {
-                }
+                    if (++pos - ini >= 3 ) {
-                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_NUMBER || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_NUMBER)) {
+                        _add_token(pos);
-                    collecting_numeric = true;
+                        ini = pos;
                    collecting = true;
                }
                else if (
                    ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_NUMBER) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
                    (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_NUMBER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
                    ) {
                    collecting_special = true;
                    collecting = true;
                }
                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
                    collecting_whitespace_lookahead = true;
                    collecting = true;
                }
                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
                    split_condition = true;
                    }
                }
-            else if (!split_condition && collecting) {
+                _add_token(pos);
-                if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
+                continue;
                    split_condition = true;
                }
                else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_NUMBER) {
                    split_condition = true;
                }
                else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_NUMBER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
                    split_condition = true;
                }
                else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_NUMBER)) {
                    split_condition = true;
                }
            }
-            if (utf_char_next == "") {
+            // regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
-                split_condition = true; // final
+            char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
-                token += utf_char;
+            int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
            if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
                pos += (cpt == ' ');
                while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
                    cpt2_type = _get_cpt_type(++pos);
                    cpt2 = _get_cpt(pos);
                }
                while (cpt2 == '\r' || cpt2 == '\n') {
                    cpt2 = _get_cpt(++pos);
                }
                _add_token(pos);
                continue;
            }
-            if (split_condition) {
+            size_t num_whitespaces = 0;
-                if (token.size()) {
+            size_t last_end_r_or_n = 0;
-                    bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
+            while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
-                }
+                char32_t cpt2 = _get_cpt(pos+num_whitespaces);
-                token = utf_char;
+                if (cpt2 == '\r' || cpt2 == '\n') {
-                collecting = false;
+                    last_end_r_or_n = pos + num_whitespaces + 1;
                collecting_letter = false;
                collecting_numeric = false;
                collecting_special = false;
                collecting_whitespace_lookahead = false;
            }
            else {
                token += utf_char;
                }
                num_whitespaces++;
            }
-        start += offset;
+            // regex: \s*[\r\n]+
            if (last_end_r_or_n > 0) {
                pos = last_end_r_or_n;
                _add_token(pos);
                continue;
            }
            // regex: \s+(?!\S)
            if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
                pos += num_whitespaces - 1;
                _add_token(pos);
                continue;
            }
            // regex: \s+
            if (num_whitespaces > 0) {
                pos += num_whitespaces;
                _add_token(pos);
                continue;
            }
            // no matches
            _add_token(++pos);
        }
    }
    return bpe_offsets;
@ -424,14 +543,14 @@ static std::vector<size_t> unicode_regex_split_stl(const std::string & text, con
 // Chooses a hand-written splitter by comparing regex_expr, as an exact string, against the
 // patterns spelled out in the body; the result stays empty when regex_expr matches none of them.
 static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
    std::vector<size_t> bpe_offsets;
-    (void)(text);
+    if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
-    (void)(regex_expr);
+        bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
-    (void)(offsets);
+    } else if (
-    // TODO: this implementation is actually wrong, uncomment and run:
+            regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" ||
-    //       make -j && ./bin/test-tokenizer-0 ../models/ggml-vocab-gpt-2.gguf
+            regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
-    //if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
+
-    //    bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
+        bpe_offsets = unicode_regex_split_custom_llama3(text, offsets);
-    //}
+    }
    return bpe_offsets;
 }
@ -506,6 +625,19 @@ int unicode_cpt_type(const std::string & utf8) {
    return unicode_cpt_type(unicode_cpt_from_utf8(utf8, offset));
 }
 // Reports whether codepoint `cp` falls inside any of the inclusive ranges listed in
 // unicode_ranges_whitespace. The lookup set is built once, on the first call, and reused.
 bool unicode_cpt_is_whitespace(uint32_t cp) {
    static const std::unordered_set<uint32_t> whitespace_cpts = [] {
        std::unordered_set<uint32_t> expanded;
        for (const auto range : unicode_ranges_whitespace) {
            // expand the inclusive [first, second] range into individual codepoints
            auto cur = range.first;
            while (cur <= range.second) {
                expanded.insert(cur);
                ++cur;
            }
        }
        return expanded;
    }();
    return whitespace_cpts.find(cp) != whitespace_cpts.end();
 }
 std::string unicode_byte_to_utf8(uint8_t byte) {
    static std::unordered_map<uint8_t, std::string> map = unicode_byte_to_utf8_map();
    return map.at(byte);
--- a/unicode.h
+++ b/unicode.h
@ -7,7 +7,7 @@
 #define CODEPOINT_TYPE_UNIDENTIFIED 0
 #define CODEPOINT_TYPE_NUMBER       1
 #define CODEPOINT_TYPE_LETTER       2
-#define CODEPOINT_TYPE_WHITESPACE   3
+#define CODEPOINT_TYPE_SEPARATOR    3
 #define CODEPOINT_TYPE_ACCENT_MARK  4
 #define CODEPOINT_TYPE_PUNCTUATION  5
 #define CODEPOINT_TYPE_SYMBOL       6
@ -21,6 +21,8 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
 int unicode_cpt_type(uint32_t cp);
 int unicode_cpt_type(const std::string & utf8);
 bool unicode_cpt_is_whitespace(uint32_t cp);
 std::string unicode_byte_to_utf8(uint8_t byte);
 uint8_t unicode_utf8_to_byte(const std::string & utf8);