mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 09:34:37 +00:00
Merge commit '8c570c9496
' into concedo_experimental
# Conflicts: # README.md # tests/test-backend-ops.cpp
This commit is contained in:
commit
db82bad6f2
18 changed files with 45581 additions and 35766 deletions
|
@ -49,6 +49,10 @@ chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶
|
||||||
|
|
||||||
if len(sys.argv) == 2:
|
if len(sys.argv) == 2:
|
||||||
token = sys.argv[1]
|
token = sys.argv[1]
|
||||||
|
if not token.startswith("hf_"):
|
||||||
|
logger.info("Huggingface token seems invalid")
|
||||||
|
logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
|
||||||
|
sys.exit(1)
|
||||||
else:
|
else:
|
||||||
logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
|
logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
@ -257,6 +261,7 @@ tests = [
|
||||||
"3333333",
|
"3333333",
|
||||||
"33333333",
|
"33333333",
|
||||||
"333333333",
|
"333333333",
|
||||||
|
# "Cửa Việt", # llama-bpe fails on this
|
||||||
chktxt,
|
chktxt,
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default.
|
This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default.
|
||||||
|
|
||||||
To convert the model first download the models from the [llma2.c](https://github.com/karpathy/llama2.c) repository:
|
To convert the model first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository:
|
||||||
|
|
||||||
`$ make -j`
|
`$ make -j`
|
||||||
|
|
||||||
|
|
|
@ -52,15 +52,15 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
|
||||||
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
|
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
|
||||||
float v;
|
float v;
|
||||||
if (type == GGML_TYPE_F16) {
|
if (type == GGML_TYPE_F16) {
|
||||||
v = ggml_fp16_to_fp32(*(ggml_fp16_t *) data + i);
|
v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
|
||||||
} else if (type == GGML_TYPE_F32) {
|
} else if (type == GGML_TYPE_F32) {
|
||||||
v = *(float *) data + i;
|
v = *(float *) &data[i];
|
||||||
} else if (type == GGML_TYPE_I32) {
|
} else if (type == GGML_TYPE_I32) {
|
||||||
v = (float) *(int32_t *) data + i;
|
v = (float) *(int32_t *) &data[i];
|
||||||
} else if (type == GGML_TYPE_I16) {
|
} else if (type == GGML_TYPE_I16) {
|
||||||
v = (float) *(int16_t *) data + i;
|
v = (float) *(int16_t *) &data[i];
|
||||||
} else if (type == GGML_TYPE_I8) {
|
} else if (type == GGML_TYPE_I8) {
|
||||||
v = (float) *(int8_t *) data + i;
|
v = (float) *(int8_t *) &data[i];
|
||||||
} else {
|
} else {
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
}
|
}
|
||||||
|
|
|
@ -234,122 +234,6 @@ typedef float dfloat; // dequantize float
|
||||||
typedef float2 dfloat2;
|
typedef float2 dfloat2;
|
||||||
#endif //GGML_CUDA_F16
|
#endif //GGML_CUDA_F16
|
||||||
|
|
||||||
[[noreturn]]
|
|
||||||
static __device__ void no_device_code(
|
|
||||||
const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
|
|
||||||
|
|
||||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
|
||||||
printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
|
|
||||||
file_name, line, function_name, arch);
|
|
||||||
GGML_UNUSED(arch_list);
|
|
||||||
#else
|
|
||||||
printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
|
|
||||||
file_name, line, function_name, arch, arch_list);
|
|
||||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
|
||||||
__trap();
|
|
||||||
|
|
||||||
GGML_UNUSED(no_device_code); // suppress unused function warning
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef __CUDA_ARCH__
|
|
||||||
#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
|
|
||||||
#else
|
|
||||||
#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
|
|
||||||
#endif // __CUDA_ARCH__
|
|
||||||
|
|
||||||
static __device__ __forceinline__ float warp_reduce_sum(float x) {
|
|
||||||
#pragma unroll
|
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
|
||||||
x += __shfl_xor_sync(0xffffffff, x, mask, 32);
|
|
||||||
}
|
|
||||||
return x;
|
|
||||||
}
|
|
||||||
|
|
||||||
static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
|
|
||||||
#pragma unroll
|
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
|
||||||
a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
|
|
||||||
a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
|
|
||||||
}
|
|
||||||
return a;
|
|
||||||
}
|
|
||||||
|
|
||||||
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
|
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
|
||||||
#pragma unroll
|
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
|
||||||
a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
|
|
||||||
}
|
|
||||||
return a;
|
|
||||||
#else
|
|
||||||
GGML_UNUSED(a);
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
|
||||||
}
|
|
||||||
|
|
||||||
static __device__ __forceinline__ float warp_reduce_max(float x) {
|
|
||||||
#pragma unroll
|
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
|
||||||
x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
|
|
||||||
}
|
|
||||||
return x;
|
|
||||||
}
|
|
||||||
|
|
||||||
static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
|
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
|
||||||
|
|
||||||
#if CUDART_VERSION >= CUDART_HMAX
|
|
||||||
return __hmax(a, b);
|
|
||||||
#else
|
|
||||||
return __half2float(a) > __half2float(b) ? a : b;
|
|
||||||
#endif // CUDART_VERSION >= CUDART_HMAX
|
|
||||||
|
|
||||||
#else
|
|
||||||
GGML_UNUSED(a);
|
|
||||||
GGML_UNUSED(b);
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
|
|
||||||
}
|
|
||||||
static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
|
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
|
||||||
|
|
||||||
#if CUDART_VERSION >= CUDART_HMAX
|
|
||||||
return __hmax2(a, b);
|
|
||||||
#else
|
|
||||||
half2 ret;
|
|
||||||
reinterpret_cast<half&>(ret.x) = __low2float(a) > __low2float(b) ? __low2half(a) : __low2half(b);
|
|
||||||
reinterpret_cast<half&>(ret.y) = __high2float(a) > __high2float(b) ? __high2half(a) : __high2half(b);
|
|
||||||
return ret;
|
|
||||||
#endif // CUDART_VERSION >= CUDART_HMAX
|
|
||||||
|
|
||||||
#else
|
|
||||||
GGML_UNUSED(a);
|
|
||||||
GGML_UNUSED(b);
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
|
|
||||||
}
|
|
||||||
|
|
||||||
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
|
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
|
||||||
#pragma unroll
|
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
|
||||||
x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
|
|
||||||
}
|
|
||||||
return x;
|
|
||||||
#else
|
|
||||||
GGML_UNUSED(x);
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
|
||||||
}
|
|
||||||
|
|
||||||
#if CUDART_VERSION < CUDART_HMASK
|
|
||||||
static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
|
|
||||||
const uint32_t mask_low = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b)));
|
|
||||||
const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
|
|
||||||
return mask_low | mask_high;
|
|
||||||
}
|
|
||||||
#endif // CUDART_VERSION < 12000
|
|
||||||
|
|
||||||
#if defined(GGML_USE_HIPBLAS)
|
#if defined(GGML_USE_HIPBLAS)
|
||||||
#define __CUDA_ARCH__ 1300
|
#define __CUDA_ARCH__ 1300
|
||||||
|
|
||||||
|
@ -433,11 +317,143 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
|
||||||
}
|
}
|
||||||
#endif // defined(GGML_USE_HIPBLAS)
|
#endif // defined(GGML_USE_HIPBLAS)
|
||||||
|
|
||||||
#define FP16_AVAILABLE defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) ? \
|
#define FP16_AVAILABLE (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
|
||||||
defined(RDNA1) || defined(RDNA2) || defined(RDNA3) : __CUDA_ARCH__ >= CC_PASCAL
|
|
||||||
|
|
||||||
#define FP16_MMA_AVAILABLE !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
|
#define FP16_MMA_AVAILABLE !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
|
||||||
|
|
||||||
|
// Returns true when the compute capability `cc` is at least CC_VOLTA and
// lies below CC_OFFSET_AMD; false otherwise.
static bool fp16_mma_available(const int cc) {
    const bool below_amd_offset = cc < CC_OFFSET_AMD;
    const bool volta_or_newer   = cc >= CC_VOLTA;
    return below_amd_offset && volta_or_newer;
}
|
||||||
|
|
||||||
|
// Prints a diagnostic naming the call site (file, line, function) and the
// architecture value passed in, then aborts the kernel with __trap().
// On the HIP/AMD build the arch_list argument is not printed.
[[noreturn]]
static __device__ void no_device_code(
        const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {

#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
           file_name, line, function_name, arch, arch_list);
#else
    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
           file_name, line, function_name, arch);
    GGML_UNUSED(arch_list);
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))

    __trap();

    // Self-reference so that the compiler does not warn about an unused function.
    GGML_UNUSED(no_device_code);
}
|
||||||
|
|
||||||
|
// NO_DEVICE_CODE: when __CUDA_ARCH__ is defined it calls no_device_code() with the
// current file, line, function, __CUDA_ARCH__ and the stringized __CUDA_ARCH_LIST__;
// when __CUDA_ARCH__ is not defined it expands to nothing.
#ifndef __CUDA_ARCH__
#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
#else
#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
#endif // __CUDA_ARCH__
|
||||||
|
|
||||||
|
// Adds up `x` across lanes with XOR shuffles at lane offsets 16, 8, 4, 2, 1
// (full 0xffffffff participation mask, width 32) and returns the accumulated value.
static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll
    for (int offset = 16; offset >= 1; offset /= 2) {
        const float other = __shfl_xor_sync(0xffffffff, x, offset, 32);
        x += other;
    }
    return x;
}
|
||||||
|
|
||||||
|
// float2 overload: the .x and .y components are accumulated independently across
// lanes with XOR shuffles at lane offsets 16, 8, 4, 2, 1 (mask 0xffffffff, width 32).
static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
#pragma unroll
    for (int offset = 16; offset >= 1; offset /= 2) {
        const float other_x = __shfl_xor_sync(0xffffffff, a.x, offset, 32);
        a.x += other_x;
        const float other_y = __shfl_xor_sync(0xffffffff, a.y, offset, 32);
        a.y += other_y;
    }
    return a;
}
|
||||||
|
|
||||||
|
// half2 overload of the XOR-shuffle sum (lane offsets 16, 8, 4, 2, 1; mask 0xffffffff, width 32).
// With FP16_AVAILABLE the two halves are summed either component-wise (HIP/AMD build)
// or with __hadd2 (all other builds). Without FP16_AVAILABLE the function expands
// NO_DEVICE_CODE and hands back `a` unchanged.
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
#if FP16_AVAILABLE

#pragma unroll
    for (int offset = 16; offset >= 1; offset /= 2) {
        const half2 other = __shfl_xor_sync(0xffffffff, a, offset, 32);
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
        // Add the low and high halves one at a time.
        reinterpret_cast<half&>(a.x) +=  __low2half(other);
        reinterpret_cast<half&>(a.y) += __high2half(other);
#else
        a = __hadd2(a, other);
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    }

#else
    NO_DEVICE_CODE;
#endif // FP16_AVAILABLE
    return a;
}
|
||||||
|
|
||||||
|
// Takes the fmaxf of `x` across lanes with XOR shuffles at lane offsets
// 16, 8, 4, 2, 1 (mask 0xffffffff, width 32) and returns the result.
static __device__ __forceinline__ float warp_reduce_max(float x) {
#pragma unroll
    for (int offset = 16; offset >= 1; offset /= 2) {
        const float other = __shfl_xor_sync(0xffffffff, x, offset, 32);
        x = fmaxf(x, other);
    }
    return x;
}
|
||||||
|
|
||||||
|
// Maximum of two half values.
// With FP16_AVAILABLE: uses __hmax on the HIP/AMD build or when CUDART_VERSION >= CUDART_HMAX;
// otherwise converts both operands to float, applies fmaxf and converts back to half.
// Without FP16_AVAILABLE: expands NO_DEVICE_CODE and returns `a`.
static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
#if FP16_AVAILABLE

#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || CUDART_VERSION >= CUDART_HMAX
    return __hmax(a, b);
#else
    const float a_f = __half2float(a);
    const float b_f = __half2float(b);
    return __float2half(fmaxf(a_f, b_f));
#endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || CUDART_VERSION >= CUDART_HMAX

#else
    NO_DEVICE_CODE;
    GGML_UNUSED(b);
    return a;
#endif // FP16_AVAILABLE
}
|
||||||
|
|
||||||
|
// Component-wise maximum of two half2 values.
// HIP/AMD build: no implementation, expands NO_DEVICE_CODE.
// Other builds: __hmax2 when CUDART_VERSION >= CUDART_HMAX, otherwise each half is
// converted to float, combined with fmaxf and converted back.
static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    GGML_UNUSED(a);
    GGML_UNUSED(b);
    NO_DEVICE_CODE;
#else

#if CUDART_VERSION >= CUDART_HMAX
    return __hmax2(a, b);
#else
    const half lo = __float2half(fmaxf( __low2float(a),  __low2float(b)));
    const half hi = __float2half(fmaxf(__high2float(a), __high2float(b)));

    half2 result;
    reinterpret_cast<half&>(result.x) = lo;
    reinterpret_cast<half&>(result.y) = hi;
    return result;
#endif // CUDART_VERSION >= CUDART_HMAX

#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
}
|
||||||
|
|
||||||
|
// half2 overload: combines `x` across lanes with ggml_cuda_hmax2 using XOR shuffles at
// lane offsets 16, 8, 4, 2, 1 (mask 0xffffffff, width 32).
// Compiled only for non-HIP/AMD builds with __CUDA_ARCH__ >= CC_PASCAL; every other
// configuration expands NO_DEVICE_CODE instead.
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_PASCAL
    GGML_UNUSED(x);
    NO_DEVICE_CODE;
#else
#pragma unroll
    for (int offset = 16; offset >= 1; offset /= 2) {
        const half2 other = __shfl_xor_sync(0xffffffff, x, offset, 32);
        x = ggml_cuda_hmax2(x, other);
    }
    return x;
#endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_PASCAL
}
|
||||||
|
|
||||||
|
#if CUDART_VERSION < CUDART_HMASK
// Defined only when CUDART_VERSION < CUDART_HMASK.
// Builds a 32-bit mask from a per-half "greater than" comparison done in float:
// bits 0-15 are all ones when low(a) > low(b), bits 16-31 are all ones when
// high(a) > high(b); the remaining bits are zero.
static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
    const bool low_gt  = float( __low2half(a)) > float( __low2half(b));
    const bool high_gt = float(__high2half(a)) > float(__high2half(b));

    const uint32_t low_bits  = low_gt  ? 0x0000FFFFu : 0u;
    const uint32_t high_bits = high_gt ? 0xFFFF0000u : 0u;
    return low_bits | high_bits;
}
#endif // CUDART_VERSION < CUDART_HMASK
|
||||||
|
|
||||||
// TODO: move to ggml-common.h
|
// TODO: move to ggml-common.h
|
||||||
static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
||||||
|
|
||||||
|
|
|
@ -11,8 +11,10 @@
|
||||||
#define HALF_MAX_HALF __float2half(65504.0f/2) // Use neg. of this instead of -INFINITY to initialize KQ max vals to avoid NaN upon subtraction.
|
#define HALF_MAX_HALF __float2half(65504.0f/2) // Use neg. of this instead of -INFINITY to initialize KQ max vals to avoid NaN upon subtraction.
|
||||||
#define SOFTMAX_FTZ_THRESHOLD -20.0f // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs.
|
#define SOFTMAX_FTZ_THRESHOLD -20.0f // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs.
|
||||||
|
|
||||||
template<int D, int parallel_blocks> // D == head size
|
template<int D, int ncols, int parallel_blocks> // D == head size
|
||||||
__launch_bounds__(((D + WARP_SIZE - 1) / WARP_SIZE)*WARP_SIZE, 1)
|
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||||
|
__launch_bounds__(D, 1)
|
||||||
|
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||||
static __global__ void flash_attn_vec_ext_f16(
|
static __global__ void flash_attn_vec_ext_f16(
|
||||||
const char * __restrict__ Q,
|
const char * __restrict__ Q,
|
||||||
const char * __restrict__ K,
|
const char * __restrict__ K,
|
||||||
|
@ -44,55 +46,77 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||||
#if FP16_AVAILABLE
|
#if FP16_AVAILABLE
|
||||||
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||||
|
|
||||||
const int ic = blockIdx.x / parallel_blocks; // Index of the Q/QKV column to work on.
|
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
|
||||||
const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
|
const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
|
||||||
|
|
||||||
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
|
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
|
||||||
const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic);
|
const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0);
|
||||||
const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio));
|
const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio));
|
||||||
const half * V_h = (const half *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
|
const half * V_h = (const half *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
|
||||||
const half * maskh = (const half *) mask + ne11*ic;
|
const half * maskh = (const half *) mask + ne11*ic0;
|
||||||
|
|
||||||
const int stride_KV = nb11 / sizeof(half);
|
const int stride_KV = nb11 / sizeof(half);
|
||||||
const int stride_KV2 = nb11 / sizeof(half2);
|
const int stride_KV2 = nb11 / sizeof(half2);
|
||||||
|
|
||||||
constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
|
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
|
||||||
|
constexpr int nwarps = D / WARP_SIZE;
|
||||||
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
|
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
|
||||||
__builtin_assume(tid < nwarps*WARP_SIZE);
|
__builtin_assume(tid < D);
|
||||||
|
|
||||||
__shared__ half KQ[nwarps*WARP_SIZE];
|
__shared__ half KQ[ncols*D];
|
||||||
KQ[tid] = -INFINITY;
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
KQ[j*D + tid] = -HALF_MAX_HALF;
|
||||||
|
}
|
||||||
half2 * KQ2 = (half2 *) KQ;
|
half2 * KQ2 = (half2 *) KQ;
|
||||||
|
|
||||||
half kqmax = -HALF_MAX_HALF;
|
half kqmax[ncols];
|
||||||
half kqsum = 0.0f;
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
kqmax[j] = -HALF_MAX_HALF;
|
||||||
|
}
|
||||||
|
half kqsum[ncols] = {0.0f};
|
||||||
|
|
||||||
__shared__ half kqmax_shared[WARP_SIZE];
|
__shared__ half kqmax_shared[ncols][WARP_SIZE];
|
||||||
__shared__ half kqsum_shared[WARP_SIZE];
|
__shared__ half kqsum_shared[ncols][WARP_SIZE];
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
if (threadIdx.y == 0) {
|
if (threadIdx.y == 0) {
|
||||||
kqmax_shared[threadIdx.x] = -HALF_MAX_HALF;
|
kqmax_shared[j][threadIdx.x] = -HALF_MAX_HALF;
|
||||||
kqsum_shared[threadIdx.x] = 0.0f;
|
kqsum_shared[j][threadIdx.x] = 0.0f;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
// Convert Q to half2 and store in registers:
|
// Convert Q to half2 and store in registers:
|
||||||
half2 Q_h2[(D/2 + WARP_SIZE - 1) / WARP_SIZE];
|
half2 Q_h2[ncols][D/(2*WARP_SIZE)];
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||||
const int i = i0 + threadIdx.x;
|
const int i = i0 + threadIdx.x;
|
||||||
if (i0 + WARP_SIZE > D/2 && i >= D/2) {
|
|
||||||
break;
|
const float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i];
|
||||||
|
Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Q_h2[i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(Q_f2[i].x, Q_f2[i].y);
|
half2 VKQ[ncols] = {{0.0f, 0.0f}};
|
||||||
}
|
|
||||||
|
|
||||||
half2 VKQ = make_half2(0.0f, 0.0f); // Each thread calculates a single VKQ value.
|
|
||||||
|
|
||||||
const int k_start = parallel_blocks == 1 ? 0 : ip*D;
|
const int k_start = parallel_blocks == 1 ? 0 : ip*D;
|
||||||
for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
|
for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
|
||||||
// Calculate KQ tile and keep track of new maximum KQ values:
|
// Calculate KQ tile and keep track of new maximum KQ values:
|
||||||
half kqmax_new = kqmax;
|
|
||||||
|
// For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
|
||||||
|
// see https://github.com/ggerganov/llama.cpp/pull/7061 .
|
||||||
|
// Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
|
||||||
|
half kqmax_new = kqmax[0];
|
||||||
|
half kqmax_new_arr[ncols];
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
kqmax_new_arr[j] = kqmax[j];
|
||||||
|
}
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
|
for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
|
||||||
const int i_KQ = i_KQ_0 + threadIdx.y;
|
const int i_KQ = i_KQ_0 + threadIdx.y;
|
||||||
|
@ -101,47 +125,65 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
half2 sum2 = make_half2(0.0f, 0.0f);
|
half2 sum2[ncols] = {{0.0f, 0.0f}};
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
|
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
|
||||||
const int k_KQ = k_KQ_0 + threadIdx.x;
|
const int k_KQ = k_KQ_0 + threadIdx.x;
|
||||||
if (k_KQ_0 + WARP_SIZE > D/2 && k_KQ >= D/2) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
|
const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
|
||||||
sum2 += K_ik * Q_h2[k_KQ_0/WARP_SIZE];
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
sum2[j] += K_ik * Q_h2[j][k_KQ_0/WARP_SIZE];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
sum2 = warp_reduce_sum(sum2);
|
#pragma unroll
|
||||||
half sum = __low2half(sum2) + __high2half(sum2);
|
for (int j = 0; j < ncols; ++j) {
|
||||||
sum += mask ? maskh[k_VKQ_0 + i_KQ] : __float2half(0.0f);
|
sum2[j] = warp_reduce_sum(sum2[j]);
|
||||||
|
half sum = __low2half(sum2[j]) + __high2half(sum2[j]);
|
||||||
|
sum += mask ? maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
|
||||||
|
|
||||||
|
if (ncols == 1) {
|
||||||
kqmax_new = ggml_cuda_hmax(kqmax_new, sum);
|
kqmax_new = ggml_cuda_hmax(kqmax_new, sum);
|
||||||
|
} else {
|
||||||
|
kqmax_new_arr[j] = ggml_cuda_hmax(kqmax_new_arr[j], sum);
|
||||||
|
}
|
||||||
|
|
||||||
if (threadIdx.x == 0) {
|
if (threadIdx.x == 0) {
|
||||||
KQ[i_KQ] = sum;
|
KQ[j*D + i_KQ] = sum;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
kqmax_new = warp_reduce_max(kqmax_new);
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j];
|
||||||
|
|
||||||
|
kqmax_new_j = warp_reduce_max(kqmax_new_j);
|
||||||
if (threadIdx.x == 0) {
|
if (threadIdx.x == 0) {
|
||||||
kqmax_shared[threadIdx.y] = kqmax_new;
|
kqmax_shared[j][threadIdx.y] = kqmax_new_j;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
half kqmax_new_j = kqmax_shared[j][threadIdx.x];
|
||||||
|
kqmax_new_j = warp_reduce_max(kqmax_new_j);
|
||||||
|
|
||||||
|
const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j);
|
||||||
|
kqmax[j] = kqmax_new_j;
|
||||||
|
|
||||||
|
const half val = hexp(KQ[j*D + tid] - kqmax[j]);
|
||||||
|
kqsum[j] = kqsum[j]*KQ_max_scale + val;
|
||||||
|
KQ[j*D + tid] = val;
|
||||||
|
|
||||||
|
VKQ[j] *= __half2half2(KQ_max_scale);
|
||||||
}
|
}
|
||||||
__syncthreads();
|
|
||||||
kqmax_new = kqmax_shared[threadIdx.x];
|
|
||||||
kqmax_new = warp_reduce_max(kqmax_new);
|
|
||||||
|
|
||||||
const half KQ_max_scale = hexp(kqmax - kqmax_new);
|
|
||||||
kqmax = kqmax_new;
|
|
||||||
|
|
||||||
const half val = hexp(KQ[tid] - kqmax);
|
|
||||||
kqsum = kqsum*KQ_max_scale + val;
|
|
||||||
KQ[tid] = val;
|
|
||||||
|
|
||||||
VKQ *= __half2half2(KQ_max_scale);
|
|
||||||
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
if (tid < D) {
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int k0 = 0; k0 < D; k0 += 2) {
|
for (int k0 = 0; k0 < D; k0 += 2) {
|
||||||
if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
|
if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
|
||||||
|
@ -151,39 +193,44 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||||
half2 V_k;
|
half2 V_k;
|
||||||
reinterpret_cast<half&>(V_k.x) = V_h[(k_VKQ_0 + k0 + 0)*stride_KV + tid];
|
reinterpret_cast<half&>(V_k.x) = V_h[(k_VKQ_0 + k0 + 0)*stride_KV + tid];
|
||||||
reinterpret_cast<half&>(V_k.y) = V_h[(k_VKQ_0 + k0 + 1)*stride_KV + tid];
|
reinterpret_cast<half&>(V_k.y) = V_h[(k_VKQ_0 + k0 + 1)*stride_KV + tid];
|
||||||
VKQ += V_k*KQ2[k0/2];
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
VKQ[j] += V_k*KQ2[j*(D/2) + k0/2];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (tid >= D) {
|
#pragma unroll
|
||||||
kqsum = 0.0f;
|
for (int j = 0; j < ncols; ++j) {
|
||||||
}
|
kqsum[j] = warp_reduce_sum(kqsum[j]);
|
||||||
|
|
||||||
kqsum = warp_reduce_sum(kqsum);
|
|
||||||
if (threadIdx.x == 0) {
|
if (threadIdx.x == 0) {
|
||||||
kqsum_shared[threadIdx.y] = kqsum;
|
kqsum_shared[j][threadIdx.y] = kqsum[j];
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
kqsum = kqsum_shared[threadIdx.x];
|
|
||||||
kqsum = warp_reduce_sum(kqsum);
|
|
||||||
|
|
||||||
if (tid >= D) {
|
#pragma unroll
|
||||||
return;
|
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
|
||||||
}
|
kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
|
||||||
|
kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
|
||||||
|
|
||||||
half dst_val = (__low2half(VKQ) + __high2half(VKQ));
|
half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
|
||||||
if (parallel_blocks == 1) {
|
if (parallel_blocks == 1) {
|
||||||
dst_val /= kqsum;
|
dst_val /= kqsum[j_VKQ];
|
||||||
|
}
|
||||||
|
const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
|
||||||
|
dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
|
||||||
}
|
}
|
||||||
dst[D*gridDim.y*blockIdx.x + D*blockIdx.y + tid] = dst_val;
|
|
||||||
|
|
||||||
if (parallel_blocks == 1 || tid != 0) {
|
if (parallel_blocks != 1 && tid != 0) {
|
||||||
return;
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
dst_meta[(ic0 + j)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j], kqsum[j]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
dst_meta[ic*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax, kqsum);
|
|
||||||
#else
|
#else
|
||||||
NO_DEVICE_CODE;
|
NO_DEVICE_CODE;
|
||||||
#endif // FP16_AVAILABLE
|
#endif // FP16_AVAILABLE
|
||||||
|
@ -191,7 +238,9 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||||
|
|
||||||
// D == head size, VKQ_stride == num VKQ rows calculated in parallel:
|
// D == head size, VKQ_stride == num VKQ rows calculated in parallel:
|
||||||
template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t>
|
template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t>
|
||||||
|
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||||
__launch_bounds__(nwarps*WARP_SIZE, 1)
|
__launch_bounds__(nwarps*WARP_SIZE, 1)
|
||||||
|
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||||
static __global__ void flash_attn_ext_f16(
|
static __global__ void flash_attn_ext_f16(
|
||||||
const char * __restrict__ Q,
|
const char * __restrict__ Q,
|
||||||
const char * __restrict__ K,
|
const char * __restrict__ K,
|
||||||
|
@ -573,7 +622,9 @@ static __global__ void flash_attn_ext_f16(
|
||||||
}
|
}
|
||||||
|
|
||||||
template<int D, int parallel_blocks> // D == head size
|
template<int D, int parallel_blocks> // D == head size
|
||||||
|
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||||
__launch_bounds__(D, 1)
|
__launch_bounds__(D, 1)
|
||||||
|
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||||
static __global__ void flash_attn_combine_results(
|
static __global__ void flash_attn_combine_results(
|
||||||
const float * __restrict__ VKQ_parts,
|
const float * __restrict__ VKQ_parts,
|
||||||
const float2 * __restrict__ VKQ_meta,
|
const float2 * __restrict__ VKQ_meta,
|
||||||
|
@ -642,7 +693,7 @@ static_assert(get_VKQ_stride( 80, 1, 16) == 16, "Test failed.");
|
||||||
static_assert(get_VKQ_stride( 80, 2, 16) == 16, "Test failed.");
|
static_assert(get_VKQ_stride( 80, 2, 16) == 16, "Test failed.");
|
||||||
static_assert(get_VKQ_stride( 80, 4, 16) == 16, "Test failed.");
|
static_assert(get_VKQ_stride( 80, 4, 16) == 16, "Test failed.");
|
||||||
|
|
||||||
template <int D, int parallel_blocks> void launch_fattn_vec_f16(
|
template <int D, int cols_per_block, int parallel_blocks> void launch_fattn_vec_f16(
|
||||||
const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
|
const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
|
||||||
ggml_cuda_pool & pool, cudaStream_t main_stream
|
ggml_cuda_pool & pool, cudaStream_t main_stream
|
||||||
) {
|
) {
|
||||||
|
@ -656,13 +707,13 @@ template <int D, int parallel_blocks> void launch_fattn_vec_f16(
|
||||||
|
|
||||||
constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
|
constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
|
||||||
const dim3 block_dim(WARP_SIZE, nwarps, 1);
|
const dim3 block_dim(WARP_SIZE, nwarps, 1);
|
||||||
const dim3 blocks_num(parallel_blocks*Q->ne[1], Q->ne[2], Q->ne[3]);
|
const dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]);
|
||||||
const int shmem = 0;
|
const int shmem = 0;
|
||||||
|
|
||||||
float scale;
|
float scale;
|
||||||
memcpy(&scale, KQV->op_params, sizeof(float));
|
memcpy(&scale, KQV->op_params, sizeof(float));
|
||||||
|
|
||||||
flash_attn_vec_ext_f16<D, parallel_blocks>
|
flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>
|
||||||
<<<blocks_num, block_dim, shmem, main_stream>>> (
|
<<<blocks_num, block_dim, shmem, main_stream>>> (
|
||||||
(const char *) Q->data,
|
(const char *) Q->data,
|
||||||
(const char *) K->data,
|
(const char *) K->data,
|
||||||
|
@ -783,10 +834,99 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||||
|
|
||||||
ggml_cuda_set_device(ctx.device);
|
ggml_cuda_set_device(ctx.device);
|
||||||
|
|
||||||
|
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||||
const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
|
const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
|
||||||
|
|
||||||
const int32_t precision = KQV->op_params[1];
|
const int32_t precision = KQV->op_params[1];
|
||||||
|
|
||||||
|
if (!fp16_mma_available(cc)) {
|
||||||
|
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
|
||||||
|
GGML_ASSERT(Q->ne[0] == 64 || Q->ne[0] == 128 && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
|
||||||
|
|
||||||
|
if (Q->ne[1] == 1) {
|
||||||
|
constexpr int cols_per_block = 1;
|
||||||
|
constexpr int parallel_blocks = 4;
|
||||||
|
switch (Q->ne[0]) {
|
||||||
|
case 64:
|
||||||
|
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
case 128:
|
||||||
|
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (Q->ne[1] == 2) {
|
||||||
|
constexpr int cols_per_block = 2;
|
||||||
|
constexpr int parallel_blocks = 4;
|
||||||
|
switch (Q->ne[0]) {
|
||||||
|
case 64:
|
||||||
|
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
case 128:
|
||||||
|
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (Q->ne[1] <= 4) {
|
||||||
|
constexpr int cols_per_block = 4;
|
||||||
|
constexpr int parallel_blocks = 4;
|
||||||
|
switch (Q->ne[0]) {
|
||||||
|
case 64:
|
||||||
|
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
case 128:
|
||||||
|
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (Q->ne[1] <= 8) {
|
||||||
|
constexpr int cols_per_block = 8;
|
||||||
|
constexpr int parallel_blocks = 4;
|
||||||
|
switch (Q->ne[0]) {
|
||||||
|
case 64:
|
||||||
|
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
case 128:
|
||||||
|
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr int cols_per_block = 8;
|
||||||
|
constexpr int parallel_blocks = 1;
|
||||||
|
switch (Q->ne[0]) {
|
||||||
|
case 64:
|
||||||
|
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
case 128:
|
||||||
|
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (precision != GGML_PREC_DEFAULT) {
|
if (precision != GGML_PREC_DEFAULT) {
|
||||||
if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
|
if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
|
||||||
constexpr int cols_per_block = 16;
|
constexpr int cols_per_block = 16;
|
||||||
|
@ -845,16 +985,17 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||||
}
|
}
|
||||||
|
|
||||||
if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) {
|
if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) {
|
||||||
|
constexpr int cols_per_block = 1;
|
||||||
constexpr int parallel_blocks = 4;
|
constexpr int parallel_blocks = 4;
|
||||||
switch (Q->ne[0]) {
|
switch (Q->ne[0]) {
|
||||||
case 64:
|
case 64:
|
||||||
launch_fattn_vec_f16< 64, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
break;
|
break;
|
||||||
case 128:
|
case 128:
|
||||||
launch_fattn_vec_f16<128, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
break;
|
break;
|
||||||
case 256:
|
case 256:
|
||||||
launch_fattn_vec_f16<256, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
launch_fattn_vec_f16<256, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
|
|
|
@ -2128,6 +2128,7 @@ static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_
|
||||||
if (alignment == (cl_uint)-1) {
|
if (alignment == (cl_uint)-1) {
|
||||||
ggml_cl_init();
|
ggml_cl_init();
|
||||||
clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL);
|
clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL);
|
||||||
|
alignment /= 8; // bits to bytes
|
||||||
}
|
}
|
||||||
return alignment;
|
return alignment;
|
||||||
|
|
||||||
|
|
|
@ -8330,22 +8330,24 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
|
||||||
const int blocks_per_row = ncols / qk;
|
const int blocks_per_row = ncols / qk;
|
||||||
const int blocks_per_warp = vdr * WARP_SIZE / qi;
|
const int blocks_per_warp = vdr * WARP_SIZE / qi;
|
||||||
|
|
||||||
// partial sum for each thread
|
const int qi_vdr = (qi / vdr); // N_threads processing 1 qk block
|
||||||
|
|
||||||
|
// partial sum for each thread
|
||||||
float tmp = 0.0f;
|
float tmp = 0.0f;
|
||||||
|
|
||||||
const block_q_t * x = (const block_q_t *) vx;
|
const block_q_t * x = (const block_q_t *) vx;
|
||||||
const block_q8_1 * y = (const block_q8_1 *) vy;
|
const block_q8_1 * y = (const block_q8_1 *) vy;
|
||||||
|
|
||||||
for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
|
for (int i = item_ct1.get_local_id(2) / qi_vdr; i < blocks_per_row;
|
||||||
i += blocks_per_warp) {
|
i += blocks_per_warp) {
|
||||||
const int ibx = row*blocks_per_row + i; // x block index
|
const int ibx = row * blocks_per_row + i; // x block index
|
||||||
|
|
||||||
const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
|
const int iby = i * (qk / QK8_1); // y block index that aligns with ibx
|
||||||
|
|
||||||
const int iqs =
|
const int iqs =
|
||||||
vdr *
|
vdr *
|
||||||
(item_ct1.get_local_id(2) %
|
(item_ct1.get_local_id(2) -
|
||||||
(qi / vdr)); // x block quant index when casting the quants to int
|
i * qi_vdr); // x block quant index when casting the quants to int
|
||||||
|
|
||||||
tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
|
tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
|
||||||
}
|
}
|
||||||
|
|
76726
ggml-vulkan-shaders.hpp
76726
ggml-vulkan-shaders.hpp
File diff suppressed because it is too large
Load diff
1159
ggml-vulkan.cpp
1159
ggml-vulkan.cpp
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
92
gguf-py/scripts/gguf-new-metadata.py
Normal file → Executable file
92
gguf-py/scripts/gguf-new-metadata.py
Normal file → Executable file
|
@ -7,7 +7,8 @@ import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from typing import Any, Sequence
|
from tqdm import tqdm
|
||||||
|
from typing import Any, Sequence, NamedTuple
|
||||||
|
|
||||||
# Necessary to load the local gguf package
|
# Necessary to load the local gguf package
|
||||||
if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
|
if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
|
||||||
|
@ -18,6 +19,12 @@ import gguf
|
||||||
logger = logging.getLogger("gguf-new-metadata")
|
logger = logging.getLogger("gguf-new-metadata")
|
||||||
|
|
||||||
|
|
||||||
|
class MetadataDetails(NamedTuple):
|
||||||
|
type: gguf.GGUFValueType
|
||||||
|
value: Any
|
||||||
|
description: str = ''
|
||||||
|
|
||||||
|
|
||||||
def get_byteorder(reader: gguf.GGUFReader) -> gguf.GGUFEndian:
|
def get_byteorder(reader: gguf.GGUFReader) -> gguf.GGUFEndian:
|
||||||
if np.uint32(1) == np.uint32(1).newbyteorder("<"):
|
if np.uint32(1) == np.uint32(1).newbyteorder("<"):
|
||||||
# Host is little endian
|
# Host is little endian
|
||||||
|
@ -59,7 +66,16 @@ def get_field_data(reader: gguf.GGUFReader, key: str) -> Any:
|
||||||
return decode_field(field)
|
return decode_field(field)
|
||||||
|
|
||||||
|
|
||||||
def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new_metadata: dict[str, str], remove_metadata: Sequence[str]) -> None:
|
def find_token(token_list: Sequence[str], token: str) -> Sequence[int]:
    """Return every index of ``token_list`` whose entry equals ``token``.

    Args:
        token_list: the vocabulary entries to search, in ID order (the index
            of an entry is used as its token ID by the caller).
        token: the exact token text to look for.

    Returns:
        A non-empty list of matching indices, in ascending order.

    Raises:
        LookupError: if ``token`` does not occur in ``token_list``.
    """
    token_ids = [index for index, value in enumerate(token_list) if value == token]

    if len(token_ids) == 0:
        raise LookupError(f'Unable to find "{token}" in token list!')

    return token_ids
|
||||||
|
|
||||||
|
|
||||||
|
def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new_metadata: dict[str, MetadataDetails], remove_metadata: Sequence[str]) -> None:
|
||||||
for field in reader.fields.values():
|
for field in reader.fields.values():
|
||||||
# Suppress virtual fields and fields written by GGUFWriter
|
# Suppress virtual fields and fields written by GGUFWriter
|
||||||
if field.name == gguf.Keys.General.ARCHITECTURE or field.name.startswith('GGUF.'):
|
if field.name == gguf.Keys.General.ARCHITECTURE or field.name.startswith('GGUF.'):
|
||||||
|
@ -75,54 +91,64 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new
|
||||||
logger.debug(f'Removing {field.name}')
|
logger.debug(f'Removing {field.name}')
|
||||||
continue
|
continue
|
||||||
|
|
||||||
old_val = decode_field(field)
|
old_val = MetadataDetails(field.types[0], decode_field(field))
|
||||||
val = new_metadata.get(field.name, old_val)
|
val = new_metadata.get(field.name, old_val)
|
||||||
|
|
||||||
if field.name in new_metadata:
|
if field.name in new_metadata:
|
||||||
logger.debug(f'Modifying {field.name}: "{old_val}" -> "{val}"')
|
logger.debug(f'Modifying {field.name}: "{old_val.value}" -> "{val.value}" {val.description}')
|
||||||
del new_metadata[field.name]
|
del new_metadata[field.name]
|
||||||
elif val is not None:
|
elif val.value is not None:
|
||||||
logger.debug(f'Copying {field.name}')
|
logger.debug(f'Copying {field.name}')
|
||||||
|
|
||||||
if val is not None:
|
if val.value is not None:
|
||||||
writer.add_key(field.name)
|
writer.add_key(field.name)
|
||||||
writer.add_val(val, field.types[0])
|
writer.add_val(val.value, val.type)
|
||||||
|
|
||||||
if gguf.Keys.Tokenizer.CHAT_TEMPLATE in new_metadata:
|
if gguf.Keys.Tokenizer.CHAT_TEMPLATE in new_metadata:
|
||||||
logger.debug('Adding chat template(s)')
|
logger.debug('Adding chat template(s)')
|
||||||
writer.add_chat_template(new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE])
|
writer.add_chat_template(new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE].value)
|
||||||
del new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE]
|
del new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE]
|
||||||
|
|
||||||
# TODO: Support other types than string?
|
|
||||||
for key, val in new_metadata.items():
|
for key, val in new_metadata.items():
|
||||||
logger.debug(f'Adding {key}: {val}')
|
logger.debug(f'Adding {key}: "{val.value}" {val.description}')
|
||||||
writer.add_key(key)
|
writer.add_key(key)
|
||||||
writer.add_val(val, gguf.GGUFValueType.STRING)
|
writer.add_val(val.value, val.type)
|
||||||
|
|
||||||
|
total_bytes = 0
|
||||||
|
|
||||||
for tensor in reader.tensors:
|
for tensor in reader.tensors:
|
||||||
|
total_bytes += tensor.n_bytes
|
||||||
# Dimensions are written in reverse order, so flip them first
|
# Dimensions are written in reverse order, so flip them first
|
||||||
shape = np.flipud(tensor.shape).tolist()
|
shape = np.flipud(tensor.shape).tolist()
|
||||||
writer.add_tensor_info(tensor.name, shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
|
writer.add_tensor_info(tensor.name, shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
|
||||||
|
|
||||||
|
bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
|
||||||
|
|
||||||
writer.write_header_to_file()
|
writer.write_header_to_file()
|
||||||
writer.write_kv_data_to_file()
|
writer.write_kv_data_to_file()
|
||||||
writer.write_ti_data_to_file()
|
writer.write_ti_data_to_file()
|
||||||
|
|
||||||
for tensor in reader.tensors:
|
for tensor in reader.tensors:
|
||||||
writer.write_tensor_data(tensor.data)
|
writer.write_tensor_data(tensor.data)
|
||||||
|
bar.update(tensor.n_bytes)
|
||||||
|
|
||||||
writer.close()
|
writer.close()
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
|
tokenizer_metadata = (getattr(gguf.Keys.Tokenizer, n) for n in gguf.Keys.Tokenizer.__dict__.keys() if not n.startswith('_'))
|
||||||
|
token_names = dict((n.split('.')[-1][:-len('_token_id')], n) for n in tokenizer_metadata if n.endswith('_token_id'))
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description="Make a copy of a GGUF file with new metadata")
|
parser = argparse.ArgumentParser(description="Make a copy of a GGUF file with new metadata")
|
||||||
parser.add_argument("input", type=Path, help="GGUF format model input filename")
|
parser.add_argument("input", type=Path, help="GGUF format model input filename")
|
||||||
parser.add_argument("output", type=Path, help="GGUF format model output filename")
|
parser.add_argument("output", type=Path, help="GGUF format model output filename")
|
||||||
parser.add_argument("--general-name", type=str, help="The models general.name")
|
parser.add_argument("--general-name", type=str, help="The models general.name", metavar='"name"')
|
||||||
parser.add_argument("--general-description", type=str, help="The models general.description")
|
parser.add_argument("--general-description", type=str, help="The models general.description", metavar='"Description ..."')
|
||||||
parser.add_argument("--chat-template", type=str, help="Chat template string (or JSON string containing templates)")
|
parser.add_argument("--chat-template", type=str, help="Chat template string (or JSON string containing templates)", metavar='"{% ... %} ..."')
|
||||||
parser.add_argument("--chat-template-config", type=Path, help="Config file (tokenizer_config.json) containing chat template(s)")
|
parser.add_argument("--chat-template-config", type=Path, help="Config file containing chat template(s)", metavar='tokenizer_config.json')
|
||||||
parser.add_argument("--remove-metadata", action="append", type=str, help="Remove metadata (by key name) from output model")
|
parser.add_argument("--remove-metadata", action="append", type=str, help="Remove metadata (by key name) from output model", metavar='general.url')
|
||||||
|
parser.add_argument("--special-token", action="append", type=str, help="Special token by value", nargs=2, metavar=(' | '.join(token_names.keys()), '"<token>"'))
|
||||||
|
parser.add_argument("--special-token-by-id", action="append", type=str, help="Special token by id", nargs=2, metavar=(' | '.join(token_names.keys()), '0'))
|
||||||
parser.add_argument("--force", action="store_true", help="Bypass warnings without confirmation")
|
parser.add_argument("--force", action="store_true", help="Bypass warnings without confirmation")
|
||||||
parser.add_argument("--verbose", action="store_true", help="Increase output verbosity")
|
parser.add_argument("--verbose", action="store_true", help="Increase output verbosity")
|
||||||
args = parser.parse_args(None if len(sys.argv) > 2 else ["--help"])
|
args = parser.parse_args(None if len(sys.argv) > 2 else ["--help"])
|
||||||
|
@ -133,20 +159,20 @@ def main() -> None:
|
||||||
remove_metadata = args.remove_metadata or []
|
remove_metadata = args.remove_metadata or []
|
||||||
|
|
||||||
if args.general_name:
|
if args.general_name:
|
||||||
new_metadata[gguf.Keys.General.NAME] = args.general_name
|
new_metadata[gguf.Keys.General.NAME] = MetadataDetails(gguf.GGUFValueType.STRING, args.general_name)
|
||||||
|
|
||||||
if args.general_description:
|
if args.general_description:
|
||||||
new_metadata[gguf.Keys.General.DESCRIPTION] = args.general_description
|
new_metadata[gguf.Keys.General.DESCRIPTION] = MetadataDetails(gguf.GGUFValueType.STRING, args.general_description)
|
||||||
|
|
||||||
if args.chat_template:
|
if args.chat_template:
|
||||||
new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = json.loads(args.chat_template) if args.chat_template.startswith('[') else args.chat_template
|
new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, json.loads(args.chat_template) if args.chat_template.startswith('[') else args.chat_template)
|
||||||
|
|
||||||
if args.chat_template_config:
|
if args.chat_template_config:
|
||||||
with open(args.chat_template_config, 'r') as fp:
|
with open(args.chat_template_config, 'r') as fp:
|
||||||
config = json.load(fp)
|
config = json.load(fp)
|
||||||
template = config.get('chat_template')
|
template = config.get('chat_template')
|
||||||
if template:
|
if template:
|
||||||
new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = template
|
new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, template)
|
||||||
|
|
||||||
if remove_metadata:
|
if remove_metadata:
|
||||||
logger.warning('*** Warning *** Warning *** Warning **')
|
logger.warning('*** Warning *** Warning *** Warning **')
|
||||||
|
@ -166,6 +192,32 @@ def main() -> None:
|
||||||
arch = get_field_data(reader, gguf.Keys.General.ARCHITECTURE)
|
arch = get_field_data(reader, gguf.Keys.General.ARCHITECTURE)
|
||||||
endianess = get_byteorder(reader)
|
endianess = get_byteorder(reader)
|
||||||
|
|
||||||
|
token_list = get_field_data(reader, gguf.Keys.Tokenizer.LIST) or []
|
||||||
|
|
||||||
|
for name, token in args.special_token or []:
|
||||||
|
if name not in token_names:
|
||||||
|
logger.warning(f'Unknown special token "{name}", ignoring...')
|
||||||
|
else:
|
||||||
|
ids = find_token(token_list, token)
|
||||||
|
new_metadata[token_names[name]] = MetadataDetails(gguf.GGUFValueType.UINT32, ids[0], f'= {token}')
|
||||||
|
|
||||||
|
if len(ids) > 1:
|
||||||
|
logger.warning(f'Multiple "{token}" tokens found, choosing ID {ids[0]}, use --special-token-by-id if you want another:')
|
||||||
|
logger.warning(', '.join(str(i) for i in ids))
|
||||||
|
|
||||||
|
for name, id_string in args.special_token_by_id or []:
|
||||||
|
if name not in token_names:
|
||||||
|
logger.warning(f'Unknown special token "{name}", ignoring...')
|
||||||
|
elif not id_string.isdecimal():
|
||||||
|
raise LookupError(f'Token ID "{id_string}" is not a valid ID!')
|
||||||
|
else:
|
||||||
|
id_int = int(id_string)
|
||||||
|
|
||||||
|
if id_int >= 0 and id_int < len(token_list):
|
||||||
|
new_metadata[token_names[name]] = MetadataDetails(gguf.GGUFValueType.UINT32, id_int, f'= {token_list[id_int]}')
|
||||||
|
else:
|
||||||
|
raise LookupError(f'Token ID {id_int} is not within token list!')
|
||||||
|
|
||||||
if os.path.isfile(args.output) and not args.force:
|
if os.path.isfile(args.output) and not args.force:
|
||||||
logger.warning('*** Warning *** Warning *** Warning **')
|
logger.warning('*** Warning *** Warning *** Warning **')
|
||||||
logger.warning(f'* The "{args.output}" GGUF file already exists, it will be overwritten!')
|
logger.warning(f'* The "{args.output}" GGUF file already exists, it will be overwritten!')
|
||||||
|
|
11
llama.cpp
11
llama.cpp
|
@ -12779,7 +12779,7 @@ struct llm_tokenizer_wpm {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
code = unicode_tolower(code);
|
code = unicode_tolower(code);
|
||||||
if (type == CODEPOINT_TYPE_WHITESPACE) {
|
if (type == CODEPOINT_TYPE_SEPARATOR) {
|
||||||
code = ' ';
|
code = ' ';
|
||||||
}
|
}
|
||||||
std::string s = unicode_cpt_to_utf8(code);
|
std::string s = unicode_cpt_to_utf8(code);
|
||||||
|
@ -15824,13 +15824,6 @@ struct llama_context * llama_new_context_with_model(
|
||||||
cparams.flash_attn = false;
|
cparams.flash_attn = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef GGML_USE_HIPBLAS
|
|
||||||
if (cparams.flash_attn) {
|
|
||||||
LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with HIPBLAS builds - forcing off\n", __func__);
|
|
||||||
cparams.flash_attn = false;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||||
params.seed = time(NULL);
|
params.seed = time(NULL);
|
||||||
}
|
}
|
||||||
|
@ -18199,7 +18192,7 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
|
||||||
/*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
|
/*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
|
||||||
|
|
||||||
/*.n_sample =*/ std::max(1, ctx->n_sample),
|
/*.n_sample =*/ std::max(1, ctx->n_sample),
|
||||||
/*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
|
/*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
|
||||||
/*.n_eval =*/ std::max(1, ctx->n_eval),
|
/*.n_eval =*/ std::max(1, ctx->n_eval),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -1,31 +1,14 @@
|
||||||
import regex
|
import regex
|
||||||
|
|
||||||
|
|
||||||
def cpt_to_utf8_str(cpt):
|
|
||||||
if cpt <= 0xFF:
|
|
||||||
return bytes([cpt, 0, 0, 0])
|
|
||||||
elif cpt <= 0xFFFF:
|
|
||||||
return bytes([cpt & 0xFF, cpt >> 8, 0, 0])
|
|
||||||
elif cpt <= 0xFFFFFF:
|
|
||||||
return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, 0])
|
|
||||||
else:
|
|
||||||
return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, cpt >> 24])
|
|
||||||
|
|
||||||
|
|
||||||
def is_match(codepoint, regex_expr):
|
|
||||||
try:
|
|
||||||
res = regex.match(regex_expr, cpt_to_utf8_str(codepoint).decode('utf-32'))
|
|
||||||
return res is not None
|
|
||||||
except Exception:
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def get_matches(regex_expr):
|
def get_matches(regex_expr):
|
||||||
|
regex_expr_compiled = regex.compile(regex_expr)
|
||||||
unicode_ranges = []
|
unicode_ranges = []
|
||||||
current_range = None
|
current_range = None
|
||||||
|
|
||||||
for codepoint in range(0x110000):
|
for codepoint in range(0x110000):
|
||||||
if is_match(codepoint, regex_expr):
|
char = chr(codepoint)
|
||||||
|
if regex_expr_compiled.match(char):
|
||||||
if current_range is None:
|
if current_range is None:
|
||||||
current_range = [codepoint, codepoint]
|
current_range = [codepoint, codepoint]
|
||||||
else:
|
else:
|
||||||
|
@ -40,27 +23,42 @@ def get_matches(regex_expr):
|
||||||
return unicode_ranges
|
return unicode_ranges
|
||||||
|
|
||||||
|
|
||||||
def print_cat(cat, ranges):
|
def print_cat(mode, cat, ranges):
|
||||||
|
if mode == "range":
|
||||||
print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat)) # noqa: NP100
|
print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat)) # noqa: NP100
|
||||||
cnt = 0
|
if mode == "map":
|
||||||
for start, end in ranges:
|
print("const std::map<uint32_t, uint32_t> unicode_map_{} = {{".format(cat)) # noqa: NP100
|
||||||
if cnt % 4 != 0:
|
for i, values in enumerate(ranges):
|
||||||
print(" ", end="") # noqa: NP100
|
end = ",\n" if (i % 4 == 3 or i + 1 == len(ranges)) else ", "
|
||||||
print("{{0x{:08X}, 0x{:08X}}},".format(start, end), end="") # noqa: NP100
|
values = ["0x%08X" % value for value in values]
|
||||||
if cnt % 4 == 3:
|
print("{" + ", ".join(values) + "}", end=end) # noqa: NP100
|
||||||
print("") # noqa: NP100
|
|
||||||
cnt += 1
|
|
||||||
|
|
||||||
if cnt % 4 != 0:
|
|
||||||
print("") # noqa: NP100
|
|
||||||
print("};") # noqa: NP100
|
print("};") # noqa: NP100
|
||||||
print("") # noqa: NP100
|
print("") # noqa: NP100
|
||||||
|
|
||||||
|
|
||||||
print_cat("number", get_matches(r'\p{N}'))
|
print_cat("range", "number", get_matches(r'\p{N}'))
|
||||||
print_cat("letter", get_matches(r'\p{L}'))
|
print_cat("range", "letter", get_matches(r'\p{L}'))
|
||||||
print_cat("whitespace", get_matches(r'\p{Z}'))
|
print_cat("range", "separator", get_matches(r'\p{Z}'))
|
||||||
print_cat("accent_mark", get_matches(r'\p{M}'))
|
print_cat("range", "accent_mark", get_matches(r'\p{M}'))
|
||||||
print_cat("punctuation", get_matches(r'\p{P}'))
|
print_cat("range", "punctuation", get_matches(r'\p{P}'))
|
||||||
print_cat("symbol", get_matches(r'\p{S}'))
|
print_cat("range", "symbol", get_matches(r'\p{S}'))
|
||||||
print_cat("control", get_matches(r'\p{C}'))
|
print_cat("range", "control", get_matches(r'\p{C}'))
|
||||||
|
|
||||||
|
print_cat("range", "whitespace", get_matches(r'\s'))
|
||||||
|
|
||||||
|
|
||||||
|
map_lowercase = []
|
||||||
|
map_uppercase = []
|
||||||
|
for codepoint in range(0x110000):
|
||||||
|
char = chr(codepoint)
|
||||||
|
lower = ord(char.lower()[0])
|
||||||
|
upper = ord(char.upper()[0])
|
||||||
|
if codepoint != lower:
|
||||||
|
map_lowercase.append((codepoint, lower))
|
||||||
|
if codepoint != upper:
|
||||||
|
map_uppercase.append((codepoint, upper))
|
||||||
|
print_cat("map", "lowercase", map_lowercase)
|
||||||
|
print_cat("map", "uppercase", map_uppercase)
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: generate unicode_map_nfd
|
||||||
|
|
295
tests/test-tokenizer-random.py
Normal file
295
tests/test-tokenizer-random.py
Normal file
|
@ -0,0 +1,295 @@
|
||||||
|
# Test libllama tokenizer == AutoTokenizer.
|
||||||
|
# Brute force random tokens/text generation.
|
||||||
|
#
|
||||||
|
# Sample usage:
|
||||||
|
#
|
||||||
|
# python3 tests/test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe
|
||||||
|
#
|
||||||
|
|
||||||
|
import time
|
||||||
|
import logging
|
||||||
|
import argparse
|
||||||
|
import subprocess
|
||||||
|
import random
|
||||||
|
|
||||||
|
from typing import Iterator
|
||||||
|
|
||||||
|
import cffi
|
||||||
|
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
||||||
|
|
||||||
|
logger = logging.getLogger("test-tokenizer-random-bpe")
|
||||||
|
|
||||||
|
|
||||||
|
class LibLlama:
    """Minimal cffi binding to the libllama shared library.

    The C declarations are obtained by running the llama.h header through the
    gcc preprocessor and handing the result to cffi; the shared library is then
    opened with ``ffi.dlopen``. ``llama_backend_init()`` is called once on
    construction.
    """

    DEFAULT_PATH_LLAMA_H = "./llama.h"
    DEFAULT_PATH_LIBLLAMA = "./build/libllama.so"  # CMakeLists.txt: BUILD_SHARED_LIBS ON

    def __init__(self, path_llama_h: str = None, path_libllama: str = None):
        """Load the header and library (falling back to the class defaults) and init the backend."""
        path_llama_h = path_llama_h or self.DEFAULT_PATH_LLAMA_H
        path_libllama = path_libllama or self.DEFAULT_PATH_LIBLLAMA
        (self.ffi, self.lib) = self._load_libllama_cffi(path_llama_h, path_libllama)
        self.lib.llama_backend_init()

    def _load_libllama_cffi(self, path_llama_h: str, path_libllama: str):
        """Preprocess ``path_llama_h`` with gcc, declare it to cffi and dlopen ``path_libllama``.

        Returns:
            A ``(ffi, lib)`` tuple.

        Raises:
            RuntimeError: if the gcc preprocessor exits with a non-zero status.
        """
        cmd = ["gcc", "-E", "-P", "-D__restrict=", "-D__attribute__(x)=", "-D__asm__(x)=", path_llama_h]
        res = subprocess.run(cmd, stdout=subprocess.PIPE)
        # Explicit check instead of `assert`: asserts are removed under `python -O`,
        # which would let an empty/partial header reach cffi.
        if res.returncode != 0:
            raise RuntimeError("error: failed to preprocess '%s' (gcc exit code %d)" % (path_llama_h, res.returncode))
        source = res.stdout.decode()
        ffi = cffi.FFI()
        # workarounds for pycparser: it cannot parse __builtin_va_list or sizeof
        # expressions, so provide a dummy typedef and substitute literal sizes.
        source = "typedef struct { } __builtin_va_list;" + "\n" + source
        source = source.replace("sizeof (int)", str(ffi.sizeof("int")))
        source = source.replace("sizeof (void *)", str(ffi.sizeof("void*")))
        source = source.replace("sizeof (size_t)", str(ffi.sizeof("size_t")))
        source = source.replace("sizeof(int32_t)", str(ffi.sizeof("int32_t")))
        ffi.cdef(source, override=True)
        lib = ffi.dlopen(path_libllama)
        return (ffi, lib)

    def model_default_params(self, **kwargs):
        """Return ``llama_model_default_params()`` with each keyword applied as a field override."""
        mparams = self.lib.llama_model_default_params()
        for k, v in kwargs.items():
            setattr(mparams, k, v)
        return mparams

    def context_default_params(self, **kwargs):
        """Return ``llama_context_default_params()`` with each keyword applied as a field override."""
        cparams = self.lib.llama_context_default_params()
        for k, v in kwargs.items():
            setattr(cparams, k, v)
        return cparams
|
||||||
|
|
||||||
|
|
||||||
|
class LibLlamaModel:
    """A loaded model plus context, with a reusable token buffer for tokenization.

    The token buffer is allocated once in ``__init__`` with ``llama_n_ctx(ctx)``
    entries and reused by every ``tokenize`` call.
    """

    def __init__(self, libllama: "LibLlama", path_model: str, mparams=None, cparams=None):
        """Load ``path_model`` and create a context for it.

        Args:
            libllama: the binding object providing ``lib``, ``ffi`` and the
                ``*_default_params`` helpers.
            path_model: path of the model file to load.
            mparams: dict of model-parameter overrides, or an already built
                parameter struct. ``None`` means no overrides.
            cparams: dict of context-parameter overrides, or an already built
                parameter struct. ``None`` means no overrides.

        Raises:
            RuntimeError: if the model cannot be loaded or the context cannot
                be created (the model is released before raising).
        """
        self.lib = libllama.lib
        self.ffi = libllama.ffi
        # Initialise the handles first so free() is safe after a partial construction.
        self.model = None
        self.ctx = None
        # None instead of a shared mutable `{}` default.
        if mparams is None:
            mparams = {}
        if cparams is None:
            cparams = {}
        if isinstance(mparams, dict):
            mparams = libllama.model_default_params(**mparams)
        self.model = self.lib.llama_load_model_from_file(path_model.encode(), mparams)
        if not self.model:
            raise RuntimeError("error: failed to load model '%s'" % path_model)
        if isinstance(cparams, dict):
            cparams = libllama.context_default_params(**cparams)
        self.ctx = self.lib.llama_new_context_with_model(self.model, cparams)
        if not self.ctx:
            # Do not leak the model that was loaded just above.
            self.free()
            raise RuntimeError("error: failed to create context for model '%s'" % path_model)
        n_tokens_max = self.lib.llama_n_ctx(self.ctx)
        self.token_ids = self.ffi.new("llama_token[]", n_tokens_max)

    def free(self):
        """Release the context and the model (if held) and drop the library reference."""
        if self.ctx:
            self.lib.llama_free(self.ctx)
        if self.model:
            self.lib.llama_free_model(self.model)
        self.ctx = None
        self.model = None
        self.lib = None

    def tokenize(self, text: str, n_tokens_max: int = 0, add_special: bool = False, parse_special: bool = False) -> list[int]:
        """Tokenize ``text`` with ``llama_tokenize`` and return the token IDs.

        Args:
            text: the text to tokenize; it is passed to the library UTF-8 encoded.
            n_tokens_max: upper bound on the number of tokens; values <= 0 mean
                "use the whole internal buffer". Larger values are clamped to
                the buffer size.
            add_special: forwarded to ``llama_tokenize``.
            parse_special: forwarded to ``llama_tokenize``.

        Returns:
            The token IDs, or an empty list when ``llama_tokenize`` reports a
            negative count.
        """
        capacity = len(self.token_ids)
        # The C function writes into self.token_ids, so the limit handed to it
        # must never exceed the number of entries allocated in __init__.
        n_tokens_max = min(n_tokens_max, capacity) if n_tokens_max > 0 else capacity
        text = text.encode("utf-8")
        num = self.lib.llama_tokenize(self.model, text, len(text), self.token_ids, n_tokens_max, add_special, parse_special)
        if num < 0:
            return []
        return list(self.token_ids[0:num])
|
||||||
|
|
||||||
|
|
||||||
|
def generator_custom_text() -> Iterator[str]:
|
||||||
|
"""General tests"""
|
||||||
|
yield from [
|
||||||
|
"",
|
||||||
|
" ",
|
||||||
|
" ",
|
||||||
|
" ",
|
||||||
|
"\t",
|
||||||
|
"\n",
|
||||||
|
"\n\n",
|
||||||
|
"\n\n\n",
|
||||||
|
"\t\n",
|
||||||
|
"Hello world",
|
||||||
|
" Hello world",
|
||||||
|
"Hello World",
|
||||||
|
" Hello World",
|
||||||
|
" Hello World!",
|
||||||
|
"Hello, world!",
|
||||||
|
" Hello, world!",
|
||||||
|
" this is 🦙.cpp",
|
||||||
|
"w048 7tuijk dsdfhu",
|
||||||
|
"нещо на Български",
|
||||||
|
"កាន់តែពិសេសអាចខលចេញ",
|
||||||
|
"🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
|
||||||
|
"Hello",
|
||||||
|
" Hello",
|
||||||
|
" Hello",
|
||||||
|
" Hello",
|
||||||
|
" Hello",
|
||||||
|
" Hello\n Hello",
|
||||||
|
" (",
|
||||||
|
"\n =",
|
||||||
|
"' era",
|
||||||
|
"Hello, y'all! How are you 😁 ?我想在apple工作1314151天~",
|
||||||
|
"3",
|
||||||
|
"33",
|
||||||
|
"333",
|
||||||
|
"3333",
|
||||||
|
"33333",
|
||||||
|
"333333",
|
||||||
|
"3333333",
|
||||||
|
"33333333",
|
||||||
|
"333333333",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def generator_custom_text_edge_cases() -> Iterator[str]:
    """Edge cases found while debugging"""
    edge_cases = (
        '\x1f-a',     # unicode_ranges_control, {0x00001C, 0x00001F}
        '¼-a',        # unicode_ranges_digit, 0x00BC
        '½-a',        # unicode_ranges_digit, 0x00BD
        '¾-a',        # unicode_ranges_digit, 0x00BE
        'a 〇b',      # unicode_ranges_digit, 0x3007
        'Ⅵ-a',        # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
        '\uFEFF//',   # unicode_ranges_control, 0xFEFF (BOM)
        '<s>a',       # TODO: Phi-3 fail
    )
    for sample in edge_cases:
        yield sample
|
||||||
|
|
||||||
|
|
||||||
|
def generator_random_chars(iterations = 100) -> Iterator[str]:
    """Brute force random text with simple characters

    Yields `iterations` texts. Text number `m` is produced from a generator
    seeded with `m`, and consists of 300-400 "words" of 1-7 random characters,
    each followed by one whitespace character.
    """

    WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
    # sorted(): iteration order of a set of str depends on PYTHONHASHSEED, so a
    # plain list(set(...)) would make the seeded output differ between runs.
    # Note the literal's line breaks and indentation are part of the alphabet.
    CHARS = sorted(set("""
    ABCDEFGHIJKLMNOPQRSTUVWXYZ
    abcdefghijklmnopqrstuvwxyz
    ÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÄËÏÖÜ
    áéíóúàèìòùâêîôûäëïöü
    .-,*/-+ª!"·$%&/()=?¿[]{}<>\\|@#~½¬~;:_
    """))

    rand = random.Random()
    for m in range(iterations):
        rand.seed(m)  # one fixed seed per text keeps every text reproducible
        text = []
        num_words = rand.randint(300, 400)
        for _ in range(num_words):
            k = rand.randint(1, 7)
            word = rand.choices(CHARS, k=k)
            space = rand.choice(WHITESPACES)
            text.append("".join(word) + space)
        yield "".join(text)
|
||||||
|
|
||||||
|
|
||||||
|
def generator_random_vocab_chars(tokenizer: PreTrainedTokenizerBase, iterations = 100) -> Iterator[str]:
    """Brute force random text with vocab characters

    Decodes the whole vocabulary once, collects the distinct characters of the
    decoded text and yields `iterations` texts of 1024 characters drawn from
    them. Text number `m` is produced from a generator seeded with `m`.
    """

    vocab_ids = list(tokenizer.vocab.values())
    vocab_text = tokenizer.decode(vocab_ids, skip_special_tokens=True)
    # sorted(): iteration order of a set of str depends on PYTHONHASHSEED, so a
    # plain list(set(...)) would make the seeded output differ between runs.
    vocab_chars = sorted(set(vocab_text))
    del vocab_ids, vocab_text

    rand = random.Random()
    for m in range(iterations):
        rand.seed(m)
        text = rand.choices(vocab_chars, k=1024)
        yield "".join(text)
|
||||||
|
|
||||||
|
|
||||||
|
def generator_random_vocab_tokens(tokenizer: PreTrainedTokenizerBase, iterations = 100) -> Iterator[str]:
    """Brute force random text from vocab tokens

    First yields every piece obtained by decoding the sorted vocabulary ids
    interleaved with the id of " " and splitting the result on " ". Then
    yields `iterations` random texts assembled from those pieces; text number
    `m` is produced from a generator seeded with `m`.
    """

    space_id = tokenizer.encode(" ", add_special_tokens=False)[0]

    # Build [id0, space, id1, space, ...] over the sorted vocabulary ids.
    interleaved_ids = []
    for token_id in sorted(tokenizer.vocab.values()):
        interleaved_ids += [token_id, space_id]
    vocab_tokens = tokenizer.decode(interleaved_ids, skip_special_tokens=True).split(" ")
    del interleaved_ids

    for token in vocab_tokens:
        yield token

    rand = random.Random()
    for seed in range(iterations):
        rand.seed(seed)
        pieces = []
        for _ in range(rand.randint(300, 400)):
            count = rand.randint(1, 3)
            picked = rand.choices(vocab_tokens, k=count)
            joined = "".join([t.strip(" \n\r\t") for t in picked])
            pieces.append(joined + rand.choice(" \n\r\t"))
        yield "".join(pieces)
|
||||||
|
|
||||||
|
|
||||||
|
def generator_random_bytes(iterations = 100) -> Iterator[str]:
|
||||||
|
"""Brute force random bytes"""
|
||||||
|
|
||||||
|
WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
|
||||||
|
|
||||||
|
rand = random.Random()
|
||||||
|
for m in range(iterations):
|
||||||
|
rand.seed(m)
|
||||||
|
text = []
|
||||||
|
num_words = rand.randint(300, 400)
|
||||||
|
for i in range(num_words):
|
||||||
|
k = rand.randint(1, 8)
|
||||||
|
word = [chr(r) for r in rand.randbytes(k) if r]
|
||||||
|
word.append(rand.choice(WHITESPACES))
|
||||||
|
text.append("".join(word))
|
||||||
|
yield "".join(text)
|
||||||
|
|
||||||
|
|
||||||
|
def test_compare_tokenizer(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase, generator: Iterator[str]):
    """Check that `model` and `tokenizer` produce the same ids for every text.

    Each text from `generator` is tokenized with `model.tokenize` and with
    `tokenizer.encode` (both without special tokens). On the first difference
    a small window of ids around the mismatch is logged and an exception is
    raised; otherwise the elapsed time is logged.

    Raises:
        Exception: when the two id sequences differ for some text.
    """

    def find_first_mismatch(ids1: list[int], ids2: list[int]):
        # Index of the first differing position, -1 when the lists are equal,
        # or the shorter length when one list is a prefix of the other.
        for i, (a, b) in enumerate(zip(ids1, ids2)):
            if a != b:
                return i
        if len(ids1) == len(ids2):
            return -1
        return min(len(ids1), len(ids2))

    # Generator objects expose __name__; fall back to repr() so that any other
    # iterator can be passed without failing on the log lines.
    name = getattr(generator, "__name__", repr(generator))

    t0 = time.perf_counter()
    logger.info("%s: %s" % (name, "ini"))
    for text in generator:
        ids1 = model.tokenize(text, add_special=False, parse_special=False)
        ids2 = tokenizer.encode(text, add_special_tokens=False)
        if ids1 != ids2:
            i = find_first_mismatch(ids1, ids2)
            # keep two ids of context on each side of the mismatch
            ids1 = list(ids1)[max(0, i - 2) : i + 2 + 1]
            ids2 = list(ids2)[max(0, i - 2) : i + 2 + 1]
            text2 = tokenizer.decode(ids2, skip_special_tokens=True)
            assert (text2 in text)
            logger.info(" Text: " + repr(text2))
            logger.info(" TokenIDs: " + str(ids1))
            logger.info(" Expected: " + str(ids2))
            raise Exception("%s: tokenizer mismatch at token index %d" % (name, i))
    t1 = time.perf_counter()
    logger.info("%s: end, time: %.3f secs" % (name, t1 - t0))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":

    # Command line: <vocab_file> <dir_tokenizer> [--verbose]
    parser = argparse.ArgumentParser()
    parser.add_argument("vocab_file", help="path to vocab 'gguf' file")
    parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    model = LibLlamaModel(LibLlama(), args.vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=2048))

    # try/finally: a failing comparison raises, and the native handles must
    # still be released in that case.
    try:
        tokenizer = AutoTokenizer.from_pretrained(args.dir_tokenizer)

        test_compare_tokenizer(model, tokenizer, generator_custom_text())
        test_compare_tokenizer(model, tokenizer, generator_custom_text_edge_cases())
        test_compare_tokenizer(model, tokenizer, generator_random_chars(10_000))
        test_compare_tokenizer(model, tokenizer, generator_random_vocab_chars(tokenizer, 10_000))
        test_compare_tokenizer(model, tokenizer, generator_random_vocab_tokens(tokenizer, 10_000))
        # test_compare_tokenizer(model, tokenizer, generator_random_bytes(10_000)) # FAIL
    finally:
        model.free()
|
1262
unicode-data.cpp
1262
unicode-data.cpp
File diff suppressed because it is too large
Load diff
|
@ -7,6 +7,7 @@
|
||||||
|
|
||||||
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_number;
|
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_number;
|
||||||
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
|
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
|
||||||
|
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_separator;
|
||||||
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
|
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
|
||||||
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark;
|
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark;
|
||||||
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation;
|
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation;
|
||||||
|
|
368
unicode.cpp
368
unicode.cpp
|
@ -9,6 +9,7 @@
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
|
#include <unordered_set>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <locale>
|
#include <locale>
|
||||||
|
@ -111,27 +112,27 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset)
|
||||||
static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
|
static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
|
||||||
std::unordered_map<uint32_t, int> cpt_types;
|
std::unordered_map<uint32_t, int> cpt_types;
|
||||||
for (auto p : unicode_ranges_number) {
|
for (auto p : unicode_ranges_number) {
|
||||||
for (auto i = p.first; i <= p.second; ++ i) {
|
for (auto i = p.first; i <= p.second; ++i) {
|
||||||
cpt_types[i] = CODEPOINT_TYPE_NUMBER;
|
cpt_types[i] = CODEPOINT_TYPE_NUMBER;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (auto p : unicode_ranges_letter) {
|
for (auto p : unicode_ranges_letter) {
|
||||||
for (auto i = p.first; i <= p.second; ++ i) {
|
for (auto i = p.first; i <= p.second; ++i) {
|
||||||
cpt_types[i] = CODEPOINT_TYPE_LETTER;
|
cpt_types[i] = CODEPOINT_TYPE_LETTER;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (auto p : unicode_ranges_whitespace) {
|
for (auto p : unicode_ranges_separator) {
|
||||||
for (auto i = p.first; i <= p.second; ++ i) {
|
for (auto i = p.first; i <= p.second; ++i) {
|
||||||
cpt_types[i] = CODEPOINT_TYPE_WHITESPACE;
|
cpt_types[i] = CODEPOINT_TYPE_SEPARATOR;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (auto p : unicode_ranges_accent_mark) {
|
for (auto p : unicode_ranges_accent_mark) {
|
||||||
for (auto i = p.first; i <= p.second; ++ i) {
|
for (auto i = p.first; i <= p.second; ++i) {
|
||||||
cpt_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
|
cpt_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (auto p : unicode_ranges_punctuation) {
|
for (auto p : unicode_ranges_punctuation) {
|
||||||
for (auto i = p.first; i <= p.second; ++ i) {
|
for (auto i = p.first; i <= p.second; ++i) {
|
||||||
cpt_types[i] = CODEPOINT_TYPE_PUNCTUATION;
|
cpt_types[i] = CODEPOINT_TYPE_PUNCTUATION;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -141,7 +142,7 @@ static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (auto p : unicode_ranges_control) {
|
for (auto p : unicode_ranges_control) {
|
||||||
for (auto i = p.first; i <= p.second; ++ i) {
|
for (auto i = p.first; i <= p.second; ++i) {
|
||||||
cpt_types[i] = CODEPOINT_TYPE_CONTROL;
|
cpt_types[i] = CODEPOINT_TYPE_CONTROL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -224,138 +225,256 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
|
||||||
std::vector<size_t> bpe_offsets; // store the offset of each word
|
std::vector<size_t> bpe_offsets; // store the offset of each word
|
||||||
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
|
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
|
||||||
|
|
||||||
|
const auto cpts = unicode_cpts_from_utf8(text);
|
||||||
|
|
||||||
size_t start = 0;
|
size_t start = 0;
|
||||||
|
for (auto offset : offsets) {
|
||||||
|
const size_t offset_ini = start;
|
||||||
|
const size_t offset_end = start + offset;
|
||||||
|
assert(offset_end <= cpts.size());
|
||||||
|
start = offset_end;
|
||||||
|
|
||||||
|
auto _get_cpt = [&] (const size_t pos) -> char32_t {
|
||||||
|
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
auto _get_cpt_type = [&] (const size_t pos) -> int {
|
||||||
|
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
|
||||||
|
};
|
||||||
|
|
||||||
|
size_t _prev_end = offset_ini;
|
||||||
|
auto _add_token = [&] (const size_t end) -> size_t {
|
||||||
|
assert(_prev_end <= end && end <= offset_end);
|
||||||
|
size_t len = end - _prev_end;
|
||||||
|
if (len > 0) {
|
||||||
|
bpe_offsets.push_back(len);
|
||||||
|
}
|
||||||
|
_prev_end = end;
|
||||||
|
//if (len > 0) {
|
||||||
|
// std::string s = "";
|
||||||
|
// for(size_t p = end-len; p < end; p++)
|
||||||
|
// s += unicode_cpt_to_utf8(cpts[p]);
|
||||||
|
// printf(">>> '%s'\n", s.c_str());
|
||||||
|
//}
|
||||||
|
return len;
|
||||||
|
};
|
||||||
|
|
||||||
|
for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
|
||||||
|
const char32_t cpt = _get_cpt(pos);
|
||||||
|
const int cpt_type = _get_cpt_type(pos);
|
||||||
|
|
||||||
|
// regex: 's|'t|'re|'ve|'m|'ll|'d
|
||||||
|
if (cpt == '\'' && pos+1 < offset_end) {
|
||||||
|
char32_t cpt_next = _get_cpt(pos+1);
|
||||||
|
if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
|
||||||
|
pos += _add_token(pos+2);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (pos+2 < offset_end) {
|
||||||
|
char32_t cpt_next_next = _get_cpt(pos+2);
|
||||||
|
if ((cpt_next == 'r' && cpt_next_next == 'e') ||
|
||||||
|
(cpt_next == 'v' && cpt_next_next == 'e') ||
|
||||||
|
(cpt_next == 'l' && cpt_next_next == 'l')) {
|
||||||
|
pos += _add_token(pos+3);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
|
||||||
|
int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
|
||||||
|
// regex: <space>?\p{L}+
|
||||||
|
if (cpt2_type == CODEPOINT_TYPE_LETTER) {
|
||||||
|
pos += (cpt == ' ');
|
||||||
|
while (cpt2_type == CODEPOINT_TYPE_LETTER) {
|
||||||
|
cpt2_type = _get_cpt_type(++pos);
|
||||||
|
}
|
||||||
|
_add_token(pos);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// regex: <space>?\p{N}+
|
||||||
|
if (cpt2_type == CODEPOINT_TYPE_NUMBER) {
|
||||||
|
pos += (cpt == ' ');
|
||||||
|
while (cpt2_type == CODEPOINT_TYPE_NUMBER) {
|
||||||
|
cpt2_type = _get_cpt_type(++pos);
|
||||||
|
}
|
||||||
|
_add_token(pos);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// regex: <space>?[^\s\p{L}\p{N}]+
|
||||||
|
if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
|
||||||
|
pos += (cpt == ' ');
|
||||||
|
while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
|
||||||
|
cpt2_type = _get_cpt_type(++pos);
|
||||||
|
cpt2 = _get_cpt(pos);
|
||||||
|
}
|
||||||
|
_add_token(pos);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t num_whitespaces = 0;
|
||||||
|
while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
|
||||||
|
num_whitespaces++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// regex: \s+(?!\S)
|
||||||
|
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
|
||||||
|
pos += num_whitespaces - 1;
|
||||||
|
_add_token(pos);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// regex: \s+
|
||||||
|
if (num_whitespaces > 0) {
|
||||||
|
pos += num_whitespaces;
|
||||||
|
_add_token(pos);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// no matches
|
||||||
|
_add_token(++pos);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return bpe_offsets;
|
||||||
|
}
|
||||||
|
|
||||||
|
// LLAMA3 system regex: "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
|
||||||
|
static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string & text, const std::vector<size_t> & offsets) {
|
||||||
|
std::vector<size_t> bpe_offsets; // store the offset of each word
|
||||||
|
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
|
||||||
|
|
||||||
const auto cpts = unicode_cpts_from_utf8(text);
|
const auto cpts = unicode_cpts_from_utf8(text);
|
||||||
|
|
||||||
|
size_t start = 0;
|
||||||
for (auto offset : offsets) {
|
for (auto offset : offsets) {
|
||||||
std::string token;
|
const size_t offset_ini = start;
|
||||||
|
const size_t offset_end = start + offset;
|
||||||
|
assert(offset_end <= cpts.size());
|
||||||
|
start = offset_end;
|
||||||
|
|
||||||
bool collecting_numeric = false;
|
auto _get_cpt = [&] (const size_t pos) -> char32_t {
|
||||||
bool collecting_letter = false;
|
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
|
||||||
bool collecting_special = false;
|
};
|
||||||
bool collecting_whitespace_lookahead = false;
|
|
||||||
bool collecting = false;
|
|
||||||
|
|
||||||
std::vector<std::string> text_utf;
|
auto _get_cpt_type = [&] (const size_t pos) -> int {
|
||||||
text_utf.reserve(offset);
|
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
|
||||||
|
};
|
||||||
|
|
||||||
for (size_t i = start; i < start + offset; ++i) {
|
size_t _prev_end = offset_ini;
|
||||||
text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
|
auto _add_token = [&] (const size_t end) -> size_t {
|
||||||
|
assert(_prev_end <= end && end <= offset_end);
|
||||||
|
size_t len = end - _prev_end;
|
||||||
|
if (len > 0) {
|
||||||
|
bpe_offsets.push_back(len);
|
||||||
}
|
}
|
||||||
|
_prev_end = end;
|
||||||
|
//if (len > 0) {
|
||||||
|
// std::string s = "";
|
||||||
|
// for(size_t p = end-len; p < end; p++)
|
||||||
|
// s += unicode_cpt_to_utf8(cpts[p]);
|
||||||
|
// printf(">>> '%s'\n", s.c_str());
|
||||||
|
//}
|
||||||
|
return len;
|
||||||
|
};
|
||||||
|
|
||||||
for (int i = 0; i < (int)text_utf.size(); i++) {
|
for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
|
||||||
const std::string & utf_char = text_utf[i];
|
const char32_t cpt = _get_cpt(pos);
|
||||||
bool split_condition = false;
|
const int cpt_type = _get_cpt_type(pos);
|
||||||
int bytes_remain = text_utf.size() - i;
|
|
||||||
|
|
||||||
// forward backward lookups
|
// regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
|
||||||
const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
|
if (cpt == '\'' && pos+1 < offset_end) {
|
||||||
const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
|
char32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
|
||||||
|
if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
|
||||||
// handling contractions
|
pos += _add_token(pos+2);
|
||||||
if (!split_condition && bytes_remain >= 2) {
|
continue;
|
||||||
// 's|'t|'m|'d
|
|
||||||
if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
|
|
||||||
split_condition = true;
|
|
||||||
}
|
}
|
||||||
if (split_condition) {
|
if (pos+2 < offset_end) {
|
||||||
if (token.size()) {
|
char32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
|
||||||
bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
|
if ((cpt_next == 'r' && cpt_next_next == 'e') ||
|
||||||
}
|
(cpt_next == 'v' && cpt_next_next == 'e') ||
|
||||||
token = utf_char + utf_char_next;
|
(cpt_next == 'l' && cpt_next_next == 'l')) {
|
||||||
bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
|
pos += _add_token(pos+3);
|
||||||
token = "";
|
|
||||||
i++;
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!split_condition && bytes_remain >= 3) {
|
|
||||||
// 're|'ve|'ll
|
|
||||||
if (utf_char == "\'" && (
|
|
||||||
(utf_char_next == "r" && utf_char_next_next == "e") ||
|
|
||||||
(utf_char_next == "v" && utf_char_next_next == "e") ||
|
|
||||||
(utf_char_next == "l" && utf_char_next_next == "l"))
|
|
||||||
) {
|
|
||||||
split_condition = true;
|
|
||||||
}
|
}
|
||||||
if (split_condition) {
|
|
||||||
// current token + next token can be defined
|
|
||||||
if (token.size()) {
|
|
||||||
bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
|
|
||||||
}
|
|
||||||
token = utf_char;
|
|
||||||
token += utf_char_next;
|
|
||||||
token += utf_char_next_next;
|
|
||||||
|
|
||||||
bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
|
// regex: [^\r\n\p{L}\p{N}]?\p{L}+ //####FIXME: the first \p{L} is correct?
|
||||||
token = "";
|
if (cpt != '\r' && cpt != '\n' && /*cpt_type != CODEPOINT_TYPE_LETTER &&*/ cpt_type != CODEPOINT_TYPE_NUMBER) {
|
||||||
i += 2;
|
if (cpt_type == CODEPOINT_TYPE_LETTER || _get_cpt_type(pos+1) == CODEPOINT_TYPE_LETTER) { // one or more letters
|
||||||
|
pos++;
|
||||||
|
while (_get_cpt_type(pos) == CODEPOINT_TYPE_LETTER) {
|
||||||
|
pos++;
|
||||||
|
}
|
||||||
|
_add_token(pos);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!split_condition && !collecting) {
|
// regex: \p{N}{1,3}
|
||||||
if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
|
if (cpt_type == CODEPOINT_TYPE_NUMBER) {
|
||||||
collecting_letter = true;
|
size_t ini = pos;
|
||||||
collecting = true;
|
while (_get_cpt_type(pos) == CODEPOINT_TYPE_NUMBER) {
|
||||||
}
|
if (++pos - ini >= 3 ) {
|
||||||
else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_NUMBER || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_NUMBER)) {
|
_add_token(pos);
|
||||||
collecting_numeric = true;
|
ini = pos;
|
||||||
collecting = true;
|
|
||||||
}
|
|
||||||
else if (
|
|
||||||
((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_NUMBER) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
|
|
||||||
(token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_NUMBER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
|
|
||||||
) {
|
|
||||||
collecting_special = true;
|
|
||||||
collecting = true;
|
|
||||||
}
|
|
||||||
else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
|
|
||||||
collecting_whitespace_lookahead = true;
|
|
||||||
collecting = true;
|
|
||||||
}
|
|
||||||
else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
|
|
||||||
split_condition = true;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (!split_condition && collecting) {
|
_add_token(pos);
|
||||||
if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
|
continue;
|
||||||
split_condition = true;
|
|
||||||
}
|
|
||||||
else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_NUMBER) {
|
|
||||||
split_condition = true;
|
|
||||||
}
|
|
||||||
else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_NUMBER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
|
|
||||||
split_condition = true;
|
|
||||||
}
|
|
||||||
else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_NUMBER)) {
|
|
||||||
split_condition = true;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (utf_char_next == "") {
|
// regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
|
||||||
split_condition = true; // final
|
char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
|
||||||
token += utf_char;
|
int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
|
||||||
|
if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
|
||||||
|
pos += (cpt == ' ');
|
||||||
|
while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
|
||||||
|
cpt2_type = _get_cpt_type(++pos);
|
||||||
|
cpt2 = _get_cpt(pos);
|
||||||
|
}
|
||||||
|
while (cpt2 == '\r' || cpt2 == '\n') {
|
||||||
|
cpt2 = _get_cpt(++pos);
|
||||||
|
}
|
||||||
|
_add_token(pos);
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (split_condition) {
|
size_t num_whitespaces = 0;
|
||||||
if (token.size()) {
|
size_t last_end_r_or_n = 0;
|
||||||
bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
|
while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
|
||||||
}
|
char32_t cpt2 = _get_cpt(pos+num_whitespaces);
|
||||||
token = utf_char;
|
if (cpt2 == '\r' || cpt2 == '\n') {
|
||||||
collecting = false;
|
last_end_r_or_n = pos + num_whitespaces + 1;
|
||||||
collecting_letter = false;
|
|
||||||
collecting_numeric = false;
|
|
||||||
collecting_special = false;
|
|
||||||
collecting_whitespace_lookahead = false;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
token += utf_char;
|
|
||||||
}
|
}
|
||||||
|
num_whitespaces++;
|
||||||
}
|
}
|
||||||
|
|
||||||
start += offset;
|
// regex: \s*[\r\n]+
|
||||||
|
if (last_end_r_or_n > 0) {
|
||||||
|
pos = last_end_r_or_n;
|
||||||
|
_add_token(pos);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// regex: \s+(?!\S)
|
||||||
|
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
|
||||||
|
pos += num_whitespaces - 1;
|
||||||
|
_add_token(pos);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// regex: \s+
|
||||||
|
if (num_whitespaces > 0) {
|
||||||
|
pos += num_whitespaces;
|
||||||
|
_add_token(pos);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// no matches
|
||||||
|
_add_token(++pos);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return bpe_offsets;
|
return bpe_offsets;
|
||||||
|
@ -424,14 +543,14 @@ static std::vector<size_t> unicode_regex_split_stl(const std::string & text, con
|
||||||
static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
|
static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
|
||||||
std::vector<size_t> bpe_offsets;
|
std::vector<size_t> bpe_offsets;
|
||||||
|
|
||||||
(void)(text);
|
if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
|
||||||
(void)(regex_expr);
|
bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
|
||||||
(void)(offsets);
|
} else if (
|
||||||
// TODO: this implementation is actually wrong, uncomment and run:
|
regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" ||
|
||||||
// make -j && ./bin/test-tokenizer-0 ../models/ggml-vocab-gpt-2.gguf
|
regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
|
||||||
//if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
|
|
||||||
// bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
|
bpe_offsets = unicode_regex_split_custom_llama3(text, offsets);
|
||||||
//}
|
}
|
||||||
|
|
||||||
return bpe_offsets;
|
return bpe_offsets;
|
||||||
}
|
}
|
||||||
|
@ -506,6 +625,19 @@ int unicode_cpt_type(const std::string & utf8) {
|
||||||
return unicode_cpt_type(unicode_cpt_from_utf8(utf8, offset));
|
return unicode_cpt_type(unicode_cpt_from_utf8(utf8, offset));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool unicode_cpt_is_whitespace(uint32_t cp) {
|
||||||
|
static const std::unordered_set<uint32_t> is_whitespace = [] {
|
||||||
|
std::unordered_set<uint32_t> is_whitespace;
|
||||||
|
for (auto p : unicode_ranges_whitespace) {
|
||||||
|
for (auto i = p.first; i <= p.second; ++i) {
|
||||||
|
is_whitespace.insert(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return is_whitespace;
|
||||||
|
}();
|
||||||
|
return (bool)is_whitespace.count(cp);
|
||||||
|
}
|
||||||
|
|
||||||
std::string unicode_byte_to_utf8(uint8_t byte) {
|
std::string unicode_byte_to_utf8(uint8_t byte) {
|
||||||
static std::unordered_map<uint8_t, std::string> map = unicode_byte_to_utf8_map();
|
static std::unordered_map<uint8_t, std::string> map = unicode_byte_to_utf8_map();
|
||||||
return map.at(byte);
|
return map.at(byte);
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
#define CODEPOINT_TYPE_UNIDENTIFIED 0
|
#define CODEPOINT_TYPE_UNIDENTIFIED 0
|
||||||
#define CODEPOINT_TYPE_NUMBER 1
|
#define CODEPOINT_TYPE_NUMBER 1
|
||||||
#define CODEPOINT_TYPE_LETTER 2
|
#define CODEPOINT_TYPE_LETTER 2
|
||||||
#define CODEPOINT_TYPE_WHITESPACE 3
|
#define CODEPOINT_TYPE_SEPARATOR 3
|
||||||
#define CODEPOINT_TYPE_ACCENT_MARK 4
|
#define CODEPOINT_TYPE_ACCENT_MARK 4
|
||||||
#define CODEPOINT_TYPE_PUNCTUATION 5
|
#define CODEPOINT_TYPE_PUNCTUATION 5
|
||||||
#define CODEPOINT_TYPE_SYMBOL 6
|
#define CODEPOINT_TYPE_SYMBOL 6
|
||||||
|
@ -21,6 +21,8 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
|
||||||
int unicode_cpt_type(uint32_t cp);
|
int unicode_cpt_type(uint32_t cp);
|
||||||
int unicode_cpt_type(const std::string & utf8);
|
int unicode_cpt_type(const std::string & utf8);
|
||||||
|
|
||||||
|
bool unicode_cpt_is_whitespace(uint32_t cp);
|
||||||
|
|
||||||
std::string unicode_byte_to_utf8(uint8_t byte);
|
std::string unicode_byte_to_utf8(uint8_t byte);
|
||||||
uint8_t unicode_utf8_to_byte(const std::string & utf8);
|
uint8_t unicode_utf8_to_byte(const std::string & utf8);
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue