mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-19 16:31:59 +00:00
Merge commit 'de1aa6fa73' into concedo_experimental
# Conflicts: # docs/build.md # docs/ops.md # docs/ops/WebGPU.csv # ggml/src/ggml-sycl/dequantize.hpp # ggml/src/ggml-sycl/dmmv.cpp # ggml/src/ggml-sycl/ggml-sycl.cpp # ggml/src/ggml-sycl/mmvq.cpp # ggml/src/ggml-sycl/quants.hpp # ggml/src/ggml-sycl/vecdotq.hpp # ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp # ggml/src/ggml-webgpu/ggml-webgpu.cpp # ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl # tests/test-backend-ops.cpp # tests/test-quantize-fns.cpp
This commit is contained in:
commit
5529748a01
42 changed files with 799 additions and 2231 deletions
|
|
@ -434,7 +434,8 @@ extern "C" {
|
|||
// GGML_TYPE_IQ4_NL_8_8 = 38,
|
||||
GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
|
||||
GGML_TYPE_NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale)
|
||||
GGML_TYPE_COUNT = 41,
|
||||
GGML_TYPE_Q1_0 = 41,
|
||||
GGML_TYPE_COUNT = 42,
|
||||
};
|
||||
|
||||
// precision
|
||||
|
|
@ -471,6 +472,7 @@ extern "C" {
|
|||
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_NVFP4 = 26, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_Q1_0 = 27, // except 1d tensors
|
||||
};
|
||||
|
||||
// available tensor operations:
|
||||
|
|
@ -918,15 +920,17 @@ extern "C" {
|
|||
struct ggml_tensor * b,
|
||||
struct ggml_tensor * ids);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_add1(
|
||||
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_add1(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
struct ggml_tensor * b),
|
||||
"use ggml_add instead");
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_add1_inplace(
|
||||
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_add1_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
struct ggml_tensor * b),
|
||||
"use ggml_add_inplace instead");
|
||||
|
||||
// dst = a
|
||||
// view(dst, nb1, nb2, nb3, offset) += b
|
||||
|
|
|
|||
|
|
@ -93,6 +93,10 @@ typedef sycl::half2 ggml_half2;
|
|||
// QR = QK / number of values before dequantization
|
||||
// QI = number of 32 bit integers before dequantization
|
||||
|
||||
#define QI1_0 (QK1_0 / 32)
|
||||
#define QR1_0 1
|
||||
|
||||
|
||||
#define QI4_0 (QK4_0 / (4 * QR4_0))
|
||||
#define QR4_0 2
|
||||
|
||||
|
|
@ -170,6 +174,13 @@ typedef sycl::half2 ggml_half2;
|
|||
#define GGML_EXTENSION __extension__
|
||||
#endif // _MSC_VER
|
||||
|
||||
#define QK1_0 128
|
||||
typedef struct {
|
||||
ggml_half d; // delta
|
||||
uint8_t qs[QK1_0 / 8]; // bits / quants
|
||||
} block_q1_0;
|
||||
static_assert(sizeof(block_q1_0) == sizeof(ggml_half) + QK1_0 / 8, "wrong q1_0 block size/padding");
|
||||
|
||||
#define QK4_0 32
|
||||
typedef struct {
|
||||
ggml_half d; // delta
|
||||
|
|
|
|||
|
|
@ -16,6 +16,7 @@
|
|||
#define ggml_vec_dot_q8_0_q8_0_generic ggml_vec_dot_q8_0_q8_0
|
||||
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
|
||||
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
|
||||
#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
|
||||
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
|
||||
|
|
@ -72,6 +73,7 @@
|
|||
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
|
||||
// quants.c
|
||||
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
|
||||
#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
|
||||
|
|
@ -100,6 +102,7 @@
|
|||
// quants.c
|
||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
|
||||
#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
|
||||
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||
|
|
@ -142,6 +145,7 @@
|
|||
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
|
||||
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
|
||||
#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||
|
|
@ -176,6 +180,7 @@
|
|||
#elif defined(__riscv)
|
||||
// quants.c
|
||||
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
|
||||
#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||
|
|
@ -209,6 +214,7 @@
|
|||
// quants.c
|
||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
|
||||
#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
|
||||
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
|
||||
|
|
@ -266,6 +272,7 @@
|
|||
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
|
||||
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
|
||||
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
|
||||
#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||
|
|
|
|||
|
|
@ -137,6 +137,109 @@ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
|
|||
|
||||
//===================================== Dot products =================================
|
||||
|
||||
void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
const int qk = QK1_0; // 128
|
||||
const int nb = n / qk;
|
||||
|
||||
assert(n % qk == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
UNUSED(by);
|
||||
UNUSED(bs);
|
||||
|
||||
const block_q1_0 * GGML_RESTRICT x = vx;
|
||||
const block_q8_0 * GGML_RESTRICT y = vy;
|
||||
|
||||
float sumf = 0.0f;
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
float32x4_t sumv = vdupq_n_f32(0.0f);
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const float d0 = GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
|
||||
// Process 4 Q8_0 blocks (each has 32 elements)
|
||||
for (int k = 0; k < 4; k++) {
|
||||
const block_q8_0 * GGML_RESTRICT yb = &y[i * 4 + k];
|
||||
const float d1 = GGML_CPU_FP16_TO_FP32(yb->d);
|
||||
|
||||
// Get the 4 bytes of bits for this Q8_0 block (32 bits = 4 bytes)
|
||||
// Bits are at offset k*4 bytes in x[i].qs
|
||||
const uint8_t * bits = &x[i].qs[k * 4];
|
||||
|
||||
// Load 32 int8 values from y
|
||||
const int8x16_t y0 = vld1q_s8(yb->qs);
|
||||
const int8x16_t y1 = vld1q_s8(yb->qs + 16);
|
||||
|
||||
// Byte 0-1: bits for y0[0..15]
|
||||
const uint64_t expand0 = table_b2b_0[bits[0]];
|
||||
const uint64_t expand1 = table_b2b_0[bits[1]];
|
||||
// Byte 2-3: bits for y1[0..15]
|
||||
const uint64_t expand2 = table_b2b_0[bits[2]];
|
||||
const uint64_t expand3 = table_b2b_0[bits[3]];
|
||||
|
||||
// Build the sign vectors by reinterpreting the table values
|
||||
uint8x8_t e0 = vcreate_u8(expand0);
|
||||
uint8x8_t e1 = vcreate_u8(expand1);
|
||||
uint8x8_t e2 = vcreate_u8(expand2);
|
||||
uint8x8_t e3 = vcreate_u8(expand3);
|
||||
|
||||
// Shift right by 4 to get 0 or 1
|
||||
int8x8_t s0 = vreinterpret_s8_u8(vshr_n_u8(e0, 4));
|
||||
int8x8_t s1 = vreinterpret_s8_u8(vshr_n_u8(e1, 4));
|
||||
int8x8_t s2 = vreinterpret_s8_u8(vshr_n_u8(e2, 4));
|
||||
int8x8_t s3 = vreinterpret_s8_u8(vshr_n_u8(e3, 4));
|
||||
|
||||
// Convert 0/1 to -1/+1: sign = 2*val - 1
|
||||
int8x8_t one = vdup_n_s8(1);
|
||||
s0 = vsub_s8(vadd_s8(s0, s0), one); // 2*s0 - 1
|
||||
s1 = vsub_s8(vadd_s8(s1, s1), one);
|
||||
s2 = vsub_s8(vadd_s8(s2, s2), one);
|
||||
s3 = vsub_s8(vadd_s8(s3, s3), one);
|
||||
|
||||
// Combine into 16-element vectors
|
||||
int8x16_t signs0 = vcombine_s8(s0, s1);
|
||||
int8x16_t signs1 = vcombine_s8(s2, s3);
|
||||
|
||||
// Multiply signs with y values and accumulate
|
||||
// dot(signs, y) where signs are +1/-1
|
||||
int32x4_t p0 = ggml_vdotq_s32(vdupq_n_s32(0), signs0, y0);
|
||||
int32x4_t p1 = ggml_vdotq_s32(p0, signs1, y1);
|
||||
|
||||
// Scale by d1 and accumulate
|
||||
sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(p1), d0 * d1);
|
||||
}
|
||||
}
|
||||
|
||||
sumf = vaddvq_f32(sumv);
|
||||
#else
|
||||
// Scalar fallback
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const float d0 = GGML_FP16_TO_FP32(x[i].d);
|
||||
|
||||
// Process 4 Q8_0 blocks
|
||||
for (int k = 0; k < 4; k++) {
|
||||
const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d);
|
||||
|
||||
int sumi = 0;
|
||||
for (int j = 0; j < QK8_0; j++) {
|
||||
const int bit_index = k * QK8_0 + j;
|
||||
const int byte_index = bit_index / 8;
|
||||
const int bit_offset = bit_index % 8;
|
||||
|
||||
const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
|
||||
sumi += xi * y[i*4 + k].qs[j];
|
||||
}
|
||||
sumf += d0 * d1 * sumi;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
*s = sumf;
|
||||
}
|
||||
|
||||
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
|
|
|||
|
|
@ -2156,4 +2156,3 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -2302,4 +2302,3 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1463,4 +1463,3 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1218,4 +1218,3 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -218,6 +218,12 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
|||
.vec_dot_type = GGML_TYPE_F16,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q1_0] = {
|
||||
.from_float = quantize_row_q1_0,
|
||||
.vec_dot = ggml_vec_dot_q1_0_q8_0,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q4_0] = {
|
||||
.from_float = quantize_row_q4_0,
|
||||
.vec_dot = ggml_vec_dot_q4_0_q8_0,
|
||||
|
|
|
|||
|
|
@ -4829,6 +4829,7 @@ void ggml_compute_forward_get_rows(
|
|||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_Q1_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q5_0:
|
||||
|
|
@ -5554,6 +5555,7 @@ void ggml_compute_forward_clamp(
|
|||
ggml_compute_forward_clamp_f16(params, dst);
|
||||
} break;
|
||||
case GGML_TYPE_BF16:
|
||||
case GGML_TYPE_Q1_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q5_0:
|
||||
|
|
|
|||
|
|
@ -22,6 +22,10 @@
|
|||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
void quantize_row_q1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
|
||||
quantize_row_q1_0_ref(x, y, k);
|
||||
}
|
||||
|
||||
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
|
||||
quantize_row_q4_0_ref(x, y, k);
|
||||
}
|
||||
|
|
@ -116,6 +120,51 @@ void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRI
|
|||
|
||||
//===================================== Dot products =================================
|
||||
|
||||
void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
const int qk = QK1_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
assert(n % qk == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
UNUSED(by);
|
||||
UNUSED(bs);
|
||||
|
||||
const block_q1_0 * GGML_RESTRICT x = vx;
|
||||
const block_q8_0 * GGML_RESTRICT y = vy;
|
||||
|
||||
float sumf = 0.0;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const float d0 = GGML_FP16_TO_FP32(x[i].d);
|
||||
|
||||
float sumi = 0.0f;
|
||||
|
||||
for (int k = 0; k < 4; k++) {
|
||||
const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d);
|
||||
|
||||
int sumi_block = 0;
|
||||
|
||||
for (int j = 0; j < QK8_0; j++) {
|
||||
const int bit_index = k * QK8_0 + j;
|
||||
const int byte_index = bit_index / 8;
|
||||
const int bit_offset = bit_index % 8;
|
||||
|
||||
const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
|
||||
sumi_block += xi * y[i*4 + k].qs[j];
|
||||
}
|
||||
|
||||
sumi += d1 * sumi_block;
|
||||
}
|
||||
|
||||
sumf += d0 * sumi;
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
}
|
||||
|
||||
|
||||
void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@ extern "C" {
|
|||
#endif
|
||||
|
||||
// Quantization
|
||||
void quantize_row_q1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
|
@ -36,6 +37,7 @@ void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
|||
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
// Dot product
|
||||
void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
|
@ -68,6 +70,7 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
|
|||
void quantize_row_q8_0_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void quantize_row_q8_1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
|
|
|||
|
|
@ -65,7 +65,7 @@
|
|||
#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue
|
||||
#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a
|
||||
#define GGML_CUDA_CC_CDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
|
||||
#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum acc register renameing
|
||||
#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x90a) // MI210 (gfx90a), minimum acc register renaming
|
||||
#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300
|
||||
|
||||
// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
|
||||
|
|
|
|||
|
|
@ -3324,6 +3324,71 @@ static bool ggml_cuda_topk_moe_fusion(const struct ggml_cgraph * cgraph, int nod
|
|||
return true;
|
||||
}
|
||||
|
||||
// returns whether the write (out) nodes overwrite the read nodes in operation
|
||||
static bool ggml_cuda_check_fusion_memory_ranges(const ggml_cgraph * cgraph,
|
||||
const int node_idx,
|
||||
const int node_count,
|
||||
const int * out_nodes,
|
||||
const int out_count,
|
||||
const bool is_topk_moe = false) {
|
||||
auto nodes_overlap = [&](const ggml_tensor * a, const ggml_tensor * b) {
|
||||
const int64_t a_start = (int64_t) a->data;
|
||||
const int64_t a_end = a_start + ggml_backend_buft_get_alloc_size(a->buffer->buft, a);
|
||||
|
||||
const int64_t b_start = (int64_t) b->data;
|
||||
const int64_t b_end = b_start + ggml_backend_buft_get_alloc_size(b->buffer->buft, b);
|
||||
|
||||
if ((b_start <= a_start && a_start < b_end) || (a_start <= b_start && b_start < a_end)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
};
|
||||
|
||||
bool is_ok = true;
|
||||
// exception for topk-moe, as each row is read entirely before writing
|
||||
if (ggml_nrows(cgraph->nodes[node_idx]) == 1 && is_topk_moe) {
|
||||
return true;
|
||||
}
|
||||
|
||||
for (int i = 0; i < out_count; ++i) {
|
||||
const ggml_tensor * dst = cgraph->nodes[out_nodes[i]];
|
||||
|
||||
for (int j = node_idx; j < node_idx + node_count; ++j) {
|
||||
// Loop over all srcs of all nodes in the fusion. If the src overlaps
|
||||
// the destination and the src is not an intermediate node that's being
|
||||
// elided, then disable fusion.
|
||||
|
||||
for (int src_idx = 0; src_idx < GGML_MAX_SRC; ++src_idx) {
|
||||
const ggml_tensor * src = cgraph->nodes[j]->src[src_idx];
|
||||
|
||||
if (!src || src->op == GGML_OP_NONE) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (nodes_overlap(dst, src)) {
|
||||
bool found = false;
|
||||
|
||||
for (int k = node_idx; k < j; ++k) {
|
||||
if (cgraph->nodes[k] == src) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!found) {
|
||||
is_ok = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return is_ok;
|
||||
}
|
||||
|
||||
|
||||
static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph,
|
||||
int node_idx,
|
||||
std::initializer_list<enum ggml_op> ops,
|
||||
|
|
@ -3353,7 +3418,8 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph,
|
|||
const ggml_tensor * glu = cgraph->nodes[node_idx + 4];
|
||||
|
||||
if (ggml_cuda_should_fuse_mul_mat(ffn_up, ffn_gate, glu, ffn_up_bias, ffn_gate_bias)) {
|
||||
return true;
|
||||
int out_nodes[] = { node_idx + 4 };
|
||||
return ggml_cuda_check_fusion_memory_ranges(cgraph, node_idx, (int)ops.size(), out_nodes, 1);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -3364,7 +3430,8 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph,
|
|||
const ggml_tensor * glu = cgraph->nodes[node_idx + 2];
|
||||
|
||||
if (ggml_cuda_should_fuse_mul_mat(ffn_up, ffn_gate, glu)) {
|
||||
return true;
|
||||
int out_nodes[] = { node_idx + 2 };
|
||||
return ggml_cuda_check_fusion_memory_ranges(cgraph, node_idx, (int)ops.size(), out_nodes, 1);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -3490,69 +3557,6 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph,
|
|||
return false;
|
||||
}
|
||||
|
||||
// returns whether the write (out) nodes overwrite the read nodes in operation
|
||||
static bool ggml_cuda_check_fusion_memory_ranges(ggml_cgraph * cgraph,
|
||||
int node_idx,
|
||||
int node_count,
|
||||
int * out_nodes,
|
||||
int out_count) {
|
||||
auto nodes_overlap = [&](const ggml_tensor * a, const ggml_tensor * b) {
|
||||
const int64_t a_start = (int64_t) a->data;
|
||||
const int64_t a_end = a_start + ggml_nbytes(a);
|
||||
|
||||
const int64_t b_start = (int64_t) b->data;
|
||||
const int64_t b_end = b_start + ggml_nbytes(b);
|
||||
|
||||
if ((b_start <= a_start && a_start < b_end) || (a_start <= b_start && b_start < a_end)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
};
|
||||
|
||||
bool is_ok = true;
|
||||
// for nrows=1, all fusion operations correctly read the src before writing dst or do it elementwise, so we should be ok
|
||||
if (ggml_nrows(cgraph->nodes[node_idx]) == 1) {
|
||||
return true;
|
||||
}
|
||||
|
||||
for (int i = 0; i < out_count; ++i) {
|
||||
const ggml_tensor * dst = cgraph->nodes[out_nodes[i]];
|
||||
|
||||
for (int j = node_idx; j < node_idx + node_count; ++j) {
|
||||
// Loop over all srcs of all nodes in the fusion. If the src overlaps
|
||||
// the destination and the src is not an intermediate node that's being
|
||||
// elided, then disable fusion.
|
||||
|
||||
for (int src_idx = 0; src_idx < GGML_MAX_SRC; ++src_idx) {
|
||||
const ggml_tensor * src = cgraph->nodes[j]->src[src_idx];
|
||||
|
||||
if (!src || src->op == GGML_OP_NONE) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (nodes_overlap(dst, src)) {
|
||||
bool found = false;
|
||||
|
||||
for (int k = node_idx; k < j; ++k) {
|
||||
if (cgraph->nodes[k] == src) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!found) {
|
||||
is_ok = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return is_ok;
|
||||
}
|
||||
|
||||
static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, const bool use_cuda_graph, const bool cuda_graph_update_required, const void * graph_key) {
|
||||
bool graph_evaluated_or_captured = false;
|
||||
|
||||
|
|
@ -3750,7 +3754,7 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
|
|||
|
||||
if (ggml_can_fuse_subgraph(cgraph, i, ops.size(), ops.data(), out_nodes, 2) &&
|
||||
ggml_cuda_should_use_topk_moe(node, logits, weights, ids) &&
|
||||
ggml_cuda_check_fusion_memory_ranges(cgraph, i, ops.size(), out_nodes, 2)) {
|
||||
ggml_cuda_check_fusion_memory_ranges(cgraph, i, ops.size(), out_nodes, 2, /*is_topk_moe=*/ true)) {
|
||||
ggml_cuda_op_topk_moe(*cuda_ctx, logits, weights, ids, clamp, scale, bias, args);
|
||||
i += ops.size() - 1;
|
||||
continue;
|
||||
|
|
@ -3766,7 +3770,7 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
|
|||
int out_nodes[2] = { i + 1, i + 5 };
|
||||
if (ggml_can_fuse_subgraph(cgraph, i, ops.size(), ops.data(), out_nodes, 2) &&
|
||||
ggml_cuda_should_use_topk_moe(softmax, logits, weights, ids) &&
|
||||
ggml_cuda_check_fusion_memory_ranges(cgraph, i, ops.size(), out_nodes, 2)) {
|
||||
ggml_cuda_check_fusion_memory_ranges(cgraph, i, ops.size(), out_nodes, 2, /*is_topk_moe=*/ true)) {
|
||||
ggml_cuda_op_topk_moe(*cuda_ctx, logits, weights, ids, clamp, scale, bias, args);
|
||||
i += ops.size() - 1;
|
||||
continue;
|
||||
|
|
|
|||
|
|
@ -32,6 +32,41 @@ static inline int best_index_int8(int n, const int8_t * val, float x) {
|
|||
return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
|
||||
}
|
||||
|
||||
// reference implementation for deterministic creation of model files
|
||||
void quantize_row_q1_0_ref(const float * GGML_RESTRICT x, block_q1_0 * GGML_RESTRICT y, int64_t k) {
|
||||
static const int qk = QK1_0;
|
||||
|
||||
assert(k % qk == 0);
|
||||
|
||||
const int nb = k / qk;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float sum_abs = 0.0f;
|
||||
for (int j = 0; j < qk; j++) {
|
||||
sum_abs += fabsf(x[i*qk + j]);
|
||||
}
|
||||
const float d = sum_abs / qk;
|
||||
|
||||
y[i].d = GGML_FP32_TO_FP16(d);
|
||||
|
||||
// Clear all bits first
|
||||
for (int j = 0; j < qk / 8; ++j) {
|
||||
y[i].qs[j] = 0;
|
||||
}
|
||||
|
||||
// Just store sign of each weight directly (no normalization)
|
||||
for (int j = 0; j < qk; ++j) {
|
||||
const int bit_index = j;
|
||||
const int byte_index = bit_index / 8;
|
||||
const int bit_offset = bit_index % 8;
|
||||
|
||||
if (x[i*qk + j] >= 0.0f) {
|
||||
y[i].qs[byte_index] |= (1 << bit_offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// reference implementation for deterministic creation of model files
|
||||
void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k) {
|
||||
static const int qk = QK4_0;
|
||||
|
|
@ -339,6 +374,26 @@ void quantize_row_nvfp4_ref(const float * GGML_RESTRICT x, block_nvfp4 * GGML_RE
|
|||
}
|
||||
}
|
||||
|
||||
void dequantize_row_q1_0(const block_q1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
||||
static const int qk = QK1_0;
|
||||
|
||||
assert(k % qk == 0);
|
||||
|
||||
const int nb = k / qk;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d);
|
||||
const float neg_d = -d;
|
||||
|
||||
for (int j = 0; j < qk; ++j) {
|
||||
const int byte_index = j / 8;
|
||||
const int bit_offset = j % 8;
|
||||
const uint8_t bit = (x[i].qs[byte_index] >> bit_offset) & 1;
|
||||
y[i*qk + j] = bit ? d : neg_d;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
||||
static const int qk = QK4_0;
|
||||
|
||||
|
|
@ -1978,6 +2033,22 @@ static void quantize_row_q4_0_impl(const float * GGML_RESTRICT x, block_q4_0 * G
|
|||
}
|
||||
}
|
||||
|
||||
size_t quantize_q1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
if (!quant_weights) {
|
||||
quantize_row_q1_0_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
return nrow * ggml_row_size(GGML_TYPE_Q1_0, n_per_row);
|
||||
}
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q1_0, n_per_row);
|
||||
char * qrow = (char *)dst;
|
||||
for (int64_t row = 0; row < nrow; ++row) {
|
||||
quantize_row_q1_0_ref(src, (block_q1_0*)qrow, n_per_row);
|
||||
src += n_per_row;
|
||||
qrow += row_size;
|
||||
}
|
||||
return nrow * row_size;
|
||||
}
|
||||
|
||||
|
||||
size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
if (!quant_weights) {
|
||||
quantize_row_q4_0_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
|
|
@ -5286,6 +5357,10 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
|
|||
}
|
||||
}
|
||||
} break;
|
||||
case GGML_TYPE_Q1_0:
|
||||
{
|
||||
VALIDATE_ROW_DATA_D_F16_IMPL(block_q1_0, data, nb);
|
||||
} break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
{
|
||||
VALIDATE_ROW_DATA_D_F16_IMPL(block_q4_0, data, nb);
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ extern "C" {
|
|||
// NOTE: these functions are defined as GGML_API because they used by the CPU backend
|
||||
|
||||
// Quantization
|
||||
GGML_API void quantize_row_q1_0_ref(const float * GGML_RESTRICT x, block_q1_0 * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
|
||||
|
|
@ -41,6 +42,7 @@ GGML_API void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_
|
|||
GGML_API void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
// Dequantization
|
||||
GGML_API void dequantize_row_q1_0(const block_q1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
|
|
@ -90,6 +92,7 @@ GGML_API size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTR
|
|||
GGML_API size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
GGML_API size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
GGML_API size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
GGML_API size_t quantize_q1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
GGML_API size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
GGML_API size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
GGML_API size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
|
|
|||
|
|
@ -3463,11 +3463,19 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
CREATE_FA(GGML_TYPE_F16, f16, FA_SCALAR, )
|
||||
CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_SCALAR, )
|
||||
CREATE_FA(GGML_TYPE_Q8_0, q8_0, FA_SCALAR, )
|
||||
CREATE_FA(GGML_TYPE_Q4_1, q4_1, FA_SCALAR, )
|
||||
CREATE_FA(GGML_TYPE_Q5_0, q5_0, FA_SCALAR, )
|
||||
CREATE_FA(GGML_TYPE_Q5_1, q5_1, FA_SCALAR, )
|
||||
CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl, FA_SCALAR, )
|
||||
} else {
|
||||
CREATE_FA(GGML_TYPE_F32, f32, FA_SCALAR, _fp32)
|
||||
CREATE_FA(GGML_TYPE_F16, f16, FA_SCALAR, _fp32)
|
||||
CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_SCALAR, _fp32)
|
||||
CREATE_FA(GGML_TYPE_Q8_0, q8_0, FA_SCALAR, _fp32)
|
||||
CREATE_FA(GGML_TYPE_Q4_1, q4_1, FA_SCALAR, _fp32)
|
||||
CREATE_FA(GGML_TYPE_Q5_0, q5_0, FA_SCALAR, _fp32)
|
||||
CREATE_FA(GGML_TYPE_Q5_1, q5_1, FA_SCALAR, _fp32)
|
||||
CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl, FA_SCALAR, _fp32)
|
||||
}
|
||||
#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
|
||||
if (device->coopmat1_fa_support) {
|
||||
|
|
@ -3475,6 +3483,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
CREATE_FA(GGML_TYPE_F16, f16, FA_COOPMAT1, _cm1)
|
||||
CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_COOPMAT1, _cm1)
|
||||
CREATE_FA(GGML_TYPE_Q8_0, q8_0, FA_COOPMAT1, _cm1)
|
||||
CREATE_FA(GGML_TYPE_Q4_1, q4_1, FA_COOPMAT1, _cm1)
|
||||
CREATE_FA(GGML_TYPE_Q5_0, q5_0, FA_COOPMAT1, _cm1)
|
||||
CREATE_FA(GGML_TYPE_Q5_1, q5_1, FA_COOPMAT1, _cm1)
|
||||
CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl, FA_COOPMAT1, _cm1)
|
||||
}
|
||||
#endif
|
||||
#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
||||
|
|
@ -15369,11 +15381,12 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
|||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
// supported in scalar and coopmat2 paths
|
||||
break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
// supported in scalar and coopmat2 paths
|
||||
break;
|
||||
// K dequants currently disabled because D dimension is rounded up to 256 and runs inefficiently
|
||||
//case GGML_TYPE_Q2_K:
|
||||
//case GGML_TYPE_Q3_K:
|
||||
|
|
@ -15388,12 +15401,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
|||
//case GGML_TYPE_IQ3_XXS:
|
||||
//case GGML_TYPE_IQ3_S:
|
||||
//case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
// currently supported only in coopmat2 path
|
||||
if (!coopmat2) {
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -110,6 +110,97 @@ FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
|
|||
|
||||
#if defined(DATA_A_Q4_0)
|
||||
#define BLOCK_BYTE_SIZE 18
|
||||
#elif defined(DATA_A_Q4_1)
|
||||
#define BLOCK_BYTE_SIZE 20
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_Q4_0) || defined(DATA_A_Q4_1)
|
||||
FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
|
||||
if (binding_idx == BINDING_IDX_K) {
|
||||
uint vui_lo = uint(k_packed.k_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]);
|
||||
uint vui_hi = uint(k_packed.k_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]);
|
||||
uint shift = (iqs & 0x10) >> 2;
|
||||
vui_lo >>= shift;
|
||||
vui_hi >>= shift;
|
||||
|
||||
FLOAT_TYPEV4 nibbles = FLOAT_TYPEV4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF);
|
||||
#ifdef DATA_A_Q4_1
|
||||
return FLOAT_TYPE(k_packed.k_data_packed16[a_offset + ib].d) * nibbles + FLOAT_TYPE(k_packed.k_data_packed16[a_offset + ib].m);
|
||||
#else
|
||||
return FLOAT_TYPE(k_packed.k_data_packed16[a_offset + ib].d) * (nibbles - FLOAT_TYPE(8.0f));
|
||||
#endif
|
||||
} else {
|
||||
uint vui_lo = uint(v_packed.v_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]);
|
||||
uint vui_hi = uint(v_packed.v_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]);
|
||||
uint shift = (iqs & 0x10) >> 2;
|
||||
vui_lo >>= shift;
|
||||
vui_hi >>= shift;
|
||||
|
||||
FLOAT_TYPEV4 nibbles = FLOAT_TYPEV4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF);
|
||||
#ifdef DATA_A_Q4_1
|
||||
return FLOAT_TYPE(v_packed.v_data_packed16[a_offset + ib].d) * nibbles + FLOAT_TYPE(v_packed.v_data_packed16[a_offset + ib].m);
|
||||
#else
|
||||
return FLOAT_TYPE(v_packed.v_data_packed16[a_offset + ib].d) * (nibbles - FLOAT_TYPE(8.0f));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_Q5_0)
|
||||
#define BLOCK_BYTE_SIZE 22
|
||||
#elif defined(DATA_A_Q5_1)
|
||||
#define BLOCK_BYTE_SIZE 24
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_Q5_0) || defined(DATA_A_Q5_1)
|
||||
FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
|
||||
if (binding_idx == BINDING_IDX_K) {
|
||||
uint vui_lo = uint(k_packed.k_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]);
|
||||
uint vui_hi = uint(k_packed.k_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]);
|
||||
uint shift = (iqs & 0x10) >> 2;
|
||||
vui_lo >>= shift;
|
||||
vui_hi >>= shift;
|
||||
|
||||
#ifdef DATA_A_Q5_1
|
||||
uint qh = k_packed.k_data_packed16[a_offset + ib].qh;
|
||||
#else
|
||||
uint qh = uint(k_packed.k_data_packed16[a_offset + ib].qh[0]) | (uint(k_packed.k_data_packed16[a_offset + ib].qh[1]) << 16);
|
||||
#endif
|
||||
FLOAT_TYPEV4 hb = FLOAT_TYPEV4((qh >> iqs) & 1, (qh >> (iqs + 1)) & 1, (qh >> (iqs + 2)) & 1, (qh >> (iqs + 3)) & 1) * FLOAT_TYPE(16.0f);
|
||||
|
||||
FLOAT_TYPEV4 nibbles = FLOAT_TYPEV4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF);
|
||||
#ifdef DATA_A_Q5_1
|
||||
return FLOAT_TYPE(k_packed.k_data_packed16[a_offset + ib].d) * (nibbles + hb) + FLOAT_TYPE(k_packed.k_data_packed16[a_offset + ib].m);
|
||||
#else
|
||||
return FLOAT_TYPE(k_packed.k_data_packed16[a_offset + ib].d) * (nibbles + hb - FLOAT_TYPE(16.0f));
|
||||
#endif
|
||||
} else {
|
||||
uint vui_lo = uint(v_packed.v_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]);
|
||||
uint vui_hi = uint(v_packed.v_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]);
|
||||
uint shift = (iqs & 0x10) >> 2;
|
||||
vui_lo >>= shift;
|
||||
vui_hi >>= shift;
|
||||
|
||||
#ifdef DATA_A_Q5_1
|
||||
uint qh = v_packed.v_data_packed16[a_offset + ib].qh;
|
||||
#else
|
||||
uint qh = uint(v_packed.v_data_packed16[a_offset + ib].qh[0]) | (uint(v_packed.v_data_packed16[a_offset + ib].qh[1]) << 16);
|
||||
#endif
|
||||
FLOAT_TYPEV4 hb = FLOAT_TYPEV4((qh >> iqs) & 1, (qh >> (iqs + 1)) & 1, (qh >> (iqs + 2)) & 1, (qh >> (iqs + 3)) & 1) * FLOAT_TYPE(16.0f);
|
||||
|
||||
FLOAT_TYPEV4 nibbles = FLOAT_TYPEV4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF);
|
||||
#ifdef DATA_A_Q5_1
|
||||
return FLOAT_TYPE(v_packed.v_data_packed16[a_offset + ib].d) * (nibbles + hb) + FLOAT_TYPE(v_packed.v_data_packed16[a_offset + ib].m);
|
||||
#else
|
||||
return FLOAT_TYPE(v_packed.v_data_packed16[a_offset + ib].d) * (nibbles + hb - FLOAT_TYPE(16.0f));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(DATA_A_IQ4_NL)
|
||||
#define BLOCK_BYTE_SIZE 18
|
||||
|
||||
FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
|
||||
if (binding_idx == BINDING_IDX_K) {
|
||||
|
|
@ -119,7 +210,11 @@ FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
|
|||
vui_lo >>= shift;
|
||||
vui_hi >>= shift;
|
||||
|
||||
return FLOAT_TYPE(k_packed.k_data_packed16[a_offset + ib].d) * (FLOAT_TYPEV4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF) - FLOAT_TYPE(8.0f));
|
||||
return FLOAT_TYPE(k_packed.k_data_packed16[a_offset + ib].d) * FLOAT_TYPEV4(
|
||||
kvalues_iq4nl[vui_lo & 0xF],
|
||||
kvalues_iq4nl[(vui_lo >> 8) & 0xF],
|
||||
kvalues_iq4nl[vui_hi & 0xF],
|
||||
kvalues_iq4nl[(vui_hi >> 8) & 0xF]);
|
||||
} else {
|
||||
uint vui_lo = uint(v_packed.v_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]);
|
||||
uint vui_hi = uint(v_packed.v_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]);
|
||||
|
|
@ -127,11 +222,14 @@ FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
|
|||
vui_lo >>= shift;
|
||||
vui_hi >>= shift;
|
||||
|
||||
return FLOAT_TYPE(v_packed.v_data_packed16[a_offset + ib].d) * (FLOAT_TYPEV4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF) - FLOAT_TYPE(8.0f));
|
||||
return FLOAT_TYPE(v_packed.v_data_packed16[a_offset + ib].d) * FLOAT_TYPEV4(
|
||||
kvalues_iq4nl[vui_lo & 0xF],
|
||||
kvalues_iq4nl[(vui_lo >> 8) & 0xF],
|
||||
kvalues_iq4nl[vui_hi & 0xF],
|
||||
kvalues_iq4nl[(vui_hi >> 8) & 0xF]);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_Q8_0)
|
||||
#define BLOCK_BYTE_SIZE 34
|
||||
FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
|
||||
|
|
|
|||
|
|
@ -151,6 +151,7 @@ void execute_command(std::vector<std::string>& command, std::string& stdout_str,
|
|||
|
||||
pid_t pid = fork();
|
||||
if (pid < 0) {
|
||||
std::cerr << strerror(errno) << "\n";
|
||||
throw std::runtime_error("Failed to fork process");
|
||||
}
|
||||
|
||||
|
|
@ -672,7 +673,7 @@ void process_shaders() {
|
|||
if (tname == "f16") {
|
||||
string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm1.comp",
|
||||
merge_maps(fa_base_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}, {"COOPMAT", "1"}}), fp16, true, false, f16acc);
|
||||
} else if (tname == "q4_0" || tname == "q8_0" || tname == "f32") {
|
||||
} else if (tname == "q4_0" || tname == "q4_1" || tname == "q5_0" || tname == "q5_1" || tname == "iq4_nl" || tname == "q8_0" || tname == "f32") {
|
||||
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
|
||||
string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm1.comp",
|
||||
merge_maps(fa_base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname)}, {"COOPMAT", "1"}}), fp16, true, false, f16acc);
|
||||
|
|
@ -683,7 +684,7 @@ void process_shaders() {
|
|||
if (tname == "f16") {
|
||||
string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn.comp",
|
||||
merge_maps(fa_base_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}}), fp16, false, false, f16acc);
|
||||
} else if (tname == "q4_0" || tname == "q8_0" || tname == "f32") {
|
||||
} else if (tname == "q4_0" || tname == "q4_1" || tname == "q5_0" || tname == "q5_1" || tname == "iq4_nl" || tname == "q8_0" || tname == "f32") {
|
||||
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
|
||||
string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn.comp",
|
||||
merge_maps(fa_base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname) }}), fp16, false, false, f16acc);
|
||||
|
|
|
|||
|
|
@ -655,6 +655,14 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
|
|||
.to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
|
||||
.from_float_ref = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
||||
},
|
||||
[GGML_TYPE_Q1_0] = {
|
||||
.type_name = "q1_0",
|
||||
.blck_size = QK1_0,
|
||||
.type_size = sizeof(block_q1_0),
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_q1_0,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_q1_0_ref,
|
||||
},
|
||||
[GGML_TYPE_Q4_0] = {
|
||||
.type_name = "q4_0",
|
||||
.blck_size = QK4_0,
|
||||
|
|
@ -1400,6 +1408,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
|
|||
case GGML_FTYPE_MOSTLY_BF16: wtype = GGML_TYPE_BF16; break;
|
||||
case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break;
|
||||
case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break;
|
||||
case GGML_FTYPE_MOSTLY_Q1_0: wtype = GGML_TYPE_Q1_0; break;
|
||||
case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
|
||||
case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
|
||||
case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
|
||||
|
|
@ -7668,6 +7677,7 @@ size_t ggml_quantize_chunk(
|
|||
size_t result = 0;
|
||||
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q1_0: result = quantize_q1_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_Q4_0: result = quantize_q4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_Q4_1: result = quantize_q4_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_Q5_0: result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
|
|
|
|||
|
|
@ -3996,6 +3996,7 @@ class GGMLQuantizationType(IntEnum):
|
|||
TQ2_0 = 35
|
||||
MXFP4 = 39
|
||||
NVFP4 = 40
|
||||
Q1_0 = 41
|
||||
|
||||
|
||||
class ExpertGatingFuncType(IntEnum):
|
||||
|
|
@ -4049,6 +4050,7 @@ class LlamaFileType(IntEnum):
|
|||
MOSTLY_TQ2_0 = 37 # except 1d tensors
|
||||
MOSTLY_MXFP4_MOE = 38 # except 1d tensors
|
||||
MOSTLY_NVFP4 = 39 # except 1d tensors
|
||||
MOSTLY_Q1_0 = 40 # except 1d tensors
|
||||
|
||||
GUESSED = 1024 # not specified in the model file
|
||||
|
||||
|
|
@ -4161,6 +4163,7 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
|
|||
GGMLQuantizationType.TQ2_0: (256, 2 + 64),
|
||||
GGMLQuantizationType.MXFP4: (32, 1 + 16),
|
||||
GGMLQuantizationType.NVFP4: (64, 4 + 32),
|
||||
GGMLQuantizationType.Q1_0: (128, 2 + 16),
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -157,6 +157,7 @@ extern "C" {
|
|||
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_NVFP4 = 39, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q1_0 = 40, // except 1d tensors
|
||||
|
||||
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
||||
};
|
||||
|
|
|
|||
|
|
@ -121,7 +121,8 @@ enum sd_type_t {
|
|||
// SD_TYPE_IQ4_NL_8_8 = 38,
|
||||
SD_TYPE_MXFP4 = 39, // MXFP4 (1 block)
|
||||
SD_TYPE_NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale)
|
||||
SD_TYPE_COUNT = 41,
|
||||
SD_TYPE_Q1_0 = 41,
|
||||
SD_TYPE_COUNT = 42,
|
||||
};
|
||||
|
||||
enum sd_log_level_t {
|
||||
|
|
|
|||
2026
src/llama-arch.cpp
2026
src/llama-arch.cpp
File diff suppressed because it is too large
Load diff
|
|
@ -585,8 +585,6 @@ struct LLM_TN_IMPL {
|
|||
const int bid;
|
||||
const int xid;
|
||||
|
||||
const std::set<llm_tensor> model_tensors;
|
||||
|
||||
LLM_TN_IMPL(llm_arch arch, llm_tensor tensor, const char * suffix, int bid, int xid);
|
||||
|
||||
std::string str() const;
|
||||
|
|
|
|||
|
|
@ -36,6 +36,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
|||
case LLAMA_FTYPE_ALL_F32: return "all F32";
|
||||
case LLAMA_FTYPE_MOSTLY_F16: return "F16";
|
||||
case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
|
||||
case LLAMA_FTYPE_MOSTLY_Q1_0: return "Q1_0";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
|
||||
|
|
@ -759,6 +760,7 @@ llama_model_loader::llama_model_loader(
|
|||
case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
|
||||
case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
|
||||
case GGML_TYPE_NVFP4: ftype = LLAMA_FTYPE_MOSTLY_NVFP4; break;
|
||||
case GGML_TYPE_Q1_0: ftype = LLAMA_FTYPE_MOSTLY_Q1_0; break;
|
||||
default:
|
||||
{
|
||||
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
|
||||
|
|
|
|||
|
|
@ -801,6 +801,7 @@ ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
|
|||
case LLAMA_FTYPE_MOSTLY_F16: return GGML_TYPE_F16;
|
||||
case LLAMA_FTYPE_MOSTLY_BF16: return GGML_TYPE_BF16;
|
||||
case LLAMA_FTYPE_ALL_F32: return GGML_TYPE_F32;
|
||||
case LLAMA_FTYPE_MOSTLY_Q1_0: return GGML_TYPE_Q1_0;
|
||||
|
||||
case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return GGML_TYPE_MXFP4;
|
||||
|
||||
|
|
|
|||
139
src/unicode.cpp
139
src/unicode.cpp
|
|
@ -470,6 +470,141 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
|||
return bpe_offsets;
|
||||
}
|
||||
|
||||
// Qwen2 system regex: "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
|
||||
static std::vector<size_t> unicode_regex_split_custom_qwen2(const std::string & text, const std::vector<size_t> & offsets) {
|
||||
std::vector<size_t> bpe_offsets; // store the offset of each word
|
||||
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
|
||||
|
||||
const auto cpts = unicode_cpts_from_utf8(text);
|
||||
|
||||
size_t start = 0;
|
||||
for (auto offset : offsets) {
|
||||
const size_t offset_ini = start;
|
||||
const size_t offset_end = start + offset;
|
||||
assert(offset_end <= cpts.size());
|
||||
start = offset_end;
|
||||
|
||||
static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
|
||||
auto _get_cpt = [&] (const size_t pos) -> uint32_t {
|
||||
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
|
||||
};
|
||||
|
||||
auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
|
||||
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
|
||||
};
|
||||
|
||||
size_t _prev_end = offset_ini;
|
||||
auto _add_token = [&] (const size_t end) -> size_t {
|
||||
assert(_prev_end <= end && end <= offset_end);
|
||||
size_t len = end - _prev_end;
|
||||
if (len > 0) {
|
||||
bpe_offsets.push_back(len);
|
||||
}
|
||||
_prev_end = end;
|
||||
//if (len > 0) {
|
||||
// std::string s = "";
|
||||
// for(size_t p = end-len; p < end; p++)
|
||||
// s += unicode_cpt_to_utf8(cpts[p]);
|
||||
// printf(">>> '%s'\n", s.c_str());
|
||||
//}
|
||||
return len;
|
||||
};
|
||||
|
||||
for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
|
||||
const uint32_t cpt = _get_cpt(pos);
|
||||
const auto flags = _get_flags(pos);
|
||||
|
||||
// regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
|
||||
if (cpt == '\'' && pos+1 < offset_end) {
|
||||
uint32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
|
||||
if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
|
||||
pos += _add_token(pos+2);
|
||||
continue;
|
||||
}
|
||||
if (pos+2 < offset_end) {
|
||||
uint32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
|
||||
if ((cpt_next == 'r' && cpt_next_next == 'e') ||
|
||||
(cpt_next == 'v' && cpt_next_next == 'e') ||
|
||||
(cpt_next == 'l' && cpt_next_next == 'l')) {
|
||||
pos += _add_token(pos+3);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// regex: [^\r\n\p{L}\p{N}]?\p{L}+
|
||||
if (!(cpt == '\r' || cpt == '\n' || flags.is_number)) {
|
||||
if (flags.is_letter || _get_flags(pos+1).is_letter) { // one or more letters
|
||||
pos++;
|
||||
while (_get_flags(pos).is_letter) {
|
||||
pos++;
|
||||
}
|
||||
_add_token(pos);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// regex: \p{N}
|
||||
if (flags.is_number) {
|
||||
pos++;
|
||||
_add_token(pos);
|
||||
continue;
|
||||
}
|
||||
|
||||
// regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
|
||||
auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
|
||||
if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags.as_uint()) {
|
||||
pos += (cpt == ' ');
|
||||
while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
|
||||
flags2 = _get_flags(++pos);
|
||||
}
|
||||
uint32_t cpt2 = _get_cpt(pos);
|
||||
while (cpt2 == '\r' || cpt2 == '\n') {
|
||||
cpt2 = _get_cpt(++pos);
|
||||
}
|
||||
_add_token(pos);
|
||||
continue;
|
||||
}
|
||||
|
||||
size_t num_whitespaces = 0;
|
||||
size_t last_end_r_or_n = 0;
|
||||
while (_get_flags(pos+num_whitespaces).is_whitespace) {
|
||||
uint32_t cpt2 = _get_cpt(pos+num_whitespaces);
|
||||
if (cpt2 == '\r' || cpt2 == '\n') {
|
||||
last_end_r_or_n = pos + num_whitespaces + 1;
|
||||
}
|
||||
num_whitespaces++;
|
||||
}
|
||||
|
||||
// regex: \s*[\r\n]+
|
||||
if (last_end_r_or_n > 0) {
|
||||
pos = last_end_r_or_n;
|
||||
_add_token(pos);
|
||||
continue;
|
||||
}
|
||||
|
||||
// regex: \s+(?!\S)
|
||||
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) {
|
||||
pos += num_whitespaces - 1;
|
||||
_add_token(pos);
|
||||
continue;
|
||||
}
|
||||
|
||||
// regex: \s+
|
||||
if (num_whitespaces > 0) {
|
||||
pos += num_whitespaces;
|
||||
_add_token(pos);
|
||||
continue;
|
||||
}
|
||||
|
||||
// no matches
|
||||
_add_token(++pos);
|
||||
}
|
||||
}
|
||||
|
||||
return bpe_offsets;
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
static std::vector<size_t> unicode_regex_split_stl(const std::basic_string<CharT> & text, const std::basic_string<CharT> & regex, const std::vector<size_t> & offsets) {
|
||||
using BidirIt = typename std::basic_string<CharT>::const_iterator;
|
||||
|
|
@ -790,8 +925,10 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
|
|||
} else if (
|
||||
regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" ||
|
||||
regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
|
||||
|
||||
bpe_offsets = unicode_regex_split_custom_llama3(text, offsets);
|
||||
} else if (
|
||||
regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
|
||||
bpe_offsets = unicode_regex_split_custom_qwen2(text, offsets);
|
||||
} else if (regex_expr == "\\p{Han}+") {
|
||||
// K2's first pattern - handle all K2 patterns together
|
||||
bpe_offsets = unicode_regex_split_custom_kimi_k2(text, offsets);
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@ struct quant_option {
|
|||
};
|
||||
|
||||
static const std::vector<quant_option> QUANT_OPTIONS = {
|
||||
{ "Q1_0", LLAMA_FTYPE_MOSTLY_Q1_0, " 1.125 bpw quantization", },
|
||||
{ "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
|
||||
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", },
|
||||
{ "MXFP4_MOE",LLAMA_FTYPE_MOSTLY_MXFP4_MOE," MXFP4 MoE", },
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
|
@ -18,7 +18,7 @@
|
|||
<div style="display: contents">
|
||||
<script>
|
||||
{
|
||||
__sveltekit_1trm5n9 = {
|
||||
__sveltekit_10avopp = {
|
||||
base: new URL('.', location).pathname.slice(0, -1)
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -632,7 +632,7 @@ private:
|
|||
|
||||
// load the model and initialize llama_context
|
||||
// this may also be called to resume from sleeping state
|
||||
bool load_model(const common_params & params) {
|
||||
bool load_model(common_params & params) {
|
||||
bool is_resume = sleeping;
|
||||
|
||||
SRV_INF("loading model '%s'\n", params.model.path.c_str());
|
||||
|
|
@ -641,6 +641,9 @@ private:
|
|||
|
||||
llama_init = common_init_from_params(params_base);
|
||||
|
||||
// propagate model-metadata sampling defaults back to caller
|
||||
params.sampling = params_base.sampling;
|
||||
|
||||
model = llama_init->model();
|
||||
ctx = llama_init->context();
|
||||
|
||||
|
|
@ -2404,7 +2407,7 @@ private:
|
|||
// guarantee that a checkpoint will result in at least one token being processed [TAG_PROMPT_LOGITS]
|
||||
LOG_INF("slot %12.*s: id %2d | task %d | Checking checkpoint with [%d, %d] against %d...\n", 12,
|
||||
func_name, (slot).id, ((slot).task ? (slot).task->id : -1), cur.pos_min, cur.pos_max, pos_min_thold);
|
||||
return cur.pos_min < pos_min_thold;
|
||||
return cur.pos_min < pos_min_thold || cur.pos_min == 0;
|
||||
}
|
||||
);
|
||||
|
||||
|
|
@ -2978,7 +2981,7 @@ private:
|
|||
server_context::server_context() : impl(new server_context_impl()) {}
|
||||
server_context::~server_context() = default;
|
||||
|
||||
bool server_context::load_model(const common_params & params) {
|
||||
bool server_context::load_model(common_params & params) {
|
||||
return impl->load_model(params);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -56,7 +56,7 @@ struct server_context {
|
|||
|
||||
// load the model and initialize llama_context
|
||||
// returns true on success
|
||||
bool load_model(const common_params & params);
|
||||
bool load_model(common_params & params);
|
||||
|
||||
// this function will block main thread until termination
|
||||
void start_loop();
|
||||
|
|
|
|||
8
tools/server/webui/package-lock.json
generated
8
tools/server/webui/package-lock.json
generated
|
|
@ -51,7 +51,6 @@
|
|||
"eslint-config-prettier": "^10.0.1",
|
||||
"eslint-plugin-storybook": "^10.2.4",
|
||||
"eslint-plugin-svelte": "^3.0.0",
|
||||
"fflate": "^0.8.2",
|
||||
"globals": "^16.0.0",
|
||||
"http-server": "^14.1.1",
|
||||
"mdast": "^3.0.0",
|
||||
|
|
@ -5051,13 +5050,6 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"node_modules/fflate": {
|
||||
"version": "0.8.2",
|
||||
"resolved": "https://registry.npmjs.org/fflate/-/fflate-0.8.2.tgz",
|
||||
"integrity": "sha512-cPJU47OaAoCbg0pBvzsgpTPhmhqI5eJjh/JIu8tPj5q+T7iLvW/JAYUqmE7KOB4R1ZyEhzBaIQpQpardBF5z8A==",
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/file-entry-cache": {
|
||||
"version": "8.0.0",
|
||||
"resolved": "https://registry.npmjs.org/file-entry-cache/-/file-entry-cache-8.0.0.tgz",
|
||||
|
|
|
|||
|
|
@ -33,7 +33,7 @@
|
|||
const showToolCallInProgress = $derived(config().showToolCallInProgress as boolean);
|
||||
const showThoughtInProgress = $derived(config().showThoughtInProgress as boolean);
|
||||
|
||||
const sections = $derived(deriveAgenticSections(message, toolMessages, []));
|
||||
const sections = $derived(deriveAgenticSections(message, toolMessages, [], isStreaming));
|
||||
|
||||
// Parse tool results with images
|
||||
const sectionsParsed = $derived(
|
||||
|
|
|
|||
|
|
@ -16,6 +16,7 @@
|
|||
import { rehypeEnhanceLinks } from '$lib/markdown/enhance-links';
|
||||
import { rehypeEnhanceCodeBlocks } from '$lib/markdown/enhance-code-blocks';
|
||||
import { rehypeResolveAttachmentImages } from '$lib/markdown/resolve-attachment-images';
|
||||
import { rehypeRtlSupport } from '$lib/markdown/rehype-rtl-support';
|
||||
import { remarkLiteralHtml } from '$lib/markdown/literal-html';
|
||||
import { copyCodeToClipboard, preprocessLaTeX, getImageErrorFallbackHtml } from '$lib/utils';
|
||||
import {
|
||||
|
|
@ -101,6 +102,7 @@
|
|||
.use(rehypeEnhanceLinks) // Add target="_blank" to links
|
||||
.use(rehypeEnhanceCodeBlocks) // Wrap code blocks with header and actions
|
||||
.use(rehypeResolveAttachmentImages, { attachments })
|
||||
.use(rehypeRtlSupport) // Add bidirectional text support
|
||||
.use(rehypeStringify, { allowDangerousHtml: true }); // Convert to HTML string
|
||||
});
|
||||
|
||||
|
|
@ -781,19 +783,19 @@
|
|||
/* Lists */
|
||||
div :global(ul) {
|
||||
list-style-type: disc;
|
||||
margin-left: 1.5rem;
|
||||
margin-inline-start: 1.5rem;
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
|
||||
div :global(ol) {
|
||||
list-style-type: decimal;
|
||||
margin-left: 1.5rem;
|
||||
margin-inline-start: 1.5rem;
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
|
||||
div :global(li) {
|
||||
margin-bottom: 0.25rem;
|
||||
padding-left: 0.5rem;
|
||||
padding-inline-start: 0.5rem;
|
||||
}
|
||||
|
||||
div :global(li::marker) {
|
||||
|
|
@ -816,8 +818,8 @@
|
|||
/* Task lists */
|
||||
div :global(.task-list-item) {
|
||||
list-style: none;
|
||||
margin-left: 0;
|
||||
padding-left: 0;
|
||||
margin-inline-start: 0;
|
||||
padding-inline-start: 0;
|
||||
}
|
||||
|
||||
div :global(.task-list-item-checkbox) {
|
||||
|
|
|
|||
28
tools/server/webui/src/lib/markdown/rehype-rtl-support.ts
Normal file
28
tools/server/webui/src/lib/markdown/rehype-rtl-support.ts
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
/**
|
||||
* Rehype plugin to provide comprehensive RTL support by adding dir="auto"
|
||||
* to all text-containing elements.
|
||||
*
|
||||
* This operates directly on the HAST tree, ensuring that all elements
|
||||
* (including those not in a predefined list) receive the attribute.
|
||||
*/
|
||||
|
||||
import type { Plugin } from 'unified';
|
||||
import type { Root, Element } from 'hast';
|
||||
import { visit } from 'unist-util-visit';
|
||||
|
||||
/**
|
||||
* Rehype plugin to add dir="auto" to all elements that have children.
|
||||
* This provides bidirectional text support for mixed RTL/LTR content.
|
||||
*/
|
||||
export const rehypeRtlSupport: Plugin<[], Root> = () => {
|
||||
return (tree: Root) => {
|
||||
visit(tree, 'element', (node: Element) => {
|
||||
if (node.children && node.children.length > 0) {
|
||||
node.properties = {
|
||||
...node.properties,
|
||||
dir: 'auto'
|
||||
};
|
||||
}
|
||||
});
|
||||
};
|
||||
};
|
||||
|
|
@ -474,6 +474,7 @@ class AgenticStore {
|
|||
sessionMessages.push({
|
||||
role: MessageRole.ASSISTANT,
|
||||
content: turnContent || undefined,
|
||||
reasoning_content: turnReasoningContent || undefined,
|
||||
tool_calls: normalizedCalls
|
||||
});
|
||||
|
||||
|
|
|
|||
|
|
@ -41,6 +41,7 @@ export type AgenticMessage =
|
|||
| {
|
||||
role: MessageRole.ASSISTANT;
|
||||
content?: string | ApiChatMessageContentPart[];
|
||||
reasoning_content?: string;
|
||||
tool_calls?: AgenticToolCallPayload[];
|
||||
}
|
||||
| {
|
||||
|
|
|
|||
|
|
@ -38,14 +38,19 @@ export type ToolResultLine = {
|
|||
function deriveSingleTurnSections(
|
||||
message: DatabaseMessage,
|
||||
toolMessages: DatabaseMessage[] = [],
|
||||
streamingToolCalls: ApiChatCompletionToolCall[] = []
|
||||
streamingToolCalls: ApiChatCompletionToolCall[] = [],
|
||||
isStreaming: boolean = false
|
||||
): AgenticSection[] {
|
||||
const sections: AgenticSection[] = [];
|
||||
|
||||
// 1. Reasoning content (from dedicated field)
|
||||
if (message.reasoningContent) {
|
||||
const toolCalls = parseToolCalls(message.toolCalls);
|
||||
const hasContentAfterReasoning =
|
||||
!!message.content?.trim() || toolCalls.length > 0 || streamingToolCalls.length > 0;
|
||||
const isPending = isStreaming && !hasContentAfterReasoning;
|
||||
sections.push({
|
||||
type: AgenticSectionType.REASONING,
|
||||
type: isPending ? AgenticSectionType.REASONING_PENDING : AgenticSectionType.REASONING,
|
||||
content: message.reasoningContent
|
||||
});
|
||||
}
|
||||
|
|
@ -104,12 +109,13 @@ function deriveSingleTurnSections(
|
|||
export function deriveAgenticSections(
|
||||
message: DatabaseMessage,
|
||||
toolMessages: DatabaseMessage[] = [],
|
||||
streamingToolCalls: ApiChatCompletionToolCall[] = []
|
||||
streamingToolCalls: ApiChatCompletionToolCall[] = [],
|
||||
isStreaming: boolean = false
|
||||
): AgenticSection[] {
|
||||
const hasAssistantContinuations = toolMessages.some((m) => m.role === MessageRole.ASSISTANT);
|
||||
|
||||
if (!hasAssistantContinuations) {
|
||||
return deriveSingleTurnSections(message, toolMessages, streamingToolCalls);
|
||||
return deriveSingleTurnSections(message, toolMessages, streamingToolCalls, isStreaming);
|
||||
}
|
||||
|
||||
const sections: AgenticSection[] = [];
|
||||
|
|
@ -127,7 +133,12 @@ export function deriveAgenticSections(
|
|||
const isLastTurn = i + 1 + turnToolMsgs.length >= toolMessages.length;
|
||||
|
||||
sections.push(
|
||||
...deriveSingleTurnSections(msg, turnToolMsgs, isLastTurn ? streamingToolCalls : [])
|
||||
...deriveSingleTurnSections(
|
||||
msg,
|
||||
turnToolMsgs,
|
||||
isLastTurn ? streamingToolCalls : [],
|
||||
isLastTurn && isStreaming
|
||||
)
|
||||
);
|
||||
|
||||
i += 1 + turnToolMsgs.length;
|
||||
|
|
|
|||
|
|
@ -162,6 +162,36 @@ describe('deriveAgenticSections', () => {
|
|||
expect(sections[4].content).toBe('Here is the analysis.');
|
||||
});
|
||||
|
||||
it('returns REASONING_PENDING when streaming with only reasoning content', () => {
|
||||
const msg = makeAssistant({
|
||||
reasoningContent: 'Let me think about this...'
|
||||
});
|
||||
const sections = deriveAgenticSections(msg, [], [], true);
|
||||
expect(sections).toHaveLength(1);
|
||||
expect(sections[0].type).toBe(AgenticSectionType.REASONING_PENDING);
|
||||
expect(sections[0].content).toBe('Let me think about this...');
|
||||
});
|
||||
|
||||
it('returns REASONING (not pending) when streaming but text content has appeared', () => {
|
||||
const msg = makeAssistant({
|
||||
content: 'The answer is',
|
||||
reasoningContent: 'Let me think...'
|
||||
});
|
||||
const sections = deriveAgenticSections(msg, [], [], true);
|
||||
expect(sections).toHaveLength(2);
|
||||
expect(sections[0].type).toBe(AgenticSectionType.REASONING);
|
||||
expect(sections[1].type).toBe(AgenticSectionType.TEXT);
|
||||
});
|
||||
|
||||
it('returns REASONING (not pending) when not streaming', () => {
|
||||
const msg = makeAssistant({
|
||||
reasoningContent: 'Let me think...'
|
||||
});
|
||||
const sections = deriveAgenticSections(msg, [], [], false);
|
||||
expect(sections).toHaveLength(1);
|
||||
expect(sections[0].type).toBe(AgenticSectionType.REASONING);
|
||||
});
|
||||
|
||||
it('multi-turn: streaming tool calls on last turn', () => {
|
||||
const assistant1 = makeAssistant({
|
||||
toolCalls: JSON.stringify([
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue