mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-06 16:21:49 +00:00
further reverse the mxfp4 changes
This commit is contained in:
parent
d20e60ddd5
commit
a40038d8e6
1 changed files with 0 additions and 156 deletions
|
|
@ -498,81 +498,6 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
|
|||
ggml_gemv_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
// GEMV kernel: one q8_0-quantized activation row (vy) times nc columns of
// mxfp4 weights stored 4-columns-interleaved (vx); writes nc floats to s.
// Uses a NEON+dotprod fast path on aarch64 and otherwise falls through to the
// generic scalar implementation at the bottom.
void ggml_gemv_mxfp4_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;                // quant blocks per row (32 values each)
    const int ncols_interleaved = 4;      // columns packed together in a block_mxfp4x4
    const int blocklen = 4;

    assert (n % qk == 0);
    assert (nc % ncols_interleaved == 0);

    // Silence unused-parameter warnings on builds that take the generic path.
    UNUSED(s);
    UNUSED(bs);
    UNUSED(vx);
    UNUSED(vy);
    UNUSED(nr);
    UNUSED(nc);
    UNUSED(nb);
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
    // 16-entry LUT mapping each 4-bit mxfp4 code to its signed int8 value.
    const int8x16_t kvalues = vld1q_s8(kvalues_mxfp4);
    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
    float * res_ptr = s;

    // One iteration per group of 4 interleaved output columns.
    for (int x = 0; x < nc / ncols_interleaved; x++) {
        const block_mxfp4x4 * b_ptr = (const block_mxfp4x4 *) vx + (x * nb);

        float32x4_t sumf = vdupq_n_f32(0);  // running result for the 4 columns
        for (int l = 0; l < nb; l++) {
            // 64 bytes of packed nibbles = 4 columns x 32 weights for this block.
            uint8x16_t b_0 = vld1q_u8(b_ptr[l].qs + 0);
            uint8x16_t b_1 = vld1q_u8(b_ptr[l].qs + 16);
            uint8x16_t b_2 = vld1q_u8(b_ptr[l].qs + 32);
            uint8x16_t b_3 = vld1q_u8(b_ptr[l].qs + 48);

            // Decode low/high nibbles to int8 weights via table lookup.
            // (>> and & on uint8x16_t rely on GCC/Clang vector extensions.)
            int8x16_t b_0_hi = vqtbl1q_s8(kvalues, b_0 >> 4);
            int8x16_t b_0_lo = vqtbl1q_s8(kvalues, b_0 & 0x0F);
            int8x16_t b_1_hi = vqtbl1q_s8(kvalues, b_1 >> 4);
            int8x16_t b_1_lo = vqtbl1q_s8(kvalues, b_1 & 0x0F);
            int8x16_t b_2_hi = vqtbl1q_s8(kvalues, b_2 >> 4);
            int8x16_t b_2_lo = vqtbl1q_s8(kvalues, b_2 & 0x0F);
            int8x16_t b_3_hi = vqtbl1q_s8(kvalues, b_3 >> 4);
            int8x16_t b_3_lo = vqtbl1q_s8(kvalues, b_3 & 0x0F);

            // 32 int8 activations of this block: a_0 = first 16, a_1 = last 16.
            int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 0);
            int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16);

            // Accumulate per-column int32 dot products. Lane i of a_0/a_1
            // selects the 4-byte activation group matching b_i's low/high
            // nibbles, so low nibbles pair with a_0 and high nibbles with a_1.
            int32x4_t sumi = vdupq_n_s32(0);
            sumi = vdotq_laneq_s32(sumi, b_0_lo, a_0, 0);
            sumi = vdotq_laneq_s32(sumi, b_0_hi, a_1, 0);
            sumi = vdotq_laneq_s32(sumi, b_1_lo, a_0, 1);
            sumi = vdotq_laneq_s32(sumi, b_1_hi, a_1, 1);
            sumi = vdotq_laneq_s32(sumi, b_2_lo, a_0, 2);
            sumi = vdotq_laneq_s32(sumi, b_2_hi, a_1, 2);
            sumi = vdotq_laneq_s32(sumi, b_3_lo, a_0, 3);
            sumi = vdotq_laneq_s32(sumi, b_3_hi, a_1, 3);

            // Scale: activation fp16 delta broadcast x per-column E8M0 weight
            // scale (converted to fp32; the HALF variant presumably folds in a
            // constant 0.5 factor from the mxfp4 value table — see macro def).
            float32x4_t a_d = vcvt_f32_f16(vld1_dup_f16((const float16_t *)&a_ptr[l].d));
            float32x4_t b_d = {
                GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[0]),
                GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[1]),
                GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[2]),
                GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[3]),
            };
            float32x4_t d = a_d * b_d;

            sumf = vmlaq_f32(sumf, d, vcvtq_f32_s32(sumi));
        }

        vst1q_f32(res_ptr + x * 4, sumf);
    }
    return;
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
    ggml_gemv_mxfp4_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}
|
||||
|
||||
void ggml_gemv_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
constexpr int qk = QK_K;
|
||||
const int nb = n / qk;
|
||||
|
|
@ -3239,87 +3164,6 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
|
|||
ggml_gemm_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
// GEMM kernel: nr q8_0-quantized activation rows (vy, packed 4 rows at a time
// as block_q8_0x4) times nc columns of mxfp4 weights stored 4-columns-
// interleaved (vx). Produces a 4x4 fp32 output tile per (row-group, column-
// group) pair into s with row stride bs. NEON+dotprod fast path on aarch64;
// otherwise falls through to the generic implementation at the bottom.
void ggml_gemm_mxfp4_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;                // quant blocks along the K dimension
    const int ncols_interleaved = 4;      // columns packed together in a block_mxfp4x4
    const int blocklen = 4;

    assert (n % qk == 0);
    assert (nr % 4 == 0);                 // rows are consumed 4 at a time
    assert (nc % ncols_interleaved == 0);

    // Silence unused-parameter warnings on builds that take the generic path.
    UNUSED(s);
    UNUSED(bs);
    UNUSED(vx);
    UNUSED(vy);
    UNUSED(nr);
    UNUSED(nc);
    UNUSED(nb);
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
    // 16-entry LUT mapping each 4-bit mxfp4 code to its signed int8 value.
    const int8x16_t kvalues = vld1q_s8(kvalues_mxfp4);

    for (int y = 0; y < nr / 4; y++) {          // 4-row group of activations
        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
        for (int x = 0; x < nc / ncols_interleaved; x++) {  // 4-column group
            const block_mxfp4x4 * b_ptr = (const block_mxfp4x4 *) vx + (x * nb);

            // sumf[m] holds the 4 column results for activation row m.
            float32x4_t sumf[4];
            for (int m = 0; m < 4; m++) {
                sumf[m] = vdupq_n_f32(0);
            }

            for (int l = 0; l < nb; l++) {
                // Per-row fp16 activation deltas and per-column E8M0 weight
                // scales (HALF variant presumably folds in a constant 0.5
                // factor from the mxfp4 value table — see macro definition).
                float32x4_t a_d = vcvt_f32_f16(vld1_f16((const float16_t *)a_ptr[l].d));
                float32x4_t b_d = {
                    GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[0]),
                    GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[1]),
                    GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[2]),
                    GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[3]),
                };

                // sumi_m: int32 dot-product accumulators for activation row m.
                int32x4_t sumi_0 = vdupq_n_s32(0);
                int32x4_t sumi_1 = vdupq_n_s32(0);
                int32x4_t sumi_2 = vdupq_n_s32(0);
                int32x4_t sumi_3 = vdupq_n_s32(0);

                for (int k = 0; k < 4; k++) {
                    // Activations for the 4 interleaved rows: a_0 covers the
                    // first half of the block, a_1 the second (offset 64 =
                    // 4 rows x 16 bytes into the x4-packed layout).
                    int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 16 * k + 0);
                    int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16 * k + 64);

                    // Decode 16 packed weight bytes: low nibbles pair with
                    // a_0, high nibbles with a_1.
                    uint8x16_t b = vld1q_u8(b_ptr[l].qs + 16 * k);
                    int8x16_t b_hi = vqtbl1q_s8(kvalues, b >> 4);
                    int8x16_t b_lo = vqtbl1q_s8(kvalues, b & 0xF);

                    // Lane m of a_0/a_1 selects activation row m's 4 bytes.
                    sumi_0 = vdotq_laneq_s32(sumi_0, b_lo, a_0, 0);
                    sumi_1 = vdotq_laneq_s32(sumi_1, b_lo, a_0, 1);
                    sumi_2 = vdotq_laneq_s32(sumi_2, b_lo, a_0, 2);
                    sumi_3 = vdotq_laneq_s32(sumi_3, b_lo, a_0, 3);
                    sumi_0 = vdotq_laneq_s32(sumi_0, b_hi, a_1, 0);
                    sumi_1 = vdotq_laneq_s32(sumi_1, b_hi, a_1, 1);
                    sumi_2 = vdotq_laneq_s32(sumi_2, b_hi, a_1, 2);
                    sumi_3 = vdotq_laneq_s32(sumi_3, b_hi, a_1, 3);
                }

                // Apply combined scale (column scales x row m's delta) and
                // accumulate into the fp32 tile.
                sumf[0] = vmlaq_f32(sumf[0], vmulq_laneq_f32(b_d, a_d, 0), vcvtq_f32_s32(sumi_0));
                sumf[1] = vmlaq_f32(sumf[1], vmulq_laneq_f32(b_d, a_d, 1), vcvtq_f32_s32(sumi_1));
                sumf[2] = vmlaq_f32(sumf[2], vmulq_laneq_f32(b_d, a_d, 2), vcvtq_f32_s32(sumi_2));
                sumf[3] = vmlaq_f32(sumf[3], vmulq_laneq_f32(b_d, a_d, 3), vcvtq_f32_s32(sumi_3));
            }

            // Store the 4x4 tile: row (y*4 + m), columns x*4 .. x*4+3.
            for (int m = 0; m < 4; m++) {
                vst1q_f32(s + (y * 4 + m) * bs + x * 4, sumf[m]);
            }
        }
    }
    return;
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
    ggml_gemm_mxfp4_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}
|
||||
|
||||
void ggml_gemm_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
constexpr int qk = QK_K;
|
||||
const int nb = n / qk;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue