From a40038d8e63deb7dc681f9f68cbce79deac33450 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sat, 7 Mar 2026 22:42:22 +0800 Subject: [PATCH] further reverse the mxfp4 changes --- ggml/src/ggml-cpu/arch/arm/repack.cpp | 156 -------------------------- 1 file changed, 156 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/arm/repack.cpp b/ggml/src/ggml-cpu/arch/arm/repack.cpp index 3eed0105b..c2e4623f3 100644 --- a/ggml/src/ggml-cpu/arch/arm/repack.cpp +++ b/ggml/src/ggml-cpu/arch/arm/repack.cpp @@ -498,81 +498,6 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const ggml_gemv_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc); } -void ggml_gemv_mxfp4_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { - const int qk = QK8_0; - const int nb = n / qk; - const int ncols_interleaved = 4; - const int blocklen = 4; - - assert (n % qk == 0); - assert (nc % ncols_interleaved == 0); - - UNUSED(s); - UNUSED(bs); - UNUSED(vx); - UNUSED(vy); - UNUSED(nr); - UNUSED(nc); - UNUSED(nb); - UNUSED(ncols_interleaved); - UNUSED(blocklen); - -#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) - const int8x16_t kvalues = vld1q_s8(kvalues_mxfp4); - const block_q8_0 * a_ptr = (const block_q8_0 *) vy; - float * res_ptr = s; - - for (int x = 0; x < nc / ncols_interleaved; x++) { - const block_mxfp4x4 * b_ptr = (const block_mxfp4x4 *) vx + (x * nb); - - float32x4_t sumf = vdupq_n_f32(0); - for (int l = 0; l < nb; l++) { - uint8x16_t b_0 = vld1q_u8(b_ptr[l].qs + 0); - uint8x16_t b_1 = vld1q_u8(b_ptr[l].qs + 16); - uint8x16_t b_2 = vld1q_u8(b_ptr[l].qs + 32); - uint8x16_t b_3 = vld1q_u8(b_ptr[l].qs + 48); - - int8x16_t b_0_hi = vqtbl1q_s8(kvalues, b_0 >> 4); - int8x16_t b_0_lo = vqtbl1q_s8(kvalues, b_0 & 0x0F); - int8x16_t b_1_hi = vqtbl1q_s8(kvalues, b_1 >> 4); - int8x16_t b_1_lo = vqtbl1q_s8(kvalues, b_1 & 0x0F); - int8x16_t b_2_hi = vqtbl1q_s8(kvalues, b_2 >> 4); - int8x16_t b_2_lo = vqtbl1q_s8(kvalues, b_2 & 0x0F); - int8x16_t b_3_hi = vqtbl1q_s8(kvalues, b_3 >> 4); - int8x16_t b_3_lo = vqtbl1q_s8(kvalues, b_3 & 0x0F); - - int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 0); - int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16); - - int32x4_t sumi = vdupq_n_s32(0); - sumi = vdotq_laneq_s32(sumi, b_0_lo, a_0, 0); - sumi = vdotq_laneq_s32(sumi, b_0_hi, a_1, 0); - sumi = vdotq_laneq_s32(sumi, b_1_lo, a_0, 1); - sumi = vdotq_laneq_s32(sumi, b_1_hi, a_1, 1); - sumi = vdotq_laneq_s32(sumi, b_2_lo, a_0, 2); - sumi = vdotq_laneq_s32(sumi, b_2_hi, a_1, 2); - sumi = vdotq_laneq_s32(sumi, b_3_lo, a_0, 3); - sumi = vdotq_laneq_s32(sumi, b_3_hi, a_1, 3); - - float32x4_t a_d = vcvt_f32_f16(vld1_dup_f16((const float16_t *)&a_ptr[l].d)); - float32x4_t b_d = { - GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[0]), - GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[1]), - GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[2]), - GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[3]), - }; - float32x4_t d = a_d * b_d; - - sumf = vmlaq_f32(sumf, d, vcvtq_f32_s32(sumi)); - } - - vst1q_f32(res_ptr + x * 4, sumf); - } - return; -#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) - ggml_gemv_mxfp4_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc); -} - void ggml_gemv_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { constexpr int qk = QK_K; const int nb = n / qk; @@ -3239,87 +3164,6 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const ggml_gemm_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc); } -void ggml_gemm_mxfp4_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { - const int qk = QK8_0; - const int nb = n / qk; - const int ncols_interleaved = 4; - const int blocklen = 4; - - assert (n % qk == 0); - assert (nr % 4 == 0); - assert (nc % ncols_interleaved == 0); - - UNUSED(s); - UNUSED(bs); - UNUSED(vx); - UNUSED(vy); - UNUSED(nr); - UNUSED(nc); - UNUSED(nb); - UNUSED(ncols_interleaved); - UNUSED(blocklen); - -#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) - const int8x16_t kvalues = vld1q_s8(kvalues_mxfp4); - - for (int y = 0; y < nr / 4; y++) { - const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); - for (int x = 0; x < nc / ncols_interleaved; x++) { - const block_mxfp4x4 * b_ptr = (const block_mxfp4x4 *) vx + (x * nb); - - float32x4_t sumf[4]; - for (int m = 0; m < 4; m++) { - sumf[m] = vdupq_n_f32(0); - } - - for (int l = 0; l < nb; l++) { - float32x4_t a_d = vcvt_f32_f16(vld1_f16((const float16_t *)a_ptr[l].d)); - float32x4_t b_d = { - GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[0]), - GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[1]), - GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[2]), - GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[3]), - }; - - int32x4_t sumi_0 = vdupq_n_s32(0); - int32x4_t sumi_1 = vdupq_n_s32(0); - int32x4_t sumi_2 = vdupq_n_s32(0); - int32x4_t sumi_3 = vdupq_n_s32(0); - - for (int k = 0; k < 4; k++) { - int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 16 * k + 0); - int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16 * k + 64); - - uint8x16_t b = vld1q_u8(b_ptr[l].qs + 16 * k); - int8x16_t b_hi = vqtbl1q_s8(kvalues, b >> 4); - int8x16_t b_lo = vqtbl1q_s8(kvalues, b & 0xF); - - sumi_0 = vdotq_laneq_s32(sumi_0, b_lo, a_0, 0); - sumi_1 = vdotq_laneq_s32(sumi_1, b_lo, a_0, 1); - sumi_2 = vdotq_laneq_s32(sumi_2, b_lo, a_0, 2); - sumi_3 = vdotq_laneq_s32(sumi_3, b_lo, a_0, 3); - sumi_0 = vdotq_laneq_s32(sumi_0, b_hi, a_1, 0); - sumi_1 = vdotq_laneq_s32(sumi_1, b_hi, a_1, 1); - sumi_2 = vdotq_laneq_s32(sumi_2, b_hi, a_1, 2); - sumi_3 = vdotq_laneq_s32(sumi_3, b_hi, a_1, 3); - } - - sumf[0] = vmlaq_f32(sumf[0], vmulq_laneq_f32(b_d, a_d, 0), vcvtq_f32_s32(sumi_0)); - sumf[1] = vmlaq_f32(sumf[1], vmulq_laneq_f32(b_d, a_d, 1), vcvtq_f32_s32(sumi_1)); - sumf[2] = vmlaq_f32(sumf[2], vmulq_laneq_f32(b_d, a_d, 2), vcvtq_f32_s32(sumi_2)); - sumf[3] = vmlaq_f32(sumf[3], vmulq_laneq_f32(b_d, a_d, 3), vcvtq_f32_s32(sumi_3)); - } - - for (int m = 0; m < 4; m++) { - vst1q_f32(s + (y * 4 + m) * bs + x * 4, sumf[m]); - } - } - } - return; -#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) - ggml_gemm_mxfp4_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc); -} - void ggml_gemm_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { constexpr int qk = QK_K; const int nb = n / qk;