From a40038d8e63deb7dc681f9f68cbce79deac33450 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Sat, 7 Mar 2026 22:42:22 +0800
Subject: [PATCH] further reverse the mxfp4 changes

---
 ggml/src/ggml-cpu/arch/arm/repack.cpp | 156 --------------------------
 1 file changed, 156 deletions(-)

diff --git a/ggml/src/ggml-cpu/arch/arm/repack.cpp b/ggml/src/ggml-cpu/arch/arm/repack.cpp
index 3eed0105b..c2e4623f3 100644
--- a/ggml/src/ggml-cpu/arch/arm/repack.cpp
+++ b/ggml/src/ggml-cpu/arch/arm/repack.cpp
@@ -498,81 +498,6 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
     ggml_gemv_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
 }
 
-void ggml_gemv_mxfp4_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
-    const int qk = QK8_0;
-    const int nb = n / qk;
-    const int ncols_interleaved = 4;
-    const int blocklen = 4;
-
-    assert (n % qk == 0);
-    assert (nc % ncols_interleaved == 0);
-
-    UNUSED(s);
-    UNUSED(bs);
-    UNUSED(vx);
-    UNUSED(vy);
-    UNUSED(nr);
-    UNUSED(nc);
-    UNUSED(nb);
-    UNUSED(ncols_interleaved);
-    UNUSED(blocklen);
-
-#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
-    const int8x16_t kvalues = vld1q_s8(kvalues_mxfp4);
-    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
-    float * res_ptr = s;
-
-    for (int x = 0; x < nc / ncols_interleaved; x++) {
-        const block_mxfp4x4 * b_ptr = (const block_mxfp4x4 *) vx + (x * nb);
-
-        float32x4_t sumf = vdupq_n_f32(0);
-        for (int l = 0; l < nb; l++) {
-            uint8x16_t b_0 = vld1q_u8(b_ptr[l].qs + 0);
-            uint8x16_t b_1 = vld1q_u8(b_ptr[l].qs + 16);
-            uint8x16_t b_2 = vld1q_u8(b_ptr[l].qs + 32);
-            uint8x16_t b_3 = vld1q_u8(b_ptr[l].qs + 48);
-
-            int8x16_t b_0_hi = vqtbl1q_s8(kvalues, b_0 >> 4);
-            int8x16_t b_0_lo = vqtbl1q_s8(kvalues, b_0 & 0x0F);
-            int8x16_t b_1_hi = vqtbl1q_s8(kvalues, b_1 >> 4);
-            int8x16_t b_1_lo = vqtbl1q_s8(kvalues, b_1 & 0x0F);
-            int8x16_t b_2_hi = vqtbl1q_s8(kvalues, b_2 >> 4);
-            int8x16_t b_2_lo = vqtbl1q_s8(kvalues, b_2 & 0x0F);
-            int8x16_t b_3_hi = vqtbl1q_s8(kvalues, b_3 >> 4);
-            int8x16_t b_3_lo = vqtbl1q_s8(kvalues, b_3 & 0x0F);
-
-            int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 0);
-            int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16);
-
-            int32x4_t sumi = vdupq_n_s32(0);
-            sumi = vdotq_laneq_s32(sumi, b_0_lo, a_0, 0);
-            sumi = vdotq_laneq_s32(sumi, b_0_hi, a_1, 0);
-            sumi = vdotq_laneq_s32(sumi, b_1_lo, a_0, 1);
-            sumi = vdotq_laneq_s32(sumi, b_1_hi, a_1, 1);
-            sumi = vdotq_laneq_s32(sumi, b_2_lo, a_0, 2);
-            sumi = vdotq_laneq_s32(sumi, b_2_hi, a_1, 2);
-            sumi = vdotq_laneq_s32(sumi, b_3_lo, a_0, 3);
-            sumi = vdotq_laneq_s32(sumi, b_3_hi, a_1, 3);
-
-            float32x4_t a_d = vcvt_f32_f16(vld1_dup_f16((const float16_t *)&a_ptr[l].d));
-            float32x4_t b_d = {
-                GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[0]),
-                GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[1]),
-                GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[2]),
-                GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[3]),
-            };
-            float32x4_t d = a_d * b_d;
-
-            sumf = vmlaq_f32(sumf, d, vcvtq_f32_s32(sumi));
-        }
-
-        vst1q_f32(res_ptr + x * 4, sumf);
-    }
-    return;
-#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
-    ggml_gemv_mxfp4_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
-}
-
 void ggml_gemv_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     constexpr int qk = QK_K;
     const int     nb = n / qk;
@@ -3239,87 +3164,6 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
     ggml_gemm_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
 }
 
-void ggml_gemm_mxfp4_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
-    const int qk = QK8_0;
-    const int nb = n / qk;
-    const int ncols_interleaved = 4;
-    const int blocklen = 4;
-
-    assert (n % qk == 0);
-    assert (nr % 4 == 0);
-    assert (nc % ncols_interleaved == 0);
-
-    UNUSED(s);
-    UNUSED(bs);
-    UNUSED(vx);
-    UNUSED(vy);
-    UNUSED(nr);
-    UNUSED(nc);
-    UNUSED(nb);
-    UNUSED(ncols_interleaved);
-    UNUSED(blocklen);
-
-#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
-    const int8x16_t kvalues = vld1q_s8(kvalues_mxfp4);
-
-    for (int y = 0; y < nr / 4; y++) {
-        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_mxfp4x4 * b_ptr = (const block_mxfp4x4 *) vx + (x * nb);
-
-            float32x4_t sumf[4];
-            for (int m = 0; m < 4; m++) {
-                sumf[m] = vdupq_n_f32(0);
-            }
-
-            for (int l = 0; l < nb; l++) {
-                float32x4_t a_d = vcvt_f32_f16(vld1_f16((const float16_t *)a_ptr[l].d));
-                float32x4_t b_d = {
-                    GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[0]),
-                    GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[1]),
-                    GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[2]),
-                    GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[3]),
-                };
-
-                int32x4_t sumi_0 = vdupq_n_s32(0);
-                int32x4_t sumi_1 = vdupq_n_s32(0);
-                int32x4_t sumi_2 = vdupq_n_s32(0);
-                int32x4_t sumi_3 = vdupq_n_s32(0);
-
-                for (int k = 0; k < 4; k++) {
-                    int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 16 * k + 0);
-                    int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16 * k + 64);
-
-                    uint8x16_t b = vld1q_u8(b_ptr[l].qs + 16 * k);
-                    int8x16_t b_hi = vqtbl1q_s8(kvalues, b >> 4);
-                    int8x16_t b_lo = vqtbl1q_s8(kvalues, b & 0xF);
-
-                    sumi_0 = vdotq_laneq_s32(sumi_0, b_lo, a_0, 0);
-                    sumi_1 = vdotq_laneq_s32(sumi_1, b_lo, a_0, 1);
-                    sumi_2 = vdotq_laneq_s32(sumi_2, b_lo, a_0, 2);
-                    sumi_3 = vdotq_laneq_s32(sumi_3, b_lo, a_0, 3);
-                    sumi_0 = vdotq_laneq_s32(sumi_0, b_hi, a_1, 0);
-                    sumi_1 = vdotq_laneq_s32(sumi_1, b_hi, a_1, 1);
-                    sumi_2 = vdotq_laneq_s32(sumi_2, b_hi, a_1, 2);
-                    sumi_3 = vdotq_laneq_s32(sumi_3, b_hi, a_1, 3);
-                }
-
-                sumf[0] = vmlaq_f32(sumf[0], vmulq_laneq_f32(b_d, a_d, 0), vcvtq_f32_s32(sumi_0));
-                sumf[1] = vmlaq_f32(sumf[1], vmulq_laneq_f32(b_d, a_d, 1), vcvtq_f32_s32(sumi_1));
-                sumf[2] = vmlaq_f32(sumf[2], vmulq_laneq_f32(b_d, a_d, 2), vcvtq_f32_s32(sumi_2));
-                sumf[3] = vmlaq_f32(sumf[3], vmulq_laneq_f32(b_d, a_d, 3), vcvtq_f32_s32(sumi_3));
-            }
-
-            for (int m = 0; m < 4; m++) {
-                vst1q_f32(s + (y * 4 + m) * bs + x * 4, sumf[m]);
-            }
-        }
-    }
-    return;
-#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
-    ggml_gemm_mxfp4_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
-}
-
 void ggml_gemm_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     constexpr int qk = QK_K;
     const int     nb = n / qk;