[feat](kt-kernel): support avx2 only inference for bf16 fp8 and gptq int4 (#1892)

* feat: support avx2 bf16 fp8 inference
* feat: support avx2 gptq int4 inference
* fix: numeric issues in fp8 dequant
* Tutorial avx2 (#1900)
* fix: prevent injecting -DLLAMA_AVX512=ON on AVX2-only machines
* docs: add AVX2 tutorial for running KTransformers on AVX2-only CPUs
* Tutorial avx2 (#1901)
* fix: prevent injecting -DLLAMA_AVX512=ON on AVX2-only machines
* docs: add AVX2 tutorial for running KTransformers on AVX2-only CPUs
* docs: update README.md

---------

Co-authored-by: Benjamin F <159887351+yyj6666667@users.noreply.github.com>
2026-04-28 03:39:48 +00:00 · 2026-03-27 14:45:02 +08:00 · 2026-03-27 14:45:02 +08:00 · 7a9daf0cd4
commit 7a9daf0cd4
parent 8561a71dd1
19 changed files with 3472 additions and 12 deletions
--- a/kt-kernel/ext_bindings.cpp
+++ b/kt-kernel/ext_bindings.cpp
@@ -45,6 +45,13 @@ static const bool _is_plain_ = false;
 #include "operators/amx/la/amx_kernels.hpp"
 #include "operators/amx/moe.hpp"
 #endif
+// AVX2 backends — included on every x86_64 build (no AMX/AVX512 dependency). NOTE(review): __x86_64__ alone does not imply AVX2; confirm the build enables it.
+#if defined(__x86_64__)
+#include "operators/avx2/bf16-moe.hpp"
+#include "operators/avx2/fp8-moe.hpp"
+#include "operators/avx2/gptq_int4-moe.hpp"
+#endif
+
 #include <pybind11/stl.h>  // std::vector/std::pair/std::string conversions

 #include <cstdint>
@@ -578,6 +585,13 @@ PYBIND11_MODULE(kt_kernel_ext, m) {
  bind_moe_module<AMX_FP8_PERCHANNEL_MOE_TP<amx::GemmKernel224FP8PerChannel>>(moe_module, "AMXFP8PerChannel_MOE");
 #endif
 #endif
+// AVX2 backends — registered on every x86_64 build (no AMX/AVX512 requirement). NOTE(review): __x86_64__ alone does not imply AVX2; confirm the host CPU supports it.
+#if defined(__x86_64__)
+  bind_moe_module<AVX2_BF16_MOE_TP<avx2::GemmKernelAVX2BF16>>(moe_module, "AVX2BF16_MOE");
+  bind_moe_module<AVX2_FP8_MOE_TP<avx2::GemmKernelAVX2FP8>>(moe_module, "AVX2FP8_MOE");
+  bind_moe_module<AVX2_GPTQ_INT4_MOE_TP<avx2::GemmKernelAVX2GPTQInt4>>(moe_module, "AVX2GPTQInt4_MOE");
+#endif
+
 #if defined(USE_MOE_KERNEL)
  bind_moe_module<MOE_KERNEL_TP<moe_kernel::GemmKernelInt8, _is_plain_>>(moe_module, "Int8_KERNEL_MOE");
 #if defined(__aarch64__) && defined(CPU_USE_KML)