[feat](kt-kernel): support avx2 only inference for bf16 fp8 and gptq int4 (#1892)
Some checks are pending
Book-CI / test (push) Waiting to run
Book-CI / test-1 (push) Waiting to run
Book-CI / test-2 (push) Waiting to run
Deploy / deploy (macos-latest) (push) Waiting to run
Deploy / deploy (ubuntu-latest) (push) Waiting to run
Deploy / deploy (windows-latest) (push) Waiting to run

* feat: support avx2 bf16 fp8 inference

* feat: support avx2 gptq int4 inference

* fix: numeric issues in fp8 dequant

* Tutorial avx2 (#1900)

* fix: prevent injecting -DLLAMA_AVX512=ON on AVX2-only machines

* docs: add AVX2 tutorial for running KTransformers on AVX2-only CPUs

* Tutorial avx2 (#1901)

* fix: prevent injecting -DLLAMA_AVX512=ON on AVX2-only machines

* docs: add AVX2 tutorial for running KTransformers on AVX2-only CPUs

* docs: update README.md

---------

Co-authored-by: Benjamin F <159887351+yyj6666667@users.noreply.github.com>
This commit is contained in:
mrhaoxx 2026-03-27 14:45:02 +08:00 committed by GitHub
parent 8561a71dd1
commit 7a9daf0cd4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
19 changed files with 3472 additions and 12 deletions

View file

@ -45,6 +45,13 @@ static const bool _is_plain_ = false;
#include "operators/amx/la/amx_kernels.hpp"
#include "operators/amx/moe.hpp"
#endif
// AVX2 backends — always available on x86_64 (no AMX/AVX512 dependency)
#if defined(__x86_64__)
#include "operators/avx2/bf16-moe.hpp"
#include "operators/avx2/fp8-moe.hpp"
#include "operators/avx2/gptq_int4-moe.hpp"
#endif
#include <pybind11/stl.h> // std::vector/std::pair/std::string conversions
#include <cstdint>
@ -578,6 +585,13 @@ PYBIND11_MODULE(kt_kernel_ext, m) {
bind_moe_module<AMX_FP8_PERCHANNEL_MOE_TP<amx::GemmKernel224FP8PerChannel>>(moe_module, "AMXFP8PerChannel_MOE");
#endif
#endif
// AVX2 backends — available on all x86_64 (no AMX/AVX512 requirement)
#if defined(__x86_64__)
bind_moe_module<AVX2_BF16_MOE_TP<avx2::GemmKernelAVX2BF16>>(moe_module, "AVX2BF16_MOE");
bind_moe_module<AVX2_FP8_MOE_TP<avx2::GemmKernelAVX2FP8>>(moe_module, "AVX2FP8_MOE");
bind_moe_module<AVX2_GPTQ_INT4_MOE_TP<avx2::GemmKernelAVX2GPTQInt4>>(moe_module, "AVX2GPTQInt4_MOE");
#endif
#if defined(USE_MOE_KERNEL)
bind_moe_module<MOE_KERNEL_TP<moe_kernel::GemmKernelInt8, _is_plain_>>(moe_module, "Int8_KERNEL_MOE");
#if defined(__aarch64__) && defined(CPU_USE_KML)