mirror of
https://github.com/kvcache-ai/ktransformers.git
synced 2026-04-28 03:39:48 +00:00
[feat](kt-kernel): support avx2 only inference for bf16 fp8 and gptq int4 (#1892)
Some checks are pending
Book-CI / test (push) Waiting to run
Book-CI / test-1 (push) Waiting to run
Book-CI / test-2 (push) Waiting to run
Deploy / deploy (macos-latest) (push) Waiting to run
Deploy / deploy (ubuntu-latest) (push) Waiting to run
Deploy / deploy (windows-latest) (push) Waiting to run
Some checks are pending
Book-CI / test (push) Waiting to run
Book-CI / test-1 (push) Waiting to run
Book-CI / test-2 (push) Waiting to run
Deploy / deploy (macos-latest) (push) Waiting to run
Deploy / deploy (ubuntu-latest) (push) Waiting to run
Deploy / deploy (windows-latest) (push) Waiting to run
* feat: support avx2 bf16 fp8 inference
* feat: support avx2 gptq int4 inference
* fix: numeric issues in fp8 dequant
* Tutorial avx2 (#1900)
* fix: prevent injecting -DLLAMA_AVX512=ON on AVX2-only machines
* docs: add AVX2 tutorial for running KTransformers on AVX2-only CPUs
* Tutorial avx2 (#1901)
* fix: prevent injecting -DLLAMA_AVX512=ON on AVX2-only machines
* docs: add AVX2 tutorial for running KTransformers on AVX2-only CPUs
* docs: update README.md
---------
Co-authored-by: Benjamin F <159887351+yyj6666667@users.noreply.github.com>
This commit is contained in:
parent
8561a71dd1
commit
7a9daf0cd4
19 changed files with 3472 additions and 12 deletions
|
|
@ -45,6 +45,13 @@ static const bool _is_plain_ = false;
|
|||
#include "operators/amx/la/amx_kernels.hpp"
|
||||
#include "operators/amx/moe.hpp"
|
||||
#endif
|
||||
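// AVX2 backends — included on every x86_64 build (no AMX/AVX512 dependency).
// NOTE(review): the guard below tests only __x86_64__, not __AVX2__; AVX2 is not part of the x86_64 baseline — confirm these headers are safe when AVX2 is not enabled.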
|
||||
#if defined(__x86_64__)
|
||||
#include "operators/avx2/bf16-moe.hpp"
|
||||
#include "operators/avx2/fp8-moe.hpp"
|
||||
#include "operators/avx2/gptq_int4-moe.hpp"
|
||||
#endif
|
||||
|
||||
#include <pybind11/stl.h> // std::vector/std::pair/std::string conversions
|
||||
|
||||
#include <cstdint>
|
||||
|
|
@ -578,6 +585,13 @@ PYBIND11_MODULE(kt_kernel_ext, m) {
|
|||
bind_moe_module<AMX_FP8_PERCHANNEL_MOE_TP<amx::GemmKernel224FP8PerChannel>>(moe_module, "AMXFP8PerChannel_MOE");
|
||||
#endif
|
||||
#endif
|
||||
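// AVX2 backends — registered on every x86_64 build (no AMX/AVX512 requirement).
// NOTE(review): the guard below tests only __x86_64__, not __AVX2__; AVX2 is not part of the x86_64 baseline — confirm these bindings are safe to expose on CPUs without AVX2.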
|
||||
#if defined(__x86_64__)
|
||||
bind_moe_module<AVX2_BF16_MOE_TP<avx2::GemmKernelAVX2BF16>>(moe_module, "AVX2BF16_MOE");
|
||||
bind_moe_module<AVX2_FP8_MOE_TP<avx2::GemmKernelAVX2FP8>>(moe_module, "AVX2FP8_MOE");
|
||||
bind_moe_module<AVX2_GPTQ_INT4_MOE_TP<avx2::GemmKernelAVX2GPTQInt4>>(moe_module, "AVX2GPTQInt4_MOE");
|
||||
#endif
|
||||
|
||||
#if defined(USE_MOE_KERNEL)
|
||||
bind_moe_module<MOE_KERNEL_TP<moe_kernel::GemmKernelInt8, _is_plain_>>(moe_module, "Int8_KERNEL_MOE");
|
||||
#if defined(__aarch64__) && defined(CPU_USE_KML)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue