mirror of
https://github.com/kvcache-ai/ktransformers.git
synced 2026-04-26 10:50:59 +00:00
[feat](moe_kernel): add amd blis support (int8) (#1600)
Some checks are pending
Book-CI / test (push) Waiting to run
Book-CI / test-1 (push) Waiting to run
Book-CI / test-2 (push) Waiting to run
Deploy / deploy (macos-latest) (push) Waiting to run
Deploy / deploy (ubuntu-latest) (push) Waiting to run
Deploy / deploy (windows-latest) (push) Waiting to run
Some checks are pending
Book-CI / test (push) Waiting to run
Book-CI / test-1 (push) Waiting to run
Book-CI / test-2 (push) Waiting to run
Deploy / deploy (macos-latest) (push) Waiting to run
Deploy / deploy (ubuntu-latest) (push) Waiting to run
Deploy / deploy (windows-latest) (push) Waiting to run
* [feat]: init amd adaption * [feat]: add blis support * [fix]: fix setup and moe kernel warpper * [fix](setup.py): support rebuild with cache and import kt_kernel works fine * [feat]: add moe_kernel converter for amd and implement the load method(haven't tested yet) * [feat](moe_kernel/moe.hpp): delete unused memory when using save * [fix](moe_kernel): update PLAIN for pack * [fix](moe_kernel): rm printf debug * [fix](moe_kernel): skip gpu experts * [fix](moe_kernel/moe.hpp): update include memory path * [feat](moe_kernel/moe.hpp): support expert deferral * [feat]: finish amd --------- Co-authored-by: mrhaoxx <mr.haoxx@gmail.com>
This commit is contained in:
parent
fef6dd98a8
commit
1374b98ee5
14 changed files with 655 additions and 238 deletions
|
|
@ -495,7 +495,7 @@ if(NOT DEFINED CLANG_FORMAT_BIN)
|
|||
)
|
||||
endif()
|
||||
if(NOT CLANG_FORMAT_BIN)
|
||||
message(WARNING "clang-format not found. Please install clang-format (>=18) or pass -DCLANG_FORMAT_BIN=/full/path and reconfigure.")
|
||||
message(WARNING "ONLY for developer: clang-format not found. Please install clang-format (>=18) or pass -DCLANG_FORMAT_BIN=/full/path and reconfigure.")
|
||||
else()
|
||||
execute_process(
|
||||
COMMAND ${CLANG_FORMAT_BIN} --version
|
||||
|
|
|
|||
|
|
@ -39,6 +39,20 @@
|
|||
"KTRANSFORMERS_CPU_USE_AMX_AVX512": "ON",
|
||||
"KTRANSFORMERS_USE_CUDA": "ON"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "amd",
|
||||
"displayName": "amd_platform",
|
||||
"description": "for amd platform",
|
||||
"cacheVariables": {
|
||||
"KTRANSFORMERS_CPU_USE_AMX": "OFF",
|
||||
"LLAMA_AVX512": "OFF",
|
||||
"LLAMA_AVX2": "ON",
|
||||
"KTRANSFORMERS_CPU_USE_AMX_AVX512": "OFF",
|
||||
"KTRANSFORMERS_USE_CUDA": "ON",
|
||||
"KTRANSFORMERS_CPU_MOE_AMD": "ON",
|
||||
"KTRANSFORMERS_CPU_MOE_KERNEL": "ON"
|
||||
}
|
||||
}
|
||||
|
||||
]
|
||||
|
|
|
|||
|
|
@ -2,41 +2,32 @@
|
|||
|
||||
High-performance kernel operations for KTransformers, featuring CPU-optimized MoE inference with AMX, AVX, KML, and BLIS (AMD library) support.
|
||||
|
||||
- [KT-Kernel](#kt-kernel)
|
||||
- [Note](#note)
|
||||
- [Features](#features)
|
||||
- [Installation](#installation)
|
||||
- [Prerequisites](#prerequisites)
|
||||
- [Quick Installation (Recommended)](#quick-installation-recommended)
|
||||
- [Manual Configuration (Advanced)](#manual-configuration-advanced)
|
||||
- [Verification](#verification)
|
||||
- [Integration with SGLang](#integration-with-sglang)
|
||||
- [Installation Steps](#installation-steps)
|
||||
- [1. Install SGLang](#1-install-sglang)
|
||||
- [2. Prepare Weights](#2-prepare-weights)
|
||||
- [3. Launch SGLang Server](#3-launch-sglang-server)
|
||||
- [Complete Example: Qwen3-30B-A3B](#complete-example-qwen3-30b-a3b)
|
||||
- [Option A: AMX Backend (AMXINT8)](#option-a-amx-backend-amxint8)
|
||||
- [Option B: LLAMAFILE Backend (GGUF)](#option-b-llamafile-backend-gguf)
|
||||
- [KT-Kernel Parameters](#kt-kernel-parameters)
|
||||
- [Direct Python API Usage](#direct-python-api-usage)
|
||||
- [Advanced Options](#advanced-options)
|
||||
- [Build Configuration](#build-configuration)
|
||||
- [Manual Installation](#manual-installation)
|
||||
- [1. Install System Dependencies](#1-install-system-dependencies)
|
||||
- [2. Set Build Configuration](#2-set-build-configuration)
|
||||
- [3. Build and Install](#3-build-and-install)
|
||||
- [Error Troubleshooting](#error-troubleshooting)
|
||||
- [CUDA Not Found](#cuda-not-found)
|
||||
- [hwloc Not Found](#hwloc-not-found)
|
||||
- [Weight Quantization](#weight-quantization)
|
||||
- [Before Commit!](#before-commit)
|
||||
- [Note](#note)
|
||||
- [Features](#features)
|
||||
- [Installation](#installation)
|
||||
- [Prerequisites](#prerequisites)
|
||||
- [Quick Installation (Recommended)](#quick-installation-recommended)
|
||||
- [Manual Configuration (Advanced)](#manual-configuration-advanced)
|
||||
- [Verification](#verification)
|
||||
- [Integration with SGLang](#integration-with-sglang)
|
||||
- [Installation Steps](#installation-steps)
|
||||
- [Complete Example: Qwen3-30B-A3B](#complete-example-qwen3-30b-a3b)
|
||||
- [KT-Kernel Parameters](#kt-kernel-parameters)
|
||||
- [Direct Python API Usage](#direct-python-api-usage)
|
||||
- [Advanced Options](#advanced-options)
|
||||
- [Build Configuration](#build-configuration)
|
||||
- [Manual Installation](#manual-installation)
|
||||
- [Error Troubleshooting](#error-troubleshooting)
|
||||
- [CUDA Not Found](#cuda-not-found)
|
||||
- [hwloc Not Found](#hwloc-not-found)
|
||||
- [Weight Quantization](#weight-quantization)
|
||||
- [Before Commit!](#before-commit)
|
||||
## Note
|
||||
|
||||
**Current Support Status:**
|
||||
- ✅ **Intel CPUs with AMX**: Fully supported (using weights converted to INT4/INT8 format)
|
||||
- ✅ **Universal CPU (llamafile backend)**: Supported (using GGUF-format weights)
|
||||
- ⚠️ **AMD CPUs with BLIS**: In progress, not yet fully integrated
|
||||
- ✅ **AMD CPUs with BLIS**: Supported (INT8 prefill and decode)
|
||||
|
||||
## Features
|
||||
|
||||
|
|
@ -145,7 +136,7 @@ python scripts/convert_cpu_weights.py \
|
|||
--input-path /path/to/model \
|
||||
--input-type bf16 \
|
||||
--output /path/to/cpu-weights \
|
||||
--quant-method int8 # or int4
|
||||
--quant-method int8 # or int4, or moe_int8 (currently for AMD)
|
||||
```
|
||||
|
||||
- `--input-path`: Path to GPU-side original weights
|
||||
|
|
|
|||
|
|
@ -2,42 +2,33 @@
|
|||
|
||||
高性能 KTransformers 内核库,提供面向 CPU 的高效 MoE 推理内核,支持 AMX 和 AVX 等后端。
|
||||
|
||||
- [KT-Kernel](#kt-kernel)
|
||||
- [说明](#说明)
|
||||
- [特性](#特性)
|
||||
- [安装](#安装)
|
||||
- [先决条件](#先决条件)
|
||||
- [快速安装(推荐)](#快速安装推荐)
|
||||
- [手动配置(进阶)](#手动配置进阶)
|
||||
- [验证安装](#验证安装)
|
||||
- [与 SGLang 集成](#与-sglang-集成)
|
||||
- [安装步骤](#安装步骤)
|
||||
- [1. 安装 SGLang](#1-安装-sglang)
|
||||
- [2. 准备权重](#2-准备权重)
|
||||
- [3. 启动 SGLang Server](#3-启动-sglang-server)
|
||||
- [完整示例:Qwen3-30B-A3B](#完整示例qwen3-30b-a3b)
|
||||
- [方案 A:AMX 后端(AMXINT8)](#方案-aamx-后端amxint8)
|
||||
- [方案 B:LLAMAFILE 后端(GGUF)](#方案-bllamafile-后端gguf)
|
||||
- [KT-Kernel 参数](#kt-kernel-参数)
|
||||
- [直接使用 Python API](#直接使用-python-api)
|
||||
- [高级选项](#高级选项)
|
||||
- [构建配置](#构建配置)
|
||||
- [手动安装](#手动安装)
|
||||
- [1. 安装系统依赖](#1-安装系统依赖)
|
||||
- [2. 配置构建参数](#2-配置构建参数)
|
||||
- [3. 构建并安装](#3-构建并安装)
|
||||
- [错误排查](#错误排查)
|
||||
- [找不到 CUDA](#找不到-cuda)
|
||||
- [找不到 hwloc](#找不到-hwloc)
|
||||
- [权重量化](#权重量化)
|
||||
- [提交前必读](#提交前必读)
|
||||
- [说明](#说明)
|
||||
- [特性](#特性)
|
||||
- [安装](#安装)
|
||||
- [先决条件](#先决条件)
|
||||
- [快速安装(推荐)](#快速安装推荐)
|
||||
- [手动配置(进阶)](#手动配置进阶)
|
||||
- [验证安装](#验证安装)
|
||||
- [与 SGLang 集成](#与-sglang-集成)
|
||||
- [安装步骤](#安装步骤)
|
||||
- [完整示例:Qwen3-30B-A3B](#完整示例qwen3-30b-a3b)
|
||||
- [KT-Kernel 参数](#kt-kernel-参数)
|
||||
- [直接使用 Python API](#直接使用-python-api)
|
||||
- [高级选项](#高级选项)
|
||||
- [构建配置](#构建配置)
|
||||
- [手动安装](#手动安装)
|
||||
- [错误排查](#错误排查)
|
||||
- [找不到 CUDA](#找不到-cuda)
|
||||
- [找不到 hwloc](#找不到-hwloc)
|
||||
- [权重量化](#权重量化)
|
||||
- [提交前必读](#提交前必读)
|
||||
|
||||
## 说明
|
||||
|
||||
**当前支持状态:**
|
||||
- ✅ **带 AMX 的 Intel CPU**:已支持(基于转换为 INT4/INT8 格式的权重)
|
||||
- ✅ **通用 CPU(llamafile 后端)**:已支持(基于 GGUF 格式的权重)
|
||||
- ⚠️ **带 BLIS 的 AMD CPU**:进行中,尚未完全集成
|
||||
- ✅ **带 BLIS 的 AMD CPU**:已支持(int8 的 prefill 和 decode)
|
||||
|
||||
## 特性
|
||||
|
||||
|
|
@ -149,7 +140,7 @@ python scripts/convert_cpu_weights.py \
|
|||
--input-path /path/to/model \
|
||||
--input-type bf16 \
|
||||
--output /path/to/cpu-weights \
|
||||
--quant-method int8 # 或 int4
|
||||
--quant-method int8 # 或 int4 或 moe_int8(用于 amd 的)
|
||||
```
|
||||
|
||||
- `--input-path`:GPU 侧原始权重路径
|
||||
|
|
|
|||
|
|
@ -2376,9 +2376,7 @@ bool ggml_compute_forward_mul_mat_use_amx(struct ggml_tensor* dst) {
|
|||
static thread_local bool is_first_time = true;
|
||||
if (is_first_time) {
|
||||
#pragma omp single
|
||||
{
|
||||
ggml_amx_init();
|
||||
}
|
||||
{ ggml_amx_init(); }
|
||||
|
||||
// load tile config
|
||||
ggml_tile_config_init();
|
||||
|
|
|
|||
|
|
@ -2372,9 +2372,7 @@ bool ggml_compute_forward_mul_mat_use_amx(struct ggml_tensor* dst) {
|
|||
static thread_local bool is_first_time = true;
|
||||
if (is_first_time) {
|
||||
#pragma omp single
|
||||
{
|
||||
ggml_amx_init();
|
||||
}
|
||||
{ ggml_amx_init(); }
|
||||
|
||||
// load tile config
|
||||
ggml_tile_config_init();
|
||||
|
|
|
|||
|
|
@ -14,15 +14,15 @@
|
|||
// #include <utility>
|
||||
// #include <vector>
|
||||
|
||||
// #define DIRECT_OR_POOL_BY(what, threshold, var, fn) \
|
||||
// do { \
|
||||
// if ((what) < (threshold)) { \
|
||||
// for (int i = 0; i < (var); i++) { \
|
||||
// (fn)(i); \
|
||||
// } \
|
||||
// } else { \
|
||||
// pool->do_work_stealing_job((var), nullptr, (fn), nullptr); \
|
||||
// } \
|
||||
// #define DIRECT_OR_POOL_BY(what, threshold, var, fn) \
|
||||
// do { \
|
||||
// if ((what) < (threshold)) { \
|
||||
// for (int i = 0; i < (var); i++) { \
|
||||
// (fn)(i); \
|
||||
// } \
|
||||
// } else { \
|
||||
// pool->do_work_stealing_job((var), nullptr, (fn), nullptr); \
|
||||
// } \
|
||||
// } while (0)
|
||||
|
||||
// #define VEC_DOT_TYPE(type) (ggml_internal_get_type_traits((ggml_type)(type)).vec_dot_type)
|
||||
|
|
@ -31,19 +31,20 @@
|
|||
// #define QUANT_OFFSET(ptr, type, n, n_elements) \
|
||||
// (offset_pointer((ptr), (size_t)(n) * QUANT_BLCK_SIZE((n_elements), (type))))
|
||||
|
||||
// #define LLAMAFILE_SGEMM_QUANT_FULL_MATMUL(m, n, k, a, a_type, b, b_col, c, c_col) \
|
||||
// do { \
|
||||
// llamafile_sgemm((m), (n), QUANT_BLCK_COUNT((k), (a_type)), (a), QUANT_BLCK_COUNT((k), (a_type)), \
|
||||
// QUANT_OFFSET((b), VEC_DOT_TYPE((a_type)), (b_col), (k)), \
|
||||
// QUANT_BLCK_COUNT((k), VEC_DOT_TYPE((a_type))), offset_pointer((c), (c_col) * (m) * sizeof(float)), \
|
||||
// (k), 0, 1, GGML_TASK_TYPE_COMPUTE, (a_type), VEC_DOT_TYPE((a_type)), GGML_TYPE_F32, \
|
||||
// GGML_PREC_DEFAULT); \
|
||||
// #define LLAMAFILE_SGEMM_QUANT_FULL_MATMUL(m, n, k, a, a_type, b, b_col, c, c_col) \
|
||||
// do { \
|
||||
// llamafile_sgemm((m), (n), QUANT_BLCK_COUNT((k), (a_type)), (a), QUANT_BLCK_COUNT((k), (a_type)), \
|
||||
// QUANT_OFFSET((b), VEC_DOT_TYPE((a_type)), (b_col), (k)), \
|
||||
// QUANT_BLCK_COUNT((k), VEC_DOT_TYPE((a_type))), offset_pointer((c), (c_col) * (m) *
|
||||
// sizeof(float)), \
|
||||
// (k), 0, 1, GGML_TASK_TYPE_COMPUTE, (a_type), VEC_DOT_TYPE((a_type)), GGML_TYPE_F32, \
|
||||
// GGML_PREC_DEFAULT); \
|
||||
// } while (0)
|
||||
|
||||
// #define LLAMAFILE_SGEMM_MATMUL_F32(m, n, k, a, lda, b, ldb, c, ldc) \
|
||||
// do { \
|
||||
// llamafile_sgemm((m), (n), (k), (a), (lda), (b), (ldb), (c), (ldc), 0, 1, GGML_TASK_TYPE_COMPUTE, GGML_TYPE_F32, \
|
||||
// GGML_TYPE_F32, GGML_TYPE_F32, GGML_PREC_DEFAULT); \
|
||||
// #define LLAMAFILE_SGEMM_MATMUL_F32(m, n, k, a, lda, b, ldb, c, ldc) \
|
||||
// do { \
|
||||
// llamafile_sgemm((m), (n), (k), (a), (lda), (b), (ldb), (c), (ldc), 0, 1, GGML_TASK_TYPE_COMPUTE, GGML_TYPE_F32, \
|
||||
// GGML_TYPE_F32, GGML_TYPE_F32, GGML_PREC_DEFAULT); \
|
||||
// } while (0)
|
||||
|
||||
// // bool decide_absorb(size_t a,int a_type,size_t b,int b_type,size_t c,int c_type,size_t d,int d_type){
|
||||
|
|
|
|||
|
|
@ -340,7 +340,7 @@ struct GemmKernelInt8 {
|
|||
static inline const int PACK_SIZE_M = 8;
|
||||
static inline const int PACK_SIZE_K = 32;
|
||||
|
||||
static std::string name() { return "INT8"; }
|
||||
static std::string name() { return "MOE_INT8"; }
|
||||
static int recommended_nth(int n) { return (n + N_BLOCK - 1) / N_BLOCK; }
|
||||
// type_: d for decode, p for prefill
|
||||
static int recommended_nth_down(int n, char type_ = 'd') {
|
||||
|
|
@ -833,7 +833,7 @@ struct GemmKernelInt4 {
|
|||
static inline const int PACK_SIZE_K = 32;
|
||||
static inline const int PACK_SIZE_M = 8;
|
||||
|
||||
static std::string name() { return "INT4"; }
|
||||
static std::string name() { return "MOE_INT4"; }
|
||||
static int recommended_nth(int n) { return (n + N_BLOCK - 1) / N_BLOCK; }
|
||||
|
||||
static int recommended_nth_down(int n, char type_ = 'd') {
|
||||
|
|
|
|||
|
|
@ -12,8 +12,8 @@
|
|||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "../../cpu_backend/shared_mem_buffer.h"
|
||||
#include "../common.hpp"
|
||||
#include "../cpu_backend/shared_mem_buffer.h"
|
||||
#include "../moe-tp.hpp"
|
||||
#include "api/common.h"
|
||||
#include "api/mat_kernel.h"
|
||||
|
|
@ -57,6 +57,9 @@ class MOE_KERNEL_TP
|
|||
std::vector<std::shared_ptr<typename T::BufferB>> down_bb_;
|
||||
std::vector<std::shared_ptr<typename T::BufferC>> down_bc_;
|
||||
|
||||
std::vector<void*> gate_up_owner_ptr_;
|
||||
std::vector<void*> down_owner_ptr_;
|
||||
|
||||
inline void write_weights(std::filesystem::path prefix, std::string mat_class, char* bb, int expert_idx, size_t size,
|
||||
size_t scale_size) {
|
||||
// printf("expert %d, size %ld, scale size %ld\n", expert_idx, size, scale_size);
|
||||
|
|
@ -182,6 +185,7 @@ class MOE_KERNEL_TP
|
|||
down_ba_.push_back(std::make_shared<typename T::BufferA>(config_.max_len, config_.intermediate_size, nullptr));
|
||||
down_bc_.push_back(std::make_shared<typename T::BufferC>(config_.max_len, config_.hidden_size, nullptr));
|
||||
void* gate_up_down_per_exp_ptr = std::aligned_alloc(64, gate_up_exp_size);
|
||||
gate_up_owner_ptr_.push_back(gate_up_down_per_exp_ptr);
|
||||
|
||||
gate_bb_.push_back(std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size,
|
||||
gate_up_down_per_exp_ptr, PACKED, 'u', PLAIN));
|
||||
|
|
@ -193,6 +197,7 @@ class MOE_KERNEL_TP
|
|||
|
||||
void* down_bb_ptr = std::aligned_alloc(
|
||||
64, T::BufferB::required_size(config_.hidden_size, config_.intermediate_size, PACKED, 'd', PLAIN));
|
||||
down_owner_ptr_.push_back(down_bb_ptr);
|
||||
down_bb_.push_back(std::make_shared<typename T::BufferB>(config_.hidden_size, config_.intermediate_size,
|
||||
down_bb_ptr, PACKED, 'd', PLAIN));
|
||||
}
|
||||
|
|
@ -220,27 +225,41 @@ class MOE_KERNEL_TP
|
|||
|
||||
~MOE_KERNEL_TP() {
  // Hand back every per-expert weight block recorded in the two owner lists.
  // The pointers pushed onto these lists elsewhere in this class come from
  // std::aligned_alloc, so std::free is the matching release call.
  // Gate/up blocks are released first, then the down blocks.
  for (auto* owner_list : {&gate_up_owner_ptr_, &down_owner_ptr_}) {
    for (void* block : *owner_list) {
      std::free(block);
    }
  }
}
|
||||
|
||||
void load_weights() {
|
||||
auto pool = config_.pool->get_subpool(tp_part_idx);
|
||||
const uint64_t* physical_to_logical_map = (const uint64_t*)config_.physical_to_logical_map;
|
||||
if (config_.gate_projs.size()) {
|
||||
printf("load from safetensor");
|
||||
pool->do_work_stealing_job(
|
||||
config_.expert_num, nullptr,
|
||||
[this, physical_to_logical_map](int expert_id) {
|
||||
uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_id);
|
||||
{
|
||||
size_t scale_size = config_.intermediate_size * sizeof(float);
|
||||
size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size) - scale_size;
|
||||
size_t whole_size_ =
|
||||
T::BufferB::required_size(config_.intermediate_size, config_.hidden_size, PACKED, 'u', PLAIN);
|
||||
size_t size = whole_size_ - scale_size;
|
||||
void* dst_ = PLAIN ? gate_bb_[expert_id]->b : gate_bb_[expert_id]->b_pack[0];
|
||||
|
||||
memcpy(gate_bb_[expert_id]->b, config_.gate_projs[tp_part_idx][logical_expert_id], size);
|
||||
memcpy(dst_, config_.gate_projs[tp_part_idx][logical_expert_id], size);
|
||||
|
||||
if constexpr (T::BufferB::SCALE) {
|
||||
memcpy(gate_bb_[expert_id]->d, config_.gate_scales[tp_part_idx][logical_expert_id], scale_size);
|
||||
}
|
||||
|
||||
memcpy(up_bb_[expert_id]->b, config_.up_projs[tp_part_idx][logical_expert_id], size);
|
||||
whole_size_ =
|
||||
T::BufferB::required_size(config_.intermediate_size, config_.hidden_size, PACKED, 'u', PLAIN);
|
||||
size = whole_size_ - scale_size;
|
||||
dst_ = PLAIN ? up_bb_[expert_id]->b : up_bb_[expert_id]->b_pack[0];
|
||||
memcpy(dst_, config_.up_projs[tp_part_idx][logical_expert_id], size);
|
||||
|
||||
if constexpr (T::BufferB::SCALE) {
|
||||
memcpy(up_bb_[expert_id]->d, config_.up_scales[tp_part_idx][logical_expert_id], scale_size);
|
||||
|
|
@ -249,9 +268,11 @@ class MOE_KERNEL_TP
|
|||
|
||||
{
|
||||
size_t scale_size = config_.hidden_size * sizeof(float);
|
||||
size_t size = T::BufferB::required_size(config_.hidden_size, config_.intermediate_size) - scale_size;
|
||||
|
||||
memcpy(down_bb_[expert_id]->b, config_.down_projs[tp_part_idx][logical_expert_id], size);
|
||||
size_t whole_size_ =
|
||||
T::BufferB::required_size(config_.hidden_size, config_.intermediate_size, PACKED, 'd', PLAIN);
|
||||
size_t size = whole_size_ - scale_size;
|
||||
void* dst_ = PLAIN ? down_bb_[expert_id]->b : down_bb_[expert_id]->b_pack[0];
|
||||
memcpy(dst_, config_.down_projs[tp_part_idx][logical_expert_id], size);
|
||||
|
||||
if constexpr (T::BufferB::SCALE) {
|
||||
memcpy(down_bb_[expert_id]->d, config_.down_scales[tp_part_idx][logical_expert_id], scale_size);
|
||||
|
|
@ -269,21 +290,22 @@ class MOE_KERNEL_TP
|
|||
uint8_t mat_class = (task_id % (mat_type_all * mat_split)) / mat_split;
|
||||
uint8_t mat_split_idex = task_id % mat_split;
|
||||
uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
|
||||
void* src_;
|
||||
if (mat_class == 0) { // the up matrix
|
||||
size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size);
|
||||
src_ = PLAIN ? up_bb_[expert_idx]->b : up_bb_[expert_idx]->b_pack[0];
|
||||
size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size, PACKED, 'u', PLAIN);
|
||||
size_t scale_size = config_.intermediate_size * sizeof(float);
|
||||
read_weights(prefix, "_up_", (char*)up_bb_[expert_idx]->b, logical_expert_id, size, scale_size, mat_split,
|
||||
mat_split_idex);
|
||||
read_weights(prefix, "_up_", (char*)src_, logical_expert_id, size, scale_size, mat_split, mat_split_idex);
|
||||
} else if (mat_class == 1) {
|
||||
size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size);
|
||||
void* src_ = PLAIN ? gate_bb_[expert_idx]->b : gate_bb_[expert_idx]->b_pack[0];
|
||||
size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size, PACKED, 'u', PLAIN);
|
||||
size_t scale_size = config_.intermediate_size * sizeof(float);
|
||||
read_weights(prefix, "_gate_", (char*)gate_bb_[expert_idx]->b, logical_expert_id, size, scale_size,
|
||||
mat_split, mat_split_idex);
|
||||
read_weights(prefix, "_gate_", (char*)src_, logical_expert_id, size, scale_size, mat_split, mat_split_idex);
|
||||
} else {
|
||||
size_t size = T::BufferB::required_size(config_.hidden_size, config_.intermediate_size);
|
||||
void* src_ = PLAIN ? down_bb_[expert_idx]->b : down_bb_[expert_idx]->b_pack[0];
|
||||
size_t size = T::BufferB::required_size(config_.hidden_size, config_.intermediate_size, PACKED, 'd', PLAIN);
|
||||
size_t scale_size = config_.hidden_size * sizeof(float);
|
||||
read_weights(prefix, "_down_", (char*)down_bb_[expert_idx]->b, logical_expert_id, size, scale_size,
|
||||
mat_split, mat_split_idex);
|
||||
read_weights(prefix, "_down_", (char*)src_, logical_expert_id, size, scale_size, mat_split, mat_split_idex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -342,17 +364,20 @@ class MOE_KERNEL_TP
|
|||
expert_idx = expert_map(physical_to_logical_map, expert_idx);
|
||||
uint8_t mat_class = task_id % mat_type_all;
|
||||
if (mat_class == 0) { // the up matrix
|
||||
size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size);
|
||||
size_t size =
|
||||
T::BufferB::required_size(config_.intermediate_size, config_.hidden_size, PACKED, 'u', PLAIN);
|
||||
size_t scale_size = config_.intermediate_size * sizeof(float);
|
||||
write_weights(prefix, "_up_", (char*)up_bb_[expert_idx]->b, expert_idx, size, scale_size);
|
||||
write_weights(prefix, "_up_", (char*)up_bb_[expert_idx]->b_pack[0], expert_idx, size, scale_size);
|
||||
} else if (mat_class == 1) {
|
||||
size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size);
|
||||
size_t size =
|
||||
T::BufferB::required_size(config_.intermediate_size, config_.hidden_size, PACKED, 'u', PLAIN);
|
||||
size_t scale_size = config_.intermediate_size * sizeof(float);
|
||||
write_weights(prefix, "_gate_", (char*)gate_bb_[expert_idx]->b, expert_idx, size, scale_size);
|
||||
write_weights(prefix, "_gate_", (char*)gate_bb_[expert_idx]->b_pack[0], expert_idx, size, scale_size);
|
||||
} else if (mat_class == 2) {
|
||||
size_t size = T::BufferB::required_size(config_.hidden_size, config_.intermediate_size);
|
||||
size_t size =
|
||||
T::BufferB::required_size(config_.hidden_size, config_.intermediate_size, PACKED, 'd', PLAIN);
|
||||
size_t scale_size = config_.hidden_size * sizeof(float);
|
||||
write_weights(prefix, "_down_", (char*)down_bb_[expert_idx]->b, expert_idx, size, scale_size);
|
||||
write_weights(prefix, "_down_", (char*)down_bb_[expert_idx]->b_pack[0], expert_idx, size, scale_size);
|
||||
}
|
||||
},
|
||||
nullptr);
|
||||
|
|
@ -432,6 +457,9 @@ class MOE_KERNEL_TP
|
|||
}
|
||||
for (int i = 0; i < qlen; i++) {
|
||||
for (int j = 0; j < k; j++) {
|
||||
if (expert_ids[i * k + j] < config_.num_gpu_experts || expert_ids[i * k + j] >= config_.expert_num) {
|
||||
continue;
|
||||
}
|
||||
m_local_pos_[i][j] = m_local_num_[expert_ids[i * k + j]]++;
|
||||
}
|
||||
}
|
||||
|
|
@ -460,6 +488,9 @@ class MOE_KERNEL_TP
|
|||
// Copy inputs into expert-local buffers
|
||||
MOE_DIRECT_OR_POOL_BY_VAR(qlen, [&](int i) {
|
||||
for (int j = 0; j < k; j++) {
|
||||
if (expert_ids[i * k + j] < config_.num_gpu_experts || expert_ids[i * k + j] >= config_.expert_num) {
|
||||
continue;
|
||||
}
|
||||
memcpy(m_local_input_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size,
|
||||
(input_t*)input + i * config_.hidden_size, sizeof(input_t) * config_.hidden_size);
|
||||
}
|
||||
|
|
@ -608,6 +639,10 @@ class MOE_KERNEL_TP
|
|||
for (int e = e_start; e < e_end; e++) {
|
||||
float sum = 0;
|
||||
for (int j = 0; j < k; j++) {
|
||||
if (expert_ids[q_idx * k + j] < config_.num_gpu_experts ||
|
||||
expert_ids[q_idx * k + j] >= config_.expert_num) {
|
||||
continue;
|
||||
}
|
||||
sum += weights[q_idx * k + j] * ((float*)m_local_down_output_ptr_[expert_ids[q_idx * k + j]])
|
||||
[m_local_pos_[q_idx][j] * config_.hidden_size + e];
|
||||
}
|
||||
|
|
@ -691,6 +726,10 @@ class TP_MOE<MOE_KERNEL_TP<K, T>> : public TP_MOE_Common<MOE_KERNEL_TP<K, T>> {
|
|||
delete[] (ggml_bf16_t*)(tpc.up_proj);
|
||||
delete[] (ggml_bf16_t*)(tpc.down_proj);
|
||||
}
|
||||
if (config.save) {
|
||||
// free the bf16 weights after saving
|
||||
tps.clear();
|
||||
}
|
||||
|
||||
this->weights_loaded = true;
|
||||
} else if (config.path != "") {
|
||||
|
|
@ -702,17 +741,22 @@ class TP_MOE<MOE_KERNEL_TP<K, T>> : public TP_MOE_Common<MOE_KERNEL_TP<K, T>> {
|
|||
}
|
||||
}
|
||||
|
||||
void merge_results(int qlen, void* output) {
|
||||
void merge_results(int qlen, void* output, bool incremental) {
|
||||
// #ifdef FORWARD_TIME_PROFILE
|
||||
// forward_perf_start();
|
||||
// #endif
|
||||
auto pool = this->config.pool;
|
||||
auto merge_fn = [this, output](int token_nth) {
|
||||
auto merge_fn = [this, output, incremental](int token_nth) {
|
||||
auto& local_output_numa = this->local_output_numa;
|
||||
auto& tp_configs = this->tp_configs;
|
||||
auto& tp_count = this->tp_count;
|
||||
auto& config = this->config;
|
||||
float* merge_to = local_output_numa[0] + token_nth * tp_configs[0].hidden_size;
|
||||
if (incremental) {
|
||||
for (int e = 0; e < config.hidden_size; e++) {
|
||||
merge_to[e] += ggml_bf16_to_fp32(((ggml_bf16_t*)output + token_nth * config.hidden_size)[e]);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 1; i < tp_count; i++) {
|
||||
float* merge_from = local_output_numa[i] + token_nth * tp_configs[i].hidden_size;
|
||||
|
|
@ -750,6 +794,8 @@ class TP_MOE<MOE_KERNEL_TP<K, T>> : public TP_MOE_Common<MOE_KERNEL_TP<K, T>> {
|
|||
// perf_report();
|
||||
// #endif
|
||||
}
|
||||
|
||||
void merge_results(int qlen, void* output) { merge_results(qlen, output, false); }
|
||||
};
|
||||
|
||||
#endif
|
||||
|
|
@ -19,6 +19,7 @@ from .experts_base import BaseMoEWrapper, KExpertsCPUBuffer
|
|||
# Import backend implementations
|
||||
from .utils.amx import AMXMoEWrapper
|
||||
from .utils.llamafile import LlamafileMoEWrapper
|
||||
from .utils.moe_kernel import GeneralMoEWrapper
|
||||
|
||||
|
||||
class KTMoEWrapper:
|
||||
|
|
@ -76,7 +77,7 @@ class KTMoEWrapper:
|
|||
chunked_prefill_size: Maximum prefill chunk size
|
||||
cpu_save: Whether to save weights to CPU memory
|
||||
max_deferred_experts_per_token: Number of experts per token to defer. Defaults to 0.
|
||||
method: Backend method ("AMXINT4", "AMXINT8", "LLAMAFILE")
|
||||
method: Backend method ("AMXINT4", "AMXINT8", "LLAMAFILE", "MOE_INT4", "MOE_INT8")
|
||||
|
||||
Returns:
|
||||
An instance of the appropriate backend implementation (e.g., AMXMoEWrapper)
|
||||
|
|
@ -86,6 +87,8 @@ class KTMoEWrapper:
|
|||
backend_cls = AMXMoEWrapper
|
||||
elif method == "LLAMAFILE":
|
||||
backend_cls = LlamafileMoEWrapper
|
||||
elif method in ["MOE_INT4", "MOE_INT8"]:
|
||||
backend_cls = GeneralMoEWrapper
|
||||
else:
|
||||
raise NotImplementedError(f"Unsupported method: {method}")
|
||||
|
||||
|
|
|
|||
315
kt-kernel/python/utils/moe_kernel.py
Normal file
315
kt-kernel/python/utils/moe_kernel.py
Normal file
|
|
@ -0,0 +1,315 @@
|
|||
import os
|
||||
import torch
|
||||
import ctypes
|
||||
|
||||
# Use relative imports for package structure
|
||||
from ..experts_base import BaseMoEWrapper
|
||||
from .loader import SafeTensorLoader
|
||||
from kt_kernel_ext.moe import MOEConfig
|
||||
|
||||
try:
|
||||
from kt_kernel_ext.moe import Int8_KERNEL_MOE
|
||||
|
||||
_HAS_INT8_SUPPORT = True
|
||||
except (ImportError, AttributeError):
|
||||
Int8_KERNEL_MOE = None
|
||||
_HAS_INT8_SUPPORT = False
|
||||
try:
|
||||
from kt_kernel_ext.moe import Int4_KERNEL_MOE
|
||||
|
||||
_HAS_INT4_SUPPORT = True
|
||||
except (ImportError, AttributeError):
|
||||
Int4_KERNEL_MOE = None
|
||||
_HAS_INT4_SUPPORT = False
|
||||
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class GeneralMoEWrapper(BaseMoEWrapper):
|
||||
"""
|
||||
moe-based MoE wrapper implementation.
|
||||
Supports MOE_INT4 and MOE_INT8 quantization methods.
|
||||
"""
|
||||
|
||||
_safetensor_loader_instance = None # Singleton SafeTensorLoader
|
||||
|
||||
def __init__(
    self,
    layer_idx: int,
    num_experts: int,
    num_experts_per_tok: int,
    hidden_size: int,
    moe_intermediate_size: int,
    num_gpu_experts: int,
    cpuinfer_threads: int,
    threadpool_count: int,
    weight_path: str,
    chunked_prefill_size: int,
    cpu_save: bool = False,
    max_deferred_experts_per_token: Optional[int] = None,
    method: str = "MOE_INT8",
):
    """
    Build a wrapper around the general (MOE_INT4 / MOE_INT8) MoE kernels.

    All arguments are forwarded unchanged to the base-class constructor.

    Args:
        layer_idx: Index of the layer this wrapper serves.
        num_experts: Total number of experts.
        num_experts_per_tok: Experts selected per token (top-k).
        hidden_size: Hidden dimension size.
        moe_intermediate_size: MoE intermediate size.
        num_gpu_experts: Number of experts that run on the GPU.
        cpuinfer_threads: Number of CPU inference threads.
        threadpool_count: Number of NUMA subpools.
        weight_path: Directory holding the weights; if it contains any
            ``*.safetensors`` file, merged safetensor loading is enabled.
        chunked_prefill_size: Maximum prefill chunk size.
        cpu_save: Whether to save weights to CPU memory.
        max_deferred_experts_per_token: Experts per token to defer. The
            default here is None; how None is interpreted is left to the
            base class (not visible from this method).
        method: Quantization method, "MOE_INT4" or "MOE_INT8".

    Raises:
        RuntimeError: If the requested method's kernel class could not be
            imported from kt_kernel_ext.
    """
    # Fail early when the extension module lacks the requested kernel.
    if method == "MOE_INT4" and not _HAS_INT4_SUPPORT:
        raise RuntimeError(
            "MoE_INT4 backend not available. kt_kernel_ext was not compiled with int4 support.\n"
            "Please recompile with int4 enabled."
        )
    if method == "MOE_INT8" and not _HAS_INT8_SUPPORT:
        raise RuntimeError(
            "MoE_INT8 backend not available. kt_kernel_ext was not compiled with int8 support.\n"
            "Please recompile with int8 enabled."
        )

    super().__init__(
        layer_idx=layer_idx,
        num_experts=num_experts,
        num_experts_per_tok=num_experts_per_tok,
        hidden_size=hidden_size,
        moe_intermediate_size=moe_intermediate_size,
        num_gpu_experts=num_gpu_experts,
        cpuinfer_threads=cpuinfer_threads,
        threadpool_count=threadpool_count,
        weight_path=weight_path,
        chunked_prefill_size=chunked_prefill_size,
        cpu_save=cpu_save,
        max_deferred_experts_per_token=max_deferred_experts_per_token,
        method=method,
    )

    import glob

    # Merged safetensor loading is used whenever the weight directory
    # contains at least one *.safetensors file.
    safetensor_files = glob.glob(os.path.join(weight_path, "*.safetensors"))
    self.load_merged_weight = bool(safetensor_files)

    if self.load_merged_weight:
        # One SafeTensorLoader is shared by every wrapper instance; it is
        # created from the weight_path of the first wrapper that needs it.
        owner = GeneralMoEWrapper
        if owner._safetensor_loader_instance is None:
            owner._safetensor_loader_instance = SafeTensorLoader(weight_path)
        self.safetensor_loader = owner._safetensor_loader_instance

    # Weight and scale storage stays empty until weights are loaded.
    for slot in (
        "gate_weights",
        "up_weights",
        "down_weights",
        "gate_scales",
        "up_scales",
        "down_scales",
    ):
        setattr(self, slot, None)
|
||||
|
||||
def load_weights_from_tensors(
    self,
    gate_proj: torch.Tensor,
    up_proj: torch.Tensor,
    down_proj: torch.Tensor,
    physical_to_logical_map_cpu: torch.Tensor,
):
    """
    Quantize BF16/FP16 expert weights online and save the result.

    Args:
        gate_proj: Gate projection weights [num_experts, intermediate_size, hidden_size]
        up_proj: Up projection weights [num_experts, intermediate_size, hidden_size]
        down_proj: Down projection weights [num_experts, hidden_size, intermediate_size]
        physical_to_logical_map_cpu: Mapping from physical to logical expert IDs

    Raises:
        NotImplementedError: If ``self.method`` is neither "MOE_INT4" nor "MOE_INT8".
    """
    # The config below only receives raw addresses, so the contiguous
    # tensors are kept on the instance to keep that memory alive.
    self.gate_proj = gate_proj.contiguous()
    self.up_proj = up_proj.contiguous()
    self.down_proj = down_proj.contiguous()

    cfg = MOEConfig(
        self.num_experts,
        self.num_experts_per_tok,
        self.hidden_size,
        self.moe_intermediate_size,
        self.num_gpu_experts,
    )
    cfg.layer_idx = self.layer_idx
    cfg.pool = self.cpu_infer.backend_
    cfg.max_len = self.chunked_prefill_size

    # Online quantization: save mode on, load mode off.
    cfg.save = True
    cfg.load = False

    # Raw addresses of the source weights.
    cfg.gate_proj = self.gate_proj.data_ptr()
    cfg.up_proj = self.up_proj.data_ptr()
    cfg.down_proj = self.down_proj.data_ptr()

    # Quantized weights are written under the configured weight path.
    cfg.path = self.weight_path

    # Pick the kernel class that matches the configured method.
    kernel_by_method = {
        "MOE_INT4": Int4_KERNEL_MOE,
        "MOE_INT8": Int8_KERNEL_MOE,
    }
    if self.method not in kernel_by_method:
        raise NotImplementedError(f"Unsupported MoE method: {self.method}")
    self.moe = kernel_by_method[self.method](cfg)

    # Submit the quantize-and-save task, then wait for it to finish.
    map_address = physical_to_logical_map_cpu.data_ptr()
    self.cpu_infer.submit(self.moe.load_weights_task(map_address))
    self.cpu_infer.sync()
|
||||
|
||||
def load_weights(self, physical_to_logical_map_cpu: torch.Tensor):
    """
    Load weights for this layer and initialize the MoE module.

    The MoE config is filled in according to the instance flags:

    * ``load_merged_weight``: expert arrays are fetched from
      ``safetensor_loader`` under ``blk.<layer_idx>`` and their raw addresses
      (weights and scales) are passed to the backend as nested lists.
    * ``cpu_save``: tensors fetched under ``model.layers.<layer_idx>`` are
      concatenated along dim 0 and passed by ``data_ptr()`` with
      ``save=True`` / ``load=False``; otherwise ``load=True`` is set.
    * When ``load_merged_weight`` is false, ``moe_config.path`` is set to
      ``weight_path``.

    Args:
        physical_to_logical_map_cpu: Mapping from physical to logical expert IDs.
            Only its ``data_ptr()`` is handed to the backend task.

    Raises:
        NotImplementedError: If ``self.method`` is neither ``"MOE_INT4"`` nor
            ``"MOE_INT8"``.
    """

    def _addresses(per_numa_arrays):
        # One raw buffer address per array, preserving the outer/inner nesting.
        # NOTE(review): the outer level is presumably one entry per NUMA node and
        # the inner level one array per expert -- confirm against the loader.
        return [
            [
                ctypes.addressof(ctypes.cast(et.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents)
                for et in numa_array
            ]
            for numa_array in per_numa_arrays
        ]

    # Defaults used when the corresponding loading path is not taken.
    gate_ptr = 0
    up_ptr = 0
    down_ptr = 0

    gate_ptrs = []
    up_ptrs = []
    down_ptrs = []

    gate_scale_ptrs = []
    up_scale_ptrs = []
    down_scale_ptrs = []

    if self.load_merged_weight:
        base_key = f"blk.{self.layer_idx}"
        w = self.safetensor_loader.load_experts(base_key)

        # Held on self until the load task has been synced (deleted at the end),
        # so the buffers behind the raw addresses stay referenced meanwhile.
        self.gate_weights = w["gate"]
        self.up_weights = w["up"]
        self.down_weights = w["down"]
        self.gate_scales = w["gate_scale"]
        self.up_scales = w["up_scale"]
        self.down_scales = w["down_scale"]

        # Get pointers to weight arrays
        gate_ptrs = _addresses(self.gate_weights)
        up_ptrs = _addresses(self.up_weights)
        down_ptrs = _addresses(self.down_weights)

        gate_scale_ptrs = _addresses(self.gate_scales)
        up_scale_ptrs = _addresses(self.up_scales)
        down_scale_ptrs = _addresses(self.down_scales)

    # Configure MoE
    moe_config = MOEConfig(
        self.num_experts,
        self.num_experts_per_tok,
        self.hidden_size,
        self.moe_intermediate_size,
        self.num_gpu_experts,
    )
    moe_config.layer_idx = self.layer_idx
    moe_config.pool = self.cpu_infer.backend_
    moe_config.max_len = self.chunked_prefill_size

    moe_config.gate_proj = gate_ptr
    moe_config.up_proj = up_ptr
    moe_config.down_proj = down_ptr
    moe_config.gate_projs = gate_ptrs
    moe_config.up_projs = up_ptrs
    moe_config.down_projs = down_ptrs
    moe_config.gate_scales = gate_scale_ptrs
    moe_config.up_scales = up_scale_ptrs
    moe_config.down_scales = down_scale_ptrs

    if self.cpu_save:
        moe_config.save = True
        moe_config.load = False
        base_key = f"model.layers.{self.layer_idx}"
        w = self.safetensor_loader.load_experts(base_key)

        # Keep the concatenated tensors on self so the data_ptr() values stay valid.
        self.gate_proj = torch.cat(w["gate_weight"], dim=0).contiguous()
        self.up_proj = torch.cat(w["up_weight"], dim=0).contiguous()
        self.down_proj = torch.cat(w["down_weight"], dim=0).contiguous()

        moe_config.gate_proj = self.gate_proj.data_ptr()
        moe_config.up_proj = self.up_proj.data_ptr()
        moe_config.down_proj = self.down_proj.data_ptr()
    else:
        moe_config.load = True

    if not self.load_merged_weight:
        moe_config.path = self.weight_path

    # Create MoE module based on moe method
    if self.method == "MOE_INT4":
        self.moe = Int4_KERNEL_MOE(moe_config)
    elif self.method == "MOE_INT8":
        self.moe = Int8_KERNEL_MOE(moe_config)
    else:
        raise NotImplementedError(f"Unsupported MoE method: {self.method}")

    # Load weights
    self.cpu_infer.submit(self.moe.load_weights_task(physical_to_logical_map_cpu.data_ptr()))
    self.cpu_infer.sync()

    # Clean up temporary weight storage if using merged weights
    if self.load_merged_weight:
        del self.gate_weights
        del self.up_weights
        del self.down_weights
        del self.gate_scales
        del self.up_scales
        del self.down_scales
|
||||
|
|
@ -615,6 +615,8 @@ class OnlineQuantConverter(ConverterBase):
|
|||
quant_to_amx_map = {
|
||||
"int4": "INT4",
|
||||
"int8": "INT8",
|
||||
"moe_int4": "MOE_INT4",
|
||||
"moe_int8": "MOE_INT8",
|
||||
}
|
||||
amx_method = quant_to_amx_map.get(self.quant_method, "INT4")
|
||||
|
||||
|
|
@ -622,6 +624,7 @@ class OnlineQuantConverter(ConverterBase):
|
|||
for numa_idx in range(self.threadpool_count):
|
||||
numa_folder = os.path.join(layer_path, f"_numa_{numa_idx}")
|
||||
if not os.path.exists(numa_folder):
|
||||
print(f" Warning: NUMA folder not found: {numa_folder}, skipping...")
|
||||
continue
|
||||
|
||||
# Iterate through all experts
|
||||
|
|
@ -755,6 +758,8 @@ class OnlineQuantConverter(ConverterBase):
|
|||
quant_to_amx_map = {
|
||||
"int4": "AMXINT4",
|
||||
"int8": "AMXINT8",
|
||||
"moe_int4": "MOE_INT4",
|
||||
"moe_int8": "MOE_INT8",
|
||||
}
|
||||
amx_method = quant_to_amx_map.get(self.quant_method, "AMXINT4")
|
||||
|
||||
|
|
@ -826,7 +831,7 @@ def main():
|
|||
parser.add_argument("--output", "-o", required=True, help="Output directory for converted safetensors")
|
||||
parser.add_argument(
|
||||
"--quant-method",
|
||||
choices=["int4", "int8", "awq"],
|
||||
choices=["int4", "int8", "awq", "moe_int4", "moe_int8"],
|
||||
default="int4",
|
||||
help="Quantization method for output (default: int4)",
|
||||
)
|
||||
|
|
@ -890,7 +895,7 @@ def main():
|
|||
input_type=None,
|
||||
merge_to_safetensor=merge_to_safetensor,
|
||||
)
|
||||
elif quant_method in ["int4", "int8"] and args.input_type in ["fp8", "fp16", "bf16"]:
|
||||
elif quant_method in ["int4", "int8", "moe_int4", "moe_int8"] and args.input_type in ["fp8", "fp16", "bf16"]:
|
||||
# Use OnlineQuantConverter for both INT4 and INT8 quantization
|
||||
converter = OnlineQuantConverter(
|
||||
args.input_path,
|
||||
|
|
|
|||
|
|
@ -34,63 +34,42 @@ from datasets import load_dataset
|
|||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description="Quantize MoE models with selective quantization")
|
||||
|
||||
|
||||
# Required arguments
|
||||
parser.add_argument(
|
||||
"--model_id",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to the input model directory"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to save the quantized model"
|
||||
)
|
||||
|
||||
parser.add_argument("--model_id", type=str, required=True, help="Path to the input model directory")
|
||||
parser.add_argument("--output_dir", type=str, required=True, help="Path to save the quantized model")
|
||||
|
||||
# Optional arguments
|
||||
parser.add_argument(
|
||||
"--quant_type",
|
||||
type=str,
|
||||
choices=["W4A16", "W8A16"],
|
||||
default="W8A16",
|
||||
help="Quantization type: W4A16 (GPTQ4) or W8A16 (GPTQ8). Default: W8A16"
|
||||
help="Quantization type: W4A16 (GPTQ4) or W8A16 (GPTQ8). Default: W8A16",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num_calibration_samples",
|
||||
type=int,
|
||||
default=512,
|
||||
help="Number of calibration samples. Default: 512"
|
||||
"--num_calibration_samples", type=int, default=512, help="Number of calibration samples. Default: 512"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_sequence_length",
|
||||
type=int,
|
||||
default=2048,
|
||||
help="Maximum sequence length for calibration. Default: 2048"
|
||||
"--max_sequence_length", type=int, default=2048, help="Maximum sequence length for calibration. Default: 2048"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dampening_frac",
|
||||
type=float,
|
||||
default=0.1,
|
||||
help="Dampening fraction to mitigate quantization noise. Default: 0.1"
|
||||
help="Dampening fraction to mitigate quantization noise. Default: 0.1",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataset",
|
||||
type=str,
|
||||
default="HuggingFaceH4/ultrachat_200k",
|
||||
help="Dataset for calibration. Default: HuggingFaceH4/ultrachat_200k"
|
||||
help="Dataset for calibration. Default: HuggingFaceH4/ultrachat_200k",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataset_split",
|
||||
type=str,
|
||||
default="train_sft",
|
||||
help="Dataset split to use. Default: train_sft"
|
||||
"--dataset_split", type=str, default="train_sft", help="Dataset split to use. Default: train_sft"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--force_cpu",
|
||||
action="store_true",
|
||||
help="Force all computations to CPU (sets CUDA_VISIBLE_DEVICES='')"
|
||||
"--force_cpu", action="store_true", help="Force all computations to CPU (sets CUDA_VISIBLE_DEVICES='')"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ignore_patterns",
|
||||
|
|
@ -103,44 +82,37 @@ def parse_args():
|
|||
r"re:.*\.shared_expert\..*$",
|
||||
r"re:.*\.shared_experts\..*$",
|
||||
r"re:.*\.mlp\.shared_expert_gate$",
|
||||
r"re:.*\.linear_attn\..*$"
|
||||
r"re:.*\.linear_attn\..*$",
|
||||
],
|
||||
help="Regex patterns for layers to ignore during quantization"
|
||||
help="Regex patterns for layers to ignore during quantization",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--torch_dtype",
|
||||
type=str,
|
||||
choices=["bfloat16", "float16", "float32"],
|
||||
default="bfloat16",
|
||||
help="PyTorch dtype for model loading. Default: bfloat16"
|
||||
help="PyTorch dtype for model loading. Default: bfloat16",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--trust_remote_code",
|
||||
action="store_true",
|
||||
help="Allow loading of remote code (required for some models)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--random_seed",
|
||||
type=int,
|
||||
default=42,
|
||||
help="Random seed for dataset shuffling. Default: 42"
|
||||
"--trust_remote_code", action="store_true", help="Allow loading of remote code (required for some models)"
|
||||
)
|
||||
parser.add_argument("--random_seed", type=int, default=42, help="Random seed for dataset shuffling. Default: 42")
|
||||
parser.add_argument(
|
||||
"--max_gpu_memory",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Maximum GPU memory for model weights per device (e.g., '40GiB'). "
|
||||
"GPTQ quantization requires additional GPU memory for Hessian matrix computation, "
|
||||
"so reserve 40-50%% of total VRAM. For example, use '40GiB' on 80GB GPUs. "
|
||||
"Remaining layers will be offloaded to CPU. Default: use all available"
|
||||
"GPTQ quantization requires additional GPU memory for Hessian matrix computation, "
|
||||
"so reserve 40-50%% of total VRAM. For example, use '40GiB' on 80GB GPUs. "
|
||||
"Remaining layers will be offloaded to CPU. Default: use all available",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_cpu_memory",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Maximum CPU memory to use (e.g., '100GiB'). Default: use all available"
|
||||
help="Maximum CPU memory to use (e.g., '100GiB'). Default: use all available",
|
||||
)
|
||||
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
|
|
@ -167,11 +139,7 @@ def get_torch_dtype(dtype_str):
|
|||
Returns:
|
||||
torch.dtype: Corresponding PyTorch dtype
|
||||
"""
|
||||
dtype_map = {
|
||||
"bfloat16": torch.bfloat16,
|
||||
"float16": torch.float16,
|
||||
"float32": torch.float32
|
||||
}
|
||||
dtype_map = {"bfloat16": torch.bfloat16, "float16": torch.float16, "float32": torch.float32}
|
||||
return dtype_map[dtype_str]
|
||||
|
||||
|
||||
|
|
@ -191,18 +159,18 @@ def check_dense_layers_and_update_ignore(model_id, ignore_patterns, trust_remote
|
|||
Updated ignore_patterns list with dense layer patterns added
|
||||
"""
|
||||
print("🔍 Checking model configuration for dense layers...")
|
||||
|
||||
|
||||
try:
|
||||
# Load model configuration
|
||||
config = AutoConfig.from_pretrained(model_id, trust_remote_code=trust_remote_code)
|
||||
|
||||
|
||||
# Check if the model has first_k_dense_replace parameter
|
||||
first_k_dense_replace = getattr(config, 'first_k_dense_replace', None)
|
||||
|
||||
first_k_dense_replace = getattr(config, "first_k_dense_replace", None)
|
||||
|
||||
if first_k_dense_replace is not None and first_k_dense_replace > 0:
|
||||
print(f"✅ Found dense layers configuration: first_k_dense_replace = {first_k_dense_replace}")
|
||||
print(f" Adding first {first_k_dense_replace} layers to ignore list...")
|
||||
|
||||
|
||||
# Create regex pattern for dense layers (layers 0 to first_k_dense_replace-1)
|
||||
if first_k_dense_replace == 1:
|
||||
dense_pattern = r"re:model\.layers\.0\.mlp\..*$"
|
||||
|
|
@ -210,18 +178,18 @@ def check_dense_layers_and_update_ignore(model_id, ignore_patterns, trust_remote
|
|||
# For multiple layers, use range pattern
|
||||
layer_range = f"[0-{first_k_dense_replace-1}]"
|
||||
dense_pattern = f"re:model\\.layers\\.{layer_range}\\.mlp\\..*$"
|
||||
|
||||
|
||||
# Add the dense layer pattern to ignore list
|
||||
updated_ignore_patterns = ignore_patterns + [dense_pattern]
|
||||
|
||||
|
||||
print(f" Dense layer pattern added: {dense_pattern}")
|
||||
print(f" This will ignore MLP components in layers 0-{first_k_dense_replace-1}")
|
||||
|
||||
|
||||
return updated_ignore_patterns
|
||||
else:
|
||||
print("ℹ️ No dense layers detected (first_k_dense_replace not found or is 0)")
|
||||
return ignore_patterns
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"⚠️ Warning: Could not check model config for dense layers: {e}")
|
||||
print(" Proceeding with original ignore patterns...")
|
||||
|
|
@ -261,11 +229,7 @@ def load_and_prepare_dataset(dataset_name, dataset_split, num_samples, max_lengt
|
|||
# Tokenize the data
|
||||
def tokenize(sample):
|
||||
return tokenizer(
|
||||
sample["text"],
|
||||
padding=False,
|
||||
max_length=max_length,
|
||||
truncation=True,
|
||||
add_special_tokens=False
|
||||
sample["text"], padding=False, max_length=max_length, truncation=True, add_special_tokens=False
|
||||
)
|
||||
|
||||
ds = ds.map(tokenize, remove_columns=ds.column_names)
|
||||
|
|
@ -306,9 +270,7 @@ def main():
|
|||
# 0) Check for dense layers and update ignore patterns
|
||||
# Dense layers in the first few layers should not be quantized
|
||||
updated_ignore_patterns = check_dense_layers_and_update_ignore(
|
||||
args.model_id,
|
||||
args.ignore_patterns,
|
||||
args.trust_remote_code
|
||||
args.model_id, args.ignore_patterns, args.trust_remote_code
|
||||
)
|
||||
|
||||
# --------------------------------------------------------------------
|
||||
|
|
@ -320,9 +282,7 @@ def main():
|
|||
print("🔍 Building CPU-only device map...")
|
||||
with init_empty_weights():
|
||||
dummy = AutoModelForCausalLM.from_pretrained(
|
||||
args.model_id,
|
||||
torch_dtype=torch_dtype,
|
||||
trust_remote_code=args.trust_remote_code
|
||||
args.model_id, torch_dtype=torch_dtype, trust_remote_code=args.trust_remote_code
|
||||
)
|
||||
device_map = {name: "cpu" for name, _ in dummy.named_modules() if name}
|
||||
del dummy
|
||||
|
|
@ -330,9 +290,7 @@ def main():
|
|||
print("🔍 Inferring device map...")
|
||||
with init_empty_weights():
|
||||
dummy = AutoModelForCausalLM.from_pretrained(
|
||||
args.model_id,
|
||||
torch_dtype=torch_dtype,
|
||||
trust_remote_code=args.trust_remote_code
|
||||
args.model_id, torch_dtype=torch_dtype, trust_remote_code=args.trust_remote_code
|
||||
)
|
||||
# Build max_memory dict if specified
|
||||
max_memory = None
|
||||
|
|
@ -357,9 +315,7 @@ def main():
|
|||
print(f" CPU memory limit: 1000GiB (default, to prevent disk offloading)")
|
||||
|
||||
device_map = infer_auto_device_map(
|
||||
dummy,
|
||||
no_split_module_classes=dummy._no_split_modules,
|
||||
max_memory=max_memory
|
||||
dummy, no_split_module_classes=dummy._no_split_modules, max_memory=max_memory
|
||||
)
|
||||
|
||||
# Check if disk offloading was triggered (not supported by llmcompressor)
|
||||
|
|
@ -371,8 +327,10 @@ def main():
|
|||
print(" 1. Increase --max_gpu_memory to use more GPU memory")
|
||||
print(" 2. Add --max_cpu_memory with higher value (e.g., '200GiB')")
|
||||
print(" 3. Ensure your machine has enough GPU + CPU memory")
|
||||
raise RuntimeError("Disk offloading is not supported by llmcompressor. "
|
||||
"Please ensure you have enough GPU + CPU memory.")
|
||||
raise RuntimeError(
|
||||
"Disk offloading is not supported by llmcompressor. "
|
||||
"Please ensure you have enough GPU + CPU memory."
|
||||
)
|
||||
|
||||
del dummy
|
||||
# --------------------------------------------------------------------
|
||||
|
|
@ -409,7 +367,7 @@ def main():
|
|||
args.num_calibration_samples,
|
||||
args.max_sequence_length,
|
||||
tokenizer,
|
||||
args.random_seed
|
||||
args.random_seed,
|
||||
)
|
||||
|
||||
# --------------------------------------------------------------------
|
||||
|
|
@ -447,4 +405,4 @@ def main():
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@ Environment knobs (export before running pip install .):
|
|||
CPUINFER_ENABLE_AMD=OFF ON/OFF -> -DKTRANSFORMERS_CPU_MOE_AMD
|
||||
CPUINFER_ENABLE_KML=OFF ON/OFF -> -DKTRANSFORMERS_CPU_USE_KML
|
||||
CPUINFER_ENABLE_AVX512=OFF ON/OFF -> -DKTRANSFORMERS_CPU_USE_AMX_AVX512
|
||||
CPUINFER_BLIS_ROOT=/path/to/blis Forward to -DBLIS_ROOT
|
||||
|
||||
|
||||
CPUINFER_ENABLE_LTO=ON ON/OFF -> -DCPUINFER_ENABLE_LTO (your added option)
|
||||
|
|
@ -28,6 +29,7 @@ Environment knobs (export before running pip install .):
|
|||
CPUINFER_LTO_MODE=auto Forward to -DCPUINFER_LTO_MODE
|
||||
CPUINFER_NATIVE=ON (override LLAMA_NATIVE)
|
||||
|
||||
|
||||
GPU backends (if ever added later, keep placeholders):
|
||||
CPUINFER_USE_CUDA=0/1 -DKTRANSFORMERS_USE_CUDA
|
||||
CPUINFER_USE_ROCM=0/1 -DKTRANSFORMERS_USE_ROCM
|
||||
|
|
@ -51,6 +53,43 @@ from setuptools import setup, Extension
|
|||
from setuptools.command.build_ext import build_ext
|
||||
import shutil
|
||||
|
||||
# -------------------------
|
||||
# Env parsing helpers
|
||||
# -------------------------
|
||||
def _env_get_bool(name: str, default: bool | None = None) -> bool | None:
|
||||
v = os.environ.get(name)
|
||||
if v is None:
|
||||
return default
|
||||
val = v.strip().lower()
|
||||
if val in ("1", "on", "true", "yes", "y", "enable", "enabled"):
|
||||
return True
|
||||
if val in ("0", "off", "false", "no", "n", "disable", "disabled"):
|
||||
return False
|
||||
return default
|
||||
|
||||
|
||||
def _cmake_onoff(flag: bool) -> str:
|
||||
return "ON" if flag else "OFF"
|
||||
|
||||
|
||||
def _forward_bool_env(cmake_args: list[str], env_name: str, cmake_flag: str) -> bool:
    """Forward a boolean environment variable to CMake as ``-D<flag>=ON/OFF``.

    Appends the definition to ``cmake_args`` (mutated in place), logs it, and
    returns True. Returns False without touching ``cmake_args`` when
    ``_env_get_bool`` yields None for ``env_name``, i.e. the variable is unset
    or holds a value it does not recognise.
    """
    parsed = _env_get_bool(env_name, None)
    if parsed is not None:
        onoff = _cmake_onoff(parsed)
        cmake_args.append(f"-D{cmake_flag}={onoff}")
        print(f"-- Forward {env_name} -> -D{cmake_flag}={onoff}")
        return True
    return False
|
||||
|
||||
|
||||
def _forward_str_env(cmake_args: list[str], env_name: str, cmake_flag: str) -> bool:
|
||||
v = os.environ.get(env_name)
|
||||
if not v:
|
||||
return False
|
||||
cmake_args.append(f"-D{cmake_flag}={v}")
|
||||
print(f"-- Forward {env_name} -> -D{cmake_flag}={v}")
|
||||
return True
|
||||
|
||||
################################################################################
|
||||
# Helpers
|
||||
################################################################################
|
||||
|
|
@ -204,7 +243,34 @@ class CMakeBuild(build_ext):
|
|||
return True
|
||||
return False
|
||||
|
||||
if os.environ.get("CPUINFER_USE_CUDA") is None:
|
||||
# Locate nvcc executable (without forcing user to set -DCMAKE_CUDA_COMPILER)
|
||||
def find_nvcc_path() -> str | None:
|
||||
cuda_home = os.environ.get("CUDA_HOME")
|
||||
if cuda_home:
|
||||
cand = Path(cuda_home) / "bin" / "nvcc"
|
||||
if cand.exists():
|
||||
return str(cand)
|
||||
which_nvcc = shutil.which("nvcc")
|
||||
if which_nvcc:
|
||||
return which_nvcc
|
||||
# Common fallbacks (ordered by preference)
|
||||
for cand in [
|
||||
"/usr/local/cuda-12.6/bin/nvcc",
|
||||
"/usr/local/cuda/bin/nvcc",
|
||||
"/usr/bin/nvcc",
|
||||
"/usr/lib/nvidia-cuda-toolkit/bin/nvcc",
|
||||
]:
|
||||
if Path(cand).exists():
|
||||
return cand
|
||||
return None
|
||||
|
||||
# Note: We no longer set CMAKE_CUDA_ARCHITECTURES by default.
|
||||
# If users want to specify CUDA archs, they can set env CPUINFER_CUDA_ARCHS
|
||||
# (e.g. "89" or "86;89") or pass it via CMAKE_ARGS.
|
||||
auto_moe_kernel_ = False
|
||||
# Normalize CPUINFER_USE_CUDA: if unset, auto-detect; otherwise respect truthy/falsey values
|
||||
cuda_env = _env_get_bool("CPUINFER_USE_CUDA", None)
|
||||
if cuda_env is None:
|
||||
auto_cuda = detect_cuda_toolkit()
|
||||
os.environ["CPUINFER_USE_CUDA"] = "1" if auto_cuda else "0"
|
||||
print(f"-- CPUINFER_USE_CUDA not set; auto-detected CUDA toolkit: {'YES' if auto_cuda else 'NO'}")
|
||||
|
|
@ -228,56 +294,87 @@ class CMakeBuild(build_ext):
|
|||
print(f"Detected CPU info: {d}")
|
||||
|
||||
# Vendor / feature specific toggles
|
||||
# Enable AMD MoE kernel on AMD by default unless user explicitly set CPUINFER_ENABLE_AMD
|
||||
# temporarily disabled this opt, use llamafile backend for now
|
||||
# if d.get("vendor") == "amd" and os.environ.get("CPUINFER_ENABLE_AMD") is None:
|
||||
# cmake_args.append("-DKTRANSFORMERS_CPU_MOE_AMD=ON")
|
||||
# print("-- Detected AMD CPU; enabling AMD MoE kernel (-DKTRANSFORMERS_CPU_MOE_AMD=ON)")
|
||||
# AMD MoE: explicit env overrides; otherwise default ON on AMD CPU
|
||||
if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_AMD", "KTRANSFORMERS_CPU_MOE_AMD"):
|
||||
if d.get("vendor") == "amd":
|
||||
auto_moe_kernel_ = True
|
||||
cmake_args.append("-DKTRANSFORMERS_CPU_MOE_AMD=ON")
|
||||
print("-- Detected AMD CPU; enabling AMD MoE kernel (-DKTRANSFORMERS_CPU_MOE_AMD=ON)")
|
||||
_forward_str_env(cmake_args, "CPUINFER_BLIS_ROOT", "BLIS_ROOT")
|
||||
|
||||
# On ARM, enable KML by default if not explicitly toggled
|
||||
if d.get("vendor") == "arm" and os.environ.get("CPUINFER_ENABLE_KML") is None:
|
||||
cmake_args.append("-DKTRANSFORMERS_CPU_USE_KML=ON")
|
||||
print("-- Detected ARM CPU; enabling KML (-DKTRANSFORMERS_CPU_USE_KML=ON)")
|
||||
# KML: explicit env overrides; otherwise default ON on ARM
|
||||
if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_KML", "KTRANSFORMERS_CPU_USE_KML"):
|
||||
if d.get("vendor") == "arm":
|
||||
auto_moe_kernel_ = True
|
||||
cmake_args.append("-DKTRANSFORMERS_CPU_USE_KML=ON")
|
||||
print("-- Detected ARM CPU; enabling KML (-DKTRANSFORMERS_CPU_USE_KML=ON)")
|
||||
|
||||
# If AMX or AVX512 present, enable umbrella unless overridden; enable AMX specifically when present
|
||||
if "AMX" in d["features"]:
|
||||
if os.environ.get("CPUINFER_ENABLE_AMX") is None:
|
||||
# AMX: explicit env overrides; else enable if detected
|
||||
if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_AMX", "KTRANSFORMERS_CPU_USE_AMX"):
|
||||
if "AMX" in d["features"]:
|
||||
cmake_args.append("-DKTRANSFORMERS_CPU_USE_AMX=ON")
|
||||
print("-- AMX support detected; enabling (-DKTRANSFORMERS_CPU_USE_AMX=ON)")
|
||||
if ("AMX" in d["features"] or "AVX512" in d["features"]) and os.environ.get(
|
||||
"CPUINFER_ENABLE_AVX512"
|
||||
) is None:
|
||||
cmake_args.append("-DKTRANSFORMERS_CPU_USE_AMX_AVX512=ON")
|
||||
print("-- Enabling AMX/AVX512 umbrella (-DKTRANSFORMERS_CPU_USE_AMX_AVX512=ON)")
|
||||
# AVX512 umbrella: explicit env overrides; else enable if AMX or AVX512 detected
|
||||
if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_AVX512", "KTRANSFORMERS_CPU_USE_AMX_AVX512"):
|
||||
if "AMX" in d["features"] or "AVX512" in d["features"]:
|
||||
cmake_args.append("-DKTRANSFORMERS_CPU_USE_AMX_AVX512=ON")
|
||||
print("-- Enabling AMX/AVX512 umbrella (-DKTRANSFORMERS_CPU_USE_AMX_AVX512=ON)")
|
||||
|
||||
# Auto-enable MOE kernel when the AMD or KML backend is on: either explicitly via env,
|
||||
# or via the vendor auto-detection above (which sets auto_moe_kernel_ when the env is unset).
|
||||
amd_env = _env_get_bool("CPUINFER_ENABLE_AMD", None)
|
||||
kml_env = _env_get_bool("CPUINFER_ENABLE_KML", None)
|
||||
if amd_env or kml_env:
|
||||
auto_moe_kernel_ = True
|
||||
already_set = any("KTRANSFORMERS_CPU_MOE_KERNEL" in a for a in cmake_args)
|
||||
if not already_set and auto_moe_kernel_:
|
||||
cmake_args.append("-DKTRANSFORMERS_CPU_MOE_KERNEL=ON")
|
||||
print("-- Auto-enabling MOE kernel (-DKTRANSFORMERS_CPU_MOE_KERNEL=ON) because CPUINFER_ENABLE_AMD or CPUINFER_ENABLE_KML is ON")
|
||||
|
||||
# Friendly summary
|
||||
print(
|
||||
f"-- CPU detection: vendor={d.get('vendor')} arch={d.get('arch')} features={sorted(list(d.get('features', [])))}"
|
||||
)
|
||||
|
||||
# Optional AMX / MLA toggles (explicit env overrides auto detection above)
|
||||
if os.environ.get("CPUINFER_ENABLE_AMX"):
|
||||
cmake_args.append(f"-DKTRANSFORMERS_CPU_USE_AMX={os.environ['CPUINFER_ENABLE_AMX']}")
|
||||
if os.environ.get("CPUINFER_ENABLE_KML"):
|
||||
cmake_args.append(f"-DKTRANSFORMERS_CPU_USE_KML={os.environ['CPUINFER_ENABLE_KML']}")
|
||||
if os.environ.get("CPUINFER_ENABLE_MLA"):
|
||||
cmake_args.append(f"-DKTRANSFORMERS_CPU_MLA={os.environ['CPUINFER_ENABLE_MLA']}")
|
||||
# MLA toggle (string/boolean allowed)
|
||||
if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_MLA", "KTRANSFORMERS_CPU_MLA"):
|
||||
_forward_str_env(cmake_args, "CPUINFER_ENABLE_MLA", "KTRANSFORMERS_CPU_MLA")
|
||||
|
||||
# LTO toggles if user added them in CMakeLists
|
||||
if os.environ.get("CPUINFER_ENABLE_LTO"):
|
||||
cmake_args.append(f"-DCPUINFER_ENABLE_LTO={os.environ['CPUINFER_ENABLE_LTO']}")
|
||||
if os.environ.get("CPUINFER_LTO_JOBS"):
|
||||
cmake_args.append(f"-DCPUINFER_LTO_JOBS={os.environ['CPUINFER_LTO_JOBS']}")
|
||||
if os.environ.get("CPUINFER_LTO_MODE"):
|
||||
cmake_args.append(f"-DCPUINFER_LTO_MODE={os.environ['CPUINFER_LTO_MODE']}")
|
||||
# LTO toggles
|
||||
_forward_bool_env(cmake_args, "CPUINFER_ENABLE_LTO", "CPUINFER_ENABLE_LTO")
|
||||
_forward_str_env(cmake_args, "CPUINFER_LTO_JOBS", "CPUINFER_LTO_JOBS")
|
||||
_forward_str_env(cmake_args, "CPUINFER_LTO_MODE", "CPUINFER_LTO_MODE")
|
||||
|
||||
# GPU backends (mutually exclusive expected)
|
||||
if os.environ.get("CPUINFER_USE_CUDA") == "1":
|
||||
if _env_get_bool("CPUINFER_USE_CUDA", False):
|
||||
cmake_args.append("-DKTRANSFORMERS_USE_CUDA=ON")
|
||||
print("-- Enabling CUDA backend (-DKTRANSFORMERS_USE_CUDA=ON)")
|
||||
if os.environ.get("CPUINFER_USE_ROCM") == "1":
|
||||
# Inject nvcc compiler path automatically unless user already specified one.
|
||||
user_specified_compiler = any("CMAKE_CUDA_COMPILER" in a for a in cmake_args)
|
||||
if not user_specified_compiler:
|
||||
extra_env = os.environ.get("CMAKE_ARGS", "")
|
||||
if "CMAKE_CUDA_COMPILER" in extra_env:
|
||||
user_specified_compiler = True
|
||||
if not user_specified_compiler:
|
||||
nvcc_path = find_nvcc_path()
|
||||
if nvcc_path:
|
||||
cmake_args.append(f"-DCMAKE_CUDA_COMPILER={nvcc_path}")
|
||||
print(f"-- Auto-detected nvcc: {nvcc_path} (adding -DCMAKE_CUDA_COMPILER)")
|
||||
else:
|
||||
print("-- Warning: nvcc not found via CUDA_HOME/PATH/common prefixes; CUDA configure may fail.")
|
||||
# Optional host compiler for nvcc if user set CUDAHOSTCXX
|
||||
if os.environ.get("CUDAHOSTCXX"):
|
||||
hostcxx = os.environ["CUDAHOSTCXX"]
|
||||
cmake_args.append(f"-DCMAKE_CUDA_HOST_COMPILER={hostcxx}")
|
||||
print(f"-- Using CUDA host compiler from CUDAHOSTCXX: {hostcxx}")
|
||||
# Respect user-provided architectures only (no default auto-detection).
|
||||
archs_env = os.environ.get("CPUINFER_CUDA_ARCHS", "").strip()
|
||||
if archs_env and not any("CMAKE_CUDA_ARCHITECTURES" in a for a in cmake_args):
|
||||
cmake_args.append(f"-DCMAKE_CUDA_ARCHITECTURES={archs_env}")
|
||||
print(f"-- Set CUDA architectures from CPUINFER_CUDA_ARCHS: {archs_env}")
|
||||
if _env_get_bool("CPUINFER_USE_ROCM", False):
|
||||
cmake_args.append("-DKTRANSFORMERS_USE_ROCM=ON")
|
||||
if os.environ.get("CPUINFER_USE_MUSA") == "1":
|
||||
if _env_get_bool("CPUINFER_USE_MUSA", False):
|
||||
cmake_args.append("-DKTRANSFORMERS_USE_MUSA=ON")
|
||||
|
||||
# Respect user extra CMAKE_ARGS (space separated)
|
||||
|
|
@ -286,7 +383,7 @@ class CMakeBuild(build_ext):
|
|||
cmake_args += [a for a in extra.split() if a]
|
||||
|
||||
# Force rebuild? (delete cache)
|
||||
if os.environ.get("CPUINFER_FORCE_REBUILD") == "1":
|
||||
if _env_get_bool("CPUINFER_FORCE_REBUILD", True):
|
||||
cache = build_temp / "CMakeCache.txt"
|
||||
if cache.exists():
|
||||
cache.unlink()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue