diff --git a/kt-kernel/CMakeLists.txt b/kt-kernel/CMakeLists.txt index 438c43d3..1192b4cf 100644 --- a/kt-kernel/CMakeLists.txt +++ b/kt-kernel/CMakeLists.txt @@ -495,7 +495,7 @@ if(NOT DEFINED CLANG_FORMAT_BIN) ) endif() if(NOT CLANG_FORMAT_BIN) - message(WARNING "clang-format not found. Please install clang-format (>=18) or pass -DCLANG_FORMAT_BIN=/full/path and reconfigure.") + message(WARNING "ONLY for developer: clang-format not found. Please install clang-format (>=18) or pass -DCLANG_FORMAT_BIN=/full/path and reconfigure.") else() execute_process( COMMAND ${CLANG_FORMAT_BIN} --version diff --git a/kt-kernel/CMakePresets.json b/kt-kernel/CMakePresets.json index 1c3b00b9..c8db5fc8 100644 --- a/kt-kernel/CMakePresets.json +++ b/kt-kernel/CMakePresets.json @@ -39,6 +39,20 @@ "KTRANSFORMERS_CPU_USE_AMX_AVX512": "ON", "KTRANSFORMERS_USE_CUDA": "ON" } + }, + { + "name": "amd", + "displayName": "amd_platform", + "description": "for amd platform", + "cacheVariables": { + "KTRANSFORMERS_CPU_USE_AMX": "OFF", + "LLAMA_AVX512": "OFF", + "LLAMA_AVX2": "ON", + "KTRANSFORMERS_CPU_USE_AMX_AVX512": "OFF", + "KTRANSFORMERS_USE_CUDA": "ON", + "KTRANSFORMERS_CPU_MOE_AMD": "ON", + "KTRANSFORMERS_CPU_MOE_KERNEL": "ON" + } } ] diff --git a/kt-kernel/README.md b/kt-kernel/README.md index 2e373e3b..2f4f6cfa 100644 --- a/kt-kernel/README.md +++ b/kt-kernel/README.md @@ -2,41 +2,32 @@ High-performance kernel operations for KTransformers, featuring CPU-optimized MoE inference with AMX, AVX, KML and blis (amd library) support. -- [KT-Kernel](#kt-kernel) - - [Note](#note) - - [Features](#features) - - [Installation](#installation) - - [Prerequisites](#prerequisites) - - [Quick Installation (Recommended)](#quick-installation-recommended) - - [Manual Configuration (Advanced)](#manual-configuration-advanced) - - [Verification](#verification) - - [Integration with SGLang](#integration-with-sglang) - - [Installation Steps](#installation-steps) - - [1. Install SGLang](#1-install-sglang) - - [2. Prepare Weights](#2-prepare-weights) - - [3. Launch SGLang Server](#3-launch-sglang-server) - - [Complete Example: Qwen3-30B-A3B](#complete-example-qwen3-30b-a3b) - - [Option A: AMX Backend (AMXINT8)](#option-a-amx-backend-amxint8) - - [Option B: LLAMAFILE Backend (GGUF)](#option-b-llamafile-backend-gguf) - - [KT-Kernel Parameters](#kt-kernel-parameters) - - [Direct Python API Usage](#direct-python-api-usage) - - [Advanced Options](#advanced-options) - - [Build Configuration](#build-configuration) - - [Manual Installation](#manual-installation) - - [1. Install System Dependencies](#1-install-system-dependencies) - - [2. Set Build Configuration](#2-set-build-configuration) - - [3. 
Build and Install](#3-build-and-install) - - [Error Troubleshooting](#error-troubleshooting) - - [CUDA Not Found](#cuda-not-found) - - [hwloc Not Found](#hwloc-not-found) - - [Weight Quantization](#weight-quantization) - - [Before Commit!](#before-commit) +- [Note](#note) +- [Features](#features) +- [Installation](#installation) + - [Prerequisites](#prerequisites) + - [Quick Installation (Recommended)](#quick-installation-recommended) + - [Manual Configuration (Advanced)](#manual-configuration-advanced) +- [Verification](#verification) +- [Integration with SGLang](#integration-with-sglang) + - [Installation Steps](#installation-steps) + - [Complete Example: Qwen3-30B-A3B](#complete-example-qwen3-30b-a3b) + - [KT-Kernel Parameters](#kt-kernel-parameters) +- [Direct Python API Usage](#direct-python-api-usage) + - [Advanced Options](#advanced-options) +- [Build Configuration](#build-configuration) + - [Manual Installation](#manual-installation) +- [Error Troubleshooting](#error-troubleshooting) + - [CUDA Not Found](#cuda-not-found) + - [hwloc Not Found](#hwloc-not-found) +- [Weight Quantization](#weight-quantization) +- [Before Commit!](#before-commit) ## Note **Current Support Status:** - ✅ **Intel CPUs with AMX**: Fully supported (using weights converted to INT4/INT8 format) - ✅ **Universal CPU (llamafile backend)**: Supported (using GGUF-format weights) -- ⚠️ **AMD CPUs with BLIS**: In progress, not yet fully integrated +- ✅ **AMD CPUs with BLIS**: Supported (for int8 prefill & decode) ## Features @@ -145,7 +136,7 @@ python scripts/convert_cpu_weights.py \ --input-path /path/to/model \ --input-type bf16 \ --output /path/to/cpu-weights \ - --quant-method int8 # or int4 + --quant-method int8 # or int4 or moe_int8 (for amd now) ``` - `--input-path`: Path to GPU-side original weights diff --git a/kt-kernel/README_zh.md b/kt-kernel/README_zh.md index a4a7cadc..95e781f2 100644 --- a/kt-kernel/README_zh.md +++ b/kt-kernel/README_zh.md @@ -2,42 +2,33 @@ 高性能 KTransformers 内核库,提供面向 CPU 的高效 MoE 推理内核,支持 AMX 和 AVX 等后端。 -- [KT-Kernel](#kt-kernel) - - [说明](#说明) - - [特性](#特性) - - [安装](#安装) - - [先决条件](#先决条件) - - [快速安装(推荐)](#快速安装推荐) - - [手动配置(进阶)](#手动配置进阶) - - [验证安装](#验证安装) - - [与 SGLang 集成](#与-sglang-集成) - - [安装步骤](#安装步骤) - - [1. 安装 SGLang](#1-安装-sglang) - - [2. 准备权重](#2-准备权重) - - [3. 启动 SGLang Server](#3-启动-sglang-server) - - [完整示例:Qwen3-30B-A3B](#完整示例qwen3-30b-a3b) - - [方案 A:AMX 后端(AMXINT8)](#方案-aamx-后端amxint8) - - [方案 B:LLAMAFILE 后端(GGUF)](#方案-bllamafile-后端gguf) - - [KT-Kernel 参数](#kt-kernel-参数) - - [直接使用 Python API](#直接使用-python-api) - - [高级选项](#高级选项) - - [构建配置](#构建配置) - - [手动安装](#手动安装) - - [1. 安装系统依赖](#1-安装系统依赖) - - [2. 配置构建参数](#2-配置构建参数) - - [3. 
构建并安装](#3-构建并安装) - - [错误排查](#错误排查) - - [找不到 CUDA](#找不到-cuda) - - [找不到 hwloc](#找不到-hwloc) - - [权重量化](#权重量化) - - [提交前必读](#提交前必读) +- [说明](#说明) +- [特性](#特性) +- [安装](#安装) + - [先决条件](#先决条件) + - [快速安装(推荐)](#快速安装推荐) + - [手动配置(进阶)](#手动配置进阶) +- [验证安装](#验证安装) +- [与 SGLang 集成](#与-sglang-集成) + - [安装步骤](#安装步骤) + - [完整示例:Qwen3-30B-A3B](#完整示例qwen3-30b-a3b) + - [KT-Kernel 参数](#kt-kernel-参数) +- [直接使用 Python API](#直接使用-python-api) + - [高级选项](#高级选项) +- [构建配置](#构建配置) + - [手动安装](#手动安装) +- [错误排查](#错误排查) + - [找不到 CUDA](#找不到-cuda) + - [找不到 hwloc](#找不到-hwloc) +- [权重量化](#权重量化) +- [提交前必读](#提交前必读) ## 说明 **当前支持状态:** - ✅ **带 AMX 的 Intel CPU**:已支持(基于转换为 INT4/INT8 格式的权重) - ✅ **通用 CPU(llamafile 后端)**:已支持(基于 GGUF 格式的权重) -- ⚠️ **带 BLIS 的 AMD CPU**:进行中,尚未完全集成 +- ✅ **带 BLIS 的 AMD CPU**:已支持(int8 的 prefill 和 decode) ## 特性 @@ -149,7 +140,7 @@ python scripts/convert_cpu_weights.py \ --input-path /path/to/model \ --input-type bf16 \ --output /path/to/cpu-weights \ - --quant-method int8 # 或 int4 + --quant-method int8 # 或 int4 或 moe_int8(用于 amd 的) ``` - `--input-path`:GPU 侧原始权重路径 diff --git a/kt-kernel/operators/amx/test/mmq-test.cpp b/kt-kernel/operators/amx/test/mmq-test.cpp index 2a35ae2b..c7dc43a8 100644 --- a/kt-kernel/operators/amx/test/mmq-test.cpp +++ b/kt-kernel/operators/amx/test/mmq-test.cpp @@ -2376,9 +2376,7 @@ bool ggml_compute_forward_mul_mat_use_amx(struct ggml_tensor* dst) { static thread_local bool is_first_time = true; if (is_first_time) { #pragma omp single - { - ggml_amx_init(); - } + { ggml_amx_init(); } // load tile config ggml_tile_config_init(); diff --git a/kt-kernel/operators/amx/test/mmq.cpp b/kt-kernel/operators/amx/test/mmq.cpp index 0f446cc6..8e9f296d 100644 --- a/kt-kernel/operators/amx/test/mmq.cpp +++ b/kt-kernel/operators/amx/test/mmq.cpp @@ -2372,9 +2372,7 @@ bool ggml_compute_forward_mul_mat_use_amx(struct ggml_tensor* dst) { static thread_local bool is_first_time = true; if (is_first_time) { #pragma omp single - { - ggml_amx_init(); - } + { ggml_amx_init(); } // load tile config ggml_tile_config_init(); diff --git a/kt-kernel/operators/llamafile/mla.hpp b/kt-kernel/operators/llamafile/mla.hpp index 7ac23242..03f70c4d 100644 --- a/kt-kernel/operators/llamafile/mla.hpp +++ b/kt-kernel/operators/llamafile/mla.hpp @@ -14,15 +14,15 @@ // #include // #include -// #define DIRECT_OR_POOL_BY(what, threshold, var, fn) \ -// do { \ -// if ((what) < (threshold)) { \ -// for (int i = 0; i < (var); i++) { \ -// (fn)(i); \ -// } \ -// } else { \ -// pool->do_work_stealing_job((var), nullptr, (fn), nullptr); \ -// } \ +// #define DIRECT_OR_POOL_BY(what, threshold, var, fn) \ +// do { \ +// if ((what) < (threshold)) { \ +// for (int i = 0; i < (var); i++) { \ +// (fn)(i); \ +// } \ +// } else { \ +// pool->do_work_stealing_job((var), nullptr, (fn), nullptr); \ +// } \ // } while (0) // #define VEC_DOT_TYPE(type) (ggml_internal_get_type_traits((ggml_type)(type)).vec_dot_type) @@ -31,19 +31,20 @@ // #define QUANT_OFFSET(ptr, type, n, n_elements) \ // (offset_pointer((ptr), (size_t)(n) * QUANT_BLCK_SIZE((n_elements), (type)))) -// #define LLAMAFILE_SGEMM_QUANT_FULL_MATMUL(m, n, k, a, a_type, b, b_col, c, c_col) \ -// do { \ -// llamafile_sgemm((m), (n), QUANT_BLCK_COUNT((k), (a_type)), (a), QUANT_BLCK_COUNT((k), (a_type)), \ -// QUANT_OFFSET((b), VEC_DOT_TYPE((a_type)), (b_col), (k)), \ -// QUANT_BLCK_COUNT((k), VEC_DOT_TYPE((a_type))), offset_pointer((c), (c_col) * (m) * sizeof(float)), \ -// (k), 0, 1, GGML_TASK_TYPE_COMPUTE, (a_type), VEC_DOT_TYPE((a_type)), GGML_TYPE_F32, \ -// GGML_PREC_DEFAULT); \ +// #define 
LLAMAFILE_SGEMM_QUANT_FULL_MATMUL(m, n, k, a, a_type, b, b_col, c, c_col) \ +// do { \ +// llamafile_sgemm((m), (n), QUANT_BLCK_COUNT((k), (a_type)), (a), QUANT_BLCK_COUNT((k), (a_type)), \ +// QUANT_OFFSET((b), VEC_DOT_TYPE((a_type)), (b_col), (k)), \ +// QUANT_BLCK_COUNT((k), VEC_DOT_TYPE((a_type))), offset_pointer((c), (c_col) * (m) * +// sizeof(float)), \ +// (k), 0, 1, GGML_TASK_TYPE_COMPUTE, (a_type), VEC_DOT_TYPE((a_type)), GGML_TYPE_F32, \ +// GGML_PREC_DEFAULT); \ // } while (0) -// #define LLAMAFILE_SGEMM_MATMUL_F32(m, n, k, a, lda, b, ldb, c, ldc) \ -// do { \ -// llamafile_sgemm((m), (n), (k), (a), (lda), (b), (ldb), (c), (ldc), 0, 1, GGML_TASK_TYPE_COMPUTE, GGML_TYPE_F32, \ -// GGML_TYPE_F32, GGML_TYPE_F32, GGML_PREC_DEFAULT); \ +// #define LLAMAFILE_SGEMM_MATMUL_F32(m, n, k, a, lda, b, ldb, c, ldc) \ +// do { \ +// llamafile_sgemm((m), (n), (k), (a), (lda), (b), (ldb), (c), (ldc), 0, 1, GGML_TASK_TYPE_COMPUTE, GGML_TYPE_F32, \ +// GGML_TYPE_F32, GGML_TYPE_F32, GGML_PREC_DEFAULT); \ // } while (0) // // bool decide_absorb(size_t a,int a_type,size_t b,int b_type,size_t c,int c_type,size_t d,int d_type){ diff --git a/kt-kernel/operators/moe_kernel/la/kernel.hpp b/kt-kernel/operators/moe_kernel/la/kernel.hpp index 34d55fc0..bc685e38 100644 --- a/kt-kernel/operators/moe_kernel/la/kernel.hpp +++ b/kt-kernel/operators/moe_kernel/la/kernel.hpp @@ -340,7 +340,7 @@ struct GemmKernelInt8 { static inline const int PACK_SIZE_M = 8; static inline const int PACK_SIZE_K = 32; - static std::string name() { return "INT8"; } + static std::string name() { return "MOE_INT8"; } static int recommended_nth(int n) { return (n + N_BLOCK - 1) / N_BLOCK; } // type_: d for decode, p for prefill static int recommended_nth_down(int n, char type_ = 'd') { @@ -833,7 +833,7 @@ struct GemmKernelInt4 { static inline const int PACK_SIZE_K = 32; static inline const int PACK_SIZE_M = 8; - static std::string name() { return "INT4"; } + static std::string name() { return "MOE_INT4"; } static int recommended_nth(int n) { return (n + N_BLOCK - 1) / N_BLOCK; } static int recommended_nth_down(int n, char type_ = 'd') { diff --git a/kt-kernel/operators/moe_kernel/moe.hpp b/kt-kernel/operators/moe_kernel/moe.hpp index c5d3acbc..b866739b 100644 --- a/kt-kernel/operators/moe_kernel/moe.hpp +++ b/kt-kernel/operators/moe_kernel/moe.hpp @@ -12,8 +12,8 @@ #include #include +#include "../../cpu_backend/shared_mem_buffer.h" #include "../common.hpp" -#include "../cpu_backend/shared_mem_buffer.h" #include "../moe-tp.hpp" #include "api/common.h" #include "api/mat_kernel.h" @@ -57,6 +57,9 @@ class MOE_KERNEL_TP std::vector> down_bb_; std::vector> down_bc_; + std::vector gate_up_owner_ptr_; + std::vector down_owner_ptr_; + inline void write_weights(std::filesystem::path prefix, std::string mat_class, char* bb, int expert_idx, size_t size, size_t scale_size) { // printf("expert %d, size %ld, scale size %ld\n", expert_idx, size, scale_size); @@ -182,6 +185,7 @@ class MOE_KERNEL_TP down_ba_.push_back(std::make_shared(config_.max_len, config_.intermediate_size, nullptr)); down_bc_.push_back(std::make_shared(config_.max_len, config_.hidden_size, nullptr)); void* gate_up_down_per_exp_ptr = std::aligned_alloc(64, gate_up_exp_size); + gate_up_owner_ptr_.push_back(gate_up_down_per_exp_ptr); gate_bb_.push_back(std::make_shared(config_.intermediate_size, config_.hidden_size, gate_up_down_per_exp_ptr, PACKED, 'u', PLAIN)); @@ -193,6 +197,7 @@ class MOE_KERNEL_TP void* down_bb_ptr = std::aligned_alloc( 64, 
T::BufferB::required_size(config_.hidden_size, config_.intermediate_size, PACKED, 'd', PLAIN)); + down_owner_ptr_.push_back(down_bb_ptr); down_bb_.push_back(std::make_shared(config_.hidden_size, config_.intermediate_size, down_bb_ptr, PACKED, 'd', PLAIN)); } @@ -220,27 +225,41 @@ class MOE_KERNEL_TP ~MOE_KERNEL_TP() { // printf(" Destroying KML_MOE_TP %lx\n", (intptr_t)(this)); + for (void* ptr : gate_up_owner_ptr_) { + std::free(ptr); + } + for (void* ptr : down_owner_ptr_) { + std::free(ptr); + } } void load_weights() { auto pool = config_.pool->get_subpool(tp_part_idx); const uint64_t* physical_to_logical_map = (const uint64_t*)config_.physical_to_logical_map; if (config_.gate_projs.size()) { + printf("load from safetensor"); pool->do_work_stealing_job( config_.expert_num, nullptr, [this, physical_to_logical_map](int expert_id) { uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_id); { size_t scale_size = config_.intermediate_size * sizeof(float); - size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size) - scale_size; + size_t whole_size_ = + T::BufferB::required_size(config_.intermediate_size, config_.hidden_size, PACKED, 'u', PLAIN); + size_t size = whole_size_ - scale_size; + void* dst_ = PLAIN ? gate_bb_[expert_id]->b : gate_bb_[expert_id]->b_pack[0]; - memcpy(gate_bb_[expert_id]->b, config_.gate_projs[tp_part_idx][logical_expert_id], size); + memcpy(dst_, config_.gate_projs[tp_part_idx][logical_expert_id], size); if constexpr (T::BufferB::SCALE) { memcpy(gate_bb_[expert_id]->d, config_.gate_scales[tp_part_idx][logical_expert_id], scale_size); } - memcpy(up_bb_[expert_id]->b, config_.up_projs[tp_part_idx][logical_expert_id], size); + whole_size_ = + T::BufferB::required_size(config_.intermediate_size, config_.hidden_size, PACKED, 'u', PLAIN); + size = whole_size_ - scale_size; + dst_ = PLAIN ? up_bb_[expert_id]->b : up_bb_[expert_id]->b_pack[0]; + memcpy(dst_, config_.up_projs[tp_part_idx][logical_expert_id], size); if constexpr (T::BufferB::SCALE) { memcpy(up_bb_[expert_id]->d, config_.up_scales[tp_part_idx][logical_expert_id], scale_size); @@ -249,9 +268,11 @@ class MOE_KERNEL_TP { size_t scale_size = config_.hidden_size * sizeof(float); - size_t size = T::BufferB::required_size(config_.hidden_size, config_.intermediate_size) - scale_size; - - memcpy(down_bb_[expert_id]->b, config_.down_projs[tp_part_idx][logical_expert_id], size); + size_t whole_size_ = + T::BufferB::required_size(config_.hidden_size, config_.intermediate_size, PACKED, 'd', PLAIN); + size_t size = whole_size_ - scale_size; + void* dst_ = PLAIN ? down_bb_[expert_id]->b : down_bb_[expert_id]->b_pack[0]; + memcpy(dst_, config_.down_projs[tp_part_idx][logical_expert_id], size); if constexpr (T::BufferB::SCALE) { memcpy(down_bb_[expert_id]->d, config_.down_scales[tp_part_idx][logical_expert_id], scale_size); @@ -269,21 +290,22 @@ class MOE_KERNEL_TP uint8_t mat_class = (task_id % (mat_type_all * mat_split)) / mat_split; uint8_t mat_split_idex = task_id % mat_split; uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx); + void* src_; if (mat_class == 0) { // the up matrix - size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size); + src_ = PLAIN ? 
up_bb_[expert_idx]->b : up_bb_[expert_idx]->b_pack[0]; + size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size, PACKED, 'u', PLAIN); size_t scale_size = config_.intermediate_size * sizeof(float); - read_weights(prefix, "_up_", (char*)up_bb_[expert_idx]->b, logical_expert_id, size, scale_size, mat_split, - mat_split_idex); + read_weights(prefix, "_up_", (char*)src_, logical_expert_id, size, scale_size, mat_split, mat_split_idex); } else if (mat_class == 1) { - size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size); + void* src_ = PLAIN ? gate_bb_[expert_idx]->b : gate_bb_[expert_idx]->b_pack[0]; + size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size, PACKED, 'u', PLAIN); size_t scale_size = config_.intermediate_size * sizeof(float); - read_weights(prefix, "_gate_", (char*)gate_bb_[expert_idx]->b, logical_expert_id, size, scale_size, - mat_split, mat_split_idex); + read_weights(prefix, "_gate_", (char*)src_, logical_expert_id, size, scale_size, mat_split, mat_split_idex); } else { - size_t size = T::BufferB::required_size(config_.hidden_size, config_.intermediate_size); + void* src_ = PLAIN ? down_bb_[expert_idx]->b : down_bb_[expert_idx]->b_pack[0]; + size_t size = T::BufferB::required_size(config_.hidden_size, config_.intermediate_size, PACKED, 'd', PLAIN); size_t scale_size = config_.hidden_size * sizeof(float); - read_weights(prefix, "_down_", (char*)down_bb_[expert_idx]->b, logical_expert_id, size, scale_size, - mat_split, mat_split_idex); + read_weights(prefix, "_down_", (char*)src_, logical_expert_id, size, scale_size, mat_split, mat_split_idex); } } } @@ -342,17 +364,20 @@ class MOE_KERNEL_TP expert_idx = expert_map(physical_to_logical_map, expert_idx); uint8_t mat_class = task_id % mat_type_all; if (mat_class == 0) { // the up matrix - size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size); + size_t size = + T::BufferB::required_size(config_.intermediate_size, config_.hidden_size, PACKED, 'u', PLAIN); size_t scale_size = config_.intermediate_size * sizeof(float); - write_weights(prefix, "_up_", (char*)up_bb_[expert_idx]->b, expert_idx, size, scale_size); + write_weights(prefix, "_up_", (char*)up_bb_[expert_idx]->b_pack[0], expert_idx, size, scale_size); } else if (mat_class == 1) { - size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size); + size_t size = + T::BufferB::required_size(config_.intermediate_size, config_.hidden_size, PACKED, 'u', PLAIN); size_t scale_size = config_.intermediate_size * sizeof(float); - write_weights(prefix, "_gate_", (char*)gate_bb_[expert_idx]->b, expert_idx, size, scale_size); + write_weights(prefix, "_gate_", (char*)gate_bb_[expert_idx]->b_pack[0], expert_idx, size, scale_size); } else if (mat_class == 2) { - size_t size = T::BufferB::required_size(config_.hidden_size, config_.intermediate_size); + size_t size = + T::BufferB::required_size(config_.hidden_size, config_.intermediate_size, PACKED, 'd', PLAIN); size_t scale_size = config_.hidden_size * sizeof(float); - write_weights(prefix, "_down_", (char*)down_bb_[expert_idx]->b, expert_idx, size, scale_size); + write_weights(prefix, "_down_", (char*)down_bb_[expert_idx]->b_pack[0], expert_idx, size, scale_size); } }, nullptr); @@ -432,6 +457,9 @@ class MOE_KERNEL_TP } for (int i = 0; i < qlen; i++) { for (int j = 0; j < k; j++) { + if (expert_ids[i * k + j] < config_.num_gpu_experts || expert_ids[i * k + j] >= config_.expert_num) { + 
continue; + } m_local_pos_[i][j] = m_local_num_[expert_ids[i * k + j]]++; } } @@ -460,6 +488,9 @@ class MOE_KERNEL_TP // Copy inputs into expert-local buffers MOE_DIRECT_OR_POOL_BY_VAR(qlen, [&](int i) { for (int j = 0; j < k; j++) { + if (expert_ids[i * k + j] < config_.num_gpu_experts || expert_ids[i * k + j] >= config_.expert_num) { + continue; + } memcpy(m_local_input_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size, (input_t*)input + i * config_.hidden_size, sizeof(input_t) * config_.hidden_size); } @@ -608,6 +639,10 @@ class MOE_KERNEL_TP for (int e = e_start; e < e_end; e++) { float sum = 0; for (int j = 0; j < k; j++) { + if (expert_ids[q_idx * k + j] < config_.num_gpu_experts || + expert_ids[q_idx * k + j] >= config_.expert_num) { + continue; + } sum += weights[q_idx * k + j] * ((float*)m_local_down_output_ptr_[expert_ids[q_idx * k + j]]) [m_local_pos_[q_idx][j] * config_.hidden_size + e]; } @@ -691,6 +726,10 @@ class TP_MOE> : public TP_MOE_Common> { delete[] (ggml_bf16_t*)(tpc.up_proj); delete[] (ggml_bf16_t*)(tpc.down_proj); } + if (config.save) { + // free the bf16 weights after saving + tps.clear(); + } this->weights_loaded = true; } else if (config.path != "") { @@ -702,17 +741,22 @@ class TP_MOE> : public TP_MOE_Common> { } } - void merge_results(int qlen, void* output) { + void merge_results(int qlen, void* output, bool incremental) { // #ifdef FORWARD_TIME_PROFILE // forward_perf_start(); // #endif auto pool = this->config.pool; - auto merge_fn = [this, output](int token_nth) { + auto merge_fn = [this, output, incremental](int token_nth) { auto& local_output_numa = this->local_output_numa; auto& tp_configs = this->tp_configs; auto& tp_count = this->tp_count; auto& config = this->config; float* merge_to = local_output_numa[0] + token_nth * tp_configs[0].hidden_size; + if (incremental) { + for (int e = 0; e < config.hidden_size; e++) { + merge_to[e] += ggml_bf16_to_fp32(((ggml_bf16_t*)output + token_nth * config.hidden_size)[e]); + } + } for (int i = 1; i < tp_count; i++) { float* merge_from = local_output_numa[i] + token_nth * tp_configs[i].hidden_size; @@ -750,6 +794,8 @@ class TP_MOE> : public TP_MOE_Common> { // perf_report(); // #endif } + + void merge_results(int qlen, void* output) { merge_results(qlen, output, false); } }; #endif \ No newline at end of file diff --git a/kt-kernel/python/experts.py b/kt-kernel/python/experts.py index 55fb4915..78807eeb 100644 --- a/kt-kernel/python/experts.py +++ b/kt-kernel/python/experts.py @@ -19,6 +19,7 @@ from .experts_base import BaseMoEWrapper, KExpertsCPUBuffer # Import backend implementations from .utils.amx import AMXMoEWrapper from .utils.llamafile import LlamafileMoEWrapper +from .utils.moe_kernel import GeneralMoEWrapper class KTMoEWrapper: @@ -76,7 +77,7 @@ class KTMoEWrapper: chunked_prefill_size: Maximum prefill chunk size cpu_save: Whether to save weights to CPU memory max_deferred_experts_per_token: Number of experts per token to defer. Defaults to 0. 
- method: Backend method ("AMXINT4", "AMXINT8", "LLAMAFILE") + method: Backend method ("AMXINT4", "AMXINT8", "LLAMAFILE", "MOE_INT4", "MOE_INT8") Returns: An instance of the appropriate backend implementation (e.g., AMXMoEWrapper) @@ -86,6 +87,8 @@ class KTMoEWrapper: backend_cls = AMXMoEWrapper elif method == "LLAMAFILE": backend_cls = LlamafileMoEWrapper + elif method in ["MOE_INT4", "MOE_INT8"]: + backend_cls = GeneralMoEWrapper else: raise NotImplementedError(f"Unsupported method: {method}") diff --git a/kt-kernel/python/utils/moe_kernel.py b/kt-kernel/python/utils/moe_kernel.py new file mode 100644 index 00000000..fa3b3d0c --- /dev/null +++ b/kt-kernel/python/utils/moe_kernel.py @@ -0,0 +1,315 @@ +import os +import torch +import ctypes + +# Use relative imports for package structure +from ..experts_base import BaseMoEWrapper +from .loader import SafeTensorLoader +from kt_kernel_ext.moe import MOEConfig + +try: + from kt_kernel_ext.moe import Int8_KERNEL_MOE + + _HAS_INT8_SUPPORT = True +except (ImportError, AttributeError): + Int8_KERNEL_MOE = None + _HAS_INT8_SUPPORT = False +try: + from kt_kernel_ext.moe import Int4_KERNEL_MOE + + _HAS_INT4_SUPPORT = True +except (ImportError, AttributeError): + Int4_KERNEL_MOE = None + _HAS_INT4_SUPPORT = False + +from typing import Optional + + +class GeneralMoEWrapper(BaseMoEWrapper): + """ + moe-based MoE wrapper implementation. + Supports MOE_INT4 and MOE_INT8 quantization methods. + """ + + _safetensor_loader_instance = None # Singleton SafeTensorLoader + + def __init__( + self, + layer_idx: int, + num_experts: int, + num_experts_per_tok: int, + hidden_size: int, + moe_intermediate_size: int, + num_gpu_experts: int, + cpuinfer_threads: int, + threadpool_count: int, + weight_path: str, + chunked_prefill_size: int, + cpu_save: bool = False, + max_deferred_experts_per_token: Optional[int] = None, + method: str = "MOE_INT8", + ): + """ + Initialize general MoE Wrapper. + + Args: + layer_idx: Layer index + num_experts: Total number of experts + num_experts_per_tok: Number of experts per token (top-k) + hidden_size: Hidden dimension size + moe_intermediate_size: MoE intermediate size + num_gpu_experts: Number of experts to run on GPU + cpuinfer_threads: Number of CPU inference threads + threadpool_count: Number of NUMA subpools + weight_path: Path to weights (SafeTensor format) + chunked_prefill_size: Maximum prefill chunk size + cpu_save: Whether to save weights to CPU memory + max_deferred_experts_per_token: Number of experts per token to defer. Defaults to 0. + method: general quantization method ("MOE_INT4" or "MOE_INT8") + """ + if not _HAS_INT4_SUPPORT and method == "MOE_INT4": + raise RuntimeError( + "MoE_INT4 backend not available. kt_kernel_ext was not compiled with int4 support.\n" + "Please recompile with int4 enabled." + ) + if not _HAS_INT8_SUPPORT and method == "MOE_INT8": + raise RuntimeError( + "MoE_INT8 backend not available. kt_kernel_ext was not compiled with int8 support.\n" + "Please recompile with int8 enabled." 
+ ) + + # Initialize base class + super().__init__( + layer_idx=layer_idx, + num_experts=num_experts, + num_experts_per_tok=num_experts_per_tok, + hidden_size=hidden_size, + moe_intermediate_size=moe_intermediate_size, + num_gpu_experts=num_gpu_experts, + cpuinfer_threads=cpuinfer_threads, + threadpool_count=threadpool_count, + weight_path=weight_path, + chunked_prefill_size=chunked_prefill_size, + cpu_save=cpu_save, + max_deferred_experts_per_token=max_deferred_experts_per_token, + method=method, + ) + + # moe-specific: Check if we should load merged safetensor weights + self.load_merged_weight = False + import glob + + if glob.glob(os.path.join(weight_path, "*.safetensors")): + self.load_merged_weight = True + + # Initialize SafeTensor loader (singleton) + if self.load_merged_weight: + if GeneralMoEWrapper._safetensor_loader_instance is None: + GeneralMoEWrapper._safetensor_loader_instance = SafeTensorLoader(weight_path) + self.safetensor_loader = GeneralMoEWrapper._safetensor_loader_instance + + # moe-specific weight storage + self.gate_weights = None + self.up_weights = None + self.down_weights = None + self.gate_scales = None + self.up_scales = None + self.down_scales = None + + def load_weights_from_tensors( + self, + gate_proj: torch.Tensor, + up_proj: torch.Tensor, + down_proj: torch.Tensor, + physical_to_logical_map_cpu: torch.Tensor, + ): + """ + Load and quantize weights from BF16/FP16 tensors (online quantization). + + Args: + gate_proj: Gate projection weights [num_experts, intermediate_size, hidden_size] + up_proj: Up projection weights [num_experts, intermediate_size, hidden_size] + down_proj: Down projection weights [num_experts, hidden_size, intermediate_size] + physical_to_logical_map_cpu: Mapping from physical to logical expert IDs + """ + # Store tensors as instance variables to keep them alive + self.gate_proj = gate_proj.contiguous() + self.up_proj = up_proj.contiguous() + self.down_proj = down_proj.contiguous() + + # Configure MoE with online quantization (cpu_save mode) + moe_config = MOEConfig( + self.num_experts, + self.num_experts_per_tok, + self.hidden_size, + self.moe_intermediate_size, + self.num_gpu_experts, + ) + moe_config.layer_idx = self.layer_idx + moe_config.pool = self.cpu_infer.backend_ + moe_config.max_len = self.chunked_prefill_size + + # Enable save mode for online quantization + moe_config.save = True + moe_config.load = False + + # Set weight pointers + moe_config.gate_proj = self.gate_proj.data_ptr() + moe_config.up_proj = self.up_proj.data_ptr() + moe_config.down_proj = self.down_proj.data_ptr() + + # Set output path for quantized weights + moe_config.path = self.weight_path + + # Create MoE module based on method + if self.method == "MOE_INT4": + self.moe = Int4_KERNEL_MOE(moe_config) + elif self.method == "MOE_INT8": + self.moe = Int8_KERNEL_MOE(moe_config) + else: + raise NotImplementedError(f"Unsupported MoE method: {self.method}") + + # Submit quantization and save task + self.cpu_infer.submit(self.moe.load_weights_task(physical_to_logical_map_cpu.data_ptr())) + self.cpu_infer.sync() + + def load_weights(self, physical_to_logical_map_cpu: torch.Tensor): + """ + Load weights for this layer and initialize the MoE module. 
+ + Args: + physical_to_logical_map_cpu: Mapping from physical to logical expert IDs + """ + gate_ptr = 0 + up_ptr = 0 + down_ptr = 0 + + gate_ptrs = [] + up_ptrs = [] + down_ptrs = [] + + gate_scale_ptrs = [] + up_scale_ptrs = [] + down_scale_ptrs = [] + + if self.load_merged_weight: + base_key = f"blk.{self.layer_idx}" + w = self.safetensor_loader.load_experts(base_key) + + self.gate_weights = w["gate"] + self.up_weights = w["up"] + self.down_weights = w["down"] + self.gate_scales = w["gate_scale"] + self.up_scales = w["up_scale"] + self.down_scales = w["down_scale"] + + # Get pointers to weight arrays + gate_ptrs = [ + [ + ctypes.addressof(ctypes.cast(et.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents) + for et in numa_array + ] + for numa_array in self.gate_weights + ] + + up_ptrs = [ + [ + ctypes.addressof(ctypes.cast(et.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents) + for et in numa_array + ] + for numa_array in self.up_weights + ] + + down_ptrs = [ + [ + ctypes.addressof(ctypes.cast(et.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents) + for et in numa_array + ] + for numa_array in self.down_weights + ] + + gate_scale_ptrs = [ + [ + ctypes.addressof(ctypes.cast(et.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents) + for et in numa_array + ] + for numa_array in self.gate_scales + ] + + up_scale_ptrs = [ + [ + ctypes.addressof(ctypes.cast(et.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents) + for et in numa_array + ] + for numa_array in self.up_scales + ] + + down_scale_ptrs = [ + [ + ctypes.addressof(ctypes.cast(et.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents) + for et in numa_array + ] + for numa_array in self.down_scales + ] + + # Configure MoE + moe_config = MOEConfig( + self.num_experts, + self.num_experts_per_tok, + self.hidden_size, + self.moe_intermediate_size, + self.num_gpu_experts, + ) + moe_config.layer_idx = self.layer_idx + moe_config.pool = self.cpu_infer.backend_ + moe_config.max_len = self.chunked_prefill_size + + moe_config.gate_proj = gate_ptr + moe_config.up_proj = up_ptr + moe_config.down_proj = down_ptr + moe_config.gate_projs = gate_ptrs + moe_config.up_projs = up_ptrs + moe_config.down_projs = down_ptrs + moe_config.gate_scales = gate_scale_ptrs + moe_config.up_scales = up_scale_ptrs + moe_config.down_scales = down_scale_ptrs + + if self.cpu_save: + moe_config.save = True + moe_config.load = False + base_key = f"model.layers.{self.layer_idx}" + w = self.safetensor_loader.load_experts(base_key) + + self.gate_proj = torch.cat(w["gate_weight"], dim=0).contiguous() + self.up_proj = torch.cat(w["up_weight"], dim=0).contiguous() + self.down_proj = torch.cat(w["down_weight"], dim=0).contiguous() + + moe_config.gate_proj = self.gate_proj.data_ptr() + moe_config.up_proj = self.up_proj.data_ptr() + moe_config.down_proj = self.down_proj.data_ptr() + else: + moe_config.load = True + + if not self.load_merged_weight: + moe_config.path = self.weight_path + + # Create MoE module based on moe method + if self.method == "MOE_INT4": + self.moe = Int4_KERNEL_MOE(moe_config) + elif self.method == "MOE_INT8": + self.moe = Int8_KERNEL_MOE(moe_config) + else: + raise NotImplementedError(f"Unsupported MoE method: {self.method}") + + # Load weights + self.cpu_infer.submit(self.moe.load_weights_task(physical_to_logical_map_cpu.data_ptr())) + self.cpu_infer.sync() + + # Clean up temporary weight storage if using merged weights + if self.load_merged_weight: + del self.gate_weights + del self.up_weights + del self.down_weights + del 
self.gate_scales + del self.up_scales + del self.down_scales diff --git a/kt-kernel/scripts/convert_cpu_weights.py b/kt-kernel/scripts/convert_cpu_weights.py index 14d11c37..bb217b4c 100644 --- a/kt-kernel/scripts/convert_cpu_weights.py +++ b/kt-kernel/scripts/convert_cpu_weights.py @@ -615,6 +615,8 @@ class OnlineQuantConverter(ConverterBase): quant_to_amx_map = { "int4": "INT4", "int8": "INT8", + "moe_int4": "MOE_INT4", + "moe_int8": "MOE_INT8", } amx_method = quant_to_amx_map.get(self.quant_method, "INT4") @@ -622,6 +624,7 @@ class OnlineQuantConverter(ConverterBase): for numa_idx in range(self.threadpool_count): numa_folder = os.path.join(layer_path, f"_numa_{numa_idx}") if not os.path.exists(numa_folder): + print(f" Warning: NUMA folder not found: {numa_folder}, skipping...") continue # Iterate through all experts @@ -755,6 +758,8 @@ class OnlineQuantConverter(ConverterBase): quant_to_amx_map = { "int4": "AMXINT4", "int8": "AMXINT8", + "moe_int4": "MOE_INT4", + "moe_int8": "MOE_INT8", } amx_method = quant_to_amx_map.get(self.quant_method, "AMXINT4") @@ -826,7 +831,7 @@ def main(): parser.add_argument("--output", "-o", required=True, help="Output directory for converted safetensors") parser.add_argument( "--quant-method", - choices=["int4", "int8", "awq"], + choices=["int4", "int8", "awq", "moe_int4", "moe_int8"], default="int4", help="Quantization method for output (default: int4)", ) @@ -890,7 +895,7 @@ def main(): input_type=None, merge_to_safetensor=merge_to_safetensor, ) - elif quant_method in ["int4", "int8"] and args.input_type in ["fp8", "fp16", "bf16"]: + elif quant_method in ["int4", "int8", "moe_int4", "moe_int8"] and args.input_type in ["fp8", "fp16", "bf16"]: # Use OnlineQuantConverter for both INT4 and INT8 quantization converter = OnlineQuantConverter( args.input_path, diff --git a/kt-kernel/scripts/convert_gpu_weights.py b/kt-kernel/scripts/convert_gpu_weights.py index 6c9bfa89..fc695a10 100644 --- a/kt-kernel/scripts/convert_gpu_weights.py +++ b/kt-kernel/scripts/convert_gpu_weights.py @@ -34,63 +34,42 @@ from datasets import load_dataset def parse_args(): parser = argparse.ArgumentParser(description="Quantize MoE models with selective quantization") - + # Required arguments - parser.add_argument( - "--model_id", - type=str, - required=True, - help="Path to the input model directory" - ) - parser.add_argument( - "--output_dir", - type=str, - required=True, - help="Path to save the quantized model" - ) - + parser.add_argument("--model_id", type=str, required=True, help="Path to the input model directory") + parser.add_argument("--output_dir", type=str, required=True, help="Path to save the quantized model") + # Optional arguments parser.add_argument( "--quant_type", type=str, choices=["W4A16", "W8A16"], default="W8A16", - help="Quantization type: W4A16 (GPTQ4) or W8A16 (GPTQ8). Default: W8A16" + help="Quantization type: W4A16 (GPTQ4) or W8A16 (GPTQ8). Default: W8A16", ) parser.add_argument( - "--num_calibration_samples", - type=int, - default=512, - help="Number of calibration samples. Default: 512" + "--num_calibration_samples", type=int, default=512, help="Number of calibration samples. Default: 512" ) parser.add_argument( - "--max_sequence_length", - type=int, - default=2048, - help="Maximum sequence length for calibration. Default: 2048" + "--max_sequence_length", type=int, default=2048, help="Maximum sequence length for calibration. 
Default: 2048" ) parser.add_argument( "--dampening_frac", type=float, default=0.1, - help="Dampening fraction to mitigate quantization noise. Default: 0.1" + help="Dampening fraction to mitigate quantization noise. Default: 0.1", ) parser.add_argument( "--dataset", type=str, default="HuggingFaceH4/ultrachat_200k", - help="Dataset for calibration. Default: HuggingFaceH4/ultrachat_200k" + help="Dataset for calibration. Default: HuggingFaceH4/ultrachat_200k", ) parser.add_argument( - "--dataset_split", - type=str, - default="train_sft", - help="Dataset split to use. Default: train_sft" + "--dataset_split", type=str, default="train_sft", help="Dataset split to use. Default: train_sft" ) parser.add_argument( - "--force_cpu", - action="store_true", - help="Force all computations to CPU (sets CUDA_VISIBLE_DEVICES='')" + "--force_cpu", action="store_true", help="Force all computations to CPU (sets CUDA_VISIBLE_DEVICES='')" ) parser.add_argument( "--ignore_patterns", @@ -103,44 +82,37 @@ def parse_args(): r"re:.*\.shared_expert\..*$", r"re:.*\.shared_experts\..*$", r"re:.*\.mlp\.shared_expert_gate$", - r"re:.*\.linear_attn\..*$" + r"re:.*\.linear_attn\..*$", ], - help="Regex patterns for layers to ignore during quantization" + help="Regex patterns for layers to ignore during quantization", ) parser.add_argument( "--torch_dtype", type=str, choices=["bfloat16", "float16", "float32"], default="bfloat16", - help="PyTorch dtype for model loading. Default: bfloat16" + help="PyTorch dtype for model loading. Default: bfloat16", ) parser.add_argument( - "--trust_remote_code", - action="store_true", - help="Allow loading of remote code (required for some models)" - ) - parser.add_argument( - "--random_seed", - type=int, - default=42, - help="Random seed for dataset shuffling. Default: 42" + "--trust_remote_code", action="store_true", help="Allow loading of remote code (required for some models)" ) + parser.add_argument("--random_seed", type=int, default=42, help="Random seed for dataset shuffling. Default: 42") parser.add_argument( "--max_gpu_memory", type=str, default=None, help="Maximum GPU memory for model weights per device (e.g., '40GiB'). " - "GPTQ quantization requires additional GPU memory for Hessian matrix computation, " - "so reserve 40-50%% of total VRAM. For example, use '40GiB' on 80GB GPUs. " - "Remaining layers will be offloaded to CPU. Default: use all available" + "GPTQ quantization requires additional GPU memory for Hessian matrix computation, " + "so reserve 40-50%% of total VRAM. For example, use '40GiB' on 80GB GPUs. " + "Remaining layers will be offloaded to CPU. Default: use all available", ) parser.add_argument( "--max_cpu_memory", type=str, default=None, - help="Maximum CPU memory to use (e.g., '100GiB'). Default: use all available" + help="Maximum CPU memory to use (e.g., '100GiB'). 
Default: use all available", ) - + return parser.parse_args() @@ -167,11 +139,7 @@ def get_torch_dtype(dtype_str): Returns: torch.dtype: Corresponding PyTorch dtype """ - dtype_map = { - "bfloat16": torch.bfloat16, - "float16": torch.float16, - "float32": torch.float32 - } + dtype_map = {"bfloat16": torch.bfloat16, "float16": torch.float16, "float32": torch.float32} return dtype_map[dtype_str] @@ -191,18 +159,18 @@ def check_dense_layers_and_update_ignore(model_id, ignore_patterns, trust_remote Updated ignore_patterns list with dense layer patterns added """ print("🔍 Checking model configuration for dense layers...") - + try: # Load model configuration config = AutoConfig.from_pretrained(model_id, trust_remote_code=trust_remote_code) - + # Check if the model has first_k_dense_replace parameter - first_k_dense_replace = getattr(config, 'first_k_dense_replace', None) - + first_k_dense_replace = getattr(config, "first_k_dense_replace", None) + if first_k_dense_replace is not None and first_k_dense_replace > 0: print(f"✅ Found dense layers configuration: first_k_dense_replace = {first_k_dense_replace}") print(f" Adding first {first_k_dense_replace} layers to ignore list...") - + # Create regex pattern for dense layers (layers 0 to first_k_dense_replace-1) if first_k_dense_replace == 1: dense_pattern = r"re:model\.layers\.0\.mlp\..*$" @@ -210,18 +178,18 @@ def check_dense_layers_and_update_ignore(model_id, ignore_patterns, trust_remote # For multiple layers, use range pattern layer_range = f"[0-{first_k_dense_replace-1}]" dense_pattern = f"re:model\\.layers\\.{layer_range}\\.mlp\\..*$" - + # Add the dense layer pattern to ignore list updated_ignore_patterns = ignore_patterns + [dense_pattern] - + print(f" Dense layer pattern added: {dense_pattern}") print(f" This will ignore MLP components in layers 0-{first_k_dense_replace-1}") - + return updated_ignore_patterns else: print("ℹ️ No dense layers detected (first_k_dense_replace not found or is 0)") return ignore_patterns - + except Exception as e: print(f"⚠️ Warning: Could not check model config for dense layers: {e}") print(" Proceeding with original ignore patterns...") @@ -261,11 +229,7 @@ def load_and_prepare_dataset(dataset_name, dataset_split, num_samples, max_lengt # Tokenize the data def tokenize(sample): return tokenizer( - sample["text"], - padding=False, - max_length=max_length, - truncation=True, - add_special_tokens=False + sample["text"], padding=False, max_length=max_length, truncation=True, add_special_tokens=False ) ds = ds.map(tokenize, remove_columns=ds.column_names) @@ -306,9 +270,7 @@ def main(): # 0) Check for dense layers and update ignore patterns # Dense layers in the first few layers should not be quantized updated_ignore_patterns = check_dense_layers_and_update_ignore( - args.model_id, - args.ignore_patterns, - args.trust_remote_code + args.model_id, args.ignore_patterns, args.trust_remote_code ) # -------------------------------------------------------------------- @@ -320,9 +282,7 @@ def main(): print("🔍 Building CPU-only device map...") with init_empty_weights(): dummy = AutoModelForCausalLM.from_pretrained( - args.model_id, - torch_dtype=torch_dtype, - trust_remote_code=args.trust_remote_code + args.model_id, torch_dtype=torch_dtype, trust_remote_code=args.trust_remote_code ) device_map = {name: "cpu" for name, _ in dummy.named_modules() if name} del dummy @@ -330,9 +290,7 @@ def main(): print("🔍 Inferring device map...") with init_empty_weights(): dummy = AutoModelForCausalLM.from_pretrained( - args.model_id, - 
torch_dtype=torch_dtype, - trust_remote_code=args.trust_remote_code + args.model_id, torch_dtype=torch_dtype, trust_remote_code=args.trust_remote_code ) # Build max_memory dict if specified max_memory = None @@ -357,9 +315,7 @@ def main(): print(f" CPU memory limit: 1000GiB (default, to prevent disk offloading)") device_map = infer_auto_device_map( - dummy, - no_split_module_classes=dummy._no_split_modules, - max_memory=max_memory + dummy, no_split_module_classes=dummy._no_split_modules, max_memory=max_memory ) # Check if disk offloading was triggered (not supported by llmcompressor) @@ -371,8 +327,10 @@ def main(): print(" 1. Increase --max_gpu_memory to use more GPU memory") print(" 2. Add --max_cpu_memory with higher value (e.g., '200GiB')") print(" 3. Ensure your machine has enough GPU + CPU memory") - raise RuntimeError("Disk offloading is not supported by llmcompressor. " - "Please ensure you have enough GPU + CPU memory.") + raise RuntimeError( + "Disk offloading is not supported by llmcompressor. " + "Please ensure you have enough GPU + CPU memory." + ) del dummy # -------------------------------------------------------------------- @@ -409,7 +367,7 @@ def main(): args.num_calibration_samples, args.max_sequence_length, tokenizer, - args.random_seed + args.random_seed, ) # -------------------------------------------------------------------- @@ -447,4 +405,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/kt-kernel/setup.py b/kt-kernel/setup.py index 3860f35c..f2bf0dd9 100644 --- a/kt-kernel/setup.py +++ b/kt-kernel/setup.py @@ -21,6 +21,7 @@ Environment knobs (export before running pip install .): CPUINFER_ENABLE_AMD=OFF ON/OFF -> -DKTRANSFORMERS_CPU_MOE_AMD CPUINFER_ENABLE_KML=OFF ON/OFF -> -DKTRANSFORMERS_CPU_USE_KML CPUINFER_ENABLE_AVX512=OFF ON/OFF -> -DKTRANSFORMERS_CPU_USE_AMX_AVX512 + CPUINFER_BLIS_ROOT=/path/to/blis Forward to -DBLIS_ROOT CPUINFER_ENABLE_LTO=ON ON/OFF -> -DCPUINFER_ENABLE_LTO (your added option) @@ -28,6 +29,7 @@ Environment knobs (export before running pip install .): CPUINFER_LTO_MODE=auto Forward to -DCPUINFER_LTO_MODE CPUINFER_NATIVE=ON (override LLAMA_NATIVE) + GPU backends (if ever added later, keep placeholders): CPUINFER_USE_CUDA=0/1 -DKTRANSFORMERS_USE_CUDA CPUINFER_USE_ROCM=0/1 -DKTRANSFORMERS_USE_ROCM @@ -51,6 +53,43 @@ from setuptools import setup, Extension from setuptools.command.build_ext import build_ext import shutil +# ------------------------- +# Env parsing helpers +# ------------------------- +def _env_get_bool(name: str, default: bool | None = None) -> bool | None: + v = os.environ.get(name) + if v is None: + return default + val = v.strip().lower() + if val in ("1", "on", "true", "yes", "y", "enable", "enabled"): + return True + if val in ("0", "off", "false", "no", "n", "disable", "disabled"): + return False + return default + + +def _cmake_onoff(flag: bool) -> str: + return "ON" if flag else "OFF" + + +def _forward_bool_env(cmake_args: list[str], env_name: str, cmake_flag: str) -> bool: + """If env exists, forward it to CMake as -D=ON/OFF and return True; else return False.""" + b = _env_get_bool(env_name, None) + if b is None: + return False + cmake_args.append(f"-D{cmake_flag}={_cmake_onoff(b)}") + print(f"-- Forward {env_name} -> -D{cmake_flag}={_cmake_onoff(b)}") + return True + + +def _forward_str_env(cmake_args: list[str], env_name: str, cmake_flag: str) -> bool: + v = os.environ.get(env_name) + if not v: + return False + cmake_args.append(f"-D{cmake_flag}={v}") + print(f"-- 
Forward {env_name} -> -D{cmake_flag}={v}") + return True + ################################################################################ # Helpers ################################################################################ @@ -204,7 +243,34 @@ class CMakeBuild(build_ext): return True return False - if os.environ.get("CPUINFER_USE_CUDA") is None: + # Locate nvcc executable (without forcing user to set -DCMAKE_CUDA_COMPILER) + def find_nvcc_path() -> str | None: + cuda_home = os.environ.get("CUDA_HOME") + if cuda_home: + cand = Path(cuda_home) / "bin" / "nvcc" + if cand.exists(): + return str(cand) + which_nvcc = shutil.which("nvcc") + if which_nvcc: + return which_nvcc + # Common fallbacks (ordered by preference) + for cand in [ + "/usr/local/cuda-12.6/bin/nvcc", + "/usr/local/cuda/bin/nvcc", + "/usr/bin/nvcc", + "/usr/lib/nvidia-cuda-toolkit/bin/nvcc", + ]: + if Path(cand).exists(): + return cand + return None + + # Note: We no longer set CMAKE_CUDA_ARCHITECTURES by default. + # If users want to specify CUDA archs, they can set env CPUINFER_CUDA_ARCHS + # (e.g. "89" or "86;89") or pass it via CMAKE_ARGS. + auto_moe_kernel_ = False + # Normalize CPUINFER_USE_CUDA: if unset, auto-detect; otherwise respect truthy/falsey values + cuda_env = _env_get_bool("CPUINFER_USE_CUDA", None) + if cuda_env is None: auto_cuda = detect_cuda_toolkit() os.environ["CPUINFER_USE_CUDA"] = "1" if auto_cuda else "0" print(f"-- CPUINFER_USE_CUDA not set; auto-detected CUDA toolkit: {'YES' if auto_cuda else 'NO'}") @@ -228,56 +294,87 @@ class CMakeBuild(build_ext): print(f"Detected CPU info: {d}") # Vendor / feature specific toggles - # Enable AMD MoE kernel on AMD by default unless user explicitly set CPUINFER_ENABLE_AMD - # temporarily disabled this opt, use llamafile backend for now - # if d.get("vendor") == "amd" and os.environ.get("CPUINFER_ENABLE_AMD") is None: - # cmake_args.append("-DKTRANSFORMERS_CPU_MOE_AMD=ON") - # print("-- Detected AMD CPU; enabling AMD MoE kernel (-DKTRANSFORMERS_CPU_MOE_AMD=ON)") + # AMD MoE: explicit env overrides; otherwise default ON on AMD CPU + if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_AMD", "KTRANSFORMERS_CPU_MOE_AMD"): + if d.get("vendor") == "amd": + auto_moe_kernel_ = True + cmake_args.append("-DKTRANSFORMERS_CPU_MOE_AMD=ON") + print("-- Detected AMD CPU; enabling AMD MoE kernel (-DKTRANSFORMERS_CPU_MOE_AMD=ON)") + _forward_str_env(cmake_args, "CPUINFER_BLIS_ROOT", "BLIS_ROOT") - # On ARM, enable KML by default if not explicitly toggled - if d.get("vendor") == "arm" and os.environ.get("CPUINFER_ENABLE_KML") is None: - cmake_args.append("-DKTRANSFORMERS_CPU_USE_KML=ON") - print("-- Detected ARM CPU; enabling KML (-DKTRANSFORMERS_CPU_USE_KML=ON)") + # KML: explicit env overrides; otherwise default ON on ARM + if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_KML", "KTRANSFORMERS_CPU_USE_KML"): + if d.get("vendor") == "arm": + auto_moe_kernel_ = True + cmake_args.append("-DKTRANSFORMERS_CPU_USE_KML=ON") + print("-- Detected ARM CPU; enabling KML (-DKTRANSFORMERS_CPU_USE_KML=ON)") - # If AMX or AVX512 present, enable umbrella unless overridden; enable AMX specifically when present - if "AMX" in d["features"]: - if os.environ.get("CPUINFER_ENABLE_AMX") is None: + # AMX: explicit env overrides; else enable if detected + if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_AMX", "KTRANSFORMERS_CPU_USE_AMX"): + if "AMX" in d["features"]: cmake_args.append("-DKTRANSFORMERS_CPU_USE_AMX=ON") print("-- AMX support detected; enabling 
(-DKTRANSFORMERS_CPU_USE_AMX=ON)") - if ("AMX" in d["features"] or "AVX512" in d["features"]) and os.environ.get( - "CPUINFER_ENABLE_AVX512" - ) is None: - cmake_args.append("-DKTRANSFORMERS_CPU_USE_AMX_AVX512=ON") - print("-- Enabling AMX/AVX512 umbrella (-DKTRANSFORMERS_CPU_USE_AMX_AVX512=ON)") + # AVX512 umbrella: explicit env overrides; else enable if AMX or AVX512 detected + if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_AVX512", "KTRANSFORMERS_CPU_USE_AMX_AVX512"): + if "AMX" in d["features"] or "AVX512" in d["features"]: + cmake_args.append("-DKTRANSFORMERS_CPU_USE_AMX_AVX512=ON") + print("-- Enabling AMX/AVX512 umbrella (-DKTRANSFORMERS_CPU_USE_AMX_AVX512=ON)") + + # Auto-enable MOE kernel only when env explicitly turns on AMD or KML backend + # (Do not enable purely on vendor auto-detection to avoid surprise behavior.) + amd_env = _env_get_bool("CPUINFER_ENABLE_AMD", None) + kml_env = _env_get_bool("CPUINFER_ENABLE_KML", None) + if amd_env or kml_env: + auto_moe_kernel_ = True + already_set = any("KTRANSFORMERS_CPU_MOE_KERNEL" in a for a in cmake_args) + if not already_set and auto_moe_kernel_: + cmake_args.append("-DKTRANSFORMERS_CPU_MOE_KERNEL=ON") + print("-- Auto-enabling MOE kernel (-DKTRANSFORMERS_CPU_MOE_KERNEL=ON) because CPUINFER_ENABLE_AMD or CPUINFER_ENABLE_KML is ON") # Friendly summary print( f"-- CPU detection: vendor={d.get('vendor')} arch={d.get('arch')} features={sorted(list(d.get('features', [])))}" ) - # Optional AMX / MLA toggles (explicit env overrides auto detection above) - if os.environ.get("CPUINFER_ENABLE_AMX"): - cmake_args.append(f"-DKTRANSFORMERS_CPU_USE_AMX={os.environ['CPUINFER_ENABLE_AMX']}") - if os.environ.get("CPUINFER_ENABLE_KML"): - cmake_args.append(f"-DKTRANSFORMERS_CPU_USE_KML={os.environ['CPUINFER_ENABLE_KML']}") - if os.environ.get("CPUINFER_ENABLE_MLA"): - cmake_args.append(f"-DKTRANSFORMERS_CPU_MLA={os.environ['CPUINFER_ENABLE_MLA']}") + # MLA toggle (string/boolean allowed) + if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_MLA", "KTRANSFORMERS_CPU_MLA"): + _forward_str_env(cmake_args, "CPUINFER_ENABLE_MLA", "KTRANSFORMERS_CPU_MLA") - # LTO toggles if user added them in CMakeLists - if os.environ.get("CPUINFER_ENABLE_LTO"): - cmake_args.append(f"-DCPUINFER_ENABLE_LTO={os.environ['CPUINFER_ENABLE_LTO']}") - if os.environ.get("CPUINFER_LTO_JOBS"): - cmake_args.append(f"-DCPUINFER_LTO_JOBS={os.environ['CPUINFER_LTO_JOBS']}") - if os.environ.get("CPUINFER_LTO_MODE"): - cmake_args.append(f"-DCPUINFER_LTO_MODE={os.environ['CPUINFER_LTO_MODE']}") + # LTO toggles + _forward_bool_env(cmake_args, "CPUINFER_ENABLE_LTO", "CPUINFER_ENABLE_LTO") + _forward_str_env(cmake_args, "CPUINFER_LTO_JOBS", "CPUINFER_LTO_JOBS") + _forward_str_env(cmake_args, "CPUINFER_LTO_MODE", "CPUINFER_LTO_MODE") # GPU backends (mutually exclusive expected) - if os.environ.get("CPUINFER_USE_CUDA") == "1": + if _env_get_bool("CPUINFER_USE_CUDA", False): cmake_args.append("-DKTRANSFORMERS_USE_CUDA=ON") print("-- Enabling CUDA backend (-DKTRANSFORMERS_USE_CUDA=ON)") - if os.environ.get("CPUINFER_USE_ROCM") == "1": + # Inject nvcc compiler path automatically unless user already specified one. 
+ user_specified_compiler = any("CMAKE_CUDA_COMPILER" in a for a in cmake_args) + if not user_specified_compiler: + extra_env = os.environ.get("CMAKE_ARGS", "") + if "CMAKE_CUDA_COMPILER" in extra_env: + user_specified_compiler = True + if not user_specified_compiler: + nvcc_path = find_nvcc_path() + if nvcc_path: + cmake_args.append(f"-DCMAKE_CUDA_COMPILER={nvcc_path}") + print(f"-- Auto-detected nvcc: {nvcc_path} (adding -DCMAKE_CUDA_COMPILER)") + else: + print("-- Warning: nvcc not found via CUDA_HOME/PATH/common prefixes; CUDA configure may fail.") + # Optional host compiler for nvcc if user set CUDAHOSTCXX + if os.environ.get("CUDAHOSTCXX"): + hostcxx = os.environ["CUDAHOSTCXX"] + cmake_args.append(f"-DCMAKE_CUDA_HOST_COMPILER={hostcxx}") + print(f"-- Using CUDA host compiler from CUDAHOSTCXX: {hostcxx}") + # Respect user-provided architectures only (no default auto-detection). + archs_env = os.environ.get("CPUINFER_CUDA_ARCHS", "").strip() + if archs_env and not any("CMAKE_CUDA_ARCHITECTURES" in a for a in cmake_args): + cmake_args.append(f"-DCMAKE_CUDA_ARCHITECTURES={archs_env}") + print(f"-- Set CUDA architectures from CPUINFER_CUDA_ARCHS: {archs_env}") + if _env_get_bool("CPUINFER_USE_ROCM", False): cmake_args.append("-DKTRANSFORMERS_USE_ROCM=ON") - if os.environ.get("CPUINFER_USE_MUSA") == "1": + if _env_get_bool("CPUINFER_USE_MUSA", False): cmake_args.append("-DKTRANSFORMERS_USE_MUSA=ON") # Respect user extra CMAKE_ARGS (space separated) @@ -286,7 +383,7 @@ class CMakeBuild(build_ext): cmake_args += [a for a in extra.split() if a] # Force rebuild? (delete cache) - if os.environ.get("CPUINFER_FORCE_REBUILD") == "1": + if _env_get_bool("CPUINFER_FORCE_REBUILD", True): cache = build_temp / "CMakeCache.txt" if cache.exists(): cache.unlink()
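
Appended note (not part of the patch): a minimal sketch of how the reworked environment knobs in `setup.py` are expected to be driven for an AMD build. The flag mappings follow the env-knob table and the new `_env_get_bool`/`_forward_bool_env` helpers above; the BLIS path and the idea of invoking pip through `subprocess` are illustrative assumptions, not something this PR prescribes.

```python
# Sketch only: drive the new setup.py env knobs for an AMD + CUDA build.
# CPUINFER_ENABLE_AMD=ON forwards -DKTRANSFORMERS_CPU_MOE_AMD=ON and, per the new
# auto-enable logic, also turns on -DKTRANSFORMERS_CPU_MOE_KERNEL=ON.
# Accepted truthy spellings (see _env_get_bool): 1, on, true, yes, y, enable, enabled.
import os
import subprocess
import sys

env = dict(os.environ)
env.update(
    {
        "CPUINFER_ENABLE_AMD": "ON",        # -> -DKTRANSFORMERS_CPU_MOE_AMD=ON
        "CPUINFER_ENABLE_AVX512": "OFF",    # -> -DKTRANSFORMERS_CPU_USE_AMX_AVX512=OFF
        "CPUINFER_USE_CUDA": "1",           # -> -DKTRANSFORMERS_USE_CUDA=ON (nvcc auto-detected)
        "CPUINFER_BLIS_ROOT": "/opt/blis",  # placeholder path -> -DBLIS_ROOT=/opt/blis
    }
)
subprocess.check_call([sys.executable, "-m", "pip", "install", "."], cwd="kt-kernel", env=env)
```

For people who configure CMake directly, the new `amd` entry in CMakePresets.json captures roughly the same flag set declaratively (`cmake --preset amd`).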
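A second hedged sketch, this time for the new MOE_INT8 backend: the constructor arguments mirror `GeneralMoEWrapper.__init__` as added in `python/utils/moe_kernel.py`, but the import path, model dimensions, thread counts, and weight path are placeholders. In normal use the backend would be selected through `KTMoEWrapper` with `method="MOE_INT8"` rather than constructed directly.

```python
# Sketch only: exercising the MOE_INT8 backend added in this PR directly.
# Dimensions, paths, and thread counts below are illustrative placeholders.
import torch

from kt_kernel.utils.moe_kernel import GeneralMoEWrapper  # import path is an assumption

wrapper = GeneralMoEWrapper(
    layer_idx=0,
    num_experts=128,
    num_experts_per_tok=8,
    hidden_size=2048,
    moe_intermediate_size=768,
    num_gpu_experts=0,
    cpuinfer_threads=32,
    threadpool_count=2,
    weight_path="/path/to/cpu-weights",  # produced by convert_cpu_weights.py --quant-method moe_int8
    chunked_prefill_size=4096,
    method="MOE_INT8",
)

# Identity physical->logical expert map; the C++ side reads it as uint64_t,
# so a contiguous int64 tensor with non-negative values has the right layout.
physical_to_logical_map = torch.arange(128, dtype=torch.int64)
wrapper.load_weights(physical_to_logical_map)
```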