diff --git a/kt-kernel/CMakeLists.txt b/kt-kernel/CMakeLists.txt index 438c43d3..1192b4cf 100644 --- a/kt-kernel/CMakeLists.txt +++ b/kt-kernel/CMakeLists.txt @@ -495,7 +495,7 @@ if(NOT DEFINED CLANG_FORMAT_BIN) ) endif() if(NOT CLANG_FORMAT_BIN) - message(WARNING "clang-format not found. Please install clang-format (>=18) or pass -DCLANG_FORMAT_BIN=/full/path and reconfigure.") + message(WARNING "ONLY for developer: clang-format not found. Please install clang-format (>=18) or pass -DCLANG_FORMAT_BIN=/full/path and reconfigure.") else() execute_process( COMMAND ${CLANG_FORMAT_BIN} --version diff --git a/kt-kernel/CMakePresets.json b/kt-kernel/CMakePresets.json index 1c3b00b9..c8db5fc8 100644 --- a/kt-kernel/CMakePresets.json +++ b/kt-kernel/CMakePresets.json @@ -39,6 +39,20 @@ "KTRANSFORMERS_CPU_USE_AMX_AVX512": "ON", "KTRANSFORMERS_USE_CUDA": "ON" } + }, + { + "name": "amd", + "displayName": "amd_platform", + "description": "for amd platform", + "cacheVariables": { + "KTRANSFORMERS_CPU_USE_AMX": "OFF", + "LLAMA_AVX512": "OFF", + "LLAMA_AVX2": "ON", + "KTRANSFORMERS_CPU_USE_AMX_AVX512": "OFF", + "KTRANSFORMERS_USE_CUDA": "ON", + "KTRANSFORMERS_CPU_MOE_AMD": "ON", + "KTRANSFORMERS_CPU_MOE_KERNEL": "ON" + } } ] diff --git a/kt-kernel/README.md b/kt-kernel/README.md index 2e373e3b..2f4f6cfa 100644 --- a/kt-kernel/README.md +++ b/kt-kernel/README.md @@ -2,41 +2,32 @@ High-performance kernel operations for KTransformers, featuring CPU-optimized MoE inference with AMX, AVX, KML and blis (amd library) support. -- [KT-Kernel](#kt-kernel) - - [Note](#note) - - [Features](#features) - - [Installation](#installation) - - [Prerequisites](#prerequisites) - - [Quick Installation (Recommended)](#quick-installation-recommended) - - [Manual Configuration (Advanced)](#manual-configuration-advanced) - - [Verification](#verification) - - [Integration with SGLang](#integration-with-sglang) - - [Installation Steps](#installation-steps) - - [1. Install SGLang](#1-install-sglang) - - [2. Prepare Weights](#2-prepare-weights) - - [3. Launch SGLang Server](#3-launch-sglang-server) - - [Complete Example: Qwen3-30B-A3B](#complete-example-qwen3-30b-a3b) - - [Option A: AMX Backend (AMXINT8)](#option-a-amx-backend-amxint8) - - [Option B: LLAMAFILE Backend (GGUF)](#option-b-llamafile-backend-gguf) - - [KT-Kernel Parameters](#kt-kernel-parameters) - - [Direct Python API Usage](#direct-python-api-usage) - - [Advanced Options](#advanced-options) - - [Build Configuration](#build-configuration) - - [Manual Installation](#manual-installation) - - [1. Install System Dependencies](#1-install-system-dependencies) - - [2. Set Build Configuration](#2-set-build-configuration) - - [3. 
Build and Install](#3-build-and-install) - - [Error Troubleshooting](#error-troubleshooting) - - [CUDA Not Found](#cuda-not-found) - - [hwloc Not Found](#hwloc-not-found) - - [Weight Quantization](#weight-quantization) - - [Before Commit!](#before-commit) +- [Note](#note) +- [Features](#features) +- [Installation](#installation) + - [Prerequisites](#prerequisites) + - [Quick Installation (Recommended)](#quick-installation-recommended) + - [Manual Configuration (Advanced)](#manual-configuration-advanced) +- [Verification](#verification) +- [Integration with SGLang](#integration-with-sglang) + - [Installation Steps](#installation-steps) + - [Complete Example: Qwen3-30B-A3B](#complete-example-qwen3-30b-a3b) + - [KT-Kernel Parameters](#kt-kernel-parameters) +- [Direct Python API Usage](#direct-python-api-usage) + - [Advanced Options](#advanced-options) +- [Build Configuration](#build-configuration) + - [Manual Installation](#manual-installation) +- [Error Troubleshooting](#error-troubleshooting) + - [CUDA Not Found](#cuda-not-found) + - [hwloc Not Found](#hwloc-not-found) +- [Weight Quantization](#weight-quantization) +- [Before Commit!](#before-commit) ## Note **Current Support Status:** - ✅ **Intel CPUs with AMX**: Fully supported (using weights converted to INT4/INT8 format) - ✅ **Universal CPU (llamafile backend)**: Supported (using GGUF-format weights) -- ⚠️ **AMD CPUs with BLIS**: In progress, not yet fully integrated +- ✅ **AMD CPUs with BLIS**: Supported (for int8 prefill & decode) ## Features @@ -145,7 +136,7 @@ python scripts/convert_cpu_weights.py \ --input-path /path/to/model \ --input-type bf16 \ --output /path/to/cpu-weights \ - --quant-method int8 # or int4 + --quant-method int8 # or int4 or moe_int8 (for amd now) ``` - `--input-path`: Path to GPU-side original weights diff --git a/kt-kernel/README_zh.md b/kt-kernel/README_zh.md index a4a7cadc..95e781f2 100644 --- a/kt-kernel/README_zh.md +++ b/kt-kernel/README_zh.md @@ -2,42 +2,33 @@ 高性能 KTransformers 内核库,提供面向 CPU 的高效 MoE 推理内核,支持 AMX 和 AVX 等后端。 -- [KT-Kernel](#kt-kernel) - - [说明](#说明) - - [特性](#特性) - - [安装](#安装) - - [先决条件](#先决条件) - - [快速安装(推荐)](#快速安装推荐) - - [手动配置(进阶)](#手动配置进阶) - - [验证安装](#验证安装) - - [与 SGLang 集成](#与-sglang-集成) - - [安装步骤](#安装步骤) - - [1. 安装 SGLang](#1-安装-sglang) - - [2. 准备权重](#2-准备权重) - - [3. 启动 SGLang Server](#3-启动-sglang-server) - - [完整示例:Qwen3-30B-A3B](#完整示例qwen3-30b-a3b) - - [方案 A:AMX 后端(AMXINT8)](#方案-aamx-后端amxint8) - - [方案 B:LLAMAFILE 后端(GGUF)](#方案-bllamafile-后端gguf) - - [KT-Kernel 参数](#kt-kernel-参数) - - [直接使用 Python API](#直接使用-python-api) - - [高级选项](#高级选项) - - [构建配置](#构建配置) - - [手动安装](#手动安装) - - [1. 安装系统依赖](#1-安装系统依赖) - - [2. 配置构建参数](#2-配置构建参数) - - [3. 
构建并安装](#3-构建并安装) - - [错误排查](#错误排查) - - [找不到 CUDA](#找不到-cuda) - - [找不到 hwloc](#找不到-hwloc) - - [权重量化](#权重量化) - - [提交前必读](#提交前必读) +- [说明](#说明) +- [特性](#特性) +- [安装](#安装) + - [先决条件](#先决条件) + - [快速安装(推荐)](#快速安装推荐) + - [手动配置(进阶)](#手动配置进阶) +- [验证安装](#验证安装) +- [与 SGLang 集成](#与-sglang-集成) + - [安装步骤](#安装步骤) + - [完整示例:Qwen3-30B-A3B](#完整示例qwen3-30b-a3b) + - [KT-Kernel 参数](#kt-kernel-参数) +- [直接使用 Python API](#直接使用-python-api) + - [高级选项](#高级选项) +- [构建配置](#构建配置) + - [手动安装](#手动安装) +- [错误排查](#错误排查) + - [找不到 CUDA](#找不到-cuda) + - [找不到 hwloc](#找不到-hwloc) +- [权重量化](#权重量化) +- [提交前必读](#提交前必读) ## 说明 **当前支持状态:** - ✅ **带 AMX 的 Intel CPU**:已支持(基于转换为 INT4/INT8 格式的权重) - ✅ **通用 CPU(llamafile 后端)**:已支持(基于 GGUF 格式的权重) -- ⚠️ **带 BLIS 的 AMD CPU**:进行中,尚未完全集成 +- ✅ **带 BLIS 的 AMD CPU**:已支持(int8 的 prefill 和 decode) ## 特性 @@ -149,7 +140,7 @@ python scripts/convert_cpu_weights.py \ --input-path /path/to/model \ --input-type bf16 \ --output /path/to/cpu-weights \ - --quant-method int8 # 或 int4 + --quant-method int8 # 或 int4 或 moe_int8(用于 amd 的) ``` - `--input-path`:GPU 侧原始权重路径 diff --git a/kt-kernel/operators/amx/test/mmq-test.cpp b/kt-kernel/operators/amx/test/mmq-test.cpp index 2a35ae2b..c7dc43a8 100644 --- a/kt-kernel/operators/amx/test/mmq-test.cpp +++ b/kt-kernel/operators/amx/test/mmq-test.cpp @@ -2376,9 +2376,7 @@ bool ggml_compute_forward_mul_mat_use_amx(struct ggml_tensor* dst) { static thread_local bool is_first_time = true; if (is_first_time) { #pragma omp single - { - ggml_amx_init(); - } + { ggml_amx_init(); } // load tile config ggml_tile_config_init(); diff --git a/kt-kernel/operators/amx/test/mmq.cpp b/kt-kernel/operators/amx/test/mmq.cpp index 0f446cc6..8e9f296d 100644 --- a/kt-kernel/operators/amx/test/mmq.cpp +++ b/kt-kernel/operators/amx/test/mmq.cpp @@ -2372,9 +2372,7 @@ bool ggml_compute_forward_mul_mat_use_amx(struct ggml_tensor* dst) { static thread_local bool is_first_time = true; if (is_first_time) { #pragma omp single - { - ggml_amx_init(); - } + { ggml_amx_init(); } // load tile config ggml_tile_config_init(); diff --git a/kt-kernel/operators/llamafile/mla.hpp b/kt-kernel/operators/llamafile/mla.hpp index 7ac23242..03f70c4d 100644 --- a/kt-kernel/operators/llamafile/mla.hpp +++ b/kt-kernel/operators/llamafile/mla.hpp @@ -14,15 +14,15 @@ // #include // #include -// #define DIRECT_OR_POOL_BY(what, threshold, var, fn) \ -// do { \ -// if ((what) < (threshold)) { \ -// for (int i = 0; i < (var); i++) { \ -// (fn)(i); \ -// } \ -// } else { \ -// pool->do_work_stealing_job((var), nullptr, (fn), nullptr); \ -// } \ +// #define DIRECT_OR_POOL_BY(what, threshold, var, fn) \ +// do { \ +// if ((what) < (threshold)) { \ +// for (int i = 0; i < (var); i++) { \ +// (fn)(i); \ +// } \ +// } else { \ +// pool->do_work_stealing_job((var), nullptr, (fn), nullptr); \ +// } \ // } while (0) // #define VEC_DOT_TYPE(type) (ggml_internal_get_type_traits((ggml_type)(type)).vec_dot_type) @@ -31,19 +31,20 @@ // #define QUANT_OFFSET(ptr, type, n, n_elements) \ // (offset_pointer((ptr), (size_t)(n) * QUANT_BLCK_SIZE((n_elements), (type)))) -// #define LLAMAFILE_SGEMM_QUANT_FULL_MATMUL(m, n, k, a, a_type, b, b_col, c, c_col) \ -// do { \ -// llamafile_sgemm((m), (n), QUANT_BLCK_COUNT((k), (a_type)), (a), QUANT_BLCK_COUNT((k), (a_type)), \ -// QUANT_OFFSET((b), VEC_DOT_TYPE((a_type)), (b_col), (k)), \ -// QUANT_BLCK_COUNT((k), VEC_DOT_TYPE((a_type))), offset_pointer((c), (c_col) * (m) * sizeof(float)), \ -// (k), 0, 1, GGML_TASK_TYPE_COMPUTE, (a_type), VEC_DOT_TYPE((a_type)), GGML_TYPE_F32, \ -// GGML_PREC_DEFAULT); \ +// #define 
LLAMAFILE_SGEMM_QUANT_FULL_MATMUL(m, n, k, a, a_type, b, b_col, c, c_col) \ +// do { \ +// llamafile_sgemm((m), (n), QUANT_BLCK_COUNT((k), (a_type)), (a), QUANT_BLCK_COUNT((k), (a_type)), \ +// QUANT_OFFSET((b), VEC_DOT_TYPE((a_type)), (b_col), (k)), \ +// QUANT_BLCK_COUNT((k), VEC_DOT_TYPE((a_type))), offset_pointer((c), (c_col) * (m) * +// sizeof(float)), \ +// (k), 0, 1, GGML_TASK_TYPE_COMPUTE, (a_type), VEC_DOT_TYPE((a_type)), GGML_TYPE_F32, \ +// GGML_PREC_DEFAULT); \ // } while (0) -// #define LLAMAFILE_SGEMM_MATMUL_F32(m, n, k, a, lda, b, ldb, c, ldc) \ -// do { \ -// llamafile_sgemm((m), (n), (k), (a), (lda), (b), (ldb), (c), (ldc), 0, 1, GGML_TASK_TYPE_COMPUTE, GGML_TYPE_F32, \ -// GGML_TYPE_F32, GGML_TYPE_F32, GGML_PREC_DEFAULT); \ +// #define LLAMAFILE_SGEMM_MATMUL_F32(m, n, k, a, lda, b, ldb, c, ldc) \ +// do { \ +// llamafile_sgemm((m), (n), (k), (a), (lda), (b), (ldb), (c), (ldc), 0, 1, GGML_TASK_TYPE_COMPUTE, GGML_TYPE_F32, \ +// GGML_TYPE_F32, GGML_TYPE_F32, GGML_PREC_DEFAULT); \ // } while (0) // // bool decide_absorb(size_t a,int a_type,size_t b,int b_type,size_t c,int c_type,size_t d,int d_type){ diff --git a/kt-kernel/operators/moe_kernel/la/kernel.hpp b/kt-kernel/operators/moe_kernel/la/kernel.hpp index 34d55fc0..bc685e38 100644 --- a/kt-kernel/operators/moe_kernel/la/kernel.hpp +++ b/kt-kernel/operators/moe_kernel/la/kernel.hpp @@ -340,7 +340,7 @@ struct GemmKernelInt8 { static inline const int PACK_SIZE_M = 8; static inline const int PACK_SIZE_K = 32; - static std::string name() { return "INT8"; } + static std::string name() { return "MOE_INT8"; } static int recommended_nth(int n) { return (n + N_BLOCK - 1) / N_BLOCK; } // type_: d for decode, p for prefill static int recommended_nth_down(int n, char type_ = 'd') { @@ -833,7 +833,7 @@ struct GemmKernelInt4 { static inline const int PACK_SIZE_K = 32; static inline const int PACK_SIZE_M = 8; - static std::string name() { return "INT4"; } + static std::string name() { return "MOE_INT4"; } static int recommended_nth(int n) { return (n + N_BLOCK - 1) / N_BLOCK; } static int recommended_nth_down(int n, char type_ = 'd') { diff --git a/kt-kernel/operators/moe_kernel/moe.hpp b/kt-kernel/operators/moe_kernel/moe.hpp index c5d3acbc..b866739b 100644 --- a/kt-kernel/operators/moe_kernel/moe.hpp +++ b/kt-kernel/operators/moe_kernel/moe.hpp @@ -12,8 +12,8 @@ #include #include +#include "../../cpu_backend/shared_mem_buffer.h" #include "../common.hpp" -#include "../cpu_backend/shared_mem_buffer.h" #include "../moe-tp.hpp" #include "api/common.h" #include "api/mat_kernel.h" @@ -57,6 +57,9 @@ class MOE_KERNEL_TP std::vector> down_bb_; std::vector> down_bc_; + std::vector gate_up_owner_ptr_; + std::vector down_owner_ptr_; + inline void write_weights(std::filesystem::path prefix, std::string mat_class, char* bb, int expert_idx, size_t size, size_t scale_size) { // printf("expert %d, size %ld, scale size %ld\n", expert_idx, size, scale_size); @@ -182,6 +185,7 @@ class MOE_KERNEL_TP down_ba_.push_back(std::make_shared(config_.max_len, config_.intermediate_size, nullptr)); down_bc_.push_back(std::make_shared(config_.max_len, config_.hidden_size, nullptr)); void* gate_up_down_per_exp_ptr = std::aligned_alloc(64, gate_up_exp_size); + gate_up_owner_ptr_.push_back(gate_up_down_per_exp_ptr); gate_bb_.push_back(std::make_shared(config_.intermediate_size, config_.hidden_size, gate_up_down_per_exp_ptr, PACKED, 'u', PLAIN)); @@ -193,6 +197,7 @@ class MOE_KERNEL_TP void* down_bb_ptr = std::aligned_alloc( 64, 
T::BufferB::required_size(config_.hidden_size, config_.intermediate_size, PACKED, 'd', PLAIN)); + down_owner_ptr_.push_back(down_bb_ptr); down_bb_.push_back(std::make_shared(config_.hidden_size, config_.intermediate_size, down_bb_ptr, PACKED, 'd', PLAIN)); } @@ -220,27 +225,41 @@ class MOE_KERNEL_TP ~MOE_KERNEL_TP() { // printf(" Destroying KML_MOE_TP %lx\n", (intptr_t)(this)); + for (void* ptr : gate_up_owner_ptr_) { + std::free(ptr); + } + for (void* ptr : down_owner_ptr_) { + std::free(ptr); + } } void load_weights() { auto pool = config_.pool->get_subpool(tp_part_idx); const uint64_t* physical_to_logical_map = (const uint64_t*)config_.physical_to_logical_map; if (config_.gate_projs.size()) { + printf("load from safetensor"); pool->do_work_stealing_job( config_.expert_num, nullptr, [this, physical_to_logical_map](int expert_id) { uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_id); { size_t scale_size = config_.intermediate_size * sizeof(float); - size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size) - scale_size; + size_t whole_size_ = + T::BufferB::required_size(config_.intermediate_size, config_.hidden_size, PACKED, 'u', PLAIN); + size_t size = whole_size_ - scale_size; + void* dst_ = PLAIN ? gate_bb_[expert_id]->b : gate_bb_[expert_id]->b_pack[0]; - memcpy(gate_bb_[expert_id]->b, config_.gate_projs[tp_part_idx][logical_expert_id], size); + memcpy(dst_, config_.gate_projs[tp_part_idx][logical_expert_id], size); if constexpr (T::BufferB::SCALE) { memcpy(gate_bb_[expert_id]->d, config_.gate_scales[tp_part_idx][logical_expert_id], scale_size); } - memcpy(up_bb_[expert_id]->b, config_.up_projs[tp_part_idx][logical_expert_id], size); + whole_size_ = + T::BufferB::required_size(config_.intermediate_size, config_.hidden_size, PACKED, 'u', PLAIN); + size = whole_size_ - scale_size; + dst_ = PLAIN ? up_bb_[expert_id]->b : up_bb_[expert_id]->b_pack[0]; + memcpy(dst_, config_.up_projs[tp_part_idx][logical_expert_id], size); if constexpr (T::BufferB::SCALE) { memcpy(up_bb_[expert_id]->d, config_.up_scales[tp_part_idx][logical_expert_id], scale_size); @@ -249,9 +268,11 @@ class MOE_KERNEL_TP { size_t scale_size = config_.hidden_size * sizeof(float); - size_t size = T::BufferB::required_size(config_.hidden_size, config_.intermediate_size) - scale_size; - - memcpy(down_bb_[expert_id]->b, config_.down_projs[tp_part_idx][logical_expert_id], size); + size_t whole_size_ = + T::BufferB::required_size(config_.hidden_size, config_.intermediate_size, PACKED, 'd', PLAIN); + size_t size = whole_size_ - scale_size; + void* dst_ = PLAIN ? down_bb_[expert_id]->b : down_bb_[expert_id]->b_pack[0]; + memcpy(dst_, config_.down_projs[tp_part_idx][logical_expert_id], size); if constexpr (T::BufferB::SCALE) { memcpy(down_bb_[expert_id]->d, config_.down_scales[tp_part_idx][logical_expert_id], scale_size); @@ -269,21 +290,22 @@ class MOE_KERNEL_TP uint8_t mat_class = (task_id % (mat_type_all * mat_split)) / mat_split; uint8_t mat_split_idex = task_id % mat_split; uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx); + void* src_; if (mat_class == 0) { // the up matrix - size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size); + src_ = PLAIN ? 
up_bb_[expert_idx]->b : up_bb_[expert_idx]->b_pack[0]; + size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size, PACKED, 'u', PLAIN); size_t scale_size = config_.intermediate_size * sizeof(float); - read_weights(prefix, "_up_", (char*)up_bb_[expert_idx]->b, logical_expert_id, size, scale_size, mat_split, - mat_split_idex); + read_weights(prefix, "_up_", (char*)src_, logical_expert_id, size, scale_size, mat_split, mat_split_idex); } else if (mat_class == 1) { - size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size); + void* src_ = PLAIN ? gate_bb_[expert_idx]->b : gate_bb_[expert_idx]->b_pack[0]; + size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size, PACKED, 'u', PLAIN); size_t scale_size = config_.intermediate_size * sizeof(float); - read_weights(prefix, "_gate_", (char*)gate_bb_[expert_idx]->b, logical_expert_id, size, scale_size, - mat_split, mat_split_idex); + read_weights(prefix, "_gate_", (char*)src_, logical_expert_id, size, scale_size, mat_split, mat_split_idex); } else { - size_t size = T::BufferB::required_size(config_.hidden_size, config_.intermediate_size); + void* src_ = PLAIN ? down_bb_[expert_idx]->b : down_bb_[expert_idx]->b_pack[0]; + size_t size = T::BufferB::required_size(config_.hidden_size, config_.intermediate_size, PACKED, 'd', PLAIN); size_t scale_size = config_.hidden_size * sizeof(float); - read_weights(prefix, "_down_", (char*)down_bb_[expert_idx]->b, logical_expert_id, size, scale_size, - mat_split, mat_split_idex); + read_weights(prefix, "_down_", (char*)src_, logical_expert_id, size, scale_size, mat_split, mat_split_idex); } } } @@ -342,17 +364,20 @@ class MOE_KERNEL_TP expert_idx = expert_map(physical_to_logical_map, expert_idx); uint8_t mat_class = task_id % mat_type_all; if (mat_class == 0) { // the up matrix - size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size); + size_t size = + T::BufferB::required_size(config_.intermediate_size, config_.hidden_size, PACKED, 'u', PLAIN); size_t scale_size = config_.intermediate_size * sizeof(float); - write_weights(prefix, "_up_", (char*)up_bb_[expert_idx]->b, expert_idx, size, scale_size); + write_weights(prefix, "_up_", (char*)up_bb_[expert_idx]->b_pack[0], expert_idx, size, scale_size); } else if (mat_class == 1) { - size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size); + size_t size = + T::BufferB::required_size(config_.intermediate_size, config_.hidden_size, PACKED, 'u', PLAIN); size_t scale_size = config_.intermediate_size * sizeof(float); - write_weights(prefix, "_gate_", (char*)gate_bb_[expert_idx]->b, expert_idx, size, scale_size); + write_weights(prefix, "_gate_", (char*)gate_bb_[expert_idx]->b_pack[0], expert_idx, size, scale_size); } else if (mat_class == 2) { - size_t size = T::BufferB::required_size(config_.hidden_size, config_.intermediate_size); + size_t size = + T::BufferB::required_size(config_.hidden_size, config_.intermediate_size, PACKED, 'd', PLAIN); size_t scale_size = config_.hidden_size * sizeof(float); - write_weights(prefix, "_down_", (char*)down_bb_[expert_idx]->b, expert_idx, size, scale_size); + write_weights(prefix, "_down_", (char*)down_bb_[expert_idx]->b_pack[0], expert_idx, size, scale_size); } }, nullptr); @@ -432,6 +457,9 @@ class MOE_KERNEL_TP } for (int i = 0; i < qlen; i++) { for (int j = 0; j < k; j++) { + if (expert_ids[i * k + j] < config_.num_gpu_experts || expert_ids[i * k + j] >= config_.expert_num) { + 
continue; + } m_local_pos_[i][j] = m_local_num_[expert_ids[i * k + j]]++; } } @@ -460,6 +488,9 @@ class MOE_KERNEL_TP // Copy inputs into expert-local buffers MOE_DIRECT_OR_POOL_BY_VAR(qlen, [&](int i) { for (int j = 0; j < k; j++) { + if (expert_ids[i * k + j] < config_.num_gpu_experts || expert_ids[i * k + j] >= config_.expert_num) { + continue; + } memcpy(m_local_input_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size, (input_t*)input + i * config_.hidden_size, sizeof(input_t) * config_.hidden_size); } @@ -608,6 +639,10 @@ class MOE_KERNEL_TP for (int e = e_start; e < e_end; e++) { float sum = 0; for (int j = 0; j < k; j++) { + if (expert_ids[q_idx * k + j] < config_.num_gpu_experts || + expert_ids[q_idx * k + j] >= config_.expert_num) { + continue; + } sum += weights[q_idx * k + j] * ((float*)m_local_down_output_ptr_[expert_ids[q_idx * k + j]]) [m_local_pos_[q_idx][j] * config_.hidden_size + e]; } @@ -691,6 +726,10 @@ class TP_MOE> : public TP_MOE_Common> { delete[] (ggml_bf16_t*)(tpc.up_proj); delete[] (ggml_bf16_t*)(tpc.down_proj); } + if (config.save) { + // free the bf16 weights after saving + tps.clear(); + } this->weights_loaded = true; } else if (config.path != "") { @@ -702,17 +741,22 @@ class TP_MOE> : public TP_MOE_Common> { } } - void merge_results(int qlen, void* output) { + void merge_results(int qlen, void* output, bool incremental) { // #ifdef FORWARD_TIME_PROFILE // forward_perf_start(); // #endif auto pool = this->config.pool; - auto merge_fn = [this, output](int token_nth) { + auto merge_fn = [this, output, incremental](int token_nth) { auto& local_output_numa = this->local_output_numa; auto& tp_configs = this->tp_configs; auto& tp_count = this->tp_count; auto& config = this->config; float* merge_to = local_output_numa[0] + token_nth * tp_configs[0].hidden_size; + if (incremental) { + for (int e = 0; e < config.hidden_size; e++) { + merge_to[e] += ggml_bf16_to_fp32(((ggml_bf16_t*)output + token_nth * config.hidden_size)[e]); + } + } for (int i = 1; i < tp_count; i++) { float* merge_from = local_output_numa[i] + token_nth * tp_configs[i].hidden_size; @@ -750,6 +794,8 @@ class TP_MOE> : public TP_MOE_Common> { // perf_report(); // #endif } + + void merge_results(int qlen, void* output) { merge_results(qlen, output, false); } }; #endif \ No newline at end of file diff --git a/kt-kernel/python/experts.py b/kt-kernel/python/experts.py index 55fb4915..78807eeb 100644 --- a/kt-kernel/python/experts.py +++ b/kt-kernel/python/experts.py @@ -19,6 +19,7 @@ from .experts_base import BaseMoEWrapper, KExpertsCPUBuffer # Import backend implementations from .utils.amx import AMXMoEWrapper from .utils.llamafile import LlamafileMoEWrapper +from .utils.moe_kernel import GeneralMoEWrapper class KTMoEWrapper: @@ -76,7 +77,7 @@ class KTMoEWrapper: chunked_prefill_size: Maximum prefill chunk size cpu_save: Whether to save weights to CPU memory max_deferred_experts_per_token: Number of experts per token to defer. Defaults to 0. 
- method: Backend method ("AMXINT4", "AMXINT8", "LLAMAFILE") + method: Backend method ("AMXINT4", "AMXINT8", "LLAMAFILE", "MOE_INT4", "MOE_INT8") Returns: An instance of the appropriate backend implementation (e.g., AMXMoEWrapper) @@ -86,6 +87,8 @@ class KTMoEWrapper: backend_cls = AMXMoEWrapper elif method == "LLAMAFILE": backend_cls = LlamafileMoEWrapper + elif method in ["MOE_INT4", "MOE_INT8"]: + backend_cls = GeneralMoEWrapper else: raise NotImplementedError(f"Unsupported method: {method}") diff --git a/kt-kernel/python/utils/moe_kernel.py b/kt-kernel/python/utils/moe_kernel.py new file mode 100644 index 00000000..fa3b3d0c --- /dev/null +++ b/kt-kernel/python/utils/moe_kernel.py @@ -0,0 +1,315 @@ +import os +import torch +import ctypes + +# Use relative imports for package structure +from ..experts_base import BaseMoEWrapper +from .loader import SafeTensorLoader +from kt_kernel_ext.moe import MOEConfig + +try: + from kt_kernel_ext.moe import Int8_KERNEL_MOE + + _HAS_INT8_SUPPORT = True +except (ImportError, AttributeError): + Int8_KERNEL_MOE = None + _HAS_INT8_SUPPORT = False +try: + from kt_kernel_ext.moe import Int4_KERNEL_MOE + + _HAS_INT4_SUPPORT = True +except (ImportError, AttributeError): + Int4_KERNEL_MOE = None + _HAS_INT4_SUPPORT = False + +from typing import Optional + + +class GeneralMoEWrapper(BaseMoEWrapper): + """ + moe-based MoE wrapper implementation. + Supports MOE_INT4 and MOE_INT8 quantization methods. + """ + + _safetensor_loader_instance = None # Singleton SafeTensorLoader + + def __init__( + self, + layer_idx: int, + num_experts: int, + num_experts_per_tok: int, + hidden_size: int, + moe_intermediate_size: int, + num_gpu_experts: int, + cpuinfer_threads: int, + threadpool_count: int, + weight_path: str, + chunked_prefill_size: int, + cpu_save: bool = False, + max_deferred_experts_per_token: Optional[int] = None, + method: str = "MOE_INT8", + ): + """ + Initialize general MoE Wrapper. + + Args: + layer_idx: Layer index + num_experts: Total number of experts + num_experts_per_tok: Number of experts per token (top-k) + hidden_size: Hidden dimension size + moe_intermediate_size: MoE intermediate size + num_gpu_experts: Number of experts to run on GPU + cpuinfer_threads: Number of CPU inference threads + threadpool_count: Number of NUMA subpools + weight_path: Path to weights (SafeTensor format) + chunked_prefill_size: Maximum prefill chunk size + cpu_save: Whether to save weights to CPU memory + max_deferred_experts_per_token: Number of experts per token to defer. Defaults to 0. + method: general quantization method ("MOE_INT4" or "MOE_INT8") + """ + if not _HAS_INT4_SUPPORT and method == "MOE_INT4": + raise RuntimeError( + "MoE_INT4 backend not available. kt_kernel_ext was not compiled with int4 support.\n" + "Please recompile with int4 enabled." + ) + if not _HAS_INT8_SUPPORT and method == "MOE_INT8": + raise RuntimeError( + "MoE_INT8 backend not available. kt_kernel_ext was not compiled with int8 support.\n" + "Please recompile with int8 enabled." 
+ ) + + # Initialize base class + super().__init__( + layer_idx=layer_idx, + num_experts=num_experts, + num_experts_per_tok=num_experts_per_tok, + hidden_size=hidden_size, + moe_intermediate_size=moe_intermediate_size, + num_gpu_experts=num_gpu_experts, + cpuinfer_threads=cpuinfer_threads, + threadpool_count=threadpool_count, + weight_path=weight_path, + chunked_prefill_size=chunked_prefill_size, + cpu_save=cpu_save, + max_deferred_experts_per_token=max_deferred_experts_per_token, + method=method, + ) + + # moe-specific: Check if we should load merged safetensor weights + self.load_merged_weight = False + import glob + + if glob.glob(os.path.join(weight_path, "*.safetensors")): + self.load_merged_weight = True + + # Initialize SafeTensor loader (singleton) + if self.load_merged_weight: + if GeneralMoEWrapper._safetensor_loader_instance is None: + GeneralMoEWrapper._safetensor_loader_instance = SafeTensorLoader(weight_path) + self.safetensor_loader = GeneralMoEWrapper._safetensor_loader_instance + + # moe-specific weight storage + self.gate_weights = None + self.up_weights = None + self.down_weights = None + self.gate_scales = None + self.up_scales = None + self.down_scales = None + + def load_weights_from_tensors( + self, + gate_proj: torch.Tensor, + up_proj: torch.Tensor, + down_proj: torch.Tensor, + physical_to_logical_map_cpu: torch.Tensor, + ): + """ + Load and quantize weights from BF16/FP16 tensors (online quantization). + + Args: + gate_proj: Gate projection weights [num_experts, intermediate_size, hidden_size] + up_proj: Up projection weights [num_experts, intermediate_size, hidden_size] + down_proj: Down projection weights [num_experts, hidden_size, intermediate_size] + physical_to_logical_map_cpu: Mapping from physical to logical expert IDs + """ + # Store tensors as instance variables to keep them alive + self.gate_proj = gate_proj.contiguous() + self.up_proj = up_proj.contiguous() + self.down_proj = down_proj.contiguous() + + # Configure MoE with online quantization (cpu_save mode) + moe_config = MOEConfig( + self.num_experts, + self.num_experts_per_tok, + self.hidden_size, + self.moe_intermediate_size, + self.num_gpu_experts, + ) + moe_config.layer_idx = self.layer_idx + moe_config.pool = self.cpu_infer.backend_ + moe_config.max_len = self.chunked_prefill_size + + # Enable save mode for online quantization + moe_config.save = True + moe_config.load = False + + # Set weight pointers + moe_config.gate_proj = self.gate_proj.data_ptr() + moe_config.up_proj = self.up_proj.data_ptr() + moe_config.down_proj = self.down_proj.data_ptr() + + # Set output path for quantized weights + moe_config.path = self.weight_path + + # Create MoE module based on method + if self.method == "MOE_INT4": + self.moe = Int4_KERNEL_MOE(moe_config) + elif self.method == "MOE_INT8": + self.moe = Int8_KERNEL_MOE(moe_config) + else: + raise NotImplementedError(f"Unsupported MoE method: {self.method}") + + # Submit quantization and save task + self.cpu_infer.submit(self.moe.load_weights_task(physical_to_logical_map_cpu.data_ptr())) + self.cpu_infer.sync() + + def load_weights(self, physical_to_logical_map_cpu: torch.Tensor): + """ + Load weights for this layer and initialize the MoE module. 
+ + Args: + physical_to_logical_map_cpu: Mapping from physical to logical expert IDs + """ + gate_ptr = 0 + up_ptr = 0 + down_ptr = 0 + + gate_ptrs = [] + up_ptrs = [] + down_ptrs = [] + + gate_scale_ptrs = [] + up_scale_ptrs = [] + down_scale_ptrs = [] + + if self.load_merged_weight: + base_key = f"blk.{self.layer_idx}" + w = self.safetensor_loader.load_experts(base_key) + + self.gate_weights = w["gate"] + self.up_weights = w["up"] + self.down_weights = w["down"] + self.gate_scales = w["gate_scale"] + self.up_scales = w["up_scale"] + self.down_scales = w["down_scale"] + + # Get pointers to weight arrays + gate_ptrs = [ + [ + ctypes.addressof(ctypes.cast(et.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents) + for et in numa_array + ] + for numa_array in self.gate_weights + ] + + up_ptrs = [ + [ + ctypes.addressof(ctypes.cast(et.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents) + for et in numa_array + ] + for numa_array in self.up_weights + ] + + down_ptrs = [ + [ + ctypes.addressof(ctypes.cast(et.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents) + for et in numa_array + ] + for numa_array in self.down_weights + ] + + gate_scale_ptrs = [ + [ + ctypes.addressof(ctypes.cast(et.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents) + for et in numa_array + ] + for numa_array in self.gate_scales + ] + + up_scale_ptrs = [ + [ + ctypes.addressof(ctypes.cast(et.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents) + for et in numa_array + ] + for numa_array in self.up_scales + ] + + down_scale_ptrs = [ + [ + ctypes.addressof(ctypes.cast(et.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents) + for et in numa_array + ] + for numa_array in self.down_scales + ] + + # Configure MoE + moe_config = MOEConfig( + self.num_experts, + self.num_experts_per_tok, + self.hidden_size, + self.moe_intermediate_size, + self.num_gpu_experts, + ) + moe_config.layer_idx = self.layer_idx + moe_config.pool = self.cpu_infer.backend_ + moe_config.max_len = self.chunked_prefill_size + + moe_config.gate_proj = gate_ptr + moe_config.up_proj = up_ptr + moe_config.down_proj = down_ptr + moe_config.gate_projs = gate_ptrs + moe_config.up_projs = up_ptrs + moe_config.down_projs = down_ptrs + moe_config.gate_scales = gate_scale_ptrs + moe_config.up_scales = up_scale_ptrs + moe_config.down_scales = down_scale_ptrs + + if self.cpu_save: + moe_config.save = True + moe_config.load = False + base_key = f"model.layers.{self.layer_idx}" + w = self.safetensor_loader.load_experts(base_key) + + self.gate_proj = torch.cat(w["gate_weight"], dim=0).contiguous() + self.up_proj = torch.cat(w["up_weight"], dim=0).contiguous() + self.down_proj = torch.cat(w["down_weight"], dim=0).contiguous() + + moe_config.gate_proj = self.gate_proj.data_ptr() + moe_config.up_proj = self.up_proj.data_ptr() + moe_config.down_proj = self.down_proj.data_ptr() + else: + moe_config.load = True + + if not self.load_merged_weight: + moe_config.path = self.weight_path + + # Create MoE module based on moe method + if self.method == "MOE_INT4": + self.moe = Int4_KERNEL_MOE(moe_config) + elif self.method == "MOE_INT8": + self.moe = Int8_KERNEL_MOE(moe_config) + else: + raise NotImplementedError(f"Unsupported MoE method: {self.method}") + + # Load weights + self.cpu_infer.submit(self.moe.load_weights_task(physical_to_logical_map_cpu.data_ptr())) + self.cpu_infer.sync() + + # Clean up temporary weight storage if using merged weights + if self.load_merged_weight: + del self.gate_weights + del self.up_weights + del self.down_weights + del 
self.gate_scales + del self.up_scales + del self.down_scales diff --git a/kt-kernel/scripts/convert_cpu_weights.py b/kt-kernel/scripts/convert_cpu_weights.py index 14d11c37..bb217b4c 100644 --- a/kt-kernel/scripts/convert_cpu_weights.py +++ b/kt-kernel/scripts/convert_cpu_weights.py @@ -615,6 +615,8 @@ class OnlineQuantConverter(ConverterBase): quant_to_amx_map = { "int4": "INT4", "int8": "INT8", + "moe_int4": "MOE_INT4", + "moe_int8": "MOE_INT8", } amx_method = quant_to_amx_map.get(self.quant_method, "INT4") @@ -622,6 +624,7 @@ class OnlineQuantConverter(ConverterBase): for numa_idx in range(self.threadpool_count): numa_folder = os.path.join(layer_path, f"_numa_{numa_idx}") if not os.path.exists(numa_folder): + print(f" Warning: NUMA folder not found: {numa_folder}, skipping...") continue # Iterate through all experts @@ -755,6 +758,8 @@ class OnlineQuantConverter(ConverterBase): quant_to_amx_map = { "int4": "AMXINT4", "int8": "AMXINT8", + "moe_int4": "MOE_INT4", + "moe_int8": "MOE_INT8", } amx_method = quant_to_amx_map.get(self.quant_method, "AMXINT4") @@ -826,7 +831,7 @@ def main(): parser.add_argument("--output", "-o", required=True, help="Output directory for converted safetensors") parser.add_argument( "--quant-method", - choices=["int4", "int8", "awq"], + choices=["int4", "int8", "awq", "moe_int4", "moe_int8"], default="int4", help="Quantization method for output (default: int4)", ) @@ -890,7 +895,7 @@ def main(): input_type=None, merge_to_safetensor=merge_to_safetensor, ) - elif quant_method in ["int4", "int8"] and args.input_type in ["fp8", "fp16", "bf16"]: + elif quant_method in ["int4", "int8", "moe_int4", "moe_int8"] and args.input_type in ["fp8", "fp16", "bf16"]: # Use OnlineQuantConverter for both INT4 and INT8 quantization converter = OnlineQuantConverter( args.input_path, diff --git a/kt-kernel/scripts/convert_gpu_weights.py b/kt-kernel/scripts/convert_gpu_weights.py index 6c9bfa89..fc695a10 100644 --- a/kt-kernel/scripts/convert_gpu_weights.py +++ b/kt-kernel/scripts/convert_gpu_weights.py @@ -34,63 +34,42 @@ from datasets import load_dataset def parse_args(): parser = argparse.ArgumentParser(description="Quantize MoE models with selective quantization") - + # Required arguments - parser.add_argument( - "--model_id", - type=str, - required=True, - help="Path to the input model directory" - ) - parser.add_argument( - "--output_dir", - type=str, - required=True, - help="Path to save the quantized model" - ) - + parser.add_argument("--model_id", type=str, required=True, help="Path to the input model directory") + parser.add_argument("--output_dir", type=str, required=True, help="Path to save the quantized model") + # Optional arguments parser.add_argument( "--quant_type", type=str, choices=["W4A16", "W8A16"], default="W8A16", - help="Quantization type: W4A16 (GPTQ4) or W8A16 (GPTQ8). Default: W8A16" + help="Quantization type: W4A16 (GPTQ4) or W8A16 (GPTQ8). Default: W8A16", ) parser.add_argument( - "--num_calibration_samples", - type=int, - default=512, - help="Number of calibration samples. Default: 512" + "--num_calibration_samples", type=int, default=512, help="Number of calibration samples. Default: 512" ) parser.add_argument( - "--max_sequence_length", - type=int, - default=2048, - help="Maximum sequence length for calibration. Default: 2048" + "--max_sequence_length", type=int, default=2048, help="Maximum sequence length for calibration. 
Default: 2048" ) parser.add_argument( "--dampening_frac", type=float, default=0.1, - help="Dampening fraction to mitigate quantization noise. Default: 0.1" + help="Dampening fraction to mitigate quantization noise. Default: 0.1", ) parser.add_argument( "--dataset", type=str, default="HuggingFaceH4/ultrachat_200k", - help="Dataset for calibration. Default: HuggingFaceH4/ultrachat_200k" + help="Dataset for calibration. Default: HuggingFaceH4/ultrachat_200k", ) parser.add_argument( - "--dataset_split", - type=str, - default="train_sft", - help="Dataset split to use. Default: train_sft" + "--dataset_split", type=str, default="train_sft", help="Dataset split to use. Default: train_sft" ) parser.add_argument( - "--force_cpu", - action="store_true", - help="Force all computations to CPU (sets CUDA_VISIBLE_DEVICES='')" + "--force_cpu", action="store_true", help="Force all computations to CPU (sets CUDA_VISIBLE_DEVICES='')" ) parser.add_argument( "--ignore_patterns", @@ -103,44 +82,37 @@ def parse_args(): r"re:.*\.shared_expert\..*$", r"re:.*\.shared_experts\..*$", r"re:.*\.mlp\.shared_expert_gate$", - r"re:.*\.linear_attn\..*$" + r"re:.*\.linear_attn\..*$", ], - help="Regex patterns for layers to ignore during quantization" + help="Regex patterns for layers to ignore during quantization", ) parser.add_argument( "--torch_dtype", type=str, choices=["bfloat16", "float16", "float32"], default="bfloat16", - help="PyTorch dtype for model loading. Default: bfloat16" + help="PyTorch dtype for model loading. Default: bfloat16", ) parser.add_argument( - "--trust_remote_code", - action="store_true", - help="Allow loading of remote code (required for some models)" - ) - parser.add_argument( - "--random_seed", - type=int, - default=42, - help="Random seed for dataset shuffling. Default: 42" + "--trust_remote_code", action="store_true", help="Allow loading of remote code (required for some models)" ) + parser.add_argument("--random_seed", type=int, default=42, help="Random seed for dataset shuffling. Default: 42") parser.add_argument( "--max_gpu_memory", type=str, default=None, help="Maximum GPU memory for model weights per device (e.g., '40GiB'). " - "GPTQ quantization requires additional GPU memory for Hessian matrix computation, " - "so reserve 40-50%% of total VRAM. For example, use '40GiB' on 80GB GPUs. " - "Remaining layers will be offloaded to CPU. Default: use all available" + "GPTQ quantization requires additional GPU memory for Hessian matrix computation, " + "so reserve 40-50%% of total VRAM. For example, use '40GiB' on 80GB GPUs. " + "Remaining layers will be offloaded to CPU. Default: use all available", ) parser.add_argument( "--max_cpu_memory", type=str, default=None, - help="Maximum CPU memory to use (e.g., '100GiB'). Default: use all available" + help="Maximum CPU memory to use (e.g., '100GiB'). 
Default: use all available", ) - + return parser.parse_args() @@ -167,11 +139,7 @@ def get_torch_dtype(dtype_str): Returns: torch.dtype: Corresponding PyTorch dtype """ - dtype_map = { - "bfloat16": torch.bfloat16, - "float16": torch.float16, - "float32": torch.float32 - } + dtype_map = {"bfloat16": torch.bfloat16, "float16": torch.float16, "float32": torch.float32} return dtype_map[dtype_str] @@ -191,18 +159,18 @@ def check_dense_layers_and_update_ignore(model_id, ignore_patterns, trust_remote Updated ignore_patterns list with dense layer patterns added """ print("🔍 Checking model configuration for dense layers...") - + try: # Load model configuration config = AutoConfig.from_pretrained(model_id, trust_remote_code=trust_remote_code) - + # Check if the model has first_k_dense_replace parameter - first_k_dense_replace = getattr(config, 'first_k_dense_replace', None) - + first_k_dense_replace = getattr(config, "first_k_dense_replace", None) + if first_k_dense_replace is not None and first_k_dense_replace > 0: print(f"✅ Found dense layers configuration: first_k_dense_replace = {first_k_dense_replace}") print(f" Adding first {first_k_dense_replace} layers to ignore list...") - + # Create regex pattern for dense layers (layers 0 to first_k_dense_replace-1) if first_k_dense_replace == 1: dense_pattern = r"re:model\.layers\.0\.mlp\..*$" @@ -210,18 +178,18 @@ def check_dense_layers_and_update_ignore(model_id, ignore_patterns, trust_remote # For multiple layers, use range pattern layer_range = f"[0-{first_k_dense_replace-1}]" dense_pattern = f"re:model\\.layers\\.{layer_range}\\.mlp\\..*$" - + # Add the dense layer pattern to ignore list updated_ignore_patterns = ignore_patterns + [dense_pattern] - + print(f" Dense layer pattern added: {dense_pattern}") print(f" This will ignore MLP components in layers 0-{first_k_dense_replace-1}") - + return updated_ignore_patterns else: print("ℹ️ No dense layers detected (first_k_dense_replace not found or is 0)") return ignore_patterns - + except Exception as e: print(f"⚠️ Warning: Could not check model config for dense layers: {e}") print(" Proceeding with original ignore patterns...") @@ -261,11 +229,7 @@ def load_and_prepare_dataset(dataset_name, dataset_split, num_samples, max_lengt # Tokenize the data def tokenize(sample): return tokenizer( - sample["text"], - padding=False, - max_length=max_length, - truncation=True, - add_special_tokens=False + sample["text"], padding=False, max_length=max_length, truncation=True, add_special_tokens=False ) ds = ds.map(tokenize, remove_columns=ds.column_names) @@ -306,9 +270,7 @@ def main(): # 0) Check for dense layers and update ignore patterns # Dense layers in the first few layers should not be quantized updated_ignore_patterns = check_dense_layers_and_update_ignore( - args.model_id, - args.ignore_patterns, - args.trust_remote_code + args.model_id, args.ignore_patterns, args.trust_remote_code ) # -------------------------------------------------------------------- @@ -320,9 +282,7 @@ def main(): print("🔍 Building CPU-only device map...") with init_empty_weights(): dummy = AutoModelForCausalLM.from_pretrained( - args.model_id, - torch_dtype=torch_dtype, - trust_remote_code=args.trust_remote_code + args.model_id, torch_dtype=torch_dtype, trust_remote_code=args.trust_remote_code ) device_map = {name: "cpu" for name, _ in dummy.named_modules() if name} del dummy @@ -330,9 +290,7 @@ def main(): print("🔍 Inferring device map...") with init_empty_weights(): dummy = AutoModelForCausalLM.from_pretrained( - args.model_id, - 
torch_dtype=torch_dtype, - trust_remote_code=args.trust_remote_code + args.model_id, torch_dtype=torch_dtype, trust_remote_code=args.trust_remote_code ) # Build max_memory dict if specified max_memory = None @@ -357,9 +315,7 @@ def main(): print(f" CPU memory limit: 1000GiB (default, to prevent disk offloading)") device_map = infer_auto_device_map( - dummy, - no_split_module_classes=dummy._no_split_modules, - max_memory=max_memory + dummy, no_split_module_classes=dummy._no_split_modules, max_memory=max_memory ) # Check if disk offloading was triggered (not supported by llmcompressor) @@ -371,8 +327,10 @@ def main(): print(" 1. Increase --max_gpu_memory to use more GPU memory") print(" 2. Add --max_cpu_memory with higher value (e.g., '200GiB')") print(" 3. Ensure your machine has enough GPU + CPU memory") - raise RuntimeError("Disk offloading is not supported by llmcompressor. " - "Please ensure you have enough GPU + CPU memory.") + raise RuntimeError( + "Disk offloading is not supported by llmcompressor. " + "Please ensure you have enough GPU + CPU memory." + ) del dummy # -------------------------------------------------------------------- @@ -409,7 +367,7 @@ def main(): args.num_calibration_samples, args.max_sequence_length, tokenizer, - args.random_seed + args.random_seed, ) # -------------------------------------------------------------------- @@ -447,4 +405,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/kt-kernel/setup.py b/kt-kernel/setup.py index 3860f35c..f2bf0dd9 100644 --- a/kt-kernel/setup.py +++ b/kt-kernel/setup.py @@ -21,6 +21,7 @@ Environment knobs (export before running pip install .): CPUINFER_ENABLE_AMD=OFF ON/OFF -> -DKTRANSFORMERS_CPU_MOE_AMD CPUINFER_ENABLE_KML=OFF ON/OFF -> -DKTRANSFORMERS_CPU_USE_KML CPUINFER_ENABLE_AVX512=OFF ON/OFF -> -DKTRANSFORMERS_CPU_USE_AMX_AVX512 + CPUINFER_BLIS_ROOT=/path/to/blis Forward to -DBLIS_ROOT CPUINFER_ENABLE_LTO=ON ON/OFF -> -DCPUINFER_ENABLE_LTO (your added option) @@ -28,6 +29,7 @@ Environment knobs (export before running pip install .): CPUINFER_LTO_MODE=auto Forward to -DCPUINFER_LTO_MODE CPUINFER_NATIVE=ON (override LLAMA_NATIVE) + GPU backends (if ever added later, keep placeholders): CPUINFER_USE_CUDA=0/1 -DKTRANSFORMERS_USE_CUDA CPUINFER_USE_ROCM=0/1 -DKTRANSFORMERS_USE_ROCM @@ -51,6 +53,43 @@ from setuptools import setup, Extension from setuptools.command.build_ext import build_ext import shutil +# ------------------------- +# Env parsing helpers +# ------------------------- +def _env_get_bool(name: str, default: bool | None = None) -> bool | None: + v = os.environ.get(name) + if v is None: + return default + val = v.strip().lower() + if val in ("1", "on", "true", "yes", "y", "enable", "enabled"): + return True + if val in ("0", "off", "false", "no", "n", "disable", "disabled"): + return False + return default + + +def _cmake_onoff(flag: bool) -> str: + return "ON" if flag else "OFF" + + +def _forward_bool_env(cmake_args: list[str], env_name: str, cmake_flag: str) -> bool: + """If env exists, forward it to CMake as -D=ON/OFF and return True; else return False.""" + b = _env_get_bool(env_name, None) + if b is None: + return False + cmake_args.append(f"-D{cmake_flag}={_cmake_onoff(b)}") + print(f"-- Forward {env_name} -> -D{cmake_flag}={_cmake_onoff(b)}") + return True + + +def _forward_str_env(cmake_args: list[str], env_name: str, cmake_flag: str) -> bool: + v = os.environ.get(env_name) + if not v: + return False + cmake_args.append(f"-D{cmake_flag}={v}") + print(f"-- 
Forward {env_name} -> -D{cmake_flag}={v}") + return True + ################################################################################ # Helpers ################################################################################ @@ -204,7 +243,34 @@ class CMakeBuild(build_ext): return True return False - if os.environ.get("CPUINFER_USE_CUDA") is None: + # Locate nvcc executable (without forcing user to set -DCMAKE_CUDA_COMPILER) + def find_nvcc_path() -> str | None: + cuda_home = os.environ.get("CUDA_HOME") + if cuda_home: + cand = Path(cuda_home) / "bin" / "nvcc" + if cand.exists(): + return str(cand) + which_nvcc = shutil.which("nvcc") + if which_nvcc: + return which_nvcc + # Common fallbacks (ordered by preference) + for cand in [ + "/usr/local/cuda-12.6/bin/nvcc", + "/usr/local/cuda/bin/nvcc", + "/usr/bin/nvcc", + "/usr/lib/nvidia-cuda-toolkit/bin/nvcc", + ]: + if Path(cand).exists(): + return cand + return None + + # Note: We no longer set CMAKE_CUDA_ARCHITECTURES by default. + # If users want to specify CUDA archs, they can set env CPUINFER_CUDA_ARCHS + # (e.g. "89" or "86;89") or pass it via CMAKE_ARGS. + auto_moe_kernel_ = False + # Normalize CPUINFER_USE_CUDA: if unset, auto-detect; otherwise respect truthy/falsey values + cuda_env = _env_get_bool("CPUINFER_USE_CUDA", None) + if cuda_env is None: auto_cuda = detect_cuda_toolkit() os.environ["CPUINFER_USE_CUDA"] = "1" if auto_cuda else "0" print(f"-- CPUINFER_USE_CUDA not set; auto-detected CUDA toolkit: {'YES' if auto_cuda else 'NO'}") @@ -228,56 +294,87 @@ class CMakeBuild(build_ext): print(f"Detected CPU info: {d}") # Vendor / feature specific toggles - # Enable AMD MoE kernel on AMD by default unless user explicitly set CPUINFER_ENABLE_AMD - # temporarily disabled this opt, use llamafile backend for now - # if d.get("vendor") == "amd" and os.environ.get("CPUINFER_ENABLE_AMD") is None: - # cmake_args.append("-DKTRANSFORMERS_CPU_MOE_AMD=ON") - # print("-- Detected AMD CPU; enabling AMD MoE kernel (-DKTRANSFORMERS_CPU_MOE_AMD=ON)") + # AMD MoE: explicit env overrides; otherwise default ON on AMD CPU + if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_AMD", "KTRANSFORMERS_CPU_MOE_AMD"): + if d.get("vendor") == "amd": + auto_moe_kernel_ = True + cmake_args.append("-DKTRANSFORMERS_CPU_MOE_AMD=ON") + print("-- Detected AMD CPU; enabling AMD MoE kernel (-DKTRANSFORMERS_CPU_MOE_AMD=ON)") + _forward_str_env(cmake_args, "CPUINFER_BLIS_ROOT", "BLIS_ROOT") - # On ARM, enable KML by default if not explicitly toggled - if d.get("vendor") == "arm" and os.environ.get("CPUINFER_ENABLE_KML") is None: - cmake_args.append("-DKTRANSFORMERS_CPU_USE_KML=ON") - print("-- Detected ARM CPU; enabling KML (-DKTRANSFORMERS_CPU_USE_KML=ON)") + # KML: explicit env overrides; otherwise default ON on ARM + if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_KML", "KTRANSFORMERS_CPU_USE_KML"): + if d.get("vendor") == "arm": + auto_moe_kernel_ = True + cmake_args.append("-DKTRANSFORMERS_CPU_USE_KML=ON") + print("-- Detected ARM CPU; enabling KML (-DKTRANSFORMERS_CPU_USE_KML=ON)") - # If AMX or AVX512 present, enable umbrella unless overridden; enable AMX specifically when present - if "AMX" in d["features"]: - if os.environ.get("CPUINFER_ENABLE_AMX") is None: + # AMX: explicit env overrides; else enable if detected + if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_AMX", "KTRANSFORMERS_CPU_USE_AMX"): + if "AMX" in d["features"]: cmake_args.append("-DKTRANSFORMERS_CPU_USE_AMX=ON") print("-- AMX support detected; enabling 
(-DKTRANSFORMERS_CPU_USE_AMX=ON)") - if ("AMX" in d["features"] or "AVX512" in d["features"]) and os.environ.get( - "CPUINFER_ENABLE_AVX512" - ) is None: - cmake_args.append("-DKTRANSFORMERS_CPU_USE_AMX_AVX512=ON") - print("-- Enabling AMX/AVX512 umbrella (-DKTRANSFORMERS_CPU_USE_AMX_AVX512=ON)") + # AVX512 umbrella: explicit env overrides; else enable if AMX or AVX512 detected + if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_AVX512", "KTRANSFORMERS_CPU_USE_AMX_AVX512"): + if "AMX" in d["features"] or "AVX512" in d["features"]: + cmake_args.append("-DKTRANSFORMERS_CPU_USE_AMX_AVX512=ON") + print("-- Enabling AMX/AVX512 umbrella (-DKTRANSFORMERS_CPU_USE_AMX_AVX512=ON)") + + # Auto-enable MOE kernel only when env explicitly turns on AMD or KML backend + # (Do not enable purely on vendor auto-detection to avoid surprise behavior.) + amd_env = _env_get_bool("CPUINFER_ENABLE_AMD", None) + kml_env = _env_get_bool("CPUINFER_ENABLE_KML", None) + if amd_env or kml_env: + auto_moe_kernel_ = True + already_set = any("KTRANSFORMERS_CPU_MOE_KERNEL" in a for a in cmake_args) + if not already_set and auto_moe_kernel_: + cmake_args.append("-DKTRANSFORMERS_CPU_MOE_KERNEL=ON") + print("-- Auto-enabling MOE kernel (-DKTRANSFORMERS_CPU_MOE_KERNEL=ON) because CPUINFER_ENABLE_AMD or CPUINFER_ENABLE_KML is ON") # Friendly summary print( f"-- CPU detection: vendor={d.get('vendor')} arch={d.get('arch')} features={sorted(list(d.get('features', [])))}" ) - # Optional AMX / MLA toggles (explicit env overrides auto detection above) - if os.environ.get("CPUINFER_ENABLE_AMX"): - cmake_args.append(f"-DKTRANSFORMERS_CPU_USE_AMX={os.environ['CPUINFER_ENABLE_AMX']}") - if os.environ.get("CPUINFER_ENABLE_KML"): - cmake_args.append(f"-DKTRANSFORMERS_CPU_USE_KML={os.environ['CPUINFER_ENABLE_KML']}") - if os.environ.get("CPUINFER_ENABLE_MLA"): - cmake_args.append(f"-DKTRANSFORMERS_CPU_MLA={os.environ['CPUINFER_ENABLE_MLA']}") + # MLA toggle (string/boolean allowed) + if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_MLA", "KTRANSFORMERS_CPU_MLA"): + _forward_str_env(cmake_args, "CPUINFER_ENABLE_MLA", "KTRANSFORMERS_CPU_MLA") - # LTO toggles if user added them in CMakeLists - if os.environ.get("CPUINFER_ENABLE_LTO"): - cmake_args.append(f"-DCPUINFER_ENABLE_LTO={os.environ['CPUINFER_ENABLE_LTO']}") - if os.environ.get("CPUINFER_LTO_JOBS"): - cmake_args.append(f"-DCPUINFER_LTO_JOBS={os.environ['CPUINFER_LTO_JOBS']}") - if os.environ.get("CPUINFER_LTO_MODE"): - cmake_args.append(f"-DCPUINFER_LTO_MODE={os.environ['CPUINFER_LTO_MODE']}") + # LTO toggles + _forward_bool_env(cmake_args, "CPUINFER_ENABLE_LTO", "CPUINFER_ENABLE_LTO") + _forward_str_env(cmake_args, "CPUINFER_LTO_JOBS", "CPUINFER_LTO_JOBS") + _forward_str_env(cmake_args, "CPUINFER_LTO_MODE", "CPUINFER_LTO_MODE") # GPU backends (mutually exclusive expected) - if os.environ.get("CPUINFER_USE_CUDA") == "1": + if _env_get_bool("CPUINFER_USE_CUDA", False): cmake_args.append("-DKTRANSFORMERS_USE_CUDA=ON") print("-- Enabling CUDA backend (-DKTRANSFORMERS_USE_CUDA=ON)") - if os.environ.get("CPUINFER_USE_ROCM") == "1": + # Inject nvcc compiler path automatically unless user already specified one. 
+ user_specified_compiler = any("CMAKE_CUDA_COMPILER" in a for a in cmake_args) + if not user_specified_compiler: + extra_env = os.environ.get("CMAKE_ARGS", "") + if "CMAKE_CUDA_COMPILER" in extra_env: + user_specified_compiler = True + if not user_specified_compiler: + nvcc_path = find_nvcc_path() + if nvcc_path: + cmake_args.append(f"-DCMAKE_CUDA_COMPILER={nvcc_path}") + print(f"-- Auto-detected nvcc: {nvcc_path} (adding -DCMAKE_CUDA_COMPILER)") + else: + print("-- Warning: nvcc not found via CUDA_HOME/PATH/common prefixes; CUDA configure may fail.") + # Optional host compiler for nvcc if user set CUDAHOSTCXX + if os.environ.get("CUDAHOSTCXX"): + hostcxx = os.environ["CUDAHOSTCXX"] + cmake_args.append(f"-DCMAKE_CUDA_HOST_COMPILER={hostcxx}") + print(f"-- Using CUDA host compiler from CUDAHOSTCXX: {hostcxx}") + # Respect user-provided architectures only (no default auto-detection). + archs_env = os.environ.get("CPUINFER_CUDA_ARCHS", "").strip() + if archs_env and not any("CMAKE_CUDA_ARCHITECTURES" in a for a in cmake_args): + cmake_args.append(f"-DCMAKE_CUDA_ARCHITECTURES={archs_env}") + print(f"-- Set CUDA architectures from CPUINFER_CUDA_ARCHS: {archs_env}") + if _env_get_bool("CPUINFER_USE_ROCM", False): cmake_args.append("-DKTRANSFORMERS_USE_ROCM=ON") - if os.environ.get("CPUINFER_USE_MUSA") == "1": + if _env_get_bool("CPUINFER_USE_MUSA", False): cmake_args.append("-DKTRANSFORMERS_USE_MUSA=ON") # Respect user extra CMAKE_ARGS (space separated) @@ -286,7 +383,7 @@ class CMakeBuild(build_ext): cmake_args += [a for a in extra.split() if a] # Force rebuild? (delete cache) - if os.environ.get("CPUINFER_FORCE_REBUILD") == "1": + if _env_get_bool("CPUINFER_FORCE_REBUILD", True): cache = build_temp / "CMakeCache.txt" if cache.exists(): cache.unlink()
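
Appended note (not part of the patch): a minimal sketch of how the reworked environment knobs in `setup.py` are expected to be driven for an AMD build. The flag mappings follow the env-knob table and the new `_env_get_bool`/`_forward_bool_env` helpers above; the BLIS path and the idea of invoking pip through `subprocess` are illustrative assumptions, not something this PR prescribes.

```python
# Sketch only: drive the new setup.py env knobs for an AMD + CUDA build.
# CPUINFER_ENABLE_AMD=ON forwards -DKTRANSFORMERS_CPU_MOE_AMD=ON and, per the new
# auto-enable logic, also turns on -DKTRANSFORMERS_CPU_MOE_KERNEL=ON.
# Accepted truthy spellings (see _env_get_bool): 1, on, true, yes, y, enable, enabled.
import os
import subprocess
import sys

env = dict(os.environ)
env.update(
    {
        "CPUINFER_ENABLE_AMD": "ON",        # -> -DKTRANSFORMERS_CPU_MOE_AMD=ON
        "CPUINFER_ENABLE_AVX512": "OFF",    # -> -DKTRANSFORMERS_CPU_USE_AMX_AVX512=OFF
        "CPUINFER_USE_CUDA": "1",           # -> -DKTRANSFORMERS_USE_CUDA=ON (nvcc auto-detected)
        "CPUINFER_BLIS_ROOT": "/opt/blis",  # placeholder path -> -DBLIS_ROOT=/opt/blis
    }
)
subprocess.check_call([sys.executable, "-m", "pip", "install", "."], cwd="kt-kernel", env=env)
```

For people who configure CMake directly, the new `amd` entry in CMakePresets.json captures roughly the same flag set declaratively (`cmake --preset amd`).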
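A second hedged sketch, this time for the new MOE_INT8 backend: the constructor arguments mirror `GeneralMoEWrapper.__init__` as added in `python/utils/moe_kernel.py`, but the import path, model dimensions, thread counts, and weight path are placeholders. In normal use the backend would be selected through `KTMoEWrapper` with `method="MOE_INT8"` rather than constructed directly.

```python
# Sketch only: exercising the MOE_INT8 backend added in this PR directly.
# Dimensions, paths, and thread counts below are illustrative placeholders.
import torch

from kt_kernel.utils.moe_kernel import GeneralMoEWrapper  # import path is an assumption

wrapper = GeneralMoEWrapper(
    layer_idx=0,
    num_experts=128,
    num_experts_per_tok=8,
    hidden_size=2048,
    moe_intermediate_size=768,
    num_gpu_experts=0,
    cpuinfer_threads=32,
    threadpool_count=2,
    weight_path="/path/to/cpu-weights",  # produced by convert_cpu_weights.py --quant-method moe_int8
    chunked_prefill_size=4096,
    method="MOE_INT8",
)

# Identity physical->logical expert map; the C++ side reads it as uint64_t,
# so a contiguous int64 tensor with non-negative values has the right layout.
physical_to_logical_map = torch.arange(128, dtype=torch.int64)
wrapper.load_weights(physical_to_logical_map)
```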