[ADD] support multi-gpu qlen>1 q5_k

This commit is contained in:
chenxl 2024-08-12 11:17:29 +00:00
parent f293803156
commit f5f79f5c0e
63 changed files with 3271 additions and 1285 deletions

3
.gitignore vendored
View file

@ -15,3 +15,6 @@ node_modules
compile_commands.json
*.egg-info*
*dist/
ktransformers/server/local_store/
ktransformers/server_test1.db
*.patch

16
install.bat Normal file
View file

@ -0,0 +1,16 @@
@echo off
REM clear build dirs
rmdir /S /Q ktransformers\ktransformers_ext\build
rmdir /S /Q ktransformers\ktransformers_ext\cuda\build
rmdir /S /Q ktransformers\ktransformers_ext\cuda\dist
rmdir /S /Q ktransformers\ktransformers_ext\out
del /F /Q ktransformers\ktransformers_ext\cuda\*.egg-info
echo Installing python dependencies from requirements.txt
pip install -r requirements-local_chat.txt
echo Installing ktransformers
set KTRANSFORMERS_FORCE_BUILD=TRUE
pip install . --no-build-isolation
echo Installation completed successfully

View file

@ -11,5 +11,5 @@ echo "Installing python dependencies from requirements.txt"
pip install -r requirements-local_chat.txt
echo "Installing ktransformers"
pip install . --no-build-isolation
KTRANSFORMERS_FORCE_BUILD=TRUE pip install . --no-build-isolation
echo "Installation completed successfully"

View file

@ -189,7 +189,13 @@ else()
message(STATUS "Unknown architecture")
endif()
find_package(CUDA REQUIRED)
# message(STATUS "CUDAToolkit_ROOT:${CUDAToolkit_ROOT}")
# find_package(FindCUDAToolkit REQUIRED)
# if(CUDAToolkit_FOUND)
# message(STATUS "Found CUDA cudart lib at:${CUDAToolkit_LIBRARY_DIR}")
# else()
# message(STATUS "Can't found CUDA lib")
# endif()
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
@ -198,7 +204,12 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/pybind11 ${CMAKE_
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/llama.cpp ${CMAKE_CURRENT_BINARY_DIR}/third_party/llama.cpp)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party)
include_directories("${CUDA_INCLUDE_DIRS}")
if (WIN32)
include_directories("$ENV{CUDA_PATH}/include")
elseif (UNIX)
find_package(CUDA REQUIRED)
include_directories("${CUDA_INCLUDE_DIRS}")
endif()
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} SOURCE_DIR1)
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/cpu_backend SOURCE_DIR2)
@ -209,4 +220,8 @@ message(STATUS "ALL_SOURCES: ${ALL_SOURCES}")
pybind11_add_module(${PROJECT_NAME} MODULE ${ALL_SOURCES})
target_link_libraries(${PROJECT_NAME} PRIVATE llama)
target_link_libraries(${PROJECT_NAME} PRIVATE "/usr/local/cuda/lib64/libcudart.so")
if(WIN32)
target_link_libraries(${PROJECT_NAME} PRIVATE "$ENV{CUDA_PATH}/lib/x64/cudart.lib")#CUDA::cudart
elseif(UNIX)
target_link_libraries(${PROJECT_NAME} PRIVATE "$ENV{CUDA_HOME}/lib64/libcudart.so")
endif()

View file

@ -3,8 +3,8 @@
* @Author : chenht2022
* @Date : 2024-07-16 10:43:18
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:33:47
* @LastEditors : chenxl
* @LastEditTime : 2024-08-08 04:23:51
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#ifndef CPUINFER_TASKQUEUE_H
@ -17,6 +17,44 @@
#include <queue>
#include <thread>
#include <vector>
#ifdef _WIN32
#include <windows.h>
#endif
class custom_mutex {
private:
#ifdef _WIN32
HANDLE global_mutex;
#else
std::mutex global_mutex;
#endif
public:
custom_mutex()
{
#ifdef _WIN32
global_mutex = CreateMutex(NULL, FALSE, NULL); // create the Win32 mutex used by lock()/unlock()
#endif
}
void lock()
{
#ifdef _WIN32
WaitForSingleObject(global_mutex, INFINITE);
#else
global_mutex.lock();
#endif
}
void unlock()
{
#ifdef _WIN32
ReleaseMutex(global_mutex);
#else
global_mutex.unlock();
#endif
}
};
class TaskQueue {
public:
@ -32,7 +70,7 @@ class TaskQueue {
std::queue<std::function<void()>> tasks;
std::thread worker;
std::mutex mutex;
custom_mutex mutex;
std::atomic<bool> sync_flag;
std::atomic<bool> exit_flag;
};
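Because custom_mutex keeps the lock()/unlock() member names, it satisfies BasicLockable, so the TaskQueue code that previously guarded its std::mutex keeps working after the member swap above. A minimal sketch of that usage (not taken from the repository; queue_mutex, pending and enqueue_task are illustrative names):
#include <functional>
#include <mutex>
#include <queue>
custom_mutex queue_mutex;                  // replaces the former std::mutex member
std::queue<std::function<void()>> pending; // stands in for TaskQueue::tasks
void enqueue_task(std::function<void()> task) {
    // std::lock_guard only needs lock()/unlock(), which custom_mutex provides,
    // so the Win32 HANDLE path and the std::mutex path are guarded identically.
    std::lock_guard<custom_mutex> guard(queue_mutex);
    pending.push(std::move(task));
}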

View file

@ -3,8 +3,8 @@
* @Author : Azure-Tang
* @Date : 2024-07-25 13:38:30
* @Version : 1.0.0
* @LastEditors : Azure
* @LastEditTime : 2024-07-26 08:36:03
* @LastEditors : kkk1nak0
* @LastEditTime : 2024-08-09 01:45:02
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
@ -23,6 +23,8 @@ PYBIND11_MODULE(KTransformersOps, m) {
py::arg("data"), py::arg("blk_size"), py::arg("device"));
m.def("dequantize_q6_k", &dequantize_q6_k, "Function to dequantize q6_k data.",
py::arg("data"), py::arg("blk_size"), py::arg("device"));
m.def("dequantize_q5_k", &dequantize_q5_k, "Function to dequantize q5_k data.",
py::arg("data"), py::arg("blk_size"), py::arg("device"));
m.def("dequantize_q4_k", &dequantize_q4_k, "Function to dequantize q4_k data.",
py::arg("data"), py::arg("blk_size"), py::arg("device"));
m.def("gptq_marlin_gemm", &gptq_marlin_gemm, "Function to perform GEMM using Marlin quantization.",

View file

@ -12,12 +12,15 @@ int test(){
}
torch::Tensor dequantize_q6_k(torch::Tensor data, int blk_size, torch::Device device);
torch::Tensor dequantize_q5_k(torch::Tensor data, int blk_size, torch::Device device);
PYBIND11_MODULE(cudaops, m) {
m.def("dequantize_q8_0", &dequantize_q8_0, "Function to dequantize q8_0 data.",
py::arg("data"), py::arg("blk_size"), py::arg("device"));
m.def("dequantize_q6_k", &dequantize_q6_k, "Function to dequantize q6_k data.",
py::arg("data"), py::arg("blk_size"), py::arg("device"));
m.def("dequantize_q5_k", &dequantize_q5_k, "Function to dequantize q5_k data.",
py::arg("data"), py::arg("blk_size"), py::arg("device"));
m.def("dequantize_q4_k", &dequantize_q4_k, "Function to dequantize q4_k data.",
py::arg("data"), py::arg("blk_size"), py::arg("device"));
m.def("test", &test, "Function to test.");

View file

@ -1,39 +0,0 @@
#include <cuda_fp16.h>
__device__ float ggml_compute_fp16_to_fp32(uint16_t h) {
return __uint2float_rd(h);
}
static inline float ggml_compute_fp16_to_fp32(uint16_t h) {
uint16_t tmp;
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
return (float)tmp;
}
// define the global table for fp16 to fp32 conversion
__device__ float ggml_table_f32_f16[1 << 16];
// CUDA Kernel to init the table
__global__ void init_fp16_to_fp32_table() {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
for (auto blk_id = idx; blk_id<(1 << 16); blk_id+=blockDim.x * gridDim.x){
ggml_table_f32_f16[blk_id] = GGML_COMPUTE_FP16_TO_FP32(blk_id);
}
}
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
extern __device__ float ggml_table_f32_f16[1 << 16]; // Declare as __device__ if used within device code
// This version of the function is designed to be called from within a CUDA kernel
#if !defined(GGML_FP16_TO_FP32)
__device__ float ggml_lookup_fp16_to_fp32(uint16_t f) {
return ggml_table_f32_f16[f];
}
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#endif

View file

@ -3,8 +3,8 @@
* @Author : Azure-Tang, Boxin Zhang
* @Date : 2024-07-25 13:38:30
* @Version : 1.0.0
* @LastEditors : Azure
* @LastEditTime : 2024-07-26 11:58:50
* @LastEditors : kkk1nak0
* @LastEditTime : 2024-08-09 07:57:06
* Adapted from https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c
* Copyright (c) 2023-2024 The ggml authors
* Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
@ -14,6 +14,7 @@
#include <torch/extension.h>
#include <torch/torch.h>
#include <cstdint>
#include <c10/cuda/CUDAGuard.h>
__global__ void dequantize_q8_0_kernel(float* output, const float* scales, const int8_t* qs, int num_blocks, int blk_size) {
int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
@ -59,6 +60,35 @@ __global__ void dequantize_q4_k_kernel(int8_t* data, float* output, int blk_size
}
}
__global__ void dequantize_q5_k_kernel(int8_t* data, float* output, int blk_size, int num_blocks) {
int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
for (auto block_id=global_idx; block_id<num_blocks; block_id+= blockDim.x * gridDim.x){
float* __restrict__ output_blk = (float*)(output + block_id * 256);
const float d = __half2float(*(reinterpret_cast<half*>(data + block_id * blk_size + 0)));
const float min = __half2float(*(reinterpret_cast<half*>(data + block_id * blk_size + 2)));
const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 16);
const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size + 48);
int is = 0;
uint8_t sc, m;
uint8_t u1 = 1, u2 = 2;
uint8_t* scales = (uint8_t*)(data + block_id * blk_size + 4);
for (int j = 0; j < 256; j += 64) {
get_scale_min_k4(is + 0, scales, &sc, &m);
const float d1 = d * sc; const float m1 = min * m;
get_scale_min_k4(is + 1, scales, &sc, &m);
const float d2 = d * sc; const float m2 = min * m;
for (int l = 0; l < 32; ++l) *output_blk++ = d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1;
for (int l = 0; l < 32; ++l) *output_blk++ = d2 * ((ql[l] >> 4) + (qh[l] & u2 ? 16 : 0)) - m2;
ql += 32; is += 2;
u1 <<= 2; u2 <<= 2;
}
}
}
__global__ void dequantize_q6_k_kernel(int8_t* data, float* output, int blk_size, int num_blocks) {
int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
for (auto block_id=global_idx; block_id<num_blocks;block_id+=blockDim.x * gridDim.x){
@ -94,6 +124,7 @@ __global__ void dequantize_q6_k_kernel(int8_t* data, float* output, int blk_size
torch::Tensor dequantize_q8_0(torch::Tensor data, int blk_size, torch::Device device) {
int num_blocks = data.numel() / blk_size;
const at::cuda::OptionalCUDAGuard device_guard(device);
// allocate device-side buffers for the scales and quantized values
auto options_scales = torch::TensorOptions().dtype(torch::kFloat32).device(device).memory_format(torch::MemoryFormat::Contiguous);
auto options_qs = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
@ -128,6 +159,7 @@ torch::Tensor dequantize_q6_k(torch::Tensor data, int blk_size, torch::Device de
// data.numel%blk_size should be 0, else raise err
int num_blocks = data.numel() / blk_size;
const at::cuda::OptionalCUDAGuard device_guard(device);
auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
auto data_gpu = torch::empty({data.numel()}, options);
@ -147,6 +179,7 @@ torch::Tensor dequantize_q6_k(torch::Tensor data, int blk_size, torch::Device de
torch::Tensor dequantize_q4_k(torch::Tensor data, int blk_size, torch::Device device) {
// data.numel%blk_size should be 0, else raise err
int num_blocks = data.numel() / blk_size;
const at::cuda::OptionalCUDAGuard device_guard(device);
auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
auto data_gpu = torch::empty({data.numel()}, options);
@ -162,3 +195,22 @@ torch::Tensor dequantize_q4_k(torch::Tensor data, int blk_size, torch::Device de
cudaDeviceSynchronize();
return output;
}
torch::Tensor dequantize_q5_k(torch::Tensor data, int blk_size, torch::Device device) {
int num_blocks = data.numel() / blk_size;
auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
auto data_gpu = torch::empty({data.numel()}, options);
data_gpu.copy_(data, false);
// Create output tensor
auto output = torch::zeros({num_blocks, 256}, torch::dtype(torch::kFloat32).device(device));
// Launch kernel
dequantize_q5_k_kernel<<< 512, 256 >>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, num_blocks);
cudaDeviceSynchronize();
return output;
}
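The hard-coded offsets in dequantize_q5_k_kernel (0, 2, 4, 16, 48) follow ggml's Q5_K super-block layout for QK_K = 256, i.e. blk_size is expected to be 176. A sketch of the equivalent struct, shown only to document the offsets the kernel assumes (field names follow the ggml-quants.c source this file adapts):
#include <cstdint>
// Q5_K super-block: 256 weights packed into 176 bytes.
typedef struct {
    uint16_t d;           // offset  0: fp16 super-block scale
    uint16_t dmin;        // offset  2: fp16 super-block minimum
    uint8_t  scales[12];  // offset  4: packed 6-bit sub-block scales/mins
    uint8_t  qh[32];      // offset 16: high (5th) bit of each quant
    uint8_t  qs[128];     // offset 48: low 4 bits, two quants per byte
} block_q5_K;             // sizeof(block_q5_K) == 176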

View file

@ -3,8 +3,8 @@
* @Author : Azure-Tang
* @Date : 2024-07-22 09:27:55
* @Version : 1.0.0
* @LastEditors : Azure
* @LastEditTime : 2024-07-26 08:38:20
* @LastEditors : kkk1nak0
* @LastEditTime : 2024-08-09 01:44:21
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#pragma once
@ -15,4 +15,5 @@
torch::Tensor dequantize_q8_0(torch::Tensor data, int blk_size, torch::Device device);
torch::Tensor dequantize_q6_k(torch::Tensor data, int blk_size, torch::Device device);
torch::Tensor dequantize_q5_k(torch::Tensor data, int blk_size, torch::Device device);
torch::Tensor dequantize_q4_k(torch::Tensor data, int blk_size, torch::Device device);

View file

@ -23,7 +23,7 @@
*/
#include "gptq_marlin.cuh"
#include "gptq_marlin_dtypes.cuh"
#include <c10/cuda/CUDAGuard.h>
#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t) \
static_assert(std::is_same<scalar_t, half>::value || \
std::is_same<scalar_t, nv_bfloat16>::value, \
@ -1703,28 +1703,63 @@ void marlin_mm_f16i4(const void* A, const void* B, void* C, void* s,
thread_m_blocks = exec_cfg.max_m_blocks;
}
// Define kernel configurations
#define undefined_error TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " + \
str(prob_n) + ", " + str(prob_k) + "]" + \
", has_act_order = " + str(has_act_order) + \
", num_groups = " + str(num_groups) + \
", group_size = " + str(group_size) + \
", thread_m_blocks = " + str(thread_m_blocks) + \
", thread_n_blocks = " + str(thread_n_blocks) + \
", thread_k_blocks = " + str(thread_k_blocks));
if (num_bits == 4 && num_threads == 256)
{
if (false) {
}
CALL_IF(4, 32, 2, 256)
CALL_IF(4, 16, 4, 256)
CALL_IF(4, 8, 8, 256)
else {
undefined_error
}
}
else if (num_bits == 4 && num_threads == 128)
{
if (false) {
}
CALL_IF(4, 8, 4, 128)
CALL_IF(4, 4, 8, 128)
else {
undefined_error
}
}
else if (num_bits == 8 && num_threads == 256)
{
if (false) {
}
CALL_IF(8, 32, 2, 256)
CALL_IF(8, 16, 4, 256)
CALL_IF(8, 8, 8, 256)
else {
undefined_error
}
}
else if (num_bits == 8 && num_threads == 128)
{
if (false) {
}
CALL_IF(8, 8, 4, 128)
CALL_IF(8, 4, 8, 128)
else {
TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " +
str(prob_n) + ", " + str(prob_k) + "]" +
", has_act_order = " + str(has_act_order) +
", num_groups = " + str(num_groups) +
", group_size = " + str(group_size) +
", thread_m_blocks = " + str(thread_m_blocks) +
", thread_n_blocks = " + str(thread_n_blocks) +
", thread_k_blocks = " + str(thread_k_blocks));
undefined_error
}
}
else {
undefined_error
}
A_ptr += 16 * thread_m_blocks * (prob_k / 8) * par;
@ -1739,6 +1774,7 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
torch::Tensor& perm, torch::Tensor& workspace,
int64_t num_bits, int64_t size_m, int64_t size_n,
int64_t size_k, bool is_k_full) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
// Verify num_bits
TORCH_CHECK(num_bits == 4 || num_bits == 8,
"num_bits must be 4 or 8. Got = ", num_bits);
@ -1781,7 +1817,6 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
TORCH_CHECK(perm.is_contiguous(), "perm is not contiguous");
// Alloc buffers
const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
torch::Tensor c = torch::empty({size_m, size_n}, options);
torch::Tensor a_tmp = torch::empty({size_m, size_k}, options);
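Hoisting the OptionalCUDAGuard to the top of gptq_marlin_gemm (instead of just before the buffer allocation) is the multi-GPU part of this change: every allocation and kernel launch in the function now targets a's device rather than whichever device happened to be current. A minimal sketch of the pattern, with illustrative names only:
#include <torch/extension.h>
#include <ATen/DeviceGuard.h>
#include <c10/cuda/CUDAGuard.h>
torch::Tensor gemm_like_entry(torch::Tensor& a) {
    // Make a's GPU the current device for the whole scope; the previous
    // device is restored when the guard goes out of scope.
    const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
    // Buffers created below land on a's device, so inputs living on
    // cuda:1 no longer collide with scratch tensors allocated on cuda:0.
    auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
    return torch::empty({a.size(0), a.size(1)}, options);
}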

View file

@ -2,17 +2,25 @@
from setuptools import setup, Extension
from torch.utils import cpp_extension
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
# setup marlin gemm
setup(name='KTransformersOps',
setup(
name='KTransformersOps',
ext_modules=[
CUDAExtension('KTransformersOps', [
CUDAExtension(
'KTransformersOps', [
'custom_gguf/dequant.cu',
'binding.cpp',
'gptq_marlin/gptq_marlin.cu',
# 'gptq_marlin_repack.cu',
])
],
cmdclass={'build_ext': BuildExtension
})
extra_compile_args={
'cxx': ['-O3'],
'nvcc': [
'-O3',
'--use_fast_math',
'-Xcompiler', '-fPIC',
]
},
)
],
cmdclass={'build_ext': BuildExtension}
)

View file

@ -37,7 +37,7 @@ class LinearBindings {
Args* args_ = (Args*)args;
args_->cpuinfer->enqueue(&Linear::warm_up, args_->linear);
}
static std::pair<intptr_t, intptr_t> interface(Linear& linear) {
static std::pair<intptr_t, intptr_t> cpuinfer_interface(Linear& linear) {
Args* args = new Args{nullptr, &linear};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
@ -55,7 +55,7 @@ class LinearBindings {
Args* args_ = (Args*)args;
args_->cpuinfer->enqueue(&Linear::forward, args_->linear, args_->qlen, args_->input, args_->output);
}
static std::pair<intptr_t, intptr_t> interface(Linear& linear, int qlen, intptr_t input, intptr_t output) {
static std::pair<intptr_t, intptr_t> cpuinfer_interface(Linear& linear, int qlen, intptr_t input, intptr_t output) {
Args* args = new Args{nullptr, &linear, qlen, (const void*)input, (void*)output};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
@ -74,7 +74,7 @@ class MLPBindings {
Args* args_ = (Args*)args;
args_->cpuinfer->enqueue(&MLP::warm_up, args_->mlp);
}
static std::pair<intptr_t, intptr_t> interface(MLP& mlp) {
static std::pair<intptr_t, intptr_t> cpuinfer_interface(MLP& mlp) {
Args* args = new Args{nullptr, &mlp};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
@ -92,7 +92,7 @@ class MLPBindings {
Args* args_ = (Args*)args;
args_->cpuinfer->enqueue(&MLP::forward, args_->mlp, args_->qlen, args_->input, args_->output);
}
static std::pair<intptr_t, intptr_t> interface(MLP& mlp, int qlen, intptr_t input, intptr_t output) {
static std::pair<intptr_t, intptr_t> cpuinfer_interface(MLP& mlp, int qlen, intptr_t input, intptr_t output) {
Args* args = new Args{nullptr, &mlp, qlen, (const void*)input, (void*)output};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
@ -111,7 +111,7 @@ class MOEBindings {
Args* args_ = (Args*)args;
args_->cpuinfer->enqueue(&MOE::warm_up, args_->moe);
}
static std::pair<intptr_t, intptr_t> interface(MOE& moe) {
static std::pair<intptr_t, intptr_t> cpuinfer_interface(MOE& moe) {
Args* args = new Args{nullptr, &moe};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
@ -132,7 +132,7 @@ class MOEBindings {
Args* args_ = (Args*)args;
args_->cpuinfer->enqueue(&MOE::forward, args_->moe, args_->qlen, args_->k, args_->expert_ids, args_->weights, args_->input, args_->output);
}
static std::pair<intptr_t, intptr_t> interface(MOE& moe, int qlen, int k, intptr_t expert_ids, intptr_t weights, intptr_t input, intptr_t output) {
static std::pair<intptr_t, intptr_t> cpuinfer_interface(MOE& moe, int qlen, int k, intptr_t expert_ids, intptr_t weights, intptr_t input, intptr_t output) {
Args* args = new Args{nullptr, &moe, qlen, k, (const uint64_t*)expert_ids, (const float*)weights, (const void*)input, (void*)output};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
@ -154,8 +154,8 @@ PYBIND11_MODULE(cpuinfer_ext, m) {
}));
py::class_<Linear>(linear_module, "Linear")
.def(py::init<LinearConfig>())
.def("warm_up", &LinearBindings::WarmUpBindinds::interface)
.def("forward", &LinearBindings::ForwardBindings::interface);
.def("warm_up", &LinearBindings::WarmUpBindinds::cpuinfer_interface)
.def("forward", &LinearBindings::ForwardBindings::cpuinfer_interface);
auto mlp_module = m.def_submodule("mlp");
py::class_<MLPConfig>(mlp_module, "MLPConfig")
@ -164,8 +164,8 @@ PYBIND11_MODULE(cpuinfer_ext, m) {
}));
py::class_<MLP>(mlp_module, "MLP")
.def(py::init<MLPConfig>())
.def("warm_up", &MLPBindings::WarmUpBindinds::interface)
.def("forward", &MLPBindings::ForwardBindings::interface);
.def("warm_up", &MLPBindings::WarmUpBindinds::cpuinfer_interface)
.def("forward", &MLPBindings::ForwardBindings::cpuinfer_interface);
auto moe_module = m.def_submodule("moe");
py::class_<MOEConfig>(moe_module, "MOEConfig")
@ -174,6 +174,6 @@ PYBIND11_MODULE(cpuinfer_ext, m) {
}));
py::class_<MOE>(moe_module, "MOE")
.def(py::init<MOEConfig>())
.def("warm_up", &MOEBindings::WarmUpBindinds::interface)
.def("forward", &MOEBindings::ForwardBindings::interface);
.def("warm_up", &MOEBindings::WarmUpBindinds::cpuinfer_interface)
.def("forward", &MOEBindings::ForwardBindings::cpuinfer_interface);
}
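The interface -> cpuinfer_interface rename across these bindings is most likely a Windows-compatibility fix rather than a semantic change: once <windows.h> is pulled into the translation unit (task_queue.h now includes it under _WIN32), the Windows SDK headers define interface as a macro expanding to struct, so any method named interface stops compiling under MSVC. A small Windows-only illustration with hypothetical names:
#include <windows.h>   // indirectly does: #define interface struct
struct WarmUpBindings {
    // static int interface();        // expands to "static int struct();" -> syntax error
    static int cpuinfer_interface();  // renamed identifier compiles fine
};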

View file

@ -1,206 +0,0 @@
import math
import os
import time
from logging import getLogger
import torch
import torch.nn as nn
import transformers
from .quantizer import Quantizer
logger = getLogger(__name__)
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
class GPTQ:
def __init__(self, layer):
self.layer = layer
self.dev = self.layer.weight.device
W = layer.weight.data.clone()
if isinstance(self.layer, nn.Conv2d):
W = W.flatten(1)
if isinstance(self.layer, transformers.pytorch_utils.Conv1D):
W = W.t()
self.rows = W.shape[0]
self.columns = W.shape[1]
self.H = torch.zeros((self.columns, self.columns), device=self.dev)
self.nsamples = 0
self.quantizer = Quantizer()
def add_batch(self, inp, out):
if os.environ.get("DEBUG"):
self.inp1 = inp
self.out1 = out
if len(inp.shape) == 2:
inp = inp.unsqueeze(0)
tmp = inp.shape[0]
if isinstance(self.layer, nn.Linear) or isinstance(self.layer, transformers.Conv1D):
if len(inp.shape) == 3:
inp = inp.reshape((-1, inp.shape[-1]))
inp = inp.t()
if isinstance(self.layer, nn.Conv2d):
unfold = nn.Unfold(
self.layer.kernel_size,
dilation=self.layer.dilation,
padding=self.layer.padding,
stride=self.layer.stride,
)
inp = unfold(inp)
inp = inp.permute([1, 0, 2])
inp = inp.flatten(1)
self.H *= self.nsamples / (self.nsamples + tmp)
self.nsamples += tmp
# inp = inp.float()
inp = math.sqrt(2 / self.nsamples) * inp.float()
# self.H += 2 / self.nsamples * inp.matmul(inp.t())
self.H += inp.matmul(inp.t())
def fasterquant(
self,
blocksize=128,
percdamp=0.01,
group_size=-1,
actorder=False,
static_groups=False,
):
W = self.layer.weight.data.clone()
if isinstance(self.layer, nn.Conv2d):
W = W.flatten(1)
if isinstance(self.layer, transformers.Conv1D):
W = W.t()
W = W.float()
tick = time.time()
if not self.quantizer.ready():
self.quantizer.find_params(W, weight=True)
H = self.H
del self.H
dead = torch.diag(H) == 0
H[dead, dead] = 1
W[:, dead] = 0
g_idx = []
scale = []
zero = []
now_idx = 1
if static_groups:
import copy
groups = []
for i in range(0, self.columns, group_size):
quantizer = copy.deepcopy(self.quantizer)
quantizer.find_params(W[:, i : (i + group_size)], weight=True)
scale.append(quantizer.scale)
zero.append(quantizer.zero)
groups.append(quantizer)
if actorder:
perm = torch.argsort(torch.diag(H), descending=True)
W = W[:, perm]
H = H[perm][:, perm]
invperm = torch.argsort(perm)
Losses = torch.zeros_like(W)
Q = torch.zeros_like(W)
damp = percdamp * torch.mean(torch.diag(H))
diag = torch.arange(self.columns, device=self.dev)
H[diag, diag] += damp
H = torch.linalg.cholesky(H)
H = torch.cholesky_inverse(H)
H = torch.linalg.cholesky(H, upper=True)
Hinv = H
for i1 in range(0, self.columns, blocksize):
i2 = min(i1 + blocksize, self.columns)
count = i2 - i1
W1 = W[:, i1:i2].clone()
Q1 = torch.zeros_like(W1)
Err1 = torch.zeros_like(W1)
Losses1 = torch.zeros_like(W1)
Hinv1 = Hinv[i1:i2, i1:i2]
for i in range(count):
w = W1[:, i]
d = Hinv1[i, i]
if group_size != -1:
if not static_groups:
if (i1 + i) % group_size == 0:
self.quantizer.find_params(W[:, (i1 + i) : (i1 + i + group_size)], weight=True)
if ((i1 + i) // group_size) - now_idx == -1:
scale.append(self.quantizer.scale)
zero.append(self.quantizer.zero)
now_idx += 1
else:
idx = i1 + i
if actorder:
idx = perm[idx]
self.quantizer = groups[idx // group_size]
q = self.quantizer.quantize(w.unsqueeze(1)).flatten()
Q1[:, i] = q
Losses1[:, i] = (w - q) ** 2 / d**2
err1 = (w - q) / d
W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0))
Err1[:, i] = err1
Q[:, i1:i2] = Q1
Losses[:, i1:i2] = Losses1 / 2
W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:])
if os.environ.get("DEBUG"):
self.layer.weight.data[:, :i2] = Q[:, :i2]
self.layer.weight.data[:, i2:] = W[:, i2:]
logger.debug(torch.sum((self.layer(self.inp1) - self.out1) ** 2))
logger.debug(torch.sum(Losses))
torch.cuda.synchronize()
logger.info(f"duration: {(time.time() - tick)}")
logger.info(f"avg loss: {torch.sum(Losses).item() / self.nsamples}")
group_size = group_size if group_size != -1 else self.columns
if static_groups and actorder:
g_idx = [perm[i] // group_size for i in range(self.columns)]
else:
g_idx = [i // group_size for i in range(self.columns)]
g_idx = torch.tensor(g_idx, dtype=torch.int32, device=Q.device)
if actorder:
Q = Q[:, invperm]
g_idx = g_idx[invperm]
if isinstance(self.layer, transformers.Conv1D):
Q = Q.t()
self.layer.weight.data = Q.reshape(self.layer.weight.shape).type_as(self.layer.weight.data)
if os.environ.get("DEBUG"):
logger.debug(torch.sum((self.layer(self.inp1) - self.out1) ** 2))
if scale == []:
scale.append(self.quantizer.scale)
zero.append(self.quantizer.zero)
scale = torch.cat(scale, dim=1)
zero = torch.cat(zero, dim=1)
return scale, zero, g_idx
def free(self):
if os.environ.get("DEBUG"):
self.inp1 = None
self.out1 = None
self.H = None
self.Losses = None
self.Trace = None
torch.cuda.empty_cache()
__all__ = ["GPTQ"]

View file

@ -1,458 +0,0 @@
import enum
from enum import Enum
from typing import Any, Dict, List, Optional
import torch
from torch.nn.parameter import Parameter
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
set_weight_attrs)
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
logger = init_logger(__name__)
GPTQ_MARLIN_TILE = 16
GPTQ_MARLIN_MIN_THREAD_N = 64
GPTQ_MARLIN_MIN_THREAD_K = 128
GPTQ_MARLIN_MAX_PARALLEL = 16
GPTQ_MARLIN_SUPPORTED_NUM_BITS = [4, 8]
GPTQ_MARLIN_SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]
GPTQ_MARLIN_SUPPORTED_SYM = [True]
# Permutations for Marlin scale shuffling
def get_scale_perms(num_bits: int):
scale_perm: List[int] = []
for i in range(8):
scale_perm.extend([i + 8 * j for j in range(8)])
scale_perm_single: List[int] = []
for i in range(4):
scale_perm_single.extend(
[2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
return scale_perm, scale_perm_single
def get_pack_factor(num_bits: int):
assert (num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS
), f"Unsupported num_bits = {num_bits}"
return 32 // num_bits
def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int,
group_size: int, num_bits: int):
scale_perm, scale_perm_single = get_scale_perms(num_bits)
if group_size < size_k and group_size != -1:
s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
else:
s = s.reshape((-1, len(scale_perm_single)))[:, scale_perm_single]
s = s.reshape((-1, size_n)).contiguous()
return s
class GPTQMarlinConfig(QuantizationConfig):
"""Config class for GPTQ Marlin"""
def __init__(self, weight_bits: int, group_size: int, desc_act: bool,
is_sym: bool) -> None:
if desc_act and group_size == -1:
# In this case, act_order == True is the same as act_order == False
# (since we have only one group per output channel)
desc_act = False
self.weight_bits = weight_bits
self.group_size = group_size
self.desc_act = desc_act
self.is_sym = is_sym
# Verify
if self.weight_bits not in GPTQ_MARLIN_SUPPORTED_NUM_BITS:
raise ValueError(
f"Marlin does not support weight_bits = {self.weight_bits}. "
f"Only weight_bits = {GPTQ_MARLIN_SUPPORTED_NUM_BITS} "
"are supported.")
if self.group_size not in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES:
raise ValueError(
f"Marlin does not support group_size = {self.group_size}. "
f"Only group_sizes = {GPTQ_MARLIN_SUPPORTED_GROUP_SIZES} "
"are supported.")
if self.is_sym not in GPTQ_MARLIN_SUPPORTED_SYM:
raise ValueError(
f"Marlin does not support is_sym = {self.is_sym}. "
f"Only sym = {GPTQ_MARLIN_SUPPORTED_SYM} are supported.")
# Init
self.pack_factor = get_pack_factor(weight_bits)
self.tile_size = GPTQ_MARLIN_TILE
self.min_thread_n = GPTQ_MARLIN_MIN_THREAD_N
self.min_thread_k = GPTQ_MARLIN_MIN_THREAD_K
self.max_parallel = GPTQ_MARLIN_MAX_PARALLEL
def __repr__(self) -> str:
return (f"GPTQMarlinConfig(weight_bits={self.weight_bits}, "
f"group_size={self.group_size}, "
f"desc_act={self.desc_act})")
@classmethod
def get_name(cls) -> str:
return "gptq_marlin"
@classmethod
def get_supported_act_dtypes(cls) -> List[torch.dtype]:
return [torch.half, torch.bfloat16]
@classmethod
def get_min_capability(cls) -> int:
return 80
@classmethod
def get_config_filenames(cls) -> List[str]:
return ["quantize_config.json"]
@classmethod
def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig":
weight_bits = cls.get_from_keys(config, ["bits"])
group_size = cls.get_from_keys(config, ["group_size"])
desc_act = cls.get_from_keys(config, ["desc_act"])
is_sym = cls.get_from_keys(config, ["sym"])
return cls(weight_bits, group_size, desc_act, is_sym)
@classmethod
def override_quantization_method(cls, hf_quant_cfg,
user_quant) -> Optional[str]:
can_convert = cls.is_marlin_compatible(hf_quant_cfg)
is_valid_user_quant = (user_quant is None or user_quant == "marlin")
if can_convert and is_valid_user_quant:
msg = ("The model is convertible to {} during runtime."
" Using {} kernel.".format(cls.get_name(), cls.get_name()))
logger.info(msg)
return cls.get_name()
if can_convert and user_quant == "gptq":
logger.info("Detected that the model can run with gptq_marlin"
", however you specified quantization=gptq explicitly,"
" so forcing gptq. Use quantization=gptq_marlin for"
" faster inference")
return None
def get_quant_method(
self,
layer: torch.nn.Module) -> Optional["GPTQMarlinLinearMethod"]:
if isinstance(layer, LinearBase):
return GPTQMarlinLinearMethod(self)
return None
def get_scaled_act_names(self) -> List[str]:
return []
@classmethod
def is_marlin_compatible(cls, quant_config: Dict[str, Any]):
# Extract data from quant config.
num_bits = quant_config.get("bits", None)
group_size = quant_config.get("group_size", None)
sym = quant_config.get("sym", None)
desc_act = quant_config.get("desc_act", None)
# If we cannot find the info needed in the config, cannot convert.
if (num_bits is None or group_size is None or sym is None
or desc_act is None):
return False
# If the capability of the device is too low, cannot convert.
major, minor = torch.cuda.get_device_capability()
device_capability = major * 10 + minor
if device_capability < cls.get_min_capability():
return False
# Otherwise, can convert if model satisfies marlin constraints.
return (num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS
and group_size in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES
and sym in GPTQ_MARLIN_SUPPORTED_SYM)
class GPTQMarlinState(Enum):
REPACK = enum.auto()
READY = enum.auto()
class GPTQMarlinLinearMethod(LinearMethodBase):
"""Linear method for GPTQ Marlin.
Args:
quant_config: The GPTQ Marlin quantization config.
"""
def __init__(self, quant_config: GPTQMarlinConfig) -> None:
self.quant_config = quant_config
def create_weights(
self,
layer: torch.nn.Module,
input_size_per_partition: int,
output_partition_sizes: List[int],
input_size: int,
output_size: int,
params_dtype: torch.dtype,
**extra_weight_attrs,
) -> None:
del output_size
# Normalize group_size
if self.quant_config.group_size != -1:
group_size = self.quant_config.group_size
else:
group_size = input_size
# Validate dtype
if params_dtype not in [torch.float16, torch.bfloat16]:
raise ValueError(f"The params dtype must be float16 "
f"or bfloat16, but got {params_dtype}")
# Validate output_size_per_partition
output_size_per_partition = sum(output_partition_sizes)
if output_size_per_partition % self.quant_config.min_thread_n != 0:
raise ValueError(
f"Weight output_size_per_partition = "
f"{output_size_per_partition} is not divisible by "
f" min_thread_n = {self.quant_config.min_thread_n}.")
# Validate input_size_per_partition
if input_size_per_partition % self.quant_config.min_thread_k != 0:
raise ValueError(
f"Weight input_size_per_partition = "
f"{input_size_per_partition} is not divisible "
f"by min_thread_k = {self.quant_config.min_thread_k}.")
if (group_size < input_size
and input_size_per_partition % group_size != 0):
raise ValueError(
f"Weight input_size_per_partition = {input_size_per_partition}"
f" is not divisible by group_size = {group_size}.")
# Detect sharding of scales/zp
# By default, no sharding over "input dim"
scales_and_zp_size = input_size // group_size
scales_and_zp_input_dim = None
if self.quant_config.desc_act:
# Act-order case
assert self.quant_config.group_size != -1
is_k_full = input_size_per_partition == input_size
else:
# No act-order case
# K is always full due to full alignment with
# group-size and shard of scales/zp
is_k_full = True
# If this is a row-parallel case, then shard scales/zp
if (input_size != input_size_per_partition
and self.quant_config.group_size != -1):
scales_and_zp_size = input_size_per_partition // group_size
scales_and_zp_input_dim = 0
# Init buffers
# Quantized weights
qweight = Parameter(
torch.empty(
input_size_per_partition // self.quant_config.pack_factor,
output_size_per_partition,
dtype=torch.int32,
),
requires_grad=False,
)
set_weight_attrs(
qweight,
{
**extra_weight_attrs,
"input_dim": 0,
"output_dim": 1,
"packed_dim": 0,
"pack_factor": self.quant_config.pack_factor,
},
)
# Activation order
g_idx = Parameter(
torch.empty(
input_size_per_partition,
dtype=torch.int32,
),
requires_grad=False,
)
# Ignore warning from fused linear layers such as QKVParallelLinear.
set_weight_attrs(
g_idx,
{
**extra_weight_attrs, "input_dim": 0,
"ignore_warning": True
},
)
g_idx_sort_indices = torch.empty(
g_idx.shape,
dtype=torch.int32,
)
# Scales
scales = Parameter(
torch.empty(
scales_and_zp_size,
output_size_per_partition,
dtype=params_dtype,
),
requires_grad=False,
)
set_weight_attrs(
scales,
{
**extra_weight_attrs,
"input_dim": scales_and_zp_input_dim,
"output_dim": 1,
},
)
# Quantized zero-points
qzeros = Parameter(
torch.empty(
scales_and_zp_size,
output_size_per_partition // self.quant_config.pack_factor,
dtype=torch.int32,
device="meta",
),
requires_grad=False,
)
set_weight_attrs(
qzeros,
{
**extra_weight_attrs,
"input_dim": scales_and_zp_input_dim,
"output_dim": 1,
"packed_dim": 1,
"pack_factor": self.quant_config.pack_factor,
},
)
# Allocate marlin workspace
max_workspace_size = (
output_size_per_partition //
self.quant_config.min_thread_n) * self.quant_config.max_parallel
workspace = torch.zeros(max_workspace_size,
dtype=torch.int,
requires_grad=False)
layer.register_parameter("qweight", qweight)
layer.register_parameter("g_idx", g_idx)
layer.register_parameter("scales", scales)
layer.register_parameter("qzeros", qzeros)
layer.g_idx_sort_indices = g_idx_sort_indices
layer.workspace = workspace
layer.input_size_per_partition = input_size_per_partition
layer.output_size_per_partition = output_size_per_partition
layer.input_size = input_size
layer.is_k_full = is_k_full
layer.marlin_state = GPTQMarlinState.REPACK
def apply(
self,
layer: torch.nn.Module,
x: torch.Tensor,
bias: Optional[torch.Tensor] = None,
) -> torch.Tensor:
reshaped_x = x.reshape(-1, x.shape[-1])
size_m = reshaped_x.shape[0]
part_size_n = layer.output_size_per_partition
part_size_k = layer.input_size_per_partition
full_size_k = layer.input_size
out_shape = x.shape[:-1] + (part_size_n, )
if layer.marlin_state == GPTQMarlinState.REPACK:
layer.marlin_state = GPTQMarlinState.READY
# Newly generated tensors need to replace existing tensors that are
# already registered as parameters by vLLM (and won't be freed)
def replace_tensor(name, new_t):
# It is important to use resize_() here since it ensures
# the same buffer is reused
getattr(layer, name).resize_(new_t.shape)
getattr(layer, name).copy_(new_t)
del new_t
cur_device = layer.qweight.device
# Process act_order
if self.quant_config.desc_act:
# Get sorting based on g_idx
g_idx_sort_indices = torch.argsort(layer.g_idx).to(torch.int)
sorted_g_idx = layer.g_idx[g_idx_sort_indices]
replace_tensor("g_idx", sorted_g_idx)
replace_tensor("g_idx_sort_indices", g_idx_sort_indices)
else:
# Reset g_idx related tensors
layer.g_idx = Parameter(
torch.empty(0, dtype=torch.int, device=cur_device),
requires_grad=False,
)
layer.g_idx_sort_indices = Parameter(
torch.empty(0, dtype=torch.int, device=cur_device),
requires_grad=False,
)
# Repack weights
marlin_qweight = ops.gptq_marlin_repack(
layer.qweight,
layer.g_idx_sort_indices,
part_size_k,
part_size_n,
self.quant_config.weight_bits,
)
replace_tensor("qweight", marlin_qweight)
# Permute scales
scales_size_k = part_size_k
scales_size_n = part_size_n
if self.quant_config.desc_act:
scales_size_k = full_size_k
marlin_scales = marlin_permute_scales(
layer.scales,
scales_size_k,
scales_size_n,
self.quant_config.group_size,
self.quant_config.weight_bits,
)
replace_tensor("scales", marlin_scales)
output = ops.gptq_marlin_gemm(
reshaped_x,
layer.qweight,
layer.scales,
layer.g_idx,
layer.g_idx_sort_indices,
layer.workspace,
self.quant_config.weight_bits,
size_m,
part_size_n,
part_size_k,
layer.is_k_full,
)
if bias is not None:
output.add_(bias) # In-place add
return output.reshape(out_shape)

View file

@ -1,140 +0,0 @@
from logging import getLogger
import torch
import torch.nn as nn
logger = getLogger(__name__)
def quantize(x, scale, zero, maxq):
if maxq < 0:
return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero
q = torch.clamp(torch.round(x / scale) + zero, 0, maxq)
return scale * (q - zero)
class Quantizer(nn.Module):
def __init__(self, shape=1):
super(Quantizer, self).__init__()
self.register_buffer("maxq", torch.tensor(0))
self.register_buffer("scale", torch.zeros(shape))
self.register_buffer("zero", torch.zeros(shape))
def configure(
self,
bits,
perchannel=False,
sym=True,
mse=False,
norm=2.4,
grid=100,
maxshrink=0.8,
trits=False,
):
self.maxq = torch.tensor(2**bits - 1)
self.perchannel = perchannel
self.sym = sym
self.mse = mse
self.norm = norm
self.grid = grid
self.maxshrink = maxshrink
if trits:
self.maxq = torch.tensor(-1)
def find_params(self, x, weight=False):
dev = x.device
self.maxq = self.maxq.to(dev)
shape = x.shape
if self.perchannel:
if weight:
x = x.flatten(1)
else:
if len(shape) == 4:
x = x.permute([1, 0, 2, 3])
x = x.flatten(1)
if len(shape) == 3:
x = x.reshape((-1, shape[-1])).t()
if len(shape) == 2:
x = x.t()
else:
x = x.flatten().unsqueeze(0)
tmp = torch.zeros(x.shape[0], device=dev)
xmin = torch.minimum(x.min(1)[0], tmp)
xmax = torch.maximum(x.max(1)[0], tmp)
if self.sym:
xmax = torch.maximum(torch.abs(xmin), xmax)
tmp = xmin < 0
if torch.any(tmp):
xmin[tmp] = -xmax[tmp]
tmp = (xmin == 0) & (xmax == 0)
xmin[tmp] = -1
xmax[tmp] = +1
if self.maxq < 0:
self.scale = xmax
self.zero = xmin
else:
self.scale = (xmax - xmin) / self.maxq
if self.sym:
self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2)
else:
self.zero = torch.round(-xmin / self.scale)
if self.mse:
best = torch.full([x.shape[0]], float("inf"), device=dev)
for i in range(int(self.maxshrink * self.grid)):
p = 1 - i / self.grid
xmin1 = p * xmin
xmax1 = p * xmax
scale1 = (xmax1 - xmin1) / self.maxq
zero1 = torch.round(-xmin1 / scale1) if not self.sym else self.zero
q = quantize(x, scale1.unsqueeze(1), zero1.unsqueeze(1), self.maxq)
q -= x
q.abs_()
q.pow_(self.norm)
err = torch.sum(q, 1)
tmp = err < best
if torch.any(tmp):
best[tmp] = err[tmp]
self.scale[tmp] = scale1[tmp]
self.zero[tmp] = zero1[tmp]
if not self.perchannel:
if weight:
tmp = shape[0]
else:
tmp = shape[1] if len(shape) != 3 else shape[2]
self.scale = self.scale.repeat(tmp)
self.zero = self.zero.repeat(tmp)
if weight:
shape = [-1] + [1] * (len(shape) - 1)
self.scale = self.scale.reshape(shape)
self.zero = self.zero.reshape(shape)
return
if len(shape) == 4:
self.scale = self.scale.reshape((1, -1, 1, 1))
self.zero = self.zero.reshape((1, -1, 1, 1))
if len(shape) == 3:
self.scale = self.scale.reshape((1, 1, -1))
self.zero = self.zero.reshape((1, 1, -1))
if len(shape) == 2:
self.scale = self.scale.unsqueeze(0)
self.zero = self.zero.unsqueeze(0)
def quantize(self, x):
if self.ready():
return quantize(x, self.scale, self.zero, self.maxq)
return x
def enabled(self):
return self.maxq > 0
def ready(self):
return torch.all(self.scale != 0)
__all__ = ["Quantizer"]

View file

@ -1,99 +0,0 @@
import torch
import enum
from enum import Enum
from typing import Any, Dict, List, Optional
from torch.nn.parameter import Parameter
def apply(
self,
layer: torch.nn.Module,
x: torch.Tensor,
bias: Optional[torch.Tensor] = None,
) -> torch.Tensor:
reshaped_x = x.reshape(-1, x.shape[-1])
size_m = reshaped_x.shape[0]
part_size_n = layer.output_size_per_partition
part_size_k = layer.input_size_per_partition
full_size_k = layer.input_size
out_shape = x.shape[:-1] + (part_size_n, )
if layer.marlin_state == GPTQMarlinState.REPACK:
layer.marlin_state = GPTQMarlinState.READY
# Newly generated tensors need to replace existing tensors that are
# already registered as parameters by vLLM (and won't be freed)
def replace_tensor(name, new_t):
# It is important to use resize_() here since it ensures
# the same buffer is reused
getattr(layer, name).resize_(new_t.shape)
getattr(layer, name).copy_(new_t)
del new_t
cur_device = layer.qweight.device
# Process act_order
if self.quant_config.desc_act:
# Get sorting based on g_idx
g_idx_sort_indices = torch.argsort(layer.g_idx).to(torch.int)
sorted_g_idx = layer.g_idx[g_idx_sort_indices]
replace_tensor("g_idx", sorted_g_idx)
replace_tensor("g_idx_sort_indices", g_idx_sort_indices)
else:
# Reset g_idx related tensors
layer.g_idx = Parameter(
torch.empty(0, dtype=torch.int, device=cur_device),
requires_grad=False,
)
layer.g_idx_sort_indices = Parameter(
torch.empty(0, dtype=torch.int, device=cur_device),
requires_grad=False,
)
# Repack weights
marlin_qweight = ops.gptq_marlin_repack(
layer.qweight,
layer.g_idx_sort_indices,
part_size_k,
part_size_n,
self.quant_config.weight_bits,
)
replace_tensor("qweight", marlin_qweight)
# Permute scales
scales_size_k = part_size_k
scales_size_n = part_size_n
if self.quant_config.desc_act:
scales_size_k = full_size_k
marlin_scales = marlin_permute_scales(
layer.scales,
scales_size_k,
scales_size_n,
self.quant_config.group_size,
self.quant_config.weight_bits,
)
replace_tensor("scales", marlin_scales)
output = ops.gptq_marlin_gemm(
reshaped_x,
layer.qweight,
layer.scales,
layer.g_idx,
layer.g_idx_sort_indices,
layer.workspace,
self.quant_config.weight_bits,
size_m,
part_size_n,
part_size_k,
layer.is_k_full,
)
if bias is not None:
output.add_(bias) # In-place add
return output.reshape(out_shape)

View file

@ -220,7 +220,7 @@ def compute_max_diff(output, output_ref):
class MarlinWorkspace:
def __init__(self, out_features, min_thread_n, max_parallel):
def __init__(self, out_features, min_thread_n, max_parallel, device):
assert (out_features % min_thread_n == 0), (
"out_features = {} is undivisible by min_thread_n = {}".format(
out_features, min_thread_n))
@ -229,4 +229,4 @@ class MarlinWorkspace:
self.scratch = torch.zeros(max_workspace_size,
dtype=torch.int,
device="cuda")
device=device)

View file

@ -47,13 +47,13 @@ void Linear::forward_many(int qlen, const void* input, void* output, Backend* ba
int nth = config_.output_size / config_.stride;
backend->do_work_stealing_job(nth, [&](int task_id) {
int ith = task_id;
void* proj_ptr = proj_ + ith * config_.stride * config_.input_size * ggml_type_size(config_.proj_type) / ggml_blck_size(config_.proj_type);
void* proj_ptr = (uint8_t*)proj_ + ith * config_.stride * config_.input_size * ggml_type_size(config_.proj_type) / ggml_blck_size(config_.proj_type);
float* proj_output_ptr = proj_output_ + ith * config_.stride;
llamafile_sgemm(config_.stride, qlen, config_.input_size / ggml_blck_size(config_.proj_type), proj_ptr, config_.input_size / ggml_blck_size(config_.proj_type), proj_input_ptr, config_.input_size / ggml_blck_size(config_.proj_type), proj_output_ptr, config_.output_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.proj_type, ggml_internal_get_type_traits(config_.proj_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) {
for (int i = 0; i < qlen; i++) {
float* output_fp32_ptr = proj_output_ + i * config_.output_size + ith * config_.stride;
void* output_ptr = output + i * config_.output_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type) + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
void* output_ptr = (uint8_t*)output + i * config_.output_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type) + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
}
}
@ -69,5 +69,5 @@ void Linear::forward(int qlen, const void* input, void* output, Backend* backend
}
int forward_len = std::min(qlen, config_.group_max_len);
forward_many(forward_len, input, output, backend);
forward(qlen - forward_len, input + forward_len * config_.input_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), output + forward_len * config_.output_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
forward(qlen - forward_len, (uint8_t*)input + forward_len * config_.input_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + forward_len * config_.output_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
}
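The (uint8_t*) casts added here (and in the MLP and MOE sources below) all fix the same portability issue: pointer arithmetic on void* is a GNU extension, and MSVC, which this commit starts targeting via install.bat and the WIN32 CMake branches, rejects it. Casting to uint8_t* first gives the same byte-wise offsets on every compiler. A minimal illustration with a hypothetical helper:
#include <cstddef>
#include <cstdint>
// Advance an opaque buffer by offset_bytes without relying on the
// GNU void*-arithmetic extension; behaviour is identical on GCC/Clang.
inline void* advance_bytes(void* ptr, std::size_t offset_bytes) {
    return static_cast<uint8_t*>(ptr) + offset_bytes;
}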

View file

@ -74,10 +74,10 @@ void MLP::forward_many(int qlen, const void* input, void* output, Backend* backe
int nth = config_.intermediate_size / config_.stride;
backend->do_work_stealing_job(nth, [&](int task_id) {
int ith = task_id;
void* gate_proj_ptr = gate_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
void* gate_proj_ptr = (uint8_t*)gate_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
float* gate_output_ptr = gate_output_ + ith * config_.stride;
llamafile_sgemm(config_.stride, qlen, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
void* up_proj_ptr = up_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
void* up_proj_ptr = (uint8_t*)up_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
float* up_output_ptr = up_output_ + ith * config_.stride;
llamafile_sgemm(config_.stride, qlen, config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
for (int i = 0; i < qlen; i++) {
@ -86,7 +86,7 @@ void MLP::forward_many(int qlen, const void* input, void* output, Backend* backe
}
if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) == 0) {
float* intermediate_fp32_ptr = intermediate_fp32_ + i * config_.intermediate_size + ith * config_.stride;
void* down_input_ptr = down_input_ + i * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) + ith * config_.stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
void* down_input_ptr = (uint8_t*)down_input_ + i * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) + ith * config_.stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
from_float(intermediate_fp32_ptr, down_input_ptr, config_.stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
}
}
@ -97,13 +97,13 @@ void MLP::forward_many(int qlen, const void* input, void* output, Backend* backe
nth = config_.hidden_size / config_.stride;
backend->do_work_stealing_job(nth, [&](int task_id) {
int ith = task_id;
void* down_proj_ptr = down_proj_ + ith * config_.stride * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
void* down_proj_ptr = (uint8_t*)down_proj_ + ith * config_.stride * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
float* down_output_ptr = down_output_ + ith * config_.stride;
llamafile_sgemm(config_.stride, qlen, config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_, config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.hidden_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) {
for (int i = 0; i < qlen; i++) {
float* output_fp32_ptr = down_output_ + i * config_.hidden_size + ith * config_.stride;
void* output_ptr = output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type) + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
void* output_ptr = (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type) + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
}
}
@ -119,5 +119,5 @@ void MLP::forward(int qlen, const void* input, void* output, Backend* backend) {
}
int forward_len = std::min(qlen, config_.group_max_len);
forward_many(forward_len, input, output, backend);
forward(qlen - forward_len, input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
forward(qlen - forward_len, (uint8_t*)input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
}

View file

@ -9,7 +9,7 @@
**/
#include "moe.h"
#include <iostream>
#include "unistd.h"
#include <cstdint>
MOE::MOE(MOEConfig config) {
config_ = config;
@ -60,7 +60,7 @@ MOE::MOE(MOEConfig config) {
m_local_pos_.resize(config_.group_max_len);
for (int i = 0; i < config_.group_max_len; i++) {
m_local_pos_[i].reserve(config_.expert_num);
m_local_pos_[i].resize(config_.routed_expert_num);
}
m_local_num_.resize(config_.expert_num);
m_local_gate_input_ptr_.resize(config_.expert_num);
@ -125,10 +125,10 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
int expert_idx = task_id / nth;
uint64_t expert_id = expert_ids[expert_idx];
int ith = task_id % nth;
void* gate_proj_ptr = gate_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
void* gate_proj_ptr = (uint8_t*)gate_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
float* gate_output_ptr = s_gate_output_[expert_idx] + ith * config_.stride;
llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
void* up_proj_ptr = up_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
void* up_proj_ptr = (uint8_t*)up_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
float* up_output_ptr = s_up_output_[expert_idx] + ith * config_.stride;
llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
@ -153,7 +153,7 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
}
for (int expert_idx = 0; expert_idx < k; expert_idx++) {
uint64_t expert_id = expert_ids[expert_idx];
void* down_proj_ptr = down_proj_ + (expert_id * config_.hidden_size + ith * config_.stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
void* down_proj_ptr = (uint8_t*)down_proj_ + (expert_id * config_.hidden_size + ith * config_.stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
float* down_output_ptr = s_down_output_[expert_idx] + ith * config_.stride;
llamafile_sgemm(config_.stride, 1, config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), s_down_input_[expert_idx], config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
@ -162,7 +162,7 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
}
if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) {
float* output_fp32_ptr = s_output_fp32_ + ith * config_.stride;
void* output_ptr = output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
void* output_ptr = (uint8_t*)output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
}
});
@ -195,9 +195,9 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
const void* gate_input_ptr;
const void* up_input_ptr;
if (config_.hidden_type == ggml_internal_get_type_traits(config_.gate_type).vec_dot_type && config_.hidden_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
gate_input_ptr = up_input_ptr = input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
gate_input_ptr = up_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
} else {
to_float(input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), m_input_fp32_[i], config_.hidden_size, config_.hidden_type);
to_float((uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), m_input_fp32_[i], config_.hidden_size, config_.hidden_type);
if (ggml_internal_get_type_traits(config_.gate_type).vec_dot_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
from_float(m_input_fp32_[i], m_gate_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
gate_input_ptr = up_input_ptr = m_gate_input_[i];
@ -206,13 +206,13 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
from_float(m_input_fp32_[i], m_gate_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
gate_input_ptr = m_gate_input_[i];
} else {
gate_input_ptr = input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
gate_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
}
if (config_.hidden_type != ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
from_float(m_input_fp32_[i], m_up_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
up_input_ptr = m_up_input_[i];
} else {
up_input_ptr = input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
up_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
}
}
}
@ -227,11 +227,11 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
int expert_idx = task_id / nth;
int ith = task_id % nth;
void* gate_input_ptr = m_local_gate_input_ptr_[expert_idx];
void* gate_proj_ptr = gate_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
void* gate_proj_ptr = (uint8_t*)gate_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
float* gate_output_ptr = m_local_gate_output_ptr_[expert_idx] + ith * stride;
llamafile_sgemm(stride, m_local_num_[expert_idx], config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
void* up_input_ptr = m_local_up_input_ptr_[expert_idx];
void* up_proj_ptr = up_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
void* up_proj_ptr = (uint8_t*)up_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
float* up_output_ptr = m_local_up_output_ptr_[expert_idx] + ith * stride;
llamafile_sgemm(stride, m_local_num_[expert_idx], config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
for (int i = 0; i < m_local_num_[expert_idx]; i++) {
@ -249,7 +249,7 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
int expert_idx = task_id / nth;
int ith = task_id % nth;
void* down_input_ptr = m_local_down_input_ptr_[expert_idx];
void* down_proj_ptr = down_proj_ + (expert_idx * config_.hidden_size + ith * stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
void* down_proj_ptr = (uint8_t*)down_proj_ + (expert_idx * config_.hidden_size + ith * stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
float* down_output_ptr = m_local_down_output_ptr_[expert_idx] + ith * stride;
llamafile_sgemm(stride, m_local_num_[expert_idx], config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.hidden_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
});
@ -262,18 +262,18 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
m_output_fp32_[i][e] += m_local_down_output_ptr_[expert_ids[i * k + j]][m_local_pos_[i][j] * config_.hidden_size + e] * weights[i * k + j];
}
}
from_float(m_output_fp32_[i], output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), config_.hidden_size, config_.hidden_type);
from_float(m_output_fp32_[i], (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), config_.hidden_size, config_.hidden_type);
});
}
void MOE::forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend) {
if (qlen < config_.group_min_len) {
for (int i = 0; i < qlen; i++) {
forward_one(k, expert_ids + i * k, weights + i * k, input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
forward_one(k, expert_ids + i * k, weights + i * k, (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
}
return;
}
int forward_len = std::min(config_.group_max_len, qlen);
forward_many(forward_len, k, expert_ids, weights, input, output, backend);
forward(qlen - forward_len, k, expert_ids + forward_len * k, weights + forward_len * k, input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
forward(qlen - forward_len, k, expert_ids + forward_len * k, weights + forward_len * k, (uint8_t*)input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
}

View file

@ -49,7 +49,7 @@ void SharedMemBuffer::dealloc(void* object) {
void SharedMemBuffer::arrange(std::vector<std::pair<void**, uint64_t>> requests) {
uint64_t offset = 0;
for (auto& request : requests) {
*(request.first) = buffer_ + offset;
*(request.first) = (uint8_t*)buffer_ + offset;
offset += request.second;
}
}

11
ktransformers/local_chat.py Normal file → Executable file
View file

@ -31,18 +31,21 @@ import fire
from ktransformers.optimize.optimize import optimize_and_load_gguf
from ktransformers.models.modeling_deepseek import DeepseekV2ForCausalLM
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeForCausalLM
from ktransformers.models.modeling_mixtral import MixtralForCausalLM
from ktransformers.util.utils import prefill_and_generate
from ktransformers.server.config.config import Config
custom_models = {
"DeepseekV2ForCausalLM": DeepseekV2ForCausalLM,
"Qwen2MoeForCausalLM": Qwen2MoeForCausalLM,
"MixtralForCausalLM": MixtralForCausalLM,
}
ktransformer_rules_dir = os.path.dirname(os.path.abspath(__file__)) + "/optimize/optimize_rules/"
default_optimize_rules ={
"DeepseekV2ForCausalLM": ktransformer_rules_dir + "DeepSeek-V2-Chat.yaml",
"Qwen2MoeForCausalLM": ktransformer_rules_dir + "Qwen2-57B-A14B-Instruct.yaml",
"MixtralForCausalLM": ktransformer_rules_dir + "Mixtral.yaml",
}
def local_chat(
@ -50,7 +53,8 @@ def local_chat(
optimize_rule_path: str = None,
gguf_path: str = None,
max_new_tokens: int = 1000,
cpu_infer: int = Config().cpu_infer
cpu_infer: int = Config().cpu_infer,
use_cuda_graph: bool = True,
):
torch.set_grad_enabled(False)
@ -64,6 +68,8 @@ def local_chat(
print("using custom modeling_xxx.py.")
if "Qwen2Moe" in config.architectures[0]: # Qwen2Moe must use flash_attention_2 to avoid overflow.
config._attn_implementation = "flash_attention_2"
if "Mixtral" in config.architectures[0]:
config._attn_implementation = "flash_attention_2"
model = custom_models[config.architectures[0]](config)
else:
model = AutoModelForCausalLM.from_config(
@ -100,7 +106,6 @@ def local_chat(
while True:
content = input("Chat: ")
# if content is num
if content == "":
content = "Please write a piece of quicksort code in C++."
@ -109,7 +114,7 @@ def local_chat(
messages, add_generation_prompt=True, return_tensors="pt"
)
torch.set_default_dtype(torch.bfloat16) # TODO: Remove this, replace dtype using config
generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens)
generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph)
if __name__ == "__main__":
fire.Fire(local_chat)
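
The new `use_cuda_graph` flag is simply threaded through to `prefill_and_generate`, so it can be toggled from the CLI (via fire) or from Python. A minimal usage sketch, assuming a `model_path` keyword for the checkpoint directory (not visible in this hunk) and placeholder paths:

# Hedged usage sketch for the updated entry point; the "model_path" keyword and
# all paths are assumptions, the remaining keywords appear in the diff above.
from ktransformers.local_chat import local_chat

local_chat(
    model_path="/path/to/DeepSeek-V2-Chat",       # assumed keyword, placeholder path
    gguf_path="/path/to/DeepSeek-V2-Chat-GGUF",   # placeholder path
    max_new_tokens=1000,
    cpu_infer=32,                                 # example value; defaults to Config().cpu_infer
    use_cuda_graph=False,                         # new flag: disable CUDA-graph decoding
)
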

View file

@ -22,13 +22,14 @@ class StaticCache(transformers.StaticCache):
The maximum batch size with which the model will be used.
max_cache_len (`int`):
The maximum sequence length with which the model will be used.
device (`torch.device`):
device (`torch.device` or `dict`):
The device on which the cache should be initialized. Should be the same as the layer.
If a `dict`, it maps per-layer keys (e.g. `blk.{idx}.self_attn`) to an entry whose `generate_device` gives the device for that layer.
dtype (*optional*, defaults to `torch.float32`):
The default `dtype` to use when initializing the layer.
"""
def __init__(self, config: PretrainedConfig, max_batch_size: int, max_cache_len: int, device, dtype=None) -> None:
def __init__(self, config: PretrainedConfig, max_batch_size: int, max_cache_len: int, device: torch.device| dict, dtype=None) -> None:
Cache.__init__(self)
self.max_batch_size = max_batch_size
self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len
@ -46,6 +47,7 @@ class StaticCache(transformers.StaticCache):
self.value_cache: List[torch.Tensor] = []
cache_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim)
if config.architectures[0] == "DeepseekV2ForCausalLM":
# TODO: for deepseek, cache_shape differs depending on whether Absorbed MLA is used; check it automatically
# key_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.qk_rope_head_dim + config.qk_nope_head_dim)
# value_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.v_head_dim)
key_shape = (max_batch_size, 1, self.max_cache_len, config.qk_rope_head_dim)
@ -56,11 +58,15 @@ class StaticCache(transformers.StaticCache):
self.past_tokens = []
self.num_hidden_layers = config.num_hidden_layers
for _ in range(self.num_hidden_layers):
for idx in range(self.num_hidden_layers):
# Note: `mark_static_address` is used to tag the cache as a fixed data pointer, preventing cuda graph
# breaks when updating the cache.
new_layer_key_cache = torch.zeros(key_shape, dtype=self.dtype, device=device)
new_layer_value_cache = torch.zeros(value_shape, dtype=self.dtype, device=device)
if isinstance(device, dict):
target_device = device[f"blk.{idx}.self_attn"]["generate_device"]
else:
target_device = device
new_layer_key_cache = torch.zeros(key_shape, dtype=self.dtype, device=target_device)
new_layer_value_cache = torch.zeros(value_shape, dtype=self.dtype, device=target_device)
torch._dynamo.mark_static_address(new_layer_key_cache)
torch._dynamo.mark_static_address(new_layer_value_cache)
self.key_cache.append(new_layer_key_cache)
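
Since the cache now accepts either a single device or a mapping, a per-layer map can mirror the multi-GPU rule files further down. A minimal sketch, assuming the `blk.{idx}.self_attn` key pattern from the lookup above and DeepSeek-V2's 60 decoder layers:

# Illustrative per-layer device map for the patched StaticCache; the key pattern
# comes from the lookup above and the 30-layer split point is an arbitrary example.
device_map = {
    f"blk.{idx}.self_attn": {"generate_device": "cuda:0" if idx < 30 else "cuda:1"}
    for idx in range(60)
}
# cache = StaticCache(config, max_batch_size=1, max_cache_len=4096,
#                     device=device_map, dtype=torch.float16)
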

View file

@ -1048,7 +1048,7 @@ class DeepseekV2FlashAttention2(DeepseekV2Attention):
"""
Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
first unpad the input, then computes the attention scores and pad the final attention scores.
Args:
# Args:
query_states (`torch.Tensor`):
Input query states to be passed to Flash Attention API
key_states (`torch.Tensor`):
@ -1245,12 +1245,14 @@ class DeepseekV2DecoderLayer(nn.Module):
cache_position=cache_position,
**kwargs,
)
hidden_states = residual + hidden_states
# Fully Connected
residual = hidden_states
hidden_states = self.post_attention_layernorm(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
outputs = (hidden_states,)

File diff suppressed because it is too large Load diff

View file

@ -10,6 +10,7 @@ from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.custom_gguf import GGUFLoader
from ktransformers.util.utils import InferenceState
from transformers.configuration_utils import PretrainedConfig
# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->Qwen2Moe
class RotaryEmbedding(BaseInjectedModule, DeepseekV2RotaryEmbedding):
def __init__(self,
@ -17,12 +18,16 @@ class RotaryEmbedding(BaseInjectedModule, DeepseekV2RotaryEmbedding):
gguf_loader : GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
# device: str = "cuda",
generate_device: str = "cuda",
prefill_device: str = "cuda",
**kwargs):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
self.orig_module.__init__(orig_module.dim,
orig_module.max_position_embeddings,
orig_module.base)
self.generate_device = generate_device
self.prefill_device = prefill_device
def load(self):
self.orig_module.__init__(self.orig_module.dim,
@ -36,9 +41,11 @@ class YarnRotaryEmbedding(BaseInjectedModule, DeepseekV2YarnRotaryEmbedding):
gguf_loader : GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
# device: str = "cuda",
generate_device: str = "cuda",
prefill_device: str = "cuda",
**kwargs):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
self.orig_module.__init__(orig_module.dim,
orig_module.max_position_embeddings,
orig_module.base,
@ -49,13 +56,15 @@ class YarnRotaryEmbedding(BaseInjectedModule, DeepseekV2YarnRotaryEmbedding):
orig_module.beta_slow,
orig_module.mscale,
orig_module.mscale_all_dim)
self.generate_device = generate_device
self.prefill_device = prefill_device
def load(self):
self.orig_module.__init__(self.orig_module.dim,
self.orig_module.max_position_embeddings,
self.orig_module.base,
self.device,
self.generate_device,
self.orig_module.scaling_factor,
self.orig_module.original_max_position_embeddings,
self.orig_module.beta_fast,

View file

@ -5,8 +5,8 @@ Description :
Author : Azure-Tang, Boxin Zhang, chenht2022
Date : 2024-07-25 11:25:24
Version : 0.1.0
LastEditors : Azure
LastEditTime : 2024-07-26 09:27:41
LastEditors : kkk1nak0
LastEditTime : 2024-08-11 12:14:39
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
@ -19,7 +19,9 @@ import torch
import sys, os
from ktransformers.operators.base_operator import BaseInjectedModule
sys.path.append(os.path.dirname(__file__) + "/../ktransformers_ext/build")
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build"))
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build", "Release"))
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build", "Debug"))
import cpuinfer_ext
from cpuinfer_ext.moe import MOEConfig, MOE
import ctypes
@ -78,6 +80,25 @@ class MLPExpertsBase(ABC):
gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate_exps.weight"]["ggml_type"]
up_type = self.gguf_loader.tensor_info[key + ".ffn_up_exps.weight"]["ggml_type"]
down_type = self.gguf_loader.tensor_info[key + ".ffn_down_exps.weight"]["ggml_type"]
elif key + ".ffn_down.0.weight" in self.gguf_loader.tensor_info:
# for supporting Mixtral-8x7B-Instruct
gate = []
up = []
down = []
for i in range(8):
gatei, upi, downi = f".ffn_gate.{i}.weight", f".ffn_up.{i}.weight", f".ffn_down.{i}.weight"
targets = [gatei, upi, downi]
tensors = self.load_multi(key, targets, device=device)
gate_it, up_it, down_it = tensors[gatei], tensors[upi], tensors[downi]
gate.append(gate_it)
up.append(up_it)
down.append(down_it)
gate = torch.stack(gate)
up = torch.stack(up)
down = torch.stack(down)
gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate.0.weight"]["ggml_type"]
up_type = self.gguf_loader.tensor_info[key + ".ffn_up.0.weight"]["ggml_type"]
down_type = self.gguf_loader.tensor_info[key + ".ffn_down.0.weight"]["ggml_type"]
else:
raise ValueError(f"Experts {key} not found in gguf_loader")
res = {key:{"gate": gate, "up": up, "down": down, "gate_type": gate_type, "up_type": up_type, "down_type": down_type}}
@ -94,7 +115,8 @@ class MLPCPUExperts(MLPExpertsBase):
expert_ids_cpu:Tensor = None
weights_cpu:Tensor = None
output_cpu:Tensor = None
output_gpu:Tensor = None
output_gpu_map:dict = {} # Manage output tensor buffers on different GPUs
#stream_map:dict = {} # Manage CUDA streams on different GPUs
CPU_INFER = cpuinfer_ext.CPUInfer(Config().cpu_infer)
def __init__(
self,
@ -113,6 +135,7 @@ class MLPCPUExperts(MLPExpertsBase):
self.out_device = out_device
def load(self, w: dict | nn.Parameter | tuple | None = None, device:str|None = None, warmup:bool = False):
with torch.device(self.out_device):
if device:
assert device.lower() == "cpu", "MLPCPUExperts can only be loaded on CPU, Parameter \"device\" can be cpu or None."
if w is None: w = self.load_weights()[self.key]
@ -157,37 +180,38 @@ class MLPCPUExperts(MLPExpertsBase):
if warmup:
self.cpu_infer.submit(self.moe.warm_up())
self.cpu_infer.sync()
if MLPCPUExperts.output_gpu == None:
MLPCPUExperts.input_tensor_cpu = torch.empty((self.config.hidden_size), device="cpu", pin_memory=True)
MLPCPUExperts.expert_ids_cpu = torch.empty((num_experts_per_tok), device="cpu", dtype=torch.long, pin_memory=True)
MLPCPUExperts.weights_cpu = torch.empty((num_experts_per_tok), device="cpu", dtype=torch.float32, pin_memory=True)
MLPCPUExperts.output_cpu = torch.empty((self.config.hidden_size), device="cpu", pin_memory=True)
MLPCPUExperts.output_gpu = torch.empty((self.config.hidden_size), device=self.out_device)
if self.out_device not in MLPCPUExperts.output_gpu_map:
MLPCPUExperts.output_gpu_map[self.out_device] = torch.zeros((self.config.hidden_size), device=self.out_device)
if MLPCPUExperts.input_tensor_cpu == None:
MLPCPUExperts.input_tensor_cpu = torch.zeros((self.config.hidden_size), device="cpu", pin_memory=True)
MLPCPUExperts.expert_ids_cpu = torch.zeros((num_experts_per_tok), device="cpu", dtype=torch.long, pin_memory=True)
MLPCPUExperts.weights_cpu = torch.zeros((num_experts_per_tok), device="cpu", dtype=torch.float32, pin_memory=True)
MLPCPUExperts.output_cpu = torch.zeros((self.config.hidden_size), device="cpu", pin_memory=True, dtype=torch.bfloat16)
def submit_for_one_decode(self, input_tensor, expert_ids, weights):
MLPCPUExperts.input_tensor_cpu.copy_(input_tensor, non_blocking=True)
MLPCPUExperts.expert_ids_cpu.copy_(expert_ids, non_blocking=True)
MLPCPUExperts.weights_cpu.copy_(weights, non_blocking=True)
self.cpu_infer.submit_with_cuda_stream(torch.cuda.current_stream().cuda_stream, self.moe.forward(1, expert_ids.size(0), MLPCPUExperts.expert_ids_cpu.data_ptr(), MLPCPUExperts.weights_cpu.data_ptr(), MLPCPUExperts.input_tensor_cpu.data_ptr(), MLPCPUExperts.output_cpu.data_ptr()))
self.cpu_infer.submit_with_cuda_stream(torch.cuda.current_stream(self.out_device).cuda_stream, self.moe.forward(1, expert_ids.size(0), MLPCPUExperts.expert_ids_cpu.data_ptr(), MLPCPUExperts.weights_cpu.data_ptr(), MLPCPUExperts.input_tensor_cpu.data_ptr(), MLPCPUExperts.output_cpu.data_ptr()))
def sync_for_one_decode(self):
self.cpu_infer.sync_with_cuda_stream(torch.cuda.current_stream().cuda_stream)
MLPCPUExperts.output_gpu.copy_(MLPCPUExperts.output_cpu, non_blocking=True)
#print("capturing experts finish")
return MLPCPUExperts.output_gpu
self.cpu_infer.sync_with_cuda_stream(torch.cuda.current_stream(self.out_device).cuda_stream)
MLPCPUExperts.output_gpu_map[self.out_device].copy_(MLPCPUExperts.output_cpu, non_blocking=True)
return MLPCPUExperts.output_gpu_map[self.out_device]
def forward(self, input_tensor, expert_ids, weights):
# generate, capture and run cuda graph
# print(expert_ids)
if input_tensor.size(0)==1:
# TODO: this branch is unreachable, but the shapes of input_tensor ([1, hidden_size]) and input_tensor_cpu ([hidden_size]) are not compatible
#print("capturing experts")
MLPCPUExperts.input_tensor_cpu.copy_(input_tensor, non_blocking=True)
MLPCPUExperts.expert_ids_cpu.copy_(expert_ids, non_blocking=True)
MLPCPUExperts.weights_cpu.copy_(weights, non_blocking=True)
self.cpu_infer.submit_with_cuda_stream(torch.cuda.current_stream().cuda_stream, self.moe.forward(1, expert_ids.size(1), MLPCPUExperts.expert_ids_cpu.data_ptr(), MLPCPUExperts.weights_cpu.data_ptr(), MLPCPUExperts.input_tensor_cpu.data_ptr(), MLPCPUExperts.output_cpu.data_ptr()))
self.cpu_infer.sync_with_cuda_stream(torch.cuda.current_stream().cuda_stream)
MLPCPUExperts.output_gpu.copy_(MLPCPUExperts.output_cpu, non_blocking=True)
#print("capturing experts finish")
return MLPCPUExperts.output_gpu
MLPCPUExperts.output_gpu_map[self.out_device].copy_(MLPCPUExperts.output_cpu, non_blocking=True)
return MLPCPUExperts.output_gpu_map[self.out_device]
else:
input_tensor = input_tensor.contiguous().cpu()
expert_ids = expert_ids.contiguous().cpu()
@ -195,7 +219,7 @@ class MLPCPUExperts(MLPExpertsBase):
output = torch.empty_like(input_tensor).contiguous()
self.cpu_infer.submit(self.moe.forward(expert_ids.size(0), expert_ids.size(1), expert_ids.data_ptr(), weights.data_ptr(), input_tensor.data_ptr(), output.data_ptr()))
self.cpu_infer.sync()
return output.to(device=object.__getattribute__(self, "device"))
return output.to(device=object.__getattribute__(self, "out_device"))
def unload(self):
return
@ -222,6 +246,24 @@ class MLPCPUExperts(MLPExpertsBase):
gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate_exps.weight"]["ggml_type"]
up_type = self.gguf_loader.tensor_info[key + ".ffn_up_exps.weight"]["ggml_type"]
down_type = self.gguf_loader.tensor_info[key + ".ffn_down_exps.weight"]["ggml_type"]
elif key + ".ffn_down.0.weight" in self.gguf_loader.tensor_info:
# for supporting Mixtral-8x7B-Instruct
gate = []
up = []
down = []
for i in range(8):
gate_it = self.gguf_loader.get_mmap_tensor(f"{key}.ffn_gate.{i}.weight")
up_it = self.gguf_loader.get_mmap_tensor(f"{key}.ffn_up.{i}.weight")
down_it = self.gguf_loader.get_mmap_tensor(f"{key}.ffn_down.{i}.weight")
gate.append(gate_it)
up.append(up_it)
down.append(down_it)
gate = np.stack(gate)
up = np.stack(up)
down = np.stack(down)
gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate.0.weight"]["ggml_type"]
up_type = self.gguf_loader.tensor_info[key + ".ffn_up.0.weight"]["ggml_type"]
down_type = self.gguf_loader.tensor_info[key + ".ffn_down.0.weight"]["ggml_type"]
else:
raise ValueError(f"Experts {key} not found in gguf_loader")
res = {key:{"gate": gate, "up": up, "down": down, "gate_type": gate_type, "up_type": up_type, "down_type": down_type}}
@ -360,6 +402,11 @@ class MLPExpertsTorch(MLPExpertsBase):
def forward(self, hidden_states_cpu: torch.Tensor, selected_experts_cpu: torch.Tensor, routing_weights_cpu: torch.Tensor) -> torch.Tensor:
org_device = hidden_states_cpu.device
hidden_states_cpu = hidden_states_cpu.to(self.device)
selected_experts_cpu = selected_experts_cpu.to(self.device)
routing_weights_cpu = routing_weights_cpu.to(self.device)
batch_sequence_length, hidden_dim = hidden_states_cpu.size()
final_hidden_states = torch.zeros(
@ -388,27 +435,29 @@ class MLPExpertsTorch(MLPExpertsBase):
# the `top_x` tensor here.
final_hidden_states.index_add_(0, top_x, current_hidden_states)
return final_hidden_states.to(org_dtype)
return final_hidden_states.to(dtype=org_dtype, device=org_device)
EXPERTS_MAP = {
"MLPCPUExperts": MLPCPUExperts,
"MLPExpertsTorch": MLPExpertsTorch,
"MLPExpertsMarlin": MLPExpertsMarlin,
}
class KTransformersMLPExpert(BaseInjectedModule, MLPExpertsBase):
def __init__(self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
# device: str = "cuda",
prefill_device:str = "cuda",
prefill_mlp_type: str | None = "MLPExpertsTorch",
generate_device: str = "cpu",
generate_mlp_type: str | None = "MLPCPUExperts",
**kwargs):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
MLPExpertsBase.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
MLPExpertsBase.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
if generate_mlp_type is not None:
self.generate_experts = EXPERTS_MAP[generate_mlp_type](key, gguf_loader, config, len(orig_module), device=generate_device, **kwargs)
else:
@ -471,6 +520,7 @@ class KTransformersMLPExpert(BaseInjectedModule, MLPExpertsBase):
from ktransformers.models.modeling_deepseek import DeepseekV2MoE
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock
from ktransformers.models.modeling_mixtral import MixtralSparseMoeBlock
class Qwen2MoeSparseMoeBlockInjected(BaseInjectedModule, Qwen2MoeSparseMoeBlock):
@ -578,7 +628,6 @@ class Qwen2MoeSparseMoeBlockInjected(BaseInjectedModule, Qwen2MoeSparseMoeBlock)
return final_hidden_states
class DeepseekV2MoEInjected(BaseInjectedModule, DeepseekV2MoE):
def forward(self, hidden_states):
identity = hidden_states
@ -587,7 +636,7 @@ class DeepseekV2MoEInjected(BaseInjectedModule, DeepseekV2MoE):
topk_idx, topk_weight, aux_loss = self.gate(hidden_states)
hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
if sequence_length == 1:
if sequence_length == 1 and hasattr(self.experts.generate_experts, "submit_for_one_decode"):
self.experts.generate_experts.submit_for_one_decode(hidden_states[0], topk_idx[0], topk_weight[0])
if self.config.n_shared_experts is not None:
y_ = self.shared_experts(identity).squeeze(0)
@ -677,3 +726,102 @@ class DeepseekV2MoEInjected(BaseInjectedModule, DeepseekV2MoE):
.type(new_x.dtype)
)
return final_out
class MisrtalSparseMoEBlockInjected(BaseInjectedModule, MixtralSparseMoeBlock):
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
""" """
orig_shape = hidden_states.shape
batch_size, sequence_length, hidden_dim = hidden_states.shape
if self.training and self.jitter_noise > 0:
hidden_states *= torch.empty_like(hidden_states).uniform_(1.0 - self.jitter_noise, 1.0 + self.jitter_noise)
hidden_states = hidden_states.view(-1, hidden_dim)
# router_logits: (batch * sequence_length, n_experts)
router_logits = self.gate(hidden_states)
routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
# we cast back to the input dtype
routing_weights = routing_weights.to(hidden_states.dtype)
if sequence_length == 1 and hasattr(self.experts.generate_experts, "submit_for_one_decode"):
self.experts.generate_experts.submit_for_one_decode(hidden_states[0], selected_experts[0], routing_weights[0])
y = self.experts.generate_experts.sync_for_one_decode().unsqueeze(0)
y.resize_(*orig_shape)
return y, router_logits
hidden_states_expert = hidden_states.to(self.experts.device) if isinstance(self.experts, MLPExpertsBase) else hidden_states.cpu()
selected_experts_expert = selected_experts.to(self.experts.device) if isinstance(self.experts, MLPExpertsBase) else selected_experts.cpu()
routing_weights_expert = routing_weights.to(self.experts.device) if isinstance(self.experts, MLPExpertsBase) else routing_weights.cpu()
if isinstance(self.experts, MLPExpertsBase):
y = (
self.moe_on_cpuinfer(
hidden_states_expert, selected_experts_expert, routing_weights_expert
)
.view(*orig_shape)
.to(device=hidden_states.device)
)
elif hidden_states_expert.size(0) > 10:
y = self.moe_infer(
hidden_states_expert, selected_experts_expert, routing_weights_expert, orig_shape
).to(device=hidden_states.device)
else:
y = self.moe_infer_simple(
hidden_states_expert, selected_experts_expert, routing_weights_expert
).to(device=hidden_states.device)
y.resize_(*orig_shape)
return y, router_logits
@torch.no_grad()
def moe_on_cpuinfer(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor:
outs = torch.empty_like(x)
outs = self.experts(x, topk_ids, topk_weight)
return outs
@torch.no_grad()
# TODO may bugs here
def moe_infer_simple(self, hidden_states_cpu: torch.Tensor, selected_experts_cpu: torch.Tensor, routing_weights_cpu: torch.Tensor) -> torch.Tensor:
'''
hidden_states_cpu: [num_tokens, hidden_size]
topk_ids, topk_weight: [num_tokens, num_selected_experts]
'''
outs = torch.zeros_like(hidden_states_cpu)
for token_idx in range(selected_experts_cpu.size(0)):
for expert_idx in range(selected_experts_cpu.size(1)):
expert = self.experts[selected_experts_cpu[token_idx, expert_idx]]
outs[token_idx] += expert.forward(hidden_states_cpu[token_idx]) * routing_weights_cpu[token_idx, expert_idx]
return outs
@torch.no_grad()
# TODO may bugs here
def moe_infer(self, hidden_states_cpu: torch.Tensor, selected_experts_cpu: torch.Tensor, routing_weights_cpu: torch.Tensor, orig_shape: tuple) -> torch.Tensor:
batch_size, sequence_length, hidden_dim = orig_shape
final_hidden_states = torch.zeros(
(batch_size * sequence_length, hidden_dim), dtype=hidden_states_cpu.dtype, device=hidden_states_cpu.device
)
# One hot encode the selected experts to create an expert mask
# this will be used to easily index which expert is going to be solicited
expert_mask = torch.nn.functional.one_hot(selected_experts_cpu, num_classes=self.num_experts).permute(2, 1, 0)
# Loop over all available experts in the model and perform the computation on each expert
for expert_idx in range(self.num_experts):
expert_layer = self.experts[expert_idx]
idx, top_x = torch.where(expert_mask[expert_idx])
# Index the correct hidden states and compute the expert hidden state for
# the current expert. We need to make sure to multiply the output hidden
# states by `routing_weights` on the corresponding tokens (top-1 and top-2)
current_state = hidden_states_cpu[None, top_x].reshape(-1, hidden_dim)
current_hidden_states = expert_layer.forward(current_state) * routing_weights_cpu[top_x, idx, None]
# However `index_add_` only support torch tensors for indexing so we'll use
# the `top_x` tensor here.
final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states_cpu.dtype))
return final_hidden_states
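
The class-level `output_gpu_map` replaces the old single `output_gpu` buffer so that expert modules with different `out_device` values (multi-GPU layouts) each reuse their own staging tensor. A condensed, self-contained sketch of the pattern, with an example hidden size and a CPU device so it runs without a GPU:

# Sketch of the per-device output-buffer pattern used by MLPCPUExperts above;
# the hidden size and device name are illustrative only.
import torch

output_gpu_map: dict = {}   # one reusable staging buffer per out_device

def get_output_buffer(hidden_size: int, out_device: str) -> torch.Tensor:
    # Lazily allocate the tensor the CPU expert results are copied into.
    if out_device not in output_gpu_map:
        output_gpu_map[out_device] = torch.zeros(hidden_size, device=out_device)
    return output_gpu_map[out_device]

buf = get_output_buffer(4096, "cpu")   # "cpu" keeps the sketch runnable anywhere
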

View file

@ -6,7 +6,7 @@ Author : Azure-Tang
Date : 2024-07-25 11:25:24
Version : 1.0.0
LastEditors : Azure
LastEditTime : 2024-07-26 09:27:48
LastEditTime : 2024-08-08 10:09:14
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
@ -45,6 +45,8 @@ from ktransformers.models.modeling_deepseek import BaseModelOutputWithPast, Deep
from transformers.models.qwen2_moe.configuration_qwen2_moe import Qwen2MoeConfig
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.utils import InferenceState
from ktransformers.util.custom_gguf import GGUFLoader
from transformers.configuration_utils import PretrainedConfig
if is_flash_attn_2_available():
from flash_attn import flash_attn_func, flash_attn_varlen_func
@ -73,34 +75,6 @@ QWEN2MOE_START_DOCSTRING = r"""
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
"The bare Qwen2MoE Model outputting raw hidden-states without any specific head on top.",
QWEN2MOE_START_DOCSTRING,
)
class Qwen2MoePreTrainedModel(PreTrainedModel):
config_class = Qwen2MoeConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["Qwen2MoeDecoderLayer"]
_skip_keys_device_placement = "past_key_values"
_supports_flash_attn_2 = True
_supports_sdpa = True
_supports_cache_class = True
_supports_static_cache = True
def _init_weights(self, module):
std = self.config.initializer_range
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
QWEN2MOE_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@ -177,13 +151,11 @@ QWEN2MOE_INPUTS_DOCSTRING = r"""
the complete sequence length.
"""
from ktransformers.util.custom_gguf import GGUFLoader
from transformers.configuration_utils import PretrainedConfig
@add_start_docstrings(
"The bare Qwen2MoE Model outputting raw hidden-states without any specific head on top.",
QWEN2MOE_START_DOCSTRING,
)
class Qwen2MoeModelPerLayerPrefill(BaseInjectedModule):
class Qwen2MoeModelKTransformers(BaseInjectedModule):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2MoeDecoderLayer`]
@ -198,10 +170,13 @@ class Qwen2MoeModelPerLayerPrefill(BaseInjectedModule):
orig_module: nn.Module,
device: str = "cuda",
per_layer_prefill_intput_threshold: int = 30000, # if None, no per-layer prefill
transfer_map: dict = None,
**kwargs,
):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
self.per_layer_prefill_intput_threshold = per_layer_prefill_intput_threshold
self.transfer_map = transfer_map
self.stream_device_map = dict()
@add_start_docstrings_to_model_forward(QWEN2MOE_INPUTS_DOCSTRING)
def forward(
@ -287,7 +262,20 @@ class Qwen2MoeModelPerLayerPrefill(BaseInjectedModule):
all_router_logits = () if output_router_logits else None
next_decoder_cache = None
for decoder_layer in self.layers:
for i, decoder_layer in enumerate(self.layers):
if self.transfer_map is not None and i in self.transfer_map:
prev_stream = torch.cuda.current_stream()
cur_device = self.transfer_map[i]
if cur_device not in self.stream_device_map:
self.stream_device_map[cur_device] = torch.cuda.Stream(cur_device)
torch.cuda.set_device(cur_device)
self.stream_device_map[cur_device].wait_stream(prev_stream)
torch.cuda.set_stream(self.stream_device_map[cur_device])
hidden_states = hidden_states.to(self.transfer_map[i], non_blocking = True)
causal_mask = causal_mask.to(self.transfer_map[i], non_blocking = True) if causal_mask is not None else None
position_ids = position_ids.to(self.transfer_map[i], non_blocking = True) if position_ids is not None else None
cache_position = cache_position.to(self.transfer_map[i], non_blocking = True) if cache_position is not None else None
if output_hidden_states:
all_hidden_states += (hidden_states,)
@ -463,7 +451,7 @@ DeepseekV2_INPUTS_DOCSTRING = r"""
"""
class DeepseekV2ModelPerLayerPrefill(BaseInjectedModule):
class DeepseekV2ModelKTransformers(BaseInjectedModule):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV2DecoderLayer`]
@ -478,10 +466,13 @@ class DeepseekV2ModelPerLayerPrefill(BaseInjectedModule):
orig_module: nn.Module,
device: str = "cuda",
per_layer_prefill_intput_threshold: int = 30000, # if None, no per-layer prefill
transfer_map: dict = None,
**kwargs,
):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
self.per_layer_prefill_intput_threshold = per_layer_prefill_intput_threshold
self.transfer_map = transfer_map
self.stream_device_map = dict()
@add_start_docstrings_to_model_forward(DeepseekV2_INPUTS_DOCSTRING)
def forward(
@ -584,7 +575,20 @@ class DeepseekV2ModelPerLayerPrefill(BaseInjectedModule):
t_cpu = 0
t_f = 0
for decoder_layer in self.layers:
for i, decoder_layer in enumerate(self.layers):
if self.transfer_map is not None and i in self.transfer_map:
prev_stream = torch.cuda.current_stream()
cur_device = self.transfer_map[i]
if cur_device not in self.stream_device_map:
self.stream_device_map[cur_device] = torch.cuda.Stream(cur_device)
torch.cuda.set_device(cur_device)
self.stream_device_map[cur_device].wait_stream(prev_stream)
torch.cuda.set_stream(self.stream_device_map[cur_device])
hidden_states = hidden_states.to(self.transfer_map[i], non_blocking = True)
causal_mask = causal_mask.to(self.transfer_map[i], non_blocking = True) if causal_mask is not None else None
position_ids = position_ids.to(self.transfer_map[i], non_blocking = True) if position_ids is not None else None
cache_position = cache_position.to(self.transfer_map[i], non_blocking = True) if cache_position is not None else None
if output_hidden_states:
all_hidden_states += (hidden_states,)

View file

@ -176,7 +176,7 @@ class QuantizedLinearMarlin(QuantizedLinearBase):
self.act_order = act_order
self.is_k_full = is_k_full
def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = "cuda"):
def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None):
if device is None: device = self.device
assert device.lower() != "cpu", "Marlin quantized linear only supports GPU device"
if w is None: w = self.load_weight(device=device)
@ -200,7 +200,7 @@ class QuantizedLinearMarlin(QuantizedLinearBase):
weight, self.num_bits, self.group_size, self.act_order
)
self.workspace = MarlinWorkspace(
self.out_features, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL
self.out_features, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL, self.device
)
self.marlin_q_w = marlin_q_w
self.marlin_s = marlin_s
@ -247,7 +247,6 @@ class QuantizedLinearMarlin(QuantizedLinearBase):
LINEAR_MAP = {
"QuantizedLinearMarlin": QuantizedLinearMarlin,
"QuantizedLinearTorch": QuantizedLinearTorch,
"QuantizedLinearTorch": QuantizedLinearTorch,
}
class KTransformerLinear(BaseInjectedModule, QuantizedLinearBase):
@ -257,15 +256,15 @@ class KTransformerLinear(BaseInjectedModule, QuantizedLinearBase):
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
# device: str = "cuda",
generate_device: str = "cuda",
generate_op: str| None = "QuantizedLinearMarlin",
prefill_device: str = "cuda",
prefill_op: str| None = "QuantizedLinearTorch",
**kwargs,
):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
QuantizedLinearBase.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
QuantizedLinearBase.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
# build all the linear operators
if prefill_op is not None:
assert prefill_op in LINEAR_MAP, f"linear_type {prefill_op} not supported"
@ -289,7 +288,6 @@ class KTransformerLinear(BaseInjectedModule, QuantizedLinearBase):
self.generate_linear = LINEAR_MAP[generate_op](key, gguf_loader, config, orig_module, generate_device, **kwargs)
else:
self.generate_linear = None
self.device = device
self.mode = InferenceState.UNLOAD
def forward(self, x):

View file

@ -1,6 +1,6 @@
'''
Description :
Author : Boxin Zhang
Author : Boxin Zhang, Azure-Tang
Version : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
@ -15,6 +15,7 @@ from transformers.configuration_utils import PretrainedConfig
from ktransformers.util.custom_gguf import GGUFLoader, translate_name_to_gguf
from ktransformers.util.utils import set_module, load_weights
import itertools
import copy
def inject(module, local_optimization_dict, model_config:AutoConfig ,gguf_loader:GGUFLoader, prefix=''):
for name, child in module._modules.items():
@ -22,18 +23,20 @@ def inject(module, local_optimization_dict, model_config:AutoConfig ,gguf_loader
child_prefix = prefix + name
if child_prefix in local_optimization_dict:
inject_module_meta=local_optimization_dict[child_prefix]
if isinstance(inject_module_meta, Mapping):
if inject_module_meta["class"] != "default":
import_path = inject_module_meta["class"].split(".")
import_module_name = ".".join(import_path[:-1])
gguf_loader.tensor_device_map[inject_module_meta["key"]] = inject_module_meta["kwargs"] if "kwargs" in inject_module_meta else dict()
import_class_name = import_path[-1]
module_cls=getattr(__import__(import_module_name, fromlist=[""]), import_class_name)
print(f"Injecting {child_prefix} as", import_module_name, ".", import_class_name)
inject_module=module_cls(key = inject_module_meta["key"], gguf_loader = gguf_loader, config = model_config, orig_module=child, device = inject_module_meta["device"], **inject_module_meta["kwargs"])
inject_module=module_cls(key = inject_module_meta["key"], gguf_loader = gguf_loader, config = model_config, orig_module=child, **inject_module_meta["kwargs"])
set_module(module, name, inject_module)
elif isinstance(inject_module_meta, str):
assert inject_module_meta=="default", "for str inject_module_meta, only support \"default\"."
elif inject_module_meta["class"] == "default":
print(f"Injecting {child_prefix} as default")
gguf_loader.tensor_device_map[inject_module_meta["key"]] = inject_module_meta["kwargs"] if "kwargs" in inject_module_meta else dict()
else:
raise Exception("inject_module_meta must be a dict or str")
raise Exception("inject_module_meta[\"class\"] must be \"default\" or a class path")
child_prefix += "."
child_optimization_dict = {k: v for k, v in local_optimization_dict.items() if k.startswith(child_prefix)}
inject(child, child_optimization_dict, model_config, gguf_loader, child_prefix)
@ -57,6 +60,8 @@ def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, p
for rule in rule_list:
#print(rule)
match_meta = rule["match"]
if "class" not in match_meta and "name" not in match_meta:
raise Exception("match must have at least one of \"class\" and \"name\"")
if "class" in match_meta:
import_path = match_meta["class"].split(".")
import_module_name = ".".join(import_path[:-1])
@ -67,16 +72,29 @@ def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, p
if "name" in match_meta:
if re.search(match_meta["name"], module_name) is None:
continue
if "replace" not in rule:
raise Exception("replace must be in rule")
if "replace" in rule:
replace_meta = rule["replace"]
if module_name not in out_data:
out_data[module_name]={"key": translated_name,
"class": replace_meta["class"],
"device": replace_meta["device"] if "device" in replace_meta else default_device,
"kwargs": replace_meta["kwargs"] if "kwargs" in replace_meta else dict()}
"class": replace_meta["class"] if "class" in replace_meta else "default",
# "device": replace_meta["device"] if "device" in replace_meta else default_device,
"kwargs": copy.deepcopy(replace_meta["kwargs"]) if "kwargs" in replace_meta else dict()}
else:
if out_data[module_name]["class"] == "default":
out_data[module_name]["class"] = replace_meta["class"] if "class" in replace_meta else "default"
out_data[module_name]["kwargs"].update(copy.deepcopy(replace_meta["kwargs"]) if "kwargs" in replace_meta else dict())
if "recursive" in rule:
recursive = bool(rule["recursive"])
if module_name not in out_data:
out_data[module_name]="default"
out_data[module_name]= {
"class": "default",
"key": translated_name,
"kwargs": {"generate_device": default_device,
"prefill_device": default_device}
}
#print(out_data[module_name])
#input()
@ -88,6 +106,14 @@ def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, p
gen_optimize_config(child, out_data, rule_list, child_prefix)
def translate_model_config(model_config: PretrainedConfig):
# for supporting some special models
if model_config.model_type == "mixtral":
model_config.moe_intermediate_size = model_config.intermediate_size
return model_config
def optimize_and_load_gguf(module: nn.Module, rule_file: str, gguf_path: str, model_config: PretrainedConfig, default_device: str = "cuda:0"):
with open(rule_file, 'r', encoding='utf-8') as f:
rule_list = yaml.load(f.read(), Loader=yaml.FullLoader)
@ -95,8 +121,11 @@ def optimize_and_load_gguf(module: nn.Module, rule_file: str, gguf_path: str, mo
optimize_config = dict()
gen_optimize_config(module, optimize_config, rule_list, default_device = default_device)
model_config = translate_model_config(model_config)
gguf_loader=GGUFLoader(gguf_path)
with torch.device("meta"):
inject(module, optimize_config, model_config, gguf_loader)
load_weights(module, gguf_loader)
model_config.gguf_loader = gguf_loader
del_meta(module)
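
After `yaml.load`, each rule is a plain dict: `gen_optimize_config` now treats `class` in `replace` as optional (defaulting to "default"), drops the per-rule `device` field in favour of `generate_device`/`prefill_device` kwargs, and merges `kwargs` across matching rules. A minimal parsed rule of the shape handled above (regex and kwargs are illustrative):

# Illustrative parsed rule matching the schema consumed by gen_optimize_config.
rule = {
    "match": {"name": r"^model\.layers\..*\.mlp\.experts$"},   # "class" may be given as well
    "replace": {
        "class": "ktransformers.operators.experts.KTransformersMLPExpert",
        "kwargs": {"generate_device": "cpu", "prefill_device": "cuda"},
    },
    "recursive": False,   # don't descend into this module's children
}
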

View file

@ -0,0 +1,228 @@
- match:
name: "^model\\.layers\\.([0-9])\\."
replace:
class: "default"
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "(^model\\.layers\\.([1][0-9])\\.)"
replace:
class: "default"
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "(^model\\.layers\\.([2][0-9])\\.)"
replace:
class: "default"
kwargs:
generate_device: "cuda:2"
prefill_device: "cuda:2"
- match:
name: "(^model\\.layers\\.([345][0-9])\\.)|(^model.norm)|(^lm_head)"
replace:
class: "default"
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"
- match:
name: "^model.embed_tokens"
replace:
class: "default"
kwargs:
generate_device: "cpu"
prefill_device: "cpu"
- match:
name: "^model\\.layers\\.([0-9])\\."
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbedding
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "^model\\.layers\\.([1][0-9])\\."
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbedding
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model\\.layers\\.([2][0-9])\\."
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbedding
kwargs:
generate_device: "cuda:2"
prefill_device: "cuda:2"
- match:
name: "^model\\.layers\\.([345][0-9])\\."
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbedding
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"
- match:
name: "^model\\.layers\\.([1][0-9])\\.(?!self_attn).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
generate_op: "QuantizedLinearMarlin"
prefill_op: "QuantizedLinearTorch"
- match:
name: "^model\\.layers\\.([1][0-9])\\.(?!self_attn).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
generate_op: "QuantizedLinearMarlin"
prefill_op: "QuantizedLinearTorch"
- match:
name: "^model\\.layers\\.([2][0-9])\\.(?!self_attn).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda:2"
prefill_device: "cuda:2"
generate_op: "QuantizedLinearMarlin"
prefill_op: "QuantizedLinearTorch"
- match:
name: "^model\\.layers\\.([345][0-9])\\.(?!self_attn).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"
generate_op: "QuantizedLinearMarlin"
prefill_op: "QuantizedLinearTorch"
- match:
name: "^model\\.layers\\.([0-9])\\.mlp$"
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
replace:
class: ktransformers.operators.experts.DeepseekV2MoEInjected # mlp module with custom forward function
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "^model\\.layers\\.([1][0-9])\\.mlp$"
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
replace:
class: ktransformers.operators.experts.DeepseekV2MoEInjected # mlp module with custom forward function
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model\\.layers\\.([2][0-9])\\.mlp$"
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
replace:
class: ktransformers.operators.experts.DeepseekV2MoEInjected # mlp module with custom forward function
kwargs:
generate_device: "cuda:2"
prefill_device: "cuda:2"
- match:
name: "^model\\.layers\\.([345][0-9])\\.mlp$"
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
replace:
class: ktransformers.operators.experts.DeepseekV2MoEInjected # mlp module with custom forward function
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"
- match:
name: "^model\\.layers\\.([0-9])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert parallelism
kwargs:
prefill_device: "cuda:0"
prefill_mlp_type: "MLPExpertsTorch"
generate_device: "cpu"
generate_mlp_type: "MLPCPUExperts"
out_device: "cuda:0"
recursive: False # don't recursively inject submodules of this module
- match:
name: "^model\\.layers\\.([1][0-9])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert parallelism
kwargs:
prefill_device: "cuda:1"
prefill_mlp_type: "MLPExpertsTorch"
generate_device: "cpu"
generate_mlp_type: "MLPCPUExperts"
out_device: "cuda:1"
recursive: False # don't recursively inject submodules of this module
- match:
name: "^model\\.layers\\.([2][0-9])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert parallelism
kwargs:
prefill_device: "cuda:2"
prefill_mlp_type: "MLPExpertsTorch"
generate_device: "cpu"
generate_mlp_type: "MLPCPUExperts"
out_device: "cuda:2"
recursive: False # don't recursively inject submodules of this module
- match:
name: "^model\\.layers\\.([345][0-9])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert parallelism
kwargs:
prefill_device: "cuda:3"
prefill_mlp_type: "MLPExpertsTorch"
generate_device: "cpu"
generate_mlp_type: "MLPCPUExperts"
out_device: "cuda:3"
recursive: False # don't recursively inject submodules of this module
- match:
name: "^model\\.layers\\.([0-9])\\.self_attn$"
replace:
class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "^model\\.layers\\.([1][0-9])\\.self_attn$"
replace:
class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model\\.layers\\.([2][0-9])\\.self_attn$"
replace:
class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation
kwargs:
generate_device: "cuda:2"
prefill_device: "cuda:2"
- match:
name: "^model\\.layers\\.([345][0-9])\\.self_attn$"
replace:
class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"
- match:
name: "^model$"
replace:
class: "ktransformers.operators.layer_wise_prefill.DeepseekV2ModelKTransformers"
kwargs:
per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
transfer_map:
10: "cuda:1"
20: "cuda:2"
30: "cuda:3"
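
When adapting this split, the layer-range regexes and the `transfer_map` entries (10/20/30 above) have to stay consistent with each other. A quick, hypothetical sanity check (not part of the repo) over the four ranges used in this file, assuming 60 decoder layers:

# Hypothetical helper: every layer index should fall in exactly one device range.
import re

ranges = {
    "cuda:0": r"^[0-9]$",        # layers 0-9
    "cuda:1": r"^[1][0-9]$",     # layers 10-19
    "cuda:2": r"^[2][0-9]$",     # layers 20-29
    "cuda:3": r"^[345][0-9]$",   # layers 30-59
}
for layer in range(60):
    hits = [dev for dev, pat in ranges.items() if re.match(pat, str(layer))]
    assert len(hits) == 1, f"layer {layer} matched {hits}"
print("layer/device split is consistent")
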

View file

@ -0,0 +1,126 @@
- match:
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
replace:
class: "default"
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "(^model\\.layers\\.([345][0-9])\\.)|(model.norm)|(lm_head)"
replace:
class: "default"
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model.embed_tokens"
replace:
class: "default"
kwargs:
generate_device: "cpu"
prefill_device: "cpu"
- match:
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbedding
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "^model\\.layers\\.([345][0-9])\\."
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbedding
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.(?!self_attn).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
generate_op: "QuantizedLinearMarlin"
prefill_op: "QuantizedLinearTorch"
- match:
name: "^model\\.layers\\.([345][0-9])\\.(?!self_attn).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
generate_op: "QuantizedLinearMarlin"
prefill_op: "QuantizedLinearTorch"
- match:
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp$"
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
replace:
class: ktransformers.operators.experts.DeepseekV2MoEInjected # mlp module with custom forward function
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "^model\\.layers\\.([345][0-9])\\.mlp$"
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
replace:
class: ktransformers.operators.experts.DeepseekV2MoEInjected # mlp module with custom forward function
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert parallelism
kwargs:
prefill_device: "cuda:0"
prefill_mlp_type: "MLPExpertsTorch"
generate_device: "cpu"
generate_mlp_type: "MLPCPUExperts"
out_device: "cuda:0"
recursive: False # don't recursively inject submodules of this module
- match:
name: "^model\\.layers\\.([345][0-9])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert parallelism
kwargs:
prefill_device: "cuda:1"
prefill_mlp_type: "MLPExpertsTorch"
generate_device: "cpu"
generate_mlp_type: "MLPCPUExperts"
out_device: "cuda:1"
recursive: False # don't recursively inject submodules of this module
- match:
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.self_attn$"
replace:
class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "^model\\.layers\\.([345][0-9])\\.self_attn$"
replace:
class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model$"
replace:
class: "ktransformers.operators.layer_wise_prefill.DeepseekV2ModelKTransformers"
kwargs:
per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
transfer_map:
30: "cuda:1"

View file

@ -1,3 +1,10 @@
- match:
name: "^model\\.layers\\..*\\.|^lm_head"
replace:
class: "default"
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- match:
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
replace:
@ -21,7 +28,6 @@
name: "^model\\.layers\\..*\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert parallelism
device: "cpu" # which devices to load this module when initializing
kwargs:
prefill_device: "cuda"
prefill_mlp_type: "MLPExpertsTorch"
@ -36,6 +42,13 @@
- match:
name: "^model$"
replace:
class: "ktransformers.operators.layer_wise_prefill.DeepseekV2ModelPerLayerPrefill"
class: "ktransformers.operators.layer_wise_prefill.DeepseekV2ModelKTransformers"
kwargs:
per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
- match:
name: "^model.embed_tokens"
replace:
class: "default"
kwargs:
generate_device: "cpu"
prefill_device: "cpu"

View file

@ -0,0 +1,126 @@
- match:
name: "^model\\.layers\\.(0|[1-9])\\."
replace:
class: "default"
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "(^model\\.layers\\.([12][0-9])\\.)|(model.norm)|(lm_head)"
replace:
class: "default"
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model.embed_tokens"
replace:
class: "default"
kwargs:
generate_device: "cpu"
prefill_device: "cpu"
- match:
name: "^model\\.layers\\.(0|[1-9])\\."
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbedding
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "^model\\.layers\\.([12][0-9])\\."
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbedding
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model\\.layers\\.(0|[1-9])\\.(?!self_attn).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
generate_op: "QuantizedLinearMarlin"
prefill_op: "QuantizedLinearTorch"
- match:
name: "^model\\.layers\\.([12][0-9])\\.(?!self_attn).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
generate_op: "QuantizedLinearMarlin"
prefill_op: "QuantizedLinearTorch"
- match:
name: "^model\\.layers\\.(0|[1-9])\\.mlp$"
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
replace:
class: ktransformers.operators.experts.DeepseekV2MoEInjected # mlp module with custom forward function
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "^model\\.layers\\.([12][0-9])\\.mlp$"
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
replace:
class: ktransformers.operators.experts.DeepseekV2MoEInjected # mlp module with custom forward function
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model\\.layers\\.(0|[1-9])\\.mlp\\.experts$"
replace:
      class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert parallelism
kwargs:
prefill_device: "cuda:0"
prefill_mlp_type: "MLPExpertsTorch"
generate_device: "cpu"
generate_mlp_type: "MLPCPUExperts"
out_device: "cuda:0"
recursive: False # don't recursively inject submodules of this module
- match:
name: "^model\\.layers\\.([12][0-9])\\.mlp\\.experts$"
replace:
      class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert parallelism
kwargs:
prefill_device: "cuda:1"
prefill_mlp_type: "MLPExpertsTorch"
generate_device: "cpu"
generate_mlp_type: "MLPCPUExperts"
out_device: "cuda:1"
recursive: False # don't recursively inject submodules of this module
- match:
name: "^model\\.layers\\.(0|[1-9])\\.self_attn$"
replace:
class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "^model\\.layers\\.([12][0-9])\\.self_attn$"
replace:
class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model$"
replace:
class: "ktransformers.operators.layer_wise_prefill.DeepseekV2ModelKTransformers"
kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
transfer_map:
10: "cuda:1"
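
The two-GPU split above relies on the layer-index patterns `(0|[1-9])` (layers 0-9 on cuda:0) and `([12][0-9])` (layers 10-29 on cuda:1) partitioning the layer range. A quick standalone check, not part of the repo, that the two patterns are disjoint and cover every layer:

# Sanity check that the two layer-range regexes split layers 0-29 cleanly.
import re

pat_gpu0 = re.compile(r"^model\.layers\.(0|[1-9])\.")
pat_gpu1 = re.compile(r"^model\.layers\.([12][0-9])\.")

for i in range(30):
    name = f"model.layers.{i}.self_attn"
    on_gpu0 = bool(pat_gpu0.match(name))
    on_gpu1 = bool(pat_gpu1.match(name))
    assert on_gpu0 != on_gpu1, name   # exactly one rule matches each layer
    assert on_gpu0 == (i < 10)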

View file

@ -0,0 +1,45 @@
- match:
name: "^model\\.layers\\..*\\."
replace:
class: "default"
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- match:
class: ktransformers.models.modeling_mixtral.MixtralRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.RotaryEmbedding
- match:
name: "^model\\.layers\\..*$"
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
generate_op: "QuantizedLinearMarlin"
prefill_op: "QuantizedLinearTorch"
- match:
name: "^model\\.layers\\..*\\.block_sparse_moe$"
class: ktransformers.models.modeling_mixtral.MixtralSparseMoeBlock
replace:
class: ktransformers.operators.experts.MisrtalSparseMoEBlockInjected
- match:
name: "^model\\.layers\\..*\\.block_sparse_moe\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersMLPExpert
kwargs:
prefill_device: "cuda"
prefill_mlp_type: "MLPExpertsTorch"
generate_device: "cpu"
generate_mlp_type: "MLPCPUExperts"
out_device: "cuda"
recursive: False # don't recursively inject submodules of this module
- match:
name: "^model.embed_tokens"
replace:
class: "default"
kwargs:
generate_device: "cpu"
prefill_device: "cpu"
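
The Mixtral rules keep expert weights in host RAM during generation (`generate_device: "cpu"`) while returning the result on `out_device`. The sketch below only illustrates that data movement with plain torch; the shapes are made up and this is not the MLPCPUExperts kernel.

# Rough sketch of the generate_device/out_device split: weights stay on CPU,
# the activation is pulled to CPU for the expert MLP, and only the output is
# copied back to the GPU.
import torch
import torch.nn.functional as F

hidden = torch.randn(1, 4096, device="cuda")   # activation living on out_device
w_gate = torch.randn(4096, 14336)              # expert weights kept on CPU
w_up   = torch.randn(4096, 14336)
w_down = torch.randn(14336, 4096)

x = hidden.to("cpu")
y = (F.silu(x @ w_gate) * (x @ w_up)) @ w_down  # expert MLP runs on CPU
out = y.to("cuda")                              # only the result moves to out_device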

View file

@ -0,0 +1,111 @@
- match:
name: "^model\\.layers\\.([012])\\."
replace:
class: "default"
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "^model\\.layers\\.([012])\\."
class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.RotaryEmbedding
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "^model\\.layers\\.([012])$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
generate_op: "QuantizedLinearMarlin"
prefill_op: "QuantizedLinearTorch"
- match:
name: "^model\\.layers\\.([012])\\.mlp$"
class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock
replace:
class: ktransformers.operators.experts.Qwen2MoeSparseMoeBlockInjected # mlp module with custom forward function
- match:
name: "^model\\.layers\\.([012])\\.mlp\\.experts$"
replace:
      class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert parallelism
# device: "cpu" # which devices to load this module when initializing
kwargs:
prefill_device: "cuda:0"
prefill_mlp_type: "MLPExpertsTorch"
generate_device: "cpu"
generate_mlp_type: "MLPCPUExperts"
out_device: "cuda:0"
recursive: False # don't recursively inject submodules of this module
- match:
name: "^model\\.layers\\.([12][0-9]|[3-9])\\."
replace:
class: "default"
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model\\.layers\\.([12][0-9]|[3-9])\\."
class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.RotaryEmbedding
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model\\.layers\\.([12][0-9]|[3-9])$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
generate_op: "QuantizedLinearMarlin"
prefill_op: "QuantizedLinearTorch"
- match:
name: "^model\\.layers\\.([12][0-9]|[3-9])\\.mlp$"
class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock
replace:
class: ktransformers.operators.experts.Qwen2MoeSparseMoeBlockInjected # mlp module with custom forward function
- match:
name: "^model\\.layers\\.([12][0-9]|[3-9])\\.mlp\\.experts$"
replace:
      class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert parallelism
# device: "cpu" # which devices to load this module when initializing
kwargs:
prefill_device: "cuda:1"
prefill_mlp_type: "MLPExpertsTorch"
generate_device: "cpu"
generate_mlp_type: "MLPCPUExperts"
out_device: "cuda:1"
recursive: False # don't recursively inject submodules of this module
- match:
name: "^model.embed_tokens"
replace:
class: "default"
kwargs:
generate_device: "cpu"
prefill_device: "cpu"
- match:
name: "(^model.norm)|(^lm_head)"
replace:
class: "default"
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model$"
replace:
class: "ktransformers.operators.layer_wise_prefill.Qwen2MoeModelKTransformers"
kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
transfer_map:
3: "cuda:1"

View file

@ -1,3 +1,10 @@
- match:
name: "^model\\.layers\\..*\\."
replace:
class: "default"
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- match:
class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
replace:
@ -21,7 +28,7 @@
name: "^model\\.layers\\..*\\.mlp\\.experts$"
replace:
    class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert parallelism
device: "cpu" # which devices to load this module when initializing
# device: "cpu" # which devices to load this module when initializing
kwargs:
prefill_device: "cuda"
prefill_mlp_type: "MLPExpertsTorch"
@ -32,6 +39,13 @@
- match:
name: "^model$"
replace:
class: "ktransformers.operators.layer_wise_prefill.Qwen2MoeModelPerLayerPrefill"
class: "ktransformers.operators.layer_wise_prefill.Qwen2MoeModelKTransformers"
kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
- match:
name: "^model.embed_tokens"
replace:
class: "default"
kwargs:
generate_device: "cpu"
prefill_device: "cpu"

View file

@ -1,12 +1,9 @@
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"
# os.environ["CUDA_VISIBLE_DEVICES"]="1,2"
# add path
import sys
current_path = os.path.abspath(os.path.dirname(__file__))
sys.path.append(current_path+"/../..")
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import numpy as np
# from ktransformers.operators.linear import KTransformerLinear, QuantizedLinearMarlin
# from ktransformers.operators.experts import KTransformersMLPExpert, MLPExpertsTorch
@ -18,36 +15,23 @@ import time
from transformers import (
AutoConfig,
)
import os
# CUDA_LAUNCH_BLOCKING=1
os.environ["CUDA_LAUNCH_BLOCKING"]="1"
gguf_config = GGUFLoader("/data/Qwen2-57B-A14B-Instruct-GGUF/q4_k_m")
model_name = "/data/Qwen2-57B-A14B-Instruct"
key = "blk.0."
target = "ffn_down_exps.weight"
t1 = time.time()
q_weight_cpu = gguf_config.load_gguf_tensor(key+target, "cpu")
# q_weight_cpu = torch.from_numpy(q_weight_cpu)
t2 = time.time()
q_weight_gpu = gguf_config.load_gguf_tensor(key+target, "cuda")
t3 = time.time()
print()
allclose = torch.allclose(q_weight_cpu, q_weight_gpu.cpu().to(torch.float32), atol=1e-6)
print(f"Q6k {key+target}")
print("load gguf tensor from cpu cost: ", t2-t1)
print("load gguf tensor from gpu cost: ", t3-t2)
print("allclose: ", allclose)
# Q4k
key = "blk.1."
target = "ffn_up_shexp.weight"
target = "attn_q.weight"
t1 = time.time()
q_weight_cpu = gguf_config.load_gguf_tensor(key+target, "cpu")
# q_weight_cpu = torch.from_numpy(q_weight_cpu)
t2 = time.time()
q_weight_gpu = gguf_config.load_gguf_tensor(key+target, "cuda")
q_weight_gpu = gguf_config.load_gguf_tensor(key+target, "cuda:0")
t3 = time.time()
print()
allclose = torch.allclose(q_weight_cpu, q_weight_gpu.cpu(), atol=1e-6)
@ -55,3 +39,20 @@ print(f"Q4k {key+target}")
print("load gguf tensor from cpu cost: ", t2-t1)
print("load gguf tensor from gpu cost: ", t3-t2)
print("allclose: ", allclose)
# Q6k
key = "blk.0."
target = "ffn_down_exps.weight"
t1 = time.time()
q_weight_cpu = gguf_config.load_gguf_tensor(key+target, "cpu")
t2 = time.time()
q_weight_gpu = gguf_config.load_gguf_tensor(key+target, "cuda:0")
t3 = time.time()
print()
allclose = torch.allclose(q_weight_cpu, q_weight_gpu.cpu().to(torch.float32), atol=1e-6)
print(f"Q6k {key+target}")
print("load gguf tensor from cpu cost: ", t2-t1)
print("load gguf tensor from gpu cost: ", t3-t2)
print("allclose: ", allclose)

View file

@ -11,7 +11,7 @@ from ktransformers.operators.linear import KTransformerLinear, QuantizedLinearMa
from ktransformers.operators.experts import KTransformersMLPExpert, MLPExpertsTorch
from ktransformers.util.custom_gguf import GGUFLoader, dequantize_q4_k_gpu, dequantize_q4_k
import torch
import CudaOps
import KTransformersOps
torch.set_default_dtype(torch.bfloat16)
import time
from transformers import (

View file

@ -21,6 +21,7 @@ class CUDAGraphRunner:
position_ids,
cache_position,
past_key_values,
main_device,
**kwargs,
) -> None:
assert self.graph is None
@ -29,15 +30,24 @@ class CUDAGraphRunner:
self.graph = torch.cuda.CUDAGraph()
#self.graph.enable_debug_mode()
self.model = model
inputs_embeds = model.model.embed_tokens(cur_token.to("cpu")).to("cuda")
with torch.cuda.graph(self.graph):
inputs_embeds = model.model.embed_tokens(cur_token.to("cpu")).to(main_device)
        # torch.cuda.set_device can't accept "cuda" alone, it must have an index
if main_device == "cuda":
main_device = "cuda:0"
torch.cuda.set_device(main_device)
self.main_device = main_device
capture_stream = torch.cuda.Stream()
with torch.cuda.graph(self.graph, stream = capture_stream):
logits=model(inputs_embeds=inputs_embeds,
position_ids=position_ids,
cache_position=cache_position,
past_key_values=past_key_values,
**kwargs)[0]
capture_stream.wait_stream(torch.cuda.current_stream())
torch.cuda.set_device(main_device)
torch.cuda.set_stream(capture_stream)
past_key_values.change_seq_length(-1)
torch.cuda.synchronize()
torch.cuda.synchronize(self.main_device)
#self.graph.debug_dump("cuda_graph_hooked.dot")
# Save the input and output buffers.
@ -65,7 +75,7 @@ class CUDAGraphRunner:
#print("begin replay")
#time.sleep(1)
self.graph.replay()
torch.cuda.synchronize()
torch.cuda.synchronize(self.main_device)
# Return the output tensor.
return self.output_buffers["logits"]
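
The changes above pin the graph to `main_device`, capture on a dedicated stream, and synchronize only that device. Below is a stripped-down, generic sketch of the same capture/replay pattern with `torch.cuda.CUDAGraph` on a toy model; it is not the ktransformers CUDAGraphRunner.

# Generic capture/replay sketch: warm up off the default stream, capture on a
# dedicated stream, update static inputs in place, then replay.
import torch

device = "cuda:0"
torch.cuda.set_device(device)            # set_device needs an indexed device
model = torch.nn.Linear(16, 16).to(device)
static_in = torch.randn(1, 16, device=device)

warmup = torch.cuda.Stream(device)
warmup.wait_stream(torch.cuda.current_stream(device))
with torch.cuda.stream(warmup):
    for _ in range(3):
        model(static_in)                 # lazy kernels run outside the capture
torch.cuda.current_stream(device).wait_stream(warmup)

graph = torch.cuda.CUDAGraph()
capture_stream = torch.cuda.Stream(device)
with torch.cuda.graph(graph, stream=capture_stream):
    static_out = model(static_in)

static_in.copy_(torch.randn(1, 16, device=device))  # refresh inputs in place
graph.replay()                                        # re-run captured kernels
torch.cuda.synchronize(device)                        # sync only this device
print(static_out)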

View file

@ -5,8 +5,11 @@ Description :
Author : Azure-Tang, Boxin Zhang, chenht2022
Date : 2024-07-26 08:48:54
Version : 1.0.0
LastEditors : Azure
LastEditTime : 2024-07-26 09:28:25
LastEditors : kkk1nak0
LastEditTime : 2024-08-09 08:03:44
Adapted from https://github.com/99991/pygguf/blob/main/gguf.py
Copyright (c) 2023-2024 The ggml authors
Copyright (c) 2024 Thomas Germer
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
# copied from llama.cpp/gguf-py/gguf/constants.py to satisfy dependence of gguf
@ -15,6 +18,7 @@ Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
import struct
import warnings
import numpy as np
import re
import numpy.typing as npt
from typing import Sequence
import os
@ -96,6 +100,8 @@ def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantization
GGML_TYPES = {
"F32": 0,
"F16": 1,
"Q4_0": 2,
"Q5_0": 6,
"Q8_0": 8,
"Q2_K": 10,
"Q3_K": 11,
@ -109,6 +115,8 @@ GGML_NAMES = {ggml_type: name for name, ggml_type in GGML_TYPES.items()}
GGML_BLOCK_SIZES = {
"F32": 4,
"F16": 2,
"Q4_0": 2 + 16,
"Q5_0": 2 + 4 + 16,
"Q8_0": 2 + 32,
"Q2_K": 256 // 16 + 256 // 4 + 2 + 2,
"Q3_K": 256 // 8 + 256 // 4 + 12 + 2,
@ -120,6 +128,8 @@ GGML_BLOCK_SIZES = {
GGML_ELEMENTS_PER_BLOCK = {
"F32": 1,
"F16": 1,
"Q4_0": 32,
"Q5_0": 32,
"Q8_0": 32,
"Q2_K": 256,
"Q3_K": 256,
@ -128,14 +138,6 @@ GGML_ELEMENTS_PER_BLOCK = {
"Q6_K": 256,
}
# DATA_TYPES = {
# "uint32": 4,
# "int32": 5,
# "float32": 6,
# "string": 8,
# "array": 9,
# "uint64": 10,
# }
DATA_TYPES = {
"uint8": 0,
"int8": 1,
@ -167,6 +169,7 @@ class GGUFLoader:
self.tensor_file_map = {}
self.file_data_map = {}
self.gguf_file_meta = {}
self.tensor_device_map = {}
# Walk through all the .gguf files in the directory
for root, dirs, files in os.walk(gguf_path):
@ -283,14 +286,27 @@ class GGUFLoader:
data = self.get_mmap_tensor(name)
if "cuda" in device.lower():
values = GGML_DEQUANTIZE_GPU[ggml_name](data, device)
#values = GGML_DEQUANTIZE[ggml_name](data)
#print("load_gguf_tensor")
#values = torch.from_numpy(values).to(device = device)
else:
values = GGML_DEQUANTIZE[ggml_name](data)
values = torch.from_numpy(values)
return values.view(shape[::-1])
values = values.view(shape[::-1])
if "attn_q" in name and self.gguf_file_meta['general.architecture'] in ["llama"]:
n_head = self.gguf_file_meta['llama.attention.head_count']
values = (values.reshape(n_head, values.shape[0] // n_head // 2, 2, *values.shape[1:])
.swapaxes(1, 2)
.reshape(values.shape))
elif "attn_k" in name and self.gguf_file_meta['general.architecture'] in ["llama"]:
n_head = self.gguf_file_meta['llama.attention.head_count_kv']
values = (values.reshape(n_head, values.shape[0] // n_head // 2, 2, *values.shape[1:])
.swapaxes(1, 2)
.reshape(values.shape))
return values
def read_value(f, data_type):
if data_type == DATA_TYPES["string"]:
@ -375,7 +391,7 @@ def dequantize_q2_k(data):
return d * (scales & 15) * (tmp & 3) - dmin * (scales >> 4)
def dequantize_q2_k_gpu(data):
pass
raise NotImplementedError()
def dequantize_q3_k(data):
# C implementation
@ -420,7 +436,7 @@ def dequantize_q3_k(data):
], axis=1)
def dequantize_q3_k_gpu(data):
pass
raise NotImplementedError()
def dequantize_q4_k(data):
# C implementation
@ -429,20 +445,16 @@ def dequantize_q4_k(data):
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L116
block_size = GGML_BLOCK_SIZES["Q4_K"]
num_blocks = len(data) // block_size
data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2)
data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size)
# Casting to float32 because float16 is very slow on CPU
scale_factors = data_f16[:, 0].reshape(num_blocks, 1, 1).astype(np.float32)
scale_offsets = data_f16[:, 1].reshape(num_blocks, 1, 1).astype(np.float32)
qs1 = data_u8[:, 4:16].reshape(num_blocks, 12, 1)
qs2 = data_u8[:, 16:].reshape(num_blocks, 4, 32)
# Dequantize scales and offsets (6 bits and 4 + 2 bits)
factors = scale_factors * np.concatenate([qs1[:, 0:4] & 0b111111, (qs1[:, 8:] & 15) | ((qs1[:, 0:4] >> 6) << 4)], axis=1)
offsets = scale_offsets * np.concatenate([qs1[:, 4:8] & 0b111111, (qs1[:, 8:] >> 4) | ((qs1[:, 4:8] >> 6) << 4)], axis=1)
# Interleave low and high quantized bits
qs2 = np.stack([qs2 & 0xf, qs2 >> 4], axis=2).reshape(num_blocks, 8, 32)
# Dequantize final weights using scales and offsets
@ -512,9 +524,14 @@ def dequantize_q5_k(data):
d8 * (qs_hi_4[:, 3] + (bits[:, :, 7] << 4)) - m8,
], axis=1)
def dequantize_q5_k_gpu(data):
pass
def dequantize_q5_k_gpu(data, device:str ="cuda"):
block_size = GGML_BLOCK_SIZES["Q5_K"]
data = np.frombuffer(data, dtype=data.dtype)
device = torch.device(device)
# TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable,
    # the best way to fix this is to pass the ptr to KTransformersOps instead of a Tensor.
data = torch.from_numpy(data)
return KTransformersOps.dequantize_q5_k(data, block_size, device)
def dequantize_q6_k(data):
# C implementation
@ -571,7 +588,49 @@ def dequantize_q6_k_gpu(data: np.ndarray, device:str = "cuda"):
num_blocks = len(data) // block_size
data = np.frombuffer(data, dtype=data.dtype)
data = torch.from_numpy(data)
return KTransformersOps.dequantize_q6_k(data, 210, device)
return KTransformersOps.dequantize_q6_k(data, block_size, device)
def dequantize_q4_0(data):
# C implementation
# https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-quants.c#L1515
# C struct definition
# https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-common.h#L141
num_blocks = len(data) // GGML_BLOCK_SIZES["Q4_0"]
scales = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, 1 + 8)[:, :1].astype(np.float32)
qs = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, 2 + 16)[:, 2:]
return np.concatenate([
scales * ((qs & 0xf).astype(np.int8) - 8),
scales * ((qs >> 4).astype(np.int8) - 8),
], axis=1)
def dequantize_q4_0_gpu(data):
raise NotImplementedError()
def dequantize_q5_0(data):
# C implementation
# https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-quants.c#L1556
# C struct definition
# https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-common.h#L161
num_blocks = len(data) // GGML_BLOCK_SIZES["Q5_0"]
scales = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, 1 + 2 + 8)[:, :1].astype(np.float32)
qh = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, 2 + 4 + 16)[:, 2:2 + 4]
qs = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, 2 + 4 + 16)[:, 2 + 4:]
bits = np.unpackbits(qh, axis=-1, bitorder="little")
x0 = ((qs & 0xf).astype(np.int8) | (bits[:, :16] << 4)) - 16
x1 = ((qs >> 4).astype(np.int8) | (bits[:, 16:] << 4)) - 16
return np.concatenate([
scales * x0,
scales * x1,
], axis=1)
def dequantize_q5_0_gpu(data):
raise NotImplementedError()
def dequantize_q8_0(data):
# C struct definition
@ -615,6 +674,8 @@ def dequantize_f16_gpu(data, device):
GGML_DEQUANTIZE = {
"F32": dequantize_f32,
"F16": dequantize_f16,
"Q4_0": dequantize_q4_0,
"Q5_0": dequantize_q5_0,
"Q8_0": dequantize_q8_0,
"Q2_K": dequantize_q2_k,
"Q3_K": dequantize_q3_k,
@ -626,6 +687,8 @@ GGML_DEQUANTIZE = {
GGML_DEQUANTIZE_GPU = {
"F32": dequantize_f32_gpu,
"F16": dequantize_f16_gpu,
"Q4_0": dequantize_q4_0_gpu,
"Q5_0": dequantize_q5_0_gpu,
"Q8_0": dequantize_q8_0_gpu,
"Q2_K": dequantize_q2_k_gpu,
"Q3_K": dequantize_q3_k_gpu,
@ -634,7 +697,34 @@ GGML_DEQUANTIZE_GPU = {
"Q6_K": dequantize_q6_k_gpu,
}
def translate_name_to_gguf_mixtral(name):
replacement_template = {
"w1.weight": "ffn_gate",
"w2.weight": "ffn_down",
"w3.weight": "ffn_up"
}
pattern = re.compile(r"model.layers\.(\d+)\.block_sparse_moe\.experts\.(\d+)\.(w\d\.weight)")
def replace_match(match):
blk_id = match.group(1)
expert_id = match.group(2)
weight_type = match.group(3)
if weight_type in replacement_template:
return f"blk.{blk_id}.{replacement_template[weight_type]}.{expert_id}.weight"
else:
return match.group(0)
new_name = re.sub(pattern, replace_match, name)
return new_name
def translate_name_to_gguf(name):
name = translate_name_to_gguf_mixtral(name)
name = name.replace("lm_head.", "output.")
name = name.replace("model.embed_tokens.", "token_embd.")
name = name.replace("model.norm.", "output_norm.")
@ -671,9 +761,14 @@ def translate_name_to_gguf(name):
name = name.replace(".mlp.experts.ffn_gate_exps", ".ffn_gate_exps")
name = name.replace(".mlp.experts.ffn_up_exps", ".ffn_up_exps")
name = name.replace(".block_sparse_moe.gate.", ".ffn_gate_inp.")
name = name.replace(".block_sparse_moe.experts", "")
return name
if __name__ == '__main__':
gguf_path = '/mnt/data/model/DeepSeek-Coder-V2-GGUF-WJH'
loader = GGUFLoader(gguf_path)
loader.load_gguf_tensor('token_embd.weight')
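
The new translate_name_to_gguf_mixtral maps per-expert HF weight names onto GGUF expert tensor names before the generic translation runs. A quick example of the mapping, reusing the same regex and replacement template as the diff; the expected output assumes the function behaves exactly as written above.

# Example of the Mixtral expert-name mapping.
import re

replacement_template = {"w1.weight": "ffn_gate", "w2.weight": "ffn_down", "w3.weight": "ffn_up"}
pattern = re.compile(r"model.layers\.(\d+)\.block_sparse_moe\.experts\.(\d+)\.(w\d\.weight)")

def to_gguf_expert_name(name: str) -> str:
    def repl(m):
        blk, expert, wtype = m.group(1), m.group(2), m.group(3)
        if wtype in replacement_template:
            return f"blk.{blk}.{replacement_template[wtype]}.{expert}.weight"
        return m.group(0)
    return re.sub(pattern, repl, name)

print(to_gguf_expert_name("model.layers.3.block_sparse_moe.experts.7.w2.weight"))
# -> blk.3.ffn_down.7.weight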

View file

@ -39,6 +39,22 @@ def set_param(module: nn.Module, name: str, weights: torch.Tensor):
param.unsqueeze_(0)
setattr(module, name, param)
def get_device(gguf_module_key:str, device_map:dict):
if gguf_module_key in device_map:
return device_map[gguf_module_key]["generate_device"]
else:
return "cuda"
def get_all_used_cuda_device(device_map:dict):
all_device_list = set()
for key in device_map:
all_device_list.add(device_map[key]["generate_device"]) if "generate_device" in device_map[key] else None
all_device_list.add(device_map[key]["prefill_device"]) if "prefill_device" in device_map[key] else None
if "cpu" in all_device_list:
all_device_list.remove("cpu")
all_device_list = list(all_device_list)
return all_device_list
def load_cur_state_dict(module: nn.Module, gguf_loader: GGUFLoader, prefix: str = ""):
prefix = prefix.replace("orig_module.", "")
persistent_buffers = {k: v for k, v in module._buffers.items() if k not in module._non_persistent_buffers_set}
@ -47,18 +63,19 @@ def load_cur_state_dict(module: nn.Module, gguf_loader: GGUFLoader, prefix: str
for name, param in local_state.items():
key = prefix + name
translated_key = translate_name_to_gguf(key)
print("default loading weights", key, translated_key)
if translated_key in gguf_loader.tensor_file_map:
target_dtype = torch.get_default_dtype()
device = "cpu" if "embd" in translated_key else "cuda"
device = get_device(translated_key[:translated_key.rfind(".")], gguf_loader.tensor_device_map)
print(f"loading {translated_key} to {device}")
# device = "cpu" if "embd" in translated_key else "cuda"
weights = gguf_loader.load_gguf_tensor(translated_key, device = device).to(dtype = target_dtype)
set_param(module, name, weights)
del weights
else:
#print(load_config.tensor_file_map.keys())
raise Exception(f"can't fand {translated_key} in GGUF file!")
raise Exception(f"can't find {translated_key} in GGUF file!")
def load_weights(module:nn.Module, gguf_loader:GGUFLoader, prefix='', return_when_injected:bool = False, only_load_injected:bool = False):
def load_weights(module:nn.Module, gguf_loader:GGUFLoader, prefix=''):
# print(f"recursively loading weights {prefix},{return_when_injected=}, {only_load_injected=}")
if not isinstance(module, base_operator.BaseInjectedModule):
load_cur_state_dict(module, gguf_loader, prefix)
@ -67,26 +84,35 @@ def load_weights(module:nn.Module, gguf_loader:GGUFLoader, prefix='', return_whe
else:
module.load()
def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cuda_graph: bool = True):
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
torch._dynamo.config.suppress_errors = True
batch_size, seq_length = inputs.shape
torch_device = inputs.device
device_map = model.config.gguf_loader.tensor_device_map
torch_device = get_device('blk.0.self_attn', device_map)
torch_device = "cuda:0" if torch_device == "cuda" else torch_device
inputs = inputs.to(torch_device)
all_cuda_device = get_all_used_cuda_device(device_map)
tokens = []
def decode_one_tokens(cuda_graph_runner, cur_token, position_ids, cache_position, past_key_values):
def decode_one_tokens(cuda_graph_runner, cur_token, position_ids, cache_position, past_key_values, use_cuda_graph: bool = True):
if use_cuda_graph:
logits = cuda_graph_runner(cur_token, position_ids, cache_position)
past_key_values.change_seq_length(1)
"""
with torch.cuda.stream(custom_stream):
logits=model(cur_token,
else:
# custom_stream = torch.cuda.Stream()
torch.cuda.set_device(torch_device)
inputs_embeds = model.model.embed_tokens(cur_token.to("cpu")).to(torch_device)
# with torch.cuda.stream(custom_stream):
logits=model(inputs_embeds=inputs_embeds,
position_ids=position_ids,
cache_position=cache_position,
past_key_values=past_key_values,
return_dict=False, use_cache=True)[0]
#"""
torch.cuda.synchronize()
past_key_values.change_seq_length(1)
for device in all_cuda_device:
torch.cuda.synchronize(device)
#print(logits)
next_token_scores = logits_warper(inputs, logits[:, -1, :])
if generation_config.do_sample:
@ -96,10 +122,11 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
next_token = torch.argmax(next_token_scores, dim=-1)
return next_token
torch.cuda.set_device(torch_device)
with torch.no_grad():
stream = TextStreamer(tokenizer)
past_key_values = StaticCache(
config = model.config, max_batch_size = 1, max_cache_len = seq_length + max_new_tokens, device = torch_device, dtype = model.dtype
config = model.config, max_batch_size = 1, max_cache_len = seq_length + max_new_tokens, device = device_map, dtype = model.dtype
)
cache_position = torch.arange(seq_length, device=torch_device)
generated_ids = torch.zeros(
@ -108,23 +135,22 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
generated_ids[:, cache_position] = inputs.to(torch_device).to(torch.int)
past_key_values.cur_idx=cache_position
start_time = time.time()
#custom_stream = torch.cuda.Stream()
inputs_embeds = model.model.embed_tokens(inputs.to("cpu")).to("cuda")
inputs_embeds = model.model.embed_tokens(inputs.to("cpu")).to(torch_device)
logits = model(
inputs_embeds = inputs_embeds, cache_position=cache_position, past_key_values=past_key_values, return_dict=False, use_cache=True
)[0][:,-1,:].unsqueeze(0).clone()
)[0][:,-1,:].unsqueeze(0).clone().to(torch_device)
generation_config, model_kwargs = model._prepare_generation_config(
None, max_length=max_new_tokens,
do_sample=True, top_k=5, top_p=0.85, temperature=0.1 # change this to modify generate config
)
try: # transformers==4.43
logits_warper = (
model._get_logits_warper(generation_config,device=inputs.device) if generation_config.do_sample else None
model._get_logits_warper(generation_config,device=inputs.device)
)
except:
logits_warper = (
model._get_logits_warper(generation_config) if generation_config.do_sample else None
model._get_logits_warper(generation_config)
)
next_token_scores = logits_warper(inputs, logits[:, -1, :])
if generation_config.do_sample:
@ -136,7 +162,6 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
prefill_count = seq_length
prefill_time = first_token_time
print(stream.put(next_token.item()), end="", flush=True)
generated_ids[:, seq_length] = next_token
tokens.append(next_token)
@ -145,11 +170,15 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
position_ids = cache_position.unsqueeze(0)
seq_length += 1
if use_cuda_graph:
cuda_graph_runner = CUDAGraphRunner()
cuda_graph_runner.capture(model, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, return_dict=False, use_cache=True)
cuda_graph_runner.capture(model, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, torch_device, return_dict=False, use_cache=True)
else:
cuda_graph_runner = None
start_time = time.time()
for _ in range(1, max_new_tokens):
next_token = decode_one_tokens(cuda_graph_runner, next_token.unsqueeze(0), position_ids, cache_position, past_key_values)
next_token = decode_one_tokens(cuda_graph_runner, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, use_cuda_graph).to(torch_device)
inputs = torch.cat((inputs, next_token.unsqueeze(0)), dim=-1)
generated_ids[:, cache_position] = next_token.int()
tokens.append(next_token.int())
@ -163,6 +192,7 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
cache_position += 1
position_ids = cache_position.unsqueeze(0)
total_time = time.time() - start_time
tokens_generated = len(tokens)
tokens_per_second = tokens_generated / total_time
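
get_device and get_all_used_cuda_device drive the per-module placement and the per-device torch.cuda.synchronize loop in decode_one_tokens. A standalone illustration, with the two helpers copied from the diff and fed a toy device_map; only the example map is made up.

def get_device(gguf_module_key: str, device_map: dict):
    if gguf_module_key in device_map:
        return device_map[gguf_module_key]["generate_device"]
    else:
        return "cuda"

def get_all_used_cuda_device(device_map: dict):
    all_device_list = set()
    for key in device_map:
        all_device_list.add(device_map[key]["generate_device"]) if "generate_device" in device_map[key] else None
        all_device_list.add(device_map[key]["prefill_device"]) if "prefill_device" in device_map[key] else None
    if "cpu" in all_device_list:
        all_device_list.remove("cpu")
    all_device_list = list(all_device_list)
    return all_device_list

device_map = {
    "blk.0.self_attn": {"generate_device": "cuda:0", "prefill_device": "cuda:0"},
    "blk.30.self_attn": {"generate_device": "cuda:1", "prefill_device": "cuda:1"},
    "token_embd": {"generate_device": "cpu", "prefill_device": "cpu"},
}
print(get_device("blk.0.self_attn", device_map))     # cuda:0
print(sorted(get_all_used_cuda_device(device_map)))  # ['cuda:0', 'cuda:1']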

View file

@ -3,7 +3,8 @@ requires = [
"setuptools",
"torch >= 2.3.0",
"ninja",
"packaging"
"packaging",
"cpufeature"
]
build-backend = "setuptools.build_meta"

View file

@ -6,7 +6,7 @@ Author : chenxl
Date : 2024-07-27 16:15:27
Version : 1.0.0
LastEditors : chenxl
LastEditTime : 2024-07-31 09:44:46
LastEditTime : 2024-08-08 02:45:15
Adapted from:
https://github.com/Dao-AILab/flash-attention/blob/v2.6.3/setup.py
Copyright (c) 2023, Tri Dao.
@ -19,6 +19,7 @@ import re
import ast
import subprocess
import platform
import shutil
import http.client
import urllib.request
import urllib.error
@ -27,6 +28,7 @@ from packaging.version import parse
import torch.version
from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
from setuptools import setup, Extension
from cpufeature.extension import CPUFeature
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME
class CpuInstructInfo:
@ -67,6 +69,8 @@ class VersionInfo:
"""
if sys.platform.startswith("linux"):
return f'linux_{platform.uname().machine}'
elif sys.platform == "win32":
return "win_amd64"
else:
raise ValueError("Unsupported platform: {}".format(sys.platform))
@ -97,6 +101,15 @@ class VersionInfo:
return 'avx2'
raise ValueError(
"Unsupported cpu Instructions: {}".format(flags_line))
elif sys.platform == "win32":
if CPUFeature.get("AVX512bw", False):
return 'fancy'
if CPUFeature.get("AVX512f", False):
return 'avx512'
if CPUFeature.get("AVX2", False):
return 'avx2'
raise ValueError(
"Unsupported cpu Instructions: {}".format(str(CPUFeature)))
else:
raise ValueError("Unsupported platform: {}".format(sys.platform))
@ -154,7 +167,7 @@ class BuildWheelsCommand(_bdist_wheel):
wheel_path = os.path.join(self.dist_dir, archive_basename + ".whl")
print("Raw wheel path", wheel_path)
os.rename(wheel_filename, wheel_path)
shutil.move(wheel_filename, wheel_path)
except (urllib.error.HTTPError, urllib.error.URLError, http.client.RemoteDisconnected):
print("Precompiled wheel not found. Building from source...")
# If the wheel could not be downloaded, build from source
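
On Windows, setup.py now selects the CPU-instruction flavor of the prebuilt wheel via cpufeature instead of parsing /proc/cpuinfo. A minimal sketch of that probe, using the same CPUFeature keys the setup script checks; it is not the setup script itself.

# Requires: pip install cpufeature (x86 only).
from cpufeature.extension import CPUFeature

def cpu_flavor() -> str:
    if CPUFeature.get("AVX512bw", False):
        return "fancy"
    if CPUFeature.get("AVX512f", False):
        return "avx512"
    if CPUFeature.get("AVX2", False):
        return "avx2"
    raise ValueError(f"Unsupported cpu instructions: {CPUFeature}")

print(cpu_flavor())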

View file

@ -22,7 +22,7 @@
#include <cstring>
#include <type_traits>
#if defined __x86_64__ || defined __aarch64__
#if defined __x86_64__ || defined __aarch64__ || defined(_M_X64)
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
@ -225,7 +225,7 @@ bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11, int typeA, const voi
return true;
}
#if defined __x86_64__
#if defined __x86_64__ || defined(_M_X64)
#if defined HAVE_FANCY_SIMD
#undef HAVE_FANCY_SIMD
@ -1412,6 +1412,7 @@ template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
bool MulMat::set_mul_mat(int typeA, int ne00, MulMat& mm, int& row_size_q8, int) {
if (ne00 % ggml_blck_size(GGML_TYPE_Q8_K) == 0)
row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00);
switch (typeA) {

View file

@ -3,6 +3,6 @@
// Copyrigth 2024 Iwan Kawrakow.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
#ifdef __x86_64__
#if defined(__x86_64__) || defined(_M_X64)
#include "iqk_mul_mat.inc"
#endif // __x86_64__

View file

@ -3,7 +3,7 @@
// Copyrigth 2024 Iwan Kawrakow.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
#ifdef __x86_64__
#if defined(__x86_64__) || defined(_M_X64)
#define iqk_mul_mat iqk_mul_mat_zen4
#define iqk_mul_mat_moe iqk_mul_mat_moe_zen4
#include "iqk_mul_mat.inc"

View file

@ -22,19 +22,22 @@
#include "sgemm.h"
// #include <cosmo.h>
#include <cpuid.h>
// #include <cpuid.h>
// #include <libc/sysv/consts/hwcap.h>
#include <stdio.h>
#include <sys/auxv.h>
// #include <sys/auxv.h>
#include <cassert>
// #include "llamafile.h"
static const struct GemmFuncs {
typeof(llamafile_sgemm)* sgemm;
typeof(llamafile_mixmul)* mixmul;
typeof(llamafile_mixmul_iqk)* iqk_mixmul = iqk_mul_mat_moe_unsupported;
bool (*sgemm)(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
bool (*mixmul)(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
bool (*iqk_mixmul)(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
// typeof(llamafile_sgemm)* sgemm;
// typeof(llamafile_mixmul)* mixmul;
// typeof(llamafile_mixmul_iqk)* iqk_mixmul = iqk_mul_mat_moe_unsupported;
GemmFuncs() {
#ifdef __x86_64__
#if defined(__x86_64__) || defined(_M_X64)
// if (X86_HAVE(AVX)) {
// if (X86_HAVE(FMA)) {
// if (X86_HAVE(AVX2)) {
@ -86,10 +89,12 @@ static const struct GemmFuncs {
// sgemm = llamafile_sgemm_unsupported;
// mixmul = llamafile_mixmul_unsupported;
// }
#if defined(__AVX__)
#if defined(__FMA__)
#if defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)))
#if defined(__AVX2__)
#if defined(__AVX512F__)
printf("__AVX512F__\n");
#if defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__) && defined(__AVX512VNNI__) && defined(__AVX512BF16__)
// AMD Zen4+ (2023-)
sgemm = llamafile_sgemm_amd_zen4;

View file

@ -223,7 +223,7 @@ inline float32x4_t badder(float32x4_t a, float b, float32x4_t c, float32x4_t* e)
}
#endif
#if defined(__FMA__)
#if defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)))
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
template <>
inline __m256 madd(__m256 a, __m256 b, __m256 c) {

View file

@ -3,7 +3,7 @@
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
#ifdef __x86_64__
#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_mixmul llamafile_mixmul_amd_avx
#include "tinyblas_cpu_mixmul.inc"

View file

@ -3,7 +3,7 @@
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
#ifdef __x86_64__
#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_mixmul llamafile_mixmul_amd_avx2
#include "tinyblas_cpu_mixmul.inc"
#endif // __x86_64__

View file

@ -3,7 +3,7 @@
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
#ifdef __x86_64__
#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_mixmul llamafile_mixmul_amd_avx512f
#include "tinyblas_cpu_mixmul.inc"
#endif // __x86_64__

View file

@ -3,7 +3,7 @@
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
#ifdef __x86_64__
#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_mixmul llamafile_mixmul_amd_avxvnni
#include "tinyblas_cpu_mixmul.inc"
#endif // __x86_64__

View file

@ -3,7 +3,7 @@
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
#ifdef __x86_64__
#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_mixmul llamafile_mixmul_amd_fma
#include "tinyblas_cpu_mixmul.inc"
#endif // __x86_64__

View file

@ -3,7 +3,7 @@
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
#ifdef __x86_64__
#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_mixmul llamafile_mixmul_amd_zen4
#include "tinyblas_cpu_mixmul.inc"
#endif // __x86_64__

View file

@ -321,8 +321,8 @@ bool llamafile_sgemm(long m, long n, long k, const void* A, long lda, const void
assert(ith < nth);
#if QK_K == 256
#if defined(__x86_64__)
#if defined(__AVX2__) && defined(__FMA__)
#if defined(__x86_64__) || defined(_M_X64)
#if defined(__AVX2__) && (defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))))
// if (X86_CHECK(AVX2) && X86_CHECK(FMA)) {
if (Btype == GGML_TYPE_Q8_K && Ctype == GGML_TYPE_F32) {
if (iqk_mul_mat(m, n, k * QK_K, Atype, A, B, (float*)C, ldc, ith, nth)) {

View file

@ -3,7 +3,7 @@
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
#ifdef __x86_64__
#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_sgemm llamafile_sgemm_amd_avx
#include "tinyblas_cpu_sgemm.inc"
#endif // __x86_64__

View file

@ -3,7 +3,7 @@
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
#ifdef __x86_64__
#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_sgemm llamafile_sgemm_amd_avx2
#include "tinyblas_cpu_sgemm.inc"
#endif // __x86_64__

View file

@ -3,7 +3,7 @@
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
#ifdef __x86_64__
#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_sgemm llamafile_sgemm_amd_avx512f
#include "tinyblas_cpu_sgemm.inc"
#endif // __x86_64__

View file

@ -3,7 +3,7 @@
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
#ifdef __x86_64__
#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_sgemm llamafile_sgemm_amd_avxvnni
#include "tinyblas_cpu_sgemm.inc"
#endif // __x86_64__

View file

@ -3,7 +3,7 @@
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
#ifdef __x86_64__
#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_sgemm llamafile_sgemm_amd_fma
#include "tinyblas_cpu_sgemm.inc"
#endif // __x86_64__

View file

@ -3,7 +3,7 @@
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
#ifdef __x86_64__
#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_sgemm llamafile_sgemm_amd_zen4
#define iqk_mul_mat iqk_mul_mat_zen4
#include "tinyblas_cpu_sgemm.inc"