Initial commit

This commit is contained in:
chenxl 2024-07-27 16:06:58 +08:00
commit 18c42e67df
247 changed files with 53775 additions and 0 deletions

17
.gitignore vendored Normal file

@@ -0,0 +1,17 @@
__pycache__
build
.vscode
*.so
*.cache
server.db
logs
node_modules
*.nsys-rep
.vs/
*pycache*
*build/
*/third_party/*
.DS_Store
compile_commands.json
*.egg-info*
*dist/

6
.gitmodules vendored Normal file

@@ -0,0 +1,6 @@
[submodule "third_party/llama.cpp"]
path = third_party/llama.cpp
url = https://github.com/ggerganov/llama.cpp.git
[submodule "third_party/pybind11"]
path = third_party/pybind11
url = https://github.com/pybind/pybind11.git

6
.pylintrc Normal file

@@ -0,0 +1,6 @@
[MASTER]
extension-pkg-whitelist=pydantic
max-line-length=120
[MESSAGES CONTROL]
disable=missing-function-docstring

201
LICENSE Normal file

@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

11
MANIFEST.in Normal file

@@ -0,0 +1,11 @@
graft third_party
graft ktransformers
graft local_chat.py
include LICENSE README.md
prune ktransformers/website
prune ktransformers/logs
prune ktransformers.egg-info
prune third_party/llama.cpp/models
graft ktransformers/website/dist
global-exclude __pycache__
include KTransformersOps.*.so

284
README.md Normal file

@@ -0,0 +1,284 @@
<div align="center">
<!-- <h1>KTransformers</h1> -->
<p align="center">
<picture>
<img alt="DeepSeek-Coder-V2 Score" src="https://github.com/user-attachments/assets/9fa710bf-1389-46b7-b9d2-3f67b98bd7a6" width=50%>
</picture>
</p>
<h3>A Flexible Framework for Experiencing Cutting-edge LLM Inference Optimizations</h3>
<strong><a href="#show-cases">🔥 Show Cases</a> | <a href="#quick-start">🚀 Quick Start</a> | <a href="#tutorial">📃 Tutorial</a> | <a href="https://github.com/kvcache-ai/ktransformers/discussions">💬 Discussion </a> </strong>
</div>
<h2 id="intro">🎉 Introduction</h2>
KTransformers, pronounced as Quick Transformers, is designed to enhance your 🤗 <a href="https://github.com/huggingface/transformers">Transformers</a> experience with advanced kernel optimizations and placement/parallelism strategies.
<br/><br/>
KTransformers is a flexible, Python-centric framework designed with extensibility at its core.
By implementing and injecting an optimized module with a single line of code, users gain access to a Transformers-compatible
interface, RESTful APIs compliant with OpenAI and Ollama, and even a simplified ChatGPT-like web UI.
<br/><br/>
Our vision for KTransformers is to serve as a flexible platform for experimenting with innovative LLM inference optimizations. Please let us know if you need any other features.
<h2 id="show-cases">🔥 Show Cases</h2>
<h3>GPT-4-level Local VSCode Copilot on a Desktop with only 24GB VRAM</h3>
<p align="center">
https://github.com/user-attachments/assets/3f85780e-aa53-4d2f-91b2-5585c8dade85
</p>
- **Local 236B DeepSeek-Coder-V2:** Running its Q4_K_M version using only 21GB VRAM and 136GB DRAM, attainable on a local desktop machine, which scores even better than GPT4-0613 in [BigCodeBench](https://huggingface.co/blog/leaderboard-bigcodebench).
<p align="center">
<picture>
<img alt="DeepSeek-Coder-V2 Score" src="https://github.com/user-attachments/assets/81efb94f-f859-4413-b6e0-d986508ad667" width=80%>
</picture>
</p>
- **Faster Speed:** Achieving 126 tokens/s for 2K prompt prefill and 13.6 tokens/s for generation through MoE offloading and injecting advanced kernels from [Llamafile](https://github.com/Mozilla-Ocho/llamafile/tree/main) and [Marlin](https://github.com/IST-DASLab/marlin).
- **VSCode Integration:** Wrapped into an OpenAI and Ollama compatible API for seamless integration as a backend for [Tabby](https://github.com/TabbyML/tabby) and various other frontends.
<p align="center">
<!-- <img alt="Tabby integration" src="https://XXXX.png" width=55%> -->
https://github.com/user-attachments/assets/e6e27cb3-8372-44e6-8f1f-34402eae56c1
</p>
<strong>More advanced features are coming soon, so stay tuned!</strong>
<h2 id="quick-start">🚀 Quick Start</h2>
<h3>Preparation</h3>
Some preparation:
- CUDA 12.1 or above. If you don't have it yet, you may install it from [here](https://developer.nvidia.com/cuda-downloads).
<!-- ```
export PATH=/usr/local/cuda/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export CUDA_PATH=/usr/local/cuda
``` -->
- Linux-x86_64 with gcc, g++ and cmake
```sh
sudo apt-get update
sudo apt-get install gcc g++ cmake ninja-build
```
- We recommend using [Conda](https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh) to create a virtual environment with Python=3.11 to run our program.
```sh
conda create --name ktransformers python=3.11
conda activate ktransformers # you may need to run conda init and reopen shell first
```
Download source code:
```sh
git clone https://github.com/kvcache-ai/ktransformers.git
cd ktransformers
git submodule init
git submodule update
```
<h3>Local Chat</h3>
We provide a simple command-line local chat Python script that you can run for testing.
> Note that this is a very simple test tool that only supports one-round chat without any memory of previous input. If you want to try the full ability of the model, go to [RESTful API and Web UI](#id_666). We use the DeepSeek-V2-Lite-Chat-GGUF model as an example here, but we also support other models; you may replace it with any other model that you want to test.
<h4>Install</h4>
```sh
bash install.sh
```
<h4>Run Example</h4>
```shell
# Begin from the root of your cloned repo!
# Download mzwing/DeepSeek-V2-Lite-Chat-GGUF from huggingface
mkdir DeepSeek-V2-Lite-Chat-GGUF
cd DeepSeek-V2-Lite-Chat-GGUF
wget https://huggingface.co/mzwing/DeepSeek-V2-Lite-Chat-GGUF/resolve/main/DeepSeek-V2-Lite-Chat.Q4_K_M.gguf -O DeepSeek-V2-Lite-Chat.Q4_K_M.gguf
cd .. # Move to repo's root dir
# Start local chat
python ktransformers/local_chat.py --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
# If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try
# GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite
# python ktransformers/local_chat.py --model_path ./DeepSeek-V2-Lite --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
```
It features the following arguments:
- `--model_path` (required): Name of the model (such as "deepseek-ai/DeepSeek-V2-Lite-Chat", which will automatically download configs from [Hugging Face](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite)). If you already have local files, you may use that path directly to initialize the model.
>Note: <strong>.safetensors</strong> files are not required in the directory. We only need the config files to build the model and tokenizer.
- `--gguf_path` (required): Path of a directory containing GGUF files, which can be downloaded from [Hugging Face](https://huggingface.co/mzwing/DeepSeek-V2-Lite-Chat-GGUF/tree/main) (we only support q4_k_m and q8_0 for now; more formats are coming soon).
- `--optimize_rule_path` (required except for Qwen2Moe and DeepSeek-V2): Path of the YAML file containing optimization rules. There are two rule files pre-written in the [ktransformers/optimize/optimize_rules](ktransformers/optimize/optimize_rules) directory for optimizing DeepSeek-V2 and Qwen2-57B-A14, two SOTA MoE models.
- `--max_new_tokens`: Int (default=1000). Maximum number of new tokens to generate.
- `--cpu_infer`: Int (default=10). The number of CPUs used for inference. Should ideally be set to (total number of cores - 2).
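As a rough illustration of the "(total number of cores - 2)" guideline, the value can be computed at launch time. This is only a sketch: the `suggested_cpu_infer` helper is our own, and only `os.cpu_count()` from the standard library is assumed.

```python
import os

def suggested_cpu_infer() -> int:
    """Follow the '(total number of cores - 2)' guideline, with a floor of 1."""
    total = os.cpu_count() or 1
    return max(1, total - 2)

# Example: pass the computed value to local_chat.py via --cpu_infer.
args = ["--cpu_infer", str(suggested_cpu_infer())]
```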
<h3 id="supported-model"> Supported Model</h3>
| Model Name | Model Size | VRAM | Minimum DRAM | Recommended DRAM |
| ---- | ---- | ---- | ---- | ---- |
| DeepSeek-V2-q4_k_m | 133G | 24G | 136G | 192G |
| Qwen2-57B-A14B-Instruct-q4_k_m | 33G | 8G | 34G | 64G |
| DeepSeek-V2-Lite-q4_k_m | 9.7G | 3G | 13G | 16G |
More will come soon. Please let us know which models you are most interested in.
Be aware that you are subject to the corresponding model licenses when using [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V2/blob/main/LICENSE) and [Qwen](https://huggingface.co/Qwen/Qwen2-72B-Instruct/blob/main/LICENSE).
<details>
<summary>Click to show how to run other examples</summary>
* Qwen2-57B
```sh
pip install flash_attn # For Qwen2
mkdir Qwen2-57B-GGUF && cd Qwen2-57B-GGUF
wget "https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct-GGUF/resolve/main/qwen2-57b-a14b-instruct-q4_k_m.gguf?download=true" -O qwen2-57b-a14b-instruct-q4_k_m.gguf
cd ..
python ktransformers/local_chat.py --model_path Qwen/Qwen2-57B-A14B-Instruct --gguf_path ./Qwen2-57B-GGUF
# If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try
# GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct
# python ktransformers/local_chat.py --model_path ./Qwen2-57B-A14B-Instruct --gguf_path ./Qwen2-57B-GGUF
```
* DeepseekV2
```sh
mkdir DeepSeek-V2-Chat-0628-GGUF && cd DeepSeek-V2-Chat-0628-GGUF
# Download weights
wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00001-of-00004.gguf -O DeepSeek-V2-Chat-0628-Q4_K_M-00001-of-00004.gguf
wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00002-of-00004.gguf -O DeepSeek-V2-Chat-0628-Q4_K_M-00002-of-00004.gguf
wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00003-of-00004.gguf -O DeepSeek-V2-Chat-0628-Q4_K_M-00003-of-00004.gguf
wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00004-of-00004.gguf -O DeepSeek-V2-Chat-0628-Q4_K_M-00004-of-00004.gguf
cd ..
python ktransformers/local_chat.py --model_path deepseek-ai/DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF
# If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try
# GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-V2-Chat-0628
# python ktransformers/local_chat.py --model_path ./DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF
```
| model name | weights download link |
|----------|----------|
| Qwen2-57B | [Qwen2-57B-A14B-gguf-Q4K-M](https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct-GGUF/tree/main) |
| DeepseekV2-coder |[DeepSeek-Coder-V2-Instruct-gguf-Q4K-M](https://huggingface.co/LoneStriker/DeepSeek-Coder-V2-Instruct-GGUF/tree/main) |
| DeepseekV2-chat |[DeepSeek-V2-Chat-gguf-Q4K-M](https://huggingface.co/bullerwins/DeepSeek-V2-Chat-0628-GGUF/tree/main) |
| DeepseekV2-lite | [DeepSeek-V2-Lite-Chat-GGUF-Q4K-M](https://huggingface.co/mzwing/DeepSeek-V2-Lite-Chat-GGUF/tree/main) |
</details>
<!-- pin block for jump -->
<span id='id_666'>
<h3>RESTful API and Web UI</h3>
<h4>Install</h4>
[Optional] If you want to run with the website, please [compile the website](./doc/en/api/server/website.md) before executing `pip install .`
Install ktransformers from source:
```sh
pip install -r requirements-local_chat.txt
pip install . --no-build-isolation
```
Start without website:
```sh
ktransformers --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path /path/to/DeepSeek-V2-Lite-Chat-GGUF --port 10002
```
Start with website:
```sh
ktransformers --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path /path/to/DeepSeek-V2-Lite-Chat-GGUF --port 10002 --web True
```
Alternatively, if you want to start the server with Transformers as the backend, the `model_path` should contain safetensors files:
```bash
ktransformers --type transformers --model_path /mnt/data/model/Qwen2-0.5B-Instruct --port 10002 --web True
```
Access the website at [http://localhost:10002/web/index.html#/chat](http://localhost:10002/web/index.html#/chat):
<p align="center">
<picture>
<img alt="Web UI" src="https://github.com/user-attachments/assets/a8eca392-e948-4706-ba9c-743142d8a464" width=80%>
</picture>
</p>
More information about the RESTful API server can be found [here](doc/en/api/server/server.md). You can also find an example of integrating with Tabby [here](doc/en/api/server/tabby.md).
<h2 id="tutorial">📃 Brief Injection Tutorial</h2>
At the heart of KTransformers is a user-friendly, template-based injection framework.
This allows researchers to easily replace original torch modules with optimized variants. It also simplifies the process of combining multiple optimizations, allowing the exploration of their synergistic effects.
</br>
<p align="center">
<picture>
<img alt="Injection Structure" src="https://github.com/user-attachments/assets/b922180e-3e73-4b62-b5a0-5ac98d7052c5" width=50%>
</picture>
</p>
Given that vLLM already serves as a great framework for large-scale deployment optimizations, KTransformers is particularly focused on local deployments that are constrained by limited resources. We pay special attention to heterogeneous computing opportunities, such as GPU/CPU offloading of quantized models. For example, we support the efficient <a href="https://github.com/Mozilla-Ocho/llamafile/tree/main">Llamafile</a> and <a href="https://github.com/IST-DASLab/marlin">Marlin</a> kernels for CPU and GPU, respectively. More details can be found <a href="doc/en/operators/llamafile.md">here</a>.
<h3>Example Usage</h3>
To utilize the provided kernels, users only need to create a YAML-based injection template and add the call to `optimize_and_load_gguf` before using the Transformers model.
```python
with torch.device("meta"):
model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
optimize_and_load_gguf(model, optimize_rule_path, gguf_path, config)
...
generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens=1000)
```
In this example, the AutoModel is first initialized on the meta device to avoid occupying any memory resources. Then, `optimize_and_load_gguf` iterates through all sub-modules of the model, matches rules specified in your YAML rule file, and replaces them with advanced modules as specified.
After injection, the original `generate` interface is available, but we also provide a compatible `prefill_and_generate` method, which enables further optimizations like CUDAGraph to improve generation speed.
<h3>YAML Template</h3>
Below is an example of a YAML template for replacing all original Linear modules with Marlin, an advanced 4-bit quantization kernel.
```yaml
- match:
name: "^model\\.layers\\..*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types
device: "cpu" # which devices to load this module when initializing
kwargs:
generate_device: "cuda"
generate_linear_type: "QuantizedLinearMarlin"
```
Each rule in the YAML file has two parts: `match` and `replace`. The `match` part specifies which module should be replaced, and the `replace` part specifies the module to be injected into the model along with the initialization keywords.
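To make the match/replace semantics concrete, here is a minimal, framework-free sketch of the matching logic. The real `optimize_and_load_gguf` walks `torch.nn` modules and instantiates the replacement classes; everything below except the rule shape (which mirrors the YAML above) is illustrative.

```python
import re

# Toy module tree standing in for model.named_modules(): name -> class name.
modules = {
    "model.layers.0.self_attn.q_proj": "Linear",
    "model.layers.0.mlp.gate": "Linear",
    "model.norm": "RMSNorm",
}

# One rule, shaped like the YAML template above.
rule = {
    "match": {"name": r"^model\.layers\..*$", "class": "Linear"},
    "replace": {"class": "KTransformerLinear"},
}

def apply_rule(mods, rule):
    """Return {module name: replacement class} for every module the rule matches."""
    pattern = re.compile(rule["match"]["name"])
    return {
        name: rule["replace"]["class"]
        for name, cls in mods.items()
        if pattern.match(name) and cls == rule["match"]["class"]
    }

replaced = apply_rule(modules, rule)
# "model.norm" is untouched: its name does not match the regex.
```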
You can find example rule templates for optimizing DeepSeek-V2 and Qwen2-57B-A14, two SOTA MoE models, in the [ktransformers/optimize/optimize_rules](ktransformers/optimize/optimize_rules) directory. These templates are used to power the `local_chat.py` demo.
A detailed description of the injection using DeepSeek-V2 as an example is given [here](doc/en/deepseek-v2-injection.md).
<h2 id="ack">Acknowledgment and Contributors</h2>
The development of KTransformers is based on the flexible and versatile framework provided by Transformers. We also benefit from advanced kernels such as GGUF/GGML, Llamafile, and Marlin. We are planning to contribute back to the community by upstreaming our modifications.
KTransformers is actively maintained and developed by contributors from the <a href="https://madsys.cs.tsinghua.edu.cn/">MADSys group</a> at Tsinghua University and members from <a href="http://approaching.ai/">Approaching.AI</a>. We welcome new contributors to join us in making KTransformers faster and easier to use.

BIN
doc/assets/BigCodeBench.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 968 KiB

BIN
doc/assets/cpuinfer.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 145 KiB

BIN
doc/assets/website.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 502 KiB

108
doc/en/api/server/api.md Normal file

@@ -0,0 +1,108 @@
# API
- [OpenAI ChatCompletion](#openai-chatcompletion)
- [Ollama ChatCompletion](#ollama-chatcompletion)
- [OpenAI Assistant](#openai-assistant)
## OpenAI ChatCompletion
```bash
POST /v1/chat/completions
```
Generate responses based on the selected model.
### Parameters
- `messages`: An array of `message` representing all historical messages. A `message` can be from a user or model (assistant) and includes:
- `role`: Either `user` or `assistant`, indicating the creator of this message.
- `content`: The message from the user or model.
- `model`: The name of the selected model
- `stream`: Either true or false. Indicates whether to use streaming response. If true, model inference results are returned via HTTP event stream.
### Response
- Streaming response: An event stream, each event contains a `chat.completion.chunk`. `chunk.choices[0].delta.content` is the incremental output returned by the model each time.
- Non-streaming response: Not supported yet.
### Example
```bash
curl -X 'POST' \
'http://localhost:9112/v1/chat/completions' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"messages": [
{
"content": "tell a joke",
"role": "user"
}
],
"model": "Meta-Llama-3-8B-Instruct",
"stream": true
}'
```
```bash
data:{"id":"c30445e8-1061-4149-a101-39b8222e79e1","object":"chat.completion.chunk","created":1720511671,"model":"not implmented","system_fingerprint":"not implmented","usage":null,"choices":[{"index":0,"delta":{"content":"Why ","role":"assistant","name":null},"logprobs":null,"finish_reason":null}]}
data:{"id":"c30445e8-1061-4149-a101-39b8222e79e1","object":"chat.completion.chunk","created":1720511671,"model":"not implmented","system_fingerprint":"not implmented","usage":null,"choices":[{"index":0,"delta":{"content":"","role":"assistant","name":null},"logprobs":null,"finish_reason":null}]}
data:{"id":"c30445e8-1061-4149-a101-39b8222e79e1","object":"chat.completion.chunk","created":1720511671,"model":"not implmented","system_fingerprint":"not implmented","usage":null,"choices":[{"index":0,"delta":{"content":"couldn't ","role":"assistant","name":null},"logprobs":null,"finish_reason":null}]}
...
data:{"id":"c30445e8-1061-4149-a101-39b8222e79e1","object":"chat.completion.chunk","created":1720511671,"model":"not implmented","system_fingerprint":"not implmented","usage":null,"choices":[{"index":0,"delta":{"content":"two-tired!","role":"assistant","name":null},"logprobs":null,"finish_reason":null}]}
event: done
data: [DONE]
```
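A client only needs to strip the `data:` prefix of each event and read `choices[0].delta.content` to recover the incremental text. A minimal sketch of that parsing step (the `delta_content` helper is our own; the sample line abbreviates the stream above):

```python
import json
from typing import Optional

def delta_content(sse_line: str) -> Optional[str]:
    """Extract the incremental text from one 'data:' line of the event stream.

    Returns None for non-data lines and for the terminating '[DONE]' marker.
    """
    if not sse_line.startswith("data:"):
        return None
    payload = sse_line[len("data:"):].strip()
    if payload == "[DONE]":
        return None
    chunk = json.loads(payload)
    return chunk["choices"][0]["delta"]["content"]

sample = ('data:{"id":"c30445e8","object":"chat.completion.chunk",'
          '"choices":[{"index":0,"delta":{"content":"Why ","role":"assistant"}}]}')
```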
## Ollama ChatCompletion
```bash
POST /api/generate
```
Generate responses using the selected model.
### Parameters
- `prompt`: A string representing the input prompt.
- `model`: The name of the selected model
- `stream`: Either true or false. Indicates whether to use streaming responses. If true, returns the model inference results in the form of an HTTP event stream.
### Response
- Streaming response: A stream of JSON responses, each line is a JSON.
- `response`: The incremental result of the model completion.
- `done`: Whether the inference has finished.
- Non-streaming response: Not yet supported.
### Example
```bash
curl -X 'POST' \
'http://localhost:9112/api/generate' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"model": "Meta-Llama-3-8B-Instruct",
"prompt": "tell me a joke",
"stream": true
}'
```
```bash
{"model":"Meta-Llama-3-8B-Instruct","created_at":"2024-07-09 08:13:11.686513","response":"I'll ","done":false}
{"model":"Meta-Llama-3-8B-Instruct","created_at":"2024-07-09 08:13:11.729214","response":"give ","done":false}
...
{"model":"Meta-Llama-3-8B-Instruct","created_at":"2024-07-09 08:13:33.955475","response":"for","done":false}
{"model":"Meta-Llama-3-8B-Instruct","created_at":"2024-07-09 08:13:33.956795","response":"","done":true}
```
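Because each line of the stream is a standalone JSON object, a client can accumulate the full completion by concatenating the `response` fields until a line reports `done`. A hedged sketch (`collect` is our own helper; the sample lines abbreviate the stream above):

```python
import json

def collect(ndjson_lines):
    """Concatenate incremental 'response' fields until a line reports done."""
    text = []
    for line in ndjson_lines:
        obj = json.loads(line)
        text.append(obj["response"])
        if obj["done"]:
            break
    return "".join(text)

stream = [
    '{"model":"Meta-Llama-3-8B-Instruct","response":"I\'ll ","done":false}',
    '{"model":"Meta-Llama-3-8B-Instruct","response":"give ","done":false}',
    '{"model":"Meta-Llama-3-8B-Instruct","response":"","done":true}',
]
```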


@@ -0,0 +1,37 @@
# Backend Services (Server)
The Server exposes the fast heterogeneous inference capabilities of ktransformers through an API for external use.
<img src="server-arch.png" height="600" alt="Server architecture">
## API
The Server provides model inference services externally through a RESTful API, with two methods of interaction: ChatCompletion and Assistant.
- The ChatCompletion interface requires users to provide all historical dialogues at once, after which the model responds. AI service providers (such as [OpenAI](https://platform.openai.com/docs/api-reference/chat/create)) and local inference frameworks (such as [Ollama](https://github.com/ollama/ollama/blob/main/docs/api.md)) both offer the ChatCompletion interface. To ensure compatibility with OpenAI and Ollama, the Server offers APIs that are consistent with theirs. Therefore, applications currently using OpenAI and Ollama can seamlessly switch to our Server. For example: [How to use Tabby and ktransformers locally with a 236B model for code completion?](tabby.md).
- The Assistant is suitable for applications that need to reuse a series of resources and call the model. For instance, in educational applications, developers can create an Assistant named "Second Grade Math Teacher" and set an initial prompt ("You are an experienced second-grade math teacher..."), and upload relevant materials (second grade math textbooks). After creating the Assistant, the application needs to create a Thread to store the dialogues between the user and the model (Message). When calling the model, the application creates a Run to obtain the Assistant's response. Compared to ChatCompletion, the Assistant-enabled Server handles the reuse of conversational contexts and multi-turn dialogues, making model calls in complex scenarios more convenient. The [OpenAI Assistant API](https://platform.openai.com/docs/api-reference/assistants/createAssistant) introduces such an Assistant interface, and the Server provides a consistent API.
These API definitions are located in `server/api`, and their specific usage can be seen [here](api.md).
## Integrating Model Inference Frameworks
The Server uses ktransformers for model calling and inference. It also supports other inference frameworks, such as the already supported [transformers](https://huggingface.co/docs/transformers/index), and plans to support [exllamav2](https://github.com/turboderp/exllamav2). These functionalities are implemented in `server/backend`.
The model inference functionality of each framework is abstracted into a base class, `BackendInterfaceBase`. This class includes a single function, `inference`, which takes the historical dialogue (`messages`) as input and returns the model's text result. The inference function adopts an async generator design, allowing the Server to return model responses in a streaming manner.
```python
class BackendInterfaceBase:
async def inference(self, messages, **kwargs)->AsyncIterator[str]:
...
```
This inference function naturally implements the functionality of ChatCompletion because its inputs and outputs are historical dialogues and model responses, respectively. Thus, the ChatCompletion API can directly call the inference function to complete model inference.
Assistant is more complex than ChatCompletion, requiring the Server to store the Assistant's related state and call the inference function appropriately. The Server maintains a set of Assistant logic in the database, storing the Assistants, Threads, and Messages created by applications. In memory, the Server maintains a `ThreadContext` for each Thread, gathering the information related to that Thread's Assistant. When a user sends a new Message, the Server calls the `get_local_messages` function of the ThreadContext to obtain messages and then calls the inference function to get the inference results.
```python
class MyThreadContext(ThreadContext):
def get_local_messages(self):
...
```
Since different model inference frameworks have different historical dialogue input formats, `ThreadContext` and `BackendInterface` need to be used in pairs. Besides its own ktransformers, the Server also supports transformers. For integrating other model inference frameworks, refer to the implementations of `TransformersInterface` and `TransformersThreadContext` in [transformers.py](https://github.com/kvcache-ai/ktransformers-dev/blob/main/ktransformers/server/backend/interfaces/transformers.py).
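To make the pairing concrete, here is a minimal runnable sketch of a toy backend/context pair. The base classes below are simplified stand-ins for the real ones in `server/backend`, and `EchoInterface`/`EchoThreadContext` are hypothetical names used only for illustration:

```python
import asyncio
from typing import AsyncIterator

# Simplified stand-ins for the real base classes in server/backend.
class BackendInterfaceBase:
    async def inference(self, messages, **kwargs):
        raise NotImplementedError

class ThreadContext:
    def get_local_messages(self):
        raise NotImplementedError

class EchoInterface(BackendInterfaceBase):
    """Toy backend: streams the last message back word by word."""
    async def inference(self, messages, **kwargs) -> AsyncIterator[str]:
        for word in messages[-1]["content"].split():
            yield word + " "

class EchoThreadContext(ThreadContext):
    def __init__(self, history):
        self.history = history

    def get_local_messages(self):
        # Convert stored Messages into the format the paired backend expects.
        return [{"role": m["role"], "content": m["content"]} for m in self.history]

async def run_thread(ctx, backend):
    # The Server consumes the async generator and streams chunks to clients.
    return "".join([chunk async for chunk in backend.inference(ctx.get_local_messages())])

ctx = EchoThreadContext([{"role": "user", "content": "tell me a joke"}])
out = asyncio.run(run_thread(ctx, EchoInterface()))
print(out.strip())  # tell me a joke
```

A real integration would translate Thread state into the framework's prompt format in `get_local_messages` and stream model tokens from `inference`.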


@ -0,0 +1,33 @@
# How to Use Tabby and ktransformers Locally with 236B Large Models for Code Completion?
[Tabby](https://tabby.tabbyml.com/docs/welcome/) is an open-source code assistant that allows users to manually configure the backend framework and model, and use it across multiple IDEs/editors, such as VSCode and IntelliJ. Since Tabby can interface with Ollama on the framework side, and the ktransformers server provides a consistent API with Ollama, we can connect Tabby to the ktransformers server. This setup allows us to experience fast, heterogeneous inference in code completion scenarios.
1. Start ktransformers.
```bash
./ktransformers --port 9112
```
2. Install Tabby: Follow the official tutorial to install Tabby on a Linux server or Windows PC with an NVIDIA GPU [here](https://tabby.tabbyml.com/docs/quick-start/installation/linux/).
3. Configure Tabby: Create `~/.tabby/config.toml` and add the following configuration.
```toml
[model.completion.http]
kind = "ollama/completion"
api_endpoint = "http://127.0.0.1:9112/"
model_name = "DeepSeek-Coder-V2-Instruct"
prompt_template = "<fim▁begin>{prefix}<fim▁hole>{suffix}<fim▁end>" # Prompt Template
```
In this configuration, `kind` specifies that ktransformers uses the standard Ollama API to serve Tabby; `api_endpoint` matches the interface bound when launching ktransformers; `model_name` is set to the model used by ktransformers, here `DeepSeek-Coder-V2-Instruct` is the backend inference model; `prompt_template` is the model's prompt template, which requires a corresponding template for different models to use the Fill In the Middle feature properly.
Here we demonstrate the relevant configuration for Tabby using the Ollama API to provide the Completion feature. For configuration information about other functions available in Tabby, refer to [here](https://tabby.tabbyml.com/docs/administration/model/).
4. Start the Tabby service: `./tabby serve`.
<img src="run-tabby.png" alt="image-20240709112329577" style="zoom:50%;" />
After launching, you should see access to the `/api/tags` interface in the ktransformers command line (in version v0.13.0 of Tabby, this changes to access to the `/api/show/` interface).
<img src="visit-api-tags.png" alt="image-20240709111648215" style="zoom:67%;" />
5. Register a Tabby account, obtain a Token: After starting the Tabby service, open the corresponding link in a browser (as shown above at 0.0.0.0:8080), and follow the [tutorial](https://tabby.tabbyml.com/docs/quick-start/register-account/) to create a user and get a Token.
6. Start VSCode, install the Tabby extension plugin, and use the Token obtained in the previous step to connect to the Tabby Server, following [here](https://tabby.tabbyml.com/docs/extensions/installation/vscode/).
7. Open any code file and experience the fast heterogeneous inference of ktransformers.



@ -0,0 +1,32 @@
# Start with website
This document provides the necessary steps to set up and run the web service for this project.
## 1. Starting the Web Service
### 1.1. Compiling the Web Code
Before you can compile the web code, make sure you have installed [Node.js](https://nodejs.org) version 18.3 or higher.
Once npm is installed, navigate to the `ktransformers/website` directory:
```bash
cd ktransformers/website
```
Next, install the Vue CLI with the following command:
```bash
npm install @vue/cli
```
Now you can build the project:
```bash
npm run build
```
Finally you can build ktransformers with website:
```bash
cd ../../
pip install .
```


@ -0,0 +1,166 @@
# Tutorial: Heterogeneous and Local DeepSeek-V2 Inference
DeepSeek-(Code)-V2 is a series of strong mixture-of-experts (MoE) models, featuring a total of 236 billion parameters, with 21 billion parameters activated per token. This model has demonstrated remarkable reasoning capabilities across various benchmarks, positioning it as one of the SOTA open models and nearly comparable in performance to GPT-4.
<p align="center">
<picture>
<img alt="DeepSeek-Coder-V2 Score" src="../assets/BigCodeBench.png" width=80%>
</picture>
</p>
Moreover, unlike previous models that employed traditional attention mechanisms like Grouped-Query Attention (GQA), DeepSeek-V2 incorporates a novel Multi-head Latent Attention (MLA). This innovation significantly reduces the size of the KV cache required during inference, enhancing efficiency.
However, despite its efficiency, running such a large model on personal computing setups remains impractical. Official documentation for DeepSeek-V2 indicates that eight 80GB GPUs are necessary for standard inference operations, and even the scaled-down Q4_k_m version requires at least two 80GB GPUs. These requirements are beyond the reach of most individual researchers and small teams.
Nonetheless, by employing several cutting-edge optimization techniques, we have successfully operated this colossal model on a desktop computer with only 21GB of VRAM and 136GB of DRAM. In this document, we outline the specific optimizations utilized and provide a detailed tutorial on how to implement these strategies using KTransformers.
## Applied Optimizations
### Optimized MLA Operator
The following figure provides a brief overview of DeepSeek-V2 architecture. At the heart of its attention layer, DeepSeek-V2 introduces a novel MLA operator that represents the heads of key-value pairs using a common, joint compressed representation, which holds significant potential for efficiency improvements. However, the official open-source implementation of the MLA operator explicitly decompresses this compressed representation and caches the decompressed key-value pairs. This process not only enlarges the KV cache size but also diminishes inference performance.
<p align="center">
<picture>
<img alt="DeepSeek on KTransformers" src="../assets/DeepSeek-on-KTransformers.PNG" width=80%>
</picture>
</p>
To truly capitalize on the benefits of MLA, we have implemented an optimized version for inference. According to its original paper, we absorb the decompression matrices directly into the q_proj and out_proj weights. Consequently, the compressed representation does not need to be decompressed to compute the attention. This adjustment significantly reduces the KV cache size and increases the arithmetic intensity of this operator, which greatly optimizes the utilization of GPU computational power.
### Advanced Quantization Kernels
The original DeepSeek-V2 model stores its parameters in BF16 format, consuming approximately 470GB of raw storage. This exceeds the RAM capacity available on mainstream desktop computers. To address this, we leverage the well-established GGUF community's quantized weights to simplify the process for users.
However, quantized data types are not typically supported by highly-optimized BLAS packages. As a result, the original HuggingFace Transformers' Torch implementation must dequantize these tensors to supported data types before processing, which introduces unnecessary computational overhead and increases memory traffic. To overcome this, we have incorporated advanced kernels that operate directly on quantized data types, thereby optimizing inference performance.
In the current version of KTransformers, we utilize Marlin for GPU kernels and llamafile for CPU kernels. These kernels are specially designed to benefit from modern GPU architectures and modern CPU instruction extensions such as AVX512-BF16 (AMD Zen4 or newer) and AVX-VNNI (Intel Alder Lake or newer), which are tailored for quantized data types and machine learning workloads. We also use expert parallelism and other optimizations for MoE inference on the CPU based on llamafile, and refer to these kernels collectively as CPUInfer. As demonstrated in Figure 2 (cited from Marlin), Marlin can achieve a near-ideal 3.87x speedup compared to its Torch counterparts. As demonstrated in the following figure, our micro-benchmarks show that inference using CPUInfer is several times faster than Torch in low-bit representations. Note that in practical inference frameworks such as transformers, the Torch baseline uses BF16 or FP16 linear weights and therefore occupies more memory; with quantized weights, it would be even slower due to dequantization.
<p align="center">
<picture>
<img alt="CPUInfer Performance" src="../assets/cpuinfer.png" width=80%>
</picture>
</p>
### Arithmetic Intensity Guided Offloading
Storing all 236 billion parameters of a model in GPU VRAM is clearly impractical for local users. Therefore, we strategically store only the most computationally intensive parameters on the GPU. For instance, after our optimizations, the MLA operator, which contains 128 heads with a shared compressed key-value representation, shows an arithmetic intensity of 512. This makes it the most intensive operator, particularly during smaller inference batch sizes. Hence, it is allocated to the GPU to leverage the power of tensor cores.
On the other hand, as shown in Figure 1, each transformer block in DeepSeek-V2 includes 160 mixture-of-experts (MoE) experts, comprising 96% of the total parameters. However, the MoE router activates only 6 out of these 160 experts for each token, which means that only 3.75% of the MoE parameters are utilized during the decoding phase. With a batch size of one, the arithmetic intensity of the MoE operation is roughly 0.075. This operation, primarily involving a batched General Matrix-Vector Multiplication (GEMV), can thus be efficiently handled by the CPU.
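As a back-of-the-envelope check of these numbers (assuming intensity is counted as FLOPs per MoE parameter at batch size one, i.e. one multiply-add per active weight):

```python
# Hypothetical accounting sketch: 6 of 160 routed experts are active per
# token, and each active weight contributes one multiply-add (2 FLOPs).
experts_total = 160
experts_active = 6

active_fraction = experts_active / experts_total   # share of MoE params used per token
moe_intensity = 2 * active_fraction                # FLOPs per MoE parameter, batch size 1

print(f"{active_fraction:.2%} of MoE parameters active")  # 3.75%
print(f"arithmetic intensity ~ {moe_intensity}")          # 0.075
```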
Following this principle of arranging all operators by their arithmetic intensity and placing the most intensive ones in the GPU as much as possible, we prioritize positioning the MoE parameters and word embeddings computations on the CPU side to utilize its larger memory capacity. Meanwhile, the remaining parameters, including shared experts, projections in the attention module, and MLA, are stored in the GPU VRAM. As these parameters are accessed by every token, their placement on the GPU maximizes the benefits of high memory bandwidth. This configuration leads to approximately 20.7 GB of VRAM usage and 136GB DRAM memory requests if the Q4_K_M version is used, which is feasible even on a local desktop. Additionally, the placement can be adjusted according to the actual configuration, adhering to the same principle.
Moreover, as an extensible framework, KTransformers is set to support more advanced operators in future releases, continually enhancing its capability to handle diverse workloads efficiently.
## YAML Template
To implement the above optimizations in KTransformers, users need to write a YAML file containing the optimized rules.
KTransformers will iterate through all sub-modules of the model, match rules specified in the YAML rule file, and replace them with advanced modules as specified.
<p align="center">
<picture>
<img alt="Inject-Struction" src="../assets/InjectStruction.png" width=80%>
</picture>
</p>
Specifically, the following rules are used:
- Replace the Attention module with our [optimized MLA Operator](#mla).
- Replace routed experts with [CPUInfer kernels](#experts) that use Llamafile.
- Replace all Linear modules not belonging to attention with [Marlin](#linear) kernels.
<h3 id="mla">MLA</h3>
For attention module injection, we only need to match the module name used in Transformers using a regular expression and replace it with our pre-implemented module.
The YAML rule is listed below.
```yaml
- match:
name: "^model\\.layers\\..*\\.self_attn$" # regular expression
replace:
class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation
```
As we can see, each rule in the YAML file has two parts: `match` and `replace`.
The match part specifies which module should be replaced, and the replace part specifies the module to be injected into the model along with the initialization keywords.
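As an illustrative sketch (not the framework's actual traversal code), evaluating a rule's match part against a module's dotted name could look like this:

```python
import re

# Illustrative sketch of evaluating a rule's `match` clause: a module is
# replaced only if it satisfies both the `name` regex and the `class` check.
def rule_matches(match_clause, module_name, module):
    if "name" in match_clause and re.search(match_clause["name"], module_name) is None:
        return False
    if "class" in match_clause and not isinstance(module, match_clause["class"]):
        return False
    return True

attn_rule = {"name": r"^model\.layers\..*\.self_attn$"}
print(rule_matches(attn_rule, "model.layers.0.self_attn", object()))  # True
print(rule_matches(attn_rule, "model.embed_tokens", object()))        # False
```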
<h3 id="experts">Routed Experts </h3>
For routed experts, the module we inject is `KTransformersMLPExpert`, a wrapper around CPUInfer. The wrapper contains several implementations, and we need to specify keywords to tell it which implementation to use and how to use it.
In KTransformers, some modules exhibit different behaviors during prefilling and generation for better performance; `KTransformersMLPExpert` is one of them. All these special modules have a `device` keyword describing which device the module should be initialized on. Other keywords specify the behaviors during prefilling and generation and may differ across injection modules. Here, we specify which implementation to use on which device during prefilling and generation, and which device the output should be placed on.
Note that we only use these parameters when layer-wise prefilling is enabled; otherwise, prefilling is conducted with the same configuration as generation.
In the original implementation of Transformers, MoE is implemented using `nn.ModuleList`. We don't want KTransformers to iterate through all the sub-modules in the list, so we set `recursive: False` in this rule to prevent recursive injection into submodules of the current module. Here is the YAML rule:
```yaml
- match:
name: "^model\\.layers\\..*\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert parallelism
device: "cpu" # device to load this module on initialization
kwargs:
prefill_device: "cuda"
prefill_mlp_type: "MLPExpertsTorch"
generate_device: "cpu"
generate_mlp_type: "MLPCPUExperts"
out_device: "cuda"
recursive: False # don't recursively inject submodules of this module
```
If we inject the expert list as a custom module, we can't use the `nn.ModuleList` interface by default, so we need to change the forward function in the FFN module. The simplest way is to implement a new module with a custom forward function and inject it. We have implemented the new module, and the injection can be done by simply adding an injection rule. We can use `class` instead of `name` to match the module to be replaced. Here is the YAML rule:
```yaml
- match:
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
replace:
class: ktransformers.operators.experts.DeepseekV2MoEInjected # MLP module with custom forward function
```
<h3 id="linear">Other Linear Modules</h3>
For the remaining linear modules, we want to use our quantization kernels. However, we don't want to inject linear modules inside the MLA operator because we currently don't know the effect of using quantization in MLA.
So, we can change our regular expression and add a class check in the match part of the rule. Only modules matching both name and class simultaneously will be injected.
We also need to transfer some keywords similar to the injection of experts. Here is the YAML rule:
```yaml
- match:
name: "^model\\.layers\\.(?!.*self_attn).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
generate_op: "QuantizedLinearMarlin"
prefill_op: "QuantizedLinearTorch"
```
<h3 id="precompute-buffers">Pre-compute Buffers</h3>
The original model is initialized on the meta device. The rotary embedding module pre-computes some buffers when initializing, which has no effect and doesn't compute anything when using the meta device. Therefore, we need to compute the buffers when loading the model. For convenience, we inject the rotary embedding module with our custom module, which performs pre-computations when loading. Here is the YAML rule:
```yaml
- match:
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbedding
```
## Wrap Your Custom Module
We have implemented some modules, but you may need to inject your custom module using KTransformers.
The only thing you need to do is wrap your custom module and write YAML files. We provide a base operator specifying interfaces an injection module should have. You only need to inherit from that module and change the `__init__`, `forward`, or `load` function as needed.
- The `__init__` function of the base operator maintains the necessary information for injection and execution of the KTransformers framework. To override this function, subclass modules need to call the base operator's `__init__` function in their own initializer.
- The `forward` function is the function called by torch during inference; here the module author is free to implement it for higher performance.
- The `load` function is used to load all parameters of this module. The default implementation is to call the `load` function of all submodules. You can modify this function to customize its loading method and explicitly control the loading of its submodules.
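The contract above can be sketched as follows. Note that the class names here are hypothetical stand-ins, not KTransformers' real API:

```python
# Hypothetical sketch of the injection contract described above; the
# class names are illustrative, not the framework's real base operator.
class BaseOperator:
    def __init__(self, orig_module=None, device="cuda", **kwargs):
        # The real base __init__ records the info KTransformers needs for
        # injection and execution; subclasses must call it.
        self.orig_module = orig_module
        self.device = device

    def load(self):
        # Default: delegate loading to all submodules (no-op in this sketch).
        pass

class ScaledProjection(BaseOperator):
    def __init__(self, orig_module=None, scale=1.0, **kwargs):
        super().__init__(orig_module, **kwargs)  # keep the base bookkeeping
        self.scale = scale
        self.loaded = False

    def load(self):
        # Custom loading: a real module would read (possibly quantized)
        # weights here instead of delegating to submodules.
        self.loaded = True

    def forward(self, x):
        # Called during inference; this is where optimized kernels would go.
        return [v * self.scale for v in x]

op = ScaledProjection(scale=3.0)
op.load()
print(op.forward([1.0, 2.0]))  # [3.0, 6.0]
```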




@ -0,0 +1,35 @@
# Llamafile Operators Documentation
## Llamafile Sgemm
The Llamafile Sgemm module is an efficient implementation of general matrix multiplication (GEMM) extracted from the great [Llamafile project](https://github.com/Mozilla-Ocho/llamafile/blob/main/llamafile/sgemm.cpp).
This module optimizes performance by utilizing various processor-specific instruction sets. For instance, it checks for different x86 instruction sets such as AVX, FMA, and AVX512, leveraging these advanced instructions to accelerate computation.
Additionally, the Llamafile Sgemm module supports multiple quantization types, including q8_0, q6_k, and q5_k, among others. This adaptability to different hardware capabilities ensures the most advanced instructions are used in any given computing environment, achieving high computational efficiency. For more information, you can view the [Llamafile Sgemm module](https://github.com/Mozilla-Ocho/llamafile/blob/main/llamafile/sgemm.cpp) on GitHub.
## CPUInfer
To power Llamafile and many future CPU kernels without the original GGML framework, we developed a simple CPUInfer multi-threaded execution framework. It currently leverages the Llamafile Sgemm module to implement key operators such as linear layers, MLP, and MoE, and will be extended to support many other operators. These operators are fundamental components for building large models. CPUInfer features a backend work-stealing thread pool and asynchronous task queue execution logic to efficiently offload parts of model parameters to the CPU, thereby maintaining high inference performance. It supports adjustments based on hardware capabilities or user configurations, providing enhanced inference performance and making it an ideal tool for running deep learning models on CPUs.
## Expert-Parallel MoE
The MoE module's performance can be enhanced by using custom kernels that utilize **expert parallelism**. Since the routed experts are independently computable, we can utilize this inherent parallelism to speed up MoE computations. Specifically, we can allocate each expert MLP to a separate thread group, allowing for the simultaneous computation of all routed experts. This approach of expert parallelism significantly boosts MoE performance by minimizing the frequency of global synchronizations and reducing kernel launch overhead compared to sequential expert computation.
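The idea can be illustrated with a pure-Python stand-in (the real CPUInfer kernels are C++ running on a work-stealing thread pool; the names below are illustrative):

```python
from concurrent.futures import ThreadPoolExecutor

# Illustrative stand-in for one routed expert's MLP.
def expert_mlp(expert_id, x):
    return [v * (expert_id + 1) for v in x]

def moe_forward(x, routed_experts, weights):
    # Each routed expert is independent, so they can run in parallel
    # workers; outputs are then combined with the router weights.
    with ThreadPoolExecutor(max_workers=len(routed_experts)) as pool:
        outs = list(pool.map(lambda e: expert_mlp(e, x), routed_experts))
    dim = len(x)
    return [sum(w * o[i] for w, o in zip(weights, outs)) for i in range(dim)]

y = moe_forward([1.0, 1.0], routed_experts=[0, 2], weights=[0.5, 0.5])
print(y)  # [2.0, 2.0]
```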
## Microbenchmark
Our evaluations were conducted on an Intel(R) Xeon(R) Gold 6454S processor, utilizing real parameters from the DeepSeek-Coder-V2-Instruct model.
### Linear Projection
The performance of the linear layer was assessed using an Attention Output Projection with dimensions of [5120, 16384]. Here, the input was a vector of 16384 dimensions, and the output was a vector of 5120 dimensions.
![Linear_projection_time](Linear_projection_time.png)
As we can see, in half-precision floating-point formats (fp16 and bf16), CPUInfer's performance exceeded that of Torch by 1.7 and 1.5 times, respectively. For 8-bit quantization, CPUInfer (supporting q8_0) and Torch (supporting qint8) demonstrated nearly equivalent performance. However, CPUInfer employs a more refined scaling approach, using a different factor for each group (in q8_0 quantization, every 32 numbers form one group), whereas Torch uses basic per-tensor quantization, which can lead to significant precision loss. Furthermore, CPUInfer's capability to use lower-bit quantization enhances inference speed in specific scenarios.
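The precision difference between the two scaling approaches can be sketched in a few lines (a simplified q8_0-style scheme for illustration, not the exact GGUF bit layout):

```python
# Simplified sketch of group-wise 8-bit quantization in the spirit of
# q8_0 (one scale per group of 32 values) versus per-tensor scaling.
GROUP = 32

def quantize_q8_0(values):
    groups = []
    for i in range(0, len(values), GROUP):
        chunk = values[i:i + GROUP]
        scale = max(abs(v) for v in chunk) / 127 or 1.0
        groups.append((scale, [round(v / scale) for v in chunk]))
    return groups

def dequantize_q8_0(groups):
    return [q * s for s, qs in groups for q in qs]

def quantize_per_tensor(values):
    scale = max(abs(v) for v in values) / 127 or 1.0
    return scale, [round(v / scale) for v in values]

x = [0.01 * i for i in range(64)]
x[40] = 8.0  # one outlier: with per-group scales it only hurts its own group

group_rt = dequantize_q8_0(quantize_q8_0(x))
s, q = quantize_per_tensor(x)
tensor_rt = [qi * s for qi in q]

# Compare round-trip error on the outlier-free first group.
err_group = max(abs(a - b) for a, b in zip(x[:GROUP], group_rt[:GROUP]))
err_tensor = max(abs(a - b) for a, b in zip(x[:GROUP], tensor_rt[:GROUP]))
assert err_group < err_tensor  # the outlier degrades the per-tensor scale everywhere
```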
### MoE
In the MoE module, each token selected 6 experts out of 160 for computation, with input and output dimensions of 5120, and an intermediate dimension of 1536.
![Combined_MoE_time_per_layer](Combined_MoE_time_per_layer.png)
For half-precision floating points and 8-bit quantization formats, CPUInfer's generation performance was 2.5 and 3.2 times better than Torch, respectively. Moreover, using the 8-bit quantization format, CPUInfer achieved faster prefill speeds compared to Torch, with shorter prompts highlighting a more pronounced performance difference.

doc/zh/api/server/api.md Normal file

@ -0,0 +1,115 @@
# API
- [OpenAI ChatCompletion](#openai-chatcompletion)
- [Ollama ChatCompletion](#ollama-chatcompletion)
- [OpenAI Assistant](#openai-assistant)
## OpenAI ChatCompletion
```bash
POST /v1/chat/completions
```
Generates a response using the selected model.
### Parameters
- `messages`: an array of `message` objects containing all historical messages. A `message` represents a message from either the user or the model (assistant). A `message` contains:
  - `role`: either `user` or `assistant`, indicating the creator of this message.
  - `content`: the content of the user's or the model's message.
- `model`: the name of the selected model.
- `stream`: true or false, indicating whether to use streaming. If true, the model's inference results are returned as an HTTP event stream.
### Response
- Streaming response: an event stream in which each event carries a `chat.completion.chunk`; `chunk.choices[0].delta.content` is the incremental output of the model.
- Non-streaming response: not yet supported.
### Example
```bash
curl -X 'POST' \
'http://localhost:9112/v1/chat/completions' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"messages": [
{
"content": "tell a joke",
"role": "user"
}
],
"model": "Meta-Llama-3-8B-Instruct",
"stream": true
}'
```
```bash
data:{"id":"c30445e8-1061-4149-a101-39b8222e79e1","object":"chat.completion.chunk","created":1720511671,"model":"not implmented","system_fingerprint":"not implmented","usage":null,"choices":[{"index":0,"delta":{"content":"Why ","role":"assistant","name":null},"logprobs":null,"finish_reason":null}]}
data:{"id":"c30445e8-1061-4149-a101-39b8222e79e1","object":"chat.completion.chunk","created":1720511671,"model":"not implmented","system_fingerprint":"not implmented","usage":null,"choices":[{"index":0,"delta":{"content":"","role":"assistant","name":null},"logprobs":null,"finish_reason":null}]}
data:{"id":"c30445e8-1061-4149-a101-39b8222e79e1","object":"chat.completion.chunk","created":1720511671,"model":"not implmented","system_fingerprint":"not implmented","usage":null,"choices":[{"index":0,"delta":{"content":"couldn't ","role":"assistant","name":null},"logprobs":null,"finish_reason":null}]}
...
data:{"id":"c30445e8-1061-4149-a101-39b8222e79e1","object":"chat.completion.chunk","created":1720511671,"model":"not implmented","system_fingerprint":"not implmented","usage":null,"choices":[{"index":0,"delta":{"content":"two-tired!","role":"assistant","name":null},"logprobs":null,"finish_reason":null}]}
event: done
data: [DONE]
```
## Ollama ChatCompletion
```bash
POST /api/generate
```
Generates a response using the selected model.
### Parameters
- `prompt`: a string containing the input prompt.
- `model`: the name of the selected model.
- `stream`: true or false, indicating whether to use streaming. If true, the model's inference results are returned as an HTTP event stream.
### Response
- Streaming response: a stream of JSON objects, one per line.
  - `response`: the incremental result of the model completion.
  - `done`: whether the inference has finished.
- Non-streaming response: not yet supported.
### Example
```bash
curl -X 'POST' \
'http://localhost:9112/api/generate' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"model": "Meta-Llama-3-8B-Instruct",
"prompt": "tell me a joke",
"stream": true
}'
```
```bash
{"model":"Meta-Llama-3-8B-Instruct","created_at":"2024-07-09 08:13:11.686513","response":"I'll ","done":false}
{"model":"Meta-Llama-3-8B-Instruct","created_at":"2024-07-09 08:13:11.729214","response":"give ","done":false}
...
{"model":"Meta-Llama-3-8B-Instruct","created_at":"2024-07-09 08:13:33.955475","response":"for","done":false}
{"model":"Meta-Llama-3-8B-Instruct","created_at":"2024-07-09 08:13:33.956795","response":"","done":true}
```




@ -0,0 +1,41 @@
# Backend Services (Server)
The Server exposes ktransformers' fast heterogeneous inference capabilities to external callers through an API.
<img src="server-arch.png" height="600" alt="Server architecture">
## API
The Server provides model inference services externally through a RESTful API, with two methods of interaction: ChatCompletion and Assistant.
- The ChatCompletion interface requires users to provide the entire dialogue history at once, after which the model responds. AI service providers (such as [OpenAI](https://platform.openai.com/docs/api-reference/chat/create)) and local inference frameworks (such as [Ollama](https://github.com/ollama/ollama/blob/main/docs/api.md)) both offer a ChatCompletion interface. To ensure compatibility with OpenAI and Ollama, the Server offers APIs consistent with theirs, so applications currently using OpenAI or Ollama can switch seamlessly to our Server. For example: [How to use Tabby and ktransformers locally with a 236B model for code completion?](tabby.md).
- The Assistant interface suits applications that reuse a series of resources across model calls. For instance, in an educational application, a developer can create an Assistant named "Second Grade Math Teacher", set an initial prompt ("You are an experienced second-grade math teacher..."), and upload relevant materials (second-grade math textbooks). After creating the Assistant, the application creates a Thread to store the dialogue between the user and the model (Messages). When calling the model, the application creates a Run to obtain the Assistant's reply. Compared to ChatCompletion, a Server implementing Assistant handles context reuse and multi-turn dialogue on behalf of the application, making model calls in complex scenarios more convenient. The [OpenAI Assistant API](https://platform.openai.com/docs/api-reference/assistants/createAssistant) defines such an Assistant interface, and the Server provides a consistent API.
These API definitions are located in `server/api`; see [here](api.md) for their specific usage.
## Integrating Model Inference Frameworks
The Server uses ktransformers for model calling and inference. It also supports other inference frameworks, such as the already supported [transformers](https://huggingface.co/docs/transformers/index), and plans to support [exllamav2](https://github.com/turboderp/exllamav2). These functionalities are implemented in `server/backend`.
The model inference functionality of each framework is abstracted into a base class, `BackendInterfaceBase`. This class includes a single function, `inference`, whose input is the historical dialogue (`messages`) and whose output is the model's text result. The inference function adopts an async generator design, allowing the Server to stream the model's responses.
```python
class BackendInterfaceBase:
    async def inference(self, messages, **kwargs)->AsyncIterator[str]:
        ...
```
Because its input and output are the dialogue history and the model's reply, this inference function naturally implements ChatCompletion, so the ChatCompletion API can call the inference function directly to complete model inference.
Assistant is much more complex than ChatCompletion: the Server must store the Assistant's related state and call the inference function appropriately. The Server maintains a set of Assistant logic in its database, storing the Assistants, Threads, and Messages created by applications. In memory, the Server maintains a `ThreadContext` for each Thread, gathering the information related to that Thread's Assistant. When a user sends a new Message, the Server calls the ThreadContext's `get_local_messages` function to obtain messages and then calls the inference function to get the inference result.
```python
class MyThreadContext(ThreadContext):
    def get_local_messages(self):
        ...
```
Since different model inference frameworks expect different dialogue-history input formats, `ThreadContext` and `BackendInterface` must be used in pairs. Besides its own ktransformers, the Server also supports transformers. To integrate another model inference framework, refer to the implementations of `TransformersInterface` and `TransformersThreadContext` in [transformers.py](https://github.com/kvcache-ai/ktransformers-dev/blob/main/ktransformers/server/backend/interfaces/transformers.py).


@ -0,0 +1,34 @@
# How to Use Tabby and ktransformers Locally with a 236B Model for Code Completion?
[Tabby](https://tabby.tabbyml.com/docs/welcome/) is an open-source code assistant that lets users manually configure the backend framework and model and use it across multiple IDEs/editors, such as VSCode and IntelliJ. Since Tabby can interface with Ollama on the framework side, and the ktransformers server provides an API consistent with Ollama's, we can connect Tabby to the ktransformers server and experience fast heterogeneous inference in code completion scenarios.
1. Start ktransformers.
```bash
./ktransformers --port 9112
```
2. Install Tabby: Follow the official tutorial to [install Tabby](https://tabby.tabbyml.com/docs/quick-start/installation/linux/) on a Linux server or a Windows PC with an NVIDIA GPU.
3. Configure Tabby: Create `~/.tabby/config.toml` and add the following configuration.
```toml
[model.completion.http]
kind = "ollama/completion"
api_endpoint = "http://127.0.0.1:9112/"
model_name = "DeepSeek-Coder-V2-Instruct"
prompt_template = "<fim▁begin>{prefix}<fim▁hole>{suffix}<fim▁end>" # Prompt Template
```
In this configuration, `kind` specifies that ktransformers serves Tabby through the standard Ollama API; `api_endpoint` matches the port bound when ktransformers was launched; `model_name` is set to the model used by ktransformers, here `DeepSeek-Coder-V2-Instruct` as the backend inference model; `prompt_template` is the model's prompt template, and each model needs its corresponding template for the Fill In the Middle feature to work properly.
This demonstrates the configuration for Tabby's Completion feature via the Ollama API; for configuration of Tabby's other optional features, refer to [here](https://tabby.tabbyml.com/docs/administration/model/).
4. Start the Tabby service: `./tabby serve`.
<img src="run-tabby.png" alt="image-20240709112329577" style="zoom:50%;" />
After launching, you should see accesses to the `/api/tags` interface in the ktransformers command-line output (in Tabby v0.13.0, this becomes the `/api/show/` interface).
<img src="visit-api-tags.png" alt="image-20240709111648215" style="zoom:67%;" />
5. Register a Tabby account and obtain a Token: After starting the Tabby service, open the corresponding link in a browser (0.0.0.0:8080 as shown above) and follow the [tutorial](https://tabby.tabbyml.com/docs/quick-start/register-account/) to create a user and get a Token.
6. Start VSCode, install the Tabby extension, and use the Token obtained in the previous step to connect to the Tabby Server, following [here](https://tabby.tabbyml.com/docs/extensions/installation/vscode/).
7. Open any code file and experience the fast heterogeneous inference of ktransformers.



@ -0,0 +1,32 @@
# Start with website
This document provides the necessary steps to set up and run the web service for this project.
## 1. Starting the Web Service
### 1.1. Compiling the Web Code
Before you can compile the web code, make sure you have installed [Node.js](https://nodejs.org) version 18.3 or higher (npm ships with Node.js).
Once Node.js is installed, navigate to the `ktransformers/website` directory:
```bash
cd ktransformers/website
```
Next, install the Vue CLI with the following command:
```bash
npm install @vue/cli
```
Now you can build the project:
```bash
npm run build
```
Finally, you can build ktransformers together with the website:
```bash
cd ../../
pip install .
```

install.sh

@ -0,0 +1,25 @@
#!/bin/bash
set -e
# clear build dirs
rm -rf ktransformers/ktransformers_ext/build
rm -rf ktransformers/ktransformers_ext/cuda/build
rm -rf ktransformers/ktransformers_ext/cuda/dist
rm -rf ktransformers/ktransformers_ext/cuda/*.egg-info
echo "Installing python dependencies from requirements-local_chat.txt"
pip install -r requirements-local_chat.txt
echo "Installing ktransformers cpuinfer"
mkdir -p ktransformers/ktransformers_ext/build
cd ktransformers/ktransformers_ext/build
cmake ..
cmake --build . --config Release
echo "Installing ktransformers gpu kernel, this may take a while, please wait"
sleep 3
cd ../cuda
python setup.py install
cd ../../..
echo "Installation completed successfully"


@ -0,0 +1 @@
__version__ = "0.1.0"


@ -0,0 +1,37 @@
log:
dir: "logs"
file: "lexllama.log"
#log level: debug, info, warn, error, crit
level: "debug"
backup_count: -1
server:
ip: 0.0.0.0
port: 12456
db:
type: "sqllite"
database: "server.db"
host: "./"
pool_size: 10
user:
secret_key: "981f1dd2a44e27d68759d0252a486568ed43480b4e616a26e3af3709c3a7ce73"
algorithm: "HS256"
model:
# type: transformers
type: ktransformers
name: DeepSeek-Coder-V2-Instruct
path: /mnt/data/model/DeepSeek-Coder-V2-Instruct/
gguf_path: /mnt/data/model/DeepSeek-Coder-V2-GGUF-WJH/
device: cuda:0
web:
mount: False
open_cross_domain: True
ext:
cpu_infer: 10
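A sketch of how the server might map this YAML onto typed settings. The dict literal below stands in for the parsed file (e.g. `yaml.safe_load` with PyYAML) so the sketch has no third-party dependency, and the dataclass is illustrative, not the server's actual settings class:

```python
from dataclasses import dataclass

# Stand-in for yaml.safe_load(open("config.yaml")); values mirror the config above.
raw = {
    "server": {"ip": "0.0.0.0", "port": 12456},
    "model": {"type": "ktransformers", "name": "DeepSeek-Coder-V2-Instruct",
              "device": "cuda:0"},
    "web": {"mount": False, "open_cross_domain": True},
}

@dataclass
class ServerConfig:
    ip: str
    port: int

# Each top-level YAML section becomes one settings object.
server_cfg = ServerConfig(**raw["server"])
```

Nesting sections this way keeps unrelated settings (server, db, model, web) independently overridable.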


@ -0,0 +1,46 @@
[loggers]
keys=root,uvicorn,uvicornError,uvicornAccess
[handlers]
keys=consoleHandler,fileHandler
[formatters]
keys=detailedFormatter
[logger_root]
level=INFO
handlers=consoleHandler
[logger_uvicorn]
level=INFO
handlers=consoleHandler,fileHandler
qualname=uvicorn
propagate=0
[logger_uvicornError]
level=ERROR
handlers=consoleHandler,fileHandler
qualname=uvicorn.error
propagate=0
[logger_uvicornAccess]
level=INFO
handlers=consoleHandler,fileHandler
qualname=uvicorn.access
propagate=0
[handler_consoleHandler]
class=StreamHandler
level=INFO
formatter=detailedFormatter
args=(sys.stdout,)
[handler_fileHandler]
class=logging.FileHandler
level=INFO
formatter=detailedFormatter
args=('uvicorn_logs.log', 'a')
[formatter_detailedFormatter]
format=%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s
datefmt=%Y-%m-%d %H:%M:%S
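The INI hierarchy above can also be expressed with `logging.config.dictConfig`. A trimmed sketch covering just the `uvicorn` logger, with the format string copied from the file and the handler reduced to console output so the sketch writes no log file:

```python
import logging
import logging.config

logging.config.dictConfig({
    "version": 1,
    "disable_existing_loggers": False,
    "formatters": {
        "detailed": {
            "format": "%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s",
            "datefmt": "%Y-%m-%d %H:%M:%S",
        }
    },
    "handlers": {
        "console": {"class": "logging.StreamHandler", "level": "INFO",
                    "formatter": "detailed", "stream": "ext://sys.stdout"}
    },
    "loggers": {
        # propagate=False, as in the INI, keeps uvicorn records out of root
        "uvicorn": {"level": "INFO", "handlers": ["console"], "propagate": False}
    },
})

uvicorn_logger = logging.getLogger("uvicorn")
```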


@ -0,0 +1,169 @@
cmake_minimum_required(VERSION 3.16)
project(cpuinfer_ext VERSION 0.1.0)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
set(CMAKE_BUILD_TYPE "Release")
include(CheckCXXCompilerFlag)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
option(LLAMA_NATIVE "llama: enable -march=native flag" ON)
# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
# feel free to update the Makefile for your architecture and send a pull request or issue
message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
if (MSVC)
string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)
message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
else ()
set(CMAKE_GENERATOR_PLATFORM_LWR "")
endif ()
if (NOT MSVC)
if (LLAMA_STATIC)
add_link_options(-static)
if (MINGW)
add_link_options(-static-libgcc -static-libstdc++)
endif()
endif()
if (LLAMA_GPROF)
add_compile_options(-pg)
endif()
endif()
set(ARCH_FLAGS "")
if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
message(STATUS "ARM detected")
if (MSVC)
add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
add_compile_definitions(__ARM_NEON)
add_compile_definitions(__ARM_FEATURE_FMA)
set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
if (GGML_COMPILER_SUPPORT_DOTPROD)
add_compile_definitions(__ARM_FEATURE_DOTPROD)
endif ()
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
endif ()
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
else()
check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
list(APPEND ARCH_FLAGS -mfp16-format=ieee)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
# Raspberry Pi 1, Zero
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
# Android armeabi-v7a
list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
else()
# Raspberry Pi 2
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
endif()
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
# Android arm64-v8a
# Raspberry Pi 3, 4, Zero 2 (32-bit)
list(APPEND ARCH_FLAGS -mno-unaligned-access)
endif()
endif()
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
message(STATUS "x86 detected")
if (MSVC)
# instruction set detection for MSVC only
if (LLAMA_NATIVE)
include(cmake/FindSIMD.cmake)
endif ()
if (LLAMA_AVX512)
list(APPEND ARCH_FLAGS /arch:AVX512)
# MSVC has no compile-time flags enabling specific
# AVX512 extensions, neither it defines the
# macros corresponding to the extensions.
# Do it manually.
if (LLAMA_AVX512_VBMI)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
endif()
if (LLAMA_AVX512_VNNI)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
endif()
elseif (LLAMA_AVX2)
list(APPEND ARCH_FLAGS /arch:AVX2)
elseif (LLAMA_AVX)
list(APPEND ARCH_FLAGS /arch:AVX)
endif()
else()
if (LLAMA_NATIVE)
list(APPEND ARCH_FLAGS -march=native)
endif()
if (LLAMA_F16C)
list(APPEND ARCH_FLAGS -mf16c)
endif()
if (LLAMA_FMA)
list(APPEND ARCH_FLAGS -mfma)
endif()
if (LLAMA_AVX)
list(APPEND ARCH_FLAGS -mavx)
endif()
if (LLAMA_AVX2)
list(APPEND ARCH_FLAGS -mavx2)
endif()
if (LLAMA_AVX512)
list(APPEND ARCH_FLAGS -mavx512f)
list(APPEND ARCH_FLAGS -mavx512bw)
endif()
if (LLAMA_AVX512_VBMI)
list(APPEND ARCH_FLAGS -mavx512vbmi)
endif()
if (LLAMA_AVX512_VNNI)
list(APPEND ARCH_FLAGS -mavx512vnni)
endif()
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
message(STATUS "PowerPC detected")
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
else()
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
#TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
endif()
else()
message(STATUS "Unknown architecture")
endif()
find_package(CUDA REQUIRED)
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/pybind11 ${CMAKE_CURRENT_BINARY_DIR}/third_party/pybind11)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/llama.cpp ${CMAKE_CURRENT_BINARY_DIR}/third_party/llama.cpp)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party)
include_directories("${CUDA_INCLUDE_DIRS}")
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} SOURCE_DIR1)
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/cpu_backend SOURCE_DIR2)
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/operators/llamafile SOURCE_DIR3)
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/llamafile SOURCE_DIR4)
set(ALL_SOURCES ${SOURCE_DIR1} ${SOURCE_DIR2} ${SOURCE_DIR3} ${SOURCE_DIR4})
message(STATUS "ALL_SOURCES: ${ALL_SOURCES}")
pybind11_add_module(${PROJECT_NAME} MODULE ${ALL_SOURCES})
target_link_libraries(${PROJECT_NAME} PRIVATE llama)
target_link_libraries(${PROJECT_NAME} PRIVATE "/usr/local/cuda/lib64/libcudart.so")


@ -0,0 +1,111 @@
#!/usr/bin/env python
# coding=utf-8
'''
Description :
Author : chenht2022
Date : 2024-07-25 10:31:59
Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-07-25 10:32:51
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch
def bench_linear(quant_mode: str):
with torch.inference_mode(mode=True):
input_size = 16384
output_size = 5120
stride = 16
layer_num = 10
CPUInfer = cpuinfer_ext.CPUInfer(64)
warm_up_iter = 1000
test_iter = 10000
hidden_type = 30 # ggml_type::GGML_TYPE_BF16
if quant_mode == "fp32":
proj_type = 0 # ggml_type::GGML_TYPE_F32
bytes_per_elem = 4.000000
elif quant_mode == "fp16":
proj_type = 1 # ggml_type::GGML_TYPE_F16
bytes_per_elem = 2.000000
elif quant_mode == "bf16":
proj_type = 30 # ggml_type::GGML_TYPE_BF16
bytes_per_elem = 2.000000
elif quant_mode == "q8_0":
proj_type = 8 # ggml_type::GGML_TYPE_Q8_0
bytes_per_elem = 1.062500
elif quant_mode == "q6_k":
proj_type = 14 # ggml_type::GGML_TYPE_Q6_K
bytes_per_elem = 0.820312
elif quant_mode == "q5_k_m":
proj_type = 13 # ggml_type::GGML_TYPE_Q5_K
bytes_per_elem = 0.687500
elif quant_mode == "q4_k_m":
proj_type = 12 # ggml_type::GGML_TYPE_Q4_K
bytes_per_elem = 0.562500
elif quant_mode == "q3_k_m":
proj_type = 11 # ggml_type::GGML_TYPE_Q3_K
bytes_per_elem = 0.429688
elif quant_mode == "q2_k":
proj_type = 10 # ggml_type::GGML_TYPE_Q2_K
bytes_per_elem = 0.328125
elif quant_mode == "iq3_xs":
proj_type = 21 # ggml_type::GGML_TYPE_IQ3_S
bytes_per_elem = 0.429688
elif quant_mode == "iq2_xxs":
proj_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
bytes_per_elem = 0.257812
else:
assert(False)
linears = []
projs = []
for _ in range(layer_num):
proj = torch.randn((output_size, input_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
config = cpuinfer_ext.linear.LinearConfig(input_size, output_size, stride, proj.data_ptr(), proj_type, hidden_type)
linear = cpuinfer_ext.linear.Linear(config)
projs.append(proj)
linears.append(linear)
# warm up
for i in range(warm_up_iter):
linear = linears[i % layer_num]
input = torch.randn((1, input_size), dtype=torch.bfloat16).contiguous()
output = torch.empty((1, output_size), dtype=torch.bfloat16).contiguous()
CPUInfer.submit(linear.forward, input.data_ptr(), output.data_ptr())
CPUInfer.sync()
# test
total_time = 0
for i in range(test_iter):
linear = linears[i % layer_num]
input = torch.randn((1, input_size), dtype=torch.bfloat16).contiguous()
output = torch.empty((1, output_size), dtype=torch.bfloat16).contiguous()
start = time.perf_counter()
CPUInfer.submit(linear.forward, input.data_ptr(), output.data_ptr())
CPUInfer.sync()
end = time.perf_counter()
total_time += end - start
print('Quant mode: ', quant_mode)
print('Time(s): ', total_time)
print('Iteration: ', test_iter)
print('Time(us) per iteration: ', total_time / test_iter * 1000000)
print('Bandwidth: ', input_size * output_size * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
print('')
bench_linear("fp32")
bench_linear("fp16")
bench_linear("bf16")
bench_linear("q8_0")
bench_linear("q6_k")
bench_linear("q5_k_m")
bench_linear("q4_k_m")
bench_linear("q3_k_m")
bench_linear("q2_k")
# Not supported on __x86_64__
# bench_linear("iq3_xs")
# bench_linear("iq2_xxs")
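The bandwidth figure printed by this benchmark is just weight bytes streamed per iteration divided by wall time. The helper below reproduces that arithmetic standalone; the shape and `bytes_per_elem` match the script, while the iteration count and timing are made-up illustrative numbers:

```python
def linear_bandwidth_gbs(input_size: int, output_size: int,
                         bytes_per_elem: float, iters: int,
                         total_time_s: float) -> float:
    # Each iteration reads the whole weight matrix once:
    # input_size * output_size elements at bytes_per_elem bytes each.
    total_bytes = input_size * output_size * bytes_per_elem * iters
    return total_bytes / total_time_s / 1e9

# e.g. a q4_k_m projection (0.5625 B/elem) at the benchmark's 16384x5120 shape,
# with a hypothetical 10000 iterations taking 10 s:
bw = linear_bandwidth_gbs(16384, 5120, 0.5625, 10000, 10.0)
```

Since the activations are tiny relative to the weights, this number approximates sustained memory bandwidth.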


@ -0,0 +1,85 @@
#!/usr/bin/env python
# coding=utf-8
'''
Description :
Author : chenht2022
Date : 2024-07-25 10:31:59
Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-07-25 10:32:48
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import os, sys
import time
import torch
import torch.nn.quantized as nnq
def bench_linear(quant_mode: str):
with torch.inference_mode(mode=True):
input_size = 16384
output_size = 5120
layer_num = 10
warm_up_iter = 1000
test_iter = 10000
if quant_mode == "fp32":
proj_type = torch.float32
bytes_per_elem = 4.000000
elif quant_mode == "fp16":
proj_type = torch.float16
bytes_per_elem = 2.000000
elif quant_mode == "bf16":
proj_type = torch.bfloat16
bytes_per_elem = 2.000000
elif quant_mode == "qint8":
proj_type = torch.qint8
bytes_per_elem = 1.000000
else:
assert(False)
projs = []
for _ in range(layer_num):
proj = torch.randn((output_size, input_size), dtype = torch.float32, device = "cuda").to("cpu").contiguous()
if quant_mode == "qint8":
scale, zero_point = 0.1, 0 # Adjust scale and zero_point based on your dataset
proj_q = torch.quantize_per_tensor(proj, scale, zero_point, torch.qint8)
quantized_layer = nnq.Linear(input_size, output_size)
quantized_layer.set_weight_bias(proj_q, None)
projs.append(quantized_layer)
else:
projs.append(proj.to(proj_type))
# warm up
for i in range(warm_up_iter):
input = torch.randn((1, input_size), dtype=torch.float32).contiguous()
if quant_mode == "qint8":
input_q = torch.quantize_per_tensor(input, scale, zero_point, torch.quint8)
quantized_layer = projs[i % layer_num]
t_output = quantized_layer(input_q)
else:
t_output = torch.mm(input.to(proj_type), projs[i % layer_num].t())
# test
total_time = 0
for i in range(test_iter):
input = torch.randn((1, input_size), dtype=torch.float32).contiguous()
start = time.perf_counter()
if quant_mode == "qint8":
input_q = torch.quantize_per_tensor(input, scale, zero_point, torch.quint8)
quantized_layer = projs[i % layer_num]
t_output = quantized_layer(input_q)
else:
t_output = torch.mm(input.to(proj_type), projs[i % layer_num].t())
end = time.perf_counter()
total_time += end - start
print('Quant mode: ', quant_mode)
print('Time(s): ', total_time)
print('Iteration: ', test_iter)
print('Time(us) per iteration: ', total_time / test_iter * 1000000)
print('Bandwidth: ', input_size * output_size * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
print('')
bench_linear("fp32")
bench_linear("fp16")
bench_linear("bf16")
bench_linear("qint8")
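`torch.quantize_per_tensor` above uses affine quantization: `q = round(x / scale) + zero_point`, inverted by `dequantize`. A dependency-free sketch of that scheme (pure Python, not torch's implementation; the clamp range is the int8 one):

```python
def quantize(x: float, scale: float, zero_point: int,
             qmin: int = -128, qmax: int = 127) -> int:
    # Affine quantization: clamp(round(x / scale) + zero_point)
    q = round(x / scale) + zero_point
    return max(qmin, min(qmax, q))

def dequantize(q: int, scale: float, zero_point: int) -> float:
    return (q - zero_point) * scale

x = 1.234
q = quantize(x, scale=0.1, zero_point=0)
x_hat = dequantize(q, scale=0.1, zero_point=0)
```

The round-trip error is at most `scale / 2`, which is why the scripts note that `scale` should be tuned to the data range.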


@ -0,0 +1,140 @@
#!/usr/bin/env python
# coding=utf-8
'''
Description :
Author : chenht2022
Date : 2024-07-16 10:43:18
Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-07-25 10:32:55
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch
def bench_mlp(quant_mode: str):
with torch.inference_mode(mode=True):
hidden_size = 5120
intermediate_size = 3072
stride = 16
layer_num = 10
CPUInfer = cpuinfer_ext.CPUInfer(64)
warm_up_iter = 1000
test_iter = 10000
hidden_type = 30 # ggml_type::GGML_TYPE_BF16
if quant_mode == "fp32":
gate_type = 0 # ggml_type::GGML_TYPE_F32
up_type = 0 # ggml_type::GGML_TYPE_F32
down_type = 0 # ggml_type::GGML_TYPE_F32
bytes_per_elem = 4.000000
elif quant_mode == "fp16":
gate_type = 1 # ggml_type::GGML_TYPE_F16
up_type = 1 # ggml_type::GGML_TYPE_F16
down_type = 1 # ggml_type::GGML_TYPE_F16
bytes_per_elem = 2.000000
elif quant_mode == "bf16":
gate_type = 30 # ggml_type::GGML_TYPE_BF16
up_type = 30 # ggml_type::GGML_TYPE_BF16
down_type = 30 # ggml_type::GGML_TYPE_BF16
bytes_per_elem = 2.000000
elif quant_mode == "q8_0":
gate_type = 8 # ggml_type::GGML_TYPE_Q8_0
up_type = 8 # ggml_type::GGML_TYPE_Q8_0
down_type = 8 # ggml_type::GGML_TYPE_Q8_0
bytes_per_elem = 1.062500
elif quant_mode == "q6_k":
gate_type = 14 # ggml_type::GGML_TYPE_Q6_K
up_type = 14 # ggml_type::GGML_TYPE_Q6_K
down_type = 14 # ggml_type::GGML_TYPE_Q6_K
bytes_per_elem = 0.820312
elif quant_mode == "q5_k_m":
gate_type = 13 # ggml_type::GGML_TYPE_Q5_K
up_type = 13 # ggml_type::GGML_TYPE_Q5_K
down_type = 14 # ggml_type::GGML_TYPE_Q6_K
bytes_per_elem = 0.731771
elif quant_mode == "q4_k_m":
gate_type = 12 # ggml_type::GGML_TYPE_Q4_K
up_type = 12 # ggml_type::GGML_TYPE_Q4_K
down_type = 14 # ggml_type::GGML_TYPE_Q6_K
bytes_per_elem = 0.648437
elif quant_mode == "q3_k_m":
gate_type = 11 # ggml_type::GGML_TYPE_Q3_K
up_type = 11 # ggml_type::GGML_TYPE_Q3_K
down_type = 13 # ggml_type::GGML_TYPE_Q5_K
bytes_per_elem = 0.515625
elif quant_mode == "q2_k":
gate_type = 10 # ggml_type::GGML_TYPE_Q2_K
up_type = 10 # ggml_type::GGML_TYPE_Q2_K
down_type = 11 # ggml_type::GGML_TYPE_Q3_K
bytes_per_elem = 0.328125
elif quant_mode == "iq3_xs":
gate_type = 21 # ggml_type::GGML_TYPE_IQ3_S
up_type = 21 # ggml_type::GGML_TYPE_IQ3_S
down_type = 21 # ggml_type::GGML_TYPE_IQ3_S
bytes_per_elem = 0.429688
elif quant_mode == "iq2_xxs":
gate_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
up_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
down_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
bytes_per_elem = 0.257812
else:
assert(False)
mlps = []
gate_projs = []
up_projs = []
down_projs = []
for _ in range(layer_num):
gate_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
up_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
down_proj = torch.randn((hidden_size, intermediate_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
config = cpuinfer_ext.mlp.MLPConfig(hidden_size, intermediate_size, stride, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type)
mlp = cpuinfer_ext.mlp.MLP(config)
gate_projs.append(gate_proj)
up_projs.append(up_proj)
down_projs.append(down_proj)
mlps.append(mlp)
# warm up
for i in range(warm_up_iter):
mlp = mlps[i % layer_num]
input = torch.randn((1, hidden_size), dtype=torch.bfloat16).contiguous()
output = torch.empty((1, hidden_size), dtype=torch.bfloat16).contiguous()
CPUInfer.submit(mlp.forward, input.data_ptr(), output.data_ptr())
CPUInfer.sync()
# test
total_time = 0
for i in range(test_iter):
mlp = mlps[i % layer_num]
input = torch.randn((1, hidden_size), dtype=torch.bfloat16).contiguous()
output = torch.empty((1, hidden_size), dtype=torch.bfloat16).contiguous()
start = time.perf_counter()
CPUInfer.submit(mlp.forward, input.data_ptr(), output.data_ptr())
CPUInfer.sync()
end = time.perf_counter()
total_time += end - start
print('Quant mode: ', quant_mode)
print('Time(s): ', total_time)
print('Iteration: ', test_iter)
print('Time(us) per iteration: ', total_time / test_iter * 1000000)
print('Bandwidth: ', hidden_size * intermediate_size * 3 * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
print('')
bench_mlp("fp32")
bench_mlp("fp16")
bench_mlp("bf16")
bench_mlp("q8_0")
bench_mlp("q6_k")
bench_mlp("q5_k_m")
bench_mlp("q4_k_m")
bench_mlp("q3_k_m")
bench_mlp("q2_k")
# Not supported on __x86_64__
# bench_mlp("iq3_xs")
# bench_mlp("iq2_xxs")
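For the mixed quantization modes above (e.g. `q4_k_m`), `bytes_per_elem` is the average over the three projections, since gate and up use one type while down uses another. A quick check of the figures in the script, using the per-element sizes listed in the linear benchmark:

```python
# per-element sizes (bytes) for the relevant ggml quant types
Q4_K = 0.562500   # gate_proj and up_proj in q4_k_m
Q5_K = 0.687500   # gate_proj and up_proj in q5_k_m
Q6_K = 0.820312   # down_proj in both modes

q4_k_m_avg = (2 * Q4_K + Q6_K) / 3  # average bytes per weight element
q5_k_m_avg = (2 * Q5_K + Q6_K) / 3
```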


@ -0,0 +1,130 @@
#!/usr/bin/env python
# coding=utf-8
'''
Description :
Author : chenht2022
Date : 2024-07-16 10:43:18
Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-07-25 10:32:53
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import os, sys
import time
import torch
import torch.nn.quantized as nnq
def act_fn(x):
return x / (1.0 + torch.exp(-x))
def bench_mlp(quant_mode: str):
with torch.inference_mode(mode=True):
hidden_size = 5120
intermediate_size = 3072
layer_num = 10
warm_up_iter = 1000
test_iter = 10000
if quant_mode == "fp32":
proj_type = torch.float32
bytes_per_elem = 4.000000
elif quant_mode == "fp16":
proj_type = torch.float16
bytes_per_elem = 2.000000
elif quant_mode == "bf16":
proj_type = torch.bfloat16
bytes_per_elem = 2.000000
elif quant_mode == "qint8":
proj_type = torch.qint8
bytes_per_elem = 1.000000
else:
assert(False)
gate_projs = []
up_projs = []
down_projs = []
for _ in range(layer_num):
gate_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
up_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
down_proj = torch.randn((hidden_size, intermediate_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
if quant_mode == "qint8":
scale, zero_point = 0.1, 0 # Adjust scale and zero_point based on your dataset
gate_proj_q = torch.quantize_per_tensor(gate_proj, scale, zero_point, torch.qint8)
quantized_gate = nnq.Linear(hidden_size, intermediate_size)
quantized_gate.set_weight_bias(gate_proj_q, None)
up_proj_q = torch.quantize_per_tensor(up_proj, scale, zero_point, torch.qint8)
quantized_up = nnq.Linear(hidden_size, intermediate_size)
quantized_up.set_weight_bias(up_proj_q, None)
down_proj_q = torch.quantize_per_tensor(down_proj, scale, zero_point, torch.qint8)
quantized_down = nnq.Linear(intermediate_size, hidden_size)
quantized_down.set_weight_bias(down_proj_q, None)
gate_projs.append(quantized_gate)
up_projs.append(quantized_up)
down_projs.append(quantized_down)
else:
gate_projs.append(gate_proj.to(proj_type))
up_projs.append(up_proj.to(proj_type))
down_projs.append(down_proj.to(proj_type))
# warm up
for i in range(warm_up_iter):
input = torch.randn((1, hidden_size), dtype=torch.float32).contiguous()
if quant_mode == "qint8":
input_q = torch.quantize_per_tensor(input, scale, zero_point, torch.quint8)
quantized_gate = gate_projs[i % layer_num]
gate_buf = quantized_gate(input_q)
quantized_up = up_projs[i % layer_num]
                up_buf = quantized_up(input_q)
gate_buf = gate_buf.dequantize()
up_buf = up_buf.dequantize()
intermediate = act_fn(gate_buf) * up_buf
intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8)
quantized_down = down_projs[i % layer_num]
t_output = quantized_down(intermediate_q)
else:
gate_proj = gate_projs[i%layer_num]
up_proj = up_projs[i%layer_num]
down_proj = down_projs[i%layer_num]
gate_buf = torch.mm(input.to(proj_type), gate_proj.t())
up_buf = torch.mm(input.to(proj_type), up_proj.t())
intermediate = act_fn(gate_buf) * up_buf
t_output = torch.mm(intermediate.to(proj_type), down_proj.t())
# test
total_time = 0
for i in range(test_iter):
input = torch.randn((1, hidden_size), dtype=torch.float32).contiguous()
start = time.perf_counter()
if quant_mode == "qint8":
input_q = torch.quantize_per_tensor(input, scale, zero_point, torch.quint8)
quantized_gate = gate_projs[i % layer_num]
gate_buf = quantized_gate(input_q)
quantized_up = up_projs[i % layer_num]
                up_buf = quantized_up(input_q)
gate_buf = gate_buf.dequantize()
up_buf = up_buf.dequantize()
intermediate = act_fn(gate_buf) * up_buf
intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8)
quantized_down = down_projs[i % layer_num]
t_output = quantized_down(intermediate_q)
else:
gate_proj = gate_projs[i%layer_num]
up_proj = up_projs[i%layer_num]
down_proj = down_projs[i%layer_num]
gate_buf = torch.mm(input.to(proj_type), gate_proj.t())
up_buf = torch.mm(input.to(proj_type), up_proj.t())
intermediate = act_fn(gate_buf) * up_buf
t_output = torch.mm(intermediate.to(proj_type), down_proj.t())
end = time.perf_counter()
total_time += end - start
print('Quant mode: ', quant_mode)
print('Time(s): ', total_time)
print('Iteration: ', test_iter)
print('Time(us) per iteration: ', total_time / test_iter * 1000000)
print('Bandwidth: ', hidden_size * intermediate_size * 3 * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
print('')
bench_mlp("fp32")
bench_mlp("fp16")
bench_mlp("bf16")
bench_mlp("qint8")


@ -0,0 +1,154 @@
#!/usr/bin/env python
# coding=utf-8
'''
Description :
Author : chenht2022
Date : 2024-07-25 10:32:05
Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-07-25 10:33:00
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch
def bench_moe(quant_mode: str):
with torch.inference_mode(mode=True):
expert_num = 10
hidden_size = 5120
intermediate_size = 1536
stride = 16
group_min_len = 10
group_max_len = 1024
n_routed_experts = 6
layer_num = 10
qlen = 1
CPUInfer = cpuinfer_ext.CPUInfer(64)
warm_up_iter = 1000
test_iter = 10000
hidden_type = 30 # ggml_type::GGML_TYPE_BF16
if quant_mode == "fp32":
gate_type = 0 # ggml_type::GGML_TYPE_F32
up_type = 0 # ggml_type::GGML_TYPE_F32
down_type = 0 # ggml_type::GGML_TYPE_F32
bytes_per_elem = 4.000000
elif quant_mode == "fp16":
gate_type = 1 # ggml_type::GGML_TYPE_F16
up_type = 1 # ggml_type::GGML_TYPE_F16
down_type = 1 # ggml_type::GGML_TYPE_F16
bytes_per_elem = 2.000000
elif quant_mode == "bf16":
gate_type = 30 # ggml_type::GGML_TYPE_BF16
up_type = 30 # ggml_type::GGML_TYPE_BF16
down_type = 30 # ggml_type::GGML_TYPE_BF16
bytes_per_elem = 2.000000
elif quant_mode == "q8_0":
gate_type = 8 # ggml_type::GGML_TYPE_Q8_0
up_type = 8 # ggml_type::GGML_TYPE_Q8_0
down_type = 8 # ggml_type::GGML_TYPE_Q8_0
bytes_per_elem = 1.062500
elif quant_mode == "q6_k":
gate_type = 14 # ggml_type::GGML_TYPE_Q6_K
up_type = 14 # ggml_type::GGML_TYPE_Q6_K
down_type = 14 # ggml_type::GGML_TYPE_Q6_K
bytes_per_elem = 0.820312
elif quant_mode == "q5_k_m":
gate_type = 13 # ggml_type::GGML_TYPE_Q5_K
up_type = 13 # ggml_type::GGML_TYPE_Q5_K
down_type = 14 # ggml_type::GGML_TYPE_Q6_K
bytes_per_elem = 0.731771
elif quant_mode == "q4_k_m":
gate_type = 12 # ggml_type::GGML_TYPE_Q4_K
up_type = 12 # ggml_type::GGML_TYPE_Q4_K
down_type = 14 # ggml_type::GGML_TYPE_Q6_K
bytes_per_elem = 0.648437
elif quant_mode == "q3_k_m":
gate_type = 11 # ggml_type::GGML_TYPE_Q3_K
up_type = 11 # ggml_type::GGML_TYPE_Q3_K
down_type = 13 # ggml_type::GGML_TYPE_Q5_K
bytes_per_elem = 0.515625
elif quant_mode == "q2_k":
gate_type = 10 # ggml_type::GGML_TYPE_Q2_K
up_type = 10 # ggml_type::GGML_TYPE_Q2_K
down_type = 11 # ggml_type::GGML_TYPE_Q3_K
bytes_per_elem = 0.328125
elif quant_mode == "iq3_xs":
gate_type = 21 # ggml_type::GGML_TYPE_IQ3_S
up_type = 21 # ggml_type::GGML_TYPE_IQ3_S
down_type = 21 # ggml_type::GGML_TYPE_IQ3_S
bytes_per_elem = 0.429688
elif quant_mode == "iq2_xxs":
gate_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
up_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
down_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
bytes_per_elem = 0.257812
else:
assert(False)
moes = []
gate_projs = []
up_projs = []
down_projs = []
for _ in range(layer_num):
gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
config = cpuinfer_ext.moe.MOEConfig(expert_num, n_routed_experts, hidden_size, intermediate_size, stride, group_min_len, group_max_len, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type)
moe = cpuinfer_ext.moe.MOE(config)
gate_projs.append(gate_proj)
up_projs.append(up_proj)
down_projs.append(down_proj)
moes.append(moe)
expert_ids = torch.randint(0, expert_num, (layer_num, qlen, n_routed_experts), dtype=torch.int64, device = "cuda").to("cpu").contiguous()
weights = torch.rand((layer_num, qlen, n_routed_experts), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
output = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
# warm up
for i in range(warm_up_iter):
CPUInfer.submit(moes[i % layer_num].forward,
qlen,
n_routed_experts,
expert_ids[i % layer_num].data_ptr(),
weights[i % layer_num].data_ptr(),
input[i % layer_num].data_ptr(),
output[i % layer_num].data_ptr())
CPUInfer.sync()
# test
start = time.perf_counter()
for i in range(test_iter):
CPUInfer.submit(moes[i % layer_num].forward,
qlen,
n_routed_experts,
expert_ids[i % layer_num].data_ptr(),
weights[i % layer_num].data_ptr(),
input[i % layer_num].data_ptr(),
output[i % layer_num].data_ptr())
CPUInfer.sync()
end = time.perf_counter()
total_time = end - start
print('Quant mode: ', quant_mode)
print('Time(s): ', total_time)
print('Iteration: ', test_iter)
print('Time(us) per iteration: ', total_time / test_iter * 1000000)
print('Bandwidth: ', hidden_size * intermediate_size * 3 * n_routed_experts * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
print('')
bench_moe("fp32")
bench_moe("fp16")
bench_moe("bf16")
bench_moe("q8_0")
bench_moe("q6_k")
bench_moe("q5_k_m")
bench_moe("q4_k_m")
bench_moe("q3_k_m")
bench_moe("q2_k")
# Not supported on __x86_64__
# bench_moe("iq3_xs")
# bench_moe("iq2_xxs")
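The routing that `MOE.forward` performs, i.e. run each selected expert on the token and sum the outputs weighted by the router scores, can be sketched in plain Python. The experts here are trivial stand-in callables, not the real kernels:

```python
def moe_forward(x, expert_ids, weights, experts):
    # Weighted sum of the selected experts' outputs, as in the MOE.forward
    # call above: output = sum_i weights[i] * experts[expert_ids[i]](x)
    out = 0.0
    for eid, w in zip(expert_ids, weights):
        out += w * experts[eid](x)
    return out

# toy stand-ins: expert k scales its input by (k + 1)
experts = [lambda x, k=k: (k + 1) * x for k in range(4)]
y = moe_forward(2.0, expert_ids=[0, 2], weights=[0.25, 0.75], experts=experts)
```

Only `n_routed_experts` of the `expert_num` experts run per token, which is why the bandwidth formula in the script multiplies by `n_routed_experts` rather than `expert_num`.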


@ -0,0 +1,163 @@
#!/usr/bin/env python
# coding=utf-8
'''
Description :
Author : chenht2022
Date : 2024-07-25 10:32:05
Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-07-25 10:32:57
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import os, sys
import time
import torch
import time

import torch
import torch.nn.quantized as nnq


def act_fn(x):
    # SiLU: x * sigmoid(x)
    return x / (1.0 + torch.exp(-x))


def bench_moe(quant_mode: str):
    with torch.inference_mode(mode=True):
        expert_num = 10
        hidden_size = 5120
        intermediate_size = 1536
        n_routed_experts = 6
        layer_num = 10
        warm_up_iter = 1000
        test_iter = 10000
        if quant_mode == "fp32":
            proj_type = torch.float32
            bytes_per_elem = 4.000000
        elif quant_mode == "fp16":
            proj_type = torch.float16
            bytes_per_elem = 2.000000
        elif quant_mode == "bf16":
            proj_type = torch.bfloat16
            bytes_per_elem = 2.000000
        elif quant_mode == "qint8":
            proj_type = torch.qint8
            bytes_per_elem = 1.000000
            scale, zero_point = 0.1, 0  # adjust scale and zero_point based on your dataset
        else:
            raise ValueError(f"unsupported quant_mode: {quant_mode}")
        gate_projs = []
        up_projs = []
        down_projs = []
        for _ in range(layer_num):
            gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
            up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
            down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
            if quant_mode == "qint8":
                quantized_gate_proj = []
                quantized_up_proj = []
                quantized_down_proj = []
                for i in range(expert_num):
                    gate_proj_q = torch.quantize_per_tensor(gate_proj[i], scale, zero_point, torch.qint8)
                    quantized_gate = nnq.Linear(hidden_size, intermediate_size)
                    quantized_gate.set_weight_bias(gate_proj_q, None)
                    quantized_gate_proj.append(quantized_gate)
                    up_proj_q = torch.quantize_per_tensor(up_proj[i], scale, zero_point, torch.qint8)
                    quantized_up = nnq.Linear(hidden_size, intermediate_size)
                    quantized_up.set_weight_bias(up_proj_q, None)
                    quantized_up_proj.append(quantized_up)
                    down_proj_q = torch.quantize_per_tensor(down_proj[i], scale, zero_point, torch.qint8)
                    quantized_down = nnq.Linear(intermediate_size, hidden_size)
                    quantized_down.set_weight_bias(down_proj_q, None)
                    quantized_down_proj.append(quantized_down)
                gate_projs.append(quantized_gate_proj)
                up_projs.append(quantized_up_proj)
                down_projs.append(quantized_down_proj)
            else:
                gate_projs.append(gate_proj.to(proj_type))
                up_projs.append(up_proj.to(proj_type))
                down_projs.append(down_proj.to(proj_type))
        # warm up
        for i in range(warm_up_iter):
            expert_ids = torch.randint(0, expert_num, (n_routed_experts,), dtype=torch.int64).contiguous()
            weights = torch.rand((n_routed_experts,), dtype=torch.float32).contiguous()
            input = torch.randn((1, hidden_size), dtype=torch.float32).contiguous()
            gate_proj = gate_projs[i % layer_num]
            up_proj = up_projs[i % layer_num]
            down_proj = down_projs[i % layer_num]
            if quant_mode == "qint8":
                input_q = torch.quantize_per_tensor(input, scale, zero_point, torch.quint8)
                t_output = torch.zeros((1, hidden_size), dtype=torch.float32).contiguous()
                for j, expert_id in enumerate(expert_ids):
                    gate_buf = gate_proj[expert_id](input_q).dequantize()
                    up_buf = up_proj[expert_id](input_q).dequantize()
                    intermediate = act_fn(gate_buf) * up_buf
                    intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8)
                    expert_output = down_proj[expert_id](intermediate_q).dequantize()
                    t_output += weights[j] * expert_output
            else:
                t_output = torch.zeros((1, hidden_size), dtype=proj_type).contiguous()
                for j, expert_id in enumerate(expert_ids):
                    gate_buf = torch.mm(input.to(proj_type), gate_proj[expert_id].t())
                    up_buf = torch.mm(input.to(proj_type), up_proj[expert_id].t())
                    intermediate = act_fn(gate_buf) * up_buf
                    expert_output = torch.mm(intermediate.to(proj_type), down_proj[expert_id].t())
                    t_output += weights[j] * expert_output
        # test
        total_time = 0
        for i in range(test_iter):
            expert_ids = torch.randint(0, expert_num, (n_routed_experts,), dtype=torch.int64).contiguous()
            weights = torch.rand((n_routed_experts,), dtype=torch.float32).contiguous()
            input = torch.randn((1, hidden_size), dtype=torch.float32).contiguous()
            start = time.perf_counter()
            gate_proj = gate_projs[i % layer_num]
            up_proj = up_projs[i % layer_num]
            down_proj = down_projs[i % layer_num]
            if quant_mode == "qint8":
                input_q = torch.quantize_per_tensor(input, scale, zero_point, torch.quint8)
                t_output = torch.zeros((1, hidden_size), dtype=torch.float32).contiguous()
                for j, expert_id in enumerate(expert_ids):
                    gate_buf = gate_proj[expert_id](input_q).dequantize()
                    up_buf = up_proj[expert_id](input_q).dequantize()
                    intermediate = act_fn(gate_buf) * up_buf
                    intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8)
                    expert_output = down_proj[expert_id](intermediate_q).dequantize()
                    t_output += weights[j] * expert_output
            else:
                t_output = torch.zeros((1, hidden_size), dtype=proj_type).contiguous()
                for j, expert_id in enumerate(expert_ids):
                    gate_buf = torch.mm(input.to(proj_type), gate_proj[expert_id].t())
                    up_buf = torch.mm(input.to(proj_type), up_proj[expert_id].t())
                    intermediate = act_fn(gate_buf) * up_buf
                    expert_output = torch.mm(intermediate.to(proj_type), down_proj[expert_id].t())
                    t_output += weights[j] * expert_output
            end = time.perf_counter()
            total_time += end - start
        print('Quant mode: ', quant_mode)
        print('Time(s): ', total_time)
        print('Iteration: ', test_iter)
        print('Time(us) per iteration: ', total_time / test_iter * 1000000)
        print('Bandwidth: ', hidden_size * intermediate_size * 3 * n_routed_experts * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
        print('')


bench_moe("fp32")
bench_moe("fp16")
bench_moe("bf16")
bench_moe("qint8")
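The bandwidth figure printed by `bench_moe` counts only the expert weight bytes streamed per iteration: each routed expert reads three projection matrices (gate, up, down). As a standalone sanity check, the same number can be recomputed directly; the function name below is illustrative, not part of the benchmark:

```python
def moe_bandwidth_gb_s(hidden_size, intermediate_size, n_routed_experts,
                       bytes_per_elem, test_iter, total_time):
    # Per token, each routed expert reads gate and up projections of shape
    # (intermediate_size x hidden_size) and a down projection of shape
    # (hidden_size x intermediate_size): 3 * hidden * intermediate elements.
    bytes_streamed = (hidden_size * intermediate_size * 3
                      * n_routed_experts * bytes_per_elem * test_iter)
    return bytes_streamed / total_time / 1e9

# e.g. the fp16 configuration above, assuming a hypothetical 5-second run:
print(moe_bandwidth_gb_s(5120, 1536, 6, 2.0, 10000, 5.0))
```

Note the formula intentionally ignores activations and router weights, which are negligible next to the expert matrices at these sizes.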


@ -0,0 +1,100 @@
/**
* @Description :
* @Author : chenht2022
* @Date : 2024-07-22 02:03:05
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:33:34
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "backend.h"

Backend::Backend(int thread_num) {
    thread_num_ = thread_num;
    thread_state_.resize(thread_num);
    for (int i = 0; i < thread_num; i++) {
        thread_state_[i].curr = std::make_unique<std::atomic<int>>();
        thread_state_[i].status = std::make_unique<std::atomic<ThreadStatus>>(ThreadStatus::WAITING);
    }
    workers_.resize(thread_num);
    for (int i = 1; i < thread_num; i++) {
        workers_[i] = std::thread(&Backend::worker_thread, this, i);
    }
}

Backend::~Backend() {
    for (int i = 0; i < thread_num_; i++) {
        thread_state_[i].status->store(ThreadStatus::EXIT, std::memory_order_release);
    }
    for (int i = 1; i < thread_num_; i++) {
        if (workers_[i].joinable()) {
            workers_[i].join();
        }
    }
}

int Backend::get_thread_num() {
    return thread_num_;
}

void Backend::do_work_stealing_job(int task_num, std::function<void(int)> func) {
    func_ = func;
    int base = task_num / thread_num_;
    int remain = task_num % thread_num_;
    thread_state_[0].end = base + (0 < remain);
    for (int i = 1; i < thread_num_; i++) {
        thread_state_[i].curr->store(thread_state_[i - 1].end, std::memory_order_relaxed);
        thread_state_[i].end = thread_state_[i - 1].end + base + (i < remain);
        thread_state_[i].status->store(ThreadStatus::WORKING, std::memory_order_release);
    }
    thread_state_[0].curr->store(0, std::memory_order_relaxed);
    thread_state_[0].status->store(ThreadStatus::WORKING, std::memory_order_release);
    process_tasks(0);
    for (int i = 1; i < thread_num_; i++) {
        while (thread_state_[i].status->load(std::memory_order_acquire) == ThreadStatus::WORKING) {
        }
    }
}

void Backend::process_tasks(int thread_id) {
    // Drain this thread's own range first.
    while (true) {
        int task_id = thread_state_[thread_id].curr->fetch_add(1, std::memory_order_acq_rel);
        if (task_id >= thread_state_[thread_id].end) {
            break;
        }
        func_(task_id);
    }
    // Then steal remaining tasks from the other threads' ranges.
    for (int t_offset = 1; t_offset < thread_num_; t_offset++) {
        int t_i = (thread_id + t_offset) % thread_num_;
        if (thread_state_[t_i].status->load(std::memory_order_acquire) != ThreadStatus::WORKING) {
            continue;
        }
        while (true) {
            int task_id = thread_state_[t_i].curr->fetch_add(1, std::memory_order_acq_rel);
            if (task_id >= thread_state_[t_i].end) {
                break;
            }
            func_(task_id);
        }
    }
    thread_state_[thread_id].status->store(ThreadStatus::WAITING, std::memory_order_release);
}

void Backend::worker_thread(int thread_id) {
    auto start = std::chrono::steady_clock::now();
    while (true) {
        ThreadStatus status = thread_state_[thread_id].status->load(std::memory_order_acquire);
        if (status == ThreadStatus::WORKING) {
            process_tasks(thread_id);
            start = std::chrono::steady_clock::now();
        } else if (status == ThreadStatus::WAITING) {
            // Spin for up to 50 ms for low latency, then back off to 1 ms sleeps.
            auto now = std::chrono::steady_clock::now();
            auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(now - start).count();
            if (duration > 50) {
                std::this_thread::sleep_for(std::chrono::milliseconds(1));
            }
        } else if (status == ThreadStatus::EXIT) {
            return;
        }
    }
}
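`do_work_stealing_job` splits `task_num` tasks into contiguous per-thread ranges: every thread gets `task_num / thread_num` tasks, and the first `task_num % thread_num` threads get one extra. A thread that finishes its own range then steals from the others via the shared atomic cursors. The partitioning can be sketched in Python (a model of the C++ above, not part of the codebase):

```python
def partition_tasks(task_num, thread_num):
    # Mirrors do_work_stealing_job: thread i covers the half-open range
    # [start_i, end_i); each thread gets `base` tasks and the first
    # `remain` threads get one extra.
    base, remain = divmod(task_num, thread_num)
    ranges, start = [], 0
    for i in range(thread_num):
        end = start + base + (1 if i < remain else 0)
        ranges.append((start, end))
        start = end
    return ranges
```

With 10 tasks over 3 threads this yields `[(0, 4), (4, 7), (7, 10)]`, matching the cumulative `end` values the C++ stores in `thread_state_`.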


@ -0,0 +1,50 @@
/**
* @Description :
* @Author : chenht2022
* @Date : 2024-07-22 02:03:05
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:33:38
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#ifndef CPUINFER_BACKEND_H
#define CPUINFER_BACKEND_H
#include <atomic>
#include <condition_variable>
#include <cstdio>
#include <functional>
#include <mutex>
#include <thread>
#include <vector>
enum ThreadStatus {
WORKING,
WAITING,
EXIT,
};
struct ThreadState {
std::unique_ptr<std::atomic<ThreadStatus>> status;
std::unique_ptr<std::atomic<int>> curr;
int end;
};
class Backend {
public:
Backend(int);
~Backend();
int get_thread_num();
void do_work_stealing_job(int, std::function<void(int)>);
private:
int thread_num_;
std::vector<ThreadState> thread_state_; // [thread_num]
std::function<void(int)> func_;
std::vector<std::thread> workers_;
void process_tasks(int);
void worker_thread(int);
};
#endif


@ -0,0 +1,57 @@
/**
* @Description :
* @Author : chenht2022
* @Date : 2024-07-16 10:43:18
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:33:42
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#ifndef CPUINFER_CPUINFER_H
#define CPUINFER_CPUINFER_H
#include <atomic>
#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>
#include "backend.h"
#include "task_queue.h"
#include "llama.cpp/ggml-impl.h"
class CPUInfer {
public:
CPUInfer(int thread_num) {
backend_ = new Backend(thread_num - 1);
task_queue_ = new TaskQueue();
for (int i = 0; i < (1 << 16); ++i) {
ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(i);
}
}
~CPUInfer() {
delete backend_;
delete task_queue_;
}
template <typename Func, typename Obj, typename... Args>
void submit(Func f, Obj* obj, Args... args) {
task_queue_->enqueue([=]() {
std::invoke(f, *obj, args..., backend_);
});
}
void sync() {
task_queue_->sync();
}
public:
Backend* backend_;
TaskQueue* task_queue_;
};
#endif


@ -0,0 +1,57 @@
/**
* @Description :
* @Author : chenht2022
* @Date : 2024-07-17 12:25:51
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:33:44
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "task_queue.h"

TaskQueue::TaskQueue() {
    // Initialize the flags before the worker starts, so it never
    // observes them uninitialized.
    sync_flag.store(true, std::memory_order_seq_cst);
    exit_flag.store(false, std::memory_order_seq_cst);
    worker = std::thread(&TaskQueue::processTasks, this);
}

TaskQueue::~TaskQueue() {
    exit_flag.store(true, std::memory_order_seq_cst);
    if (worker.joinable()) {
        worker.join();
    }
}

void TaskQueue::enqueue(std::function<void()> task) {
    mutex.lock();
    tasks.push(task);
    sync_flag.store(false, std::memory_order_seq_cst);
    mutex.unlock();
}

void TaskQueue::sync() {
    while (!sync_flag.load(std::memory_order_seq_cst))
        ;
}

void TaskQueue::processTasks() {
    while (true) {
        mutex.lock();
        if (tasks.empty()) {
            if (exit_flag.load(std::memory_order_seq_cst)) {
                mutex.unlock();  // was missing: returning while holding the lock
                return;
            }
            mutex.unlock();
            continue;
        }
        std::function<void()> task = tasks.front();
        mutex.unlock();
        task();
        mutex.lock();
        tasks.pop();
        if (tasks.empty()) {
            sync_flag.store(true, std::memory_order_seq_cst);
        }
        mutex.unlock();
    }
}
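The queue's contract: `enqueue` is called from the submitting thread, a single worker drains the FIFO in order, and `sync()` blocks until everything enqueued so far has finished. A minimal Python model of the same contract (`MiniTaskQueue` is illustrative only, and it substitutes `queue.Queue.join` for the spin on `sync_flag`):

```python
import queue
import threading

class MiniTaskQueue:
    # Single worker thread draining a FIFO of callables.
    def __init__(self):
        self._tasks = queue.Queue()
        threading.Thread(target=self._run, daemon=True).start()

    def enqueue(self, task):
        self._tasks.put(task)

    def sync(self):
        # Block until every enqueued task has been marked done,
        # playing the role of the sync_flag spin loop above.
        self._tasks.join()

    def _run(self):
        while True:
            task = self._tasks.get()
            task()
            self._tasks.task_done()
```

The C++ version spins instead of blocking because `sync()` sits on the latency-critical path of every inference step.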


@ -0,0 +1,39 @@
/**
* @Description :
* @Author : chenht2022
* @Date : 2024-07-16 10:43:18
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:33:47
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#ifndef CPUINFER_TASKQUEUE_H
#define CPUINFER_TASKQUEUE_H
#include <atomic>
#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>
class TaskQueue {
public:
TaskQueue();
~TaskQueue();
void enqueue(std::function<void()>);
void sync();
private:
void processTasks();
std::queue<std::function<void()>> tasks;
std::thread worker;
std::mutex mutex;
std::atomic<bool> sync_flag;
std::atomic<bool> exit_flag;
};
#endif


@ -0,0 +1,32 @@
/**
* @Description :
* @Author : Azure-Tang
* @Date : 2024-07-25 13:38:30
* @Version : 1.0.0
* @LastEditors : Azure
* @LastEditTime : 2024-07-26 08:36:03
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "custom_gguf/ops.h"
#include "gptq_marlin/ops.h"
// Python bindings
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>
// namespace py = pybind11;
PYBIND11_MODULE(KTransformersOps, m) {
m.def("dequantize_q8_0", &dequantize_q8_0, "Function to dequantize q8_0 data.",
py::arg("data"), py::arg("blk_size"), py::arg("device"));
m.def("dequantize_q6_k", &dequantize_q6_k, "Function to dequantize q6_k data.",
py::arg("data"), py::arg("blk_size"), py::arg("device"));
m.def("dequantize_q4_k", &dequantize_q4_k, "Function to dequantize q4_k data.",
py::arg("data"), py::arg("blk_size"), py::arg("device"));
m.def("gptq_marlin_gemm", &gptq_marlin_gemm, "Function to perform GEMM using Marlin quantization.",
py::arg("a"), py::arg("b_q_weight"), py::arg("b_scales"), py::arg("g_idx"),
py::arg("perm"), py::arg("workspace"), py::arg("num_bits"), py::arg("size_m"),
py::arg("size_n"), py::arg("size_k"), py::arg("is_k_full"));
}


@ -0,0 +1,25 @@
#include "ops.h"
// Python bindings
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>
// namespace py = pybind11;
int test(){
return 5;
}
torch::Tensor dequantize_q6_k(torch::Tensor data, int blk_size, torch::Device device);
PYBIND11_MODULE(cudaops, m) {
m.def("dequantize_q8_0", &dequantize_q8_0, "Function to dequantize q8_0 data.",
py::arg("data"), py::arg("blk_size"), py::arg("device"));
m.def("dequantize_q6_k", &dequantize_q6_k, "Function to dequantize q6_k data.",
py::arg("data"), py::arg("blk_size"), py::arg("device"));
m.def("dequantize_q4_k", &dequantize_q4_k, "Function to dequantize q4_k data.",
py::arg("data"), py::arg("blk_size"), py::arg("device"));
m.def("test", &test, "Function to test.");
}


@ -0,0 +1,39 @@
#include <cuda_fp16.h>
#include <cstdint>
#include <cstring>

// Convert an IEEE-754 fp16 bit pattern to fp32 on the device.
// (Reinterpret the bits as a half, then widen; __uint2float_rd would
// wrongly convert the integer value of the bits instead.)
__device__ float ggml_compute_fp16_to_fp32(uint16_t h) {
    __half tmp;
    memcpy(&tmp, &h, sizeof(tmp));
    return __half2float(tmp);
}
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)

// Global table for fp16 -> fp32 conversion.
__device__ float ggml_table_f32_f16[1 << 16];

// CUDA kernel to initialize the table.
__global__ void init_fp16_to_fp32_table() {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (auto blk_id = idx; blk_id < (1 << 16); blk_id += blockDim.x * gridDim.x) {
        ggml_table_f32_f16[blk_id] = GGML_COMPUTE_FP16_TO_FP32(blk_id);
    }
}

// Table-lookup version, designed to be called from within a CUDA kernel.
#if !defined(GGML_FP16_TO_FP32)
__device__ float ggml_lookup_fp16_to_fp32(uint16_t f) {
    return ggml_table_f32_f16[f];
}
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#endif
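The table simply precomputes, for each of the 65536 possible 16-bit patterns, the float32 value obtained by interpreting that pattern as an IEEE-754 half. The equivalent conversion in pure Python, for reference (not used by the extension):

```python
import struct

def fp16_bits_to_float(h: int) -> float:
    # Reinterpret a 16-bit integer as an IEEE-754 half-precision value
    # and widen it ('e' is the half-float struct format, Python >= 3.6).
    return struct.unpack('<e', struct.pack('<H', h))[0]

# 0x3C00 is 1.0 in fp16; 0xC000 is -2.0
```

This is exactly what `GGML_FP16_TO_FP32` returns after the table is initialized, minus the lookup cost.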


@ -0,0 +1,164 @@
/*
* @Description :
* @Author : Azure-Tang, Boxin Zhang
* @Date : 2024-07-25 13:38:30
* @Version : 1.0.0
* @LastEditors : Azure
* @LastEditTime : 2024-07-26 11:58:50
* Adapted from https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c
* Copyright (c) 2023-2024 The ggml authors
* Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
*/
#include <cuda_runtime.h>
#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>
#include <cstdint>
__global__ void dequantize_q8_0_kernel(float* output, const float* scales, const int8_t* qs, int num_blocks, int blk_size) {
int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
for (auto block_id=global_idx; block_id<num_blocks;block_id+=blockDim.x * gridDim.x){
for(int i=0;i<blk_size;i++){
float scale = scales[block_id];
output[block_id * blk_size + i] = scale * qs[block_id * blk_size + i];
}
}
}
// __device__ void get_scale_min_k4(int j, const uint8_t * __restrict__ q, uint8_t * __restrict__ d, uint8_t * __restrict__ m) {
__device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t * __restrict__ d, uint8_t * __restrict__ m) {
if (j < 4) {
*d = q[j] & 63; *m = q[j + 4] & 63;
} else {
*d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
*m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
}
}
__global__ void dequantize_q4_k_kernel(int8_t* data, float* output, int blk_size, int num_blocks) {
int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
for (auto block_id=global_idx; block_id<num_blocks;block_id+=blockDim.x * gridDim.x){
float* __restrict__ output_blk = (float*)(output + block_id * 256);
// const uint8_t * q = data[i].qs;
const uint8_t * q = (uint8_t*)(data + block_id * 144 + 16);
const float d = __half2float(*(reinterpret_cast<half*>(data + block_id * 144 + 0)));
const float min = __half2float(*(reinterpret_cast<half*>(data + block_id * 144 + 2)));
int is = 0;
uint8_t sc, m;
for (int j = 0; j < blk_size; j += 64) {
uint8_t* scales = (uint8_t*)(data + block_id * 144 + 4);
get_scale_min_k4(is + 0, scales, &sc, &m);
const float d1 = d * sc; const float m1 = min * m;
get_scale_min_k4(is + 1, scales, &sc, &m);
const float d2 = d * sc; const float m2 = min * m;
for (int l = 0; l < 32; ++l) *output_blk++ = d1 * (q[l] & 0xF) - m1;
for (int l = 0; l < 32; ++l) *output_blk++ = d2 * (q[l] >> 4) - m2;
q += 32; is += 2;
}
}
}
__global__ void dequantize_q6_k_kernel(int8_t* data, float* output, int blk_size, int num_blocks) {
int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
for (auto block_id=global_idx; block_id<num_blocks;block_id+=blockDim.x * gridDim.x){
float* __restrict__ output_blk = (float*)(output + block_id * 256);
const float d = __half2float(*(reinterpret_cast<half*>(data + block_id * blk_size + 208)));
const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size);
const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 128);
const int8_t * __restrict__ sc = (int8_t*)(data + block_id * blk_size + 192);
//if (blk_size == 256){
for (int n = 0; n < blk_size; n += 128) {
for (int l = 0; l < 32; ++l) {
int is = l/16;
const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
output_blk[l + 0] = d * sc[is + 0] * q1;
output_blk[l + 32] = d * sc[is + 2] * q2;
output_blk[l + 64] = d * sc[is + 4] * q3;
output_blk[l + 96] = d * sc[is + 6] * q4;
}
output_blk += 128;
ql += 64;
qh += 32;
sc += 8;
}
}
}
torch::Tensor dequantize_q8_0(torch::Tensor data, int blk_size, torch::Device device) {
int num_blocks = data.numel() / blk_size;
// create gpu
auto options_scales = torch::TensorOptions().dtype(torch::kFloat32).device(device).memory_format(torch::MemoryFormat::Contiguous);
auto options_qs = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
auto scales_gpu = torch::empty({num_blocks, 1}, options_scales);
auto qs_gpu = torch::empty({num_blocks, 32}, options_qs);
// read on cpu
options_scales = torch::TensorOptions().dtype(torch::kFloat16).device(torch::kCPU);
options_qs = torch::TensorOptions().dtype(torch::kInt8).device(torch::kCPU);
// // reinterpret
auto scales = torch::from_blob(data.data_ptr(), {num_blocks, 1 + 16}, options_scales).slice(1, 0, 1);
auto qs = torch::from_blob(data.data_ptr(), {num_blocks, 2 + 32}, options_qs).slice(1, 2);
auto scales_f32 = scales.to(torch::kFloat32);
scales_gpu.copy_(scales_f32, false);
qs_gpu.copy_(qs, false);
// Create output tensor
auto output = torch::zeros_like(qs, torch::dtype(torch::kFloat32).device(device));
// Launch kernel
dequantize_q8_0_kernel<<< 512, 256 >>>(
output.data_ptr<float>(), scales_gpu.data_ptr<float>(), qs_gpu.data_ptr<int8_t>(), num_blocks, 32);
cudaDeviceSynchronize();
return output;
}
torch::Tensor dequantize_q6_k(torch::Tensor data, int blk_size, torch::Device device) {
// data.numel%blk_size should be 0, else raise err
int num_blocks = data.numel() / blk_size;
auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
auto data_gpu = torch::empty({data.numel()}, options);
data_gpu.copy_(data, false);
// Create output tensor
auto output = torch::zeros({num_blocks, 256}, torch::dtype(torch::kFloat32).device(device));
// Launch kernel
dequantize_q6_k_kernel<<< 512, 256 >>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, num_blocks);
// dequantize_q6_k_kernel<<< 512, 256 >>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), 256, num_blocks);
cudaDeviceSynchronize();
return output;
}
torch::Tensor dequantize_q4_k(torch::Tensor data, int blk_size, torch::Device device) {
// data.numel%blk_size should be 0, else raise err
int num_blocks = data.numel() / blk_size;
auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
auto data_gpu = torch::empty({data.numel()}, options);
data_gpu.copy_(data, false);
// Create output tensor
auto output = torch::zeros({num_blocks, 256}, torch::dtype(torch::kFloat32).device(device));
// Launch kernel
dequantize_q4_k_kernel<<< 512, 256 >>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), 256, num_blocks);
cudaDeviceSynchronize();
return output;
}
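For reference, the q8_0 layout that `dequantize_q8_0` unpacks is 34 bytes per block: a little-endian fp16 scale followed by 32 signed int8 quants, dequantized as `scale * q`. A pure-Python sketch of the same unpacking (illustrative only; the real code slices the raw tensor with `torch::from_blob` as shown above):

```python
import struct

def dequantize_q8_0_ref(raw: bytes):
    # Each 34-byte q8_0 block: little-endian fp16 scale, then 32 int8 quants.
    out = []
    for off in range(0, len(raw), 34):
        (scale,) = struct.unpack_from('<e', raw, off)
        qs = struct.unpack_from('<32b', raw, off + 2)
        out.append([scale * q for q in qs])
    return out
```

The CUDA kernel performs the same multiply per element, with the scale and quant planes pre-split into separate GPU tensors.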


@ -0,0 +1,18 @@
/**
* @Description :
* @Author : Azure-Tang
* @Date : 2024-07-22 09:27:55
* @Version : 1.0.0
* @LastEditors : Azure
* @LastEditTime : 2024-07-26 08:38:20
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#pragma once
#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>
torch::Tensor dequantize_q8_0(torch::Tensor data, int blk_size, torch::Device device);
torch::Tensor dequantize_q6_k(torch::Tensor data, int blk_size, torch::Device device);
torch::Tensor dequantize_q4_k(torch::Tensor data, int blk_size, torch::Device device);

File diff suppressed because it is too large


@ -0,0 +1,80 @@
// Adapted from
// https://github.com/vllm-project/vllm/tree/main/csrc/quantization/gptq_marlin
// Copyright 2024 The vLLM team.
// Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
#pragma once
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <iostream>
namespace gptq_marlin {
// 8 warps are a good choice since every SM has 4 schedulers and having more
// than 1 warp per schedule allows some more latency hiding. At the same time,
// we want relatively few warps to have many registers per warp and small tiles.
static constexpr int default_threads = 256;
static constexpr int pipe_stages = 4;  // 4 pipeline stages fit into shared memory
static constexpr int min_thread_n = 64;
static constexpr int min_thread_k = 64;
static constexpr int tile_size = 16;
static constexpr int max_par = 16;
template <typename T, int n>
struct Vec {
T elems[n];
__device__ T& operator[](int i) { return elems[i]; }
};
using I4 = Vec<int, 4>;
constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
// No support for async
#else
__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr,
bool pred = true) {
const int BYTES = 16;
uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %0, 0;\n"
" @p cp.async.cg.shared.global [%1], [%2], %3;\n"
"}\n" ::"r"((int)pred),
"r"(smem), "l"(glob_ptr), "n"(BYTES));
}
__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) {
const int BYTES = 16;
uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
asm volatile(
"{\n"
" cp.async.cg.shared.global [%0], [%1], %2;\n"
"}\n" ::"r"(smem),
"l"(glob_ptr), "n"(BYTES));
}
__device__ inline void cp_async_fence() {
asm volatile("cp.async.commit_group;\n" ::);
}
template <int n>
__device__ inline void cp_async_wait() {
asm volatile("cp.async.wait_group %0;\n" ::"n"(n));
}
#endif
} // namespace gptq_marlin


@ -0,0 +1,80 @@
// Adapted from
// https://github.com/vllm-project/vllm/tree/main/csrc/quantization/gptq_marlin
// Copyright 2024 The vLLM team.
// Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
#ifndef _data_types_cuh
#define _data_types_cuh
#include "gptq_marlin.cuh"
#include <cuda_fp16.h>
#include <cuda_bf16.h>
namespace gptq_marlin {
template <typename scalar_t>
class ScalarType {};
template <>
class ScalarType<half> {
public:
using scalar_t = half;
using scalar_t2 = half2;
// Matrix fragments for tensor core instructions; their precise layout is
// documented here:
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type
using FragA = Vec<half2, 4>;
using FragB = Vec<half2, 2>;
using FragC = Vec<float, 4>;
using FragS = Vec<half2, 1>;
static __device__ float inline num2float(const half x) {
return __half2float(x);
}
static __device__ half2 inline num2num2(const half x) {
return __half2half2(x);
}
static __device__ half2 inline nums2num2(const half x1, const half x2) {
return __halves2half2(x1, x2);
}
static __host__ __device__ half inline float2num(const float x) {
return __float2half(x);
}
};
template <>
class ScalarType<nv_bfloat16> {
public:
using scalar_t = nv_bfloat16;
using scalar_t2 = nv_bfloat162;
using FragA = Vec<nv_bfloat162, 4>;
using FragB = Vec<nv_bfloat162, 2>;
using FragC = Vec<float, 4>;
using FragS = Vec<nv_bfloat162, 1>;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
static __device__ float inline num2float(const nv_bfloat16 x) {
return __bfloat162float(x);
}
static __device__ nv_bfloat162 inline num2num2(const nv_bfloat16 x) {
return __bfloat162bfloat162(x);
}
static __device__ nv_bfloat162 inline nums2num2(const nv_bfloat16 x1,
const nv_bfloat16 x2) {
return __halves2bfloat162(x1, x2);
}
static __host__ __device__ nv_bfloat16 inline float2num(const float x) {
return __float2bfloat16(x);
}
#endif
};
} // namespace gptq_marlin
#endif

View file

@ -0,0 +1,24 @@
/**
* @Description :
* @Author : Azure
* @Date : 2024-07-22 09:27:55
* @Version : 1.0.0
* @LastEditors : Azure
* @LastEditTime : 2024-07-26 08:35:00
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#pragma once
#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>
torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
torch::Tensor& b_scales, torch::Tensor& g_idx,
torch::Tensor& perm, torch::Tensor& workspace,
int64_t num_bits, int64_t size_m, int64_t size_n,
int64_t size_k, bool is_k_full);
// torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
// int64_t size_k, int64_t size_n,
// int64_t num_bits);


@ -0,0 +1,18 @@
from setuptools import setup, Extension
from torch.utils import cpp_extension
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
# setup marlin gemm
setup(name='KTransformersOps',
ext_modules=[
CUDAExtension('KTransformersOps', [
'custom_gguf/dequant.cu',
'binding.cpp',
'gptq_marlin/gptq_marlin.cu',
# 'gptq_marlin_repack.cu',
])
],
cmdclass={'build_ext': BuildExtension
})

View file

@ -0,0 +1,83 @@
#!/usr/bin/env python
# coding=utf-8
'''
Description :
Author : chenht2022
Date : 2024-07-25 10:32:05
Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-07-25 10:34:00
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import os, sys
import time

sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch

with torch.inference_mode(mode=True):
    input_size = 16384
    output_size = 5120
    stride = 32
    proj_type = 1  # ggml_type::GGML_TYPE_F16
    hidden_type = 1  # ggml_type::GGML_TYPE_F16
    layer_num = 10
    CPUInfer = cpuinfer_ext.CPUInfer(48)
    validation_iter = 100
    warm_up_iter = 1000
    test_iter = 10000
    linears = []
    projs = []
    for _ in range(layer_num):
        proj = torch.randn((output_size, input_size), dtype=torch.float16, device="cuda").to("cpu").contiguous()
        config = cpuinfer_ext.linear.LinearConfig(input_size, output_size, stride, proj.data_ptr(), proj_type, hidden_type)
        linear = cpuinfer_ext.linear.Linear(config)
        projs.append(proj)
        linears.append(linear)
    # validation
    for i in range(validation_iter):
        linear = linears[i % layer_num]
        input = torch.randn((1, input_size), dtype=torch.float16).contiguous()
        output = torch.empty((1, output_size), dtype=torch.float16).contiguous()
        input = input / 100
        CPUInfer.submit(linear.forward, input.data_ptr(), output.data_ptr())
        CPUInfer.sync()
        # print('cpuinfer output', output)
        proj = projs[i % layer_num]
        t_output = torch.mm(input, proj.t())
        # print('torch output', t_output)
        diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
        print('diff = ', diff)
        assert diff < 0.001
    # warm up
    for i in range(warm_up_iter):
        linear = linears[i % layer_num]
        input = torch.randn((1, input_size), dtype=torch.float16).contiguous()
        output = torch.empty((1, output_size), dtype=torch.float16).contiguous()
        input = input / 100
        CPUInfer.submit(linear.forward, input.data_ptr(), output.data_ptr())
        CPUInfer.sync()
    # test
    total_time = 0
    for i in range(test_iter):
        linear = linears[i % layer_num]
        input = torch.randn((1, input_size), dtype=torch.float16).contiguous()
        output = torch.empty((1, output_size), dtype=torch.float16).contiguous()
        input = input / 100
        start = time.perf_counter()
        CPUInfer.submit(linear.forward, input.data_ptr(), output.data_ptr())
        CPUInfer.sync()
        end = time.perf_counter()
        total_time += end - start
    print('Time: ', total_time)
    print('Iteration: ', test_iter)
    print('Time per iteration: ', total_time / test_iter)
    print('Bandwidth: ', input_size * output_size * 2 * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
    print("All tasks completed.")
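The validation loop's `diff` metric is a mean absolute error normalized by the mean magnitude of the torch reference, with a 0.1% pass threshold. Restated in plain Python over flat lists (`rel_err` is an illustrative name, not part of the codebase):

```python
def rel_err(approx, ref):
    # mean(|approx - ref|) / mean(|ref|): the `diff` the validation
    # loop computes with torch.mean / torch.abs.
    n = len(ref)
    mae = sum(abs(a - r) for a, r in zip(approx, ref)) / n
    mag = sum(abs(r) for r in ref) / n
    return mae / mag
```

A value below 0.001 corresponds to passing the `assert diff < 0.001` check; normalizing by the reference magnitude makes the threshold meaningful regardless of weight scale.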


@ -0,0 +1,98 @@
#!/usr/bin/env python
# coding=utf-8
'''
Description :
Author : chenht2022
Date : 2024-07-25 10:32:05
Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-07-25 10:34:03
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch
with torch.inference_mode(mode=True):
hidden_size = 5120
intermediate_size = 3072
stride = 32
gate_type = 1 # ggml_type::GGML_TYPE_F16
up_type = 1 # ggml_type::GGML_TYPE_F16
down_type = 1 # ggml_type::GGML_TYPE_F16
hidden_type = 1 # ggml_type::GGML_TYPE_F16
layer_num = 10
CPUInfer = cpuinfer_ext.CPUInfer(48)
validation_iter = 100
warm_up_iter = 1000
test_iter = 10000
mlps = []
gate_projs = []
up_projs = []
down_projs = []
for _ in range(layer_num):
gate_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
up_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
down_proj = torch.randn((hidden_size, intermediate_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
config = cpuinfer_ext.mlp.MLPConfig(hidden_size, intermediate_size, stride, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type)
mlp = cpuinfer_ext.mlp.MLP(config)
gate_projs.append(gate_proj)
up_projs.append(up_proj)
down_projs.append(down_proj)
mlps.append(mlp)
# validation
for i in range(validation_iter):
mlp = mlps[i % layer_num]
input = torch.randn((1, hidden_size), dtype=torch.float16).contiguous()
output = torch.empty((1, hidden_size), dtype=torch.float16).contiguous()
input = input / 100
CPUInfer.submit(mlp.forward, input.data_ptr(), output.data_ptr())
CPUInfer.sync()
# print('cpuinfer output', output)
def act_fn(x):
return x / (1.0 + torch.exp(-x))
gate_proj = gate_projs[i%layer_num]
up_proj = up_projs[i%layer_num]
down_proj = down_projs[i%layer_num]
gate_buf = torch.mm(input, gate_proj.t())
up_buf = torch.mm(input, up_proj.t())
intermediate = act_fn(gate_buf) * up_buf
t_output = torch.mm(intermediate, down_proj.t())
# print('torch output', t_output)
diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
print('diff = ', diff)
assert(diff < 0.001)
# warm up
for i in range(warm_up_iter):
mlp = mlps[i % layer_num]
input = torch.randn((1, hidden_size), dtype=torch.float16).contiguous()
output = torch.empty((1, hidden_size), dtype=torch.float16).contiguous()
input = input / 100
CPUInfer.submit(mlp.forward, input.data_ptr(), output.data_ptr())
CPUInfer.sync()
# test
total_time = 0
for i in range(test_iter):
mlp = mlps[i % layer_num]
input = torch.randn((1, hidden_size), dtype=torch.float16).contiguous()
output = torch.empty((1, hidden_size), dtype=torch.float16).contiguous()
input = input / 100
        start = time.perf_counter()
        CPUInfer.submit(mlp.forward, input.data_ptr(), output.data_ptr())
        CPUInfer.sync()
        end = time.perf_counter()
total_time += end - start
print('Time: ', total_time)
print('Iteration: ', test_iter)
print('Time per iteration: ', total_time / test_iter)
    print('Bandwidth: ', hidden_size * intermediate_size * 3 * 2 * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
print("All tasks completed.")
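The validation loop above checks the CPU kernel against a plain SiLU-gated MLP computed in torch. That reference can be factored into a standalone function; below is a numpy sketch (illustrative only — `mlp_ref` and `silu` are hypothetical helper names, not repo API):

```python
import numpy as np

def silu(v):
    # act_fn above: x / (1 + exp(-x)) == x * sigmoid(x)
    return v / (1.0 + np.exp(-v))

def mlp_ref(x, gate_proj, up_proj, down_proj):
    # x: (1, hidden_size); projections are stored row-major as (out, in),
    # hence the transposes, matching the torch.mm(input, w.t()) calls above.
    gate_buf = x @ gate_proj.T
    up_buf = x @ up_proj.T
    intermediate = silu(gate_buf) * up_buf
    return intermediate @ down_proj.T
```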


@@ -0,0 +1,113 @@
#!/usr/bin/env python
# coding=utf-8
'''
Description :
Author : chenht2022
Date : 2024-07-25 10:32:05
Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-07-25 10:34:06
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch
with torch.inference_mode(mode=True):
expert_num = 10
hidden_size = 5120
intermediate_size = 1536
stride = 32
group_min_len = 10
group_max_len = 1024
gate_type = 1 # ggml_type::GGML_TYPE_F16
up_type = 1 # ggml_type::GGML_TYPE_F16
down_type = 1 # ggml_type::GGML_TYPE_F16
hidden_type = 1 # ggml_type::GGML_TYPE_F16
n_routed_experts = 6
qlen = 30
layer_num = 10
CPUInfer = cpuinfer_ext.CPUInfer(48)
validation_iter = 100
warm_up_iter = 1000
test_iter = 10000
moes = []
gate_projs = []
up_projs = []
down_projs = []
for _ in range(layer_num):
gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
config = cpuinfer_ext.moe.MOEConfig(expert_num, n_routed_experts, hidden_size, intermediate_size, stride, group_min_len, group_max_len, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type)
moe = cpuinfer_ext.moe.MOE(config)
gate_projs.append(gate_proj)
up_projs.append(up_proj)
down_projs.append(down_proj)
moes.append(moe)
# validation
for i in range(validation_iter):
moe = moes[i % layer_num]
expert_ids = torch.randint(0, expert_num, (qlen, n_routed_experts), dtype=torch.int64).contiguous()
weights = torch.rand((qlen, n_routed_experts), dtype=torch.float32).contiguous()
input = torch.randn((qlen, 1, hidden_size), dtype=torch.float16).contiguous()
output = torch.empty((qlen, 1, hidden_size), dtype=torch.float16).contiguous()
input = input / 100
CPUInfer.submit(moe.forward, qlen, n_routed_experts, expert_ids.data_ptr(), weights.data_ptr(), input.data_ptr(), output.data_ptr())
CPUInfer.sync()
# print('cpuinfer output', output)
        def act_fn(x):
            # SiLU: x * sigmoid(x)
            return x / (1.0 + torch.exp(-x))
t_output = torch.zeros((qlen, 1, hidden_size), dtype=torch.float32).contiguous()
gate_proj = gate_projs[i%layer_num]
up_proj = up_projs[i%layer_num]
down_proj = down_projs[i%layer_num]
for token_idx in range(qlen):
            for j, expert_id in enumerate(expert_ids[token_idx]):
                gate_buf = torch.mm(input[token_idx], gate_proj[expert_id].t())
                up_buf = torch.mm(input[token_idx], up_proj[expert_id].t())
                intermediate = act_fn(gate_buf) * up_buf
                expert_output = torch.mm(intermediate, down_proj[expert_id].t())
                t_output[token_idx] += weights[token_idx][j] * expert_output
# print('torch output', t_output)
diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
print('diff = ', diff)
        assert diff < 0.001
# warm up
for i in range(warm_up_iter):
moe = moes[i % layer_num]
expert_ids = torch.randint(0, expert_num, (qlen, n_routed_experts), dtype=torch.int64).contiguous()
weights = torch.rand((qlen, n_routed_experts), dtype=torch.float32).contiguous()
input = torch.randn((qlen, hidden_size), dtype=torch.float16).contiguous()
output = torch.empty((qlen, hidden_size), dtype=torch.float16).contiguous()
input = input / 100
CPUInfer.submit(moe.forward, qlen, n_routed_experts, expert_ids.data_ptr(), weights.data_ptr(), input.data_ptr(), output.data_ptr())
CPUInfer.sync()
# test
total_time = 0
for i in range(test_iter):
moe = moes[i % layer_num]
expert_ids = torch.randint(0, expert_num, (qlen, n_routed_experts), dtype=torch.int64).contiguous()
weights = torch.rand((qlen, n_routed_experts), dtype=torch.float32).contiguous()
input = torch.randn((qlen, hidden_size), dtype=torch.float16).contiguous()
output = torch.empty((qlen, hidden_size), dtype=torch.float16).contiguous()
input = input / 100
start = time.perf_counter()
CPUInfer.submit(moe.forward, qlen, n_routed_experts, expert_ids.data_ptr(), weights.data_ptr(), input.data_ptr(), output.data_ptr())
CPUInfer.sync()
end = time.perf_counter()
total_time += end - start
print('Time: ', total_time)
print('Iteration: ', test_iter)
print('Time per iteration: ', total_time / test_iter)
print('Bandwidth: ', hidden_size * intermediate_size * 3 * n_routed_experts * 2 * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
print("All tasks completed.")
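Factoring the torch reference out of the validation loop gives a standalone routed-MoE forward: each token is sent through its k routed experts and the expert outputs are blended by the routing weights. A numpy sketch (`moe_ref` is a hypothetical helper, not repo API):

```python
import numpy as np

def silu(v):
    return v / (1.0 + np.exp(-v))

def moe_ref(x, expert_ids, weights, gate_proj, up_proj, down_proj):
    # x: (qlen, hidden); expert_ids, weights: (qlen, k);
    # gate/up: (experts, inter, hidden); down: (experts, hidden, inter).
    out = np.zeros_like(x, dtype=np.float64)
    for t in range(x.shape[0]):
        for j, e in enumerate(expert_ids[t]):
            inter = silu(x[t] @ gate_proj[e].T) * (x[t] @ up_proj[e].T)
            out[t] += weights[t][j] * (inter @ down_proj[e].T)
    return out
```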


@@ -0,0 +1,264 @@
/**
* @Description :
* @Author : chenht2022
* @Date : 2024-07-22 02:03:22
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:34:23
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
// Python bindings
#include <cstdint>
#include <iostream>
#include <memory>
#include "cpu_backend/cpuinfer.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "llamafile/flags.h"
#include "operators/llamafile/linear.h"
#include "operators/llamafile/mlp.h"
#include "operators/llamafile/moe.h"
#include "pybind11/functional.h"
#include "pybind11/operators.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
namespace py = pybind11;
using namespace pybind11::literals;
// Binding functions for the Linear class
class LinearBindings {
public:
static void bind_forward(CPUInfer& cpuinfer, Linear* linear, py::args args, py::kwargs kwargs) {
auto input = args[0].cast<intptr_t>();
auto output = args[1].cast<intptr_t>();
cpuinfer.submit(&Linear::forward, linear,
(const void*)input, (void*)output);
}
static void bind_warm_up(CPUInfer& cpuinfer, Linear* linear, py::args args, py::kwargs kwargs) {
cpuinfer.submit(&Linear::warm_up, linear);
}
static void bind_functions(CPUInfer& cpuinfer, py::object func, py::args args, py::kwargs kwargs) {
auto linear = func.attr("__self__").cast<Linear*>();
std::string func_name = py::str(func.attr("__func__").attr("__name__"));
if (func_name == "forward") {
bind_forward(cpuinfer, linear, args, kwargs);
} else if (func_name == "warm_up") {
bind_warm_up(cpuinfer, linear, args, kwargs);
} else {
throw py::value_error("Unsupported function: " +
std::string(func_name));
}
}
};
// Binding functions for the MLP class
class MLPBindings {
public:
static void bind_forward(CPUInfer& cpuinfer, MLP* mlp, py::args args, py::kwargs kwargs) {
auto input = args[0].cast<intptr_t>();
auto output = args[1].cast<intptr_t>();
cpuinfer.submit(&MLP::forward, mlp,
(const void*)input, (void*)output);
}
static void bind_warm_up(CPUInfer& cpuinfer, MLP* mlp, py::args args, py::kwargs kwargs) {
cpuinfer.submit(&MLP::warm_up, mlp);
}
static void bind_functions(CPUInfer& cpuinfer, py::object func, py::args args, py::kwargs kwargs) {
auto mlp = func.attr("__self__").cast<MLP*>();
std::string func_name = py::str(func.attr("__func__").attr("__name__"));
if (func_name == "forward") {
bind_forward(cpuinfer, mlp, args, kwargs);
} else if (func_name == "warm_up") {
bind_warm_up(cpuinfer, mlp, args, kwargs);
} else {
throw py::value_error("Unsupported function: " +
std::string(func_name));
}
}
};
// Binding functions for the MOE class
class MOEBindings {
public:
static void bind_forward(CPUInfer& cpuinfer, MOE* moe, py::args args, py::kwargs kwargs) {
int qlen = args[0].cast<int>();
int k = args[1].cast<int>();
auto expert_ids = args[2].cast<intptr_t>();
auto weights = args[3].cast<intptr_t>();
auto input = args[4].cast<intptr_t>();
auto output = args[5].cast<intptr_t>();
cpuinfer.submit(&MOE::forward, moe,
qlen, k, (const uint64_t*)expert_ids, (const float*)weights, (const void*)input, (void*)output);
}
static void bind_warm_up(CPUInfer& cpuinfer, MOE* moe, py::args args, py::kwargs kwargs) {
cpuinfer.submit(&MOE::warm_up, moe);
}
static void bind_functions(CPUInfer& cpuinfer, py::object func, py::args args, py::kwargs kwargs) {
auto moe = func.attr("__self__").cast<MOE*>();
std::string func_name = py::str(func.attr("__func__").attr("__name__"));
if (func_name == "forward") {
bind_forward(cpuinfer, moe, args, kwargs);
} else if (func_name == "warm_up") {
bind_warm_up(cpuinfer, moe, args, kwargs);
} else {
throw py::value_error("Unsupported function: " +
std::string(func_name));
}
}
};
struct MOEForwardArgs {
CPUInfer* cpuinfer;
MOE* moe;
int qlen;
int k;
uint64_t* expert_ids;
float* weights;
void* input;
void* output;
};
void submit_moe_forward_with_host_args_ptr(void* host_args_ptr) {
MOEForwardArgs* host_args = (MOEForwardArgs*)host_args_ptr;
host_args->cpuinfer->submit(&MOE::forward, host_args->moe,
host_args->qlen, host_args->k, host_args->expert_ids, host_args->weights, host_args->input, host_args->output);
}
void cpuinfer_sync(void* host_args_ptr) {
CPUInfer* cpuinfer = (CPUInfer*)host_args_ptr;
cpuinfer->sync();
}
PYBIND11_MODULE(cpuinfer_ext, m) {
auto linear_module = m.def_submodule("linear");
py::class_<LinearConfig>(linear_module, "LinearConfig")
.def(py::init([](int hidden_size, int intermediate_size, int stride, intptr_t proj, int proj_type, int hidden_type) {
return LinearConfig(hidden_size, intermediate_size, stride, (void*)proj, (ggml_type)proj_type, (ggml_type)hidden_type);
}));
py::class_<Linear>(linear_module, "Linear")
.def(py::init<LinearConfig>())
        .def("warm_up", [](Linear& linear) {
            throw std::runtime_error("Linear::warm_up must be scheduled through CPUInfer.submit(); calling it directly does nothing.");
        })
        .def("forward", [](Linear& linear, intptr_t input, intptr_t output) {
            throw std::runtime_error("Linear::forward must be scheduled through CPUInfer.submit(); calling it directly does nothing.");
        });
auto mlp_module = m.def_submodule("mlp");
py::class_<MLPConfig>(mlp_module, "MLPConfig")
.def(py::init([](int hidden_size, int intermediate_size, int stride, intptr_t gate_proj, intptr_t up_proj, intptr_t down_proj, int gate_type, int up_type, int down_type, int hidden_type) {
return MLPConfig(hidden_size, intermediate_size, stride, (void*)gate_proj, (void*)up_proj, (void*)down_proj, (ggml_type)gate_type, (ggml_type)up_type, (ggml_type)down_type, (ggml_type)hidden_type);
}));
py::class_<MLP>(mlp_module, "MLP")
.def(py::init<MLPConfig>())
        .def("warm_up", [](MLP& mlp) {
            throw std::runtime_error("MLP::warm_up must be scheduled through CPUInfer.submit(); calling it directly does nothing.");
        })
        .def("forward", [](MLP& mlp, intptr_t input, intptr_t output) {
            throw std::runtime_error("MLP::forward must be scheduled through CPUInfer.submit(); calling it directly does nothing.");
        });
auto moe_module = m.def_submodule("moe");
py::class_<MOEConfig>(moe_module, "MOEConfig")
.def(py::init([](int expert_num, int routed_expert_num, int hidden_size, int intermediate_size, int stride, int group_min_len, int group_max_len, intptr_t gate_proj, intptr_t up_proj, intptr_t down_proj, int gate_type, int up_type, int down_type, int hidden_type) {
return MOEConfig(expert_num, routed_expert_num, hidden_size, intermediate_size, stride, group_min_len, group_max_len, (void*)gate_proj, (void*)up_proj, (void*)down_proj, (ggml_type)gate_type, (ggml_type)up_type, (ggml_type)down_type, (ggml_type)hidden_type);
}));
py::class_<MOE>(moe_module, "MOE")
.def(py::init<MOEConfig>())
        .def("warm_up", [](MOE& moe) {
            throw std::runtime_error("MOE::warm_up must be scheduled through CPUInfer.submit(); calling it directly does nothing.");
        })
        .def("forward", [](MOE& moe, int qlen, int k, intptr_t expert_ids, intptr_t weights, intptr_t input, intptr_t output) {
            throw std::runtime_error("MOE::forward must be scheduled through CPUInfer.submit(); calling it directly does nothing.");
        });
py::class_<CPUInfer>(m, "CPUInfer")
.def(py::init<int>())
.def("submit",
[linear_module, mlp_module, moe_module](CPUInfer& cpuinfer, py::object func, py::args args, py::kwargs kwargs) {
if (py::hasattr(func, "__self__") &&
py::hasattr(func, "__func__")) {
std::string class_name = py::str(func.attr("__self__")
.attr("__class__")
.attr("__name__"));
if (class_name == "Linear") {
LinearBindings::bind_functions(cpuinfer, func,
args, kwargs);
} else if (class_name == "MLP") {
MLPBindings::bind_functions(cpuinfer, func,
args, kwargs);
} else if (class_name == "MOE") {
MOEBindings::bind_functions(cpuinfer, func,
args, kwargs);
} else {
// handle other classes
throw py::type_error("Unsupported class type: " +
class_name);
}
} else {
// handle cases where func does not have __self__ or
// __func__
throw py::type_error(
"Invalid function object: missing "
"__self__ or __func__ attribute.");
}
})
.def("submit_with_cuda_stream",
[linear_module, mlp_module, moe_module](CPUInfer& cpuinfer, intptr_t user_cuda_stream, py::object func, py::args args, py::kwargs kwargs) {
if (py::hasattr(func, "__self__") &&
py::hasattr(func, "__func__")) {
std::string class_name = py::str(func.attr("__self__")
.attr("__class__")
.attr("__name__"));
if (class_name == "MOE") {
std::string func_name = py::str(func.attr("__func__").attr("__name__"));
if (func_name == "forward") {
auto moe = func.attr("__self__").cast<MOE*>();
int qlen = args[0].cast<int>();
int k = args[1].cast<int>();
auto expert_ids = args[2].cast<intptr_t>();
auto weights = args[3].cast<intptr_t>();
auto input = args[4].cast<intptr_t>();
auto output = args[5].cast<intptr_t>();
MOEForwardArgs* moe_forward_args = new MOEForwardArgs{&cpuinfer, moe, qlen, k, (uint64_t*)expert_ids, (float*)weights, (void*)input, (void*)output};
// submit_moe_forward_with_host_args_ptr(moe_forward_args);
cudaLaunchHostFunc((cudaStream_t)user_cuda_stream, (cudaHostFn_t)submit_moe_forward_with_host_args_ptr, moe_forward_args);
} else {
throw py::value_error("Unsupported function: " +
std::string(func_name));
}
} else {
// handle other classes
throw py::type_error("Unsupported class type: " +
class_name);
}
} else {
// handle cases where func does not have __self__ or
// __func__
throw py::type_error(
"Invalid function object: missing "
"__self__ or __func__ attribute.");
}
})
.def("sync_with_cuda_stream", [](CPUInfer& cpuinfer, intptr_t user_cuda_stream) {
// cpuinfer_sync((void*)(&cpuinfer));
cudaLaunchHostFunc((cudaStream_t)user_cuda_stream, (cudaHostFn_t)cpuinfer_sync, (void*)(&cpuinfer));
})
.def("sync", &CPUInfer::sync);
}
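`submit()` above routes a bound Python method to the matching C++ call by inspecting the method's `__self__` and `__func__` attributes and branching on the class and function names. The same name-based dispatch can be sketched in pure Python (a toy stand-in for illustration, not the actual extension):

```python
def dispatch(func, *args):
    # Mirror of the binding's routing: read the class and method names
    # off the bound method, then branch on them.
    if not hasattr(func, "__self__"):
        raise TypeError("Invalid function object: missing __self__ attribute.")
    cls_name = type(func.__self__).__name__
    func_name = func.__name__
    if (cls_name, func_name) == ("MOE", "forward"):
        return ("moe_forward", args)
    if func_name == "warm_up":
        return (f"{cls_name.lower()}_warm_up", args)
    raise ValueError(f"Unsupported function: {cls_name}.{func_name}")
```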


@@ -0,0 +1,206 @@
import math
import os
import time
from logging import getLogger
import torch
import torch.nn as nn
import transformers
from .quantizer import Quantizer
logger = getLogger(__name__)
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
class GPTQ:
def __init__(self, layer):
self.layer = layer
self.dev = self.layer.weight.device
W = layer.weight.data.clone()
if isinstance(self.layer, nn.Conv2d):
W = W.flatten(1)
if isinstance(self.layer, transformers.pytorch_utils.Conv1D):
W = W.t()
self.rows = W.shape[0]
self.columns = W.shape[1]
self.H = torch.zeros((self.columns, self.columns), device=self.dev)
self.nsamples = 0
self.quantizer = Quantizer()
def add_batch(self, inp, out):
if os.environ.get("DEBUG"):
self.inp1 = inp
self.out1 = out
if len(inp.shape) == 2:
inp = inp.unsqueeze(0)
tmp = inp.shape[0]
if isinstance(self.layer, nn.Linear) or isinstance(self.layer, transformers.Conv1D):
if len(inp.shape) == 3:
inp = inp.reshape((-1, inp.shape[-1]))
inp = inp.t()
if isinstance(self.layer, nn.Conv2d):
unfold = nn.Unfold(
self.layer.kernel_size,
dilation=self.layer.dilation,
padding=self.layer.padding,
stride=self.layer.stride,
)
inp = unfold(inp)
inp = inp.permute([1, 0, 2])
inp = inp.flatten(1)
self.H *= self.nsamples / (self.nsamples + tmp)
self.nsamples += tmp
# inp = inp.float()
inp = math.sqrt(2 / self.nsamples) * inp.float()
# self.H += 2 / self.nsamples * inp.matmul(inp.t())
self.H += inp.matmul(inp.t())
def fasterquant(
self,
blocksize=128,
percdamp=0.01,
group_size=-1,
actorder=False,
static_groups=False,
):
W = self.layer.weight.data.clone()
if isinstance(self.layer, nn.Conv2d):
W = W.flatten(1)
if isinstance(self.layer, transformers.Conv1D):
W = W.t()
W = W.float()
tick = time.time()
if not self.quantizer.ready():
self.quantizer.find_params(W, weight=True)
H = self.H
del self.H
dead = torch.diag(H) == 0
H[dead, dead] = 1
W[:, dead] = 0
g_idx = []
scale = []
zero = []
now_idx = 1
if static_groups:
import copy
groups = []
for i in range(0, self.columns, group_size):
quantizer = copy.deepcopy(self.quantizer)
quantizer.find_params(W[:, i : (i + group_size)], weight=True)
scale.append(quantizer.scale)
zero.append(quantizer.zero)
groups.append(quantizer)
if actorder:
perm = torch.argsort(torch.diag(H), descending=True)
W = W[:, perm]
H = H[perm][:, perm]
invperm = torch.argsort(perm)
Losses = torch.zeros_like(W)
Q = torch.zeros_like(W)
damp = percdamp * torch.mean(torch.diag(H))
diag = torch.arange(self.columns, device=self.dev)
H[diag, diag] += damp
H = torch.linalg.cholesky(H)
H = torch.cholesky_inverse(H)
H = torch.linalg.cholesky(H, upper=True)
Hinv = H
for i1 in range(0, self.columns, blocksize):
i2 = min(i1 + blocksize, self.columns)
count = i2 - i1
W1 = W[:, i1:i2].clone()
Q1 = torch.zeros_like(W1)
Err1 = torch.zeros_like(W1)
Losses1 = torch.zeros_like(W1)
Hinv1 = Hinv[i1:i2, i1:i2]
for i in range(count):
w = W1[:, i]
d = Hinv1[i, i]
if group_size != -1:
if not static_groups:
if (i1 + i) % group_size == 0:
self.quantizer.find_params(W[:, (i1 + i) : (i1 + i + group_size)], weight=True)
if ((i1 + i) // group_size) - now_idx == -1:
scale.append(self.quantizer.scale)
zero.append(self.quantizer.zero)
now_idx += 1
else:
idx = i1 + i
if actorder:
idx = perm[idx]
self.quantizer = groups[idx // group_size]
q = self.quantizer.quantize(w.unsqueeze(1)).flatten()
Q1[:, i] = q
Losses1[:, i] = (w - q) ** 2 / d**2
err1 = (w - q) / d
W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0))
Err1[:, i] = err1
Q[:, i1:i2] = Q1
Losses[:, i1:i2] = Losses1 / 2
W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:])
if os.environ.get("DEBUG"):
self.layer.weight.data[:, :i2] = Q[:, :i2]
self.layer.weight.data[:, i2:] = W[:, i2:]
logger.debug(torch.sum((self.layer(self.inp1) - self.out1) ** 2))
logger.debug(torch.sum(Losses))
torch.cuda.synchronize()
logger.info(f"duration: {(time.time() - tick)}")
logger.info(f"avg loss: {torch.sum(Losses).item() / self.nsamples}")
group_size = group_size if group_size != -1 else self.columns
if static_groups and actorder:
g_idx = [perm[i] // group_size for i in range(self.columns)]
else:
g_idx = [i // group_size for i in range(self.columns)]
g_idx = torch.tensor(g_idx, dtype=torch.int32, device=Q.device)
if actorder:
Q = Q[:, invperm]
g_idx = g_idx[invperm]
if isinstance(self.layer, transformers.Conv1D):
Q = Q.t()
self.layer.weight.data = Q.reshape(self.layer.weight.shape).type_as(self.layer.weight.data)
if os.environ.get("DEBUG"):
logger.debug(torch.sum((self.layer(self.inp1) - self.out1) ** 2))
        if not scale:
scale.append(self.quantizer.scale)
zero.append(self.quantizer.zero)
scale = torch.cat(scale, dim=1)
zero = torch.cat(zero, dim=1)
return scale, zero, g_idx
def free(self):
if os.environ.get("DEBUG"):
self.inp1 = None
self.out1 = None
self.H = None
self.Losses = None
self.Trace = None
torch.cuda.empty_cache()
__all__ = ["GPTQ"]
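The numerical core of `fasterquant` is the damped-Hessian factorization: add `percdamp * mean(diag(H))` to the diagonal, invert via Cholesky, then take the upper Cholesky factor of the inverse so columns can be eliminated left to right. A numpy sketch of that step (illustrative; `hessian_inv_chol` is a hypothetical name):

```python
import numpy as np

def hessian_inv_chol(H, percdamp=0.01):
    H = H.copy()
    damp = percdamp * np.mean(np.diag(H))
    H[np.diag_indices_from(H)] += damp      # dampen for numerical stability
    Hinv = np.linalg.inv(H)                 # torch: cholesky_inverse(cholesky(H))
    # torch.linalg.cholesky(Hinv, upper=True) returns U with U.T @ U == Hinv;
    # numpy's cholesky returns the lower factor, so transpose it.
    return np.linalg.cholesky(Hinv).T
```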


@@ -0,0 +1,458 @@
import enum
from enum import Enum
from typing import Any, Dict, List, Optional
import torch
from torch.nn.parameter import Parameter
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
set_weight_attrs)
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
logger = init_logger(__name__)
GPTQ_MARLIN_TILE = 16
GPTQ_MARLIN_MIN_THREAD_N = 64
GPTQ_MARLIN_MIN_THREAD_K = 128
GPTQ_MARLIN_MAX_PARALLEL = 16
GPTQ_MARLIN_SUPPORTED_NUM_BITS = [4, 8]
GPTQ_MARLIN_SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]
GPTQ_MARLIN_SUPPORTED_SYM = [True]
# Permutations for Marlin scale shuffling
def get_scale_perms(num_bits: int):
scale_perm: List[int] = []
for i in range(8):
scale_perm.extend([i + 8 * j for j in range(8)])
scale_perm_single: List[int] = []
for i in range(4):
scale_perm_single.extend(
[2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
return scale_perm, scale_perm_single
def get_pack_factor(num_bits: int):
assert (num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS
), f"Unsupported num_bits = {num_bits}"
return 32 // num_bits
def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int,
group_size: int, num_bits: int):
scale_perm, scale_perm_single = get_scale_perms(num_bits)
if group_size < size_k and group_size != -1:
s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
else:
s = s.reshape((-1, len(scale_perm_single)))[:, scale_perm_single]
s = s.reshape((-1, size_n)).contiguous()
return s
class GPTQMarlinConfig(QuantizationConfig):
"""Config class for GPTQ Marlin"""
def __init__(self, weight_bits: int, group_size: int, desc_act: bool,
is_sym: bool) -> None:
if desc_act and group_size == -1:
# In this case, act_order == True is the same as act_order == False
# (since we have only one group per output channel)
desc_act = False
self.weight_bits = weight_bits
self.group_size = group_size
self.desc_act = desc_act
self.is_sym = is_sym
# Verify
if self.weight_bits not in GPTQ_MARLIN_SUPPORTED_NUM_BITS:
raise ValueError(
f"Marlin does not support weight_bits = {self.weight_bits}. "
f"Only weight_bits = {GPTQ_MARLIN_SUPPORTED_NUM_BITS} "
"are supported.")
if self.group_size not in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES:
raise ValueError(
f"Marlin does not support group_size = {self.group_size}. "
f"Only group_sizes = {GPTQ_MARLIN_SUPPORTED_GROUP_SIZES} "
"are supported.")
if self.is_sym not in GPTQ_MARLIN_SUPPORTED_SYM:
raise ValueError(
f"Marlin does not support is_sym = {self.is_sym}. "
f"Only sym = {GPTQ_MARLIN_SUPPORTED_SYM} are supported.")
# Init
self.pack_factor = get_pack_factor(weight_bits)
self.tile_size = GPTQ_MARLIN_TILE
self.min_thread_n = GPTQ_MARLIN_MIN_THREAD_N
self.min_thread_k = GPTQ_MARLIN_MIN_THREAD_K
self.max_parallel = GPTQ_MARLIN_MAX_PARALLEL
def __repr__(self) -> str:
return (f"GPTQMarlinConfig(weight_bits={self.weight_bits}, "
f"group_size={self.group_size}, "
f"desc_act={self.desc_act})")
@classmethod
def get_name(cls) -> str:
return "gptq_marlin"
@classmethod
def get_supported_act_dtypes(cls) -> List[torch.dtype]:
return [torch.half, torch.bfloat16]
@classmethod
def get_min_capability(cls) -> int:
return 80
@classmethod
def get_config_filenames(cls) -> List[str]:
return ["quantize_config.json"]
@classmethod
def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig":
weight_bits = cls.get_from_keys(config, ["bits"])
group_size = cls.get_from_keys(config, ["group_size"])
desc_act = cls.get_from_keys(config, ["desc_act"])
is_sym = cls.get_from_keys(config, ["sym"])
return cls(weight_bits, group_size, desc_act, is_sym)
@classmethod
def override_quantization_method(cls, hf_quant_cfg,
user_quant) -> Optional[str]:
can_convert = cls.is_marlin_compatible(hf_quant_cfg)
is_valid_user_quant = (user_quant is None or user_quant == "marlin")
if can_convert and is_valid_user_quant:
msg = ("The model is convertible to {} during runtime."
" Using {} kernel.".format(cls.get_name(), cls.get_name()))
logger.info(msg)
return cls.get_name()
if can_convert and user_quant == "gptq":
logger.info("Detected that the model can run with gptq_marlin"
", however you specified quantization=gptq explicitly,"
" so forcing gptq. Use quantization=gptq_marlin for"
" faster inference")
return None
def get_quant_method(
self,
layer: torch.nn.Module) -> Optional["GPTQMarlinLinearMethod"]:
if isinstance(layer, LinearBase):
return GPTQMarlinLinearMethod(self)
return None
def get_scaled_act_names(self) -> List[str]:
return []
@classmethod
def is_marlin_compatible(cls, quant_config: Dict[str, Any]):
# Extract data from quant config.
num_bits = quant_config.get("bits", None)
group_size = quant_config.get("group_size", None)
sym = quant_config.get("sym", None)
desc_act = quant_config.get("desc_act", None)
# If we cannot find the info needed in the config, cannot convert.
if (num_bits is None or group_size is None or sym is None
or desc_act is None):
return False
# If the capability of the device is too low, cannot convert.
major, minor = torch.cuda.get_device_capability()
device_capability = major * 10 + minor
if device_capability < cls.get_min_capability():
return False
# Otherwise, can convert if model satisfies marlin constraints.
return (num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS
and group_size in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES
and sym in GPTQ_MARLIN_SUPPORTED_SYM)
class GPTQMarlinState(Enum):
REPACK = enum.auto()
READY = enum.auto()
class GPTQMarlinLinearMethod(LinearMethodBase):
"""Linear method for GPTQ Marlin.
Args:
quant_config: The GPTQ Marlin quantization config.
"""
def __init__(self, quant_config: GPTQMarlinConfig) -> None:
self.quant_config = quant_config
def create_weights(
self,
layer: torch.nn.Module,
input_size_per_partition: int,
output_partition_sizes: List[int],
input_size: int,
output_size: int,
params_dtype: torch.dtype,
**extra_weight_attrs,
) -> None:
del output_size
# Normalize group_size
if self.quant_config.group_size != -1:
group_size = self.quant_config.group_size
else:
group_size = input_size
# Validate dtype
if params_dtype not in [torch.float16, torch.bfloat16]:
raise ValueError(f"The params dtype must be float16 "
f"or bfloat16, but got {params_dtype}")
# Validate output_size_per_partition
output_size_per_partition = sum(output_partition_sizes)
if output_size_per_partition % self.quant_config.min_thread_n != 0:
raise ValueError(
f"Weight output_size_per_partition = "
f"{output_size_per_partition} is not divisible by "
f" min_thread_n = {self.quant_config.min_thread_n}.")
# Validate input_size_per_partition
if input_size_per_partition % self.quant_config.min_thread_k != 0:
raise ValueError(
f"Weight input_size_per_partition = "
f"{input_size_per_partition} is not divisible "
f"by min_thread_k = {self.quant_config.min_thread_k}.")
if (group_size < input_size
and input_size_per_partition % group_size != 0):
raise ValueError(
f"Weight input_size_per_partition = {input_size_per_partition}"
f" is not divisible by group_size = {group_size}.")
# Detect sharding of scales/zp
# By default, no sharding over "input dim"
scales_and_zp_size = input_size // group_size
scales_and_zp_input_dim = None
if self.quant_config.desc_act:
# Act-order case
assert self.quant_config.group_size != -1
is_k_full = input_size_per_partition == input_size
else:
# No act-order case
# K is always full due to full alignment with
# group-size and shard of scales/zp
is_k_full = True
# If this is a row-parallel case, then shard scales/zp
if (input_size != input_size_per_partition
and self.quant_config.group_size != -1):
scales_and_zp_size = input_size_per_partition // group_size
scales_and_zp_input_dim = 0
# Init buffers
# Quantized weights
qweight = Parameter(
torch.empty(
input_size_per_partition // self.quant_config.pack_factor,
output_size_per_partition,
dtype=torch.int32,
),
requires_grad=False,
)
set_weight_attrs(
qweight,
{
**extra_weight_attrs,
"input_dim": 0,
"output_dim": 1,
"packed_dim": 0,
"pack_factor": self.quant_config.pack_factor,
},
)
# Activation order
g_idx = Parameter(
torch.empty(
input_size_per_partition,
dtype=torch.int32,
),
requires_grad=False,
)
# Ignore warning from fused linear layers such as QKVParallelLinear.
set_weight_attrs(
g_idx,
{
**extra_weight_attrs, "input_dim": 0,
"ignore_warning": True
},
)
g_idx_sort_indices = torch.empty(
g_idx.shape,
dtype=torch.int32,
)
# Scales
scales = Parameter(
torch.empty(
scales_and_zp_size,
output_size_per_partition,
dtype=params_dtype,
),
requires_grad=False,
)
set_weight_attrs(
scales,
{
**extra_weight_attrs,
"input_dim": scales_and_zp_input_dim,
"output_dim": 1,
},
)
# Quantized zero-points
qzeros = Parameter(
torch.empty(
scales_and_zp_size,
output_size_per_partition // self.quant_config.pack_factor,
dtype=torch.int32,
device="meta",
),
requires_grad=False,
)
set_weight_attrs(
qzeros,
{
**extra_weight_attrs,
"input_dim": scales_and_zp_input_dim,
"output_dim": 1,
"packed_dim": 1,
"pack_factor": self.quant_config.pack_factor,
},
)
# Allocate marlin workspace
max_workspace_size = (
output_size_per_partition //
self.quant_config.min_thread_n) * self.quant_config.max_parallel
workspace = torch.zeros(max_workspace_size,
dtype=torch.int,
requires_grad=False)
layer.register_parameter("qweight", qweight)
layer.register_parameter("g_idx", g_idx)
layer.register_parameter("scales", scales)
layer.register_parameter("qzeros", qzeros)
layer.g_idx_sort_indices = g_idx_sort_indices
layer.workspace = workspace
layer.input_size_per_partition = input_size_per_partition
layer.output_size_per_partition = output_size_per_partition
layer.input_size = input_size
layer.is_k_full = is_k_full
layer.marlin_state = GPTQMarlinState.REPACK
def apply(
self,
layer: torch.nn.Module,
x: torch.Tensor,
bias: Optional[torch.Tensor] = None,
) -> torch.Tensor:
reshaped_x = x.reshape(-1, x.shape[-1])
size_m = reshaped_x.shape[0]
part_size_n = layer.output_size_per_partition
part_size_k = layer.input_size_per_partition
full_size_k = layer.input_size
out_shape = x.shape[:-1] + (part_size_n, )
if layer.marlin_state == GPTQMarlinState.REPACK:
layer.marlin_state = GPTQMarlinState.READY
# Newly generated tensors need to replace existing tensors that are
# already registered as parameters by vLLM (and won't be freed)
def replace_tensor(name, new_t):
# It is important to use resize_() here since it ensures
# the same buffer is reused
getattr(layer, name).resize_(new_t.shape)
getattr(layer, name).copy_(new_t)
del new_t
cur_device = layer.qweight.device
# Process act_order
if self.quant_config.desc_act:
# Get sorting based on g_idx
g_idx_sort_indices = torch.argsort(layer.g_idx).to(torch.int)
sorted_g_idx = layer.g_idx[g_idx_sort_indices]
replace_tensor("g_idx", sorted_g_idx)
replace_tensor("g_idx_sort_indices", g_idx_sort_indices)
else:
# Reset g_idx related tensors
layer.g_idx = Parameter(
torch.empty(0, dtype=torch.int, device=cur_device),
requires_grad=False,
)
layer.g_idx_sort_indices = Parameter(
torch.empty(0, dtype=torch.int, device=cur_device),
requires_grad=False,
)
# Repack weights
marlin_qweight = ops.gptq_marlin_repack(
layer.qweight,
layer.g_idx_sort_indices,
part_size_k,
part_size_n,
self.quant_config.weight_bits,
)
replace_tensor("qweight", marlin_qweight)
# Permute scales
scales_size_k = part_size_k
scales_size_n = part_size_n
if self.quant_config.desc_act:
scales_size_k = full_size_k
marlin_scales = marlin_permute_scales(
layer.scales,
scales_size_k,
scales_size_n,
self.quant_config.group_size,
self.quant_config.weight_bits,
)
replace_tensor("scales", marlin_scales)
output = ops.gptq_marlin_gemm(
reshaped_x,
layer.qweight,
layer.scales,
layer.g_idx,
layer.g_idx_sort_indices,
layer.workspace,
self.quant_config.weight_bits,
size_m,
part_size_n,
part_size_k,
layer.is_k_full,
)
if bias is not None:
output.add_(bias) # In-place add
return output.reshape(out_shape)
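`get_scale_perms` above is pure index bookkeeping: the 64-entry `scale_perm` permutes one 8x8 tile of scales and the 32-entry `scale_perm_single` covers the per-channel (`group_size == -1` or full-K) case. A quick standalone check of those properties (sketch mirroring the function; like the original, `num_bits` is not actually used by the permutation):

```python
def get_scale_perms():
    # 8x8 tile permutation: row i gathers column i of the row-major tile.
    scale_perm = []
    for i in range(8):
        scale_perm.extend([i + 8 * j for j in range(8)])
    # Per-channel case: 32 scales shuffled in pairs.
    scale_perm_single = []
    for i in range(4):
        scale_perm_single.extend(
            [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
    return scale_perm, scale_perm_single
```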


@@ -0,0 +1,140 @@
from logging import getLogger
import torch
import torch.nn as nn
logger = getLogger(__name__)
def quantize(x, scale, zero, maxq):
if maxq < 0:
return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero
q = torch.clamp(torch.round(x / scale) + zero, 0, maxq)
return scale * (q - zero)
class Quantizer(nn.Module):
def __init__(self, shape=1):
super(Quantizer, self).__init__()
self.register_buffer("maxq", torch.tensor(0))
self.register_buffer("scale", torch.zeros(shape))
self.register_buffer("zero", torch.zeros(shape))
def configure(
self,
bits,
perchannel=False,
sym=True,
mse=False,
norm=2.4,
grid=100,
maxshrink=0.8,
trits=False,
):
self.maxq = torch.tensor(2**bits - 1)
self.perchannel = perchannel
self.sym = sym
self.mse = mse
self.norm = norm
self.grid = grid
self.maxshrink = maxshrink
if trits:
self.maxq = torch.tensor(-1)
def find_params(self, x, weight=False):
dev = x.device
self.maxq = self.maxq.to(dev)
shape = x.shape
if self.perchannel:
if weight:
x = x.flatten(1)
else:
if len(shape) == 4:
x = x.permute([1, 0, 2, 3])
x = x.flatten(1)
if len(shape) == 3:
x = x.reshape((-1, shape[-1])).t()
if len(shape) == 2:
x = x.t()
else:
x = x.flatten().unsqueeze(0)
tmp = torch.zeros(x.shape[0], device=dev)
xmin = torch.minimum(x.min(1)[0], tmp)
xmax = torch.maximum(x.max(1)[0], tmp)
if self.sym:
xmax = torch.maximum(torch.abs(xmin), xmax)
tmp = xmin < 0
if torch.any(tmp):
xmin[tmp] = -xmax[tmp]
tmp = (xmin == 0) & (xmax == 0)
xmin[tmp] = -1
xmax[tmp] = +1
if self.maxq < 0:
self.scale = xmax
self.zero = xmin
else:
self.scale = (xmax - xmin) / self.maxq
if self.sym:
self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2)
else:
self.zero = torch.round(-xmin / self.scale)
if self.mse:
best = torch.full([x.shape[0]], float("inf"), device=dev)
for i in range(int(self.maxshrink * self.grid)):
p = 1 - i / self.grid
xmin1 = p * xmin
xmax1 = p * xmax
scale1 = (xmax1 - xmin1) / self.maxq
zero1 = torch.round(-xmin1 / scale1) if not self.sym else self.zero
q = quantize(x, scale1.unsqueeze(1), zero1.unsqueeze(1), self.maxq)
q -= x
q.abs_()
q.pow_(self.norm)
err = torch.sum(q, 1)
tmp = err < best
if torch.any(tmp):
best[tmp] = err[tmp]
self.scale[tmp] = scale1[tmp]
self.zero[tmp] = zero1[tmp]
if not self.perchannel:
if weight:
tmp = shape[0]
else:
tmp = shape[1] if len(shape) != 3 else shape[2]
self.scale = self.scale.repeat(tmp)
self.zero = self.zero.repeat(tmp)
if weight:
shape = [-1] + [1] * (len(shape) - 1)
self.scale = self.scale.reshape(shape)
self.zero = self.zero.reshape(shape)
return
if len(shape) == 4:
self.scale = self.scale.reshape((1, -1, 1, 1))
self.zero = self.zero.reshape((1, -1, 1, 1))
if len(shape) == 3:
self.scale = self.scale.reshape((1, 1, -1))
self.zero = self.zero.reshape((1, 1, -1))
if len(shape) == 2:
self.scale = self.scale.unsqueeze(0)
self.zero = self.zero.unsqueeze(0)
def quantize(self, x):
if self.ready():
return quantize(x, self.scale, self.zero, self.maxq)
return x
def enabled(self):
return self.maxq > 0
def ready(self):
return torch.all(self.scale != 0)
__all__ = ["Quantizer"]
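The asymmetric branch of `find_params()` above reduces to a min/max scale and zero-point computation; a minimal pure-Python sketch for one row (our own helper name, assuming `maxq >= 0` and per-row statistics):

```python
# Minimal illustrative sketch of the asymmetric scale/zero-point
# computation in find_params() for a single 1-D row of weights.
def find_params_1d(xs, bits):
    maxq = 2 ** bits - 1
    xmin = min(min(xs), 0.0)  # the grid always covers zero
    xmax = max(max(xs), 0.0)
    if xmin == 0 and xmax == 0:  # degenerate all-zero row
        xmin, xmax = -1.0, 1.0
    scale = (xmax - xmin) / maxq
    zero = round(-xmin / scale)
    return scale, zero

scale, zero = find_params_1d([-1.0, 0.5, 3.0], 4)  # scale = 4/15, zero = 4
```

As in the original, an all-zero row falls back to the range [-1, 1] so the scale is never zero.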

View file

@ -0,0 +1,99 @@
import torch
import enum
from enum import Enum
from typing import Any, Dict, List, Optional
from torch.nn.parameter import Parameter
def apply(
self,
layer: torch.nn.Module,
x: torch.Tensor,
bias: Optional[torch.Tensor] = None,
) -> torch.Tensor:
reshaped_x = x.reshape(-1, x.shape[-1])
size_m = reshaped_x.shape[0]
part_size_n = layer.output_size_per_partition
part_size_k = layer.input_size_per_partition
full_size_k = layer.input_size
out_shape = x.shape[:-1] + (part_size_n, )
if layer.marlin_state == GPTQMarlinState.REPACK:
layer.marlin_state = GPTQMarlinState.READY
# Newly generated tensors need to replace existing tensors that are
# already registered as parameters by vLLM (and won't be freed)
def replace_tensor(name, new_t):
# It is important to use resize_() here since it ensures
# the same buffer is reused
getattr(layer, name).resize_(new_t.shape)
getattr(layer, name).copy_(new_t)
del new_t
cur_device = layer.qweight.device
# Process act_order
if self.quant_config.desc_act:
# Get sorting based on g_idx
g_idx_sort_indices = torch.argsort(layer.g_idx).to(torch.int)
sorted_g_idx = layer.g_idx[g_idx_sort_indices]
replace_tensor("g_idx", sorted_g_idx)
replace_tensor("g_idx_sort_indices", g_idx_sort_indices)
else:
# Reset g_idx related tensors
layer.g_idx = Parameter(
torch.empty(0, dtype=torch.int, device=cur_device),
requires_grad=False,
)
layer.g_idx_sort_indices = Parameter(
torch.empty(0, dtype=torch.int, device=cur_device),
requires_grad=False,
)
# Repack weights
marlin_qweight = ops.gptq_marlin_repack(
layer.qweight,
layer.g_idx_sort_indices,
part_size_k,
part_size_n,
self.quant_config.weight_bits,
)
replace_tensor("qweight", marlin_qweight)
# Permute scales
scales_size_k = part_size_k
scales_size_n = part_size_n
if self.quant_config.desc_act:
scales_size_k = full_size_k
marlin_scales = marlin_permute_scales(
layer.scales,
scales_size_k,
scales_size_n,
self.quant_config.group_size,
self.quant_config.weight_bits,
)
replace_tensor("scales", marlin_scales)
output = ops.gptq_marlin_gemm(
reshaped_x,
layer.qweight,
layer.scales,
layer.g_idx,
layer.g_idx_sort_indices,
layer.workspace,
self.quant_config.weight_bits,
size_m,
part_size_n,
part_size_k,
layer.is_k_full,
)
if bias is not None:
output.add_(bias) # In-place add
return output.reshape(out_shape)

View file

@ -0,0 +1,308 @@
#
# Modified by Roberto Lopez Castro (roberto.lopez.castro@udc.es).
#
import torch
# This is a PyTorch implementation of the main part of the
# reorder_meta() function from the
# tools/util/include/cutlass/util/host_reorder.h file of the CUTLASS
# source tree. Furthermore, the CUTLASS template for sparse GEMM
# decides upon the layout of this matrix, and at the moment, for
# sparse GEMM executed on tensor cores, this is the layout described
# by the ColumnMajorInterleaved<2> data structure in
# include/cutlass/layout/matrix.h of the CUTLASS source tree. The
# reordering of the meta matrix into the meta_reordered matrix,
# calculated according to these segments of CUTLASS code, is
# re-implemented here. Note that this calculation produces offsets
# for scattering metadata matrix elements into reordered metadata
# matrix elements (or, equivalently, for gathering reordered metadata
# matrix elements back into metadata matrix elements).
def _calculate_meta_reordering_scatter_offsets(m, meta_ncols, meta_dtype,
device):
dst_rows = torch.arange(0, m, device=device)[:, None].repeat(1, meta_ncols)
dst_cols = torch.arange(0, meta_ncols, device=device).repeat(m, 1)
# Reorder the rows, then swizzle the 2x2 blocks.
group_x = 64
group_y = 32 if meta_dtype.itemsize == 2 else 16
dst_rows = (dst_rows // group_x * group_x + (dst_rows % 2) * 2 +
(dst_rows % 8) // 4 + ((dst_rows % group_y) % 4) // 2 * 32 +
((dst_rows % group_x) // 8) * 4)
topright = ((dst_rows % 2 == 0) & (dst_cols % 2 == 1)).to(torch.int8)
bottomleft = ((dst_rows % 2 == 1) & (dst_cols % 2 == 0)).to(torch.int8)
dst_rows += topright - bottomleft
dst_cols -= topright - bottomleft
    # It is assumed that the meta tensor is stored in the CUTLASS
    # InterleavedColumnMajor layout; the code that stores values into
    # this tensor was reverse engineered to derive these offsets.
interleave = 2
cols_maj = dst_cols // interleave
cols_min = dst_cols % interleave
return (cols_maj * m * interleave + dst_rows * interleave +
cols_min).view(-1)
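The final return expression above linearizes a (row, col) coordinate into ColumnMajorInterleaved<2> storage. A scalar pure-Python sketch of the same mapping (our own illustrative helper, not part of the codebase):

```python
# Illustrative sketch: linear offset of element (row, col) of an m-row
# matrix stored in CUTLASS ColumnMajorInterleaved<2> layout, mirroring
# the return expression above.
def interleaved_offset(row: int, col: int, m: int, interleave: int = 2) -> int:
    col_major = col // interleave   # which interleaved column group
    col_minor = col % interleave    # position inside the group
    return col_major * m * interleave + row * interleave + col_minor

# A pair of adjacent columns in the same row is contiguous in memory:
print(interleaved_offset(0, 0, 4), interleaved_offset(0, 1, 4))  # 0 1
print(interleaved_offset(1, 0, 4))                               # 2
```

Interleaving pairs of columns keeps the two metadata values consumed together by the kernel next to each other in memory.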
# This function converts dense matrix into sparse semi-structured
# representation, producing "compressed" matrix, in the layout used by
# CUTLASS backend, and corresponding metadata matrix.
def sparse_semi_structured_from_dense_cutlass(dense):
if dense.dim() != 2:
raise RuntimeError(
f"Expected 2-dimensional dense tensor, got {dense.dim()}-dimensional tensor" # noqa: E501
)
m, k = dense.shape
device = dense.device
meta_dtype = torch.int8
if dense.dtype == torch.int8:
meta_dtype = torch.int32
elif dense.dtype in [torch.half, torch.bfloat16, torch.float, torch.int32]:
meta_dtype = torch.int16
else:
raise RuntimeError(f"Invalid datatype {dense.dtype} of dense matrix")
quadbits_per_meta_elem = meta_dtype.itemsize * 8 // 4
if quadbits_per_meta_elem not in (4, 8):
raise RuntimeError(
"Invalid number of elements per meta element calculated")
if meta_dtype == torch.int32:
if m % 16 != 0:
raise RuntimeError(
f"Number of rows of dense matrix {m} must be divisible by 16")
else:
if m % 32 != 0:
raise RuntimeError(
f"Number of rows of dense matrix {m} must be divisible by 32")
if k % (4 * quadbits_per_meta_elem) != 0:
raise RuntimeError(
f"Number of columns of dense matrix {k} must be divisible by {4 * quadbits_per_meta_elem}" # noqa: E501
)
if dense.dtype != torch.float:
ksparse = 4
dense_4 = dense.view(-1, k // ksparse, ksparse)
m0, m1, m2, m3 = (dense_4 != 0).unbind(-1)
else:
ksparse = 2
dense_2 = dense.view(-1, k // ksparse, ksparse)
m0, m2 = m1, m3 = (dense_2 != 0).unbind(-1)
meta_ncols = k // (ksparse * quadbits_per_meta_elem)
# Encoding quadruples of True/False values as follows:
# [True, True, False, False] -> 0b0100
# [True, False, True, False] -> 0b1000
# [False, True, True, False] -> 0b1001
# [True, False, False, True ] -> 0b1100
# [False, True, False, True ] -> 0b1101
# [False, False, True, True ] -> 0b1110
# Thus, lower two bits in the encoding are index of the True value
# at the lowest index in the quadruple, and the higher two bits in
# the encoding are index of the other True value in the quadruple.
    # If there are fewer than two True values, then the False value or
    # values at some index or indices are treated as True for the
    # encoding. If there are more than two True values, then the
# excess True value(s) at some indices are considered False for
# the encoding. The exact encodings used for these cases are as
# follows:
# [False, False, False, False] -> 0b1110
# [False, False, False, True ] -> 0b1110
# [False, False, True, False] -> 0b1110
# [False, True, False, False] -> 0b1001
# [False, True, True, True ] -> 0b1101
# [True, False, False, False] -> 0b1000
# [True, False, True, True ] -> 0b1100
# [True, True, False, True ] -> 0b0100
# [True, True, True, False] -> 0b0100
# [True, True, True, True ] -> 0b0100
# These particular encodings are chosen, with the help of Espresso
# logic minimizer software, for the purpose of minimization of
# corresponding Boolean functions, that translate non-zero flags
    # into encoding bits. Note also that the possible choices for the
    # first and last of these encodings were limited to (0b0100,
    # 0b1110), in order to produce valid encodings for the 1:2
    # sparsity case.
expr0 = m0 & m1
expr1 = ~m0 & m1
expr2 = ~m0 & ~m1
bit0 = expr1
bit1 = expr2
bit2 = expr0 | expr2 | m3
bit3 = expr1 | ~m1
idxs0 = bit0 | (bit1.to(torch.int64) << 1)
idxs1 = bit2 | (bit3.to(torch.int64) << 1)
if dense.dtype != torch.float:
sparse0 = dense_4.gather(
-1, idxs0.unsqueeze(-1)) # type: ignore[possibly-undefined]
sparse1 = dense_4.gather(-1, idxs1.unsqueeze(-1))
sparse = torch.stack((sparse0, sparse1), dim=-1).view(m, k // 2)
else:
sparse = dense_2.gather(-1,
idxs0.unsqueeze(-1) // 2).view(
m,
k // 2) # type: ignore[possibly-undefined]
meta_4 = idxs0 | (idxs1 << 2)
meta_n = meta_4.view(
(-1, meta_ncols, quadbits_per_meta_elem)).to(meta_dtype)
if quadbits_per_meta_elem == 4:
meta = (meta_n[:, :, 0]
| (meta_n[:, :, 1] << 4)
| (meta_n[:, :, 2] << 8)
| (meta_n[:, :, 3] << 12))
elif quadbits_per_meta_elem == 8:
meta = (meta_n[:, :, 0]
| (meta_n[:, :, 1] << 4)
| (meta_n[:, :, 2] << 8)
| (meta_n[:, :, 3] << 12)
| (meta_n[:, :, 4] << 16)
| (meta_n[:, :, 5] << 20)
| (meta_n[:, :, 6] << 24)
| (meta_n[:, :, 7] << 28))
# Reorder meta tensor elements.
meta_reordered = meta.new_empty(
(m * meta_ncols, )) # type: ignore[possibly-undefined]
meta_offsets = _calculate_meta_reordering_scatter_offsets(
m, meta_ncols, meta_dtype, device)
meta_reordered.scatter_(0, meta_offsets, meta.view(-1))
return (sparse, meta_reordered.view(m, meta_ncols))
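The `bit0`..`bit3` expressions above implement the encoding table given in the comments. They can be checked directly with a scalar pure-Python sketch (the function name is ours; `m2` is not consulted, exactly as in the original expressions):

```python
# Pure-Python sketch of the non-zero-flag -> 2:4 metadata encoding
# implemented by the bit0..bit3 expressions above.
def encode_quadruple(m0: bool, m1: bool, m2: bool, m3: bool) -> int:
    expr0 = m0 and m1
    expr1 = (not m0) and m1
    expr2 = (not m0) and (not m1)
    bit0 = expr1
    bit1 = expr2
    bit2 = expr0 or expr2 or m3
    bit3 = expr1 or (not m1)
    idxs0 = int(bit0) | (int(bit1) << 1)   # index of the first kept value
    idxs1 = int(bit2) | (int(bit3) << 1)   # index of the second kept value
    return idxs0 | (idxs1 << 2)            # 4-bit metadata nibble

print(bin(encode_quadruple(True, True, False, False)))  # matches 0b0100 in the table
print(bin(encode_quadruple(False, False, True, True)))  # matches 0b1110 in the table
```

Every row of the comment table, including the degenerate fewer-than-two and more-than-two cases, can be reproduced this way.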
# This function performs reverse of the function above - it
# reconstructs dense matrix from a pair of "compressed" matrix, given
# in the layout used by CUTLASS backend, and accompanying metadata
# matrix.
def sparse_semi_structured_to_dense_cutlass(sparse, meta_reordered):
if sparse.dim() != 2:
raise RuntimeError(
f"Expected 2-dimensional sparse tensor, got {sparse.dim()}-dimensional tensor" # noqa: E501
)
m, k = sparse.shape
device = sparse.device
if meta_reordered.dim() != 2:
raise RuntimeError(
f"Expected 2-dimensional meta tensor, got {meta_reordered.dim()}-dimensional tensor" # noqa: E501
)
if meta_reordered.device != device:
raise RuntimeError(
f"Expected meta matrix to be on {device} device, got matrix on {meta_reordered.device} device" # noqa: E501
)
meta_dtype = meta_reordered.dtype
if meta_dtype not in (torch.int16, torch.int32):
raise RuntimeError(f"Invalid datatype {meta_dtype} of meta matrix")
quadbits_per_meta_elem = meta_dtype.itemsize * 8 // 4
ksparse = 4 if sparse.dtype != torch.float else 2
meta_nrows, meta_ncols = meta_reordered.shape
if meta_nrows != m:
raise RuntimeError(
            f"Number of rows of meta matrix {meta_nrows} must be equal to number of rows of sparse matrix {m}"  # noqa: E501
)
if meta_ncols * ksparse * quadbits_per_meta_elem != 2 * k:
raise RuntimeError(
            f"Number of columns of sparse matrix {k} differs from the {meta_ncols * ksparse * quadbits_per_meta_elem // 2} "  # noqa: E501
            "expected according to the number of columns of meta matrix")
# Undo meta tensor elements reordering.
meta_offsets = _calculate_meta_reordering_scatter_offsets(
m, meta_ncols, meta_dtype, device)
meta = torch.gather(meta_reordered.view(-1), 0,
meta_offsets).view(m, meta_ncols)
# Unpack sparse tensor back to original dense tensor, using
# information provided by meta tensor. Note that torch.float
# datatype is handled pretty much the same as
# torch.half/torch.bfloat16, as metadata for a pair of torch.float
# value is encoded as if underlying 8 bytes contain four
# torch.half/torch.bfloat16 values, where either first two or last
# two are zeros.
meta_2 = torch.empty(
(m, meta_ncols, 2 * quadbits_per_meta_elem),
dtype=meta_dtype,
device=device,
)
if quadbits_per_meta_elem == 4:
meta_2[:, :, 0] = meta & 0b11
meta_2[:, :, 1] = (meta >> 2) & 0b11
meta_2[:, :, 2] = (meta >> 4) & 0b11
meta_2[:, :, 3] = (meta >> 6) & 0b11
meta_2[:, :, 4] = (meta >> 8) & 0b11
meta_2[:, :, 5] = (meta >> 10) & 0b11
meta_2[:, :, 6] = (meta >> 12) & 0b11
meta_2[:, :, 7] = (meta >> 14) & 0b11
elif quadbits_per_meta_elem == 8:
meta_2[:, :, 0] = meta & 0b11
meta_2[:, :, 1] = (meta >> 2) & 0b11
meta_2[:, :, 2] = (meta >> 4) & 0b11
meta_2[:, :, 3] = (meta >> 6) & 0b11
meta_2[:, :, 4] = (meta >> 8) & 0b11
meta_2[:, :, 5] = (meta >> 10) & 0b11
meta_2[:, :, 6] = (meta >> 12) & 0b11
meta_2[:, :, 7] = (meta >> 14) & 0b11
meta_2[:, :, 8] = (meta >> 16) & 0b11
meta_2[:, :, 9] = (meta >> 18) & 0b11
meta_2[:, :, 10] = (meta >> 20) & 0b11
meta_2[:, :, 11] = (meta >> 22) & 0b11
meta_2[:, :, 12] = (meta >> 24) & 0b11
meta_2[:, :, 13] = (meta >> 26) & 0b11
meta_2[:, :, 14] = (meta >> 28) & 0b11
meta_2[:, :, 15] = (meta >> 30) & 0b11
dense_offsets = meta_2.view(-1) + (
torch.arange(0, 2 * m * k // ksparse, device=device) * 4).view(
-1, 1).repeat(1, 2).view(-1)
dense = torch.zeros((m * 2 * k, ), dtype=sparse.dtype, device=device)
if sparse.dtype != torch.float:
# dense.scatter_(0, dense_offsets, sparse.view(-1))
dense.scatter_(0, dense_offsets, sparse.reshape(-1))
else:
dense.view(torch.half).scatter_(0, dense_offsets,
sparse.view(torch.half).view(-1))
return dense.view(m, 2 * k)
def mask_creator(tensor):
    """
    Create an N:M sparsity mask for the given tensor.
    Masks are created using the N:M ratio, where for every block of
    M weights, M - N are pruned based on ranked weight magnitude. The
    mask corresponds to the given tensor.
    N = 2: the number of weights in a group to keep
    M = 4: the size of a weight group
    """
N = 2
M = 4
mask = None
# for i, tensor in enumerate(tensors):
if tensor.numel() % M != 0:
raise ValueError(
f"Tensor of size {tensor.shape} can't be evenly divided into "
f"{M} groups")
num_groups = tensor.numel() // M
# N:M sparsity for linear layers
tensor_temp = tensor.detach().abs().reshape(num_groups, M)
index = torch.argsort(tensor_temp, dim=1)[:, :int(M - N)]
w_b = torch.ones(tensor_temp.shape, device=tensor_temp.device)
mask = w_b.scatter_(dim=1, index=index, value=0).reshape(tensor.shape)
return mask
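The same keep-the-largest-N-of-M idea can be sketched without torch for a flat list of weights (our own illustrative helper, operating on Python lists rather than tensors):

```python
# Pure-Python sketch of 2:4 masking: in every group of M=4 weights,
# keep the N=2 largest magnitudes and zero out the rest.
def mask_24(values, n=2, m=4):
    assert len(values) % m == 0
    mask = []
    for g in range(0, len(values), m):
        group = values[g:g + m]
        # Indices of the (m - n) smallest-magnitude entries are pruned.
        pruned = sorted(range(m), key=lambda i: abs(group[i]))[: m - n]
        mask.extend(0 if i in pruned else 1 for i in range(m))
    return mask

print(mask_24([0.1, -2.0, 0.3, 4.0]))  # [0, 1, 0, 1]
```

Each group of four always ends up with exactly two surviving weights, which is the 2:4 structure the sparse tensor cores require.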

View file

@ -0,0 +1,60 @@
"""This file is used for /tests and /benchmarks"""
from typing import Dict, List
import numpy
import torch
# Precompute permutations for Marlin24 weight and scale shuffling # noqa: E501
#
# Marlin works on [16*2,64] tiles. The goal of the permutations is to reorder the weight data so that it is compatible  # noqa: E501
# with the tensor-core format that is described here:
# https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type # noqa: E501
#
# As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501
# (without the need to use ldmatrix instructions) # noqa: E501
def get_perms_24(num_bits: int):
perm_list: List[int] = []
for i in range(32):
perm1: List[int] = []
col = i // 4
col_o = col // 2
for block in [0, 1]:
for row in [
2 * (i % 4),
2 * (i % 4) + 1,
2 * (i % 4 + 4),
2 * (i % 4 + 4) + 1,
]:
perm1.append(16 * row + col_o * 256 + 8 * (col % 2) +
4 * block)
for j in range(4):
perm_list.extend([p + 1 * j for p in perm1])
perm = numpy.array(perm_list)
if num_bits == 4:
interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])
elif num_bits == 8:
interleave = numpy.array([0, 2, 1, 3])
else:
raise ValueError("num_bits must be 4 or 8, got {}".format(num_bits))
perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
perm = torch.from_numpy(perm)
scale_perm: List[int] = []
for i in range(8):
scale_perm.extend([i * 8 + j for j in [0, 4, 1, 5, 2, 6, 3, 7]])
scale_perm_single: List[int] = []
for i in range(8):
scale_perm_single.extend([8 * i + j for j in [0, 1, 2, 3, 4, 5, 6, 7]])
return perm, scale_perm, scale_perm_single
marlin_24_perm: Dict[int, torch.Tensor] = {}
marlin_24_scale_perm: Dict[int, List[int]] = {}
marlin_24_scale_perm_single: Dict[int, List[int]] = {}
for num_bits in [4, 8]:
perm_24, scale_perm_24, scale_perm_single_24 = get_perms_24(num_bits)
marlin_24_perm[num_bits] = perm_24
marlin_24_scale_perm[num_bits] = scale_perm_24
marlin_24_scale_perm_single[num_bits] = scale_perm_single_24

View file

@ -0,0 +1,60 @@
"""This file is used for /tests and /benchmarks"""
from typing import Dict, List
import numpy
import torch
# Precompute permutations for Marlin weight and scale shuffling # noqa: E501
#
# Marlin works on [16,64] tiles. The goal of the permutations is to reorder the weight data so that it is compatible  # noqa: E501
# with the tensor-core format that is described here:
# https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type # noqa: E501
#
# As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501
# (without the need to use ldmatrix instructions) # noqa: E501
def get_perms(num_bits: int):
perm_list: List[int] = []
for i in range(32):
perm1: List[int] = []
col = i // 4
for block in [0, 1]:
for row in [
2 * (i % 4),
2 * (i % 4) + 1,
2 * (i % 4 + 4),
2 * (i % 4 + 4) + 1,
]:
perm1.append(16 * row + col + 8 * block)
for j in range(4):
perm_list.extend([p + 256 * j for p in perm1])
perm = numpy.array(perm_list)
if num_bits == 4:
interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])
elif num_bits == 8:
interleave = numpy.array([0, 2, 1, 3])
else:
        raise ValueError("num_bits must be 4 or 8, got {}".format(num_bits))
perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
perm = torch.from_numpy(perm)
scale_perm: List[int] = []
for i in range(8):
scale_perm.extend([i + 8 * j for j in range(8)])
scale_perm_single: List[int] = []
for i in range(4):
scale_perm_single.extend(
[2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
return perm, scale_perm, scale_perm_single
marlin_perm: Dict[int, torch.Tensor] = {}
marlin_scale_perm: Dict[int, List[int]] = {}
marlin_scale_perm_single: Dict[int, List[int]] = {}
for num_bits in [4, 8]:
perm, scale_perm, scale_perm_single = get_perms(num_bits)
marlin_perm[num_bits] = perm
marlin_scale_perm[num_bits] = scale_perm
marlin_scale_perm_single[num_bits] = scale_perm_single

View file

@ -0,0 +1,232 @@
"""This file is used for /tests and /benchmarks"""
import random
import numpy
import torch
from ktransformers.ktransformers_ext.operators.custom_marlin.quantize.utils.format_24 import (
mask_creator, sparse_semi_structured_from_dense_cutlass)
from ktransformers.ktransformers_ext.operators.custom_marlin.quantize.utils.marlin_24_perms import (
marlin_24_perm, marlin_24_scale_perm, marlin_24_scale_perm_single)
from ktransformers.ktransformers_ext.operators.custom_marlin.quantize.utils.marlin_perms import (
marlin_perm, marlin_scale_perm, marlin_scale_perm_single)
from ktransformers.ktransformers_ext.operators.custom_marlin.quantize.utils.quant_utils import (
get_pack_factor, quantize_weights, sort_weights)
__cuda_arch = torch.cuda.get_device_capability()
MARLIN_TILE = 16
GPTQ_MARLIN_TILE = 16
GPTQ_MARLIN_MIN_THREAD_N = 64
GPTQ_MARLIN_MIN_THREAD_K = 128
GPTQ_MARLIN_MAX_PARALLEL = 16
GPTQ_MARLIN_SUPPORTED_NUM_BITS = [4, 8]
GPTQ_MARLIN_SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]
GPTQ_MARLIN_SUPPORTED_SYM = [True]
def is_marlin_supported():
return __cuda_arch[0] >= 8
def marlin_permute_weights(q_w, size_k, size_n, perm, tile=MARLIN_TILE):
assert q_w.shape == (size_k, size_n)
assert size_k % tile == 0, f"size_k = {size_k}, tile = {tile}"
    assert size_n % tile == 0, f"size_n = {size_n}, tile = {tile}"
# Permute weights to 16x64 marlin tiles
q_w = q_w.reshape((size_k // tile, tile, size_n // tile, tile))
q_w = q_w.permute((0, 2, 1, 3))
q_w = q_w.reshape((size_k // tile, size_n * tile))
q_w = q_w.reshape((-1, perm.numel()))[:, perm].reshape(q_w.shape)
return q_w
def marlin_weights(q_w, size_k, size_n, num_bits, perm):
# Permute
q_w = marlin_permute_weights(q_w, size_k, size_n, perm)
# Pack
pack_factor = get_pack_factor(num_bits)
orig_device = q_w.device
q_w = q_w.cpu().numpy().astype(numpy.uint32)
q_packed = numpy.zeros((q_w.shape[0], q_w.shape[1] // pack_factor),
dtype=numpy.uint32)
for i in range(pack_factor):
q_packed |= q_w[:, i::pack_factor] << num_bits * i
q_packed = torch.from_numpy(q_packed.astype(numpy.int32)).to(orig_device)
return q_packed
def marlin_permute_scales(s, size_k, size_n, group_size, scale_perm,
scale_perm_single):
if group_size < size_k and group_size != -1:
s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
else:
s = s.reshape((-1, len(scale_perm_single)))[:, scale_perm_single]
s = s.reshape((-1, size_n)).contiguous()
return s
def marlin_quantize(
w: torch.Tensor,
num_bits: int,
group_size: int,
act_order: bool,
):
size_k, size_n = w.shape
# Normalize group_size
if group_size == -1:
group_size = size_k
assert group_size <= size_k
# Quantize (and apply act_order if provided)
w_ref, q_w, s, g_idx, rand_perm = quantize_weights(w, num_bits, group_size,
act_order)
# For act_order, sort the "weights" and "g_idx" so that group ids are
# increasing
sort_indices = torch.empty(0, dtype=torch.int, device=w.device)
if act_order:
q_w, g_idx, sort_indices = sort_weights(q_w, g_idx)
# Reformat to marlin
marlin_q_w = marlin_weights(q_w, size_k, size_n, num_bits,
marlin_perm[num_bits])
marlin_s = marlin_permute_scales(s, size_k, size_n, group_size,
marlin_scale_perm[num_bits],
marlin_scale_perm_single[num_bits])
# Create result
res_list = [w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, rand_perm]
for i in range(len(res_list)):
res_list[i] = res_list[i].to(w.device)
return res_list
def inject_24(w, size_k, size_n):
assert w.shape == (size_k, size_n)
mask = mask_creator(w.t()).t().cuda().bool()
return (mask * w).contiguous(), mask.contiguous()
def check_24(w, num_rows_to_sample=50, _verbose=False):
BLOCK_SIZE = 4
MAX_NON_ZEROS = 2
w = w.t().contiguous()
print("check_24: w.shape = {}".format(w.shape))
num_rows, num_cols = w.shape
sampled_row_idxs = random.choices(range(num_rows), k=num_rows_to_sample)
if _verbose:
print(f"Sampled row idxs = {sampled_row_idxs}")
total_segments = 0
non_24_segments = 0
for i in sampled_row_idxs:
for j in range(0, num_cols - BLOCK_SIZE, BLOCK_SIZE):
total_segments += 1
block = w[i, j:j + BLOCK_SIZE]
num_nonzero = torch.count_nonzero(block)
if num_nonzero > MAX_NON_ZEROS:
print("i = {} j = {} block = {}".format(i, j, block))
non_24_segments += 1
print(f"{non_24_segments} / {total_segments} do not have 2:4 structure.")
def compress_quantized_24_weight(q_24, size_k, size_n, num_bits):
assert q_24.shape == (size_k, size_n)
# Remove zp to normalize over 0
max_q_val = (1 << num_bits) - 1
zp = (max_q_val + 1) // 2
q_24_no_zp = q_24 - zp
# Compress
q_24_no_zp = q_24_no_zp.t().contiguous()
q_24_no_zp_comp, meta = sparse_semi_structured_from_dense_cutlass(
q_24_no_zp)
q_24_no_zp_comp = q_24_no_zp_comp.t().contiguous()
# Restore zp
q_24_comp = q_24_no_zp_comp + zp
# Resize meta to its actual shape (without moving any data)
meta = meta.resize_(meta.shape[1] // 2, meta.shape[0] * 2)
return q_24_comp, meta
def marlin_24_quantize(
w: torch.Tensor,
num_bits: int,
group_size: int,
):
size_k, size_n = w.shape
# Normalize group_size
if group_size == -1:
group_size = size_k
assert group_size <= size_k
# Inject 2:4 sparsity
w_24, mask_24 = inject_24(w, size_k, size_n)
# Quantize
w_24_ref, q_w_24, s, g_idx, rand_perm = quantize_weights(w_24,
num_bits,
group_size,
act_order=False)
# Compress quantized weight
q_w_24_comp, meta = compress_quantized_24_weight(q_w_24, size_k, size_n,
num_bits)
size_k_comp = size_k // 2
# Reformat to marlin
marlin_24_q_w_comp = marlin_weights(q_w_24_comp, size_k_comp, size_n,
num_bits, marlin_24_perm[num_bits])
marlin_24_s = marlin_permute_scales(s, size_k, size_n, group_size,
marlin_24_scale_perm[num_bits],
marlin_24_scale_perm_single[num_bits])
# Create result
res_list = [w_24_ref, marlin_24_q_w_comp, meta, marlin_24_s]
for i in range(len(res_list)):
res_list[i] = res_list[i].to(w.device)
return res_list
def compute_max_diff(output, output_ref):
return torch.mean(torch.abs(output - output_ref)) / torch.mean(
torch.abs(output_ref))
class MarlinWorkspace:
def __init__(self, out_features, min_thread_n, max_parallel):
assert (out_features % min_thread_n == 0), (
            "out_features = {} is not divisible by min_thread_n = {}".format(
out_features, min_thread_n))
max_workspace_size = ((out_features // min_thread_n) * max_parallel)
self.scratch = torch.zeros(max_workspace_size,
dtype=torch.int,
device="cuda")

View file

@ -0,0 +1,146 @@
"""This file is used for /tests and /benchmarks"""
import numpy
import torch
SUPPORTED_NUM_BITS = [4, 8]
SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]
def get_pack_factor(num_bits):
assert num_bits in SUPPORTED_NUM_BITS, f"Unsupported num_bits = {num_bits}"
return 32 // num_bits
def permute_rows(q_w: torch.Tensor, w_ref: torch.Tensor, group_size: int):
assert q_w.shape == w_ref.shape
orig_device = q_w.device
k_size, _ = q_w.shape
g_idx = torch.zeros((k_size, ), dtype=torch.int32)
for i in range(k_size):
g_idx[i] = i // group_size
# Simulate act_order by doing a random permutation on K
rand_perm = torch.randperm(k_size)
g_idx = g_idx[rand_perm].contiguous()
q_w = q_w[rand_perm, :].contiguous()
w_ref = w_ref[rand_perm, :].contiguous()
return (
w_ref.to(device=orig_device),
q_w.to(device=orig_device),
g_idx.to(device=orig_device),
rand_perm.to(device=orig_device),
)
def quantize_weights(w: torch.Tensor, num_bits: int, group_size: int,
act_order: bool):
orig_device = w.device
size_k, size_n = w.shape
assert w.is_floating_point(), "w must be float"
assert num_bits in SUPPORTED_NUM_BITS, f"Unsupported num_bits = {num_bits}"
assert group_size in SUPPORTED_GROUP_SIZES + [
size_k
], f"Unsupported groupsize = {group_size}"
if group_size == -1:
group_size = size_k
assert group_size <= size_k
max_q_val = 2**num_bits - 1
half_q_val = (max_q_val + 1) // 2
# Reshape to [groupsize, -1]
if group_size < size_k:
w = w.view((-1, group_size, size_n))
w = w.permute(1, 0, 2)
w = w.reshape((group_size, -1))
# Compute scale for each group
s = torch.max(torch.abs(w), 0, keepdim=True)[0]
s *= 2 / max_q_val # 2 => symmetric
# Quantize
q_w = torch.round(w / s).int()
q_w += half_q_val
q_w = torch.clamp(q_w, 0, max_q_val)
# Compute ref (dequantized)
w_ref = (q_w - half_q_val).half() * s
# Restore original shapes
if group_size < size_k:
def reshape_w(w):
w = w.reshape((group_size, -1, size_n))
w = w.permute(1, 0, 2)
w = w.reshape((size_k, size_n)).contiguous()
return w
q_w = reshape_w(q_w)
w_ref = reshape_w(w_ref)
s = s.reshape((-1, size_n)).contiguous()
# Apply act_order
g_idx = torch.empty(0, dtype=torch.int, device=w.device)
rand_perm = torch.empty(0, dtype=torch.int, device=w.device)
if act_order:
assert (
group_size < size_k
), "For act_order, groupsize = {} must be less than size_k = {}".format(
group_size, size_k)
w_ref, q_w, g_idx, rand_perm = permute_rows(q_w, w_ref, group_size)
return (
w_ref.to(device=orig_device),
q_w.to(device=orig_device),
s.to(device=orig_device),
g_idx.to(device=orig_device),
rand_perm.to(device=orig_device),
)
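The symmetric per-group quantization step inside `quantize_weights()` can be sketched for a single group in plain Python (our own helper name; scalars instead of tensors):

```python
# Illustrative pure-Python version of the symmetric quantization step
# in quantize_weights() for one group of weights.
def quantize_group(ws, num_bits):
    maxq = 2 ** num_bits - 1
    half = (maxq + 1) // 2
    scale = max(abs(w) for w in ws) * 2 / maxq  # symmetric: 2 * max|w| / maxq
    q = [min(max(round(w / scale) + half, 0), maxq) for w in ws]
    dequant = [(qi - half) * scale for qi in q]  # reference reconstruction
    return q, scale, dequant

q, scale, deq = quantize_group([-1.0, 0.25, 1.0], 4)
```

Shifting by `half` centers the symmetric range on the midpoint of the unsigned integer grid, matching the `half_q_val` logic above.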
def sort_weights(q_w: torch.Tensor, g_idx: torch.Tensor):
orig_device = q_w.device
sort_indices = torch.argsort(g_idx).to(
dtype=torch.int32) # Sort based on g_idx
g_idx = g_idx[sort_indices].contiguous()
q_w = q_w[sort_indices, :].contiguous()
return (
q_w.to(device=orig_device),
g_idx.to(device=orig_device),
sort_indices.to(device=orig_device),
)
def gptq_pack(
q_w: torch.Tensor,
num_bits: int,
size_k: int,
size_n: int,
):
assert q_w.shape == (size_k, size_n)
pack_factor = get_pack_factor(num_bits)
assert size_k % pack_factor == 0
orig_device = q_w.device
q_w = q_w.cpu().numpy().astype(numpy.uint32)
q_res = numpy.zeros((size_k // pack_factor, size_n), dtype=numpy.uint32)
for i in range(pack_factor):
q_res |= q_w[i::pack_factor, :] << num_bits * i
q_res = torch.from_numpy(q_res.astype(numpy.int32)).to(orig_device)
return q_res
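The packing loop in `gptq_pack()` above combines `pack_factor = 32 // num_bits` consecutive rows into one 32-bit word per column, with row `i` shifted left by `num_bits * i`. A pure-Python sketch of the same scheme for one column (illustrative helper, no numpy):

```python
# Pure-Python sketch of the row-packing scheme used by gptq_pack():
# every group of pack_factor values becomes one 32-bit word, value i
# occupying bits [num_bits*i, num_bits*(i+1)).
def pack_column(values, num_bits):
    pack_factor = 32 // num_bits
    assert len(values) % pack_factor == 0
    words = []
    for g in range(0, len(values), pack_factor):
        word = 0
        for i, v in enumerate(values[g:g + pack_factor]):
            word |= v << (num_bits * i)
        words.append(word)
    return words

print(hex(pack_column(list(range(8)), 4)[0]))  # 0x76543210
```

Reading the packed word in hex makes the layout obvious: the first value lands in the lowest nibble, the eighth in the highest.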

View file

@ -0,0 +1,32 @@
/**
* @Description :
* @Author : chenht2022
* @Date : 2024-07-12 10:07:58
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:34:55
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#ifndef CPUINFER_CONVERSION_H
#define CPUINFER_CONVERSION_H
#include <memory.h>
#include "llama.cpp/ggml.h"
inline void to_float(const void* input, float* output, int size, ggml_type type) {
if (type == ggml_type::GGML_TYPE_F32) {
memcpy(output, input, size * sizeof(float));
} else {
ggml_internal_get_type_traits(type).to_float(input, output, size);
}
}
inline void from_float(const float* input, void* output, int size, ggml_type type) {
if (type == ggml_type::GGML_TYPE_F32) {
memcpy(output, input, size * sizeof(float));
} else {
ggml_internal_get_type_traits(type).from_float(input, output, size);
}
}
#endif

View file

@ -0,0 +1,47 @@
/**
* @Description :
* @Author : chenht2022
* @Date : 2024-07-12 10:07:58
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:34:58
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "linear.h"
Linear::Linear(LinearConfig config) {
config_ = config;
proj_ = config_.proj;
input_fp32_.resize(config_.input_size);
proj_input_.resize(config_.input_size * 4);
proj_output_.resize(config_.output_size);
}
void Linear::warm_up(Backend* backend) {
std::vector<float> input_fp32(config_.input_size);
std::vector<uint8_t> input(config_.input_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type));
std::vector<uint8_t> output(config_.output_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type));
for (int i = 0; i < config_.input_size; i++) {
input_fp32[i] = 0;
}
from_float(input_fp32.data(), input.data(), config_.input_size, config_.hidden_type);
forward(input.data(), output.data(), backend);
}
void Linear::forward(const void* input, void* output, Backend* backend) {
const void* proj_input_ptr;
if (config_.hidden_type == ggml_internal_get_type_traits(config_.proj_type).vec_dot_type) {
proj_input_ptr = input;
} else {
to_float(input, input_fp32_.data(), config_.input_size, config_.hidden_type);
from_float(input_fp32_.data(), proj_input_.data(), config_.input_size, ggml_internal_get_type_traits(config_.proj_type).vec_dot_type);
proj_input_ptr = proj_input_.data();
}
int nth = config_.output_size / config_.stride;
backend->do_work_stealing_job(nth, [&](int task_id) {
int ith = task_id % nth;
        llamafile_sgemm(config_.output_size, 1,
                        config_.input_size / ggml_blck_size(config_.proj_type),
                        proj_, config_.input_size / ggml_blck_size(config_.proj_type),
                        proj_input_ptr, config_.input_size / ggml_blck_size(config_.proj_type),
                        proj_output_.data(), config_.output_size,
                        ith, nth, GGML_TASK_TYPE_COMPUTE,
                        config_.proj_type,
                        ggml_internal_get_type_traits(config_.proj_type).vec_dot_type,
                        GGML_TYPE_F32, GGML_PREC_DEFAULT);
});
from_float(proj_output_.data(), output, config_.output_size, config_.hidden_type);
}


@ -0,0 +1,55 @@
/**
* @Description :
* @Author : chenht2022
* @Date : 2024-07-12 10:07:58
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:35:00
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#ifndef CPUINFER_OPERATOR_LINEAR_H
#define CPUINFER_OPERATOR_LINEAR_H
#include <cmath>
#include <cstdio>
#include <functional>
#include <mutex>
#include <vector>
#include "../../cpu_backend/backend.h"
#include "conversion.h"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"
struct LinearConfig {
int input_size;
int output_size;
int stride;
void* proj;
ggml_type proj_type;
ggml_type hidden_type;
LinearConfig() {}
LinearConfig(int input_size, int output_size, int stride, void* proj, ggml_type proj_type, ggml_type hidden_type)
: input_size(input_size), output_size(output_size), stride(stride), proj(proj), proj_type(proj_type), hidden_type(hidden_type) {}
};
class Linear {
public:
Linear(LinearConfig);
void warm_up(Backend* backend);
void forward(const void* input, void* output, Backend* backend);
private:
LinearConfig config_;
void* proj_; // [output_size * input_size ( /32 if quantized)]
std::vector<float> input_fp32_; // [input_size]
std::vector<uint8_t> proj_input_; // [input_size * 4]
std::vector<float> proj_output_; // [output_size]
};
#endif


@ -0,0 +1,103 @@
/**
* @Description :
* @Author : chenht2022
* @Date : 2024-07-16 10:43:18
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:35:04
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "mlp.h"
MLP::MLP(MLPConfig config) {
config_ = config;
gate_proj_ = config_.gate_proj;
up_proj_ = config_.up_proj;
down_proj_ = config_.down_proj;
input_fp32_.resize(config_.hidden_size);
gate_input_.resize(config_.hidden_size * 4);
up_input_.resize(config_.hidden_size * 4);
gate_output_.resize(config_.intermediate_size);
up_output_.resize(config_.intermediate_size);
intermediate_fp32_.resize(config_.intermediate_size);
down_input_.resize(config_.intermediate_size * 4);
down_output_.resize(config_.hidden_size);
}
void MLP::warm_up(Backend* backend) {
std::vector<float> input_fp32(config_.hidden_size);
std::vector<uint8_t> input(config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type));
std::vector<uint8_t> output(config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type));
for (int i = 0; i < config_.hidden_size; i++) {
input_fp32[i] = 0;
}
from_float(input_fp32.data(), input.data(), config_.hidden_size, config_.hidden_type);
forward(input.data(), output.data(), backend);
}
static float act_fn(float x) {
return x / (1.0f + expf(-x));
}
void MLP::forward(const void* input, void* output, Backend* backend) {
const void* gate_input_ptr;
const void* up_input_ptr;
if (config_.hidden_type == ggml_internal_get_type_traits(config_.gate_type).vec_dot_type && config_.hidden_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
gate_input_ptr = up_input_ptr = input;
} else {
to_float(input, input_fp32_.data(), config_.hidden_size, config_.hidden_type);
if (ggml_internal_get_type_traits(config_.gate_type).vec_dot_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
from_float(input_fp32_.data(), gate_input_.data(), config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
gate_input_ptr = up_input_ptr = gate_input_.data();
} else {
if (config_.hidden_type != ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) {
from_float(input_fp32_.data(), gate_input_.data(), config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
gate_input_ptr = gate_input_.data();
} else {
gate_input_ptr = input;
}
if (config_.hidden_type != ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
from_float(input_fp32_.data(), up_input_.data(), config_.hidden_size, ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
up_input_ptr = up_input_.data();
} else {
up_input_ptr = input;
}
}
}
int nth = config_.intermediate_size / config_.stride;
backend->do_work_stealing_job(nth, [&](int task_id) {
int ith = task_id;
void* gate_proj_ptr = gate_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
float* gate_output_ptr = gate_output_.data() + ith * config_.stride;
llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
void* up_proj_ptr = up_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
float* up_output_ptr = up_output_.data() + ith * config_.stride;
llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
intermediate_fp32_[i] = act_fn(gate_output_[i]) * up_output_[i];
}
if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) == 0) {
float* intermediate_fp32_ptr = intermediate_fp32_.data() + ith * config_.stride;
void* down_input_ptr = down_input_.data() + ith * config_.stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
from_float(intermediate_fp32_ptr, down_input_ptr, config_.stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
}
});
if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) != 0) {
from_float(intermediate_fp32_.data(), down_input_.data(), config_.intermediate_size, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
}
nth = config_.hidden_size / config_.stride;
backend->do_work_stealing_job(nth, [&](int task_id) {
int ith = task_id;
void* down_proj_ptr = down_proj_ + ith * config_.stride * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
float* down_output_ptr = down_output_.data() + ith * config_.stride;
llamafile_sgemm(config_.stride, 1, config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_.data(), config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) {
void* output_ptr = output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
from_float(down_output_ptr, output_ptr, config_.stride, config_.hidden_type);
}
});
if (config_.stride % ggml_blck_size(config_.hidden_type) != 0) {
from_float(down_output_.data(), output, config_.hidden_size, config_.hidden_type);
}
}


@ -0,0 +1,66 @@
/**
* @Description :
* @Author : chenht2022
* @Date : 2024-07-12 10:07:58
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:35:06
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#ifndef CPUINFER_OPERATOR_MLP_H
#define CPUINFER_OPERATOR_MLP_H
#include <cmath>
#include <cstdio>
#include <functional>
#include <mutex>
#include <vector>
#include "../../cpu_backend/backend.h"
#include "conversion.h"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"
struct MLPConfig {
int hidden_size;
int intermediate_size;
int stride;
void* gate_proj;
void* up_proj;
void* down_proj;
ggml_type gate_type;
ggml_type up_type;
ggml_type down_type;
ggml_type hidden_type;
MLPConfig() {}
MLPConfig(int hidden_size, int intermediate_size, int stride, void* gate_proj, void* up_proj, void* down_proj, ggml_type gate_type, ggml_type up_type, ggml_type down_type, ggml_type hidden_type)
: hidden_size(hidden_size), intermediate_size(intermediate_size), stride(stride), gate_proj(gate_proj), up_proj(up_proj), down_proj(down_proj), gate_type(gate_type), up_type(up_type), down_type(down_type), hidden_type(hidden_type) {}
};
class MLP {
public:
MLP(MLPConfig);
void warm_up(Backend* backend);
void forward(const void* input, void* output, Backend* backend);
private:
MLPConfig config_;
void* gate_proj_; // [intermediate_size * hidden_size ( /32 if quantized)]
void* up_proj_; // [intermediate_size * hidden_size ( /32 if quantized)]
void* down_proj_; // [hidden_size * intermediate_size ( /32 if quantized)]
std::vector<float> input_fp32_; // [hidden_size]
std::vector<uint8_t> gate_input_; // [hidden_size * 4]
std::vector<uint8_t> up_input_; // [hidden_size * 4]
std::vector<float> gate_output_; // [intermediate_size]
std::vector<float> up_output_; // [intermediate_size]
std::vector<float> intermediate_fp32_; // [intermediate_size]
std::vector<uint8_t> down_input_; // [intermediate_size * 4]
std::vector<float> down_output_; // [hidden_size]
};
#endif


@ -0,0 +1,310 @@
/**
* @Description :
* @Author : chenht2022
* @Date : 2024-07-22 02:03:22
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:35:07
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "moe.h"
#include <cstring>  // memcpy in forward_many
#include <iostream>
#include <unistd.h>
void* MOE::buffer_ = nullptr;
MOE::MOE(MOEConfig config) {
config_ = config;
gate_proj_ = config_.gate_proj;
up_proj_ = config_.up_proj;
down_proj_ = config_.down_proj;
if (MOE::buffer_ == nullptr) {
uint64_t buffer_size = 0;
buffer_size += sizeof(float) * config_.group_max_len * config_.hidden_size;
buffer_size += config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
buffer_size += config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
buffer_size += config_.routed_expert_num * config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
buffer_size += config_.routed_expert_num * config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
buffer_size += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size;
buffer_size += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size;
buffer_size += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size;
buffer_size += config_.routed_expert_num * config_.group_max_len * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
buffer_size += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.hidden_size;
buffer_size += sizeof(float) * config_.group_max_len * config_.hidden_size;
buffer_ = malloc(buffer_size);
}
uint64_t offset = 0;
s_input_fp32_ = (float*)(buffer_ + offset);
offset += sizeof(float) * config_.hidden_size;
s_gate_input_ = (uint8_t*)(buffer_ + offset);
offset += config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
s_up_input_ = (uint8_t*)(buffer_ + offset);
offset += config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
s_gate_output_.resize(config_.routed_expert_num);
s_up_output_.resize(config_.routed_expert_num);
s_intermediate_fp32_.resize(config_.routed_expert_num);
s_down_input_.resize(config_.routed_expert_num);
s_down_output_.resize(config_.routed_expert_num);
for (int i = 0; i < config_.routed_expert_num; i++) {
s_gate_output_[i] = (float*)(buffer_ + offset);
offset += sizeof(float) * config_.intermediate_size;
s_up_output_[i] = (float*)(buffer_ + offset);
offset += sizeof(float) * config_.intermediate_size;
s_intermediate_fp32_[i] = (float*)(buffer_ + offset);
offset += sizeof(float) * config_.intermediate_size;
s_down_input_[i] = (uint8_t*)(buffer_ + offset);
offset += config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
s_down_output_[i] = (float*)(buffer_ + offset);
offset += sizeof(float) * config_.hidden_size;
}
s_output_fp32_ = (float*)(buffer_ + offset);
// Note: the batched (m_*) views below reuse the arena from offset 0 and
// overlap the single-token (s_*) views; the two paths are not used concurrently.
offset = 0;
m_input_fp32_.resize(config_.group_max_len);
m_gate_input_.resize(config_.group_max_len);
m_up_input_.resize(config_.group_max_len);
for (int i = 0; i < config_.group_max_len; i++) {
m_input_fp32_[i] = (float*)(buffer_ + offset);
offset += sizeof(float) * config_.hidden_size;
m_gate_input_[i] = (uint8_t*)(buffer_ + offset);
offset += config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
m_up_input_[i] = (uint8_t*)(buffer_ + offset);
offset += config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
}
m_local_gate_input_ = (uint8_t*)(buffer_ + offset);
offset += config_.routed_expert_num * config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
m_local_up_input_ = (uint8_t*)(buffer_ + offset);
offset += config_.routed_expert_num * config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
m_local_gate_output_ = (float*)(buffer_ + offset);
offset += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size;
m_local_up_output_ = (float*)(buffer_ + offset);
offset += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size;
m_local_intermediate_fp32_ = (float*)(buffer_ + offset);
offset += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size;
m_local_down_input_ = (uint8_t*)(buffer_ + offset);
offset += config_.routed_expert_num * config_.group_max_len * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
m_local_down_output_ = (float*)(buffer_ + offset);
offset += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.hidden_size;
m_output_fp32_.resize(config_.group_max_len);
for (int i = 0; i < config_.group_max_len; i++) {
m_output_fp32_[i] = (float*)(buffer_ + offset);
offset += sizeof(float) * config_.hidden_size;
}
m_local_pos_.resize(config_.group_max_len);
for (int i = 0; i < config_.group_max_len; i++) {
m_local_pos_[i].resize(config_.expert_num);  // resize, not reserve: forward_many writes elements via operator[]
}
m_local_num_.resize(config_.expert_num);
m_local_gate_input_ptr_.resize(config_.expert_num);
m_local_up_input_ptr_.resize(config_.expert_num);
m_local_gate_output_ptr_.resize(config_.expert_num);
m_local_up_output_ptr_.resize(config_.expert_num);
m_local_intermediate_fp32_ptr_.resize(config_.expert_num);
m_local_down_input_ptr_.resize(config_.expert_num);
m_local_down_output_ptr_.resize(config_.expert_num);
}
void MOE::warm_up(Backend* backend) {
std::vector<float> input_fp32(config_.hidden_size);
std::vector<uint8_t> input(config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type));
std::vector<uint8_t> output(config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type));
for (int i = 0; i < config_.hidden_size; i++) {
input_fp32[i] = 0;
}
from_float(input_fp32.data(), input.data(), config_.hidden_size, config_.hidden_type);
for (int i = 0; i < config_.expert_num; i++) {
uint64_t expert_ids = i;
float weights = 0;
forward_one(1, &expert_ids, &weights, input.data(), output.data(), backend);
}
}
static float act_fn(float x) {
return x / (1.0f + expf(-x));
}
void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend) {
const void* gate_input_ptr;
const void* up_input_ptr;
if (config_.hidden_type == ggml_internal_get_type_traits(config_.gate_type).vec_dot_type && config_.hidden_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
gate_input_ptr = up_input_ptr = input;
} else {
to_float(input, s_input_fp32_, config_.hidden_size, config_.hidden_type);
if (ggml_internal_get_type_traits(config_.gate_type).vec_dot_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
from_float(s_input_fp32_, s_gate_input_, config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
gate_input_ptr = up_input_ptr = s_gate_input_;
} else {
if (config_.hidden_type != ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) {
from_float(s_input_fp32_, s_gate_input_, config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
gate_input_ptr = s_gate_input_;
} else {
gate_input_ptr = input;
}
if (config_.hidden_type != ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
from_float(s_input_fp32_, s_up_input_, config_.hidden_size, ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
up_input_ptr = s_up_input_;
} else {
up_input_ptr = input;
}
}
}
int nth = config_.intermediate_size / config_.stride;
backend->do_work_stealing_job(nth * k, [&](int task_id) {
int expert_idx = task_id / nth;
uint64_t expert_id = expert_ids[expert_idx];
int ith = task_id % nth;
void* gate_proj_ptr = gate_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
float* gate_output_ptr = s_gate_output_[expert_idx] + ith * config_.stride;
llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
void* up_proj_ptr = up_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
float* up_output_ptr = s_up_output_[expert_idx] + ith * config_.stride;
llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
s_intermediate_fp32_[expert_idx][i] = act_fn(s_gate_output_[expert_idx][i]) * s_up_output_[expert_idx][i];
}
if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) == 0) {
float* intermediate_fp32_ptr = s_intermediate_fp32_[expert_idx] + ith * config_.stride;
void* down_input_ptr = s_down_input_[expert_idx] + ith * config_.stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
from_float(intermediate_fp32_ptr, down_input_ptr, config_.stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
}
});
if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) != 0) {
for (int i = 0; i < k; i++) {
from_float(s_intermediate_fp32_[i], s_down_input_[i], config_.intermediate_size, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
}
}
nth = config_.hidden_size / config_.stride;
backend->do_work_stealing_job(nth, [&](int task_id) {
int ith = task_id;
for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
s_output_fp32_[i] = 0;
}
for (int expert_idx = 0; expert_idx < k; expert_idx++) {
uint64_t expert_id = expert_ids[expert_idx];
void* down_proj_ptr = down_proj_ + (expert_id * config_.hidden_size + ith * config_.stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
float* down_output_ptr = s_down_output_[expert_idx] + ith * config_.stride;
llamafile_sgemm(config_.stride, 1, config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), s_down_input_[expert_idx], config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
s_output_fp32_[i] += s_down_output_[expert_idx][i] * weights[expert_idx];
}
}
if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) {
float* output_fp32_ptr = s_output_fp32_ + ith * config_.stride;
void* output_ptr = output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
}
});
if (config_.stride % ggml_blck_size(config_.hidden_type) != 0) {
from_float(s_output_fp32_, output, config_.hidden_size, config_.hidden_type);
}
}
void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend) {
for (int i = 0; i < config_.expert_num; i++) {
m_local_num_[i] = 0;
}
for (int i = 0; i < qlen; i++) {
for (int j = 0; j < k; j++) {
m_local_pos_[i][j] = m_local_num_[expert_ids[i * k + j]]++;
}
}
uint64_t offset = 0;
for (int i = 0; i < config_.expert_num; i++) {
m_local_gate_input_ptr_[i] = m_local_gate_input_ + offset * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
m_local_up_input_ptr_[i] = m_local_up_input_ + offset * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
m_local_gate_output_ptr_[i] = m_local_gate_output_ + offset * config_.intermediate_size;
m_local_up_output_ptr_[i] = m_local_up_output_ + offset * config_.intermediate_size;
m_local_intermediate_fp32_ptr_[i] = m_local_intermediate_fp32_ + offset * config_.intermediate_size;
m_local_down_input_ptr_[i] = m_local_down_input_ + offset * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
m_local_down_output_ptr_[i] = m_local_down_output_ + offset * config_.hidden_size;
offset += m_local_num_[i];
}
backend->do_work_stealing_job(qlen, [&](int i) {
const void* gate_input_ptr;
const void* up_input_ptr;
if (config_.hidden_type == ggml_internal_get_type_traits(config_.gate_type).vec_dot_type && config_.hidden_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
gate_input_ptr = up_input_ptr = input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
} else {
to_float(input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), m_input_fp32_[i], config_.hidden_size, config_.hidden_type);
if (ggml_internal_get_type_traits(config_.gate_type).vec_dot_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
from_float(m_input_fp32_[i], m_gate_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
gate_input_ptr = up_input_ptr = m_gate_input_[i];
} else {
if (config_.hidden_type != ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) {
from_float(m_input_fp32_[i], m_gate_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
gate_input_ptr = m_gate_input_[i];
} else {
gate_input_ptr = input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
}
if (config_.hidden_type != ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
from_float(m_input_fp32_[i], m_up_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
up_input_ptr = m_up_input_[i];
} else {
up_input_ptr = input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
}
}
}
for (int j = 0; j < k; j++) {
memcpy(m_local_gate_input_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type), gate_input_ptr, config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type));
memcpy(m_local_up_input_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type), up_input_ptr, config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type));
}
});
int stride = QK_K;  // the batched path partitions by a fixed QK_K-sized chunk rather than config_.stride
int nth = config_.intermediate_size / stride;
backend->do_work_stealing_job(nth * config_.expert_num, [&](int task_id) {
int expert_idx = task_id / nth;
int ith = task_id % nth;
void* gate_input_ptr = m_local_gate_input_ptr_[expert_idx];
void* gate_proj_ptr = gate_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
float* gate_output_ptr = m_local_gate_output_ptr_[expert_idx] + ith * stride;
llamafile_sgemm(stride, m_local_num_[expert_idx], config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
void* up_input_ptr = m_local_up_input_ptr_[expert_idx];
void* up_proj_ptr = up_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
float* up_output_ptr = m_local_up_output_ptr_[expert_idx] + ith * stride;
llamafile_sgemm(stride, m_local_num_[expert_idx], config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
for (int i = 0; i < m_local_num_[expert_idx]; i++) {
for (int j = ith * stride; j < (ith + 1) * stride; j++) {
m_local_intermediate_fp32_ptr_[expert_idx][i * config_.intermediate_size + j] = act_fn(m_local_gate_output_ptr_[expert_idx][i * config_.intermediate_size + j]) * m_local_up_output_ptr_[expert_idx][i * config_.intermediate_size + j];
}
float* intermediate_fp32_ptr = m_local_intermediate_fp32_ptr_[expert_idx] + i * config_.intermediate_size + ith * stride;
void* down_input_ptr = m_local_down_input_ptr_[expert_idx] + i * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) + ith * stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
from_float(intermediate_fp32_ptr, down_input_ptr, stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
}
});
stride = QK_K;
nth = config_.hidden_size / stride;
backend->do_work_stealing_job(nth * config_.expert_num, [&](int task_id) {
int expert_idx = task_id / nth;
int ith = task_id % nth;
void* down_input_ptr = m_local_down_input_ptr_[expert_idx];
void* down_proj_ptr = down_proj_ + (expert_idx * config_.hidden_size + ith * stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
float* down_output_ptr = m_local_down_output_ptr_[expert_idx] + ith * stride;
llamafile_sgemm(stride, m_local_num_[expert_idx], config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.hidden_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
});
backend->do_work_stealing_job(qlen, [&](int i) {
for (int e = 0; e < config_.hidden_size; e++) {
m_output_fp32_[i][e] = 0;
}
for (int j = 0; j < k; j++) {
for (int e = 0; e < config_.hidden_size; e++) {
m_output_fp32_[i][e] += m_local_down_output_ptr_[expert_ids[i * k + j]][m_local_pos_[i][j] * config_.hidden_size + e] * weights[i * k + j];
}
}
from_float(m_output_fp32_[i], output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), config_.hidden_size, config_.hidden_type);
});
}
void MOE::forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend) {
if (qlen < config_.group_min_len) {
for (int i = 0; i < qlen; i++) {
forward_one(k, expert_ids + i * k, weights + i * k, input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
}
return;
}
int forward_len = std::min(config_.group_max_len, qlen);
forward_many(forward_len, k, expert_ids, weights, input, output, backend);
forward(qlen - forward_len, k, expert_ids + forward_len * k, weights + forward_len * k, input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
}
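MOE::forward's batching policy reduces to a small recursion: below group_min_len tokens are handled one by one via forward_one, otherwise groups of at most group_max_len go through forward_many and the remainder recurses. A sketch that returns the planned kernel calls (function name is ours):

```python
def plan_forward(qlen, group_min_len, group_max_len, offset=0):
    """Return the calls MOE::forward would issue, as
    ('one', token_index) or ('many', offset, length) tuples."""
    if qlen < group_min_len:
        return [("one", offset + i) for i in range(qlen)]
    taken = min(group_max_len, qlen)
    return [("many", offset, taken)] + plan_forward(
        qlen - taken, group_min_len, group_max_len, offset + taken)
```

For example, plan_forward(10, 4, 4) yields two 4-token groups followed by two single-token calls.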


@@ -0,0 +1,96 @@
/**
* @Description :
* @Author : chenht2022
* @Date : 2024-07-22 02:03:22
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:35:10
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#ifndef CPUINFER_OPERATOR_MOE_H
#define CPUINFER_OPERATOR_MOE_H
#include <cmath>
#include <cstdio>
#include <functional>
#include <mutex>
#include <vector>
#include "../../cpu_backend/backend.h"
#include "conversion.h"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"
struct MOEConfig {
int expert_num;
int routed_expert_num;
int hidden_size;
int intermediate_size;
int stride;
int group_min_len;
int group_max_len;
void* gate_proj;
void* up_proj;
void* down_proj;
ggml_type gate_type;
ggml_type up_type;
ggml_type down_type;
ggml_type hidden_type;
MOEConfig() {}
MOEConfig(int expert_num, int routed_expert_num, int hidden_size, int intermediate_size, int stride, int group_min_len, int group_max_len, void* gate_proj, void* up_proj, void* down_proj, ggml_type gate_type, ggml_type up_type, ggml_type down_type, ggml_type hidden_type)
: expert_num(expert_num), routed_expert_num(routed_expert_num), hidden_size(hidden_size), intermediate_size(intermediate_size), stride(stride), group_min_len(group_min_len), group_max_len(group_max_len), gate_proj(gate_proj), up_proj(up_proj), down_proj(down_proj), gate_type(gate_type), up_type(up_type), down_type(down_type), hidden_type(hidden_type) {}
};
class MOE {
public:
MOE(MOEConfig);
void warm_up(Backend* backend);
void forward_one(int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend);
void forward_many(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend);
void forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend);
private:
static void* buffer_;
MOEConfig config_;
void* gate_proj_; // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
void* up_proj_; // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
void* down_proj_; // [expert_num * hidden_size * intermediate_size ( /32 if quantized)]
float* s_input_fp32_; // [hidden_size]
uint8_t* s_gate_input_; // [hidden_size * ggml_type_size(ggml_internal_get_type_traits(gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(gate_type).vec_dot_type)]
uint8_t* s_up_input_; // [hidden_size * ggml_type_size(ggml_internal_get_type_traits(up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(up_type).vec_dot_type)]
std::vector<float*> s_gate_output_; // [routed_expert_num, intermediate_size]
std::vector<float*> s_up_output_; // [routed_expert_num, intermediate_size]
std::vector<float*> s_intermediate_fp32_; // [routed_expert_num, intermediate_size]
std::vector<uint8_t*> s_down_input_; // [routed_expert_num, intermediate_size * ggml_type_size(ggml_internal_get_type_traits(down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(down_type).vec_dot_type)]
std::vector<float*> s_down_output_; // [routed_expert_num, hidden_size]
float* s_output_fp32_; // [hidden_size]
std::vector<float*> m_input_fp32_; // [group_max_len, hidden_size]
std::vector<uint8_t*> m_gate_input_; // [group_max_len, hidden_size * ggml_type_size(ggml_internal_get_type_traits(gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(gate_type).vec_dot_type)]
std::vector<uint8_t*> m_up_input_; // [group_max_len, hidden_size * ggml_type_size(ggml_internal_get_type_traits(up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(up_type).vec_dot_type)]
uint8_t* m_local_gate_input_; // [routed_expert_num * group_max_len * hidden_size * ggml_type_size(ggml_internal_get_type_traits(gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(gate_type).vec_dot_type)]
uint8_t* m_local_up_input_; // [routed_expert_num * group_max_len * hidden_size * ggml_type_size(ggml_internal_get_type_traits(up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(up_type).vec_dot_type)]
float* m_local_gate_output_; // [routed_expert_num * group_max_len * intermediate_size]
float* m_local_up_output_; // [routed_expert_num * group_max_len * intermediate_size]
float* m_local_intermediate_fp32_; // [routed_expert_num * group_max_len * intermediate_size]
uint8_t* m_local_down_input_; // [routed_expert_num * group_max_len * intermediate_size * ggml_type_size(ggml_internal_get_type_traits(down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(down_type).vec_dot_type)]
float* m_local_down_output_; // [routed_expert_num * group_max_len * hidden_size]
std::vector<float*> m_output_fp32_; // [group_max_len, hidden_size]
std::vector<std::vector<int>> m_local_pos_; // [group_max_len, routed_expert_num]
std::vector<int> m_local_num_; // [expert_num]
std::vector<uint8_t*> m_local_gate_input_ptr_; // [expert_num]
std::vector<uint8_t*> m_local_up_input_ptr_; // [expert_num]
std::vector<float*> m_local_gate_output_ptr_; // [expert_num]
std::vector<float*> m_local_up_output_ptr_; // [expert_num]
std::vector<float*> m_local_intermediate_fp32_ptr_; // [expert_num]
std::vector<uint8_t*> m_local_down_input_ptr_; // [expert_num]
std::vector<float*> m_local_down_output_ptr_; // [expert_num]
};
#endif
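The `/32 if quantized` comments and the `ggml_type_size / ggml_blck_size` expressions in this header all implement the same row-to-byte mapping. A sketch using GGML's Q4_K constants (a 256-weight block stored in 144 bytes; both values come from llama.cpp, not from this header):

```python
Q4_K_BLCK_SIZE = 256  # weights per quantization block (QK_K)
Q4_K_TYPE_SIZE = 144  # bytes per block_q4_K

def row_byte_offset(row, row_len,
                    type_size=Q4_K_TYPE_SIZE, blck_size=Q4_K_BLCK_SIZE):
    """Byte offset of `row` in a quantized matrix whose rows hold
    `row_len` weights: row * row_len * type_size / blck_size."""
    assert row_len % blck_size == 0, "rows must span whole blocks"
    return row * row_len * type_size // blck_size
```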

ktransformers/local_chat.py Normal file

@@ -0,0 +1,115 @@
# Copyright 2024 Shaoyuan Chen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import platform
import sys
project_dir = os.path.dirname(os.path.dirname(__file__))
sys.path.insert(0, project_dir)
import torch
import logging
from transformers import (
AutoTokenizer,
AutoConfig,
AutoModelForCausalLM,
GenerationConfig,
TextStreamer,
)
import json
import fire
from ktransformers.optimize.optimize import optimize_and_load_gguf
from ktransformers.models.modeling_deepseek import DeepseekV2ForCausalLM
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeForCausalLM
from ktransformers.util.utils import prefill_and_generate
from ktransformers.server.config.config import Config
custom_models = {
"DeepseekV2ForCausalLM": DeepseekV2ForCausalLM,
"Qwen2MoeForCausalLM": Qwen2MoeForCausalLM,
}
ktransformer_rules_dir = os.path.dirname(os.path.abspath(__file__)) + "/optimize/optimize_rules/"
default_optimize_rules = {
"DeepseekV2ForCausalLM": ktransformer_rules_dir + "DeepSeek-V2-Chat.yaml",
"Qwen2MoeForCausalLM": ktransformer_rules_dir + "Qwen2-57B-A14B-Instruct.yaml",
}
def local_chat(
model_path: str,
optimize_rule_path: str = None,
gguf_path: str = None,
max_new_tokens: int = 1000,
cpu_infer: int = Config().cpu_infer
):
torch.set_grad_enabled(False)
Config().cpu_infer = cpu_infer
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
torch.set_default_dtype(config.torch_dtype)
with torch.device("meta"):
if config.architectures[0] in custom_models:
print("using custom modeling_xxx.py.")
if "Qwen2Moe" in config.architectures[0]: # Qwen2Moe must use flash_attention_2 to avoid overflow.
config._attn_implementation = "flash_attention_2"
model = custom_models[config.architectures[0]](config)
else:
model = AutoModelForCausalLM.from_config(
config, trust_remote_code=True, attn_implementation="flash_attention_2"
)
if optimize_rule_path is None:
if config.architectures[0] in default_optimize_rules:
print("using default_optimize_rule for", config.architectures[0])
optimize_rule_path = default_optimize_rules[config.architectures[0]]
else:
optimize_rule_path = input(
"please input the path of your rule file (YAML file containing optimize rules): "
)
if gguf_path is None:
gguf_path = input(
"please input the path of your GGUF dir (all GGUF files in the directory must belong to the current model): "
)
optimize_and_load_gguf(model, optimize_rule_path, gguf_path, config)
model.generation_config = GenerationConfig.from_pretrained(model_path)
if model.generation_config.pad_token_id is None:
model.generation_config.pad_token_id = model.generation_config.eos_token_id
model.eval()
logging.basicConfig(level=logging.INFO)
system = platform.system()
if system == "Windows":
os.system('cls')
else:
os.system('clear')
while True:
content = input("Chat: ")
if content == "":
content = "Please write a piece of quicksort code in C++."
messages = [{"role": "user", "content": content}]
input_tensor = tokenizer.apply_chat_template(
messages, add_generation_prompt=True, return_tensors="pt"
)
torch.set_default_dtype(torch.bfloat16) # TODO: Remove this, replace dtype using config
generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens)
if __name__ == "__main__":
fire.Fire(local_chat)
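The rule-file selection in local_chat above is a dict lookup with an interactive fallback; a minimal sketch (function name is ours, `prompt` stands in for `input`):

```python
def resolve_optimize_rule(arch, explicit_path, defaults, prompt=input):
    """Pick the optimize-rule YAML: an explicit --optimize_rule_path wins,
    then the per-architecture default, then the user is asked."""
    if explicit_path is not None:
        return explicit_path
    if arch in defaults:
        return defaults[arch]
    return prompt("please input the path of your rule file (YAML): ")
```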


@@ -0,0 +1,207 @@
# Adapted from
# https://huggingface.co/deepseek-ai/DeepSeek-V2-Chat-0628/blob/main/configuration_deepseek.py
# Copyright 2023 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved.
# Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
class DeepseekV2Config(PretrainedConfig):
r"""
    This is the configuration class to store the configuration of a [`DeepseekV2Model`]. It is used to instantiate a DeepSeek
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the DeepSeek-V2.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 102400):
Vocabulary size of the DeepSeek model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`DeepseekV2Model`]
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 11008):
Dimension of the MLP representations.
moe_intermediate_size (`int`, *optional*, defaults to 1407):
Dimension of the MoE representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Transformer decoder.
n_shared_experts (`int`, *optional*, defaults to None):
Number of shared experts, None means dense model.
n_routed_experts (`int`, *optional*, defaults to None):
Number of routed experts, None means dense model.
routed_scaling_factor (`float`, *optional*, defaults to 1.0):
Scaling factor for routed experts.
topk_method (`str`, *optional*, defaults to `gready`):
Topk method used in routed gate.
n_group (`int`, *optional*, defaults to None):
Number of groups for routed experts.
topk_group (`int`, *optional*, defaults to None):
Number of selected groups for each token (ensuring that each token's selected experts fall within only `topk_group` groups).
num_experts_per_tok (`int`, *optional*, defaults to None):
Number of selected experts, None means dense model.
moe_layer_freq (`int`, *optional*, defaults to 1):
The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
first_k_dense_replace (`int`, *optional*, defaults to 0):
Number of dense layers in the shallow layers (embed->dense->dense->...->dense->moe->moe...->lm_head); the first k layers are dense.
norm_topk_prob (`bool`, *optional*, defaults to False):
Whether to normalize the weights of the routed experts.
scoring_func (`str`, *optional*, defaults to 'softmax'):
Method of computing expert weights.
aux_loss_alpha (`float`, *optional*, defaults to 0.001):
Auxiliary loss weight coefficient.
seq_aux (`bool`, *optional*, defaults to True):
Whether to compute the auxiliary loss for each individual sample.
num_key_value_heads (`int`, *optional*):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by mean-pooling all the original heads within that group. For more details, check out [this
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
`num_attention_heads`.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*):
Padding token id.
bos_token_id (`int`, *optional*, defaults to 1):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 2):
End of stream token id.
pretraining_tp (`int`, *optional*, defaults to 1):
Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
issue](https://github.com/pytorch/pytorch/issues/76232).
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie the input and output word embeddings.
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
`{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
`max_position_embeddings` to the expected new maximum.
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
```python
>>> from transformers import DeepseekV2Model, DeepseekV2Config
>>> # Initializing a DeepSeek-V2 style configuration
>>> configuration = DeepseekV2Config()
>>> # Initializing a model from the configuration
>>> model = DeepseekV2Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "deepseek_v2"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=102400,
hidden_size=4096,
intermediate_size=11008,
moe_intermediate_size = 1407,
num_hidden_layers=30,
num_attention_heads=32,
num_key_value_heads=32,
n_shared_experts = None,
n_routed_experts = None,
ep_size = 1,
routed_scaling_factor = 1.0,
kv_lora_rank = 512,
q_lora_rank = 1536,
qk_rope_head_dim = 64,
v_head_dim = 128,
qk_nope_head_dim = 128,
topk_method = 'gready',
n_group = None,
topk_group = None,
num_experts_per_tok = None,
moe_layer_freq = 1,
first_k_dense_replace = 0,
norm_topk_prob = False,
scoring_func = 'softmax',
aux_loss_alpha = 0.001,
seq_aux = True,
hidden_act="silu",
max_position_embeddings=2048,
initializer_range=0.02,
rms_norm_eps=1e-6,
use_cache=True,
pad_token_id=None,
bos_token_id=100000,
eos_token_id=100001,
pretraining_tp=1,
tie_word_embeddings=False,
rope_theta=10000.0,
rope_scaling=None,
attention_bias=False,
attention_dropout=0.0,
cpu_quant=None,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.moe_intermediate_size = moe_intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.n_shared_experts = n_shared_experts
self.n_routed_experts = n_routed_experts
self.ep_size = ep_size
self.routed_scaling_factor = routed_scaling_factor
self.kv_lora_rank = kv_lora_rank
self.q_lora_rank = q_lora_rank
self.qk_rope_head_dim = qk_rope_head_dim
self.v_head_dim = v_head_dim
self.qk_nope_head_dim = qk_nope_head_dim
self.topk_method = topk_method
self.n_group = n_group
self.topk_group = topk_group
self.num_experts_per_tok = num_experts_per_tok
self.moe_layer_freq = moe_layer_freq
self.first_k_dense_replace = first_k_dense_replace
self.norm_topk_prob = norm_topk_prob
self.scoring_func = scoring_func
self.aux_loss_alpha = aux_loss_alpha
self.seq_aux = seq_aux
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.pretraining_tp = pretraining_tp
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.cpu_quant = cpu_quant
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
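As the num_key_value_heads docstring notes, the attention variant follows from the query/KV head ratio; a small sketch of that classification (helper name is ours):

```python
def attention_variant(num_attention_heads, num_key_value_heads=None):
    """MHA when every query head has its own KV head, MQA when all query
    heads share a single KV head, GQA for anything in between."""
    if num_key_value_heads is None:  # backward-compat default, as in __init__
        num_key_value_heads = num_attention_heads
    assert num_attention_heads % num_key_value_heads == 0
    if num_key_value_heads == num_attention_heads:
        return "MHA"
    if num_key_value_heads == 1:
        return "MQA"
    return "GQA"
```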


@@ -0,0 +1,128 @@
'''
Description :
Author : Boxin Zhang
Version : 0.1.0
'''
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/cache_utils.py
# Copyright 2018- The Hugging Face team. All rights reserved.
# Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
import torch
import transformers
from transformers import Cache, PretrainedConfig
from typing import List, Optional, Dict, Any, Tuple
class StaticCache(transformers.StaticCache):
"""
Static Cache class to be used with `torch.compile(model)`.
Parameters:
config (`PretrainedConfig`):
The configuration file defining the shape-related attributes required to initialize the static cache.
max_batch_size (`int`):
The maximum batch size with which the model will be used.
max_cache_len (`int`):
The maximum sequence length with which the model will be used.
device (`torch.device`):
The device on which the cache should be initialized. Should be the same as the layer.
dtype (*optional*, defaults to `torch.float32`):
The default `dtype` to use when initializing the layer.
"""
def __init__(self, config: PretrainedConfig, max_batch_size: int, max_cache_len: int, device, dtype=None) -> None:
Cache.__init__(self)
self.max_batch_size = max_batch_size
self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len
# Some models define a custom `head_dim` != config.hidden_size // config.num_attention_heads
self.head_dim = (
config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads
)
self.dtype = dtype if dtype is not None else torch.float32
self.num_key_value_heads = (
config.num_attention_heads if config.num_key_value_heads is None else config.num_key_value_heads
)
self.key_cache: List[torch.Tensor] = []
self.value_cache: List[torch.Tensor] = []
cache_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim)
if config.architectures[0] == "DeepseekV2ForCausalLM":
# key_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.qk_rope_head_dim + config.qk_nope_head_dim)
# value_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.v_head_dim)
key_shape = (max_batch_size, 1, self.max_cache_len, config.qk_rope_head_dim)
value_shape = (max_batch_size, 1, self.max_cache_len, config.kv_lora_rank)
else:
key_shape = cache_shape
value_shape = cache_shape
self.past_tokens = []
self.num_hidden_layers = config.num_hidden_layers
for _ in range(self.num_hidden_layers):
# Note: `mark_static_address` is used to tag the cache as a fixed data pointer, preventing cuda graph
# breaks when updating the cache.
new_layer_key_cache = torch.zeros(key_shape, dtype=self.dtype, device=device)
new_layer_value_cache = torch.zeros(value_shape, dtype=self.dtype, device=device)
torch._dynamo.mark_static_address(new_layer_key_cache)
torch._dynamo.mark_static_address(new_layer_value_cache)
self.key_cache.append(new_layer_key_cache)
self.value_cache.append(new_layer_value_cache)
self.past_tokens.append(0)
def update(
self,
key_states: torch.Tensor,
value_states: torch.Tensor,
layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
It is VERY important to index using a tensor, otherwise you introduce a copy to the device.
Parameters:
key_states (`torch.Tensor`):
The new key states to cache.
value_states (`torch.Tensor`):
The new value states to cache.
layer_idx (`int`):
The index of the layer to cache the states for.
cache_kwargs (`Dict[str, Any]`, `optional`):
Additional arguments for the cache subclass. The `StaticCache` needs the `cache_position` input
to know where to write in the cache.
Return:
A tuple containing the updated key and value states.
"""
cache_position = cache_kwargs.get("cache_position")
k_out = self.key_cache[layer_idx]
v_out = self.value_cache[layer_idx]
k_out[:, :, cache_position] = key_states
v_out[:, :, cache_position] = value_states
self.past_tokens[layer_idx] += cache_position.size(0)
return k_out, v_out
def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
"""Returns the sequence length of the cached states that were seen by the model."""
return self.past_tokens[layer_idx]
def change_seq_length(self, bias: Optional[int] = 0) -> None:
"""Adjusts the recorded sequence length of every layer by `bias`."""
for layer_idx in range(self.num_hidden_layers):
self.past_tokens[layer_idx] += bias
def get_max_length(self) -> Optional[int]:
"""Returns the maximum sequence length of the cached states."""
return self.max_cache_len
def reset(self):
"""Resets the cache values while preserving the objects"""
for layer_idx in range(len(self.key_cache)):
# In-place ops prevent breaking the static address
self.key_cache[layer_idx].zero_()
self.value_cache[layer_idx].zero_()
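For DeepseekV2ForCausalLM this cache stores one shared RoPE key plus the compressed KV latent per token instead of full per-head K/V. With the defaults from configuration_deepseek.py above (32 heads, qk_nope_head_dim=128, qk_rope_head_dim=64, v_head_dim=128, kv_lora_rank=512), the per-token, per-layer element counts compare as follows (a sketch; helper name is ours):

```python
def cache_elems_per_token(num_heads, qk_nope, qk_rope, v_head,
                          kv_lora_rank, compressed=True):
    """Cache elements per token per layer, following the key_shape /
    value_shape branches in StaticCache.__init__."""
    if compressed:  # single-head RoPE key + compressed latent value
        return qk_rope + kv_lora_rank
    # full per-head key (nope + rope parts) and per-head value
    return num_heads * (qk_nope + qk_rope) + num_heads * v_head

full = cache_elems_per_token(32, 128, 64, 128, 512, compressed=False)
small = cache_elems_per_token(32, 128, 64, 128, 512)
ratio = full / small  # roughly 17.8x fewer cached elements
```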

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,65 @@
'''
Description :
Author : Boxin Zhang
Version : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
from torch import nn
from ktransformers.models.modeling_deepseek import DeepseekV2YarnRotaryEmbedding, DeepseekV2RotaryEmbedding
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.custom_gguf import GGUFLoader
from ktransformers.util.utils import InferenceState
from transformers.configuration_utils import PretrainedConfig
# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->Qwen2Moe
class RotaryEmbedding(BaseInjectedModule, DeepseekV2RotaryEmbedding):
def __init__(self,
key: str,
gguf_loader : GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
**kwargs):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
self.orig_module.__init__(orig_module.dim,
orig_module.max_position_embeddings,
orig_module.base)
def load(self):
self.orig_module.__init__(self.orig_module.dim,
self.orig_module.max_position_embeddings,
self.orig_module.base,
self.device)
class YarnRotaryEmbedding(BaseInjectedModule, DeepseekV2YarnRotaryEmbedding):
def __init__(self,
key: str,
gguf_loader : GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
**kwargs):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
self.orig_module.__init__(orig_module.dim,
orig_module.max_position_embeddings,
orig_module.base,
None, #device
orig_module.scaling_factor,
orig_module.original_max_position_embeddings,
orig_module.beta_fast,
orig_module.beta_slow,
orig_module.mscale,
orig_module.mscale_all_dim)
def load(self):
self.orig_module.__init__(self.orig_module.dim,
self.orig_module.max_position_embeddings,
self.orig_module.base,
self.device,
self.orig_module.scaling_factor,
self.orig_module.original_max_position_embeddings,
self.orig_module.beta_fast,
self.orig_module.beta_slow,
self.orig_module.mscale,
self.orig_module.mscale_all_dim)


@@ -0,0 +1 @@


@@ -0,0 +1,199 @@
'''
Description :
Author : Boxin Zhang
Version : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import torch
from torch import nn
import warnings
from ktransformers.models.configuration_deepseek import DeepseekV2Config
from ktransformers.models.modeling_deepseek import DeepseekV2Attention, apply_rotary_pos_emb
from typing import Optional, Tuple
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.custom_gguf import GGUFLoader
from transformers.configuration_utils import PretrainedConfig
from transformers.cache_utils import Cache
class DeepseekV2AttentionInjected(BaseInjectedModule, DeepseekV2Attention):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self,
key: str,
gguf_loader : GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
**kwargs):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
self.orig_module.__init__(orig_module.config,
orig_module.layer_idx)
def get_absorbed(self) -> Tuple[torch.Tensor, torch.Tensor]:
if not (hasattr(self, 'q_absorb') and hasattr(self, 'out_absorb')):
kv_b_proj = self.kv_b_proj.weight.view(self.num_heads, -1, self.kv_lora_rank)
q_absorb = kv_b_proj[:, :self.qk_nope_head_dim, :].reshape(-1, self.kv_lora_rank)
out_absorb = kv_b_proj[:, self.qk_nope_head_dim:, :].reshape(-1, self.kv_lora_rank)
self.q_absorb = nn.Linear(self.kv_lora_rank, self.num_heads * self.qk_nope_head_dim,
bias=False, dtype=q_absorb.dtype, device=q_absorb.device)
self.q_absorb.weight.data = q_absorb
self.out_absorb = nn.Linear(self.kv_lora_rank, self.num_heads * self.v_head_dim,
bias=False, dtype=out_absorb.dtype, device=out_absorb.device)
self.out_absorb.weight.data = out_absorb
del self.orig_module.kv_b_proj
q_absorb = self.q_absorb.weight.view(self.num_heads, self.qk_nope_head_dim, self.kv_lora_rank)
out_absorb = self.out_absorb.weight.view(self.num_heads, self.v_head_dim, self.kv_lora_rank)
return q_absorb, out_absorb
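get_absorbed relies on matmul associativity: the nope half of kv_b_proj is folded into the query path (q_absorb) and the value half into the output path (out_absorb), so attention scores can be taken directly against the compressed latent without decompressing the cache. A pure-Python sketch of the underlying identity (q W) c == q (W c):

```python
def matmul(a, b):
    """Naive matrix product for the demo (matrices as lists of rows)."""
    return [[sum(a[i][t] * b[t][j] for t in range(len(b)))
             for j in range(len(b[0]))] for i in range(len(a))]

q = [[1.0, 2.0]]                # query (nope part), 1 x d
w_uk = [[0.5, -1.0, 2.0],       # up-projection to absorb, d x r
        [1.5, 0.0, -0.5]]
c = [[1.0], [2.0], [3.0]]       # compressed KV latent, r x 1

absorbed = matmul(matmul(q, w_uk), c)      # fold W into the query first
decompressed = matmul(q, matmul(w_uk, c))  # decompress the cache first
assert absorbed == decompressed            # both equal [[4.5]]
```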
def forward_chunck(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
**kwargs
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
bsz, q_len, _ = hidden_states.size()
if self.q_lora_rank is None:
q = self.q_proj(hidden_states)
else:
q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
q_nope, q_pe = torch.split(
q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
)
compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
compressed_kv, k_pe = torch.split(
compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
)
compressed_kv = self.kv_a_layernorm(compressed_kv)
k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
kv_seq_len = k_pe.shape[-2]
if past_key_value is not None:
if self.layer_idx is None:
raise ValueError(
f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
"with a layer index."
)
kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
cos, sin = self.rotary_emb(q_pe, position_ids)
q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin)
if past_key_value is not None:
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
compressed_kv = compressed_kv.unsqueeze(1)
k_pe, compressed_kv = past_key_value.update(k_pe, compressed_kv, self.layer_idx, cache_kwargs)
compressed_kv = compressed_kv.squeeze(1)
#if cache_position is not None:
# compressed_kv = compressed_kv[:,: cache_position[-1] + 1,:]
# k_pe = k_pe[:,:,: cache_position[-1] + 1,:]
q_absorb, out_absorb = self.get_absorbed()
q_nope = torch.matmul(q_nope, q_absorb)
attn_weights = (torch.matmul(q_pe, k_pe.mT) + torch.matmul(q_nope, compressed_kv.unsqueeze(-3).mT)) * self.softmax_scale
"""
if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
raise ValueError(
f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
f" {attn_weights.size()}"
)
assert attention_mask is not None
"""
if attention_mask is not None:
"""
if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
raise ValueError(
f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
)
"""
#causal_mask = attention_mask[:, :, :, : kv_seq_len]
attn_weights = attn_weights + attention_mask
# upcast attention to fp32
attn_weights = nn.functional.softmax(
attn_weights, dim=-1, dtype=torch.float32
).to(q_pe.dtype)
attn_weights = nn.functional.dropout(
attn_weights, p=self.attention_dropout, training=self.training
)
attn_output = torch.einsum('bhql,blc->bhqc', attn_weights, compressed_kv)
attn_output = torch.matmul(attn_output, out_absorb.mT)
if attn_output.size() != (bsz, self.num_heads, q_len, self.v_head_dim):
raise ValueError(
f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.v_head_dim)}, but is"
f" {attn_output.size()}"
)
attn_output = attn_output.transpose(1, 2).contiguous()
attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)
attn_output = self.o_proj(attn_output)
return attn_output, None, past_key_value
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
if "padding_mask" in kwargs:
warnings.warn(
"Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
)
bsz, q_len, _ = hidden_states.size()
chunk_size = 256 # TODO: determine chunk_size automatically.
if q_len <= chunk_size:
return self.forward_chunck(
hidden_states,
attention_mask,
position_ids,
past_key_value,
output_attentions,
use_cache,
cache_position,
**kwargs
)
assert not output_attentions, "output_attentions is not supported when using chunked attention"
attn_output = None
cur_idx = 0
while cur_idx < q_len:
if attention_mask is not None:
chunk_mask = attention_mask[:, :, cur_idx:min(cur_idx + chunk_size, q_len), ...]
else:
chunk_mask = None
cur_output, _, _ = self.forward_chunck(
hidden_states[:, cur_idx:min(cur_idx + chunk_size, q_len), ...],
chunk_mask,
position_ids[:, cur_idx:min(cur_idx + chunk_size, q_len)],
past_key_value,
output_attentions,
use_cache,
cache_position[cur_idx:min(cur_idx + chunk_size, q_len)],
**kwargs
)
cur_idx += chunk_size
if attn_output is None:
attn_output = cur_output
else:
attn_output = torch.cat((attn_output, cur_output), dim=-2)
return attn_output, None, past_key_value
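The chunked forward above splits the query dimension into fixed-size chunks, runs each chunk through the per-chunk attention, and concatenates the results along the sequence axis. A minimal pure-Python sketch of that control flow (no torch; `process_chunk` is a hypothetical stand-in for the per-chunk call):

```python
def chunked_forward(seq, chunk_size, process_chunk):
    """Process `seq` in fixed-size chunks and concatenate the per-chunk results."""
    out = []
    cur_idx = 0
    while cur_idx < len(seq):
        out.extend(process_chunk(seq[cur_idx:min(cur_idx + chunk_size, len(seq))]))
        cur_idx += chunk_size
    return out

def double(chunk):
    return [2 * x for x in chunk]

# An elementwise op gives the same result chunked or unchunked,
# including when the last chunk is shorter than chunk_size.
assert chunked_forward(list(range(10)), 4, double) == double(list(range(10)))
```

Chunking only preserves the result because the KV cache carries state across chunks; for a stateless elementwise op, as above, the equivalence is exact by construction.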

@@ -0,0 +1,60 @@
'''
Description :
Author : Boxin Zhang
Version : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
from typing import Any
from torch import nn, Tensor
from ktransformers.util.custom_gguf import GGUFLoader
from transformers.configuration_utils import PretrainedConfig
import ktransformers.util.utils as utils
class BaseInjectedModule(nn.Module):
def __init__(self,
key: str,
gguf_loader : GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
**kwargs):
nn.Module.__init__(self)
nn.Module.__setattr__(self, "orig_module", orig_module)
object.__setattr__(self, "key", key)
object.__setattr__(self, "gguf_loader", gguf_loader)
object.__setattr__(self, "config", config)
object.__setattr__(self, "device", device)
def __getattr__(self, name: str) -> Any:
# nn.Module.__getattr__ does not fall back to super().__getattribute__ when `name` is missing from
# nn.Module's internal dicts, while nn.Module.__setattr__ does call super().__setattr__ in that case.
# So an attribute may be settable yet unreachable through __getattr__; typically these are built-in
# class attributes, for which normal lookup (class.attr) never reaches __getattr__.
# Example:
# ...import torch
# ...l=torch.nn.Linear(100,200)
# ...l.out_features # 200
# ...l.__getattr__("out_features") # AttributeError: 'Linear' object has no attribute 'out_features'
try:
return object.__getattribute__(self, name) # if this attr belongs to BaseInjectedModule
except AttributeError:
if name == "orig_module":
return nn.Module.__getattr__(self, "orig_module")
try:
return nn.Module.__getattr__(self, "orig_module").__getattr__(name) # if this attr belongs to orig_module
except AttributeError:
return super(nn.Module, nn.Module.__getattr__(self, "orig_module")).__getattribute__(name) # if this attr belongs to orig_module but is not in nn.Module's internal dicts
def __setattr__(self, name: str, value: Tensor | nn.Module) -> None:
if name == "orig_module":
return nn.Module.__setattr__(self, "orig_module", value)
elif hasattr(self, name):
return object.__setattr__(self, name, value)
return nn.Module.__getattr__(self, "orig_module").__setattr__(name, value)
def forward(self, *args, **kwargs):
return self.orig_module.forward(*args, **kwargs)
def load(self):
for name, child in self._modules.items():
utils.load_weights(child, self.gguf_loader, self.key+".")
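The lookup order implemented by `__getattr__` above (the wrapper's own attributes first, then the wrapped module) can be illustrated with a plain-Python wrapper; `Wrapped` and `Injected` here are hypothetical names for illustration, not part of the codebase:

```python
class Wrapped:
    """Hypothetical stand-in for the original module being injected."""
    def __init__(self):
        self.out_features = 200

class Injected:
    """Wrapper mirroring BaseInjectedModule's attribute lookup order."""
    def __init__(self, orig):
        object.__setattr__(self, "orig_module", orig)
        object.__setattr__(self, "device", "cuda")

    def __getattr__(self, name):
        # Only reached when normal lookup fails: delegate to the wrapped module.
        return getattr(object.__getattribute__(self, "orig_module"), name)

m = Injected(Wrapped())
assert m.device == "cuda"      # found on the wrapper itself
assert m.out_features == 200   # delegated to orig_module
```

Because `__getattr__` is only invoked after normal lookup fails, attributes set on the wrapper shadow same-named attributes on the wrapped module, which is exactly the injection behavior wanted here.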

@@ -0,0 +1,679 @@
#!/usr/bin/env python
# coding=utf-8
'''
Description :
Author : Azure-Tang, Boxin Zhang, chenht2022
Date : 2024-07-25 11:25:24
Version : 0.1.0
LastEditors : Azure
LastEditTime : 2024-07-26 09:27:41
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
from typing import Any, Union
import numpy as np
import numpy.typing as npt
from torch import Tensor, nn
import torch.nn.functional as F
import torch
import sys, os
from ktransformers.operators.base_operator import BaseInjectedModule
sys.path.append(os.path.dirname(__file__) + "/../ktransformers_ext/build")
import cpuinfer_ext
from cpuinfer_ext.moe import MOEConfig, MOE
import ctypes
from ktransformers.util.custom_gguf import GGUFLoader
from ktransformers.util.utils import InferenceState
from ktransformers.server.config.config import Config
from transformers.activations import ACT2FN
from transformers.configuration_utils import PretrainedConfig
from abc import ABC, abstractmethod
from ktransformers.operators.linear import QuantizedLinearMarlin, QuantizedLinearTorch, KTransformerLinear
import time
# class Base(BaseInjectedModule, ABC):
class MLPExpertsBase(ABC):
def __init__(self, key: str, gguf_loader: GGUFLoader, config: PretrainedConfig, orig_module: nn.Module, device: str = "cuda", **kwargs):
# super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
self.key = key
self.gguf_loader = gguf_loader
self.config = config
self.device = device
@abstractmethod
def forward(self, input_tensor, expert_ids, weights):
pass
@abstractmethod
def load(self, w: dict | nn.Parameter | tuple | None = None, device: str = "cpu", warmup: bool = False):
pass
@abstractmethod
def unload(self):
pass
def load_weights(self, override_key: str | None = None, device: str = "cpu"):
res = {}
if override_key is not None:
keys = override_key
else:
keys = [self.key]
gate = None
up = None
down = None
gate_type = None
up_type = None
down_type = None
for key in keys:
if key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
targets = [".ffn_gate_exps.weight", ".ffn_up_exps.weight", ".ffn_down_exps.weight" ]
tensors = self.load_multi(key, targets, device=device)
gate = tensors[".ffn_gate_exps.weight"]
up = tensors[".ffn_up_exps.weight"]
down = tensors[".ffn_down_exps.weight"]
gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate_exps.weight"]["ggml_type"]
up_type = self.gguf_loader.tensor_info[key + ".ffn_up_exps.weight"]["ggml_type"]
down_type = self.gguf_loader.tensor_info[key + ".ffn_down_exps.weight"]["ggml_type"]
else:
raise ValueError(f"Experts {key} not found in gguf_loader")
res = {key:{"gate": gate, "up": up, "down": down, "gate_type": gate_type, "up_type": up_type, "down_type": down_type}}
return res
def load_multi(self, key: str, keys: list[str], device: str = "cpu"):
tensors = {}
for k in keys:
tensors[k] = self.gguf_loader.load_gguf_tensor(key + k, device=device)
return tensors
class MLPCPUExperts(MLPExpertsBase):
input_tensor_cpu:Tensor = None
expert_ids_cpu:Tensor = None
weights_cpu:Tensor = None
output_cpu:Tensor = None
output_gpu:Tensor = None
CPU_INFER = cpuinfer_ext.CPUInfer(Config().cpu_infer)
def __init__(
self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
n_routed_experts: int,
orig_module: nn.Module = None,
device: str = "cpu",
out_device: str = "cuda", # this device mean which device the output should on
**kwargs
):
super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
assert device.lower() == "cpu", "MLPCPUExperts can only be loaded on CPU"
self.n_routed_experts = n_routed_experts
self.out_device = out_device
def load(self, w: dict | nn.Parameter | tuple | None = None, device:str|None = None, warmup:bool = False):
if device:
assert device.lower() == "cpu", "MLPCPUExperts can only be loaded on CPU, Parameter \"device\" can be cpu or None."
if w is None: w = self.load_weights()[self.key]
self.gate = w["gate"]
self.up = w["up"]
self.down = w["down"]
self.gate_type = w["gate_type"]
self.up_type = w["up_type"]
self.down_type = w["down_type"]
gate_ptr = ctypes.addressof(
ctypes.cast(self.gate.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents
)
up_ptr = ctypes.addressof(
ctypes.cast(self.up.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents
)
down_ptr = ctypes.addressof(
ctypes.cast(self.down.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents
)
# print(self.gate_qtype, self.up_qtype, self.down_qtype)
n_routed_experts = self.n_routed_experts
# n_routed_experts = len(self.orig_module)
moe_config = MOEConfig(
n_routed_experts,
self.config.num_experts_per_tok,
self.config.hidden_size,
self.config.moe_intermediate_size,
64,
10,
1024,
gate_ptr,
up_ptr,
down_ptr,
self.gate_type,
self.up_type,
self.down_type,
30, # TODO: get from model.dtype
)
# print(n_routed_experts, hidden_size, moe_intermediate_size)
num_experts_per_tok = self.config.num_experts_per_tok
self.moe = MOE(moe_config)
self.cpu_infer = MLPCPUExperts.CPU_INFER
if warmup:
self.cpu_infer.submit(self.moe.warm_up)
self.cpu_infer.sync()
if MLPCPUExperts.output_gpu is None:
MLPCPUExperts.input_tensor_cpu = torch.empty((self.config.hidden_size), device="cpu", pin_memory=True)
MLPCPUExperts.expert_ids_cpu = torch.empty((num_experts_per_tok), device="cpu", dtype=torch.long, pin_memory=True)
MLPCPUExperts.weights_cpu = torch.empty((num_experts_per_tok), device="cpu", dtype=torch.float32, pin_memory=True)
MLPCPUExperts.output_cpu = torch.empty((self.config.hidden_size), device="cpu", pin_memory=True)
MLPCPUExperts.output_gpu = torch.empty((self.config.hidden_size), device=self.out_device)
def submit_for_one_decode(self, input_tensor, expert_ids, weights):
MLPCPUExperts.input_tensor_cpu.copy_(input_tensor, non_blocking=True)
MLPCPUExperts.expert_ids_cpu.copy_(expert_ids, non_blocking=True)
MLPCPUExperts.weights_cpu.copy_(weights, non_blocking=True)
self.cpu_infer.submit_with_cuda_stream(torch.cuda.current_stream().cuda_stream, self.moe.forward, 1, expert_ids.size(0), MLPCPUExperts.expert_ids_cpu.data_ptr(), MLPCPUExperts.weights_cpu.data_ptr(), MLPCPUExperts.input_tensor_cpu.data_ptr(), MLPCPUExperts.output_cpu.data_ptr())
def sync_for_one_decode(self):
self.cpu_infer.sync_with_cuda_stream(torch.cuda.current_stream().cuda_stream)
MLPCPUExperts.output_gpu.copy_(MLPCPUExperts.output_cpu, non_blocking=True)
#print("capturing experts finish")
return MLPCPUExperts.output_gpu
def forward(self, input_tensor, expert_ids, weights):
# generate, capture and run cuda graph
if input_tensor.size(0)==1:
#print("capturing experts")
MLPCPUExperts.input_tensor_cpu.copy_(input_tensor, non_blocking=True)
MLPCPUExperts.expert_ids_cpu.copy_(expert_ids, non_blocking=True)
MLPCPUExperts.weights_cpu.copy_(weights, non_blocking=True)
self.cpu_infer.submit_with_cuda_stream(torch.cuda.current_stream().cuda_stream, self.moe.forward, 1, expert_ids.size(1), MLPCPUExperts.expert_ids_cpu.data_ptr(), MLPCPUExperts.weights_cpu.data_ptr(), MLPCPUExperts.input_tensor_cpu.data_ptr(), MLPCPUExperts.output_cpu.data_ptr())
self.cpu_infer.sync_with_cuda_stream(torch.cuda.current_stream().cuda_stream)
MLPCPUExperts.output_gpu.copy_(MLPCPUExperts.output_cpu, non_blocking=True)
#print("capturing experts finish")
return MLPCPUExperts.output_gpu
else:
input_tensor = input_tensor.contiguous().cpu()
expert_ids = expert_ids.contiguous().cpu()
weights = weights.contiguous().to(torch.float32).cpu()
output = torch.empty_like(input_tensor).contiguous()
self.cpu_infer.submit(self.moe.forward, expert_ids.size(0), expert_ids.size(1), expert_ids.data_ptr(), weights.data_ptr(), input_tensor.data_ptr(), output.data_ptr())
self.cpu_infer.sync()
return output.to(device=object.__getattribute__(self, "device"))
def unload(self):
return
def load_weights(self, override_key: str | None = None, device: str = "cpu"):
res = {}
if override_key is not None:
keys = override_key
else:
keys = [self.key]
gate = None
up = None
down = None
gate_type = None
up_type = None
down_type = None
for key in keys:
if key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight")
down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight")
gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate_exps.weight"]["ggml_type"]
up_type = self.gguf_loader.tensor_info[key + ".ffn_up_exps.weight"]["ggml_type"]
down_type = self.gguf_loader.tensor_info[key + ".ffn_down_exps.weight"]["ggml_type"]
else:
raise ValueError(f"Experts {key} not found in gguf_loader")
res = {key:{"gate": gate, "up": up, "down": down, "gate_type": gate_type, "up_type": up_type, "down_type": down_type}}
return res
class MLPExpertsMarlin(MLPExpertsBase):
expert_num: int
loaded_experts_idx: list[int]
def __init__(
self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
n_routed_experts: int,
orig_module: nn.Module = None,
device: str = "cuda",
**kwargs
):
super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
self.expert_num = n_routed_experts
self.loaded_experts_idx = []
self.act_fn = ACT2FN[config.hidden_act]
assert device.lower() != "cpu", "Marlin experts can only be loaded on GPU"
self.device = device
# create empty marlin experts according to the number of experts per token
# up
self.up_projs = [QuantizedLinearMarlin(key+ "." + "ffn_up_exps", gguf_loader, config, device=device) for i in range(self.expert_num)]
# gate
self.gate_projs = [QuantizedLinearMarlin(key+ "." + "ffn_gate_exps", gguf_loader, config, device=device) for i in range(self.expert_num)]
# down
self.down_projs = [QuantizedLinearMarlin(key+ "." + "ffn_down_exps", gguf_loader, config, device=device) for i in range(self.expert_num)]
def load(self, w: dict | nn.Parameter | tuple | None = None, device: str | None = None, warmup: bool = False):
if device is None: device = self.device
assert device.lower() != "cpu", "Marlin experts can only be loaded on GPU"
if w is None: w = self.load_weights()[self.key]
if isinstance(w, dict):
self.gate = nn.Parameter(torch.from_numpy(w["gate"]))
self.up = nn.Parameter(torch.from_numpy(w["up"]))
self.down = nn.Parameter(torch.from_numpy(w["down"]))
for i in range(self.expert_num):
self.up_projs[i].load(self.up[i,...], device=device)
self.gate_projs[i].load(self.gate[i,...], device=device)
self.down_projs[i].load(self.down[i,...], device=device)
self.loaded_experts_idx.append(i)
return
def unload(self):
for i in self.loaded_experts_idx:
self.up_projs[i].unload()
self.gate_projs[i].unload()
self.down_projs[i].unload()
self.loaded_experts_idx = []
def load_weights(self, override_key: str | None = None):
res = {}
if override_key is not None:
keys = override_key
else:
keys = [self.key]
gate = None
up = None
down = None
gate_type = None
up_type = None
down_type = None
for key in keys:
if key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
gate = self.gguf_loader.load_gguf_tensor(key + ".ffn_gate_exps.weight")
up = self.gguf_loader.load_gguf_tensor(key + ".ffn_up_exps.weight")
down = self.gguf_loader.load_gguf_tensor(key + ".ffn_down_exps.weight")
gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate_exps.weight"]["ggml_type"]
up_type = self.gguf_loader.tensor_info[key + ".ffn_up_exps.weight"]["ggml_type"]
down_type = self.gguf_loader.tensor_info[key + ".ffn_down_exps.weight"]["ggml_type"]
# tensors = self.load_multi(key, [".ffn_gate_exps.weight", ".ffn_up_exps.weight", ".ffn_down_exps.weight"])
res = {key:{"gate": gate, "up": up, "down": down, "gate_type": gate_type, "up_type": up_type, "down_type": down_type}}
return res
def forward(self, input_tensor:torch.Tensor, expert_ids, weights):
# forward
device = input_tensor.device
input_tensor = input_tensor.to("cuda")
outs = torch.zeros_like(input_tensor)
for expert_idx in range(expert_ids.size(0)):
down_proj = self.down_projs[expert_idx]
gate_proj = self.gate_projs[expert_idx]
up_proj = self.up_projs[expert_idx]
outs += down_proj(self.act_fn(gate_proj(input_tensor)) * up_proj(input_tensor)) * weights[expert_idx]
outs = outs.to(device)
return outs
class MLPExpertsTorch(MLPExpertsBase):
expert_num: int
loaded_experts_idx: list[int]
gate: torch.Tensor
up: torch.Tensor
down: torch.Tensor
def __init__(
self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
n_routed_experts: int,
orig_module: nn.Module = None,
device: str = "cpu",
**kwargs
):
super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
self.expert_num = n_routed_experts
# self.loaded_experts_idx = []
self.act_fn = ACT2FN[config.hidden_act]
self.device = device
self.gate = None
self.up = None
self.down = None
self.dtype = torch.get_default_dtype()
def load(self, w: dict | nn.Parameter | tuple | None = None, device: str | None = None, warmup: bool = False):
if device is None: device = self.device
if w is None: w = self.load_weights(device=device)[self.key]
if isinstance(w, dict):
self.gate = w["gate"].to(device=device, dtype=self.dtype)
self.up = w["up"].to(device=device, dtype=self.dtype)
self.down = w["down"].to(device=device, dtype=self.dtype)
def unload(self):
if self.gate is not None:
self.gate = None
self.up = None
self.down = None
def forward(self, hidden_states_cpu: torch.Tensor, selected_experts_cpu: torch.Tensor, routing_weights_cpu: torch.Tensor) -> torch.Tensor:
batch_sequence_length, hidden_dim = hidden_states_cpu.size()
final_hidden_states = torch.zeros(
(batch_sequence_length, hidden_dim), dtype=self.gate.dtype, device=hidden_states_cpu.device
)
org_dtype = hidden_states_cpu.dtype
hidden_states_cpu = hidden_states_cpu.to(self.gate.dtype)
routing_weights_cpu = routing_weights_cpu.to(self.gate.dtype)
# One hot encode the selected experts to create an expert mask
# this will be used to easily index which expert is going to be solicited
expert_mask = torch.nn.functional.one_hot(selected_experts_cpu, num_classes=self.expert_num).permute(2, 1, 0)
# Loop over all available experts in the model and perform the computation on each expert
for expert_idx in range(self.expert_num):
idx, top_x = torch.where(expert_mask[expert_idx])
# Index the correct hidden states and compute the expert hidden state for
# the current expert. We need to make sure to multiply the output hidden
# states by `routing_weights` on the corresponding tokens (top-1 and top-2)
current_state = hidden_states_cpu[None, top_x].reshape(-1, hidden_dim)
G = current_state @ self.gate[expert_idx,...].T
A = self.act_fn(G)
U = current_state @ self.up[expert_idx,...].T
H = A * U # Element-wise multiplication
current_hidden_states = H @ self.down[expert_idx,...].T * routing_weights_cpu[top_x, idx, None]
# However `index_add_` only support torch tensors for indexing so we'll use
# the `top_x` tensor here.
final_hidden_states.index_add_(0, top_x, current_hidden_states)
return final_hidden_states.to(org_dtype)
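The dense routing loop above (build an expert mask, gather the tokens routed to each expert, apply the expert, and scatter the weighted outputs back) can be sketched without torch; `experts` here is a hypothetical list of per-expert functions standing in for the gate/up/down projections:

```python
def route_tokens(tokens, selected_experts, routing_weights, experts):
    """tokens: list of scalar token values; selected_experts[i] / routing_weights[i]:
    the expert ids and weights chosen for token i; experts: list of callables."""
    out = [0.0] * len(tokens)
    # Outer loop over experts mirrors the expert_mask loop in the torch version.
    for expert_idx, expert in enumerate(experts):
        for tok_idx, (ids, weights) in enumerate(zip(selected_experts, routing_weights)):
            for slot, eid in enumerate(ids):
                if eid == expert_idx:
                    # Weighted contribution of this expert, scattered back to tok_idx
                    # (the index_add_ step in the torch version).
                    out[tok_idx] += expert(tokens[tok_idx]) * weights[slot]
    return out

experts = [lambda x: x + 1, lambda x: 10 * x]
# Token 0 uses experts 0 and 1 equally; token 1 uses expert 1 only.
out = route_tokens([1.0, 2.0], [[0, 1], [1]], [[0.5, 0.5], [1.0]], experts)
assert out == [6.0, 20.0]
```

Iterating experts in the outer loop lets each expert run once over a gathered batch of its tokens, which is what makes the torch version efficient despite the dense mask.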
EXPERTS_MAP = {
"MLPCPUExperts": MLPCPUExperts,
"MLPExpertsTorch": MLPExpertsTorch,
"MLPExpertsMarlin": MLPExpertsMarlin,
}
class KTransformersMLPExpert(BaseInjectedModule, MLPExpertsBase):
def __init__(self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
prefill_device:str = "cuda",
prefill_mlp_type: str | None = "MLPExpertsTorch",
generate_device: str = "cpu",
generate_mlp_type: str | None = "MLPCPUExperts",
**kwargs):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
MLPExpertsBase.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
if generate_mlp_type is not None:
self.generate_experts = EXPERTS_MAP[generate_mlp_type](key, gguf_loader, config, len(orig_module), device=generate_device, **kwargs)
else:
self.generate_experts = None
if prefill_mlp_type is not None:
self.prefill_experts = EXPERTS_MAP[prefill_mlp_type](key, gguf_loader, config, len(orig_module), device=prefill_device, **kwargs)
else:
self.prefill_experts = None
self.gpu_mlp_type = prefill_mlp_type
self.cpu_mlp_type = generate_mlp_type
self.mode = InferenceState.UNLOAD
def load(self, w: dict = None, mode: InferenceState = None, warmup: bool = True):
# TODO support w as input
if not mode: mode = InferenceState.GENERATE
if mode == InferenceState.GENERATE:
self.prefill_experts.unload()
self.generate_experts.load(w, warmup=warmup)
self.device = self.generate_experts.device
self.mode = mode
elif mode == InferenceState.PREFILL:
self.generate_experts.unload()
self.prefill_experts.load(w, warmup=warmup)
self.device = self.prefill_experts.device
self.mode = mode
elif mode == InferenceState.UNLOAD:
self.unload()
self.mode = mode
self.device = self.generate_experts.device
else:
raise ValueError("mode must be either InferenceState.GENERATE, InferenceState.PREFILL or InferenceState.UNLOAD")
def unload(self):
if self.generate_experts is not None:
self.generate_experts.unload()
if self.prefill_experts is not None:
self.prefill_experts.unload()
self.device = self.generate_experts.device
def forward(self, input_tensor, expert_ids, weights):
if self.mode == InferenceState.GENERATE:
assert self.generate_experts is not None, "generate_experts is None"
return self.generate_experts.forward(input_tensor, expert_ids, weights)
elif self.mode == InferenceState.PREFILL:
assert self.prefill_experts is not None, "prefill_experts is None"
return self.prefill_experts.forward(input_tensor, expert_ids, weights)
else:
raise ValueError("load or set_inference_mode before forward")
def set_inference_mode(self, mode: InferenceState):
if mode == InferenceState.GENERATE:
self.load(mode=InferenceState.GENERATE, warmup=False)
elif mode == InferenceState.PREFILL:
self.load(mode=InferenceState.PREFILL, warmup=False)
elif mode == InferenceState.UNLOAD:
self.unload()
else:
raise ValueError("mode must be either InferenceState.GENERATE, InferenceState.PREFILL or InferenceState.UNLOAD")
from ktransformers.models.modeling_deepseek import DeepseekV2MoE
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock
class Qwen2MoeSparseMoeBlockInjected(BaseInjectedModule, Qwen2MoeSparseMoeBlock):
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
""" """
orig_shape = hidden_states.shape
batch_size, sequence_length, hidden_dim = hidden_states.shape
hidden_states = hidden_states.view(-1, hidden_dim)
# router_logits: (batch * sequence_length, n_experts)
router_logits = self.gate(hidden_states)
routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
if self.norm_topk_prob:
routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
# we cast back to the input dtype
routing_weights = routing_weights.to(hidden_states.dtype)
if sequence_length == 1 and hasattr(self.experts.generate_experts, "submit_for_one_decode"):
self.experts.generate_experts.submit_for_one_decode(hidden_states[0], selected_experts[0], routing_weights[0])
shared_expert_output = self.shared_expert(hidden_states)
shared_expert_output = F.sigmoid(self.shared_expert_gate(hidden_states)) * shared_expert_output
y = self.experts.generate_experts.sync_for_one_decode().unsqueeze(0)
y += shared_expert_output
y.resize_(*orig_shape)
return y, router_logits
hidden_states_expert = hidden_states.to(self.experts.device) if isinstance(self.experts, MLPExpertsBase) else hidden_states.cpu()
selected_experts_expert = selected_experts.to(self.experts.device) if isinstance(self.experts, MLPExpertsBase) else selected_experts.cpu()
routing_weights_expert = routing_weights.to(self.experts.device) if isinstance(self.experts, MLPExpertsBase) else routing_weights.cpu()
shared_expert_output = self.shared_expert(hidden_states)
shared_expert_output = (
F.sigmoid(self.shared_expert_gate(hidden_states)) * shared_expert_output
)
if isinstance(self.experts, MLPExpertsBase):
y = (
self.moe_on_cpuinfer(
hidden_states_expert, selected_experts_expert, routing_weights_expert
)
.view(*orig_shape)
.to(device=hidden_states.device)
)
elif hidden_states_expert.size(0) > 10:
y = self.moe_infer(
hidden_states_expert, selected_experts_expert, routing_weights_expert, orig_shape
).to(device=hidden_states.device)
else:
y = self.moe_infer_simple(
hidden_states_expert, selected_experts_expert, routing_weights_expert
).to(device=hidden_states.device)
y += shared_expert_output
y.resize_(*orig_shape)
return y, router_logits
@torch.no_grad()
def moe_on_cpuinfer(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor:
outs = torch.empty_like(x)
outs = self.experts(x, topk_ids, topk_weight)
return outs
@torch.no_grad()
# TODO may bugs here
def moe_infer_simple(self, hidden_states_cpu: torch.Tensor, selected_experts_cpu: torch.Tensor, routing_weights_cpu: torch.Tensor) -> torch.Tensor:
'''
hidden_states_cpu: [num_tokens, hidden_size]
topk_ids, topk_weight: [num_tokens, num_selected_experts]
'''
outs = torch.zeros_like(hidden_states_cpu)
for token_idx in range(selected_experts_cpu.size(0)):
for expert_idx in range(selected_experts_cpu.size(1)):
expert = self.experts[selected_experts_cpu[token_idx, expert_idx]]
outs[token_idx] += expert.forward(hidden_states_cpu[token_idx]) * routing_weights_cpu[token_idx, expert_idx]
return outs
@torch.no_grad()
# TODO may bugs here
def moe_infer(self, hidden_states_cpu: torch.Tensor, selected_experts_cpu: torch.Tensor, routing_weights_cpu: torch.Tensor, orig_shape: tuple) -> torch.Tensor:
batch_size, sequence_length, hidden_dim = orig_shape
final_hidden_states = torch.zeros(
(batch_size * sequence_length, hidden_dim), dtype=hidden_states_cpu.dtype, device=hidden_states_cpu.device
)
# One hot encode the selected experts to create an expert mask
# this will be used to easily index which expert is going to be solicited
expert_mask = torch.nn.functional.one_hot(selected_experts_cpu, num_classes=self.num_experts).permute(2, 1, 0)
# Loop over all available experts in the model and perform the computation on each expert
for expert_idx in range(self.num_experts):
expert_layer = self.experts[expert_idx]
idx, top_x = torch.where(expert_mask[expert_idx])
# Index the correct hidden states and compute the expert hidden state for
# the current expert. We need to make sure to multiply the output hidden
# states by `routing_weights` on the corresponding tokens (top-1 and top-2)
current_state = hidden_states_cpu[None, top_x].reshape(-1, hidden_dim)
current_hidden_states = expert_layer.forward(current_state) * routing_weights_cpu[top_x, idx, None]
# However `index_add_` only support torch tensors for indexing so we'll use
# the `top_x` tensor here.
final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states_cpu.dtype))
return final_hidden_states
class DeepseekV2MoEInjected(BaseInjectedModule, DeepseekV2MoE):
def forward(self, hidden_states):
identity = hidden_states
orig_shape = hidden_states.shape
sequence_length = orig_shape[1]
topk_idx, topk_weight, aux_loss = self.gate(hidden_states)
hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
if sequence_length == 1:
self.experts.generate_experts.submit_for_one_decode(hidden_states[0], topk_idx[0], topk_weight[0])
if self.config.n_shared_experts is not None:
y_ = self.shared_experts(identity).squeeze(0)
y = self.experts.generate_experts.sync_for_one_decode().unsqueeze(0)
y += y_
y.resize_(*orig_shape)
return y
if self.config.n_shared_experts is not None:
y_ = self.shared_experts(identity).squeeze(0)
if isinstance(self.experts, MLPExpertsBase):
y = self.moe_on_cpuinfer(hidden_states, topk_idx, topk_weight).view(*orig_shape).to(device=hidden_states.device)
elif hidden_states.size(0) > 10:
# TODO may bugs here
y = (
self.moe_infer(hidden_states, topk_idx, topk_weight)
.view(*orig_shape)
.to(device=hidden_states.device)
)
else:
# TODO may bugs here
y = (
self.moe_infer_simple(hidden_states, topk_idx, topk_weight)
.view(*orig_shape)
.to(device=hidden_states.device)
)
if self.config.n_shared_experts is not None:
y += y_
return y
@torch.no_grad()
def moe_on_cpuinfer(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor:
outs = torch.empty_like(x)
outs = self.experts(x, topk_ids, topk_weight)
return outs
@torch.no_grad()
# TODO may bugs here
def moe_infer_simple(
self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor
) -> torch.Tensor:
"""
x: [num_tokens, hidden_size]
topk_ids, topk_weight: [num_tokens, num_selected_experts]
"""
outs = torch.zeros_like(x)
for token_idx in range(topk_ids.size(0)):
for expert_idx in range(topk_ids.size(1)):
expert = self.experts[topk_ids[token_idx, expert_idx]]
outs[token_idx] += (
expert.forward(x[token_idx]) * topk_weight[token_idx, expert_idx]
)
return outs
@torch.no_grad()
# TODO may bugs here
def moe_infer(self, x, topk_ids, topk_weight):
cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
cnts.scatter_(1, topk_ids, 1)
tokens_per_expert = cnts.sum(dim=0)
idxs = topk_ids.view(-1).argsort()
sorted_tokens = x[idxs // topk_ids.shape[1]]
tokens_per_expert = tokens_per_expert.cpu().numpy()
outputs = []
start_idx = 0
for i, num_tokens in enumerate(tokens_per_expert):
end_idx = start_idx + num_tokens
if num_tokens == 0:
continue
expert = self.experts[i + self.ep_rank * self.experts_per_rank]
tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
expert_out = expert.forward(tokens_for_this_expert)
outputs.append(expert_out)
start_idx = end_idx
outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)
new_x = torch.empty_like(outs)
new_x[idxs] = outs
final_out = (
new_x.view(*topk_ids.shape, -1)
.type(topk_weight.dtype)
.mul_(topk_weight.unsqueeze(dim=-1))
.sum(dim=1)
.type(new_x.dtype)
)
return final_out
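`moe_infer` above sorts the flattened token-to-expert assignments so each expert sees one contiguous batch, then scatters the results back through the inverse permutation and combines them with the routing weights. The same bookkeeping in pure Python (with hypothetical `experts` as plain functions):

```python
def moe_infer_sketch(x, topk_ids, topk_weight, experts):
    """x: list of token values; topk_ids/topk_weight: [num_tokens][k] routing tables."""
    k = len(topk_ids[0])
    # Flatten assignments; flat index tok * k + slot mirrors topk_ids.view(-1).
    flat = [(eid, tok) for tok, ids in enumerate(topk_ids) for eid in ids]
    # Sort assignment indices by expert id so each expert gets a contiguous run
    # (the argsort step in moe_infer).
    order = sorted(range(len(flat)), key=lambda i: flat[i][0])
    outs = [None] * len(flat)
    for pos in order:
        eid, tok = flat[pos]
        outs[pos] = experts[eid](x[tok])  # scatter back via the original index
    # Combine each token's k expert outputs with its routing weights.
    return [sum(outs[tok * k + slot] * topk_weight[tok][slot] for slot in range(k))
            for tok in range(len(x))]

experts = [lambda v: v + 1, lambda v: 2 * v]
y = moe_infer_sketch([1.0, 3.0], [[0, 1], [1, 0]], [[0.5, 0.5], [0.25, 0.75]], experts)
assert y == [2.0, 4.5]
```

Grouping by expert is what turns k scattered single-token calls into one batched matmul per expert in the real implementation; the sort and inverse scatter are pure index bookkeeping.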

@@ -0,0 +1,700 @@
#!/usr/bin/env python
# coding=utf-8
'''
Description :
Author : Azure-Tang
Date : 2024-07-25 11:25:24
Version : 1.0.0
LastEditors : Azure
LastEditTime : 2024-07-26 09:27:48
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import inspect
import math
from typing import List, Optional, Tuple, Union
import time
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache, StaticCache
from transformers.modeling_attn_mask_utils import (
AttentionMaskConverter,
)
from transformers.modeling_outputs import (
MoeCausalLMOutputWithPast,
MoeModelOutputWithPast,
SequenceClassifierOutputWithPast,
TokenClassifierOutput,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_flash_attn_2_available,
is_flash_attn_greater_or_equal_2_10,
logging,
replace_return_docstrings,
)
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock, Qwen2MoeMLP, Qwen2MoeDecoderLayer
from ktransformers.models.modeling_deepseek import BaseModelOutputWithPast, DeepseekV2DecoderLayer, DeepseekV2MoE
from transformers.models.qwen2_moe.configuration_qwen2_moe import Qwen2MoeConfig
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.utils import InferenceState
if is_flash_attn_2_available():
from flash_attn import flash_attn_func, flash_attn_varlen_func
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
_flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "Qwen/Qwen1.5-MoE-A2.7B"
_CONFIG_FOR_DOC = "Qwen2MoeConfig"
QWEN2MOE_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`Qwen2MoeConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
"The bare Qwen2MoE Model outputting raw hidden-states without any specific head on top.",
QWEN2MOE_START_DOCSTRING,
)
class Qwen2MoePreTrainedModel(PreTrainedModel):
config_class = Qwen2MoeConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["Qwen2MoeDecoderLayer"]
_skip_keys_device_placement = "past_key_values"
_supports_flash_attn_2 = True
_supports_sdpa = True
_supports_cache_class = True
_supports_static_cache = True
def _init_weights(self, module):
std = self.config.initializer_range
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
QWEN2MOE_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
`past_key_values`).
If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
information on the default strategy.
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.n_positions - 1]`.
[What are position IDs?](../glossary#position-ids)
past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
Two formats are allowed:
- a [`~cache_utils.Cache`] instance;
- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
cache format.
The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
legacy cache format will be returned.
If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
of shape `(batch_size, sequence_length)`.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
output_router_logits (`bool`, *optional*):
Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
should not be returned during inference.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
the complete sequence length.
"""
from ktransformers.util.custom_gguf import GGUFLoader
from transformers.configuration_utils import PretrainedConfig
@add_start_docstrings(
"The bare Qwen2MoE Model outputting raw hidden-states without any specific head on top.",
QWEN2MOE_START_DOCSTRING,
)
class Qwen2MoeModelPerLayerPrefill(BaseInjectedModule):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2MoeDecoderLayer`]
Args:
config: Qwen2MoeConfig
"""
def __init__(
self,
key: str,
gguf_loader : GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
per_layer_prefill_intput_threshold: int = 30000, # if None, no per-layer prefill
**kwargs,
):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
self.per_layer_prefill_intput_threshold = per_layer_prefill_intput_threshold
@add_start_docstrings_to_model_forward(QWEN2MOE_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
output_router_logits: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
per_layer_prefill_intput_threshold: int | None = None, # if None or 0, close per-layer prefill
) -> Union[Tuple, MoeModelOutputWithPast]:
# print(f'Total length of input_ids: {input_ids.size(1)}, {input_ids.size()}')
if per_layer_prefill_intput_threshold is None: per_layer_prefill_intput_threshold = self.per_layer_prefill_intput_threshold
per_layer_prefill_flag = False
seq_length = inputs_embeds.size(1) if inputs_embeds is not None else input_ids.size(1)
if per_layer_prefill_intput_threshold and per_layer_prefill_intput_threshold < seq_length:
per_layer_prefill_flag = True
for layer in self.layers:
self.load_layer_to(layer, InferenceState.UNLOAD)
else:
pass
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_router_logits = (
output_router_logits if output_router_logits is not None else self.config.output_router_logits
)
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if (input_ids is None) ^ (inputs_embeds is not None):
raise ValueError(
"You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
)
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
use_legacy_cache = False
if use_cache and not isinstance(past_key_values, Cache):
use_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
"We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
)
if inputs_embeds is None:
input_ids = input_ids.to("cpu")
inputs_embeds = self.embed_tokens(input_ids)
inputs_embeds = inputs_embeds.to("cuda")
if cache_position is None:
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
cache_position = torch.arange(
past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
)
if position_ids is None:
position_ids = cache_position.unsqueeze(0)
causal_mask = self._update_causal_mask(
attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
)
hidden_states = inputs_embeds
# decoder layers
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
all_router_logits = () if output_router_logits else None
next_decoder_cache = None
for decoder_layer in self.layers:
if output_hidden_states:
all_hidden_states += (hidden_states,)
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
decoder_layer.__call__,
hidden_states,
causal_mask,
position_ids,
past_key_values,
output_attentions,
output_router_logits,
use_cache,
cache_position,
)
else:
if per_layer_prefill_flag:
# print(f"to gpu")
self.load_layer_to(decoder_layer, InferenceState.PREFILL)
torch.cuda.empty_cache()
layer_outputs = decoder_layer(
hidden_states,
attention_mask=causal_mask,
position_ids=position_ids,
past_key_value=past_key_values,
output_attentions=output_attentions,
output_router_logits=output_router_logits,
use_cache=use_cache,
cache_position=cache_position,
)
if per_layer_prefill_flag:
# print(f"to cpu")
self.load_layer_to(decoder_layer, InferenceState.UNLOAD)
torch.cuda.empty_cache()
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache = layer_outputs[2 if output_attentions else 1]
if output_attentions:
all_self_attns += (layer_outputs[1],)
if output_router_logits and layer_outputs[-1] is not None:
all_router_logits += (layer_outputs[-1],)
hidden_states = self.norm(hidden_states)
if per_layer_prefill_flag:
per_layer_prefill_flag = False
for layer in self.layers:
self.load_layer_to(layer, InferenceState.GENERATE)
if output_hidden_states:
all_hidden_states += (hidden_states,)
next_cache = None
if use_cache:
next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
if not return_dict:
return tuple(
v
for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits]
if v is not None
)
return MoeModelOutputWithPast(
last_hidden_state=hidden_states,
past_key_values=next_cache,
hidden_states=all_hidden_states,
attentions=all_self_attns,
router_logits=all_router_logits,
)
def load_layer_to(self, layer:Qwen2MoeDecoderLayer, target: InferenceState):
assert isinstance(layer, Qwen2MoeDecoderLayer), "layer should be a Qwen2MoeDecoderLayer"
# TODO Support restore to original device, not only cuda
device = "cpu" if target == InferenceState.UNLOAD else "cuda"
# attn
layer.self_attn.q_proj.set_inference_mode(target)
layer.self_attn.k_proj.set_inference_mode(target)
layer.self_attn.v_proj.set_inference_mode(target)
layer.self_attn.o_proj.set_inference_mode(target)
layer.self_attn.rotary_emb = layer.self_attn.rotary_emb.to(device)
# mlp
if isinstance(layer.mlp, Qwen2MoeSparseMoeBlock):
layer.mlp.gate.set_inference_mode(target)
layer.mlp.experts.set_inference_mode(target)
layer.mlp.shared_expert.gate_proj.set_inference_mode(target)
layer.mlp.shared_expert.up_proj.set_inference_mode(target)
layer.mlp.shared_expert.down_proj.set_inference_mode(target)
layer.mlp.shared_expert.act_fn.to(device)
layer.mlp.shared_expert_gate.to(device)
else:
layer.mlp.gate_proj.set_inference_mode(target)
layer.mlp.up_proj.set_inference_mode(target)
layer.mlp.down_proj.set_inference_mode(target)
layer.mlp.act_fn.to(device)
# layer norm
layer.input_layernorm.to(device)
layer.post_attention_layernorm.to(device)
DeepseekV2_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
`past_key_values`).
If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
information on the default strategy.
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.n_positions - 1]`.
[What are position IDs?](../glossary#position-ids)
past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
Two formats are allowed:
- a [`~cache_utils.Cache`] instance;
- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
cache format.
The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
legacy cache format will be returned.
If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
of shape `(batch_size, sequence_length)`.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
class DeepseekV2ModelPerLayerPrefill(BaseInjectedModule):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV2DecoderLayer`]
Args:
config: DeepseekV2Config
"""
def __init__(
self,
key: str,
gguf_loader : GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
per_layer_prefill_intput_threshold: int = 30000, # if None, no per-layer prefill
**kwargs,
):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
self.per_layer_prefill_intput_threshold = per_layer_prefill_intput_threshold
@add_start_docstrings_to_model_forward(DeepseekV2_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
per_layer_prefill_intput_threshold: int | None = None, # if None, no per-layer prefill
) -> Union[Tuple, BaseModelOutputWithPast]:
if per_layer_prefill_intput_threshold is None: per_layer_prefill_intput_threshold = self.per_layer_prefill_intput_threshold
per_layer_prefill_flag = False
seq_length = inputs_embeds.size(1) if inputs_embeds is not None else input_ids.size(1)
if per_layer_prefill_intput_threshold and per_layer_prefill_intput_threshold < seq_length:
per_layer_prefill_flag = True
for layer in self.layers:
self.load_layer_to(layer, InferenceState.UNLOAD)
torch.cuda.empty_cache()
else:
pass
output_attentions = (
output_attentions
if output_attentions is not None
else self.config.output_attentions
)
output_hidden_states = (
output_hidden_states
if output_hidden_states is not None
else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = (
return_dict if return_dict is not None else self.config.use_return_dict
)
# retrieve input_ids and inputs_embeds
if input_ids is not None and inputs_embeds is not None:
raise ValueError(
"You cannot specify both input_ids and inputs_embeds at the same time"
)
elif input_ids is not None:
batch_size, seq_length = input_ids.shape[:2]
elif inputs_embeds is not None:
batch_size, seq_length = inputs_embeds.shape[:2]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`transformers."
)
use_cache = False
past_key_values_length = 0
if use_cache:
use_legacy_cache = not isinstance(past_key_values, Cache)
if use_legacy_cache:
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
past_key_values_length = past_key_values.get_usable_length(seq_length)
if cache_position is None:
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
cache_position = torch.arange(
past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
)
if position_ids is None:
position_ids = cache_position.unsqueeze(0)
if inputs_embeds is None:
org_device = input_ids.device
input_ids = input_ids.to("cpu")
inputs_embeds = self.embed_tokens(input_ids)
input_ids = input_ids.to(org_device)
causal_mask = self._update_causal_mask(
attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
)
# embed positions
hidden_states = inputs_embeds
if per_layer_prefill_flag:
print(f'Total length of input_ids: {hidden_states.size(1)}')
# decoder layers
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
next_decoder_cache = None
t_gpu = 0
t_cpu = 0
t_f = 0
for decoder_layer in self.layers:
if output_hidden_states:
all_hidden_states += (hidden_states,)
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
decoder_layer.__call__,
hidden_states,
causal_mask,
position_ids,
past_key_values,
output_attentions,
use_cache,
cache_position,
)
else:
t3 = time.time()
if per_layer_prefill_flag:
# print(f"to gpu")
self.load_layer_to(decoder_layer, InferenceState.PREFILL)
torch.cuda.empty_cache()
t4 = time.time()
layer_outputs = decoder_layer(
hidden_states,
attention_mask=causal_mask,
position_ids=position_ids,
past_key_value=past_key_values,
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
)
t5 = time.time()
if per_layer_prefill_flag:
# print(f"to cpu")
self.load_layer_to(decoder_layer, InferenceState.UNLOAD)
torch.cuda.empty_cache()
t6 = time.time()
t_gpu += t4-t3
t_cpu += t6-t5
t_f += t5-t4
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache = layer_outputs[2 if output_attentions else 1]
if output_attentions:
all_self_attns += (layer_outputs[1],)
hidden_states = self.norm(hidden_states)
if per_layer_prefill_flag:
t6 = time.time()
# print(f"restore")
per_layer_prefill_flag = False
for layer in self.layers:
self.load_layer_to(layer, InferenceState.GENERATE)
torch.cuda.empty_cache()
t7 = time.time()
print(f"total time: {t7-t3}, \n layer num{len(self.layers)}, gpu time: {t_gpu}, cpu time: {t_cpu}, forward time: {t_f}, restore time: {t7-t6}")
# add hidden states from the last decoder layer
if output_hidden_states:
all_hidden_states += (hidden_states,)
next_cache = None
if use_cache:
next_cache = (
next_decoder_cache.to_legacy_cache()
if use_legacy_cache
else next_decoder_cache
)
if not return_dict:
return tuple(
v
for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
if v is not None
)
return BaseModelOutputWithPast(
last_hidden_state=hidden_states,
past_key_values=next_cache,
hidden_states=all_hidden_states,
attentions=all_self_attns,
)
def load_layer_to(self, layer: DeepseekV2DecoderLayer, target: InferenceState):
assert isinstance(layer, DeepseekV2DecoderLayer), "layer should be a DeepseekV2DecoderLayer"
# TODO Support restore to original device, not only cuda
device = "cpu" if target == InferenceState.UNLOAD else "cuda"
# TODO Support DFS to auto use {to, set_inference_mode} according to the module type
# attn
layer.self_attn.to(device)
# mlp
if isinstance(layer.mlp, DeepseekV2MoE):
layer.mlp.gate.to(device)
layer.mlp.experts.set_inference_mode(target)
layer.mlp.shared_experts.gate_proj.set_inference_mode(target)
layer.mlp.shared_experts.up_proj.set_inference_mode(target)
layer.mlp.shared_experts.down_proj.set_inference_mode(target)
layer.mlp.shared_experts.act_fn.to(device)
# layer.mlp.shared_expert_gate.to(device)
else:
layer.mlp.gate_proj.set_inference_mode(target)
layer.mlp.up_proj.set_inference_mode(target)
layer.mlp.down_proj.set_inference_mode(target)
layer.mlp.act_fn.to(device)
# layer norm
layer.input_layernorm.to(device)
layer.post_attention_layernorm.to(device)

#!/usr/bin/env python
# coding=utf-8
'''
Description :
Author : Azure-Tang, Boxin Zhang
Date : 2024-07-25 11:25:24
Version : 0.1.0
LastEditors : Azure
LastEditTime : 2024-07-26 09:27:53
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import torch
from torch import nn
import KTransformersOps
from ktransformers.util.custom_gguf import GGUFLoader
from ktransformers.util.utils import InferenceState
from ktransformers.ktransformers_ext.operators.custom_marlin.quantize.utils.marlin_utils import (
MarlinWorkspace,
marlin_quantize,
GPTQ_MARLIN_MIN_THREAD_N,
GPTQ_MARLIN_MAX_PARALLEL,
)
from ktransformers.operators.base_operator import BaseInjectedModule
from transformers.configuration_utils import PretrainedConfig
from abc import ABC, abstractmethod
#class QuantizedLinearBase(BaseInjectedModule, ABC):
class QuantizedLinearBase(ABC):
def __init__(
self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module = None,
device: str = "cuda",
**kwargs,
):
# super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
super().__init__()
self.key = key
self.gguf_loader = gguf_loader
self.device = device
self.config = config
self.has_bias = False
self.dtype = torch.get_default_dtype()
if orig_module is not None:
self.in_features = orig_module.in_features
self.out_features = orig_module.out_features
else:
shape = self.gguf_loader.tensor_info[key + ".weight"]["shape"]
if len(shape) == 1:
print("Warning: orig_module is not set, but has in_features or out_features equals to 1, can't get in_features and out_features from GGUF")
self.in_features = self.gguf_loader.tensor_info[key + ".weight"]["shape"][0]
self.out_features = self.gguf_loader.tensor_info[key + ".weight"]["shape"][1]
@abstractmethod
def forward(self, x: torch.Tensor) -> torch.Tensor:
pass
def load_weight(self, override_key: list[str] | None = None, device: str | None = None):
if override_key is not None:
keys = override_key
else:
keys = [self.key]
for key in keys:
if key + ".weight" in self.gguf_loader.tensor_file_map:
if key + ".bias" in self.gguf_loader.tensor_file_map:
tensors = self.load_multi(key, ["weight", "bias"], device=device)
tensor = tensors["weight"]
bias = tensors["bias"]
# self.qtype = GGML_TYPE_QTYPE_MAP[tensorinfo[key + ".weight"]["ggml_type"]]
# print(torch.isinf(tensor).any(), torch.isinf(bias).any())
return nn.Parameter(tensor), nn.Parameter(bias)
else:
tensors = self.load_multi(key, ["weight"], device=device)
tensor = tensors["weight"]
# self.qtype = GGML_TYPE_QTYPE_MAP[tensorinfo[key + ".weight"]["ggml_type"]]
return nn.Parameter(tensor)
else:
raise FileNotFoundError(f"Weight file not found for key {key}")
def load_multi(self, key: str, keys: list[str], device: str = "cpu"):
tensors = {}
for k in keys:
tensors[k] = self.gguf_loader.load_gguf_tensor(key + "." + k, device=device)
return tensors
@abstractmethod
def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = "cuda"):
pass
@abstractmethod
def unload(self):
pass
class QuantizedLinearTorch(QuantizedLinearBase):
def __init__(
self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module = None,
device: str = "cuda",
**kwargs,
):
super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
self.has_bias = False
self.dtype = torch.get_default_dtype()
self.w = None
self.has_bias = False
def forward(self, x: torch.Tensor) -> torch.Tensor:
dtype = x.dtype
out_device = x.device
x = x.to(device=self.device, dtype=self.dtype)
x = x @ self.w
if self.has_bias:
x = x + self.bias
x = x.to(dtype=dtype, device=out_device)
return x
def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None):
if device is None: device = self.device
if w is None: w = self.load_weight(device=device)
if isinstance(w, nn.Parameter):
self.w = w.to(dtype=self.dtype).view(self.out_features, self.in_features).T
self.has_bias = False
elif isinstance(w, tuple):
self.w = w[0].to(dtype=self.dtype).view(self.out_features, self.in_features).T
self.bias = w[1].to(dtype=self.dtype)
self.has_bias = True
else:
raise ValueError("Invalid weight type")
# self.linear = self.linear.to(device)
self.w = self.w.to(device)
if self.has_bias:
self.bias = self.bias.to(device)
def unload(self):
if self.w is not None:
self.w = None
if self.has_bias:
self.bias = None
class QuantizedLinearMarlin(QuantizedLinearBase):
marlin_q_w: torch.Tensor
marlin_s: torch.Tensor
g_idx: torch.Tensor
sort_indices: torch.Tensor
has_bias: bool
def __init__(
self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module = None,
device: str = "cuda",
num_bits: int = 4, # 4-bit/8-bit is supported
group_size: int = 64, # -1, 32, 64, 128
act_order: bool = False,
is_k_full=True,
**kwargs,
):
assert device.lower() != "cpu", "Marlin quantized linear only supports GPU device"
super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
self.num_bits = num_bits
self.group_size = group_size
self.act_order = act_order
self.is_k_full = is_k_full
def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = "cuda"):
if device is None: device = self.device
assert device.lower() != "cpu", "Marlin quantized linear only supports GPU device"
if w is None: w = self.load_weight(device=device)
if isinstance(w, nn.Parameter):
# pad weight
weight = w.view(self.out_features, self.in_features).T
self.has_bias = False
elif isinstance(w, tuple):
w = list(w)
weight = w[0].view(self.out_features, self.in_features).T
self.bias = w[1]
self.has_bias = True
else:
raise ValueError("Invalid weight type")
weight = weight.to(device)
if self.has_bias:
self.bias = self.bias.to(device)
# Pack Marlin linear
w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, _ = marlin_quantize(
weight, self.num_bits, self.group_size, self.act_order
)
self.workspace = MarlinWorkspace(
self.out_features, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL
)
self.marlin_q_w = marlin_q_w
self.marlin_s = marlin_s
self.g_idx = g_idx
self.sort_indices = sort_indices
self.k = weight.shape[0]
self.n = weight.shape[1]
def forward(self, x: torch.Tensor) -> torch.Tensor:
# Only support input x as BF16 and FP16
x = x.to(self.device)
orig_shape = list(x.shape)
orig_dtype = x.dtype
x = x.reshape(-1, x.shape[-1])
marlin_s = self.marlin_s.to(x.dtype)
x = KTransformersOps.gptq_marlin_gemm(
x,
self.marlin_q_w,
marlin_s,
self.g_idx,
self.sort_indices,
self.workspace.scratch,
self.num_bits,
x.shape[0],
self.n,
x.shape[-1],
self.is_k_full,
)
if self.has_bias:
x = x + self.bias
orig_shape[-1] = self.n
return x.reshape(orig_shape).to(orig_dtype)
def unload(self):
if self.has_bias:
self.bias = None
self.marlin_q_w = None
self.marlin_s = None
self.g_idx = None
self.sort_indices = None
self.workspace = None
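`QuantizedLinearMarlin.forward` above collapses arbitrary leading batch dimensions into a 2-D GEMM of shape `(rows, k) @ (k, n)` and then restores the original shape with the last dimension replaced by `self.n`. A framework-free sketch of just that shape bookkeeping (the helper name `matmul_shape` is hypothetical):

```python
# Hypothetical shape bookkeeping mirroring QuantizedLinearMarlin.forward:
# collapse leading dims, apply a (k -> n) kernel, restore the batch shape.
from math import prod

def matmul_shape(orig_shape: list[int], k: int, n: int) -> list[int]:
    """Output shape of a linear kernel W: (k, n) applied to x of shape orig_shape."""
    assert orig_shape[-1] == k, "last dim must match the kernel's input size"
    rows = prod(orig_shape[:-1])          # rows after x.reshape(-1, k)
    out_2d = (rows, n)                    # what the GEMM kernel produces
    out_shape = list(orig_shape)
    out_shape[-1] = n                     # mirrors `orig_shape[-1] = self.n`
    assert prod(out_shape) == prod(out_2d)
    return out_shape
```

This is why the kernel only needs `x.shape[0]` (rows) and `self.n`; everything else about the batch layout is recovered by the final `reshape(orig_shape)`.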
LINEAR_MAP = {
"QuantizedLinearMarlin": QuantizedLinearMarlin,
"QuantizedLinearTorch": QuantizedLinearTorch,
}
class KTransformerLinear(BaseInjectedModule, QuantizedLinearBase):
def __init__(
self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
generate_device: str = "cuda",
generate_op: str| None = "QuantizedLinearMarlin",
prefill_device: str = "cuda",
prefill_op: str| None = "QuantizedLinearTorch",
**kwargs,
):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
QuantizedLinearBase.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
# build all the linear operators
if prefill_op is not None:
assert prefill_op in LINEAR_MAP, f"linear_type {prefill_op} not supported"
if prefill_op == "QuantizedLinearMarlin" and (orig_module.in_features%GPTQ_MARLIN_MIN_THREAD_N!=0 or orig_module.out_features%GPTQ_MARLIN_MIN_THREAD_N!=0):
print(f"This linear module's in_features or out_features is not divisible by GPTQ_MARLIN_MIN_THREAD_N({GPTQ_MARLIN_MIN_THREAD_N}), using QuantizedLinearTorch instead.")
print(f"module info: key:{key} orig_module:{orig_module}")
self.prefill_linear = QuantizedLinearTorch(key, gguf_loader, config, orig_module, prefill_device, **kwargs)
else:
self.prefill_linear = LINEAR_MAP[prefill_op](key, gguf_loader, config, orig_module, prefill_device, **kwargs)
else:
self.prefill_linear = None
if generate_op is not None:
assert generate_op in LINEAR_MAP, f"linear_type {generate_op} not supported"
if generate_op == "QuantizedLinearMarlin" and (orig_module.in_features%GPTQ_MARLIN_MIN_THREAD_N!=0 or orig_module.out_features%GPTQ_MARLIN_MIN_THREAD_N!=0):
print(f"This linear module's in_features or out_features is not divisible by GPTQ_MARLIN_MIN_THREAD_N({GPTQ_MARLIN_MIN_THREAD_N}), using QuantizedLinearTorch instead.")
print(f"module info: key:{key} orig_module:{orig_module}")
self.generate_op = "QuantizedLinearTorch"
self.generate_linear = QuantizedLinearTorch(key, gguf_loader, config, orig_module, generate_device, **kwargs)
else:
self.generate_linear = LINEAR_MAP[generate_op](key, gguf_loader, config, orig_module, generate_device, **kwargs)
else:
self.generate_linear = None
self.device = device
self.mode = InferenceState.UNLOAD
def forward(self, x):
if self.mode == InferenceState.PREFILL:
assert self.prefill_linear is not None, "prefill linear is not initialized"
return self.prefill_linear.forward(x)
else:
assert self.generate_linear is not None, "generate linear is not initialized"
return self.generate_linear.forward(x)
def load(self, w: dict | nn.Parameter | tuple | None = None, mode: InferenceState = InferenceState.GENERATE):
if not mode:
mode = InferenceState.GENERATE
# load to device
if mode == InferenceState.PREFILL:
self.generate_linear.unload()
self.prefill_linear.load(w=w)
self.device = self.prefill_linear.device
elif mode == InferenceState.GENERATE:
self.prefill_linear.unload()
self.generate_linear.load(w=w)
self.device = self.generate_linear.device
elif mode == InferenceState.UNLOAD:
self.prefill_linear.unload()
self.generate_linear.unload()
self.device = "cpu"
else:
raise ValueError("mode must be either InferenceState.GENERATE, InferenceState.PREFILL or InferenceState.UNLOAD")
self.mode = mode
def unload(self):
if self.prefill_linear is not None:
self.prefill_linear.unload()
if self.generate_linear is not None:
self.generate_linear.unload()
self.device = "cpu"
def set_inference_mode(self, mode: InferenceState):
if not mode:
mode = InferenceState.GENERATE
if mode == InferenceState.GENERATE:
self.load(mode=InferenceState.GENERATE)
elif mode == InferenceState.PREFILL:
self.load(mode=InferenceState.PREFILL)
elif mode == InferenceState.UNLOAD:
self.unload()
else:
raise ValueError("mode must be either InferenceState.GENERATE, InferenceState.PREFILL or InferenceState.UNLOAD")
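The class above keeps two backend linears and routes `forward()` by inference state, loading one backend and unloading the other on each mode switch. A minimal, dependency-free sketch of that dispatch pattern (toy `Backend`/`State` names for illustration, not the repo's classes):

```python
from enum import Enum

class State(Enum):
    UNLOAD = 0
    PREFILL = 1
    GENERATE = 2

class Backend:
    """Stand-in for a prefill/generate linear implementation."""
    def __init__(self, name):
        self.name = name
        self.loaded = False
    def load(self):
        self.loaded = True
    def unload(self):
        self.loaded = False
    def forward(self, x):
        assert self.loaded, f"{self.name} backend is not loaded"
        return f"{self.name}({x})"

class DualLinear:
    """Route forward() to one of two backends based on inference state."""
    def __init__(self):
        self.prefill = Backend("prefill")
        self.generate = Backend("generate")
        self.mode = State.UNLOAD
    def set_mode(self, mode):
        # load the backend for the new mode, unload the other one
        if mode == State.PREFILL:
            self.generate.unload()
            self.prefill.load()
        elif mode == State.GENERATE:
            self.prefill.unload()
            self.generate.load()
        else:
            self.prefill.unload()
            self.generate.unload()
        self.mode = mode
    def forward(self, x):
        backend = self.prefill if self.mode == State.PREFILL else self.generate
        return backend.forward(x)
```

The real class does the same but with quantized weight loading per device; the key invariant is that at most one backend holds weights at a time.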


@@ -0,0 +1,102 @@
'''
Description :
Author : Boxin Zhang
Version : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
from typing import Mapping, List
import torch
import yaml
import re
from torch import nn
from transformers import AutoConfig
from transformers.configuration_utils import PretrainedConfig
# from operators import BaseInjectedModule
from ktransformers.util.custom_gguf import GGUFLoader, translate_name_to_gguf
from ktransformers.util.utils import set_module, load_weights
import itertools
def inject(module, local_optimization_dict, model_config: AutoConfig, gguf_loader: GGUFLoader, prefix=''):
for name, child in module._modules.items():
if child is not None:
child_prefix = prefix + name
if child_prefix in local_optimization_dict:
inject_module_meta=local_optimization_dict[child_prefix]
if isinstance(inject_module_meta, Mapping):
import_path = inject_module_meta["class"].split(".")
import_module_name = ".".join(import_path[:-1])
import_class_name = import_path[-1]
module_cls=getattr(__import__(import_module_name, fromlist=[""]), import_class_name)
print(f"Injecting {child_prefix} as {import_module_name}.{import_class_name}")
inject_module=module_cls(key = inject_module_meta["key"], gguf_loader = gguf_loader, config = model_config, orig_module=child, device = inject_module_meta["device"], **inject_module_meta["kwargs"])
set_module(module, name, inject_module)
elif isinstance(inject_module_meta, str):
assert inject_module_meta=="default", "for str inject_module_meta, only support \"default\"."
else:
raise Exception("inject_module_meta must be a dict or str")
child_prefix += "."
child_optimization_dict = {k: v for k, v in local_optimization_dict.items() if k.startswith(child_prefix)}
inject(child, child_optimization_dict, model_config, gguf_loader, child_prefix)
def del_meta(module:nn.Module):
#print("default loading weights", prefix)
persistent_buffers = {k: v for k, v in module._buffers.items() if k not in module._non_persistent_buffers_set}
local_name_params = itertools.chain(module._parameters.items(), persistent_buffers.items())
local_state = {k: v for k, v in local_name_params if v is not None}
for name, param in local_state.items():
if param.device == "meta" or param.device == torch.device("meta"):
module.__delattr__(name)
for name, child in module._modules.items():
del_meta(child)
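`del_meta` walks the module tree and deletes parameters still sitting on torch's placeholder "meta" device before real weights are loaded. A framework-free sketch of the same recursive pruning pattern (toy `Param`/`Module` classes, not torch's):

```python
class Param:
    """Stand-in for a tensor that records which device it lives on."""
    def __init__(self, device):
        self.device = device

class Module:
    """Stand-in for nn.Module: named parameters plus child modules."""
    def __init__(self, **children):
        self._parameters = {}
        self._modules = dict(children)

def prune_meta(module):
    # drop parameters still on the placeholder "meta" device, then recurse;
    # collect names first so we never mutate the dict while iterating it
    for name in [n for n, p in module._parameters.items() if p.device == "meta"]:
        del module._parameters[name]
    for child in module._modules.values():
        prune_meta(child)
```

In the real code the placeholders come from constructing the model under `with torch.device("meta")`, which allocates shapes without memory.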
def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, prefix: str="", default_device: str = "cuda:0"):
module_name = prefix[:-1]
translated_name = translate_name_to_gguf(prefix)[:-1]
#print("gen_optimize_config", prefix, module_name, translated_name)
recursive = True
for rule in rule_list:
#print(rule)
match_meta = rule["match"]
if "class" in match_meta:
import_path = match_meta["class"].split(".")
import_module_name = ".".join(import_path[:-1])
import_class_name = import_path[-1]
module_cls=getattr(__import__(import_module_name, fromlist=[""]), import_class_name)
if not isinstance(module, module_cls):
continue
if "name" in match_meta:
if re.search(match_meta["name"], module_name) is None:
continue
replace_meta = rule["replace"]
out_data[module_name]={"key": translated_name,
"class": replace_meta["class"],
"device": replace_meta["device"] if "device" in replace_meta else default_device,
"kwargs": replace_meta["kwargs"] if "kwargs" in replace_meta else dict()}
if "recursive" in rule:
recursive = bool(rule["recursive"])
if module_name not in out_data:
out_data[module_name]="default"
#print(out_data[module_name])
#input()
if recursive:
for name, child in module._modules.items():
if child is not None:
child_prefix = prefix + name + "."
gen_optimize_config(child, out_data, rule_list, child_prefix, default_device)
def optimize_and_load_gguf(module: nn.Module, rule_file: str, gguf_path: str, model_config: PretrainedConfig, default_device: str = "cuda:0"):
with open(rule_file, 'r', encoding='utf-8') as f:
rule_list = yaml.load(f.read(), Loader=yaml.FullLoader)
optimize_config = dict()
gen_optimize_config(module, optimize_config, rule_list, default_device = default_device)
gguf_loader=GGUFLoader(gguf_path)
with torch.device("meta"):
inject(module, optimize_config, model_config, gguf_loader)
load_weights(module, gguf_loader)
del_meta(module)
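`gen_optimize_config` matches each module against the rule list by class and by a regex over the dotted module name; the first matching rule determines the replacement class, and unmatched modules fall back to `"default"`. The name-matching logic in isolation (toy rule list here, illustrative only):

```python
import re

def match_rule(module_name, rule_list, default="default"):
    """Return the replacement class for the first rule whose
    name pattern matches the dotted module name."""
    for rule in rule_list:
        pattern = rule["match"].get("name")
        if pattern is None or re.search(pattern, module_name):
            return rule["replace"]["class"]
    return default

rules = [
    # replace expert containers with the custom MoE kernel
    {"match": {"name": r"^model\.layers\..*\.mlp\.experts$"},
     "replace": {"class": "KTransformersMLPExpert"}},
    # replace every layer linear except those under self_attn
    # (negative lookahead, as in the YAML rule files)
    {"match": {"name": r"^model\.layers\.(?!.*self_attn).*$"},
     "replace": {"class": "KTransformerLinear"}},
]
```

Rule order matters: more specific patterns must precede broad ones, exactly as in the YAML files below.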


@@ -0,0 +1,41 @@
- match:
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbedding
- match:
name: "^model\\.layers\\.(?!.*self_attn).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
generate_op: "QuantizedLinearMarlin"
prefill_op: "QuantizedLinearTorch"
- match:
name: "^model\\.layers\\..*\\.mlp$"
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
replace:
class: ktransformers.operators.experts.DeepseekV2MoEInjected # mlp module with custom forward function
- match:
name: "^model\\.layers\\..*\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert parallelism
device: "cpu" # which device to load this module to when initializing
kwargs:
prefill_device: "cuda"
prefill_mlp_type: "MLPExpertsTorch"
generate_device: "cpu"
generate_mlp_type: "MLPCPUExperts"
out_device: "cuda"
recursive: False # don't recursively inject submodules of this module
- match:
name: "^model\\.layers\\..*\\.self_attn$"
replace:
class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation
- match:
name: "^model$"
replace:
class: "ktransformers.operators.layer_wise_prefill.DeepseekV2ModelPerLayerPrefill"
kwargs:
per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
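The `recursive: False` flag in the rules above stops rule generation from descending into a replaced module's children, so the experts container is handled as one unit instead of expert-by-expert. A toy sketch of that cut-off during tree traversal (illustrative names, not the repo's API):

```python
def walk(tree, stop_at, prefix="", out=None):
    """Collect dotted module names, skipping the children of any
    node whose path is in stop_at (the recursive: False set)."""
    if out is None:
        out = []
    for name, child in tree.items():
        path = prefix + name
        out.append(path)
        # descend only if this node is not marked non-recursive
        if path not in stop_at and isinstance(child, dict):
            walk(child, stop_at, path + ".", out)
    return out

model = {"mlp": {"experts": {"0": {}, "1": {}}, "gate": {}}}
```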


@@ -0,0 +1,37 @@
- match:
class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.RotaryEmbedding
- match:
name: "^model\\.layers\\..*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
generate_op: "QuantizedLinearMarlin"
prefill_op: "QuantizedLinearTorch"
- match:
name: "^model\\.layers\\..*\\.mlp$"
class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock
replace:
class: ktransformers.operators.experts.Qwen2MoeSparseMoeBlockInjected # mlp module with custom forward function
- match:
name: "^model\\.layers\\..*\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert parallelism
device: "cpu" # which device to load this module to when initializing
kwargs:
prefill_device: "cuda"
prefill_mlp_type: "MLPExpertsTorch"
generate_device: "cpu"
generate_mlp_type: "MLPCPUExperts"
out_device: "cuda"
recursive: False # don't recursively inject submodules of this module
- match:
name: "^model$"
replace:
class: "ktransformers.operators.layer_wise_prefill.Qwen2MoeModelPerLayerPrefill"
kwargs:
per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill


@@ -0,0 +1,10 @@
from fastapi import APIRouter
from .ollama import router as ollama_router
from .openai import router as openai_router,post_db_creation_operations
from .web import router as web_router
router = APIRouter()
router.include_router(ollama_router)
router.include_router(openai_router)
router.include_router(web_router)


@@ -0,0 +1,6 @@
from fastapi import APIRouter
from .completions import router as completions_router
router = APIRouter()
router.include_router(completions_router)


@@ -0,0 +1,160 @@
from datetime import datetime
from http.client import NOT_IMPLEMENTED
import json
from time import time
from uuid import uuid4
from typing import List, Optional
from fastapi import APIRouter, Request
from pydantic import BaseModel, Field
from ktransformers.server.config.config import Config
from ktransformers.server.utils.create_interface import get_interface
from ktransformers.server.schemas.assistants.streaming import check_link_response
from ktransformers.server.backend.base import BackendInterfaceBase
router = APIRouter(prefix='/api')
# https://github.com/ollama/ollama/blob/main/docs/api.md#generate-a-completion
class OllamaGenerateCompletionRequest(BaseModel):
model: str = Field(..., description="The model name, which is required.")
prompt: Optional[str] = Field(
None, description="The prompt to generate a response for.")
images: Optional[List[str]] = Field(
None, description="A list of base64-encoded images for multimodal models such as llava.")
# Advanced parameters
format: Optional[str] = Field(
None, description="The format to return a response in, accepted value is json.")
options: Optional[dict] = Field(
None, description="Additional model parameters as listed in the documentation.")
system: Optional[str] = Field(
None, description="System message to override what is defined in the Modelfile.")
template: Optional[str] = Field(
None, description="The prompt template to use, overriding what is defined in the Modelfile.")
context: Optional[str] = Field(
None, description="The context parameter from a previous request to keep a short conversational memory.")
stream: Optional[bool] = Field(
None, description="If false, the response will be returned as a single response object.")
raw: Optional[bool] = Field(
None, description="If true, no formatting will be applied to the prompt.")
keep_alive: Optional[str] = Field(
"5m", description="Controls how long the model will stay loaded into memory following the request.")
class OllamaGenerationStreamResponse(BaseModel):
model: str
created_at: str
response: str
done: bool = Field(...)
class OllamaGenerationResponse(BaseModel):
pass
@router.post("/generate", tags=['ollama'])
async def generate(request: Request, input: OllamaGenerateCompletionRequest):
id = str(uuid4())
interface: BackendInterfaceBase = get_interface()
print(f'COMPLETION INPUT:----\n{input.prompt}\n----')
config = Config()
if input.stream:
async def inner():
async for token in interface.inference(input.prompt,id):
d = OllamaGenerationStreamResponse(model=config.model_name,created_at=str(datetime.now()),response=token,done=False)
yield d.model_dump_json()+'\n'
# d = {'model':config.model_name,'created_at':"", 'response':token,'done':False}
# yield f"{json.dumps(d)}\n"
# d = {'model':config.model_name,'created_at':"", 'response':'','done':True}
# yield f"{json.dumps(d)}\n"
d = OllamaGenerationStreamResponse(model=config.model_name,created_at=str(datetime.now()),response='',done=True)
yield d.model_dump_json()+'\n'
return check_link_response(request,inner())
else:
raise NotImplementedError
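The `/api/generate` endpoint above streams newline-delimited JSON, one `OllamaGenerationStreamResponse` per line, terminated by a chunk with `done: true`. A client-side sketch of reassembling the full response from such a stream (the chunk contents here are illustrative, not real server output):

```python
import json

def collect_stream(lines):
    """Concatenate `response` fields from NDJSON chunks until done=True."""
    text = []
    for line in lines:
        chunk = json.loads(line)
        if chunk["done"]:
            break
        text.append(chunk["response"])
    return "".join(text)

# example NDJSON stream in the shape this endpoint emits
stream = [
    '{"model": "m", "created_at": "t", "response": "Hel", "done": false}',
    '{"model": "m", "created_at": "t", "response": "lo", "done": false}',
    '{"model": "m", "created_at": "t", "response": "", "done": true}',
]
```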
# https://github.com/ollama/ollama/blob/main/docs/api.md#generate-a-chat-completion
class OllamaChatCompletionRequest(BaseModel):
pass
class OllamaChatCompletionStreamResponse(BaseModel):
pass
class OllamaChatCompletionResponse(BaseModel):
pass
@router.post("/chat", tags=['ollama'])
async def chat(request: Request, input: OllamaChatCompletionRequest):
raise NotImplementedError
# https://github.com/ollama/ollama/blob/main/docs/api.md#list-local-models
class OllamaModel(BaseModel):
name: str
modified_at: str
size: int
# TODO: fill the rest correctly
# mock ollama
@router.get("/tags",tags=['ollama'])
async def tags():
config = Config()
# TODO: fill this correctly, although it does not affect Tabby
return {"models": [OllamaModel(name=config.model_name, modified_at="123", size=123)]}
class OllamaModelInfo(BaseModel):
# TODO: fill this correctly
pass
class OllamaShowRequest(BaseModel):
name: str = Field(..., description="Name of the model to show")
verbose: Optional[bool] = Field(
None, description="If set to true, returns full data for verbose response fields")
class OllamaShowDetail(BaseModel):
parent_model: str
format: str
family: str
families: List[str]
parameter_size: str
quantization_level: str
class OllamaShowResponse(BaseModel):
modelfile: str
parameters: str
template: str
details: OllamaShowDetail
model_info: OllamaModelInfo
@router.post("/show", tags=['ollama'])
async def show(request: Request, input: OllamaShowRequest):
config = Config()
# TODO: Add more info in config to return, although it does not affect Tabby
return OllamaShowResponse(
modelfile = "# Modelfile generated by ...",
parameters = " ",
template = " ",
details = OllamaShowDetail(
parent_model = " ",
format = "gguf",
family = " ",
families = [
" "
],
parameter_size = " ",
quantization_level = " "
),
model_info = OllamaModelInfo()
)


@@ -0,0 +1,15 @@
from fastapi import APIRouter
from .assistants import router as assistants_router,create_default_assistant
from .endpoints.chat import router as chat_router
from .legacy import router as legacy_router
router = APIRouter(prefix='/v1')
router.include_router(assistants_router)
router.include_router(chat_router)
router.include_router(legacy_router)
def post_db_creation_operations():
create_default_assistant()


@@ -0,0 +1,14 @@
from fastapi import APIRouter
from .assistants import router as assistants_router, create_default_assistant
from .messages import router as messages_router
from .runs import router as runs_router
from .threads import router as threads_router
router = APIRouter()
threads_router.include_router(runs_router)
threads_router.include_router(messages_router)
router.include_router(assistants_router)
router.include_router(threads_router)


@@ -0,0 +1,103 @@
from typing import Optional
from fastapi import APIRouter
from fastapi.testclient import TestClient
from ktransformers.server.crud.assistants.assistants import AssistantDatabaseManager
from ktransformers.server.crud.assistants.runs import RunsDatabaseManager
from ktransformers.server.schemas.assistants.assistants import AssistantCreate, AssistantModify, ObjectID, AssistantBuildStatus, AssistantObject
from ktransformers.server.schemas.base import DeleteResponse, Order
from ktransformers.server.config.log import logger
router = APIRouter(prefix="/assistants")
assistant_manager = AssistantDatabaseManager()
runs_manager = RunsDatabaseManager()
@router.post("/", tags=['openai'])
async def create_assistant(
assistant: AssistantCreate,
):
return assistant_manager.db_create_assistant(assistant).as_api_response()
@router.get("/", tags=['openai'])
async def list_assistants(
limit: Optional[int] = 20,
order: Order = Order.DESC,
after: Optional[str] = None,
before: Optional[str] = None,
):
return [assistant.as_api_response() for assistant in assistant_manager.db_list_assistants(limit, order)]
# list assistant with status
@router.get("/status", tags=['openai-ext'])
async def list_assistants_with_status(
limit: Optional[int] = 20,
order: Order = Order.DESC,
after: Optional[str] = None,
before: Optional[str] = None,
):
return assistant_manager.db_list_assistants(limit, order)
@router.get("/{assistant_id}", tags=['openai'])
async def retrieve_assistant(
assistant_id: str,
):
return assistant_manager.db_get_assistant_by_id(assistant_id).as_api_response()
@router.post("/{assistant_id}", tags=['openai'])
async def modify_assistant(
assistant_id: str,
assistant: AssistantModify,
):
return assistant_manager.db_update_assistant_by_id(assistant_id, assistant).as_api_response()
@router.delete("/{assistant_id}", tags=['openai'], response_model=DeleteResponse)
async def delete_assistant(assistant_id: str):
assistant_manager.db_delete_assistant_by_id(assistant_id)
return DeleteResponse(id=assistant_id, object="assistant.deleted")
@router.get("/{assistant_id}/related_thread", tags=['openai'])
async def get_related_thread(assistant_id: ObjectID):
assistant = assistant_manager.db_get_assistant_by_id(assistant_id)
return assistant.get_related_threads_ids()
def create_default_assistant():
logger.info('Creating default assistant')
if assistant_manager.db_count_assistants() == 0:
default_assistant = assistant_manager.db_create_assistant(AssistantCreate(name="KT Assistant",
model="default model",
instructions="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. """ +
"""Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. """ +
"""Please ensure that your responses are socially unbiased and positive in nature."""))
default_assistant.build_status.status = AssistantBuildStatus.Status.completed
default_assistant.sync_db()
# unit test
client = TestClient(router)
def test_create_assistant():
ass_create = AssistantCreate(model="awesome model", instructions="hello")
res = client.post("/", json=ass_create.model_dump(mode="json"))
assert res.status_code == 200
assistant = AssistantObject.model_validate(res.json())
assert assistant.model == ass_create.model
assert assistant.instructions == ass_create.instructions
res = client.get(f"/{assistant.id}")
ass1 = AssistantObject.model_validate(res.json())
assert assistant == ass1

Some files were not shown because too many files have changed in this diff.