Merge branch 'main' into hip

Azure-Tang 2025-03-13 04:17:49 -04:00
commit c009512a93
121 changed files with 10070 additions and 1369 deletions

19
.devcontainer/Dockerfile Normal file

@ -0,0 +1,19 @@
FROM pytorch/pytorch:2.5.1-cuda12.1-cudnn9-devel as compile_server
WORKDIR /workspace
ENV CUDA_HOME /usr/local/cuda
RUN <<EOF
apt update -y && apt install -y --no-install-recommends \
git \
wget \
vim \
gcc \
g++ \
cmake &&
rm -rf /var/lib/apt/lists/* &&
pip install --upgrade pip &&
pip install ninja pyproject numpy cpufeature &&
pip install flash-attn &&
cp /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /opt/conda/lib/
EOF
# Set the default shell to bash
CMD ["/bin/bash"]
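As a quick sanity check, the image above can also be built and started by hand from the repository root; this is only a sketch, and the `ktransformers-dev` tag is an arbitrary example name:

```sh
# Build the dev image directly from the devcontainer Dockerfile
# (run from the repository root; the tag name is just an example).
docker build -f .devcontainer/Dockerfile -t ktransformers-dev .

# Start an interactive container with GPU access and the repo mounted
# at /workspace, mirroring the devcontainer configuration below.
docker run --rm -it --gpus all -v "$PWD":/workspace ktransformers-dev
```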

.devcontainer/devcontainer.json Normal file

@ -0,0 +1,34 @@
{
"name": "Ktrans Dev Container",
"privileged": true,
"build": {
"dockerfile": "Dockerfile",
"context": "..",
"args": {
"http_proxy": "${env:http_proxy}",
"https_proxy": "${env:https_proxy}",
}
},
"runArgs": [
"--network=host",
"--gpus",
"all"
// "--gpu all"
],
"workspaceFolder": "/workspace",
"workspaceMount": "source=${localWorkspaceFolder},target=/workspace,type=bind,consistency=cached",
"mounts": [
"source=/mnt/data,target=/mnt/incontainer,type=bind,consistency=cached"
],
"customizations": {
"vscode": {
"extensions": [
],
"settings": {
"terminal.integrated.shell.linux": "/bin/bash",
"cmake.configureOnOpen": true,
"cmake.generator": "Ninja"
}
}
}
}
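For readers who prefer the command line over the VS Code UI, a minimal sketch of launching this configuration with the Dev Containers CLI (assuming `@devcontainers/cli` is installed via npm) is:

```sh
# Build and start the dev container described by .devcontainer/devcontainer.json.
devcontainer up --workspace-folder .

# Open a shell inside the running dev container.
devcontainer exec --workspace-folder . bash
```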

39
.github/ISSUE_TEMPLATE/-bug-.yaml vendored Normal file

@ -0,0 +1,39 @@
name: 🐞 Bug report
description: Create a report to help us reproduce and fix the bug
title: "[Bug] "
labels: ['Bug']
body:
- type: checkboxes
attributes:
label: Checklist
options:
- label: 1. I have searched related issues but cannot get the expected help.
- label: 2. The bug has not been fixed in the latest version.
- label: 3. Please note that if the bug-related issue you submitted lacks corresponding environment info and a minimal reproducible demo, it will be challenging for us to reproduce and resolve the issue, reducing the likelihood of receiving feedback.
- label: 4. If the issue you raised is not a bug but a question, please raise a discussion at https://github.com/kvcache-ai/ktransformers/discussions. Otherwise, it will be closed.
- label: 5. To help the community, I will use Chinese/English or attach a Chinese/English translation if using another language. Non-Chinese/English content without translation may be closed.
- type: textarea
attributes:
label: Describe the bug
description: A clear and concise description of what the bug is.
validations:
required: true
- type: textarea
attributes:
label: Reproduction
description: |
What command or script did you run? Which **model** are you using?
placeholder: |
A placeholder for the command.
validations:
required: true
- type: textarea
attributes:
label: Environment
description: |
Please provide the necessary environment information here (e.g. OS/GPU/CPU). Otherwise the issue will be closed.
placeholder: Environment here.
validations:
required: true

39
.github/ISSUE_TEMPLATE/-bug2-.yaml vendored Normal file

@ -0,0 +1,39 @@
name: 🐞 Bug report
description: Create a report to help us reproduce and fix the bug
title: "[Bug] "
labels: ['Bug']
body:
- type: checkboxes
attributes:
label: Checklist
options:
- label: 1. I have searched related issues but could not get the expected help.
- label: 2. The bug has not been fixed in the latest version.
- label: 3. Please note that if the bug-related issue you submit lacks corresponding environment info and a minimal reproducible example, it will be hard for us to reproduce and locate the problem, reducing the likelihood of receiving feedback.
- label: 4. If what you raise is not a bug but a question, please start a discussion at https://github.com/kvcache-ai/ktransformers/discussions. Otherwise this issue will be closed.
- label: 5. To facilitate community communication, I will use Chinese/English or attach a Chinese/English translation if using another language. Non-Chinese/English content without a translation may be closed.
- type: textarea
attributes:
label: Describe the bug
description: A clear and concise description of what the bug is.
validations:
required: true
- type: textarea
attributes:
label: Reproduction
description: |
What command or script did you run? Which **model** are you using?
placeholder: |
A placeholder for the command.
validations:
required: true
- type: textarea
attributes:
label: Environment
description: |
Please provide the necessary environment information here (e.g. OS/GPU/CPU); otherwise this issue will be closed.
placeholder: Environment here.
validations:
required: true

23
.github/ISSUE_TEMPLATE/-feature-.yaml vendored Normal file

@ -0,0 +1,23 @@
name: 🚀 Feature request
description: Suggest an idea for this project
title: "[Feature] "
body:
- type: checkboxes
attributes:
label: Checklist
options:
- label: 1. If the issue you raised is not a feature but a question, please raise a discussion at https://github.com/kvcache-ai/ktransformers/discussions. Otherwise, it will be closed.
- label: 2. To help the community, I will use Chinese/English or attach a Chinese/English translation if using another language. Non-English/Chinese content without translation may be closed.
- type: textarea
attributes:
label: Motivation
description: |
A clear and concise description of the motivation of the feature.
validations:
required: true
- type: textarea
attributes:
label: Related resources
description: |
If there is an official code release or third-party implementations, please also provide the information here, which would be very helpful.

23
.github/ISSUE_TEMPLATE/-feature2-.yaml vendored Normal file

@ -0,0 +1,23 @@
name: 🚀 Feature request
description: Suggest a new feature for this project
title: "[Feature] "
body:
- type: checkboxes
attributes:
label: Checklist
options:
- label: 1. If what you raise is not a new feature but a question, please start a discussion at https://github.com/kvcache-ai/ktransformers/discussions. Otherwise this issue will be closed.
- label: 2. To facilitate community communication, I will use Chinese/English or attach an English/Chinese translation if using another language. Non-English/Chinese content without a translation may be closed.
- type: textarea
attributes:
label: Motivation
description: |
A clear and concise description of the motivation for the feature.
validations:
required: true
- type: textarea
attributes:
label: Related resources
description: |
If there is an official implementation or a third-party implementation, please provide the information here; it would be very helpful.

32
.github/workflows/book-ci.yml vendored Normal file

@ -0,0 +1,32 @@
name: Book-CI
on:
push:
branches:
- main
# - server_support
pull_request:
branches:
- main
# - server_support
jobs:
test:
name: test
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
steps:
- uses: actions/checkout@v4
- name: Install Rust
run: |
rustup set profile minimal
rustup toolchain install stable
rustup default stable
- name: Setup mdBook
uses: peaceiris/actions-mdbook@v2
with:
mdbook-version: "latest"
# - name: Run tests
# run: mdbook test

49
.github/workflows/deploy.yml vendored Normal file

@ -0,0 +1,49 @@
name: Deploy
on:
push:
branches:
- main
# - server_support
pull_request:
branches:
- main
# - server_support
defaults:
run:
shell: bash
permissions:
contents: write
jobs:
deploy:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
steps:
- uses: actions/checkout@v4
- name: Install Rust
run: |
rustup set profile minimal
rustup toolchain install stable
rustup default stable
- name: Setup mdBook
uses: peaceiris/actions-mdbook@v2
with:
mdbook-version: "latest"
- run: mdbook build
# - name: Copy Assets
# run: |
# chmod +x ci/copy-assets.sh
# ci/copy-assets.sh ${{ matrix.os }}
- name: Deploy
uses: peaceiris/actions-gh-pages@v3
# or || github.ref == 'refs/heads/server_support'
if: ${{ github.ref == 'refs/heads/main' }}
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
publish_dir: ./book

98
.github/workflows/docker-image.yml vendored Normal file

@ -0,0 +1,98 @@
name: DockerHub CI
on:
release:
types: [published]
workflow_dispatch:
inputs:
choose:
description: 'Will you push the image to DockerHub? 0 for No, 1 for Yes'
required: true
default: '0'
type: string
# push:
# branches:
# - main
env:
DOCKERHUB_REPO: ${{ secrets.DOCKERHUB_USERNAME }}/ktransformers
jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Run tests
run: |
if [ -f docker-compose.test.yml ]; then
docker-compose --file docker-compose.test.yml build
docker-compose --file docker-compose.test.yml run sut
else
docker build . --file Dockerfile
fi
docker_task:
needs: test
name: ${{ matrix.instruct}}
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
include:
# for amd64
- {instruct: "FANCY", platform: "linux/amd64"}
- {instruct: "AVX512", platform: "linux/amd64"}
- {instruct: "AVX2", platform: "linux/amd64"}
- {instruct: "NATIVE", platform: "linux/amd64"}
# for arm64
- {instruct: "NATIVE", platform: "linux/arm64"}
steps:
- name: Move Docker data directory
run: |
sudo systemctl stop docker
sudo mkdir -p /mnt/docker
sudo rsync -avz /var/lib/docker/ /mnt/docker
sudo rm -rf /var/lib/docker
sudo ln -s /mnt/docker /var/lib/docker
sudo systemctl start docker
-
name: Set up QEMU
uses: docker/setup-qemu-action@v3
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
-
name: Build and push for amd64
if: matrix.platform == 'linux/amd64'
uses: docker/build-push-action@v6
with:
push: true
platforms: |
linux/amd64
tags: |
${{ env.DOCKERHUB_REPO }}:latest-${{ matrix.instruct }}
${{ env.DOCKERHUB_REPO }}:${{ github.event.release.tag_name }}-${{ matrix.instruct }}
build-args: |
CPU_INSTRUCT=${{ matrix.instruct }}
-
name: Build and push for arm64
if: matrix.platform == 'linux/arm64'
uses: docker/build-push-action@v6
with:
push: true
platforms: |
linux/arm64
tags: |
${{ env.DOCKERHUB_REPO }}:latest-${{ matrix.instruct }}
${{ env.DOCKERHUB_REPO }}:${{ github.event.release.tag_name }}-${{ matrix.instruct }}
build-args: |
CPU_INSTRUCT=${{ matrix.instruct }}
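Once the workflow has run, each CPU_INSTRUCT variant ends up as its own tag under the account behind the `DOCKERHUB_USERNAME` secret; a sketch of pulling one (the account placeholder and release tag are assumptions, not values from this workflow):

```sh
# Pull the "latest" build of one CPU_INSTRUCT variant.
docker pull <DOCKERHUB_USERNAME>/ktransformers:latest-AVX2

# Release builds additionally get a "<release-tag>-<variant>" tag.
docker pull <DOCKERHUB_USERNAME>/ktransformers:<release-tag>-AVX512
```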

52
.github/workflows/install.yml vendored Normal file

@ -0,0 +1,52 @@
name: Install KTransformers
run-name: Install KTransformers
on:
workflow_dispatch:
inputs:
job_to_run:
description: "Which job to run?"
required: true
default: "install"
type: choice
options:
- create&install
- install
jobs:
Install-KTransformers:
runs-on: self-hosted
steps:
- run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event."
- run: echo "🔎 The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}."
- name: Check out repository code
uses: actions/checkout@v4
- run: echo "💡 The ${{ github.repository }} repository has been cloned to the runner."
- name: Remove old conda environment
continue-on-error: true
if: ${{ inputs.job_to_run == 'create&install'}}
run: |
source /home/qujing3/anaconda3/etc/profile.d/conda.sh
conda env remove --name ktransformers-dev -y
- name: Create conda environment
if: ${{ inputs.job_to_run == 'create&install'}}
run: |
source /home/qujing3/anaconda3/etc/profile.d/conda.sh
conda create --name ktransformers-dev python=3.11
conda activate ktransformers-dev
conda install -c conda-forge libstdcxx-ng -y
- name: Install dependencies
if: ${{ inputs.job_to_run == 'create&install'}}
run: |
source /home/qujing3/anaconda3/etc/profile.d/conda.sh
conda activate ktransformers-dev
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
pip3 install packaging ninja cpufeature numpy
pip install ~/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiTRUE-cp311-cp311-linux_x86_64.whl
- name: Install KTransformers
run: |
source /home/qujing3/anaconda3/etc/profile.d/conda.sh
conda activate ktransformers-dev
cd ${{ github.workspace }}
git submodule init
git submodule update
USE_NUMA=1 bash install.sh
- run: echo "This job's status is ${{ job.status }}."

8
.gitignore vendored

@ -19,6 +19,10 @@ ktransformers/server/local_store/
ktransformers/server_test1.db
*.patch
img/
tmp1.txt
test_65_300_1536.txt
tmp*.txt
test.txt
book
ktransformers/tests/chat_txt.txt
mmlu_result*
ktransformers/ktransformers_ext/cuda_musa/
test_prompt.txt

Dockerfile

@ -10,7 +10,8 @@ EOF
FROM pytorch/pytorch:2.3.1-cuda12.1-cudnn8-devel as compile_server
FROM pytorch/pytorch:2.5.1-cuda12.1-cudnn9-devel as compile_server
ARG CPU_INSTRUCT=NATIVE
WORKDIR /workspace
ENV CUDA_HOME /usr/local/cuda
COPY --from=web_compile /home/ktransformers /workspace/ktransformers
@ -26,10 +27,12 @@ rm -rf /var/lib/apt/lists/* &&
cd ktransformers &&
git submodule init &&
git submodule update &&
pip install --upgrade pip &&
pip install ninja pyproject numpy cpufeature &&
pip install flash-attn &&
CPU_INSTRUCT=NATIVE KTRANSFORMERS_FORCE_BUILD=TRUE TORCH_CUDA_ARCH_LIST="8.0;8.6;8.7;8.9;9.0+PTX" pip install . --no-build-isolation --verbose &&
pip cache purge
CPU_INSTRUCT=${CPU_INSTRUCT} KTRANSFORMERS_FORCE_BUILD=TRUE TORCH_CUDA_ARCH_LIST="8.0;8.6;8.7;8.9;9.0+PTX" pip install . --no-build-isolation --verbose &&
pip cache purge &&
cp /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /opt/conda/lib/
EOF
ENTRYPOINT [ "/opt/conda/bin/ktransformers" ]
ENTRYPOINT ["tail", "-f", "/dev/null"]

Makefile

@ -19,3 +19,14 @@ dev_install:
echo "Installing ktransformers"
KTRANSFORMERS_FORCE_BUILD=TRUE pip install -e . -v --no-build-isolation
echo "Installation completed successfully"
clean:
rm -rf build
rm -rf *.egg-info
rm -rf ktransformers/ktransformers_ext/build
rm -rf ktransformers/ktransformers_ext/cuda/build
rm -rf ktransformers/ktransformers_ext/cuda/dist
rm -rf ktransformers/ktransformers_ext/cuda/*.egg-info
install_numa:
USE_NUMA=1 make dev_install
install_no_numa:
env -u USE_NUMA make dev_install
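A typical developer loop with the new targets might look like the following sketch:

```sh
make clean             # wipe all build artifacts before switching configurations
make install_numa      # dev install with USE_NUMA=1 (dual-socket machines)
make install_no_numa   # dev install with USE_NUMA unset
```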

238
README.md

@ -23,14 +23,16 @@ Our vision for KTransformers is to serve as a flexible platform for experimentin
<h2 id="Updates">🔥 Updates</h2>
* **Feb 10, 2025**: Support Deepseek-R1 and V3 on single (24GB VRAM)/multi gpu and 382G DRAM, up to 3~28x speedup. The detailed tutorial is [here](./doc/en/DeepseekR1_V3_tutorial.md).
* **Aug 28, 2024**: Support 1M context under the InternLM2.5-7B-Chat-1M model, utilizing 24GB of VRAM and 150GB of DRAM. The detailed tutorial is [here](./doc/en/long_context_tutorial.md).
* **Mar 5, 2025**: Support unsloth 1.58/2.51 bits weights and [IQ1_S/FP8 hybrid](./doc/en/fp8_kernel.md) weights. Support 139K [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context) for DeepSeek-V3 and R1 in 24GB VRAM.
* **Feb 25, 2025**: Support [FP8 GPU kernel](./doc/en/fp8_kernel.md) for DeepSeek-V3 and R1; [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context).
* **Feb 15, 2025**: Longer Context (from 4K to 8K for 24GB VRAM) & Slightly Faster Speed (+15%, up to 16 Tokens/s), update [docs](./doc/en/DeepseekR1_V3_tutorial.md) and [online books](https://kvcache-ai.github.io/ktransformers/).
* **Feb 10, 2025**: Support Deepseek-R1 and V3 on single (24GB VRAM)/multi gpu and 382G DRAM, up to 3~28x speedup. For detailed show case and reproduction tutorial, see [here](./doc/en/DeepseekR1_V3_tutorial.md).
* **Aug 28, 2024**: Decrease DeepseekV2's required VRAM from 21G to 11G.
* **Aug 15, 2024**: Update detailed [TUTORIAL](doc/en/injection_tutorial.md) for injection and multi-GPU.
* **Aug 15, 2024**: Update detailed [tutorial](doc/en/injection_tutorial.md) for injection and multi-GPU.
* **Aug 14, 2024**: Support llamafile as linear backend.
* **Aug 12, 2024**: Support multiple GPU; Support new model: mixtral 8\*7B and 8\*22B; Support q2k, q3k, q5k dequant on gpu.
* **Aug 9, 2024**: Support windows native.
<!-- * **Aug 28, 2024**: Support 1M context under the InternLM2.5-7B-Chat-1M model, utilizing 24GB of VRAM and 150GB of DRAM. The detailed tutorial is [here](./doc/en/long_context_tutorial.md). -->
<h2 id="show-cases">🌟 Show Cases</h2>
<div>
@ -43,10 +45,10 @@ https://github.com/user-attachments/assets/ebd70bfa-b2c1-4abb-ae3b-296ed38aa285
- **[NEW!!!] Local 671B DeepSeek-Coder-V3/R1:** Running its Q4_K_M version using only 14GB VRAM and 382GB DRAM([Tutorial](./doc/en/DeepseekR1_V3_tutorial.md)).
- Prefill Speed (tokens/s):
- KTransfermor: 54.21 (32 cores) → 74.362 (dual-socket, 2×32 cores) → 255.26 (optimized AMX-based MoE kernel, V0.3 only) → 286.55 (selectively using 6 experts, V0.3 only)
- KTransformers: 54.21 (32 cores) → 74.362 (dual-socket, 2×32 cores) → 255.26 (optimized AMX-based MoE kernel, V0.3 only) → 286.55 (selectively using 6 experts, V0.3 only)
- Compared to 10.31 tokens/s in llama.cpp with 2×32 cores, achieving up to **27.79× speedup**.
- Decode Speed (tokens/s):
- KTransfermor: 8.73 (32 cores) → 11.26 (dual-socket, 2×32 cores) → 13.69 (selectively using 6 experts, V0.3 only)
- KTransformers: 8.73 (32 cores) → 11.26 (dual-socket, 2×32 cores) → 13.69 (selectively using 6 experts, V0.3 only)
- Compared to 4.51 tokens/s in llama.cpp with 2×32 cores, achieving up to **3.03× speedup**.
- Upcoming Open Source Release:
- AMX optimizations and selective expert activation will be open-sourced in V0.3.
@ -69,7 +71,7 @@ https://github.com/user-attachments/assets/4c6a8a38-05aa-497d-8eb1-3a5b3918429c
</p>
<h3>1M Context Local Inference on a Desktop with Only 24GB VRAM</h3>
<!-- <h3>1M Context Local Inference on a Desktop with Only 24GB VRAM</h3>
<p align="center">
https://github.com/user-attachments/assets/a865e5e4-bca3-401e-94b8-af3c080e6c12
@ -91,228 +93,20 @@ https://github.com/user-attachments/assets/a865e5e4-bca3-401e-94b8-af3c080e6c12
* **Enhanced Speed**: Reaches 16.91 tokens/s for generation with a 1M context using sparse attention, powered by llamafile kernels. This method is over 10 times faster than full attention approach of llama.cpp.
* **Flexible Sparse Attention Framework**: Offers a flexible block sparse attention framework for CPU offloaded decoding. Compatible with SnapKV, Quest, and InfLLm. Further information is available [here](./doc/en/long_context_introduction.md).
-->
<strong>More advanced features are coming soon, so stay tuned!</strong>
<h2 id="quick-start">🚀 Quick Start</h2>
<h3>Preparation</h3>
Some preparation:
- CUDA 12.1 and above, if you didn't have it yet, you may install from [here](https://developer.nvidia.com/cuda-downloads).
Getting started with KTransformers is simple! Follow the steps below to set up and start using it.
```sh
# Adding CUDA to PATH
export PATH=/usr/local/cuda/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export CUDA_PATH=/usr/local/cuda
```
### 📥 Installation
- Linux-x86_64 with gcc, g++ and cmake
To install KTransformers, follow the official [Installation Guide](https://kvcache-ai.github.io/ktransformers/en/install.html).
```sh
sudo apt-get update
sudo apt-get install gcc g++ cmake ninja-build
```
- We recommend using [Conda](https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh) to create a virtual environment with Python=3.11 to run our program.
```sh
conda create --name ktransformers python=3.11
conda activate ktransformers # you may need to run conda init and reopen shell first
```
- Make sure that PyTorch, packaging, ninja is installed
```
pip install torch packaging ninja cpufeature numpy
```
<h3>Installation</h3>
1. Use a Docker image, see [documentation for Docker](./doc/en/Docker.md)
2. You can install using Pypi (for linux):
```
pip install ktransformers --no-build-isolation
```
For Windows we provide a pre-compiled whl package: [ktransformers-0.2.0+cu125torch24avx2-cp312-cp312-win_amd64.whl](https://github.com/kvcache-ai/ktransformers/releases/download/v0.2.0/ktransformers-0.2.0+cu125torch24avx2-cp312-cp312-win_amd64.whl), which requires cuda-12.5, torch-2.4 and python-3.11; more pre-compiled packages are being produced.
3. Or you can download source code and compile:
- init source code
```sh
git clone https://github.com/kvcache-ai/ktransformers.git
cd ktransformers
git submodule init
git submodule update
```
- [Optional] If you want to run with the website, please [compile the website](./doc/en/api/server/website.md) before executing ```bash install.sh```
- Compile and install (for Linux)
```
bash install.sh
```
- Compile and install(for Windows)
```
install.bat
```
4. If you are a developer, you can make use of the Makefile to compile and format the code. <br> The detailed usage of the Makefile is [here](./doc/en/makefile_usage.md)
<h3>Local Chat</h3>
We provide a simple command-line local chat Python script that you can run for testing.
> Note that this is a very simple test tool that only supports one-round chat without any memory of the previous input; if you want to try the full ability of the model, you may go to [RESTful API and Web UI](#id_666). We use the DeepSeek-V2-Lite-Chat-GGUF model as an example here, but we also support other models; you can replace it with any other model that you want to test.
<h4>Run Example</h4>
```shell
# Begin from root of your cloned repo!
# Begin from root of your cloned repo!!
# Begin from root of your cloned repo!!!
# Download mzwing/DeepSeek-V2-Lite-Chat-GGUF from huggingface
mkdir DeepSeek-V2-Lite-Chat-GGUF
cd DeepSeek-V2-Lite-Chat-GGUF
wget https://huggingface.co/mzwing/DeepSeek-V2-Lite-Chat-GGUF/resolve/main/DeepSeek-V2-Lite-Chat.Q4_K_M.gguf -O DeepSeek-V2-Lite-Chat.Q4_K_M.gguf
cd .. # Move to repo's root dir
# Start local chat
python -m ktransformers.local_chat --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
# If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try
# GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite
# python ktransformers.local_chat --model_path ./DeepSeek-V2-Lite --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
```
It features the following arguments:
- `--model_path` (required): Name of the model (such as "deepseek-ai/DeepSeek-V2-Lite-Chat", which will automatically download configs from [Hugging Face](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite)). If you already have local files, you may directly use that path to initialize the model.
> Note: <strong>.safetensors</strong> files are not required in the directory. We only need config files to build the model and tokenizer.
- `--gguf_path` (required): Path of a directory containing GGUF files, which can be downloaded from [Hugging Face](https://huggingface.co/mzwing/DeepSeek-V2-Lite-Chat-GGUF/tree/main). Note that the directory should only contain the GGUF files of the current model, which means you need one separate directory for each model.
- `--optimize_rule_path` (required except for Qwen2Moe and DeepSeek-V2): Path of YAML file containing optimize rules. There are two rule files pre-written in the [ktransformers/optimize/optimize_rules](ktransformers/optimize/optimize_rules) directory for optimizing DeepSeek-V2 and Qwen2-57B-A14, two SOTA MoE models.
- `--max_new_tokens`: Int (default=1000). Maximum number of new tokens to generate.
- `--cpu_infer`: Int (default=10). The number of CPUs used for inference. Should ideally be set to the (total number of cores - 2).
<h3 id="suggested-model"> Suggested Model</h3>
| Model Name | Model Size | VRAM | Minimum DRAM | Recommended DRAM |
| ------------------------------ | ---------- | ----- | --------------- | ----------------- |
| DeepSeek-R1-q4_k_m | 377G | 14G | 382G | 512G |
| DeepSeek-V3-q4_k_m | 377G | 14G | 382G | 512G |
| DeepSeek-V2-q4_k_m | 133G | 11G | 136G | 192G |
| DeepSeek-V2.5-q4_k_m | 133G | 11G | 136G | 192G |
| DeepSeek-V2.5-IQ4_XS | 117G | 10G | 107G | 128G |
| Qwen2-57B-A14B-Instruct-q4_k_m | 33G | 8G | 34G | 64G |
| DeepSeek-V2-Lite-q4_k_m | 9.7G | 3G | 13G | 16G |
| Mixtral-8x7B-q4_k_m | 25G | 1.6G | 51G | 64G |
| Mixtral-8x22B-q4_k_m | 80G | 4G | 86.1G | 96G |
| InternLM2.5-7B-Chat-1M | 15.5G | 15.5G | 8G(32K context) | 150G (1M context) |
More will come soon. Please let us know which models you are most interested in.
Be aware that you are subject to the corresponding model licenses when using [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V2/blob/main/LICENSE) and [QWen](https://huggingface.co/Qwen/Qwen2-72B-Instruct/blob/main/LICENSE).
<details>
<summary>Click To Show how to run other examples</summary>
* Qwen2-57B
```sh
pip install flash_attn # For Qwen2
mkdir Qwen2-57B-GGUF && cd Qwen2-57B-GGUF
wget https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct-GGUF/resolve/main/qwen2-57b-a14b-instruct-q4_k_m.gguf?download=true -O qwen2-57b-a14b-instruct-q4_k_m.gguf
cd ..
python -m ktransformers.local_chat --model_name Qwen/Qwen2-57B-A14B-Instruct --gguf_path ./Qwen2-57B-GGUF
# If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try
# GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct
# python ktransformers/local_chat.py --model_path ./Qwen2-57B-A14B-Instruct --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
```
* DeepseekV2
```sh
mkdir DeepSeek-V2-Chat-0628-GGUF && cd DeepSeek-V2-Chat-0628-GGUF
# Download weights
wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00001-of-00004.gguf -o DeepSeek-V2-Chat-0628-Q4_K_M-00001-of-00004.gguf
wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00002-of-00004.gguf -o DeepSeek-V2-Chat-0628-Q4_K_M-00002-of-00004.gguf
wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00003-of-00004.gguf -o DeepSeek-V2-Chat-0628-Q4_K_M-00003-of-00004.gguf
wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00004-of-00004.gguf -o DeepSeek-V2-Chat-0628-Q4_K_M-00004-of-00004.gguf
cd ..
python -m ktransformers.local_chat --model_name deepseek-ai/DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF
# If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try
# GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-V2-Chat-0628
# python -m ktransformers.local_chat --model_path ./DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF
```
| model name | weights download link |
|----------|----------|
| Qwen2-57B | [Qwen2-57B-A14B-gguf-Q4K-M](https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct-GGUF/tree/main) |
| DeepseekV2-coder |[DeepSeek-Coder-V2-Instruct-gguf-Q4K-M](https://huggingface.co/LoneStriker/DeepSeek-Coder-V2-Instruct-GGUF/tree/main) |
| DeepseekV2-chat |[DeepSeek-V2-Chat-gguf-Q4K-M](https://huggingface.co/bullerwins/DeepSeek-V2-Chat-0628-GGUF/tree/main) |
| DeepseekV2-lite | [DeepSeek-V2-Lite-Chat-GGUF-Q4K-M](https://huggingface.co/mzwing/DeepSeek-V2-Lite-Chat-GGUF/tree/main) |
</details>
<!-- pin block for jump -->
<span id='id_666'>
<h3>RESTful API and Web UI</h3>
Start without website:
```sh
ktransformers --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path /path/to/DeepSeek-V2-Lite-Chat-GGUF --port 10002
```
Start with website:
```sh
ktransformers --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path /path/to/DeepSeek-V2-Lite-Chat-GGUF --port 10002 --web True
```
Or, if you want to start the server with transformers, the model_path should include safetensors:
```bash
ktransformers --type transformers --model_path /mnt/data/model/Qwen2-0.5B-Instruct --port 10002 --web True
```
Access the website at [http://localhost:10002/web/index.html#/chat](http://localhost:10002/web/index.html#/chat):
<p align="center">
<picture>
<img alt="Web UI" src="https://github.com/user-attachments/assets/615dca9b-a08c-4183-bbd3-ad1362680faf" width=90%>
</picture>
</p>
More information about the RESTful API server can be found [here](doc/en/api/server/server.md). You can also find an example of integrating with Tabby [here](doc/en/api/server/tabby.md).
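As a quick smoke test of the server started above, you can hit it with curl; the `/v1/chat/completions` route and the `model` value below are assumptions based on the OpenAI-compatible API claim, not values taken from the server docs:

```sh
curl http://localhost:10002/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "DeepSeek-V2-Lite-Chat",
        "messages": [{"role": "user", "content": "Hello!"}]
      }'
```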
<h2 id="tutorial">📃 Brief Injection Tutorial</h2>
At the heart of KTransformers is a user-friendly, template-based injection framework.
@ -333,7 +127,7 @@ To utilize the provided kernels, users only need to create a YAML-based injectio
```python
with torch.device("meta"):
model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
optimize_and_load_gguf(model, optimize_rule_path, gguf_path, config)
optimize_and_load_gguf(model, optimize_config_path, gguf_path, config)
...
generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens=1000)
```
@ -368,14 +162,14 @@ If you are interested in our design principles and the implementation of the inj
<h2 id="ack">Acknowledgment and Contributors</h2>
The development of KTransformer is based on the flexible and versatile framework provided by Transformers. We also benefit from advanced kernels such as GGUF/GGML, Llamafile, and Marlin. We are planning to contribute back to the community by upstreaming our modifications.
The development of KTransformer is based on the flexible and versatile framework provided by Transformers. We also benefit from advanced kernels such as GGUF/GGML, Llamafile, Marlin, sglang and flashinfer. We are planning to contribute back to the community by upstreaming our modifications.
KTransformer is actively maintained and developed by contributors from the <a href="https://madsys.cs.tsinghua.edu.cn/">MADSys group</a> at Tsinghua University and members from <a href="http://approaching.ai/">Approaching.AI</a>. We welcome new contributors to join us in making KTransformer faster and easier to use.
<h2 id="ack">Discussion</h2>
If you have any questions, feel free to open an issue. Alternatively, you can join our WeChat group for further discussion. QR Code: [WeChat Group](WeChatGrouop.jpg)
If you have any questions, feel free to open an issue. Alternatively, you can join our WeChat group for further discussion. QR Code: [WeChat Group](WeChatGroup.png)
<h2 id="FAQ">🙋 FAQ</h2>

166
README_ZH.md Normal file

@ -0,0 +1,166 @@
<div align="center">
<!-- <h1>KTransformers</h1> -->
<p align="center">
<picture>
<img alt="KTransformers" src="https://github.com/user-attachments/assets/d5a2492f-a415-4456-af99-4ab102f13f8b" width=50%>
</picture>
</p>
<h3>A flexible framework for experiencing cutting-edge LLM inference optimizations</h3>
<strong><a href="#show-cases">🌟 Show Cases</a> | <a href="#quick-start">🚀 Quick Start</a> | <a href="#tutorial">📃 Tutorial</a> | <a href="https://github.com/kvcache-ai/ktransformers/discussions">💬 Discussion</a> | <a href="#FAQ">🙋 FAQ</a> </strong>
</div>
<h2 id="intro">🎉 介绍</h2>
KTransformers发音为 Quick Transformers旨在通过先进的内核优化和放置/并行策略来增强您对 🤗 [Transformers](https://github.com/huggingface/transformers) 的体验。
<br/><br/>
KTransformers 是一个以 Python 为中心的灵活框架,其核心是可扩展性。通过用一行代码实现并注入优化模块,用户可以获得与 Transformers 兼容的接口、符合 OpenAI 和 Ollama 的 RESTful API甚至是一个简化的类似 ChatGPT 的 Web 界面。
<br/><br/>
我们对 KTransformers 的愿景是成为一个用于实验创新 LLM 推理优化的灵活平台。如果您需要任何其他功能,请告诉我们。
<h2 id="Updates">🔥 更新</h2>
* **2025 年 2 月 15 日**为DeepSeek-V3/R1支持[FP8 GPU内核](./doc/en/fp8_kernel.md); 支持更长的上下文([教程](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context)).
* **2025 年 2 月 15 日**:长上下文(从4K到8K24GB VRAM) & 稍快的速度(+15%)(最快 16 Tokens/s),文档请参见 [这里](./doc/en/DeepseekR1_V3_tutorial.md) 和 [在线指南](https://kvcache-ai.github.io/ktransformers/) 。
* **2025 年 2 月 10 日**:支持 Deepseek-R1 和 V3 在单个24GB VRAM/多 GPU 和 382G DRAM 上运行,速度提升高达 3~28 倍。详细教程请参见 [这里](./doc/en/DeepseekR1_V3_tutorial.md)。
* **2024 年 8 月 28 日**:支持 InternLM2.5-7B-Chat-1M 模型下的 1M 上下文,使用 24GB 的 VRAM 和 150GB 的 DRAM。详细教程请参见 [这里](./doc/en/long_context_tutorial.md)。
* **2024 年 8 月 28 日**:将 DeepseekV2 所需的 VRAM 从 21G 降低到 11G。
* **2024 年 8 月 15 日**:更新了详细的 [教程](doc/en/injection_tutorial.md),介绍注入和多 GPU 的使用。
* **2024 年 8 月 14 日**:支持 llamfile 作为线性后端。
* **2024 年 8 月 12 日**:支持多 GPU支持新模型mixtral 8\*7B 和 8\*22B支持 q2k、q3k、q5k 在 GPU 上的去量化。
* **2024 年 8 月 9 日**:支持 Windows。
<h2 id="show-cases">🌟 案例展示</h2>
<div>
<h3>在仅 24GB VRAM 的桌面上运行 GPT-4/o1 级别的本地 VSCode Copilot</h3>
</div>
https://github.com/user-attachments/assets/ebd70bfa-b2c1-4abb-ae3b-296ed38aa285
</p>
- **[NEW!!!] Local 671B DeepSeek-Coder-V3/R1**: Runs its Q4_K_M version with only 14GB VRAM and 382GB DRAM (see the tutorial [here](./doc/en/DeepseekR1_V3_tutorial.md)).
- Prefill Speed (tokens/s):
- KTransformers: 54.21 (32 cores) → 74.362 (dual-socket, 2×32 cores) → 255.26 (optimized AMX-based MoE kernel, V0.3 only) → 286.55 (selectively using 6 experts, V0.3 only)
- Compared to llama.cpp with 2×32 cores, up to a **27.79× speedup**
- Decode Speed (tokens/s):
- KTransformers: 8.73 (32 cores) → 11.26 (dual-socket, 2×32 cores) → 13.69 (selectively using 6 experts, V0.3 only)
- Compared to llama.cpp with 2×32 cores, up to a **3.03× speedup**
- Upcoming open-source release:
- AMX optimizations and selective expert activation will be open-sourced in V0.3.
- Currently available only in the preview binary distribution, which can be downloaded from [here](./doc/en/DeepseekR1_V3_tutorial.md).
- **Local 236B DeepSeek-Coder-V2**: Runs its Q4_K_M version with only 21GB VRAM and 136GB DRAM, and even scores higher than GPT4-0613 on [BigCodeBench](https://huggingface.co/blog/leaderboard-bigcodebench).
<p align="center">
<picture>
<img alt="DeepSeek-Coder-V2 Score" src="https://github.com/user-attachments/assets/d052924e-8631-44de-aad2-97c54b965693" width=100%>
</picture>
</p>
- **Faster Speed**: Achieves 126 tokens/s for 2K prompt prefill and 13.6 tokens/s for generation through MoE offloading and injecting advanced kernels from [Llamafile](https://github.com/Mozilla-Ocho/llamafile/tree/main) and [Marlin](https://github.com/IST-DASLab/marlin).
- **VSCode Integration**: Wrapped into an OpenAI- and Ollama-compliant API that integrates seamlessly as a backend for [Tabby](https://github.com/TabbyML/tabby) and other frontends.
<p align="center">
https://github.com/user-attachments/assets/4c6a8a38-05aa-497d-8eb1-3a5b3918429c
</p>
<!-- <h3>1M Context Local Inference on a Desktop with Only 24GB VRAM</h3>
<p align="center"> -->
<!-- https://github.com/user-attachments/assets/a865e5e4-bca3-401e-94b8-af3c080e6c12 -->
<!--
* **1M Context InternLM 2.5 7B**: Operates at full bf16 precision, utilizing 24GB VRAM and 150GB DRAM, achievable on a local desktop setup. It attains a 92.88% success rate on the 1M "Needle In a Haystack" test and 100% on the 128K NIAH test.
<p align="center">
<picture>
<img alt="Single Needle Retrieval 128K" src="./doc/assets/needle_128K.png" width=100%>
</picture>
</p>
<p align="center">
<picture>
<img alt="Single Needle Retrieval 1000K" src="./doc/assets/needle_1M.png" width=100%>
</picture>
</p>
* **Enhanced Speed**: Reaches 16.91 tokens/s for 1M-context generation using sparse attention, powered by llamafile kernels. This method is more than 10× faster than llama.cpp's full-attention approach.
* **Flexible Sparse Attention Framework**: Offers a flexible block sparse attention framework for CPU-offloaded decoding. Compatible with SnapKV, Quest, and InfLLm. Further information is available [here](./doc/en/long_context_introduction.md). -->
<strong>More advanced features are coming soon, so stay tuned!</strong>
<h2 id="quick-start">🚀 Quick Start</h2>
Getting started with KTransformers is simple! Please refer to our [installation guide](https://kvcache-ai.github.io/ktransformers/) to set it up.
<h2 id="tutorial">📃 Brief Injection Tutorial</h2>
At the heart of KTransformers is a user-friendly, template-based injection framework. This allows researchers to easily replace original torch modules with optimized variants. It also simplifies the process of combining multiple optimizations, allowing the exploration of their synergistic effects.
</br>
<p align="center">
<picture>
<img alt="Inject-Struction" src="https://github.com/user-attachments/assets/6b4c1e54-9f6d-45c5-a3fc-8fa45e7d257e" width=65%>
</picture>
</p>
Given that vLLM is already an excellent framework for large-scale deployment optimization, KTransformers focuses particularly on resource-constrained local deployments. We pay special attention to heterogeneous computing opportunities, such as GPU/CPU offloading of quantized models. For example, we support the efficient <a href="https://github.com/Mozilla-Ocho/llamafile/tree/main">Llamafile</a> and <a href="https://github.com/IST-DASLab/marlin">Marlin</a> kernels for CPU and GPU, respectively. More details can be found <a href="doc/en/operators/llamafile.md">here</a>.
<h3>Example Usage</h3>
To utilize the provided kernels, users only need to create a YAML-based injection template and add a call to `optimize_and_load_gguf` before using the Transformers model.
```python
with torch.device("meta"):
model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
optimize_and_load_gguf(model, optimize_config_path, gguf_path, config)
...
generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens=1000)
```
In this example, the AutoModel is first initialized on the meta device to avoid occupying any memory resources. Then `optimize_and_load_gguf` iterates through all sub-modules of the model, matches the rules specified in your YAML rule file, and replaces them with the designated advanced modules.
After injection, the original `generate` interface is still available, but we also provide a compatible `prefill_and_generate` method, which enables further optimizations such as using CUDAGraph to improve generation speed.
<h3>How to Customize Your Model</h3>
A detailed tutorial of injection and multi-GPU usage, using DeepSeek-V2 as an example, is available [here](doc/en/injection_tutorial.md).
Below is an example YAML template for replacing all original Linear modules with Marlin, an advanced 4-bit quantization kernel.
```yaml
- match:
name: "^model\\.layers\\..*$" # 正则表达式
class: torch.nn.Linear # 仅匹配同时符合名称和类的模块
replace:
class: ktransformers.operators.linear.KTransformerLinear # 量化数据类型的优化内核
device: "cpu" # 初始化时加载该模块的 device
kwargs:
generate_device: "cuda"
generate_linear_type: "QuantizedLinearMarlin"
```
Each rule in the YAML file has two parts: `match` and `replace`. The `match` part specifies which modules should be replaced, and the `replace` part specifies the module to be injected into the model along with the initialization keywords.
You can find example rule templates for optimizing DeepSeek-V2 and Qwen2-57B-A14 in the [ktransformers/optimize/optimize_rules](ktransformers/optimize/optimize_rules) directory. These templates are used to power the `local_chat.py` example.
If you are interested in our design principles and the implementation of the injection framework, please refer to the [design document](doc/en/deepseek-v2-injection.md).
<h2 id="ack">Acknowledgment and Contributors</h2>
The development of KTransformer is based on the flexible and versatile framework provided by Transformers. We also benefit from advanced kernels such as GGUF/GGML, Llamafile, Marlin, sglang and flashinfer. We are planning to contribute back to the community by upstreaming our modifications.
KTransformer is actively maintained and developed by contributors from the <a href="https://madsys.cs.tsinghua.edu.cn/">MADSys group</a> at Tsinghua University and members from <a href="http://approaching.ai/">Approaching.AI</a>. We welcome new contributors to join us in making KTransformer faster and easier to use.
<h2 id="ack">Discussion</h2>
If you have any questions, feel free to open an issue. Alternatively, you can join our WeChat group for further discussion. QR code: [WeChat Group](WeChatGroup.png)
<h2 id="FAQ">🙋 FAQ</h2>
Answers to some frequently asked questions can be found in the [FAQ](doc/en/FAQ.md).

Binary file not shown (before: 168 KiB)

BIN
WeChatGroup.png Normal file
Binary file not shown (after: 258 KiB)

18
book.toml Normal file

@ -0,0 +1,18 @@
[book]
authors = ["kvcache-ai"]
language = "zh-CN"
title = "Ktransformers"
src = "doc"
[output.html]
git-repository-url = "https://github.com/kvcache-ai/ktransformers"
edit-url-template = "https://github.com/kvcache-ai/ktransformers/edit/main/{path}"
[output.html.playground]
editable = true
copy-js = true
# line-numbers = true
[output.html.fold]
enable = true
level = 0
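With this `book.toml` in place, the documentation book can be built and previewed locally; a minimal sketch (assuming mdBook is installed, as in the CI workflows above) is:

```sh
# Build the book: sources are read from ./doc (src = "doc") and HTML is
# written to ./book, which is what the Deploy workflow publishes.
mdbook build

# Live-reload preview at http://localhost:3000 while editing the docs.
mdbook serve --open
```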

33
doc/README.md Normal file

@ -0,0 +1,33 @@
<div align="center">
<!-- <h1>KTransformers</h1> -->
<p align="center">
<picture>
<img alt="KTransformers" src="https://github.com/user-attachments/assets/d5a2492f-a415-4456-af99-4ab102f13f8b" width=50%>
</picture>
</p>
</div>
<h2 id="intro">🎉 Introduction</h2>
KTransformers, pronounced as Quick Transformers, is designed to enhance your 🤗 <a href="https://github.com/huggingface/transformers">Transformers</a> experience with advanced kernel optimizations and placement/parallelism strategies.
<br/><br/>
KTransformers is a flexible, Python-centric framework designed with extensibility at its core.
By implementing and injecting an optimized module with a single line of code, users gain access to a Transformers-compatible
interface, RESTful APIs compliant with OpenAI and Ollama, and even a simplified ChatGPT-like web UI.
<br/><br/>
Our vision for KTransformers is to serve as a flexible platform for experimenting with innovative LLM inference optimizations. Please let us know if you need any other features.
<h2 id="Updates">🔥 Updates</h2>
* **Mar 5, 2025**: Support unsloth 1.58/2.51 bits weights and [IQ1_S/FP8 hybrid](./doc/en/fp8_kernel.md) weights. Support 139K [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context) for DeepSeek-V3 and R1 in 24GB VRAM.
* **Feb 25, 2025**: Support [FP8 GPU kernel](./doc/en/fp8_kernel.md) for DeepSeek-V3 and R1; [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context).
* **Feb 10, 2025**: Support Deepseek-R1 and V3 on single (24GB VRAM)/multi gpu and 382G DRAM, up to 3~28x speedup. The detailed tutorial is [here](./en/DeepseekR1_V3_tutorial.md).
* **Aug 28, 2024**: Support 1M context under the InternLM2.5-7B-Chat-1M model, utilizing 24GB of VRAM and 150GB of DRAM. The detailed tutorial is [here](./en/long_context_tutorial.md).
* **Aug 28, 2024**: Decrease DeepseekV2's required VRAM from 21G to 11G.
* **Aug 15, 2024**: Update detailed [TUTORIAL](./en/injection_tutorial.md) for injection and multi-GPU.
* **Aug 14, 2024**: Support llamafile as linear backend.
* **Aug 12, 2024**: Support multiple GPU; Support new model: mixtral 8\*7B and 8\*22B; Support q2k, q3k, q5k dequant on gpu.
* **Aug 9, 2024**: Support windows native.

25
doc/SUMMARY.md Normal file

@ -0,0 +1,25 @@
# Ktransformer
[Introduction](./README.md)
# Install
- [Installation Guide](en/install.md)
# Tutorial
- [Deepseek-R1/V3 Show Case/Tutorial](en/DeepseekR1_V3_tutorial.md)
- [Why KTransformers So Fast](en/deepseek-v2-injection.md)
- [Injection Tutorial](en/injection_tutorial.md)
- [Multi-GPU Tutorial](en/multi-gpu-tutorial.md)
- [Use FP8 GPU Kernel](en/fp8_kernel.md)
# Server
- [Server](en/api/server/server.md)
- [Website](en/api/server/website.md)
- [Tabby](en/api/server/tabby.md)
# For Developer
- [Makefile Usage](en/makefile_usage.md)
# FAQ
- [FAQ](en/FAQ.md)
# V3 Reproduction
- [Success List](en/V3-success.md)
# Benchmark
- [Benchmark](en/benchmark.md)

Binary file not shown (before: 1.2 MiB)

Binary file not shown (after: 83 KiB)

1
doc/basic/note1.md Normal file

@ -0,0 +1 @@
# basic-first20

1
doc/basic/note2.md Normal file

@ -0,0 +1 @@
# basic-data_structure

doc/en/DeepseekR1_V3_tutorial.md

@ -1,30 +1,40 @@
<!-- omit in toc -->
# GPT-4/o1-level Local VSCode Copilot on a Desktop with only 24GB VRAM
- [SUMMARY](#summary)
- [Prerequisites](#prerequisites)
- [Show Case Environment](#show-case-environment)
- [Bench Result](#bench-result)
- [V0.2.1](#v021)
- [Memory consumption:](#memory-consumption)
- [Change Log](#change-log)
- [Benchmark Results](#benchmark-results)
- [V0.2](#v02)
- [Settings](#settings)
- [Memory consumption:](#memory-consumption)
- [Benchmark Results](#benchmark-results)
- [Memory consumption:](#memory-consumption-1)
- [Benchmark Results](#benchmark-results-1)
- [V0.3-Preview](#v03-preview)
- [Settings](#settings-1)
- [Memory consumptions:](#memory-consumptions)
- [Benchmark results](#benchmark-results-1)
- [Benchmark results](#benchmark-results-2)
- [How to Run](#how-to-run)
- [V0.2 Showcase](#v02-showcase)
- [v0.2.2 \& v0.2.3 longer context \& FP8 kernel](#v022--v023-longer-context--fp8-kernel)
- [longer context](#longer-context)
- [FP8 kernel](#fp8-kernel)
- [V0.2 \& V0.2.1 Showcase](#v02--v021-showcase)
- [Single socket version (32 cores)](#single-socket-version-32-cores)
- [Dual socket version (64 cores)](#dual-socket-version-64-cores)
- [V0.3 Showcase](#v03-showcase)
- [Dual socket version (64 cores)](#dual-socket-version-64-cores-1)
- [Some Explanations](#some-explanations)
- [Next](#next)
- [Faster](#faster)
- [Easier](#easier)
- [FAQ](#faq)
- [R1 No Thinking](#r1-no-thinking)
- [More FAQ](#more-faq)
# SUMMARY
> **Fed 10, 2025**: Support DeepseekR1 and V3 on single (24GB VRAM)/multi gpu and 382G DRAM, up to 3~28x speedup.<br>
> **Feb 10, 2025**: Support DeepseekR1 and V3 on single (24GB VRAM)/multi gpu and 382G DRAM, up to 3~28x speedup.<br>
Hi, we're the KTransformers team (formerly known for our local CPU/GPU hybrid inference open source project with DeepSeek-V2).
@ -39,23 +49,64 @@ https://github.com/user-attachments/assets/ebd70bfa-b2c1-4abb-ae3b-296ed38aa285
- **[NEW!!!] Local 671B DeepSeek-Coder-V3/R1:** Running its Q4_K_M version using only 14GB VRAM and 382GB DRAM.
- Prefill Speed (tokens/s):
- KTransfermor: 54.21 (32 cores) → 74.362 (dual-socket, 2×32 cores) → 255.26 (optimized AMX-based MoE kernel, V0.3 only) → 286.55 (selectively using 6 experts, V0.3 only)
- KTransformers: 54.21 (32 cores) → 74.362 (dual-socket, 2×32 cores) → 255.26 (optimized AMX-based MoE kernel, V0.3 only) → 286.55 (selectively using 6 experts, V0.3 only)
- Compared to 10.31 tokens/s in llama.cpp with 2×32 cores, achieving up to **27.79× speedup**.
- Decode Speed (tokens/s):
- KTransfermor: 8.73 (32 cores) → 11.26 (dual-socket, 2×32 cores) → 13.69 (selectively using 6 experts, V0.3 only)
- KTransformers: 8.73 (32 cores) → 11.26 (dual-socket, 2×32 cores) → 13.69 (selectively using 6 experts, V0.3 only)
- Compared to 4.51 tokens/s in llama.cpp with 2×32 cores, achieving up to **3.03× speedup**.
We also give our upcoming optimizations previews, including an Intel AMX-accelerated kernel and a selective expert activation method, which will significantly enhance performance. With V0.3-preview, we achieve up to 286 tokens/s for prefill, making it up to **28× faster than llama.cpp** for local inference.
The binary distribution is available now and the source code will come ASAP! Check out the wheel package [here](https://github.com/kvcache-ai/ktransformers/releases/download/v0.1.4/ktransformers-0.3.0rc0+cu126torch26fancy-cp311-cp311-linux_x86_64.whl)
> **Feb 15, 2025**: KTransformers V0.2.1: Longer Context (from 4K to 8K for 24GB VRAM) & Slightly Faster Speed (+15%, up to 16 Tokens/s), update docs [here](./doc/en/DeepseekR1_V3_tutorial.md) and [online books](https://kvcache-ai.github.io/ktransformers/).
## Prerequisites
We speed up the decode and prefill a little bit. The reason for the limited performance improvement mainly lies in the fact that the inference process is still constrained by the CPU's computational speed and memory bandwidth; the MLA part handled by the GPU accounts for a relatively small proportion.
Besides the improvements in speed, we've also significantly updated the documentation to enhance usability, including:<br>
- Added Multi-GPU configuration tutorial.
- Consolidated installation guide.
- Added a detailed tutorial on registering extra GPU memory with ExpertMarlin.
## Show Case Environment
We run our best performance tests (V0.2) on <br>
CPU: Intel (R) Xeon (R) Gold 6454S 1T DRAM (2 NUMA nodes) <br>
GPU: 4090D 24G VRAM <br>
Memory: standard DDR5-4800 server DRAM (1 TB)
Memory: standard DDR5-4800 server DRAM (1 TB), each socket with 8×DDR5-4800
## Bench Result
### V0.2.1
- Model: DeepseekV3-q4km (int4)<br>
- CPU: cpu_model_name: Intel (R) Xeon (R) Gold 6454S, 32 cores per socket, 2 sockets, 2 numa nodes
- GPU: 4090 24G VRAM
- We test after enough warm up
#### Memory consumption:
- Single socket: 382G DRAM, at least 14GB VRAM
- Dual socket: 1T DRAM, at least 14GB VRAM
#### Change Log
- Longer Context (from 4K to 8K for 24GB VRAM) and Slightly Faster Speed (+15%):<br>
Integrated the highly efficient Triton MLA Kernel from the fantastic sglang project, enabling a much longer context length and slightly faster prefill/decode speed
- We suspect that some of the improvements come from the change of hardware platform (4090D->4090)
#### Benchmark Results
"6 experts" case is part of V0.3's preview
| Prompt | hi (2) | 1K (969) | 2K (1930) | 4K (3846) | 8K (7678) |
| --- | --- | --- | --- | --- | --- |
| Output length | 10tokens | 300tokens | 300tokens | 300tokens | 300tokens |
| **6 experts V0.2.0** | | | | | |
| Prefill token/s | 13 | 105 | 102 | 88 | CUDA OOM |
| decode token/s | 16.8 | 15.4 | 14.2 | 13.0 | CUDA OOM |
| **6 experts V0.2.1** | | | | | |
| Prefill token/s | 13 | 111 | 112.5 | 102 **(1.16x speedup)** | 101 |
| decode token/s | 16.8 | 15.9 | 15.4 | 14.9 **(1.15x speedup)** | 13.9 |
| **8 experts V0.2.1** | | | | | |
| Prefill token/s | 12.2 | 88.2 | 88.5 | 81.9 | 80 |
| Decode token/s | 13.4 | 13.5 | 13.4 | 13.2 | 12.4 |
### V0.2
#### Settings
- Model: DeepseekV3-q4km (int4)<br>
@ -106,14 +157,41 @@ the output quality doesn't change. But the speed of decoding and prefill
is speed up which is inspiring. So our showcase makes use of this finding*
## How to Run
### V0.2 Showcase
### v0.2.2 & v0.2.3 longer context & FP8 kernel
#### longer context
To use this feature, [install flashinfer](https://github.com/flashinfer-ai/flashinfer) first.
Note: The latest MLA kernel in FlashInfer still has a few minor issues. They are continuously fixing them on the main branch. If you are using FlashInfer, please install it from the main source code.
If you want to use a long context (longer than 20K) for prefill, enable the matrix absorption MLA during the prefill phase, which will significantly reduce the size of the KV cache. Modify the yaml file like this:
```
- match:
name: "^model\\.layers\\..*\\.self_attn$"
replace:
class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
absorb_for_prefill: True # change this to True to enable long context(prefill may slower).
```
If the VRAM is still insufficient, try reducing the `chunk_prefill_size` parameter (default is 8192) to further decrease the intermediate results during chunk prefill.
#### FP8 kernel
The DeepSeek-AI team provides FP8 safetensors for DeepSeek-R1/V3 models. We achieve performance optimization through the following works:
- **FP8 GPU Kernel Integration**: FP8 linear layer acceleration kernels integrated in KTransformers
- **Hybrid Quantization Architecture**:
- Attention and Shared-Expert modules use FP8 precision (enhances computational accuracy)
- Experts modules retain GGML quantization (GGUF format, reside in CPU to save GPU memory)
So those who are pursuing the best performance can use the FP8 linear kernel for DeepSeek-V3/R1.
The detailed guide is [here](./fp8_kernel.md).
### V0.2 & V0.2.1 Showcase
#### Single socket version (32 cores)
Our local_chat test command is:
``` shell
git clone https://github.com/kvcache-ai/ktransformers.git
cd ktransformers
git submodule init
git submodule update
numactl -N 1 -m 1 python ./ktransformers/local_chat.py --model_path <your model path> --gguf_path <your gguf path> --prompt_file <your prompt txt file> --cpu_infer 33 --max_new_tokens 1000
<when you see chat, then press enter to load the text prompt_file>
```
@ -121,20 +199,24 @@ numactl -N 1 -m 1 python ./ktransformers/local_chat.py --model_path <your model
`<your gguf path>` can also be online, but as it is large we recommend you download it and quantize the model to what you want (note that it is the directory path) <br>
`--max_new_tokens 1000` is the max output token length. If you find the answer is truncated, you can increase the number for a longer answer (but be aware of OOM; increasing it will slow down the generation rate).
<br>
The command numactl -N 1 -m 1 aims to advoid data transfer between numa nodes<br>
The command `numactl -N 1 -m 1` aims to avoid data transfer between NUMA nodes<br>
Attention! If you are testing R1, it may skip thinking. You can add the arg `--force_think true`; this is explained in the [FAQ](#faq) section.
#### Dual socket version (64 cores)
Make suer before you install (use install.sh or `make dev_install`), setting the env var `USE_NUMA=1` by `export USE_NUMA=1` (if already installed, reinstall it with this env var set) <br>
Our local_chat test command is:
Make sure that before you install (using install.sh or `make dev_install`) you set the env var `USE_NUMA=1` via `export USE_NUMA=1` (if already installed, reinstall with this env var set). You may check the doc [here](./install.md) for install details. <br>
Test Command:
``` shell
git clone https://github.com/kvcache-ai/ktransformers.git
cd ktransformers
git submodule init
git submodule update
export USE_NUMA=1
make dev_install # or sh ./install.sh
# ---For those who have not installed ktransformers---
# git clone https://github.com/kvcache-ai/ktransformers.git
# cd ktransformers
# git submodule init
# git submodule update
# export USE_NUMA=1
# make dev_install # or sh ./install.sh
# ----------------------------------------------------
python ./ktransformers/local_chat.py --model_path <your model path> --gguf_path <your gguf path> --prompt_file <your prompt txt file> --cpu_infer 65 --max_new_tokens 1000
<when you see chat, then press enter to load the text prompt_file>
```
@ -170,6 +252,18 @@ DeepSeek's MLA operators are highly computationally intensive. While running eve
5. Why Intel CPUs?
Intel is currently the only CPU vendor that supports AMX-like instructions, which delivers significantly better performance compared to AVX-only alternatives.
## Next
### Faster
* The FlashInfer (https://github.com/flashinfer-ai/flashinfer) project is releasing an even more efficient fused MLA operator, promising further speedups
* vLLM has explored multi-token prediction in DeepSeek-V3, and support is on our roadmap for even better performance
* We are collaborating with Intel to enhance the AMX kernel (v0.3) and optimize for Xeon6/MRDIMM
### Easier
* Official Docker images to simplify installation
* Fix the server integration for web API access
* Fix the local chat only accepting a single line prompt (currently \n begins generating prompt)
* Support for more quantization types, including the highly requested dynamic quantization from unsloth
Stay tuned for more updates!
## FAQ
### R1 No Thinking
Attention! If you are testing R1, it may skip thinking. You can add the arg `--force_think true`; the details are in the [FAQ](./FAQ.md) <br>

doc/en/Docker.md

@ -7,7 +7,7 @@
## Images
There is a Docker image available for our project, you can pull the docker image by
```
docker pull approachingai/ktransformers:0.1.1
docker pull approachingai/ktransformers:0.2.1
```
**Notice**: In this image, ktransformers is compiled for CPUs with AVX512 instructions. If your CPU does not support AVX512, it is suggested to recompile and install ktransformers in the /workspace/ktransformers directory within the container.
@ -16,14 +16,16 @@ docker pull approachingai/ktransformers:0.1.1
- finish, execute
```bash
docker build -t approachingai/ktransformers:v0.1.1 .
docker build -t approachingai/ktransformers:0.2.1 .
```
## Usage
Assuming you have the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) installed so that you can use the GPU in a Docker container.
```
docker run --gpus all -v /path/to/models:/models -p 10002:10002 approachingai/ktransformers:v0.1.1 --port 10002 --gguf_path /models/path/to/gguf_path --model_path /models/path/to/model_path --web True
docker run --gpus all -v /path/to/models:/models --name ktransformers -itd approachingai/ktransformers:0.2.1
docker exec -it ktransformers /bin/bash
python -m ktransformers.local_chat --gguf_path /models/path/to/gguf_path --model_path /models/path/to/model_path --cpu_infer 33
```
More operators can be seen in the [readme](../../README.md)

doc/en/FAQ.md

@ -1,4 +1,18 @@
<!-- omit in toc -->
# FAQ
- [Install](#install)
- [Q: ImportError: /lib/x86\_64-linux-gnu/libstdc++.so.6: version GLIBCXX\_3.4.32' not found](#q-importerror-libx86_64-linux-gnulibstdcso6-version-glibcxx_3432-not-found)
- [Q: DeepSeek-R1 not outputting initial token](#q-deepseek-r1-not-outputting-initial--token)
- [Usage](#usage)
- [Q: If I got more VRAM than the model's requirement, how can I fully utilize it?](#q-if-i-got-more-vram-than-the-models-requirement-how-can-i-fully-utilize-it)
- [Q: If I don't have enough VRAM, but I have multiple GPUs, how can I utilize them?](#q-if-i-dont-have-enough-vram-but-i-have-multiple-gpus-how-can-i-utilize-them)
- [Q: How to get the best performance?](#q-how-to-get-the-best-performance)
- [Q: My DeepSeek-R1 model is not thinking.](#q-my-deepseek-r1-model-is-not-thinking)
- [Q: Loading gguf error](#q-loading-gguf-error)
- [Q: Version \`GLIBCXX\_3.4.30' not found](#q-version-glibcxx_3430-not-found)
- [Q: When running the bfloat16 moe model, the data shows NaN](#q-when-running-the-bfloat16-moe-model-the-data-shows-nan)
- [Q: Using fp8 prefill very slow.](#q-using-fp8-prefill-very-slow)
- [Q: Possible ways to run graphics cards using volta and turing architectures](#q-possible-ways-to-run-graphics-cards-using-volta-and-turing-architectures)
## Install
### Q: ImportError: /lib/x86_64-linux-gnu/libstdc++.so.6: version GLIBCXX_3.4.32' not found
```
@ -25,7 +39,7 @@ from-https://github.com/kvcache-ai/ktransformers/issues/129#issue-2842799552
1. local_chat.py: You can increase the context window size by setting `--max_new_tokens` to a larger value.
2. server: Increase `--cache_lens` to a larger value.
2. Move more weights to the GPU.
Refer to the ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-marlin.yaml
Refer to the ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml
```yaml
- match:
name: "^model\\.layers\\.([4-10])\\.mlp\\.experts$" # inject experts in layer 4~10 as marlin expert
@ -39,11 +53,13 @@ from-https://github.com/kvcache-ai/ktransformers/issues/129#issue-2842799552
You can modify layer as you want, eg. `name: "^model\\.layers\\.([4-10])\\.mlp\\.experts$"` to `name: "^model\\.layers\\.([4-12])\\.mlp\\.experts$"` to move more weights to the GPU.
> Note: The first matched rule in yaml will be applied. For example, if you have two rules that match the same layer, only the first rule's replacement will be valid.
> Note: Currently, executing experts on the GPU will conflict with CUDA Graph. Without CUDA Graph, there will be a significant slowdown. Therefore, unless you have a substantial amount of VRAM (placing a single layer of experts for DeepSeek-V3/R1 on the GPU requires at least 5.6GB of VRAM), we do not recommend enabling this feature. We are actively working on optimization.
> Note: KExpertsTorch is untested.
### Q: If I don't have enough VRAM, but I have multiple GPUs, how can I utilize them?
Use the `--optimize_rule_path ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml` to load the two optimized rule yaml file. You may also use it as an example to write your own 4/8 gpu optimized rule yaml file.
Use the `--optimize_config_path ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml` to load the two optimized rule yaml file. You may also use it as an example to write your own 4/8 gpu optimized rule yaml file.
> Note: The ktransformers multi-GPU strategy is pipeline parallelism, which is not able to speed up the model's inference; it only distributes the model's weights across GPUs.
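A sketch of the corresponding local chat invocation (the model and GGUF paths are placeholders you need to fill in):

```sh
python -m ktransformers.local_chat \
  --model_path <your model path> \
  --gguf_path <your gguf path> \
  --optimize_config_path ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml
```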
@ -53,7 +69,7 @@ You have to set `--cpu_infer` to the number of cores you want to use. The more c
### Q: My DeepSeek-R1 model is not thinking.
According to DeepSeek, you need to enforce the model to initiate its response with "\<think>\n" at the beginning of every output by passing the arg `--force_think true `.
According to DeepSeek, you need to enforce the model to initiate its response with "\<think>\n" at the beginning of every output by passing the arg `--force_think True `.
### Q: Loading gguf error
@ -61,9 +77,91 @@ Make sure you:
1. Have the `gguf` file in the `--gguf_path` directory.
2. The directory only contains gguf files from one model. If you have multiple models, you need to separate them into different directories.
3. The folder name itself should not end with `.gguf`, e.g. `Deep-gguf` is correct, `Deep.gguf` is wrong.
4. The file itself is not corrupted; you can verify this by checking that the sha256sum matches the one from Hugging Face, ModelScope, or hf-mirror.
### Q: Version `GLIBCXX_3.4.30' not found
The detailed error:
>ImportError: /mnt/data/miniconda3/envs/xxx/bin/../lib/libstdc++.so.6: version `GLIBCXX_3.4.30' not found (required by /home/xxx/xxx/ktransformers/./cpuinfer_ext.cpython-312-x86_64-linux-gnu.so)
This usually means the libstdc++ in your conda environment is too old. You can first leave your conda env with `conda deactivate` and use `whereis libstdc++.so.6` to find the system library path, then re-enter your conda env and copy the .so with `cp <path of outer libstdc++> <path of your conda env libstdc++>`.
Alternatively, running `conda install -c conda-forge libstdcxx-ng` inside the environment can solve the problem.
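A condensed shell sketch of both workarounds (the copy path below is typical for Ubuntu and may differ on your distro; `ktransformers` is the env name used elsewhere in these docs):
```shell
conda deactivate
whereis libstdc++.so.6                                   # note the system library path it prints
conda activate ktransformers
cp /usr/lib/x86_64-linux-gnu/libstdc++.so.6 "$CONDA_PREFIX/lib/"
# or, usually simpler:
conda install -c conda-forge libstdcxx-ng
```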
### Q: When running the bfloat16 moe model, the data shows NaN
The detailed error:
```shell
Traceback (most recent call last):
File "/root/ktransformers/ktransformers/local_chat.py", line 183, in <module>
fire.Fire(local_chat)
File "/usr/local/lib/python3.10/dist-packages/fire/core.py", line 135, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/usr/local/lib/python3.10/dist-packages/fire/core.py", line 468, in _Fire
component, remaining_args = _CallAndUpdateTrace(
File "/usr/local/lib/python3.10/dist-packages/fire/core.py", line 684, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "/root/ktransformers/ktransformers/local_chat.py", line 177, in local_chat
generated = prefill_and_generate(
File "/root/ktransformers/ktransformers/util/utils.py", line 204, in prefill_and_generate
next_token = decode_one_tokens(cuda_graph_runner, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, use_cuda_graph).to(torch_device)
File "/root/ktransformers/ktransformers/util/utils.py", line 128, in decode_one_tokens
next_token = torch.multinomial(probs, num_samples=1).squeeze(1)
RuntimeError: probability tensor contains either `inf`, `nan` or element < 0
```
**SOLUTION**: This issue when running ktransformers on Ubuntu 22.04 is caused by the system's g++ being too old, so its pre-defined macros do not include avx_bf16. We have tested and confirmed that it works with g++ 11.4 on Ubuntu 22.04.
### Q: Using fp8 prefill is very slow
The FP8 kernel is built via JIT compilation, so the first run will be slow; subsequent runs will be faster.
### Q: Possible ways to run graphics cards using volta and turing architectures
From: https://github.com/kvcache-ai/ktransformers/issues/374
1. First, download the latest source code using git.
2. Then, modify DeepSeek-V3-Chat-multi-gpu-4.yaml in the source code and all related yaml files, replacing all instances of KLinearMarlin with KLinearTorch (a sed-based sketch follows this list).
3. Next, you need to compile from the ktransformer source code until it successfully compiles on your local machine.
4. Then, install flash-attn. It won't be used, but not installing it will cause an error.
5. Then, modify local_chat.py, replacing all instances of flash_attention_2 with eager.
6. Then, run local_chat.py. Be sure to follow the official tutorial's commands and adjust according to your local machine's parameters.
7. During the running process, check the memory usage. Observe its invocation through the top command. The memory capacity on a single CPU must be greater than the complete size of the model. (For multiple CPUs, it's just a copy.)
Finally, confirm that the model is fully loaded into memory and specific weight layers are fully loaded into the GPU memory. Then, try to input content in the chat interface and observe if there are any errors.
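A rough sed-based sketch of steps 2 and 5 above (back up the files first; the glob over the rule files is illustrative, adjust it to the YAML files you actually use):
```shell
sed -i.bak 's/KLinearMarlin/KLinearTorch/g' ktransformers/optimize/optimize_rules/*.yaml
sed -i.bak 's/flash_attention_2/eager/g' ktransformers/local_chat.py
```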
Note: for better performance, you can check this [method](https://github.com/kvcache-ai/ktransformers/issues/374#issuecomment-2667520838) in the issue:
>
>https://github.com/kvcache-ai/ktransformers/blob/89f8218a2ab7ff82fa54dbfe30df741c574317fc/ktransformers/operators/attention.py#L274-L279
>
>```diff
>+ original_dtype = query_states.dtype
>+ target_dtype = torch.half
>+ query_states = query_states.to(target_dtype)
>+ compressed_kv_with_k_pe = compressed_kv_with_k_pe.to(target_dtype)
>+ compressed_kv = compressed_kv.to(target_dtype)
>+ attn_output = attn_output.to(target_dtype)
>
>decode_attention_fwd_grouped(query_states, compressed_kv_with_k_pe, compressed_kv, attn_output,
> page_table,
> position_ids.squeeze(0).to(torch.int32)+1, attn_logits,
> 4, #num_kv_splits # follow vLLM, fix it TODO
> self.softmax_scale,
> past_key_value.page_size)
>
>+ attn_output = attn_output.to(original_dtype)
>```
>
>https://github.com/kvcache-ai/ktransformers/blob/89f8218a2ab7ff82fa54dbfe30df741c574317fc/ktransformers/operators/attention.py#L320-L326
>
>```diff
>- attn_output = flash_attn_func(
>- query_states,
>- key_states,
>- value_states_padded,
>- softmax_scale=self.softmax_scale,
>- causal=True,
>- )
>+ attn_output = F.scaled_dot_product_attention(
>+ query_states.transpose(1, 2),
>+ key_states.transpose(1, 2),
>+ value_states_padded.transpose(1, 2),
>+ scale=self.softmax_scale,
>+ is_causal=True
>+ ).transpose(1, 2)
>```

11
doc/en/V3-success.md Normal file
View file

@ -0,0 +1,11 @@
## Hello everyone, here are the successfully reproduced environment configurations for your reference:
### Case 1
- Configuration: l40s 48G + 9654 x2 (192 cores) + 768G DDR5 12-channel
- Performance: prefill 108 tokens/s, decode 10.8 tokens/s
- Used version: main source code compiled
### Case 2
- Configuration: Dual Xeon 6430 32C processors, totaling 64 cores and 128 threads, 480GB DDR5 memory, single 4090 24G graphics card
- Performance: Running speed approximately 6-8 tokens per second
## NOTE
If there are any other configurations that have been successfully run, please feel free to let us know. We will keep updating for everyone to refer to when reproducing. (It has been found that it also works on 2080, AMD, etc. (doge : )
[click here](https://docs.qq.com/smartsheet/form/AVxgQOYhhNfl%2FBB08J2%2Fv3rnnq?tab=BB08J2)

View file

@ -8,6 +8,20 @@ This document provides the necessary steps to set up and run the web service for
Before you can compile the web code, make sure you have installed [Node.js](https://nodejs.org) version 18.3 or higher
Note: The version of Node.js in the Ubuntu or Debian GNU/Linux software repository is too low, causing compilation errors. Users can also install Node.js through the Nodesource repository, provided they uninstall the outdated version first.
```bash
# sudo apt-get remove nodejs npm -y && sudo apt-get autoremove -y
sudo apt-get update -y && sudo apt-get install -y apt-transport-https ca-certificates curl gnupg
curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | sudo gpg --dearmor -o /usr/share/keyrings/nodesource.gpg
sudo chmod 644 /usr/share/keyrings/nodesource.gpg
echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/nodesource.gpg] https://deb.nodesource.com/node_23.x nodistro main" | sudo tee /etc/apt/sources.list.d/nodesource.list
sudo apt-get update -y
sudo apt-get install nodejs -y
```
Once npm is installed, navigate to the `ktransformers/website` directory:
```bash

63
doc/en/benchmark.md Normal file
View file

@ -0,0 +1,63 @@
## Benchmark
To conduct a quick and convenient check, we have employed a simple Python script available [here](https://github.com/kvcache-ai/ktransformers/tree/main/ktransformers/tests) to assess the precision of our **[ktransformers](https://github.com/kvcache-ai/ktransformers)** project. For this evaluation, we utilized the same dataset, which was shuffled in a consistent manner and limited to the first 1,000 data points, to test our implementation across a variety of CPU kernels, MLA kernels, and quantization formats.
We selected the DeepSeek-V3 model in its bf16, int8, and q4km versions for this test. The MMLU dataset, which can be found [here](https://huggingface.co/datasets/cais/mmlu), was used (we selected all datasets and shuffled them with a fixed random seed).
**!!! However, we skipped the few-shot part and only chose the first 1,000 data points for a quick check.** Please note that this approach may lead to results that are not consistent with DeepSeek-V3's technical report. Tests of R1 and further benchmarks are ongoing.
To verify our results, we chose a [cloud service platform](https://cloud.siliconflow.cn/models) as the baseline. All tests were conducted using the same script and datasets, allowing us to make a preliminary assessment of our project's precision.
We set the argument `temperature=0.6`, and to simplify the test process, we skipped the few-shot part and used the following prompt: `There is a single choice question. Answer the question by replying A, B, C, D. No other answers are accepted. Just the letter. \nQuestion: {question}\nA. {option_a}\nB. {option_b}\nC. {option_c}\nD. {option_d}\nAnswer: '`. For more details, please refer to the [script](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/tests/mmlu_test.py).
Given that we have only tested 1,000 cases, which provides only a preliminary judgment, some fluctuations in the results are reasonable. We selected all datasets and shuffled them with a fixed random seed to ensure consistency.
## Some Details
- The bf16 model of DeepSeek-V3 is available [here](https://huggingface.co/opensourcerelease/DeepSeek-V3-bf16/tree/main) (you may convert it to gguf by llama.cpp). The q4km model can be found [here](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q4_K_M).
- The optimization YAML file is located [here](https://github.com/kvcache-ai/ktransformers/tree/main/ktransformers/optimize/optimize_rules). For the GEMM Kernel, you can change `KLinearMarlin` to `KLinearTorch`.
- To switch the MLA Kernel from Triton to Torch, you can check and modify [this file](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/attention.py), specifically by using the `forward_windows` method.
- When attempting to conduct the bf16 test (both CPU Weight and GPU Weight), you may encounter issues stemming from older versions of g++ and as, particularly when using Ubuntu 20 or earlier versions. To facilitate a smoother experience and enable you to reproduce our results, we have provided a development container. This container offers a pre-configured environment tailored for this purpose. However, please note that the container does not have the ktrans package installed. Therefore, you may still need to manually install certain packages to ensure everything runs smoothly.
- You may configure the model mount directory in `.devcontainer/devcontainer.json`; check the `"mounts":` entry.
## The Result Table
The tests use the DeepSeek-V3 model (some specific cases use R1).
| DataSet | CPU Weight Format | CPU Kernel | GPU Weight Format | GEMM Kernel | MLA Kernel | [Siliconflow](https://cloud.siliconflow.cn/models) | Ktrans Point |
| ------------------------ | ----------------- | ---------- | ----------------- | ----------- | ---------- | -------------------------------------------------- | ------------ |
| MMLU<br><br>(shuffle 1k) | | | | | | | |
| 1 | bf16 | cpuinfer | bf16 | torch | torch | 81.6 | 81.9 |
| 2 | q8_0 | cpuinfer | bf16 | torch | torch | 81.6 | 83.1 |
| 3 | q4km | cpuinfer | bf16 | torch | triton | 81.6 | 81.4 |
| 4 | q4km | cpuinfer | q4km->marlin 8 | marlin | triton | 81.6 | 81.1 |
| 5 | q4km | cpuinfer | q4km->marlin 4 | marlin | triton | 81.6 | 81 |
| 6 | q4km | cpuinfer | fp8 | fp8gemm | triton | 81.6 | 81.5 |
| 7 (DeepSeek-R1) | iq1 | cpuinfer | fp8 | fp8gemm | triton | 78.6 | 83.6 |
| MMLU-pro<br>(shuffle 1k) | | | | | | | |
| 1 | q4km | cpuinfer | fp8 | fp8gemm | triton | 57.7 | 57.6 |
| 2 | q4km | cpuinfer | q4km->marlin 4 | marlin | triton | 57.7 | 57.5 |
| 3 (DeepSeek-R1) | iq1 | cpuinfer | fp8 | fp8gemm | triton | 71.9 | tbd |
| HumanEval | tbd | tbd | tbd | tbd | tbd | tbd | tbd |
| GSM8K | tbd | tbd | tbd | tbd | tbd | tbd | tbd |
**The details for each case are listed below**:
By default, the MLA kernel uses Triton on Linux and Torch on Windows. But we need to test Torch on Linux, so we manually modify the [file](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/attention.py#L592): remove all the if branches and force it to use `self.forward_windows`.
- MMLU test
1. [v3-chat_yaml](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml) change all the `KLinearMarlin` to `KLinearTorch` (just find all the usage in this file). The source weight comes from [there](https://huggingface.co/opensourcerelease/DeepSeek-V3-bf16) (you need to use llama.cpp to convert it to gguf)
2. [v3-chat_yaml](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml). You need to modify the code to separately load cpu's expert weight. We leave this as comment in these places: [1](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/experts.py#L122), [2](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/experts.py#L136), [3](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/experts.py#L137) (note in 3, change the path to your local weight file path). The weight file for q8_0 is [here](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q8_0)
3. [v3-chat_yaml](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml). You need to modify the code to separately load cpu's expert weight. We leave this as comment in these places: [1](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/experts.py#L122), [2](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/experts.py#L136), [3](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/experts.py#L137) (note in 3, change the path to your local weight file path). The weight file for q4km is [here](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q4_K_M)
4. [v3-chat_yaml](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml). You don't need to change the source code as they both use q4km. But note the yaml file [here](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml#L29) and [here](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml#L18), below these lines you need to add `num_bits: 8` (in other words: add this kwargs to all that use `KLinearMarlin`). The weight file for q4km is [here](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q4_K_M)
5. [v3-chat_yaml](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml). No need to change yaml, just use the default. The weight file for q4km is [here](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q4_K_M)
6. You should check the [doc](./fp8_kernel.md) to learn how to test this case. This is a mixture tensor case.
7. You should check the [doc](./fp8_kernel.md) to learn how to test this case. This is a mixture tensor case.
- MMLU-pro test
1. You should check the [doc](./fp8_kernel.md) to learn how to test this case. This is a mixture tensor case.
2. [v3-chat_yaml](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml). No need to change yaml, just use the default. The weight file for q4km is [here](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q4_K_M)
3. You should check the [doc](./fp8_kernel.md) to learn how to test this case. This is a mixture tensor case.

View file

@ -1,6 +1,6 @@
# Tutorial: Heterogeneous and Local DeepSeek-V2 Inference
# Tutorial: Heterogeneous and Local MoE Inference
DeepSeek-(Code)-V2 is a series of strong mixture-of-experts (MoE) models, featuring a total of 236 billion parameters, with 21 billion parameters activated per token. This model has demonstrated remarkable reasoning capabilities across various benchmarks, positioning it as one of the SOTA open models and nearly comparable in performance to GPT-4.
DeepSeek-(Code)-V2 is a series of strong mixture-of-experts (MoE) models, featuring a total of 236 billion parameters, with 21 billion parameters activated per token. This model has demonstrated remarkable reasoning capabilities across various benchmarks, positioning it as one of the SOTA open models and nearly comparable in performance to GPT-4. DeepSeek-R1 uses a similar architecture to DeepSeek-V2, but with a much larger number of parameters.
<p align="center">
<picture>
@ -24,7 +24,7 @@ The following figure provides a brief overview of DeepSeek-V2 architecture. At t
<p align="center">
<picture>
<img alt="DeepSeek on KTransformers" src="../assets/DeepSeek-on-KTransformers.PNG" width=80%>
<img alt="DeepSeek on KTransformers" src="../assets/DeepSeek-on-KTransformers.png" width=80%>
</picture>
</p>

76
doc/en/fp8_kernel.md Normal file
View file

@ -0,0 +1,76 @@
# FP8 Linear Kernel for DeepSeek-V3/R1
## Overview
The DeepSeek-AI team provides FP8 safetensors for DeepSeek-R1/V3 models. We achieve performance optimization through the following works:
- **FP8 GPU Kernel Integration**: FP8 linear layer acceleration kernels integrated in KTransformers
- **Hybrid Quantization Architecture**:
- Attention and Shared-Expert modules use FP8 precision (enhances computational accuracy)
- Experts modules retain GGML quantization (GGUF format, reside in CPU to save GPU memory)
So those who are pursuing the best performance can use the FP8 linear kernel for DeepSeek-V3/R1.
## Key Features
✅ Hybrid Precision Architecture (FP8 + GGML)<br>
✅ Memory Optimization (~19GB VRAM usage)
## Quick Start
### Using Pre-Merged Weights
Pre-merged weights are available on Hugging Face:<br>
[KVCache-ai/DeepSeek-V3-GGML-FP8-Hybrid](https://huggingface.co/KVCache-ai/DeepSeek-V3)<br>
[KVCache-ai/DeepSeek-R1-GGML-FP8-Hybrid](https://huggingface.co/KVCache-ai/DeepSeek-R1)
> Please confirm the weights are fully uploaded before downloading. The large file size may extend Hugging Face upload time.
Download Pre-Merged Weights
```shell
pip install -U huggingface_hub
# Optional: Use the HF mirror for faster downloads in some regions.
# export HF_ENDPOINT=https://hf-mirror.com
huggingface-cli download --resume-download KVCache-ai/DeepSeek-V3-GGML-FP8-Hybrid --local-dir <local_dir>
```
### Using merge scripts
If you have local DeepSeek-R1/V3 FP8 safetensors and GGUF weights (e.g. q4km), you can merge them using the following script.
```shell
python merge_tensors/merge_safetensor_gguf.py \
--safetensor_path <fp8_safetensor_path> \
--gguf_path <gguf_folder_path> \
--output_path <merged_output_path>
```
* `--safetensor_path`: input path of safetensor file([Download](https://huggingface.co/deepseek-ai/DeepSeek-V3/tree/main)).
* `--gguf_path`: input path of gguf folder ([Download](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q4_K_M)).
* `--output_path`: output path of merged file.
### Execution Notes
Launch local_chat.py with custom quantized experts
```shell
python ktransformers/local_chat.py \
--model_path deepseek-ai/DeepSeek-V3 \
--gguf_path <merged_weights_folder> \
--optimize_config_path ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-fp8-linear-ggml-experts.yaml \
--cpu_infer <cpu_cores + 1>
```
## Notes
⚠️ Hardware Requirements<br>
* Recommended minimum 19GB available VRAM for FP8 kernel.
* Requires GPU with FP8 support (e.g., 4090)
⏳ First-Run Optimization
JIT compilation causes longer initial execution (subsequent runs retain optimized speed).
🔄 Temporary Interface<br>
Current weight loading implementation is provisional - will be refined in future versions
📁 Path Specification<br>
Despite hybrid quantization, merged weights are stored as .safetensors - pass the containing folder path to `--gguf_path`

View file

@ -59,6 +59,7 @@ Supported operators and their corresponding classes are as follows:
| Linear | KTransformersLinear | KLinearMarlin | Marlin as backend |
| | | KLinearTorch | pytorch as backend |
| | | KLinearCPUInfer | llamafile as backend |
| | | KLinearFP8 | Triton fp8_gemm kernel. Requires a GPU that can compute fp8 data |
| experts | KTransformersExperts | KExpertsTorch | pytorch as backend |
| | | KExpertsMarlin | Marlin as backend |
| | | KExpertsCPU | llamafile as backend |

295
doc/en/install.md Normal file
View file

@ -0,0 +1,295 @@
<!-- omit in toc -->
# How to Run DeepSeek-R1
- [Preparation](#preparation)
- [Installation](#installation)
- [Attention](#attention)
- [Supported models include:](#supported-models-include)
- [Support quantize format:](#support-quantize-format)
In this document, we will show you how to install and run KTransformers on your local machine. There are two versions:
* V0.2 is the current main branch.
* V0.3 is a preview version that only provides a binary distribution for now.
* To reproduce our DeepSeek-R1/V3 results, please refer to [Deepseek-R1/V3 Tutorial](./DeepseekR1_V3_tutorial.md) for more detail settings after installation.
## Preparation
Some preparation:
- CUDA 12.1 or above; if you don't have it yet, you may install it from [here](https://developer.nvidia.com/cuda-downloads).
```sh
# Adding CUDA to PATH
if [ -d "/usr/local/cuda/bin" ]; then
export PATH=$PATH:/usr/local/cuda/bin
fi
if [ -d "/usr/local/cuda/lib64" ]; then
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64
# Or you can add it to /etc/ld.so.conf and run ldconfig as root:
# echo "/usr/local/cuda-12.x/lib64" | sudo tee -a /etc/ld.so.conf
# sudo ldconfig
fi
if [ -d "/usr/local/cuda" ]; then
export CUDA_PATH=$CUDA_PATH:/usr/local/cuda
fi
```
- Linux-x86_64 with gcc, g++ and cmake (using Ubuntu as an example)
```sh
sudo apt-get update
sudo apt-get install build-essential cmake ninja-build
```
- We recommend using [Miniconda3](https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh) or [Anaconda3](https://repo.anaconda.com/archive/Anaconda3-2024.10-1-Linux-x86_64.sh) to create a virtual environment with Python=3.11 to run our program. Assuming your Anaconda installation directory is `~/anaconda3`, you should ensure that the version identifiers of the GNU C++ standard library used by Anaconda include `GLIBCXX_3.4.32`.
```sh
conda create --name ktransformers python=3.11
conda activate ktransformers # you may need to run conda init and reopen shell first
conda install -c conda-forge libstdcxx-ng # Anaconda provides a package called `libstdcxx-ng` that includes a newer version of `libstdc++`, which can be installed via `conda-forge`.
strings ~/anaconda3/envs/ktransformers/lib/libstdc++.so.6 | grep GLIBCXX
```
- Make sure that PyTorch, packaging, and ninja are installed. You can also [install previous versions of PyTorch](https://pytorch.org/get-started/previous-versions/).
```
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
pip3 install packaging ninja cpufeature numpy
```
- At the same time, you should download and install the corresponding version of flash-attention from https://github.com/Dao-AILab/flash-attention/releases.
## Installation
### Attention
If you want to use NUMA support, you not only need to set `USE_NUMA=1` but also need to make sure libnuma-dev is installed (`sudo apt-get install libnuma-dev` may help).
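A minimal sketch of the NUMA-enabled build described above:
```shell
sudo apt-get install libnuma-dev
export USE_NUMA=1
bash install.sh
```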
<!-- 1. ~~Use a Docker image, see [documentation for Docker](./doc/en/Docker.md)~~
>We are working on the latest docker image, please wait for a while.
2. ~~You can install using Pypi (for linux):~~
> We are working on the latest pypi package, please wait for a while.
```
pip install ktransformers --no-build-isolation
```
for windows we prepare a pre compiled whl package on [ktransformers-0.2.0+cu125torch24avx2-cp312-cp312-win_amd64.whl](https://github.com/kvcache-ai/ktransformers/releases/download/v0.2.0/ktransformers-0.2.0+cu125torch24avx2-cp312-cp312-win_amd64.whl), which require cuda-12.5, torch-2.4, python-3.11, more pre compiled package are being produced. -->
* Download source code and compile:
- init source code
```sh
git clone https://github.com/kvcache-ai/ktransformers.git
cd ktransformers
git submodule init
git submodule update
```
- [Optional] If you want to run with the website, please [compile the website](./api/server/website.md) before executing `bash install.sh`
- For Linux
- For simple install:
```shell
bash install.sh
```
- For those who have two CPU sockets and 1TB of RAM:
```shell
# Make sure your system has dual sockets and RAM at least twice the model's size (e.g. 1TB RAM for a 512GB model)
apt install libnuma-dev
export USE_NUMA=1
bash install.sh # or: make dev_install
```
- For Windows
```shell
install.bat
```
* If you are a developer, you can make use of the Makefile to compile and format the code. <br> The detailed usage of the Makefile is [here](./makefile_usage.md)
<h3>Local Chat</h3>
We provide a simple command-line local chat Python script that you can run for testing.
> Note: this is a very simple test tool that only supports one round of chat, without any memory of previous input; if you want to try the full ability of the model, you may go to [RESTful API and Web UI](#id_666).
<h4>Run Example</h4>
```shell
# Begin from root of your cloned repo!
# Begin from root of your cloned repo!!
# Begin from root of your cloned repo!!!
# Download mzwing/DeepSeek-V2-Lite-Chat-GGUF from huggingface
mkdir DeepSeek-V2-Lite-Chat-GGUF
cd DeepSeek-V2-Lite-Chat-GGUF
wget https://huggingface.co/mradermacher/DeepSeek-V2-Lite-GGUF/resolve/main/DeepSeek-V2-Lite.Q4_K_M.gguf -O DeepSeek-V2-Lite-Chat.Q4_K_M.gguf
cd .. # Move to repo's root dir
# Start local chat
python -m ktransformers.local_chat --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
# If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try
# GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite
# python ktransformers.local_chat --model_path ./DeepSeek-V2-Lite --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
```
It features the following arguments:
- `--model_path` (required): Name of the model (such as "deepseek-ai/DeepSeek-V2-Lite-Chat" which will automatically download configs from [Hugging Face](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite)). Or if you already got local files you may directly use that path to initialize the model.
> Note: <strong>.safetensors</strong> files are not required in the directory. We only need config files to build model and tokenizer.
- `--gguf_path` (required): Path of a directory containing GGUF files, which can be downloaded from [Hugging Face](https://huggingface.co/mzwing/DeepSeek-V2-Lite-Chat-GGUF/tree/main). Note that the directory should only contain GGUF files of the current model, which means you need a separate directory for each model.
- `--optimize_config_path` (required except for Qwen2Moe and DeepSeek-V2): Path of YAML file containing optimize rules. There are two rule files pre-written in the [ktransformers/optimize/optimize_rules](ktransformers/optimize/optimize_rules) directory for optimizing DeepSeek-V2 and Qwen2-57B-A14, two SOTA MoE models.
- `--max_new_tokens`: Int (default=1000). Maximum number of new tokens to generate.
- `--cpu_infer`: Int (default=10). The number of CPUs used for inference. Should ideally be set to the (total number of cores - 2).
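Putting these together, a typical invocation looks like the sketch below (it reuses the DeepSeek-V2-Lite example above; adjust `--cpu_infer` to your core count):
```shell
python -m ktransformers.local_chat \
  --model_path deepseek-ai/DeepSeek-V2-Lite-Chat \
  --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF \
  --max_new_tokens 1000 \
  --cpu_infer 10
```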
<details>
<summary>Supported Models/quantization</summary>
### Supported models include:
| ✅ **Supported Models** | ❌ **Deprecated Models** |
|------------------------|------------------------|
| DeepSeek-R1 | ~~InternLM2.5-7B-Chat-1M~~ |
| DeepSeek-V3 | |
| DeepSeek-V2 | |
| DeepSeek-V2.5 | |
| Qwen2-57B | |
| DeepSeek-V2-Lite | |
| Mixtral-8x7B | |
| Mixtral-8x22B | |
### Support quantize format:
| ✅ **Supported Formats** | ❌ **Deprecated Formats** |
|--------------------------|--------------------------|
| Q2_K_L | ~~IQ2_XXS~~ |
| Q2_K_XS | |
| Q3_K_M | |
| Q4_K_M | |
| Q5_K_M | |
| Q6_K | |
| Q8_0 | |
</details>
<details>
<summary>Suggested Model</summary>
| Model Name | Model Size | VRAM | Minimum DRAM | Recommended DRAM |
| ------------------------------ | ---------- | ----- | --------------- | ----------------- |
| DeepSeek-R1-q4_k_m | 377G | 14G | 382G | 512G |
| DeepSeek-V3-q4_k_m | 377G | 14G | 382G | 512G |
| DeepSeek-V2-q4_k_m | 133G | 11G | 136G | 192G |
| DeepSeek-V2.5-q4_k_m | 133G | 11G | 136G | 192G |
| DeepSeek-V2.5-IQ4_XS | 117G | 10G | 107G | 128G |
| Qwen2-57B-A14B-Instruct-q4_k_m | 33G | 8G | 34G | 64G |
| DeepSeek-V2-Lite-q4_k_m | 9.7G | 3G | 13G | 16G |
| Mixtral-8x7B-q4_k_m | 25G | 1.6G | 51G | 64G |
| Mixtral-8x22B-q4_k_m | 80G | 4G | 86.1G | 96G |
| InternLM2.5-7B-Chat-1M | 15.5G | 15.5G | 8G(32K context) | 150G (1M context) |
More will come soon. Please let us know which models you are most interested in.
Be aware that you are subject to the corresponding model licenses when using [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V2/blob/main/LICENSE) and [QWen](https://huggingface.co/Qwen/Qwen2-72B-Instruct/blob/main/LICENSE).
</details>
<details>
<summary>Click To Show how to run other examples</summary>
* Qwen2-57B
```sh
pip install flash_attn # For Qwen2
mkdir Qwen2-57B-GGUF && cd Qwen2-57B-GGUF
wget https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct-GGUF/resolve/main/qwen2-57b-a14b-instruct-q4_k_m.gguf?download=true -O qwen2-57b-a14b-instruct-q4_k_m.gguf
cd ..
python -m ktransformers.local_chat --model_name Qwen/Qwen2-57B-A14B-Instruct --gguf_path ./Qwen2-57B-GGUF
# If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try
# GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct
# python ktransformers/local_chat.py --model_path ./Qwen2-57B-A14B-Instruct --gguf_path ./Qwen2-57B-GGUF
```
* Deepseek-V2
```sh
mkdir DeepSeek-V2-Chat-0628-GGUF && cd DeepSeek-V2-Chat-0628-GGUF
# Download weights
wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00001-of-00004.gguf -O DeepSeek-V2-Chat-0628-Q4_K_M-00001-of-00004.gguf
wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00002-of-00004.gguf -O DeepSeek-V2-Chat-0628-Q4_K_M-00002-of-00004.gguf
wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00003-of-00004.gguf -O DeepSeek-V2-Chat-0628-Q4_K_M-00003-of-00004.gguf
wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00004-of-00004.gguf -O DeepSeek-V2-Chat-0628-Q4_K_M-00004-of-00004.gguf
cd ..
python -m ktransformers.local_chat --model_name deepseek-ai/DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF
# If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try
# GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-V2-Chat-0628
# python -m ktransformers.local_chat --model_path ./DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF
```
| model name | weights download link |
|----------|----------|
| Qwen2-57B | [Qwen2-57B-A14B-gguf-Q4K-M](https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct-GGUF/tree/main) |
| DeepseekV2-coder |[DeepSeek-Coder-V2-Instruct-gguf-Q4K-M](https://huggingface.co/LoneStriker/DeepSeek-Coder-V2-Instruct-GGUF/tree/main) |
| DeepseekV2-chat |[DeepSeek-V2-Chat-gguf-Q4K-M](https://huggingface.co/bullerwins/DeepSeek-V2-Chat-0628-GGUF/tree/main) |
| DeepseekV2-lite | [DeepSeek-V2-Lite-Chat-GGUF-Q4K-M](https://huggingface.co/mzwing/DeepSeek-V2-Lite-Chat-GGUF/tree/main) |
| DeepSeek-R1 | [DeepSeek-R1-gguf-Q4K-M](https://huggingface.co/unsloth/DeepSeek-R1-GGUF/tree/main/DeepSeek-R1-Q4_K_M) |
</details>
<!-- pin block for jump -->
<span id='id_666'>
<h3>RESTful API and Web UI </h3>
Start without website:
```sh
ktransformers --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path /path/to/DeepSeek-V2-Lite-Chat-GGUF --port 10002
```
Start with website:
```sh
ktransformers --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path /path/to/DeepSeek-V2-Lite-Chat-GGUF --port 10002 --web True
```
Or, if you want to start the server with transformers, the model_path should contain safetensors files:
```bash
ktransformers --type transformers --model_path /mnt/data/model/Qwen2-0.5B-Instruct --port 10002 --web True
```
Access website with url [http://localhost:10002/web/index.html#/chat](http://localhost:10002/web/index.html#/chat) :
<p align="center">
<picture>
<img alt="Web UI" src="https://github.com/user-attachments/assets/615dca9b-a08c-4183-bbd3-ad1362680faf" width=90%>
</picture>
</p>
More information about the RESTful API server can be found [here](doc/en/api/server/server.md). You can also find an example of integrating with Tabby [here](doc/en/api/server/tabby.md).
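As a quick smoke test once the server is up, you can query it over HTTP. The sketch below assumes the server exposes an OpenAI-compatible `/v1/chat/completions` endpoint on the port passed via `--port`; treat the exact path and payload as assumptions and check the linked server documentation for the authoritative API.
```shell
# Assumes an OpenAI-compatible chat endpoint; adjust the path/fields per the server docs.
curl http://localhost:10002/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "DeepSeek-V2-Lite-Chat", "messages": [{"role": "user", "content": "Hello!"}]}'
```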

View file

@ -0,0 +1,118 @@
# Multi-GPU
Assume you have read the [Injection Tutorial](./injection_tutorial.md) and have a basic understanding of how to inject a model. In this tutorial, we will show you how to use KTransformers to run a model on multiple GPUs.
If you have multiple GPUs, you can set the device for each module to different GPUs.
DeepseekV2-Chat has 60 layers; if we have 2 GPUs, we can allocate 30 layers to each GPU. Complete multi-GPU rule examples are [here](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml).
<p align="center">
<picture>
<img alt="Inject-Struction" src="../assets/multi_gpu.png" width=60%>
</picture>
</p>
First of all, for multi-GPU we have to inject a new operator, `KDeepseekV2Model`, and set the division of the layers across the different GPUs. For our case, we have to set the `transfer_map` in the `KDeepseekV2Model` operator as follows:
```yaml
- match:
name: "^model$"
replace:
class: "ktransformers.operators.models.KDeepseekV2Model"
kwargs:
transfer_map:
30: "cuda:1"
```
And we have to set the device for each module in the model.
For example, for `routed experts`, the yaml for one GPU is:
```yaml
- match:
name: "^model\\.layers\\..*\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersExperts # Custom MoE kernel with expert parallelism
kwargs:
generate_device: "cuda:0"
generate_op: "MLPCUDAExperts"
out_device: "cuda:0"
recursive: False # Don't recursively inject submodules of this module
```
But for two GPUs, we need to set the device for each module in the model.
```yaml
# allocate the out_device of layers 0-29 to cuda:0
- match:
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersExperts # custom MoE kernel with expert parallelism
kwargs:
generate_device: "cpu"
generate_op: "KExpertsCPU"
out_device: "cuda:0"
recursive: False # don't recursively inject submodules of this module
# allocate the out_device of layers 30-59 to cuda:1
- match:
name: "^model\\.layers\\.([345][0-9])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersExperts # custom MoE kernel with expert parallelism
kwargs:
generate_device: "cpu"
generate_op: "KExpertsCPU"
out_device: "cuda:1"
recursive: False # don't recursively inject submodules of this module
```
For other modules, we can set the device in the same way.
# How to fully utilize multi-GPU's VRAM
When you have multiple GPUs, you can fully utilize the VRAM of each GPU by moving more weights to the GPU.
For example, for DeepSeekV2-Chat, we can move the weights of the experts to the GPU.
For example, the yaml for two GPUs is:
```yaml
- match:
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersExperts
kwargs:
generate_device: "cpu"
generate_op: "KExpertsCPU"
out_device: "cuda:0"
recursive: False
```
But if we have an extra 60GB of VRAM on cuda:0, we can move the experts of layers 4~8 to cuda:0.
```yaml
# Add new rule before old rule.
- match:
name: "^model\\.layers\\.([4-8])\\.mlp\\.experts$" # inject experts in layer 4~8 as marlin expert
replace:
class: ktransformers.operators.experts.KTransformersExperts
kwargs:
generate_device: "cuda:0"
generate_op: "KExpertsMarlin"
recursive: False
- match:
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersExperts
kwargs:
generate_device: "cpu"
generate_op: "KExpertsCPU"
out_device: "cuda:0"
recursive: False
```
Adjust the layer range as you want. Note that:
* The loading speed will be significantly slower for each expert moved to the GPU.
* You have to disable CUDA Graph if you want to move the experts to the GPU.
* For DeepSeek-R1/V3, each expert moved to the GPU will consume approximately 6GB of VRAM.
* The first matched rule in yaml will be applied. For example, if you have two rules that match the same layer, only the first rule's replacement will be valid.
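Once your custom rule file is ready, point the loader at it when launching. A minimal sketch (the rule file name and the model/GGUF paths below are placeholders of ours):
```shell
python ktransformers/local_chat.py \
  --model_path deepseek-ai/DeepSeek-V2-Chat \
  --gguf_path <your gguf dir> \
  --optimize_config_path ./my-multi-gpu-rules.yaml
```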

View file

@ -0,0 +1,173 @@
<!-- omit in toc -->
# GPT-4/o1-level Local VSCode Copilot on a Desktop with only 24GB VRAM
- [Summary](#summary)
- [Prerequisites](#prerequisites)
- [Benchmark Results](#benchmark-results)
  - [V0.2](#v02)
    - [Settings](#settings)
    - [Memory consumption](#memory-consumption)
    - [Benchmark results](#benchmark-results-1)
  - [V0.3-Preview](#v03-preview)
    - [Settings](#settings-1)
    - [Memory consumption](#memory-consumption-1)
    - [Benchmark results](#benchmark-results-2)
- [How to Run](#how-to-run)
  - [V0.2 Showcase](#v02-showcase)
    - [Single socket version (32 cores)](#single-socket-version-32-cores)
    - [Dual socket version (64 cores)](#dual-socket-version-64-cores)
  - [V0.3 Showcase](#v03-showcase)
    - [Dual socket version (64 cores)](#dual-socket-version-64-cores-1)
- [Some Explanations](#some-explanations)
- [FAQ](#faq)
  - [R1 does not think](#r1-does-not-return-the-thinking-process)
  - [More FAQ](#more-faq)
# Summary
> **Feb 10, 2025**: Support running DeepSeek-R1 and V3 on a single GPU (24GB VRAM) or multiple GPUs plus 382GB of DRAM, with speedups of up to 3~28x.<br>
Hi, we are the KTransformers team (previously known for our local CPU/GPU hybrid inference of the open-source DeepSeek-V2 project).
We heard your requests for DeepSeek-R1/V3 support, and we are excited to finally deliver! Sorry for the wait, but we have been cooking up something truly amazing!
Today, we are proud to announce that we not only support DeepSeek-R1/V3, as shown in the video below:
https://github.com/user-attachments/assets/ebd70bfa-b2c1-4abb-ae3b-296ed38aa285
</p>
- **[NEW!!!] Local 671B DeepSeek-Coder-V3/R1:** Running its Q4_K_M version using only 14GB VRAM and 382GB DRAM.
  - Prefill speed (tokens/s):
    - KTransformers: 54.21 (32 cores) → 74.362 (dual socket, 2×32 cores) → 255.26 (optimized AMX-based MoE kernel, V0.3 only) → 286.55 (selectively using 6 experts, V0.3 only)
    - Compared to 10.31 tokens/s in llama.cpp with 2×32 cores, up to a **27.79× speedup**.
  - Decode speed (tokens/s):
    - KTransformers: 8.73 (32 cores) → 11.26 (dual socket, 2×32 cores) → 13.69 (selectively using 6 experts, V0.3 only)
    - Compared to 4.51 tokens/s in llama.cpp with 2×32 cores, up to a **3.03× speedup**.
We also provide a preview of upcoming optimizations, including an Intel AMX-accelerated kernel and a selective expert activation method, which will significantly improve performance. With the V0.3 preview we achieve up to 286 tokens/s for prefill, making it up to **28× faster than llama.cpp** for local inference. The binary distribution is available now and the source code is coming soon! Check out the wheel package [here](https://github.com/kvcache-ai/ktransformers/releases/download/v0.1.4/ktransformers-0.3.0rc0+cu126torch26fancy-cp311-cp311-linux_x86_64.whl).
## Prerequisites
We ran our best-performance tests (V0.2) on the following configuration: <br>
CPU: Intel (R) Xeon (R) Gold 6454S, 1TB DRAM (2 NUMA nodes) <br>
GPU: 4090D with 24GB VRAM <br>
Memory: standard DDR5-4800 server memory (1TB)
## Benchmark Results
### V0.2
#### Settings
- Model: DeepseekV3-q4km (int4)<br>
- CPU: cpu_model_name: Intel (R) Xeon (R) Gold 6454S, 32 cores per socket, 2 sockets, 2 NUMA nodes
- GPU: 4090D with 24GB VRAM
- We test after the model is fully warmed up
#### Memory consumption:
- Single socket: 382GB DRAM, at least 14GB VRAM
- Dual socket: 1TB DRAM, at least 14GB VRAM
#### Benchmark results
The "6 experts" case is part of the V0.3 preview.
| Prompt<br>(500 tokens) | Dual socket Ktrans (6 experts) | Dual socket Ktrans (8 experts) | Single socket Ktrans (6 experts) | Single socket Ktrans (8 experts) | llama.cpp (8 experts) |
|------------------------| --- | --- | --- | --- | --- |
| Prefill token/s | 97.32 | 82.94 | 65.14 | 54.21 | 10.31 |
| Decode token/s | 13.69 | 12.208 | 10.303 | 8.73 | 4.51 |
**The highest speedup reaches <u>3.03x</u> in decoding and <u>9.44x</u> in prefill.**
### V0.3-Preview
#### Settings
- Model: DeepseekV3-BF16 (online-quantized to int8 for the CPU and int4 for the GPU)
- CPU: cpu_model_name: Intel (R) Xeon (R) Gold 6454S, 32 cores per socket, 2 sockets, 2 NUMA nodes
- GPU: (1~4)x 4090D with 24GB VRAM (longer prompts require more VRAM)
#### Memory consumption:
- 644GB DRAM, at least 14GB VRAM
#### Benchmark results
| Prompt length | 1K | 2K | 4K | 8K |
|---------------|-----|-----|-----|-----|
| KTrans (8 experts) Prefill token/s | 185.96 | 255.26 | 252.58 | 195.62 |
| KTrans (6 experts) Prefill token/s | 203.70 | 286.55 | 271.08 | 207.20 |
**The prefill speed of KTrans V0.3 is <u>3.45x</u> faster than KTrans V0.2, and <u>27.79x</u> faster than llama.cpp.**
**The decode speed is the same as KTrans V0.2 (6-expert version), so it is omitted.**
The main speedups come from:
- The Intel AMX instruction set and our specially designed cache-friendly memory layout
- The expert selection strategy, which activates fewer experts based on offline profiling results
*From our study of DeepSeekV2, DeepSeekV3 and DeepSeekR1, when we slightly reduce the number of activated experts during inference, the output quality does not change, while decoding and prefill get faster, which is encouraging. Our showcase therefore makes use of this finding.*
## How to Run
### V0.2 Showcase
#### Single socket version (32 cores)
Our local_chat test command is:
``` shell
git clone https://github.com/kvcache-ai/ktransformers.git
cd ktransformers
git submodule init
git submodule update
numactl -N 1 -m 1 python ./ktransformers/local_chat.py --model_path <your model path> --gguf_path <your gguf path> --prompt_file <your prompt txt file> --cpu_infer 33 --max_new_tokens 1000
<when you see chat, press enter to load the text prompt_file>
```
`<your model path>` can be a local path or an online path such as deepseek-ai/DeepSeek-V3. If you have trouble connecting, try the mirror hf-mirror.com <br>
`<your gguf path>` can also be an online path, but since the file is large we recommend downloading it and quantizing the model yourself (note that this is a directory path) <br>
`--max_new_tokens 1000` is the maximum output token length. If you find the answer is truncated, you can increase this number for longer answers (but be aware of out-of-memory issues, and increasing it slows down generation).
<br>
The purpose of the command `numactl -N 1 -m 1` is to avoid data transfer between NUMA nodes.<br>
Attention! When testing R1 it may skip the thinking stage. You can add the argument `--force_think true`, which is explained in the [FAQ](#faq) section.
#### Dual socket version (64 cores)
Before installing (with install.sh or `make dev_install`), make sure you set the environment variable `USE_NUMA=1`, e.g. via `export USE_NUMA=1` (if ktransformers is already installed, reinstall it with this environment variable set) <br>
Our local_chat test command is:
``` shell
git clone https://github.com/kvcache-ai/ktransformers.git
cd ktransformers
git submodule init
git submodule update
export USE_NUMA=1
make dev_install # or sh ./install.sh
python ./ktransformers/local_chat.py --model_path <your model path> --gguf_path <your gguf path> --prompt_file <your prompt txt file> --cpu_infer 65 --max_new_tokens 1000
<when you see chat, press enter to load the text prompt_file>
```
The parameters have the same meaning. But because we use dual sockets, we set cpu_infer to 65.
### V0.3 Showcase
#### Dual socket version (64 cores)
Our local_chat test command is:
``` shell
wget https://github.com/kvcache-ai/ktransformers/releases/download/v0.1.4/ktransformers-0.3.0rc0+cu126torch26fancy-cp311-cp311-linux_x86_64.whl
pip install ./ktransformers-0.3.0rc0+cu126torch26fancy-cp311-cp311-linux_x86_64.whl
python -m ktransformers.local_chat --model_path <your model path> --gguf_path <your gguf path> --prompt_file <your prompt txt file> --cpu_infer 65 --max_new_tokens 1000
<when you see chat, press enter to load the text prompt_file>
```
The parameters have the same meaning as in V0.2. But because we use dual sockets, we set cpu_infer to 65.
## Some Explanations
1. We also want to further exploit the two NUMA nodes on the Xeon Gold CPU. To avoid the cost of data transfer between nodes, we "copy" the critical matrices on both nodes, which increases memory consumption but speeds up the prefill and decode process. However, this method uses a lot of memory and is slow when loading weights, so please be patient while loading and monitor memory usage. We plan to optimize this large memory overhead. Stay tuned.
2. The command argument `--cpu_infer 65` specifies how many cores to use (exceeding the number of physical cores is fine, but more is not always better; lower it somewhat according to your actual core count).<br>
3. Why CPU/GPU hybrid inference?
DeepSeek's MLA operators are computationally intensive. Running everything on the CPU is feasible, but offloading the heavy computation to the GPU brings a huge performance boost.
4. Where does the speedup come from?
- Expert offload: unlike traditional layer-based or KVCache offloading (as in llama.cpp), we offload the expert computation to the CPU and MLA/KVCache to the GPU, which aligns perfectly with DeepSeek's architecture and achieves the best efficiency.
- Intel AMX optimization: our AMX-accelerated kernel is carefully tuned to run several times faster than the existing llama.cpp implementation. We plan to open-source this kernel after cleaning it up, and are considering contributing it upstream to llama.cpp.
5. Why Intel CPUs?
Intel is currently the only CPU vendor that supports AMX-like instructions, which delivers significantly better performance than AVX-only alternatives.
## FAQ
### R1 does not return the thinking process
Attention! When testing R1 it may skip the thinking stage. In that case you can add the argument `--force_think true`. Details are in the [FAQ](./FAQ.md). <br>
## Known Issues
* Fix the server integration so that the API can be accessed over the network
* Fix the issue that local chat only supports single-line prompt input (currently entering a newline (\n) starts generation immediately)
### More FAQ
[See here](./FAQ.md)

View file

@ -2,6 +2,8 @@
set -e
# clear build dirs
rm -rf build
rm -rf *.egg-info
rm -rf ktransformers/ktransformers_ext/build
rm -rf ktransformers/ktransformers_ext/cuda/build
rm -rf ktransformers/ktransformers_ext/cuda/dist

View file

@ -5,7 +5,7 @@ Description :
Author : kkk1nak0
Date : 2024-08-15 07:34:46
Version : 1.0.0
LastEditors : unicornchan
LastEditTime : 2025-02-10 00:59:53
LastEditors : chenxl
LastEditTime : 2025-02-15 03:53:02
'''
__version__ = "0.2.0"
__version__ = "0.2.3.post1"

View file

@ -30,6 +30,9 @@ if (NOT MSVC)
option(LLAMA_F16C "llama: enable F16C" OFF)
endif()
option(LLAMA_AVX512_FANCY_SIMD "llama: enable AVX512-VL, AVX512-BW, AVX512-DQ, AVX512-VNNI" OFF)
option(KTRANSFORMERS_USE_CUDA "ktransformers: use CUDA" OFF)
option(KTRANSFORMERS_USE_MUSA "ktransformers: use MUSA" OFF)
option(KTRANSFORMERS_USE_ROCM "ktransformers: use ROCM" OFF)
# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
@ -173,6 +176,7 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
list(APPEND ARCH_FLAGS -mavx512bw)
list(APPEND ARCH_FLAGS -mavx512dq)
list(APPEND ARCH_FLAGS -mavx512vnni)
list(APPEND ARCH_FLAGS -mavx512vpopcntdq)
endif()
if (LLAMA_AVX512_BF16)
list(APPEND ARCH_FLAGS -mavx512bf16)
@ -232,18 +236,40 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/llama.cpp ${CMAKE
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party)
if (WIN32)
include_directories("$ENV{CUDA_PATH}/include")
add_compile_definitions(KTRANSFORMERS_USE_CUDA=1)
elseif (UNIX)
find_package(CUDA)
find_package(HIP)
find_package(MUSAToolkit)
if(CUDA_FOUND)
if (KTRANSFORMERS_USE_CUDA)
find_package(CUDA REQUIRED)
include_directories("${CUDA_INCLUDE_DIRS}")
add_compile_definitions(KTRANSFORMERS_USE_CUDA=1)
endif()
if (KTRANSFORMERS_USE_ROCM)
find_package(HIP REQUIRED)
if(HIP_FOUND)
include_directories("${HIP_INCLUDE_DIRS}")
add_compile_definitions(KTRANSFORMERS_USE_ROCM=1)
endif()
endif()
if (KTRANSFORMERS_USE_MUSA)
if (NOT EXISTS $ENV{MUSA_PATH})
if (NOT EXISTS /opt/musa)
set(MUSA_PATH /usr/local/musa)
else()
set(MUSA_PATH /opt/musa)
endif()
else()
set(MUSA_PATH $ENV{MUSA_PATH})
endif()
list(APPEND CMAKE_MODULE_PATH "${MUSA_PATH}/cmake")
find_package(MUSAToolkit)
if (MUSAToolkit_FOUND)
include_directories("${MUSA_INCLUDE_DIRS}")
message(STATUS "MUSA Toolkit found")
add_compile_definitions(KTRANSFORMERS_USE_MUSA=1)
endif()
endif()
endif()
@ -260,22 +286,19 @@ target_link_libraries(${PROJECT_NAME} PRIVATE llama)
if(WIN32)
target_link_libraries(${PROJECT_NAME} PRIVATE "$ENV{CUDA_PATH}/lib/x64/cudart.lib")#CUDA::cudart
elseif(UNIX)
if(KTRANSFORMERS_USE_CUDA)
if(NOT DEFINED ENV{CUDA_HOME} OR "$ENV{CUDA_HOME}" STREQUAL "")
set(ENV{CUDA_HOME} "/usr/local/cuda")
endif()
if(CUDA_FOUND)
add_compile_definitions(USE_CUDA=1)
target_link_libraries(${PROJECT_NAME} PRIVATE "$ENV{CUDA_HOME}/lib64/libcudart.so")
message(STATUS "Building for CUDA")
endif()
if(HIP_FOUND)
if (KTRANSFORMERS_USE_ROCM)
add_compile_definitions(USE_HIP=1)
target_link_libraries(${PROJECT_NAME} PRIVATE "${ROCM_PATH}/lib/libamdhip64.so")
message(STATUS "Building for HIP")
endif()
if(MUSAToolkit_FOUND)
add_compile_definitions(USE_MUSA=1)
message(STATUS "Building for MUSA")
if(KTRANSFORMERS_USE_MUSA)
target_link_libraries(${PROJECT_NAME} PRIVATE MUSA::musart)
endif()
endif()

View file

@ -54,7 +54,12 @@ void Backend::do_work_stealing_job(int task_num,
init_func_ = init_func;
compute_func_ = compute_func;
finalize_func_ = finalize_func;
#ifdef USE_NUMA
// numa node location will be calculated based on the number of threads
thread_num_ = max_thread_num_;
#else
thread_num_ = std::min(max_thread_num_, task_num);
#endif
int base = task_num / thread_num_;
int remain = task_num % thread_num_;
thread_state_[0].end = base + (0 < remain);

View file

@ -17,6 +17,14 @@
#include <queue>
#include <thread>
#include <vector>
#ifdef KTRANSFORMERS_USE_CUDA
#include "vendors/cuda.h"
#elif KTRANSFORMERS_USE_MUSA
#include "vendors/musa.h"
#elif KTRANSFORMERS_USE_ROCM
#define __HIP_PLATFORM_AMD__
#include "vendors/hip.h"
#endif
#include "backend.h"
#include "task_queue.h"

View file

@ -0,0 +1,3 @@
## TODO
This directory can be removed after updating the version of `llama.cpp`.

View file

@ -0,0 +1,15 @@
#pragma once
#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#if CUDART_VERSION < 11020
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
#define cublasComputeType_t cudaDataType_t
#endif // CUDART_VERSION < 11020

View file

@ -0,0 +1,172 @@
#pragma once
#define HIP_ENABLE_WARP_SYNC_BUILTINS 1
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#include <hip/hip_bfloat16.h>
#ifdef __HIP_PLATFORM_AMD__
// for rocblas_initialize()
#include "rocblas/rocblas.h"
#endif // __HIP_PLATFORM_AMD__
#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N HIPBLAS_OP_N
#define CUBLAS_OP_T HIPBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_TF32_TENSOR_OP_MATH 0
#define CUDA_R_16F HIPBLAS_R_16F
#define CUDA_R_32F HIPBLAS_R_32F
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
#define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
#define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width)
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
#define cublasCreate hipblasCreate
#define cublasDestroy hipblasDestroy
#define cublasGemmEx hipblasGemmEx
#define cublasGemmBatchedEx hipblasGemmBatchedEx
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
#define cublasHandle_t hipblasHandle_t
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
#define cublasSetStream hipblasSetStream
#define cublasSgemm hipblasSgemm
#define cublasStatus_t hipblasStatus_t
#define cublasOperation_t hipblasOperation_t
#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags hipEventCreateWithFlags
#define cudaEventDisableTiming hipEventDisableTiming
#define cudaEventRecord hipEventRecord
#define cudaEventSynchronize hipEventSynchronize
#define cudaEvent_t hipEvent_t
#define cudaEventDestroy hipEventDestroy
#define cudaFree hipFree
#define cudaFreeHost hipHostFree
#define cudaGetDevice hipGetDevice
#define cudaGetDeviceCount hipGetDeviceCount
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaHostRegister hipHostRegister
#define cudaHostRegisterPortable hipHostRegisterPortable
#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
#define cudaHostUnregister hipHostUnregister
#define cudaLaunchHostFunc hipLaunchHostFunc
#define cudaMalloc hipMalloc
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
#define cudaMemcpy hipMemcpy
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
#define cudaMemcpy2DAsync hipMemcpy2DAsync
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyKind hipMemcpyKind
#define cudaMemset hipMemset
#define cudaMemsetAsync hipMemsetAsync
#define cudaMemGetInfo hipMemGetInfo
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
#define cudaSetDevice hipSetDevice
#define cuDeviceGet hipDeviceGet
#define CUdevice hipDevice_t
#define CUdeviceptr hipDeviceptr_t
#define cuMemUnmap hipMemUnmap
#define CUmemAccessDesc hipMemAccessDesc
#define cuMemAddressFree hipMemAddressFree
#define cuMemRelease hipMemRelease
#define CUmemGenericAllocationHandle hipMemGenericAllocationHandle_t
#define cuMemCreate hipMemCreate
#define cuMemAddressReserve hipMemAddressReserve
#define cuMemMap hipMemMap
#define cuMemSetAccess hipMemSetAccess
#define cuMemGetAllocationGranularity hipMemGetAllocationGranularity
#define CUmemAllocationProp hipMemAllocationProp
#define cuDeviceGetAttribute hipDeviceGetAttribute
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamDestroy hipStreamDestroy
#define cudaStreamFireAndForget hipStreamFireAndForget
#define cudaStreamNonBlocking hipStreamNonBlocking
#define cudaStreamPerThread hipStreamPerThread
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
#define cudaGraphExec_t hipGraphExec_t
#define cudaGraphNode_t hipGraphNode_t
#define cudaKernelNodeParams hipKernelNodeParams
#define cudaGraphExecDestroy hipGraphExecDestroy
#define cudaGraphLaunch hipGraphLaunch
#define cudaErrorGraphExecUpdateFailure hipErrorGraphExecUpdateFailure
#define cudaGraphExecUpdateResultInfo hipGraphExecUpdateResult
#define cudaGraphNodeType hipGraphNodeType
#define cudaGraphNodeTypeKernel hipGraphNodeTypeKernel
#define cudaGraphInstantiate hipGraphInstantiate
#define cudaStreamEndCapture hipStreamEndCapture
#define cudaGraphDestroy hipGraphDestroy
#define cudaGraphKernelNodeSetParams hipGraphKernelNodeSetParams
#define cudaErrorInvalidDeviceFunction hipErrorInvalidDeviceFunction
#define cudaGraphKernelNodeGetParams hipGraphKernelNodeGetParams
#define cudaGraphNodeGetType hipGraphNodeGetType
#define cudaGraphGetNodes hipGraphGetNodes
#define cudaGraphExecUpdate hipGraphExecUpdate
#define cudaStreamCaptureModeRelaxed hipStreamCaptureModeRelaxed
#define cudaStreamBeginCapture hipStreamBeginCapture
#define cudaGraph_t hipGraph_t
#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess
#define cudaHostFn_t hipHostFn_t
#define __trap() do { abort(); __builtin_unreachable(); } while(0)
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
#define __CUDA_ARCH__ 1300
#if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__)
#define GCN
#endif
#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__)
#define CDNA
#endif
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
defined(__gfx1150__) || defined(__gfx1151__)
#define RDNA3
#endif
#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
#define RDNA2
#endif
#if defined(__gfx1010__) || defined(__gfx1012__)
#define RDNA1
#endif
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
typedef hip_bfloat16 nv_bfloat16;

View file

@ -0,0 +1,137 @@
#pragma once
#include <musa_runtime.h>
#include <musa.h>
#include <mublas.h>
#include <musa_bf16.h>
#include <musa_fp16.h>
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N MUBLAS_OP_N
#define CUBLAS_OP_T MUBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
#define CUDA_R_16F MUSA_R_16F
#define CUDA_R_32F MUSA_R_32F
#define cublasComputeType_t cudaDataType_t
#define cublasCreate mublasCreate
#define cublasDestroy mublasDestroy
#define cublasGemmEx mublasGemmEx
#define cublasGemmBatchedEx mublasGemmBatchedEx
#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
#define cublasHandle_t mublasHandle_t
#define cublasSetMathMode mublasSetMathMode
#define cublasSetStream mublasSetStream
#define cublasSgemm mublasSgemm
#define cublasStatus_t mublasStatus_t
#define cublasOperation_t mublasOperation_t
#define cublasGetStatusString mublasStatus_to_string
#define cudaDataType_t musaDataType_t
#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
#define cudaDeviceProp musaDeviceProp
#define cudaDeviceSynchronize musaDeviceSynchronize
#define cudaError_t musaError_t
#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags musaEventCreateWithFlags
#define cudaEventDisableTiming musaEventDisableTiming
#define cudaEventRecord musaEventRecord
#define cudaEventSynchronize musaEventSynchronize
#define cudaEvent_t musaEvent_t
#define cudaEventDestroy musaEventDestroy
#define cudaFree musaFree
#define cudaFreeHost musaFreeHost
#define cudaGetDevice musaGetDevice
#define cudaGetDeviceCount musaGetDeviceCount
#define cudaGetDeviceProperties musaGetDeviceProperties
#define cudaGetErrorString musaGetErrorString
#define cudaGetLastError musaGetLastError
#define cudaHostRegister musaHostRegister
#define cudaHostRegisterPortable musaHostRegisterPortable
#define cudaHostRegisterReadOnly musaHostRegisterReadOnly
#define cudaHostUnregister musaHostUnregister
#define cudaLaunchHostFunc musaLaunchHostFunc
#define cudaMalloc musaMalloc
#define cudaMallocHost musaMallocHost
#define cudaMallocManaged musaMallocManaged
#define cudaMemcpy musaMemcpy
#define cudaMemcpyAsync musaMemcpyAsync
#define cudaMemcpyPeerAsync musaMemcpyPeerAsync
#define cudaMemcpy2DAsync musaMemcpy2DAsync
#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
#define cudaMemcpyHostToDevice musaMemcpyHostToDevice
#define cudaMemcpyKind musaMemcpyKind
#define cudaMemset musaMemset
#define cudaMemsetAsync musaMemsetAsync
#define cudaMemGetInfo musaMemGetInfo
#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
#define cudaSetDevice musaSetDevice
#define cudaStreamCreateWithFlags musaStreamCreateWithFlags
#define cudaStreamDestroy musaStreamDestroy
#define cudaStreamFireAndForget musaStreamFireAndForget
#define cudaStreamNonBlocking musaStreamNonBlocking
#define cudaStreamPerThread musaStreamPerThread
#define cudaStreamSynchronize musaStreamSynchronize
#define cudaStreamWaitEvent musaStreamWaitEvent
#define cudaStream_t musaStream_t
#define cudaSuccess musaSuccess
// Additional mappings for MUSA virtual memory pool
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
#define CUdevice MUdevice
#define CUdeviceptr MUdeviceptr
#define CUmemAccessDesc MUmemAccessDesc
#define CUmemAllocationProp MUmemAllocationProp
#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
#define cuDeviceGet muDeviceGet
#define cuDeviceGetAttribute muDeviceGetAttribute
#define cuMemAddressFree muMemAddressFree
#define cuMemAddressReserve muMemAddressReserve
#define cuMemCreate muMemCreate
#define cuMemGetAllocationGranularity muMemGetAllocationGranularity
#define cuMemMap muMemMap
#define cuMemRelease muMemRelease
#define cuMemSetAccess muMemSetAccess
#define cuMemUnmap muMemUnmap
#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize
#define cudaFuncSetAttribute musaFuncSetAttribute
#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
#define make_cudaExtent make_musaExtent
#define make_cudaPitchedPtr make_musaPitchedPtr
// Additional mappings for MUSA graphs
#define CUDA_SUCCESS MUSA_SUCCESS
#define CUresult MUresult
#define cuGetErrorString muGetErrorString
#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
#define cudaGraphDestroy musaGraphDestroy
#define cudaGraphExecDestroy musaGraphExecDestroy
#define cudaGraphExec_t musaGraphExec_t
#define cudaGraphExecUpdate musaGraphExecUpdate
#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult
#define cudaGraphGetNodes musaGraphGetNodes
#define cudaGraphInstantiate musaGraphInstantiate
#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
#define cudaGraphLaunch musaGraphLaunch
#define cudaGraphNodeGetType musaGraphNodeGetType
#define cudaGraphNode_t musaGraphNode_t
#define cudaGraphNodeType musaGraphNodeType
#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
#define cudaGraph_t musaGraph_t
#define cudaKernelNodeParams musaKernelNodeParams
#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
#define cudaStreamEndCapture musaStreamEndCapture
typedef mt_bfloat16 nv_bfloat16;

View file

@ -0,0 +1,13 @@
#ifndef CPUINFER_VENDOR_VENDOR_H
#define CPUINFER_VENDOR_VENDOR_H
#ifdef USE_CUDA
#include "cuda.h"
#elif USE_HIP
#define __HIP_PLATFORM_AMD__
#include "hip.h"
#elif USE_MUSA
#include "musa.h"
#endif
#endif // CPUINFER_VENDOR_VENDOR_H
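Taken together, the shims let the rest of the C++ sources include vendor.h and keep writing plain CUDA runtime calls, with the build defining exactly one of USE_CUDA, USE_HIP or USE_MUSA. A hedged sketch of such vendor-neutral code (the helper name is illustrative; the basic runtime mappings it relies on are the ones visible in the MUSA header above and are assumed to be mirrored in the CUDA/HIP paths):
#include "vendor.h"   // selects cuda.h / hip.h / musa.h via USE_CUDA / USE_HIP / USE_MUSA
#include <cstdint>
#include <cstddef>
// Hypothetical helper: upload a packed quantized buffer to the active device.
static int8_t* upload_blocks_example(const int8_t* host, size_t num_bytes) {
    int8_t* dev = nullptr;
    if (cudaMalloc(reinterpret_cast<void**>(&dev), num_bytes) != cudaSuccess)
        return nullptr;
    if (cudaMemcpy(dev, host, num_bytes, cudaMemcpyHostToDevice) != cudaSuccess) {
        cudaFree(dev);
        return nullptr;
    }
    return dev;
}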

View file

@ -1,15 +1,15 @@
/**
* @Description :
* @Author : Azure-Tang
* @Author : Azure-Tang, Boxin Zhang
* @Date : 2024-07-25 13:38:30
* @Version : 1.0.0
* @LastEditors : kkk1nak0
* @LastEditTime : 2024-08-12 03:05:04
* @Version : 0.2.2
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "custom_gguf/ops.h"
#ifdef KTRANSFORMERS_USE_CUDA
#include "gptq_marlin/ops.h"
#endif
// Python bindings
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
@ -19,22 +19,53 @@
// namespace py = pybind11;
PYBIND11_MODULE(KTransformersOps, m) {
m.def("dequantize_q8_0", &dequantize_q8_0, "Function to dequantize q8_0 data.",
py::arg("data"), py::arg("blk_size"), py::arg("device"));
m.def("dequantize_q6_k", &dequantize_q6_k, "Function to dequantize q6_k data.",
py::arg("data"), py::arg("blk_size"), py::arg("device"));
m.def("dequantize_q5_k", &dequantize_q5_k, "Function to dequantize q5_k data.",
py::arg("data"), py::arg("blk_size"), py::arg("device"));
m.def("dequantize_q4_k", &dequantize_q4_k, "Function to dequantize q4_k data.",
py::arg("data"), py::arg("blk_size"), py::arg("device"));
m.def("dequantize_q3_k", &dequantize_q3_k, "Function to dequantize q3_k data.",
py::arg("data"), py::arg("blk_size"), py::arg("device"));
m.def("dequantize_q2_k", &dequantize_q2_k, "Function to dequantize q2_k data.",
py::arg("data"), py::arg("blk_size"), py::arg("device"));
m.def("dequantize_iq4_xs", &dequantize_iq4_xs, "Function to dequantize iq4_xs data.",
py::arg("data"), py::arg("blk_size"), py::arg("device"));
m.def("dequantize_q8_0", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) {
torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype);
return dequantize_q8_0((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype);
}, "Function to dequantize q8_0 data.",
py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype"));
m.def("dequantize_q6_k", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) {
torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype);
return dequantize_q6_k((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype);
}, "Function to dequantize q6_k data.",
py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype"));
m.def("dequantize_q5_k", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) {
torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype);
return dequantize_q5_k((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype);
}, "Function to dequantize q5_k data.",
py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype"));
m.def("dequantize_q4_k", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) {
torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype);
return dequantize_q4_k((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype);
}, "Function to dequantize q4_k data.",
py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype"));
m.def("dequantize_q3_k", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) {
torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype);
return dequantize_q3_k((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype);
}, "Function to dequantize q3_k data.",
py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype"));
m.def("dequantize_q2_k", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) {
torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype);
return dequantize_q2_k((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype);
}, "Function to dequantize q2_k data.",
py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype"));
m.def("dequantize_iq4_xs", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) {
torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype);
return dequantize_iq4_xs((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype);
}, "Function to dequantize iq4_xs data.",
py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype"));
#ifdef KTRANSFORMERS_USE_CUDA
m.def("gptq_marlin_gemm", &gptq_marlin_gemm, "Function to perform GEMM using Marlin quantization.",
py::arg("a"), py::arg("b_q_weight"), py::arg("b_scales"), py::arg("g_idx"),
py::arg("perm"), py::arg("workspace"), py::arg("num_bits"), py::arg("size_m"),
py::arg("size_n"), py::arg("size_k"), py::arg("is_k_full"));
#endif
}
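The lambdas above only unpack the Python-side arguments (a raw pointer passed as intptr_t plus the target dtype) and forward them to the C++ entry points declared in custom_gguf/ops.h. For reference, a hedged sketch of calling the q8_0 path directly from C++; the function and buffer names are illustrative only, and the 34/32 pair reflects GGUF's q8_0 block layout (see the layout note further down):
// Hypothetical caller; `raw` points at packed q8_0 blocks in host memory.
torch::Tensor dequant_q8_0_example(const int8_t* raw, int num_bytes) {
    const int blk_size    = 34;  // bytes per q8_0 block: 2 (fp16 d) + 32 (int8 qs)
    const int ele_per_blk = 32;  // dequantized elements per block
    return dequantize_q8_0(raw, num_bytes, blk_size, ele_per_blk,
                           torch::Device(torch::kCUDA, 0), torch::kFloat16);
}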

View file

@ -1,35 +0,0 @@
#include "ops.h"
// Python bindings
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>
// namespace py = pybind11;
int test(){
return 5;
}
torch::Tensor dequantize_q6_k(torch::Tensor data, int blk_size, torch::Device device);
torch::Tensor dequantize_q5_k(torch::Tensor data, int blk_size, torch::Device device);
torch::Tensor dequantize_q2_k(torch::Tensor data, int blk_size, torch::Device device);
PYBIND11_MODULE(cudaops, m) {
m.def("dequantize_q8_0", &dequantize_q8_0, "Function to dequantize q8_0 data.",
py::arg("data"), py::arg("blk_size"), py::arg("device"));
m.def("dequantize_q6_k", &dequantize_q6_k, "Function to dequantize q6_k data.",
py::arg("data"), py::arg("blk_size"), py::arg("device"));
m.def("dequantize_q5_k", &dequantize_q5_k, "Function to dequantize q5_k data.",
py::arg("data"), py::arg("blk_size"), py::arg("device"));
m.def("dequantize_q4_k", &dequantize_q4_k, "Function to dequantize q4_k data.",
py::arg("data"), py::arg("blk_size"), py::arg("device"));
m.def("dequantize_q3_k", &dequantize_q3_k, "Function to dequantize q3_k data.",
py::arg("data"), py::arg("blk_size"), py::arg("device"));
m.def("dequantize_q2_k", &dequantize_q2_k, "Function to dequantize q2_k data.",
py::arg("data"), py::arg("blk_size"), py::arg("device"));
m.def("dequantize_iq4_xs", &dequantize_iq4_xs, "Function to dequantize iq4_xs data.",
py::arg("data"), py::arg("blk_size"), py::arg("device"));
m.def("test", &test, "Function to test.");
}

View file

@ -2,26 +2,55 @@
* @Description :
* @Author : Azure-Tang, Boxin Zhang
* @Date : 2024-07-25 13:38:30
* @Version : 1.0.0
* @LastEditors : kkk1nak0
* @LastEditTime : 2024-08-12 04:18:04
* @Version : 0.2.2
* Adapted from https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c
* Copyright (c) 2023-2024 The ggml authors
* Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
*/
#include <cuda_runtime.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>
#include <cstdint>
#include <c10/cuda/CUDAGuard.h>
__global__ void dequantize_q8_0_kernel(float* output, const float* scales, const int8_t* qs, int num_blocks, int blk_size) {
int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
for (auto block_id=global_idx; block_id<num_blocks;block_id+=blockDim.x * gridDim.x){
for(int i=0;i<blk_size;i++){
float scale = scales[block_id];
output[block_id * blk_size + i] = scale * qs[block_id * blk_size + i];
__global__ void dequantize_q8_0_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
for (long long block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x){
float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);
const int8_t* cur_block = data + block_id * blk_size;
float scale = __half2float(*((half*)cur_block));
cur_block += 2;
for (int i = 0; i < ele_per_blk; i++){
output_blk[i] = scale * cur_block[i];
}
}
}
__global__ void dequantize_q8_0_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
for (long long block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x) {
__half* __restrict__ output_blk = (__half*)(output + block_id * ele_per_blk);
const int8_t* cur_block = data + block_id * blk_size;
float scale = __half2float(*((half*)cur_block));
cur_block += 2;
for (int i = 0; i < ele_per_blk; i++) {
output_blk[i] = __float2half(scale * cur_block[i]);
}
}
}
__global__ void dequantize_q8_0_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
for (long long block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x) {
nv_bfloat16* __restrict__ output_blk = (nv_bfloat16*)(output + block_id * ele_per_blk);
const int8_t* cur_block = data + block_id * blk_size;
float scale = __half2float(*((half*)cur_block));
cur_block += 2;
for (int i = 0; i < ele_per_blk; i++) {
output_blk[i] = __float2bfloat16(scale * cur_block[i]);
}
}
}
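All three q8_0 kernels above walk the same GGUF block layout, which is why the scale is decoded from the first two bytes and the data pointer then advances by 2. Schematically, each block looks like the struct below (a sketch matching ggml's block_q8_0 with QK8_0 = 32, shown only to explain the offsets; the kernels index the raw bytes directly):
#pragma pack(push, 1)
struct block_q8_0_layout {
    uint16_t d;       // fp16 scale, decoded with __half2float above
    int8_t   qs[32];  // quantized values; output[i] = scale * qs[i]
};                    // sizeof == 34 == blk_size, 32 == ele_per_blk
#pragma pack(pop)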
@ -36,13 +65,13 @@ __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t * __restrict_
}
}
__global__ void dequantize_q2_k_kernel(int8_t* data, float* output, int blk_size, int num_blocks) {
int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
for (auto block_id=global_idx; block_id<num_blocks; block_id+= blockDim.x * gridDim.x){
float* __restrict__ output_blk = (float*)(output + block_id * 256);
__global__ void dequantize_q2_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
for (long long block_id=global_idx; block_id<num_blocks; block_id+= blockDim.x * gridDim.x){
float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);
const float d = __half2float(*(reinterpret_cast<half*>(data + block_id * blk_size + 80)));
const float min = __half2float(*(reinterpret_cast<half*>(data + block_id * blk_size + 82)));
const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 80)));
const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 82)));
const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 16);
@ -70,17 +99,85 @@ __global__ void dequantize_q2_k_kernel(int8_t* data, float* output, int blk_size
}
}
__global__ void dequantize_q3_k_kernel(int8_t* data, float* output, int blk_size, int num_blocks) {
__global__ void dequantize_q2_k_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
for (long long block_id=global_idx; block_id<num_blocks; block_id+= blockDim.x * gridDim.x){
__half* __restrict__ output_blk = (__half*)(output + block_id * ele_per_blk);
int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 80)));
const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 82)));
const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 16);
int is = 0;
float dl, ml;
for (int n = 0; n < 256; n += 128) {
int shift = 0;
for (int j = 0; j < 4; ++j) {
uint8_t* scales = (uint8_t*)(data + block_id * blk_size + (is++));
uint8_t sc = *scales;
dl = d * (sc & 0xF); ml = min * (sc >> 4);
for (int l = 0; l < 16; ++l) *output_blk++ = __float2half(dl * ((int8_t)((q[l] >> shift) & 3)) - ml);
scales = (uint8_t*)(data + block_id * blk_size + (is++));
sc = *scales;
dl = d * (sc & 0xF); ml = min * (sc >> 4);
for (int l = 0; l < 16; ++l) *output_blk++ = __float2half(dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml);
shift += 2;
}
q += 32;
}
}
}
__global__ void dequantize_q2_k_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
for (long long block_id=global_idx; block_id<num_blocks; block_id+= blockDim.x * gridDim.x){
nv_bfloat16* __restrict__ output_blk = (nv_bfloat16*)(output + block_id * ele_per_blk);
const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 80)));
const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 82)));
const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 16);
int is = 0;
float dl, ml;
for (int n = 0; n < 256; n += 128) {
int shift = 0;
for (int j = 0; j < 4; ++j) {
uint8_t* scales = (uint8_t*)(data + block_id * blk_size + (is++));
uint8_t sc = *scales;
dl = d * (sc & 0xF); ml = min * (sc >> 4);
for (int l = 0; l < 16; ++l) *output_blk++ = __float2bfloat16(dl * ((int8_t)((q[l] >> shift) & 3)) - ml);
scales = (uint8_t*)(data + block_id * blk_size + (is++));
sc = *scales;
dl = d * (sc & 0xF); ml = min * (sc >> 4);
for (int l = 0; l < 16; ++l) *output_blk++ = __float2bfloat16(dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml);
shift += 2;
}
q += 32;
}
}
}
__global__ void dequantize_q3_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
const uint32_t kmask1 = 0x03030303;
const uint32_t kmask2 = 0x0f0f0f0f;
for (auto block_id=global_idx; block_id<num_blocks; block_id+= blockDim.x * gridDim.x){
float* __restrict__ output_blk = (float*)(output + block_id * 256);
for (long long block_id=global_idx; block_id<num_blocks; block_id+= blockDim.x * gridDim.x){
float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);
uint32_t aux[4];
const int8_t * scales = (const int8_t*)aux;
const float d_all = __half2float(*(reinterpret_cast<half*>(data + block_id * blk_size + 108)));
const float d_all = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 108)));
const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 32);
const uint8_t * __restrict__ hm = (uint8_t*)(data + block_id * blk_size + 0);
@ -126,19 +223,131 @@ __global__ void dequantize_q3_k_kernel(int8_t* data, float* output, int blk_size
}
}
__global__ void dequantize_q3_k_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
__global__ void dequantize_q4_k_kernel(int8_t* data, float* output, int blk_size, int num_blocks) {
int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
for (auto block_id=global_idx; block_id<num_blocks;block_id+=blockDim.x * gridDim.x){
float* __restrict__ output_blk = (float*)(output + block_id * 256);
long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
const uint32_t kmask1 = 0x03030303;
const uint32_t kmask2 = 0x0f0f0f0f;
for (long long block_id=global_idx; block_id<num_blocks; block_id+= blockDim.x * gridDim.x){
__half* __restrict__ output_blk = (__half*)(output + block_id * ele_per_blk);
uint32_t aux[4];
const int8_t * scales = (const int8_t*)aux;
const float d_all = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 108)));
const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 32);
const uint8_t * __restrict__ hm = (uint8_t*)(data + block_id * blk_size + 0);
uint8_t m = 1;
uint8_t* block_scales = (uint8_t*)(data + block_id * blk_size + 96);
for (int i = 0; i < 3; i++) {
aux[i] = 0;
for (int j = 0; j < 4; j++) {
aux[i] |= ((uint32_t)block_scales[i * 4 + j]) << (j * 8);
}
}
uint32_t tmp = aux[2];
aux[2] = ((aux[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
aux[3] = ((aux[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
aux[0] = (aux[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
aux[1] = (aux[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
int is = 0;
float dl;
for (int n = 0; n < 256; n += 128) {
int shift = 0;
for (int j = 0; j < 4; ++j) {
dl = d_all * (scales[is++] - 32);
for (int l = 0; l < 16; ++l) {
*output_blk++ = __float2half(dl * ((int8_t)((q[l+ 0] >> shift) & 3) - ((hm[l+ 0] & m) ? 0 : 4)));
}
dl = d_all * (scales[is++] - 32);
for (int l = 0; l < 16; ++l) {
*output_blk++ = __float2half(dl * ((int8_t)((q[l+16] >> shift) & 3) - ((hm[l+16] & m) ? 0 : 4)));
}
shift += 2;
m <<= 1;
}
q += 32;
}
}
}
__global__ void dequantize_q3_k_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
const uint32_t kmask1 = 0x03030303;
const uint32_t kmask2 = 0x0f0f0f0f;
for (long long block_id=global_idx; block_id<num_blocks; block_id+= blockDim.x * gridDim.x){
nv_bfloat16* __restrict__ output_blk = (nv_bfloat16*)(output + block_id * ele_per_blk);
uint32_t aux[4];
const int8_t * scales = (const int8_t*)aux;
const float d_all = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 108)));
const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 32);
const uint8_t * __restrict__ hm = (uint8_t*)(data + block_id * blk_size + 0);
uint8_t m = 1;
uint8_t* block_scales = (uint8_t*)(data + block_id * blk_size + 96);
for (int i = 0; i < 3; i++) {
aux[i] = 0;
for (int j = 0; j < 4; j++) {
aux[i] |= ((uint32_t)block_scales[i * 4 + j]) << (j * 8);
}
}
uint32_t tmp = aux[2];
aux[2] = ((aux[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
aux[3] = ((aux[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
aux[0] = (aux[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
aux[1] = (aux[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
int is = 0;
float dl;
for (int n = 0; n < 256; n += 128) {
int shift = 0;
for (int j = 0; j < 4; ++j) {
dl = d_all * (scales[is++] - 32);
for (int l = 0; l < 16; ++l) {
*output_blk++ = __float2bfloat16(dl * ((int8_t)((q[l+ 0] >> shift) & 3) - ((hm[l+ 0] & m) ? 0 : 4)));
}
dl = d_all * (scales[is++] - 32);
for (int l = 0; l < 16; ++l) {
*output_blk++ = __float2bfloat16(dl * ((int8_t)((q[l+16] >> shift) & 3) - ((hm[l+16] & m) ? 0 : 4)));
}
shift += 2;
m <<= 1;
}
q += 32;
}
}
}
__global__ void dequantize_q4_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
for (long long block_id=global_idx; block_id<num_blocks; block_id+=blockDim.x * gridDim.x){
float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);
// const uint8_t * q = data[i].qs;
const uint8_t * q = (uint8_t*)(data + block_id * 144 + 16);
const float d = __half2float(*(reinterpret_cast<half*>(data + block_id * 144 + 0)));
const float min = __half2float(*(reinterpret_cast<half*>(data + block_id * 144 + 2)));
const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * 144 + 0)));
const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * 144 + 2)));
int is = 0;
uint8_t sc, m;
for (int j = 0; j < blk_size; j += 64) {
for (int j = 0; j < ele_per_blk; j += 64) {
uint8_t* scales = (uint8_t*)(data + block_id * 144 + 4);
get_scale_min_k4(is + 0, scales, &sc, &m);
const float d1 = d * sc; const float m1 = min * m;
@ -151,13 +360,61 @@ __global__ void dequantize_q4_k_kernel(int8_t* data, float* output, int blk_size
}
}
__global__ void dequantize_q5_k_kernel(int8_t* data, float* output, int blk_size, int num_blocks) {
int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
for (auto block_id=global_idx; block_id<num_blocks; block_id+= blockDim.x * gridDim.x){
float* __restrict__ output_blk = (float*)(output + block_id * 256);
__global__ void dequantize_q4_k_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
for (long long block_id=global_idx; block_id<num_blocks; block_id+=blockDim.x * gridDim.x){
__half* __restrict__ output_blk = (__half*)(output + block_id * ele_per_blk);
// const uint8_t * q = data[i].qs;
const uint8_t * q = (uint8_t*)(data + block_id * 144 + 16);
const float d = __half2float(*(reinterpret_cast<half*>(data + block_id * blk_size + 0)));
const float min = __half2float(*(reinterpret_cast<half*>(data + block_id * blk_size + 2)));
const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * 144 + 0)));
const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * 144 + 2)));
int is = 0;
uint8_t sc, m;
for (int j = 0; j < ele_per_blk; j += 64) {
uint8_t* scales = (uint8_t*)(data + block_id * 144 + 4);
get_scale_min_k4(is + 0, scales, &sc, &m);
const float d1 = d * sc; const float m1 = min * m;
get_scale_min_k4(is + 1, scales, &sc, &m);
const float d2 = d * sc; const float m2 = min * m;
for (int l = 0; l < 32; ++l) *output_blk++ = __float2half(d1 * (q[l] & 0xF) - m1);
for (int l = 0; l < 32; ++l) *output_blk++ = __float2half(d2 * (q[l] >> 4) - m2);
q += 32; is += 2;
}
}
}
__global__ void dequantize_q4_k_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
for (long long block_id=global_idx; block_id<num_blocks; block_id+=blockDim.x * gridDim.x){
nv_bfloat16* __restrict__ output_blk = (nv_bfloat16*)(output + block_id * ele_per_blk);
// const uint8_t * q = data[i].qs;
const uint8_t * q = (uint8_t*)(data + block_id * 144 + 16);
const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * 144 + 0)));
const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * 144 + 2)));
int is = 0;
uint8_t sc, m;
for (int j = 0; j < ele_per_blk; j += 64) {
uint8_t* scales = (uint8_t*)(data + block_id * 144 + 4);
get_scale_min_k4(is + 0, scales, &sc, &m);
const float d1 = d * sc; const float m1 = min * m;
get_scale_min_k4(is + 1, scales, &sc, &m);
const float d2 = d * sc; const float m2 = min * m;
for (int l = 0; l < 32; ++l) *output_blk++ = __float2bfloat16(d1 * (q[l] & 0xF) - m1);
for (int l = 0; l < 32; ++l) *output_blk++ = __float2bfloat16(d2 * (q[l] >> 4) - m2);
q += 32; is += 2;
}
}
}
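The constants in the q4_k kernels (the 144-byte stride and the 0/2/4/16 offsets) follow ggml's block_q4_K super-block layout for QK_K = 256. A sketch for orientation only, not code the kernels include:
#pragma pack(push, 1)
struct block_q4_K_layout {
    uint16_t d;           // offset 0:  fp16 super-block scale
    uint16_t dmin;        // offset 2:  fp16 super-block minimum
    uint8_t  scales[12];  // offset 4:  packed 6-bit scales/mins, read via get_scale_min_k4
    uint8_t  qs[128];     // offset 16: 256 4-bit quants, two per byte
};                        // sizeof == 144 bytes per 256 dequantized elements
#pragma pack(pop)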
__global__ void dequantize_q5_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
for (long long block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x){
float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);
const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 0)));
const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 2)));
const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 16);
const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size + 48);
@ -180,19 +437,76 @@ __global__ void dequantize_q5_k_kernel(int8_t* data, float* output, int blk_size
}
}
__global__ void dequantize_q6_k_kernel(int8_t* data, float* output, int blk_size, int num_blocks) {
int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
for (auto block_id=global_idx; block_id<num_blocks;block_id+=blockDim.x * gridDim.x){
float* __restrict__ output_blk = (float*)(output + block_id * 256);
const float d = __half2float(*(reinterpret_cast<half*>(data + block_id * blk_size + 208)));
__global__ void dequantize_q5_k_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
for (long long block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x){
__half* __restrict__ output_blk = (__half*)(output + block_id * ele_per_blk);
const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 0)));
const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 2)));
const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 16);
const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size + 48);
int is = 0;
uint8_t sc, m;
uint8_t u1 = 1, u2 = 2;
uint8_t* scales = (uint8_t*)(data + block_id * blk_size + 4);
for (int j = 0; j < 256; j += 64) {
get_scale_min_k4(is + 0, scales, &sc, &m);
const float d1 = d * sc; const float m1 = min * m;
get_scale_min_k4(is + 1, scales, &sc, &m);
const float d2 = d * sc; const float m2 = min * m;
for (int l = 0; l < 32; ++l) *output_blk++ = __float2half(d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1);
for (int l = 0; l < 32; ++l) *output_blk++ = __float2half(d2 * ((ql[l] >> 4) + (qh[l] & u2 ? 16 : 0)) - m2);
ql += 32; is += 2;
u1 <<= 2; u2 <<= 2;
}
}
}
__global__ void dequantize_q5_k_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
for (long long block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x){
nv_bfloat16* __restrict__ output_blk = (nv_bfloat16*)(output + block_id * ele_per_blk);
const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 0)));
const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 2)));
const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 16);
const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size + 48);
int is = 0;
uint8_t sc, m;
uint8_t u1 = 1, u2 = 2;
uint8_t* scales = (uint8_t*)(data + block_id * blk_size + 4);
for (int j = 0; j < 256; j += 64) {
get_scale_min_k4(is + 0, scales, &sc, &m);
const float d1 = d * sc; const float m1 = min * m;
get_scale_min_k4(is + 1, scales, &sc, &m);
const float d2 = d * sc; const float m2 = min * m;
for (int l = 0; l < 32; ++l) *output_blk++ = __float2bfloat16(d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1);
for (int l = 0; l < 32; ++l) *output_blk++ = __float2bfloat16(d2 * ((ql[l] >> 4) + (qh[l] & u2 ? 16 : 0)) - m2);
ql += 32; is += 2;
u1 <<= 2; u2 <<= 2;
}
}
}
__global__ void dequantize_q6_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
for (long long block_id=global_idx; block_id<num_blocks;block_id+=blockDim.x * gridDim.x){
float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);
const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 208)));
const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size);
const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 128);
const int8_t * __restrict__ sc = (int8_t*)(data + block_id * blk_size + 192);
//if (blk_size == 256){
for (int n = 0; n < blk_size; n += 128) {
for (int n = 0; n < ele_per_blk; n += 128) {
for (int l = 0; l < 32; ++l) {
int is = l/16;
const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
@ -212,14 +526,76 @@ __global__ void dequantize_q6_k_kernel(int8_t* data, float* output, int blk_size
}
}
__global__ void dequantize_q6_k_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
for (long long block_id=global_idx; block_id<num_blocks;block_id+=blockDim.x * gridDim.x){
__half* __restrict__ output_blk = (__half*)(output + block_id * ele_per_blk);
const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 208)));
const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size);
const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 128);
const int8_t * __restrict__ sc = (int8_t*)(data + block_id * blk_size + 192);
for (int n = 0; n < ele_per_blk; n += 128) {
for (int l = 0; l < 32; ++l) {
int is = l/16;
const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
output_blk[l + 0] = __float2half(d * sc[is + 0] * q1);
output_blk[l + 32] = __float2half(d * sc[is + 2] * q2);
output_blk[l + 64] = __float2half(d * sc[is + 4] * q3);
output_blk[l + 96] = __float2half(d * sc[is + 6] * q4);
}
output_blk += 128;
ql += 64;
qh += 32;
sc += 8;
}
}
}
__global__ void dequantize_q6_k_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
for (long long block_id=global_idx; block_id<num_blocks;block_id+=blockDim.x * gridDim.x){
nv_bfloat16* __restrict__ output_blk = (nv_bfloat16*)(output + block_id * ele_per_blk);
const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 208)));
const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size);
const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 128);
const int8_t * __restrict__ sc = (int8_t*)(data + block_id * blk_size + 192);
for (int n = 0; n < ele_per_blk; n += 128) {
for (int l = 0; l < 32; ++l) {
int is = l/16;
const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
output_blk[l + 0] = __float2bfloat16(d * sc[is + 0] * q1);
output_blk[l + 32] = __float2bfloat16(d * sc[is + 2] * q2);
output_blk[l + 64] = __float2bfloat16(d * sc[is + 4] * q3);
output_blk[l + 96] = __float2bfloat16(d * sc[is + 6] * q4);
}
output_blk += 128;
ql += 64;
qh += 32;
sc += 8;
}
}
}
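Similarly, the q6_k kernels read offsets 0/128/192/208 because ggml's block_q6_K (QK_K = 256) is laid out as below; again a sketch for orientation only:
#pragma pack(push, 1)
struct block_q6_K_layout {
    uint8_t  ql[128];     // offset 0:   low 4 bits of each 6-bit quant
    uint8_t  qh[64];      // offset 128: high 2 bits, four values per byte
    int8_t   scales[16];  // offset 192: per-16-element 8-bit scales
    uint16_t d;           // offset 208: fp16 super-block scale
};                        // sizeof == 210 bytes per 256 dequantized elements
#pragma pack(pop)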
static constexpr __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
__global__ void dequantize_iq4_xs_kernel(int8_t* data, float* output, int blk_size, int num_blocks) {
int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
for (auto block_id=global_idx; block_id<num_blocks; block_id+=blockDim.x * gridDim.x) {
float* __restrict__ output_blk = (float*)(output + block_id * 256);
const float d = __half2float(*(reinterpret_cast<half*>(data + block_id * blk_size)));
const uint16_t scales_h = *(reinterpret_cast<uint16_t*>(data + block_id * blk_size + 2));
__global__ void dequantize_iq4_xs_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
for (long long block_id=global_idx; block_id<num_blocks; block_id+=blockDim.x * gridDim.x) {
float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);
const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size)));
const uint16_t scales_h = *(reinterpret_cast<const uint16_t*>(data + block_id * blk_size + 2));
const uint8_t* scales_l = (uint8_t*)(data + block_id * blk_size + 2 + 2);
const uint8_t* qs = (uint8_t*)(data + block_id * blk_size + 2 + 2 + 4);
@ -236,152 +612,267 @@ __global__ void dequantize_iq4_xs_kernel(int8_t* data, float* output, int blk_si
}
}
torch::Tensor dequantize_q8_0(torch::Tensor data, int blk_size, torch::Device device) {
int num_blocks = data.numel() / blk_size;
__global__ void dequantize_iq4_xs_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
for (long long block_id=global_idx; block_id<num_blocks; block_id+=blockDim.x * gridDim.x) {
__half* __restrict__ output_blk = (__half*)(output + block_id * ele_per_blk);
const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size)));
const uint16_t scales_h = *(reinterpret_cast<const uint16_t*>(data + block_id * blk_size + 2));
const uint8_t* scales_l = (uint8_t*)(data + block_id * blk_size + 2 + 2);
const uint8_t* qs = (uint8_t*)(data + block_id * blk_size + 2 + 2 + 4);
for (int ib = 0; ib < 8; ++ib) {
const int ls = ((scales_l[ib / 2] >> 4 * (ib % 2)) & 0xf) | (((scales_h >> 2 * ib) & 3) << 4);
const float dl = d * (ls - 32);
for (int j = 0; j < 16; ++j) {
output_blk[j + 0] = __float2half(dl * kvalues_iq4nl[qs[j] & 0xf]);
output_blk[j + 16] = __float2half(dl * kvalues_iq4nl[qs[j] >> 4]);
}
output_blk += 32;
qs += 16;
}
}
}
__global__ void dequantize_iq4_xs_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
for (long long block_id=global_idx; block_id<num_blocks; block_id+=blockDim.x * gridDim.x) {
nv_bfloat16* __restrict__ output_blk = (nv_bfloat16*)(output + block_id * ele_per_blk);
const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size)));
const uint16_t scales_h = *(reinterpret_cast<const uint16_t*>(data + block_id * blk_size + 2));
const uint8_t* scales_l = (uint8_t*)(data + block_id * blk_size + 2 + 2);
const uint8_t* qs = (uint8_t*)(data + block_id * blk_size + 2 + 2 + 4);
for (int ib = 0; ib < 8; ++ib) {
const int ls = ((scales_l[ib / 2] >> 4 * (ib % 2)) & 0xf) | (((scales_h >> 2 * ib) & 3) << 4);
const float dl = d * (ls - 32);
for (int j = 0; j < 16; ++j) {
output_blk[j + 0] = __float2bfloat16(dl * kvalues_iq4nl[qs[j] & 0xf]);
output_blk[j + 16] = __float2bfloat16(dl * kvalues_iq4nl[qs[j] >> 4]);
}
output_blk += 32;
qs += 16;
}
}
}
torch::Tensor dequantize_q8_0(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) {
int num_blocks = num_bytes / blk_size;
const at::cuda::OptionalCUDAGuard device_guard(device);
// create gpu
auto options_scales = torch::TensorOptions().dtype(torch::kFloat32).device(device).memory_format(torch::MemoryFormat::Contiguous);
auto options_qs = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
auto scales_gpu = torch::empty({{num_blocks, 1}}, options_scales);
auto qs_gpu = torch::empty({num_blocks, 32}, options_qs);
// read on cpu
options_scales = torch::TensorOptions().dtype(torch::kFloat16).device(torch::kCPU);
options_qs = torch::TensorOptions().dtype(torch::kInt8).device(torch::kCPU);
auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
auto data_gpu = torch::empty({ num_bytes }, options);
// // reinterpret
auto scales = torch::from_blob(data.data_ptr(), {num_blocks, 1 + 16}, options_scales).slice(1, 0, 1);
auto qs = torch::from_blob(data.data_ptr(), {num_blocks, 2 + 32}, options_qs).slice(1, 2);
auto scales_f32 = scales.to(torch::kFloat32);
scales_gpu.copy_(scales_f32, false);
qs_gpu.copy_(qs, false);
cudaMemcpy(data_gpu.data_ptr<int8_t>(), data, num_bytes, cudaMemcpyHostToDevice);
//data_gpu.copy_(data, false);
// Create output tensor
auto output = torch::zeros_like(qs, torch::dtype(torch::kFloat32).device(device));
auto output = torch::zeros({ num_blocks, 32 }, torch::dtype(target_dtype).device(device));
// Launch kernel
dequantize_q8_0_kernel<<< 512, 256 >>>(
output.data_ptr<float>(), scales_gpu.data_ptr<float>(), qs_gpu.data_ptr<int8_t>(), num_blocks, 32);
switch (target_dtype) {
case torch::kFloat16:
dequantize_q8_0_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
break;
case torch::kBFloat16:
dequantize_q8_0_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
break;
case torch::kFloat32:
dequantize_q8_0_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, ele_per_blk, num_blocks);
break;
default:
printf("target type not support\n");
exit(0);
}
cudaDeviceSynchronize();
return output;
}
torch::Tensor dequantize_q6_k(torch::Tensor data, int blk_size, torch::Device device) {
torch::Tensor dequantize_q6_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) {
// num_bytes % blk_size should be 0; otherwise this is an error
int num_blocks = data.numel() / blk_size;
int num_blocks = num_bytes / blk_size;
const at::cuda::OptionalCUDAGuard device_guard(device);
auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
auto data_gpu = torch::empty({data.numel()}, options);
auto data_gpu = torch::empty({num_bytes}, options);
data_gpu.copy_(data, false);
cudaMemcpy(data_gpu.data_ptr<int8_t>(), data, num_bytes, cudaMemcpyHostToDevice);
//data_gpu.copy_(data, false);
// Create output tensor
auto output = torch::zeros({num_blocks, 256}, torch::dtype(torch::kFloat32).device(device));
// Launch kernel
dequantize_q6_k_kernel<<< 512, 256 >>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, num_blocks);
// dequantize_q6_k_kernel<<< 512, 256 >>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), 256, num_blocks);
auto output = torch::zeros({num_blocks, 256}, torch::dtype(target_dtype).device(device));
switch (target_dtype) {
case torch::kFloat16:
dequantize_q6_k_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
break;
case torch::kBFloat16:
dequantize_q6_k_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
break;
case torch::kFloat32:
dequantize_q6_k_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, ele_per_blk, num_blocks);
break;
default:
printf("target type not support\n");
exit(0);
}
cudaDeviceSynchronize();
return output;
}
torch::Tensor dequantize_q5_k(torch::Tensor data, int blk_size, torch::Device device) {
int num_blocks = data.numel() / blk_size;
torch::Tensor dequantize_q5_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) {
int num_blocks = num_bytes / blk_size;
const at::cuda::OptionalCUDAGuard device_guard(device);
auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
auto data_gpu = torch::empty({data.numel()}, options);
auto data_gpu = torch::empty({num_bytes}, options);
data_gpu.copy_(data, false);
cudaMemcpy(data_gpu.data_ptr<int8_t>(), data, num_bytes, cudaMemcpyHostToDevice);
//data_gpu.copy_(data, false);
// Create output tensor
auto output = torch::zeros({num_blocks, 256}, torch::dtype(torch::kFloat32).device(device));
// Launch kernel
dequantize_q5_k_kernel<<< 512, 256 >>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, num_blocks);
auto output = torch::zeros({num_blocks, 256}, torch::dtype(target_dtype).device(device));
switch (target_dtype) {
case torch::kFloat16:
dequantize_q5_k_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
break;
case torch::kBFloat16:
dequantize_q5_k_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
break;
case torch::kFloat32:
dequantize_q5_k_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, ele_per_blk, num_blocks);
break;
default:
printf("target type not support\n");
exit(0);
}
cudaDeviceSynchronize();
return output;
}
torch::Tensor dequantize_q4_k(torch::Tensor data, int blk_size, torch::Device device) {
torch::Tensor dequantize_q4_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) {
// num_bytes % blk_size should be 0; otherwise this is an error
int num_blocks = data.numel() / blk_size;
int num_blocks = num_bytes / blk_size;
const at::cuda::OptionalCUDAGuard device_guard(device);
auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
auto data_gpu = torch::empty({data.numel()}, options);
auto data_gpu = torch::empty({num_bytes}, options);
data_gpu.copy_(data, false);
cudaMemcpy(data_gpu.data_ptr<int8_t>(), data, num_bytes, cudaMemcpyHostToDevice);
//data_gpu.copy_(data, false);
// Create output tensor
auto output = torch::zeros({num_blocks, 256}, torch::dtype(torch::kFloat32).device(device));
// Launch kernel
dequantize_q4_k_kernel<<< 512, 256 >>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), 256, num_blocks);
auto output = torch::zeros({num_blocks, 256}, torch::dtype(target_dtype).device(device));
switch (target_dtype) {
case torch::kFloat16:
dequantize_q4_k_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
break;
case torch::kBFloat16:
dequantize_q4_k_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
break;
case torch::kFloat32:
dequantize_q4_k_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, ele_per_blk, num_blocks);
break;
default:
printf("target type not support\n");
exit(0);
}
cudaDeviceSynchronize();
return output;
}
torch::Tensor dequantize_q3_k(torch::Tensor data, int blk_size, torch::Device device) {
int num_blocks = data.numel() / blk_size;
torch::Tensor dequantize_q3_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) {
int num_blocks = num_bytes / blk_size;
const at::cuda::OptionalCUDAGuard device_guard(device);
auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
auto data_gpu = torch::empty({data.numel()}, options);
auto data_gpu = torch::empty({num_bytes}, options);
data_gpu.copy_(data, false);
cudaMemcpy(data_gpu.data_ptr<int8_t>(), data, num_bytes, cudaMemcpyHostToDevice);
//data_gpu.copy_(data, false);
// Create output tensor
auto output = torch::zeros({num_blocks, 256}, torch::dtype(torch::kFloat32).device(device));
// Launch kernel
dequantize_q3_k_kernel<<< 512, 256 >>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, num_blocks);
auto output = torch::zeros({num_blocks, 256}, torch::dtype(target_dtype).device(device));
switch (target_dtype) {
case torch::kFloat16:
dequantize_q3_k_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
break;
case torch::kBFloat16:
dequantize_q3_k_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
break;
case torch::kFloat32:
dequantize_q3_k_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, ele_per_blk, num_blocks);
break;
default:
printf("target type not support\n");
exit(0);
}
cudaDeviceSynchronize();
return output;
}
torch::Tensor dequantize_q2_k(torch::Tensor data, int blk_size, torch::Device device) {
int num_blocks = data.numel() / blk_size;
torch::Tensor dequantize_q2_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) {
int num_blocks = num_bytes / blk_size;
const at::cuda::OptionalCUDAGuard device_guard(device);
auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
auto data_gpu = torch::empty({data.numel()}, options);
auto data_gpu = torch::empty({num_bytes}, options);
data_gpu.copy_(data, false);
cudaMemcpy(data_gpu.data_ptr<int8_t>(), data, num_bytes, cudaMemcpyHostToDevice);
//data_gpu.copy_(data, false);
// Create output tensor
auto output = torch::zeros({num_blocks, 256}, torch::dtype(torch::kFloat32).device(device));
// Launch kernel
dequantize_q2_k_kernel<<< 512, 256 >>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, num_blocks);
auto output = torch::zeros({num_blocks, 256}, torch::dtype(target_dtype).device(device));
switch (target_dtype) {
case torch::kFloat16:
dequantize_q2_k_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
break;
case torch::kBFloat16:
dequantize_q2_k_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
break;
case torch::kFloat32:
dequantize_q2_k_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, ele_per_blk, num_blocks);
break;
default:
printf("target type not support\n");
exit(0);
}
cudaDeviceSynchronize();
return output;
}
torch::Tensor dequantize_iq4_xs(torch::Tensor data, int blk_size, torch::Device device) {
int num_blocks = data.numel() / blk_size;
torch::Tensor dequantize_iq4_xs(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) {
int num_blocks = num_bytes / blk_size;
const at::cuda::OptionalCUDAGuard device_guard(device);
auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
auto data_gpu = torch::empty({data.numel()}, options);
auto data_gpu = torch::empty({num_bytes}, options);
data_gpu.copy_(data, false);
cudaMemcpy(data_gpu.data_ptr<int8_t>(), data, num_bytes, cudaMemcpyHostToDevice);
//data_gpu.copy_(data, false);
// Create output tensor
auto output = torch::zeros({num_blocks, 256}, torch::dtype(torch::kFloat32).device(device));
// Launch kernel
dequantize_iq4_xs_kernel<<< 512, 256 >>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, num_blocks);
auto output = torch::zeros({num_blocks, 256}, torch::dtype(target_dtype).device(device));
switch (target_dtype) {
case torch::kFloat16:
dequantize_iq4_xs_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
break;
case torch::kBFloat16:
dequantize_iq4_xs_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
break;
case torch::kFloat32:
dequantize_iq4_xs_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, ele_per_blk, num_blocks);
break;
default:
printf("target type not support\n");
exit(0);
}
cudaDeviceSynchronize();
return output;
}

View file

@ -13,10 +13,10 @@
#include <torch/extension.h>
#include <torch/torch.h>
torch::Tensor dequantize_q8_0(torch::Tensor data, int blk_size, torch::Device device);
torch::Tensor dequantize_q6_k(torch::Tensor data, int blk_size, torch::Device device);
torch::Tensor dequantize_q5_k(torch::Tensor data, int blk_size, torch::Device device);
torch::Tensor dequantize_q4_k(torch::Tensor data, int blk_size, torch::Device device);
torch::Tensor dequantize_q3_k(torch::Tensor data, int blk_size, torch::Device device);
torch::Tensor dequantize_q2_k(torch::Tensor data, int blk_size, torch::Device device);
torch::Tensor dequantize_iq4_xs(torch::Tensor data, int blk_size, torch::Device device);
torch::Tensor dequantize_q8_0(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
torch::Tensor dequantize_q6_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
torch::Tensor dequantize_q5_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
torch::Tensor dequantize_q4_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
torch::Tensor dequantize_q3_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
torch::Tensor dequantize_q2_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
torch::Tensor dequantize_iq4_xs(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);

View file

@ -0,0 +1,16 @@
import os
import sys
sys.path.insert(0,"/home/zbx/ktransformers")
from ktransformers.util.custom_gguf import GGUFLoader
import torch
gguf_loader_1 = GGUFLoader("/mnt/data/model/DeepseekV3-q4km-gguf")
gguf_loader_2 = GGUFLoader("/mnt/data/chenht/model/gguf_for_ktransformers/DeepSeek-V3-bf16/")
torch.set_default_dtype(torch.bfloat16)
tensor_1 = gguf_loader_1.load_gguf_tensor("blk.0.attn_kv_a_mqa.weight", "cuda")
tensor_2 = gguf_loader_2.load_gguf_tensor("blk.0.attn_kv_a_mqa.weight", "cuda")
print(tensor_1[0, -64:])
print(tensor_2[0, -64:])

View file

@ -90,7 +90,7 @@ def marlin_quantize(
assert group_size <= size_k
# Quantize (and apply act_order if provided)
w_ref, q_w, s, g_idx, rand_perm = quantize_weights(w, num_bits, group_size,
q_w, s, g_idx, rand_perm = quantize_weights(w, num_bits, group_size,
act_order)
# For act_order, sort the "weights" and "g_idx" so that group ids are
@ -107,7 +107,7 @@ def marlin_quantize(
marlin_scale_perm_single[num_bits])
# Create result
res_list = [w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, rand_perm]
res_list = [marlin_q_w, marlin_s, g_idx, sort_indices, rand_perm]
for i in range(len(res_list)):
res_list[i] = res_list[i].to(w.device)

View file

@ -11,8 +11,7 @@ def get_pack_factor(num_bits):
return 32 // num_bits
def permute_rows(q_w: torch.Tensor, w_ref: torch.Tensor, group_size: int):
assert q_w.shape == w_ref.shape
def permute_rows(q_w: torch.Tensor, group_size: int):
orig_device = q_w.device
k_size, _ = q_w.shape
@ -26,10 +25,8 @@ def permute_rows(q_w: torch.Tensor, w_ref: torch.Tensor, group_size: int):
g_idx = g_idx[rand_perm].contiguous()
q_w = q_w[rand_perm, :].contiguous()
w_ref = w_ref[rand_perm, :].contiguous()
return (
w_ref.to(device=orig_device),
q_w.to(device=orig_device),
g_idx.to(device=orig_device),
rand_perm.to(device=orig_device),
@ -69,9 +66,6 @@ def quantize_weights(w: torch.Tensor, num_bits: int, group_size: int,
q_w += half_q_val
q_w = torch.clamp(q_w, 0, max_q_val)
# Compute ref (dequantized)
w_ref = (q_w - half_q_val).half() * s
# Restore original shapes
if group_size < size_k:
@ -82,7 +76,6 @@ def quantize_weights(w: torch.Tensor, num_bits: int, group_size: int,
return w
q_w = reshape_w(q_w)
w_ref = reshape_w(w_ref)
s = s.reshape((-1, size_n)).contiguous()
@ -95,10 +88,9 @@ def quantize_weights(w: torch.Tensor, num_bits: int, group_size: int,
), "For act_order, groupsize = {} must be less than size_k = {}".format(
group_size, size_k)
w_ref, q_w, g_idx, rand_perm = permute_rows(q_w, w_ref, group_size)
q_w, g_idx, rand_perm = permute_rows(q_w, group_size)
return (
w_ref.to(device=orig_device),
q_w.to(device=orig_device),
s.to(device=orig_device),
g_idx.to(device=orig_device),
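Both hunks above drop the dequantized reference weights: `quantize_weights` no longer builds or returns `w_ref`, and `marlin_quantize`'s result list shrinks from six entries to five, so callers unpack one fewer value. A hedged sketch of the updated call, assuming `marlin_quantize` takes `(w, num_bits, group_size, act_order)` like the `quantize_weights` call shown above; everything else is illustrative:

import torch
# Illustrative only; marlin_quantize is the function patched above (import path not shown in this diff).

w = torch.randn(4096, 4096, dtype=torch.half, device="cuda")
marlin_q_w, marlin_s, g_idx, sort_indices, rand_perm = marlin_quantize(w, 4, 128, False)
# before this change: w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, rand_perm = marlin_quantize(...)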

View file

@@ -10,6 +10,8 @@
#include "kvcache.h"
#include <chrono>
void KVCache::attention_kvhead_(const uint16_t *q_in_data, ggml_fp16_t *output,
float *attn_lse, int batch_size,
Backend *backend) {

View file

@@ -9,6 +9,9 @@
**/
#include "kvcache.h"
#include <chrono>
void KVCache::load_kvcache(std::string tensor_file_path, Backend *backend) {
// Timer start
auto start = std::chrono::high_resolution_clock::now();

View file

@@ -10,6 +10,8 @@
#include "kvcache.h"
#include <chrono>
void KVCache::get_anchor_one_block(ggml_fp16_t *anchor, int layer_id,
int block_idx, Backend *backend) {
// Timer start

View file

@@ -10,6 +10,8 @@
#include "kvcache.h"
#include <chrono>
std::string ggml_type_to_string(ggml_type type) {
switch (type) {
case GGML_TYPE_F32:

View file

@@ -0,0 +1,193 @@
# Adapted from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/inference/kernel.py
from typing import Tuple
import torch
import triton
import triton.language as tl
from triton import Config
@triton.jit
def act_quant_kernel(x_ptr, y_ptr, s_ptr, BLOCK_SIZE: tl.constexpr):
"""
Quantizes the input tensor `x_ptr` and stores the result in `y_ptr` and the scaling factor in `s_ptr`.
Args:
x_ptr (triton.Pointer): Pointer to the input tensor.
y_ptr (triton.Pointer): Pointer to the output tensor where quantized values will be stored.
s_ptr (triton.Pointer): Pointer to the output tensor where scaling factors will be stored.
BLOCK_SIZE (tl.constexpr): The size of the block to be processed by each program instance.
Returns:
None
"""
pid = tl.program_id(axis=0)
offs = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
x = tl.load(x_ptr + offs).to(tl.float32)
s = tl.max(tl.abs(x)) / 448.
y = x / s
y = y.to(y_ptr.dtype.element_ty)
tl.store(y_ptr + offs, y)
tl.store(s_ptr + pid, s)
def act_quant(x: torch.Tensor, block_size: int = 128) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Quantizes the input tensor `x` using block-wise quantization.
Args:
x (torch.Tensor): The input tensor to be quantized. Must be contiguous and its last dimension size must be divisible by `block_size`.
block_size (int, optional): The size of the blocks to be used for quantization. Default is 128.
Returns:
Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
- The quantized tensor with dtype `torch.float8_e4m3fn`.
- A tensor of scaling factors with dtype `torch.float32`.
"""
assert x.is_contiguous(), 'Input tensor must be contiguous'
assert x.size(-1) % block_size == 0, f'Last dimension size must be divisible by block_size (block_size={block_size})'
y = torch.empty_like(x, dtype=torch.float8_e4m3fn)
s = x.new_empty(*x.size()[:-1], x.size(-1) // block_size, dtype=torch.float32)
grid = lambda meta: (triton.cdiv(x.numel(), meta['BLOCK_SIZE']), )
act_quant_kernel[grid](x, y, s, BLOCK_SIZE=block_size)
return y, s
@triton.jit
def weight_dequant_kernel(x_ptr, s_ptr, y_ptr, M, N, BLOCK_SIZE: tl.constexpr):
"""
Dequantizes weights using the provided scaling factors and stores the result.
Args:
x_ptr (tl.pointer): Pointer to the quantized weights.
s_ptr (tl.pointer): Pointer to the scaling factors.
y_ptr (tl.pointer): Pointer to the output buffer for dequantized weights.
M (int): Number of rows in the weight matrix.
N (int): Number of columns in the weight matrix.
BLOCK_SIZE (tl.constexpr): Size of the block for tiling.
Returns:
None
"""
pid_m = tl.program_id(axis=0)
pid_n = tl.program_id(axis=1)
n = tl.cdiv(N, BLOCK_SIZE)
offs_m = pid_m * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
offs_n = pid_n * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
offs = offs_m[:, None] * N + offs_n[None, :]
mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
x = tl.load(x_ptr + offs, mask=mask).to(tl.float32)
s = tl.load(s_ptr + pid_m * n + pid_n)
y = x * s
tl.store(y_ptr + offs, y, mask=mask)
def weight_dequant(x: torch.Tensor, s: torch.Tensor, block_size: int = 128) -> torch.Tensor:
"""
Dequantizes the given weight tensor using the provided scale tensor.
Args:
x (torch.Tensor): The quantized weight tensor of shape (M, N).
s (torch.Tensor): The scale tensor of shape (M, N).
block_size (int, optional): The block size to use for dequantization. Defaults to 128.
Returns:
torch.Tensor: The dequantized weight tensor of the same shape as `x`.
Raises:
AssertionError: If `x` or `s` are not contiguous or if their dimensions are not 2.
"""
assert x.is_contiguous() and s.is_contiguous(), 'Input tensors must be contiguous'
assert x.dim() == 2 and s.dim() == 2, 'Input tensors must have 2 dimensions'
M, N = x.size()
y = torch.empty_like(x, dtype=torch.get_default_dtype())
grid = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE']), triton.cdiv(N, meta['BLOCK_SIZE']))
with torch.cuda.device(x.device):
weight_dequant_kernel[grid](x, s, y, M, N, BLOCK_SIZE=block_size)
return y
fp8_gemm_configs = [
Config({'BLOCK_SIZE_M': block_m, 'BLOCK_SIZE_N': block_n, 'BLOCK_SIZE_K': 128}, num_stages=num_stages, num_warps=8)
for block_m in [16, 32, 64] for block_n in [32, 64, 128] for num_stages in [3, 4, 5, 6]
]
@triton.autotune(configs=fp8_gemm_configs, key=['N', 'K'])
@triton.jit
def fp8_gemm_kernel(a_ptr, b_ptr, c_ptr,
a_s_ptr, b_s_ptr,
M, N: tl.constexpr, K: tl.constexpr,
BLOCK_SIZE_M: tl.constexpr,
BLOCK_SIZE_N: tl.constexpr,
BLOCK_SIZE_K: tl.constexpr):
"""
Performs a matrix multiplication operation on FP8 matrices with scaling factors.
Args:
a_ptr (tl.tensor): Pointer to the first input matrix A.
b_ptr (tl.tensor): Pointer to the second input matrix B.
c_ptr (tl.tensor): Pointer to the output matrix C.
a_s_ptr (tl.tensor): Pointer to the scaling factors for matrix A.
b_s_ptr (tl.tensor): Pointer to the scaling factors for matrix B.
M (int): Number of rows in matrix A and C.
N (tl.constexpr): Number of columns in matrix B and C.
K (tl.constexpr): Number of columns in matrix A and rows in matrix B.
BLOCK_SIZE_M (tl.constexpr): Block size for the M dimension.
BLOCK_SIZE_N (tl.constexpr): Block size for the N dimension.
BLOCK_SIZE_K (tl.constexpr): Block size for the K dimension.
Returns:
None
"""
pid_m = tl.program_id(axis=0)
pid_n = tl.program_id(axis=1)
k = tl.cdiv(K, BLOCK_SIZE_K)
offs_m = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
offs_n = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
offs_k = tl.arange(0, BLOCK_SIZE_K)
a_ptrs = a_ptr + offs_m[:, None] * K + offs_k[None, :]
b_ptrs = b_ptr + offs_n[None, :] * K + offs_k[:, None]
a_s_ptrs = a_s_ptr + offs_m * k
b_s_ptrs = b_s_ptr + (offs_n // BLOCK_SIZE_K) * k
accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
for i in range(k):
a = tl.load(a_ptrs, mask=offs_k[None, :] < K - i * BLOCK_SIZE_K, other=0.0)
b = tl.load(b_ptrs, mask=offs_k[:, None] < K - i * BLOCK_SIZE_K, other=0.0)
a_s = tl.load(a_s_ptrs)
b_s = tl.load(b_s_ptrs)
accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :]
a_ptrs += BLOCK_SIZE_K
b_ptrs += BLOCK_SIZE_K
a_s_ptrs += 1
b_s_ptrs += 1
c = accumulator.to(c_ptr.dtype.element_ty)
offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
c_ptrs = c_ptr + offs_m[:, None] * N + offs_n[None, :]
mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
tl.store(c_ptrs, c, mask=mask)
def fp8_gemm(a: torch.Tensor, a_s: torch.Tensor, b: torch.Tensor, b_s: torch.Tensor):
"""
Perform a matrix multiplication using FP8 precision.
Args:
a (torch.Tensor): The first input matrix, must be contiguous.
a_s (torch.Tensor): The scaling factor for the first input matrix, must be contiguous.
b (torch.Tensor): The second input matrix, must be contiguous.
b_s (torch.Tensor): The scaling factor for the second input matrix, must be contiguous.
Returns:
torch.Tensor: The result of the matrix multiplication.
"""
assert a.is_contiguous() and b.is_contiguous(), 'Input tensors must be contiguous'
assert a_s.is_contiguous() and b_s.is_contiguous(), 'Scaling factor tensors must be contiguous'
K = a.size(-1)
M = a.numel() // K
N = b.size(0)
c = a.new_empty(*a.size()[:-1], N, dtype=torch.get_default_dtype())
grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']), triton.cdiv(N, META['BLOCK_SIZE_N']))
fp8_gemm_kernel[grid](a, b, c, a_s, b_s, M, N, K)
return c
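The file above vendors DeepSeek-V3's Triton FP8 kernels. A minimal usage sketch, not part of the commit: it assumes a GPU plus a Triton build with `float8_e4m3fn` support, and it fabricates a block-quantized weight (one scale per 128x128 tile, the layout the GEMM indexes) purely to exercise the expected shapes.

import torch
# act_quant / weight_dequant / fp8_gemm are the functions defined above (import omitted).

M, K, N = 4, 7168, 2048                                    # illustrative sizes; K and N divisible by 128
x = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
x_q, x_s = act_quant(x, block_size=128)                    # FP8 activations + per-128-column scales, shape (M, K // 128)

w = torch.randn(N, K, device="cuda", dtype=torch.bfloat16)
w_q = w.clamp(-448, 448).to(torch.float8_e4m3fn)           # fake pre-quantized weight values
w_s = torch.ones(N // 128, K // 128, device="cuda", dtype=torch.float32)  # one scale per 128x128 tile

y = fp8_gemm(x_q, x_s, w_q, w_s)                           # (M, N) in torch.get_default_dtype()
w_fp32 = weight_dequant(w_q, w_s)                          # (N, K) dequantized copy for inspection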

View file

@@ -28,8 +28,9 @@ from ktransformers.models.modeling_qwen2_moe import Qwen2MoeForCausalLM
from ktransformers.models.modeling_deepseek_v3 import DeepseekV3ForCausalLM
from ktransformers.models.modeling_llama import LlamaForCausalLM
from ktransformers.models.modeling_mixtral import MixtralForCausalLM
from ktransformers.util.utils import prefill_and_generate
from ktransformers.util.utils import prefill_and_generate, get_compute_capability
from ktransformers.server.config.config import Config
from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled
custom_models = {
"DeepseekV2ForCausalLM": DeepseekV2ForCausalLM,
@@ -53,7 +54,7 @@ default_optimize_rules = {
def local_chat(
model_path: str | None = None,
optimize_rule_path: str = None,
optimize_config_path: str = None,
gguf_path: str | None = None,
max_new_tokens: int = 300,
cpu_infer: int = Config().cpu_infer,
@@ -61,9 +62,9 @@ def local_chat(
prompt_file : str | None = None,
mode: str = "normal",
force_think: bool = False,
chunk_prefill_size: int = 8192
):
torch.set_grad_enabled(False)
Config().cpu_infer = cpu_infer
@@ -94,12 +95,12 @@ def local_chat(
config, trust_remote_code=True, attn_implementation="flash_attention_2"
)
if optimize_rule_path is None:
if optimize_config_path is None:
if config.architectures[0] in default_optimize_rules:
print("using default_optimize_rule for", config.architectures[0])
optimize_rule_path = default_optimize_rules[config.architectures[0]]
optimize_config_path = default_optimize_rules[config.architectures[0]]
else:
optimize_rule_path = input(
optimize_config_path = input(
"please input the path of your rule file(yaml file containing optimize rules):"
)
@@ -107,15 +108,15 @@ def local_chat(
gguf_path = input(
"please input the path of your gguf file(gguf file in the dir containing input gguf file must all belong to current model):"
)
optimize_and_load_gguf(model, optimize_rule_path, gguf_path, config)
optimize_and_load_gguf(model, optimize_config_path, gguf_path, config)
try:
model.generation_config = GenerationConfig.from_pretrained(model_path)
except:
except Exception as e:
print(f"generation config can't auto create, make default. Message: {e}")
gen_config = GenerationConfig(
max_length=128,
temperature=0.7,
top_p=0.9,
temperature=0.6,
top_p=0.95,
do_sample=True
)
model.generation_config = gen_config
@@ -167,11 +168,15 @@ def local_chat(
if mode == 'long_context':
assert Config().long_context_config['max_seq_len'] > input_tensor.shape[1] + max_new_tokens, \
"please change max_seq_len in ~/.ktransformers/config.yaml"
torch.set_default_dtype(
torch.bfloat16
) # TODO: Remove this, replace dtype using config
if system != "Windows" and (config.architectures[0] == "DeepseekV2ForCausalLM" or config.architectures[0] == "DeepseekV3ForCausalLM") and flashinfer_enabled and get_compute_capability() >= 8:
generated = prefill_and_generate(
model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode, force_think
model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode = mode, force_think = force_think, chunk_prefill_size = chunk_prefill_size,
use_flashinfer_mla = True, num_heads = config.num_attention_heads, head_dim_ckv = config.kv_lora_rank, head_dim_kpe = config.qk_rope_head_dim, q_head_dim = config.qk_rope_head_dim + config.qk_nope_head_dim
)
else:
generated = prefill_and_generate(
model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode = mode, force_think = force_think, chunk_prefill_size = chunk_prefill_size,
)
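As a usage sketch (not part of the commit), the renamed `optimize_config_path` argument and the new `chunk_prefill_size` knob are passed like this; the paths are placeholders and the import path is assumed:

from ktransformers.local_chat import local_chat   # assumed module path

local_chat(
    model_path="/path/to/DeepSeek-V3",             # HF config/tokenizer directory (placeholder)
    gguf_path="/path/to/DeepSeek-V3-GGUF",         # directory holding the GGUF shards (placeholder)
    optimize_config_path=None,                     # None -> default_optimize_rules lookup by architecture
    max_new_tokens=300,
    mode="normal",
    force_think=False,
    chunk_prefill_size=8192,                       # prefill is processed in chunks of this many tokens
)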

View file

@@ -51,13 +51,34 @@ class StaticCache(transformers.StaticCache):
cache_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim)
if config.architectures[0] == "DeepseekV2ForCausalLM" or config.architectures[0] == "DeepseekV3ForCausalLM":
# TODO: for deepseek, cache_shape is different whether using Absorbed MLA, check it automatically
# key_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.qk_rope_head_dim + config.qk_nope_head_dim)
# value_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.v_head_dim)
key_shape = (max_batch_size, 1, self.max_cache_len, config.qk_rope_head_dim)
value_shape = (max_batch_size, 1, self.max_cache_len, config.kv_lora_rank)
self.page_size = 64
self.max_pages = (self.max_cache_len + self.page_size - 1) // self.page_size
latent_shape = (self.max_pages, self.page_size, 1, config.kv_lora_rank + config.qk_rope_head_dim)
self.kv_lora_rank = config.kv_lora_rank
self.qk_rope_head_dim = config.qk_rope_head_dim
# TODO: support real page table
self.page_table_map = dict()
self.page_table_list = []
for idx in range(config.num_hidden_layers):
if isinstance(device, dict):
target_device = device[f"blk.{idx}.self_attn"]["generate_device"]
else:
target_device = device
if target_device not in self.page_table_map:
page_table = torch.zeros((max_batch_size, self.max_pages), dtype=torch.int32, device=target_device)
for seq_id in range(max_batch_size):
page_table[seq_id, :] = torch.arange(seq_id * self.max_pages, seq_id * self.max_pages + self.max_pages, dtype=torch.int32, device=target_device)
self.page_table_map[target_device] = page_table
self.page_table_list.append(self.page_table_map[target_device])
self.is_MLA = True
self.is_page = True
else:
key_shape = cache_shape
value_shape = cache_shape
self.is_MLA = False
self.past_tokens = []
self.num_hidden_layers = config.num_hidden_layers
@@ -68,10 +89,17 @@ class StaticCache(transformers.StaticCache):
target_device = device[f"blk.{idx}.self_attn"]["generate_device"]
else:
target_device = device
if self.is_MLA:
new_layer_key_cache = torch.zeros(latent_shape, dtype=self.dtype, device=target_device)
new_layer_value_cache = None
torch._dynamo.mark_static_address(new_layer_key_cache)
else:
new_layer_key_cache = torch.zeros(key_shape, dtype=self.dtype, device=target_device)
new_layer_value_cache = torch.zeros(value_shape, dtype=self.dtype, device=target_device)
torch._dynamo.mark_static_address(new_layer_key_cache)
torch._dynamo.mark_static_address(new_layer_value_cache)
self.key_cache.append(new_layer_key_cache)
self.value_cache.append(new_layer_value_cache)
self.past_tokens.append(0)
@@ -104,10 +132,18 @@ class StaticCache(transformers.StaticCache):
cache_position = cache_kwargs.get("cache_position")
k_out = self.key_cache[layer_idx]
v_out = self.value_cache[layer_idx]
self.past_tokens[layer_idx] += cache_position.size(0)
#print(cache_position)
if self.is_MLA:
page_idx = cache_position // self.page_size
page_offset = cache_position % self.page_size
# key shape (self.max_pages, self.page_size, 1, config.kv_lora_rank + config.qk_rope_head_dim)
k_out[page_idx, page_offset, :, :self.kv_lora_rank] = key_states
k_out[page_idx, page_offset, :, self.kv_lora_rank:] = value_states
return k_out, self.page_table_list[layer_idx]
else:
k_out[:, :, cache_position] = key_states
v_out[:, :, cache_position] = value_states
self.past_tokens[layer_idx] += cache_position.size(0)
return k_out, v_out
def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
@@ -134,7 +170,20 @@ class StaticCache(transformers.StaticCache):
for layer_idx in range(len(self.key_cache)):
# In-place ops prevent breaking the static address
self.key_cache[layer_idx].zero_()
if self.value_cache[layer_idx] is not None:
self.value_cache[layer_idx].zero_()
self.past_tokens[layer_idx] = 0
def remove_suffix(self, start_pos):
for layer_idx in range(len(self.key_cache)):
# In-place ops prevent breaking the static address
if self.is_MLA:
k_cache = self.key_cache[layer_idx]
k_cache.view(-1, k_cache.shape[-1])[start_pos:].zero_()
else:
self.key_cache[layer_idx][..., start_pos:, :].zero_()
self.value_cache[layer_idx][..., start_pos:, :].zero_()
self.past_tokens[layer_idx] = start_pos
def get_max_cache_shape(self) -> Tuple[int, int, int, int]:
"""Returns the maximum shape of the cache."""

View file

@@ -1742,8 +1742,7 @@ class DeepseekV2ForCausalLM(DeepseekV2PreTrainedModel):
)
hidden_states = outputs[0]
logits = self.lm_head(hidden_states)
logits = logits[:,-1,:].unsqueeze(0).float()
logits = self.lm_head(hidden_states[:,-1:,:]).float()
loss = None
if labels is not None:

View file

@@ -1699,7 +1699,7 @@ class DeepseekV3ForCausalLM(DeepseekV3PreTrainedModel):
)
hidden_states = outputs[0]
logits = self.lm_head(hidden_states.to(self.lm_head.weight.device))
logits = self.lm_head(hidden_states[:,-1:,:])
logits = logits.float()
loss = None
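Both logits hunks above project only the final position through `lm_head`, shrinking the logits from `(batch, seq_len, vocab)` to `(batch, 1, vocab)` during generation without changing the sampled token. A toy shape check with made-up sizes:

import torch

hidden_states = torch.randn(1, 512, 1024)                  # (batch, seq_len, hidden); real dims are far larger
lm_head = torch.nn.Linear(1024, 32000, bias=False)

logits_old = lm_head(hidden_states).float()                # (1, 512, 32000): every position projected
logits_new = lm_head(hidden_states[:, -1:, :]).float()     # (1, 1, 32000): only the last token
torch.testing.assert_close(logits_old[:, -1:, :], logits_new)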

View file

@@ -42,7 +42,7 @@ class RotaryEmbedding(BaseInjectedModule, DeepseekV2RotaryEmbedding):
**kwargs,
):
BaseInjectedModule.__init__(
self, key, gguf_loader, config, orig_module, generate_device, **kwargs
self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs
)
self.orig_module.__init__(
orig_module.dim, orig_module.max_position_embeddings, orig_module.base
@@ -72,7 +72,7 @@ class RotaryEmbeddingV3(BaseInjectedModule):
**kwargs,
):
BaseInjectedModule.__init__(
self, key, gguf_loader, config, orig_module, generate_device, **kwargs
self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs
)
self.generate_device = generate_device
self.prefill_device = prefill_device
@@ -122,7 +122,7 @@ class RotaryEmbeddingV2(BaseInjectedModule, LlamaRotaryEmbedding):
**kwargs,
):
BaseInjectedModule.__init__(
self, key, gguf_loader, config, orig_module, generate_device, **kwargs
self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs
)
self.orig_module.__init__(
orig_module.dim,
@@ -160,7 +160,7 @@ class YarnRotaryEmbedding(BaseInjectedModule, DeepseekV2YarnRotaryEmbedding):
**kwargs,
):
BaseInjectedModule.__init__(
self, key, gguf_loader, config, orig_module, generate_device, **kwargs
self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs
)
self.orig_module.__init__(
orig_module.dim,
@@ -204,7 +204,7 @@ class YarnRotaryEmbedding(BaseInjectedModule, DeepseekV2YarnRotaryEmbedding):
# **kwargs,
# ):
# BaseInjectedModule.__init__(
# self, key, gguf_loader, config, orig_module, generate_device, **kwargs
# self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs
# )
# self.generate_device = generate_device
# self.prefill_device = prefill_device
@@ -230,7 +230,7 @@ class YarnRotaryEmbeddingV3(BaseInjectedModule):
**kwargs,
):
BaseInjectedModule.__init__(
self, key, gguf_loader, config, orig_module, generate_device, **kwargs
self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs
)
self.generate_device = generate_device
self.prefill_device = prefill_device
@@ -332,11 +332,12 @@ class DynamicNTKScalingRotaryEmbedding(
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
prefill_device: str = "cuda",
generate_device: str = "cuda",
**kwargs,
):
BaseInjectedModule.__init__(
self, key, gguf_loader, config, orig_module, device, **kwargs
self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs
)
self.orig_module.__init__(
orig_module.dim,

View file

@@ -13,215 +13,30 @@ from ktransformers.models.configuration_deepseek import DeepseekV2Config
from ktransformers.models.configuration_llama import LlamaConfig
from ktransformers.models.modeling_llama import LlamaRotaryEmbedding
from ktransformers.models.modeling_deepseek import DeepseekV2Attention, apply_rotary_pos_emb
from ktransformers.models.modeling_deepseek_v3 import DeepseekV3Attention
from ktransformers.models.modeling_deepseek_v3 import apply_rotary_pos_emb as apply_rotary_pos_emb_v3
from typing import Optional, Tuple
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.custom_gguf import GGUFLoader
from ktransformers.util.utils import get_compute_capability
import logging
from transformers.configuration_utils import PretrainedConfig
from transformers.cache_utils import Cache
from flash_attn import flash_attn_func
from ktransformers.operators.triton_attention import decode_attention_fwd_grouped
import os
from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled
if flashinfer_enabled:
from ktransformers.operators.flashinfer_wrapper import MLAWrapperSingleton
logger = logging.getLogger("attention")
class KDeepseekV3Attention(BaseInjectedModule, DeepseekV3Attention):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
attn_mask: Optional[torch.Tensor] = None
def __init__(self,
key: str,
gguf_loader : GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
chunck_size: int = 1000,
**kwargs):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
self.orig_module.__init__(orig_module.config,
orig_module.layer_idx)
self.chunck_size = chunck_size # TODO, generate chunck_size automatically.
self.softmax_scale = self.q_head_dim ** (-0.5)
def get_absorbed(self) -> Tuple[torch.Tensor, torch.Tensor]:
if not (hasattr(self, 'q_absorb') and hasattr(self, 'out_absorb')):
kv_b_proj = self.kv_b_proj.weight.view(self.num_heads, -1, self.kv_lora_rank)
q_absorb = kv_b_proj[:, :self.qk_nope_head_dim, :].reshape(-1, self.kv_lora_rank)
out_absorb = kv_b_proj[:, self.qk_nope_head_dim:, :].reshape(-1, self.kv_lora_rank)
self.q_absorb = nn.Linear(self.kv_lora_rank, self.num_heads * self.qk_nope_head_dim,
bias=False, dtype=q_absorb.dtype, device=q_absorb.device)
self.q_absorb.weight.data = q_absorb
self.out_absorb = nn.Linear(self.kv_lora_rank, self.num_heads * self.v_head_dim,
bias=False, dtype=out_absorb.dtype, device=out_absorb.device)
self.out_absorb.weight.data = out_absorb
del self.orig_module.kv_b_proj
q_absorb = self.q_absorb.weight.view(self.num_heads, self.qk_nope_head_dim, self.kv_lora_rank)
out_absorb = self.out_absorb.weight.view(self.num_heads, self.v_head_dim, self.kv_lora_rank)
return q_absorb, out_absorb
def forward_chunck(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
**kwargs
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
bsz, q_len, _ = hidden_states.size()
if self.q_lora_rank is None:
q = self.q_proj(hidden_states)
else:
q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
q_nope, q_pe = torch.split(
q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
)
compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
compressed_kv, k_pe = torch.split(
compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
)
compressed_kv = self.kv_a_layernorm(compressed_kv)
k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
kv_seq_len = k_pe.shape[-2]
if past_key_value is not None:
if self.layer_idx is None:
raise ValueError(
f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
"with a layer index."
)
kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
cos, sin = self.rotary_emb(q_pe, position_ids)
q_pe, k_pe = apply_rotary_pos_emb_v3(q_pe, k_pe, cos, sin)
if past_key_value is not None:
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
compressed_kv = compressed_kv.unsqueeze(1)
k_pe, compressed_kv = past_key_value.update(k_pe, compressed_kv, self.layer_idx, cache_kwargs)
compressed_kv = compressed_kv.squeeze(1)
#if cache_position is not None:
# compressed_kv = compressed_kv[:,: cache_position[-1] + 1,:]
# k_pe = k_pe[:,:,: cache_position[-1] + 1,:]
q_absorb, out_absorb = self.get_absorbed()
q_nope = torch.matmul(q_nope, q_absorb)
attn_weights = (torch.matmul(q_pe, k_pe.mT) + torch.matmul(q_nope, compressed_kv.unsqueeze(-3).mT)) * self.softmax_scale
"""
if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
raise ValueError(
f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
f" {attn_weights.size()}"
)
assert attention_mask is not None
"""
if attention_mask is not None:
"""
if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
raise ValueError(
f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
)
"""
#causal_mask = attention_mask[:, :, :, : kv_seq_len]
attn_weights = attn_weights + attention_mask
# upcast attention to fp32
attn_weights = nn.functional.softmax(
attn_weights, dim=-1, dtype=torch.float32
).to(q_pe.dtype)
attn_weights = nn.functional.dropout(
attn_weights, p=self.attention_dropout, training=self.training
)
attn_output = torch.einsum('bhql,blc->bhqc', attn_weights, compressed_kv)
attn_output = torch.matmul(attn_output, out_absorb.mT)
if attn_output.size() != (bsz, self.num_heads, q_len, self.v_head_dim):
raise ValueError(
f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.v_head_dim)}, but is"
f" {attn_output.size()}"
)
attn_output = attn_output.transpose(1, 2).contiguous()
attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)
attn_output = self.o_proj(attn_output)
return attn_output, attn_weights, past_key_value
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
if "padding_mask" in kwargs:
warnings.warn(
"Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
)
bsz, q_len, _ = hidden_states.size()
if q_len <= self.chunck_size:
return self.forward_chunck(
hidden_states,
attention_mask,
position_ids,
past_key_value,
output_attentions,
use_cache,
cache_position,
**kwargs
)
assert output_attentions == False, "output_attentions is not supported when using chunked attention"
attn_output = None
attn_weight = None
cur_idx = 0
while cur_idx < q_len:
if attention_mask is not None:
chunk_mask = attention_mask[:, :, cur_idx:min(cur_idx + self.chunck_size, q_len), ...]
else:
# generate chunk_mask automatically.
self.attn_mask = \
torch.zeros(1, 1, self.chunck_size, past_key_value.max_cache_len, device=hidden_states.device) \
if self.attn_mask is None \
else self.attn_mask
self.attn_mask[:, :, :, cur_idx:min(cur_idx+self.chunck_size, past_key_value.max_cache_len)] = \
-1e+38 * torch.triu(torch.ones(self.chunck_size, self.chunck_size, device=hidden_states.device), diagonal=1)\
[:,:min(self.chunck_size, min(past_key_value.max_cache_len-cur_idx, self.chunck_size))]
self.attn_mask[:, :, :, cur_idx+self.chunck_size:] = -1e+38
self.attn_mask[:, :, :, :cur_idx] = 0
chunk_mask = torch.narrow(self.attn_mask, 2, 0, min(self.chunck_size, q_len-cur_idx))
cur_output, cur_attn_weight = self.forward_chunck(
hidden_states[:, cur_idx:min(cur_idx + self.chunck_size, q_len), ...],
chunk_mask,
position_ids[:, cur_idx:min(cur_idx + self.chunck_size, q_len)],
past_key_value,
output_attentions,
use_cache,
cache_position[cur_idx:min(cur_idx + self.chunck_size, q_len)],
**kwargs
)
cur_idx += self.chunck_size
if attn_output is None:
attn_output = cur_output
attn_weight = cur_attn_weight
else:
attn_output = torch.cat((attn_output, cur_output), dim=-2)
attn_weight = torch.cat((attn_weight, cur_attn_weight), dim=-2)
return attn_output, attn_weight, past_key_value
# Copied from transformers.models.llama.modeling_llama.rotate_half
def rotate_half(x):
"""Rotates half the hidden dims of the input."""
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=-1)
# V3 MLA is same to V2
class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
attn_mask: Optional[torch.Tensor] = None
@@ -231,29 +46,25 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
gguf_loader : GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
prefill_device: str = "cuda",
generate_device: str = "cuda",
chunck_size: int = 1000,
absorb_for_prefill: bool = False,
**kwargs):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs)
self.orig_module.__init__(orig_module.config,
orig_module.layer_idx)
self.chunck_size = chunck_size # TODO, generate chunck_size automatically.
self.mla_wrapper = None
self.absorb_for_prefill = absorb_for_prefill
def get_absorbed(self) -> Tuple[torch.Tensor, torch.Tensor]:
if not (hasattr(self, 'q_absorb') and hasattr(self, 'out_absorb')):
kv_b_proj = self.kv_b_proj.weight.view(self.num_heads, -1, self.kv_lora_rank)
q_absorb = kv_b_proj[:, :self.qk_nope_head_dim, :].reshape(-1, self.kv_lora_rank)
out_absorb = kv_b_proj[:, self.qk_nope_head_dim:, :].reshape(-1, self.kv_lora_rank)
self.q_absorb = nn.Linear(self.kv_lora_rank, self.num_heads * self.qk_nope_head_dim,
bias=False, dtype=q_absorb.dtype, device=q_absorb.device)
self.q_absorb.weight.data = q_absorb
self.out_absorb = nn.Linear(self.kv_lora_rank, self.num_heads * self.v_head_dim,
bias=False, dtype=out_absorb.dtype, device=out_absorb.device)
self.out_absorb.weight.data = out_absorb
del self.orig_module.kv_b_proj
q_absorb = self.q_absorb.weight.view(self.num_heads, self.qk_nope_head_dim, self.kv_lora_rank)
out_absorb = self.out_absorb.weight.view(self.num_heads, self.v_head_dim, self.kv_lora_rank)
return q_absorb, out_absorb
self.q_absorb = kv_b_proj[:, :self.qk_nope_head_dim, :].view(self.num_heads, self.qk_nope_head_dim, self.kv_lora_rank)
self.out_absorb = kv_b_proj[:, self.qk_nope_head_dim:, :].view(self.num_heads, self.v_head_dim, self.kv_lora_rank)
return self.q_absorb, self.out_absorb
def forward_chunck(
self,
@@ -275,6 +86,8 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
q_nope, q_pe = torch.split(
q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
)
# q_nope [bsz, self.num_heads, q_len, self.qk_nope_head_dim]
# q_pe [bsz, self.num_heads, q_len, self.qk_rope_head_dim]
compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
compressed_kv, k_pe = torch.split(
@@ -287,7 +100,7 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
if past_key_value is not None:
if self.layer_idx is None:
raise ValueError(
f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
f"The cache structure has changed since transformer version v4.36. If you are using {self.__class__.__name__} "
"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
"with a layer index."
)
@@ -298,16 +111,36 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
if past_key_value is not None:
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
compressed_kv = compressed_kv.unsqueeze(1)
k_pe, compressed_kv = past_key_value.update(k_pe, compressed_kv, self.layer_idx, cache_kwargs)
compressed_kv = compressed_kv.squeeze(1)
#if cache_position is not None:
# compressed_kv = compressed_kv[:,: cache_position[-1] + 1,:]
# k_pe = k_pe[:,:,: cache_position[-1] + 1,:]
# compressed_kv [bsz, q_len, self.kv_lora_rank]
# k_pe [bsz, 1, q_len, self.qk_rope_head_dim]
k_pe = k_pe.transpose(1,2)
compressed_kv = compressed_kv.unsqueeze(2)
compressed_kv_with_k_pe, _ = past_key_value.update(compressed_kv, k_pe, self.layer_idx, cache_kwargs)
compressed_kv, k_pe = torch.split(
compressed_kv_with_k_pe, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
)
# k_pe [pages, page_size, 1, self.qk_rope_head_dim]
# compressed_kv [pages, page_size, 1, self.kv_lora_rank]
q_absorb, out_absorb = self.get_absorbed()
# q_nope [bsz, self.num_heads, q_len, self.qk_nope_head_dim]
# q_pe [bsz, self.num_heads, q_len, self.qk_rope_head_dim]
k_pe = k_pe.view(bsz, 1, -1, self.qk_rope_head_dim)[:,:,:attention_mask.size(-1),:]
compressed_kv = compressed_kv.view(bsz, 1, -1, self.kv_lora_rank)[:,:,:attention_mask.size(-1),:]
# k_pe [bsz, 1, cache_len, self.qk_rope_head_dim]
# compressed_kv [bsz, 1, cache_len,self.kv_lora_rank]
q_nope = torch.matmul(q_nope, q_absorb)
attn_weights = (torch.matmul(q_pe, k_pe.mT) + torch.matmul(q_nope, compressed_kv.unsqueeze(-3).mT)) * self.softmax_scale
#print(q_pe.shape)
#print(k_pe.shape)
#print(q_nope.shape)
#print(compressed_kv.shape)
attn_weights = (torch.matmul(q_pe, k_pe.mT) + torch.matmul(q_nope, compressed_kv.mT)) * self.softmax_scale
#attn_weights [bsz, self.num_heads, q_len, kv_seq_len]
compressed_kv = compressed_kv.squeeze(1)
"""
if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
raise ValueError(
@@ -333,6 +166,7 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
attn_weights = nn.functional.dropout(
attn_weights, p=self.attention_dropout, training=self.training
)
attn_output = torch.einsum('bhql,blc->bhqc', attn_weights, compressed_kv)
attn_output = torch.matmul(attn_output, out_absorb.mT)
@@ -351,7 +185,334 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
return attn_output, None, past_key_value
def forward(
def forward_linux_triton(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
bsz, q_len, _ = hidden_states.size()
if self.q_lora_rank is None:
q = self.q_proj(hidden_states)
else:
q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
q = q.view(bsz, q_len, self.num_heads, self.q_head_dim)
q_nope, q_pe = torch.split(
q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
)
compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
compressed_kv, k_pe = torch.split(
compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
)
compressed_kv = self.kv_a_layernorm(compressed_kv)
k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim)
compressed_kv = compressed_kv.view(bsz, q_len, 1, self.kv_lora_rank)
kv_seq_len = q_len
if past_key_value is not None:
if self.layer_idx is None:
raise ValueError(
f"The cache structure has changed since transformer version v4.36. If you are using {self.__class__.__name__} "
"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
"with a layer index."
)
kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
cos, sin = self.rotary_emb(q_pe, position_ids)
q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, unsqueeze_dim=2)
# q_pe [bsz, q_len, self.num_heads, self.qk_rope_head_dim] k_pe [bsz, q_len, 1, self.qk_rope_head_dim]
# decode
if q_len == 1:
if past_key_value is not None:
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
compressed_kv_with_k_pe, page_table = past_key_value.update(compressed_kv, k_pe, self.layer_idx, cache_kwargs)
compressed_kv = compressed_kv_with_k_pe [:, :, :, :self.kv_lora_rank] # for speed
# compressed_kv_with_k_pe [bsz, q_len, 1, self.kv_lora_rank + self.qk_rope_head_dim]
# compressed_kv [bsz, q_len, 1, self.kv_lora_rank]
# q_nope [bsz, q_len, self.num_heads, self.qk_nope_head_dim]
# q_absorb [self.num_heads, self.qk_nope_head_dim, self.kv_lora_rank]
q_absorb, out_absorb = self.get_absorbed()
q_nope = q_nope.transpose(1, 2) # q_len is 1, no GPU overhead, same below
q_nope = torch.matmul(q_nope, q_absorb) # batched MM
q_nope = q_nope.transpose(1, 2)
#assert q_nope.is_contiguous()
# q_nope [bsz, q_len, self.num_heads, self.kv_lora_rank]
# q_pe [bsz, q_len, self.num_heads, self.qk_rope_head_dim]
query_states = torch.cat([q_nope, q_pe], dim=-1)
query_states = query_states.squeeze(1)
attn_output = torch.zeros_like(q_nope) # [bsz, q_len, self.num_heads, self.kv_lora_rank]
attn_logits = torch.empty(
(
bsz,
self.num_heads,
4, #num_kv_splits # follow vLLM, fix it TODO
self.kv_lora_rank + 1,
),
dtype=torch.float32,
device = attn_output.device
)
"""
print("query_states", torch.isnan(query_states).any())
print("compressed_kv_with_k_pe", torch.isnan(compressed_kv_with_k_pe[:,:,0,:]).any())
print("compressed_kv", torch.isnan(compressed_kv[:,:,0,:]).any())
print("position_ids", torch.isnan(position_ids).any())
"""
# flash attn doesn't support head_dim bigger than 256
# use triton attention kernel adapted from vLLM and SGLang for MQA
decode_attention_fwd_grouped(query_states, compressed_kv_with_k_pe, compressed_kv, attn_output,
page_table,
position_ids.squeeze(0).to(torch.int32)+1, attn_logits,
4, #num_kv_splits # follow vLLM, fix it TODO
self.softmax_scale,
past_key_value.page_size)
# attn_output [bsz, q_len, self.num_heads, self.kv_lora_rank]
# out_absorb [self.num_heads, self.v_head_dim, self.kv_lora_rank]
attn_output = attn_output.transpose(1, 2)
attn_output = torch.matmul(attn_output, out_absorb.mT)
attn_output = attn_output.transpose(1, 2)
attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)
attn_output = self.o_proj(attn_output)
#print("attn_output", torch.isnan(attn_output).any())
return attn_output, None, past_key_value
else:
if past_key_value is not None:
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
k_pe.squeeze(0)
compressed_kv.squeeze(0)
compressed_kv_with_k_pe, _ = past_key_value.update(compressed_kv, k_pe, self.layer_idx, cache_kwargs)
compressed_kv, k_pe = torch.split(
compressed_kv_with_k_pe, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
)
k_pe = k_pe.view(bsz, -1, self.qk_rope_head_dim)
k_pe = k_pe[:, :kv_seq_len]
compressed_kv = compressed_kv.view(bsz, -1, self.kv_lora_rank)
compressed_kv = compressed_kv[:, :kv_seq_len]
kv = (
self.kv_b_proj(compressed_kv)
.view(bsz, kv_seq_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
)
k_nope, value_states = torch.split(kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
query_states = k_pe.new_empty(bsz, q_len, self.num_heads, self.q_head_dim)
query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
query_states[:, :, :, self.qk_nope_head_dim :] = q_pe
key_states = k_pe.new_empty(bsz, kv_seq_len, self.num_heads, self.q_head_dim)
key_states[:, :, :, :self.qk_nope_head_dim] = k_nope
key_states[:, :, :, self.qk_nope_head_dim:] = k_pe.view(bsz, kv_seq_len, 1, -1)
value_states = value_states.view(bsz, kv_seq_len, self.num_heads, self.v_head_dim)
value_states_padded = torch.nn.functional.pad(value_states, [0, query_states.shape[-1] - value_states.shape[-1]], value=0)
attn_output = flash_attn_func(
query_states,
key_states,
value_states_padded,
softmax_scale=self.softmax_scale,
causal=True,
)
if self.q_head_dim != self.v_head_dim:
attn_output = attn_output[:, :, :, : self.v_head_dim]
attn_output = attn_output.reshape(
bsz, q_len, self.num_heads * self.v_head_dim
).contiguous()
attn_output = self.o_proj(attn_output)
return attn_output, None, past_key_value
def forward_linux_flashinfer(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.Tensor] = None,
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
bsz, q_len, _ = hidden_states.size()
if self.q_lora_rank is None:
q = self.q_proj(hidden_states)
else:
q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
q = q.view(bsz, q_len, self.num_heads, self.q_head_dim)
q_nope, q_pe = torch.split(
q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
)
compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
compressed_kv, k_pe = torch.split(
compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
)
compressed_kv = self.kv_a_layernorm(compressed_kv)
k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim)
compressed_kv = compressed_kv.view(bsz, q_len, 1, self.kv_lora_rank)
kv_seq_len = q_len
if past_key_value is not None:
if self.layer_idx is None:
raise ValueError(
f"The cache structure has changed since version transformer verision v4.36. If you are using {self.__class__.__name__} "
"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
"with a layer index."
)
kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
cos, sin = self.rotary_emb(q_pe, position_ids)
q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, unsqueeze_dim=2)
# q_pe [bsz, q_len, self.num_heads, self.qk_rope_head_dim] k_pe [bsz, q_len, 1, self.qk_rope_head_dim]
# decode
if q_len == 1 or self.absorb_for_prefill:
if past_key_value is not None:
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
compressed_kv_with_k_pe, page_table = past_key_value.update(compressed_kv, k_pe, self.layer_idx, cache_kwargs)
compressed_kv = compressed_kv_with_k_pe [:, :, :, :self.kv_lora_rank].view(-1, past_key_value.page_size, self.kv_lora_rank)
k_pe = compressed_kv_with_k_pe [:, :, :, self.kv_lora_rank:].view(-1, past_key_value.page_size, self.qk_rope_head_dim)
# k_pe [max_pages, page_size, self.qk_rope_head_dim]
# compressed_kv [max_pages, page_size, self.kv_lora_rank]
# q_nope [bsz, q_len, self.num_heads, self.qk_nope_head_dim]
# q_absorb [self.num_heads, self.qk_nope_head_dim, self.kv_lora_rank]
q_absorb, out_absorb = self.get_absorbed()
q_nope = q_nope.transpose(1, 2) # q_len is 1, no GPU overhead, same below
q_nope = torch.matmul(q_nope, q_absorb) # batched MM
q_nope = q_nope.transpose(1, 2)
q_nope = q_nope.contiguous()
#assert q_nope.is_contiguous()
# q_nope [bsz, q_len, self.num_heads, self.kv_lora_rank]
# q_pe [bsz, q_len, self.num_heads, self.qk_rope_head_dim]
q_nope.squeeze_(0)
q_pe.squeeze_(0)
# flash attn doesn't support head_dim bigger than 256, use flashinfer
if self.mla_wrapper is None:
self.mla_wrapper = MLAWrapperSingleton.get_instance(self.device, 1, past_key_value.max_pages, use_cuda_graph = True)
if self.mla_wrapper.need_plan:
self.mla_wrapper.need_plan = False
if q_len == 1:
self.mla_wrapper.plan(None,None,None,
position_ids.squeeze(1)+1,
self.num_heads,
self.kv_lora_rank,
self.qk_rope_head_dim,
past_key_value.page_size,
self.softmax_scale,
q_nope.dtype,
compressed_kv.dtype)
else:
qo_indptr = torch.tensor([0, q_len], dtype=torch.int32, device=self.device)
kv_len_arr = torch.tensor([position_ids[0, -1].item()+1], dtype=torch.int32, device=self.device)
self.mla_wrapper.plan(qo_indptr,None,None,
kv_len_arr,
self.num_heads,
self.kv_lora_rank,
self.qk_rope_head_dim,
past_key_value.page_size,
self.softmax_scale,
q_nope.dtype,
compressed_kv.dtype)
attn_output = self.mla_wrapper.run(q_nope, q_pe, compressed_kv, k_pe).view(bsz, q_len, self.num_heads, self.kv_lora_rank)
"""
k = (
torch.cat([compressed_kv, k_pe], dim=-1)
.view(-1, 1, 512 + 64)
.repeat_interleave(self.num_heads, dim=1)
)
v = compressed_kv.view(-1, 1, 512).repeat_interleave(self.num_heads, dim=1)
lens = position_ids.item() + 1
#print("lens", lens)
attn_ref, lse_ref = attention_ref(
1,
torch.cat([q_nope, q_pe], dim=-1),
k[:lens],
v[:lens],
False,
self.softmax_scale
)
attn_output = attn_ref.view(bsz, q_len, self.num_heads, self.kv_lora_rank)
"""
# mla_wrapper run output: [tokens, self.num_heads, self.kv_lora_rank]
# attn_output [bsz, q_len, self.num_heads, self.kv_lora_rank]
# out_absorb [self.num_heads, self.v_head_dim, self.kv_lora_rank]
attn_output = attn_output.transpose(1, 2) # [bsz, self.num_heads, q_len, self.kv_lora_rank]
attn_output = torch.matmul(attn_output, out_absorb.mT) # [bsz, self.num_heads, q_len, self.v_head_dim]
attn_output = attn_output.transpose(1, 2).contiguous() # [bsz, q_len, self.num_heads, self.kv_lora_rank]
attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim) # [bsz, q_len, self.num_heads * self.v_head_dim]
attn_output = self.o_proj(attn_output)
return attn_output, None, past_key_value
else:
if past_key_value is not None:
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
k_pe.squeeze(0)
compressed_kv.squeeze(0)
compressed_kv_with_k_pe, _ = past_key_value.update(compressed_kv, k_pe, self.layer_idx, cache_kwargs)
compressed_kv, k_pe = torch.split(
compressed_kv_with_k_pe, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
)
k_pe = k_pe.view(bsz, -1, self.qk_rope_head_dim)
k_pe = k_pe[:, :kv_seq_len]
compressed_kv = compressed_kv.view(bsz, -1, self.kv_lora_rank)
compressed_kv = compressed_kv[:, :kv_seq_len]
kv = (
self.kv_b_proj(compressed_kv)
.view(bsz, kv_seq_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
)
k_nope, value_states = torch.split(kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
query_states = k_pe.new_empty(bsz, q_len, self.num_heads, self.q_head_dim)
query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
query_states[:, :, :, self.qk_nope_head_dim :] = q_pe
key_states = k_pe.new_empty(bsz, kv_seq_len, self.num_heads, self.q_head_dim)
key_states[:, :, :, :self.qk_nope_head_dim] = k_nope
key_states[:, :, :, self.qk_nope_head_dim:] = k_pe.view(bsz, kv_seq_len, 1, -1)
value_states = value_states.view(bsz, kv_seq_len, self.num_heads, self.v_head_dim)
value_states_padded = torch.nn.functional.pad(value_states, [0, query_states.shape[-1] - value_states.shape[-1]], value=0)
attn_output = flash_attn_func(
query_states,
key_states,
value_states_padded,
softmax_scale=self.softmax_scale,
causal=True,
)
if self.q_head_dim != self.v_head_dim:
attn_output = attn_output[:, :, :, : self.v_head_dim]
attn_output = attn_output.reshape(
bsz, q_len, self.num_heads * self.v_head_dim
).contiguous()
attn_output = self.o_proj(attn_output)
return attn_output, None, past_key_value
def forward_windows(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
@@ -416,13 +577,53 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
attn_output = torch.cat((attn_output, cur_output), dim=-2)
return attn_output, None, past_key_value
def rotate_half(x):
"""Rotates half the hidden dims of the input."""
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=-1)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
if os.name == 'nt' or get_compute_capability()<8:
print("for Windows or GPU before ampere, use forward_windows")
return self.forward_windows(
hidden_states,
attention_mask,
position_ids,
past_key_value,
output_attentions,
use_cache,
cache_position,
**kwargs,
)
else:
if flashinfer_enabled:
return self.forward_linux_flashinfer(
hidden_states,
attention_mask,
position_ids,
past_key_value,
output_attentions,
use_cache,
cache_position,
**kwargs,
)
else:
return self.forward_linux_triton(
hidden_states,
attention_mask,
position_ids,
past_key_value,
output_attentions,
use_cache,
cache_position,
**kwargs,
)
class KLlamaAttention(BaseInjectedModule):
@@ -433,9 +634,10 @@ class KLlamaAttention(BaseInjectedModule):
gguf_loader : GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
prefill_device: str = "cuda",
generate_device: str = "cuda",
**kwargs):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs)
self.orig_module.__init__(orig_module.config,
orig_module.layer_idx)
def apply_rotary_pos_emb(self, q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
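The absorbed decode path added in this file folds `kv_b_proj` into the query and output instead of expanding the compressed KV: scores are `q_pe @ k_pe^T + (q_nope @ W_UK) @ c_kv^T`, and the per-head context is re-expanded with `W_UV` after the softmax. A shape-only sketch with tiny made-up dimensions, not part of the commit:

import torch

num_heads, qk_nope, v_dim, lora, cache_len = 2, 16, 16, 8, 5
q_nope     = torch.randn(1, num_heads, 1, qk_nope)      # one decode token
c_kv       = torch.randn(1, 1, cache_len, lora)         # compressed KV cache (shared across heads)
q_absorb   = torch.randn(num_heads, qk_nope, lora)      # slice of kv_b_proj (W_UK)
out_absorb = torch.randn(num_heads, v_dim, lora)        # slice of kv_b_proj (W_UV)

q_lat  = torch.matmul(q_nope, q_absorb)                 # (1, heads, 1, lora)
scores = torch.matmul(q_lat, c_kv.mT)                   # (1, heads, 1, cache_len); the real code adds the RoPE term
probs  = scores.softmax(dim=-1)
ctx    = torch.matmul(probs, c_kv)                      # (1, heads, 1, lora)
out    = torch.matmul(ctx, out_absorb.mT)               # (1, heads, 1, v_dim)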

View file

@@ -16,14 +16,17 @@ class BaseInjectedModule(nn.Module):
gguf_loader : GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
prefill_device: str = "cuda",
generate_device: str = "cuda",
**kwargs):
nn.Module.__init__(self)
nn.Module.__setattr__(self, "orig_module", orig_module)
object.__setattr__(self, "key", key)
object.__setattr__(self, "gguf_loader", gguf_loader)
object.__setattr__(self, "config", config)
object.__setattr__(self, "device", device)
object.__setattr__(self, "prefill_device", prefill_device)
object.__setattr__(self, "generate_device", generate_device)
object.__setattr__(self, "device", generate_device)
def __getattr__(self, name: str) -> Any:
# __getattr__ in nn.Module doesn't call super().__getattribute__ when name is not in nn.Module.__dict__,
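With the widened signature above, every injected operator receives separate prefill and generate devices, and `self.device` now aliases the generate device. A minimal subclass sketch (the class name is made up for illustration):

# Hedged sketch; KMyInjectedOp is illustrative, BaseInjectedModule is the class patched above.
class KMyInjectedOp(BaseInjectedModule):
    def __init__(self, key, gguf_loader, config, orig_module,
                 prefill_device: str = "cuda", generate_device: str = "cuda", **kwargs):
        # Forward both devices so self.prefill_device / self.generate_device are set
        # and self.device follows generate_device, as the new base __init__ does.
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module,
                                    prefill_device, generate_device, **kwargs)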

View file

@@ -18,6 +18,7 @@ import torch.nn.functional as F
import torch
import sys, os
from ktransformers.operators.base_operator import BaseInjectedModule
from tqdm import tqdm
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build"))
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build", "Release"))
@@ -118,6 +119,7 @@ class KExpertsCPU(KExpertsBase):
output_cpu:Tensor = None
output_gpu_map:dict = {} # Manage output tensor buffer on different gpu
#stream_map:dict = {} # Manage cuda stream on different gpu
#gguf_loader:GGUFLoader = None
CPU_INFER = CPUInfer(Config().cpu_infer)
def __init__(
self,
@@ -131,6 +133,9 @@ class KExpertsCPU(KExpertsBase):
**kwargs
):
super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
#if KExpertsCPU.gguf_loader is None:
# KExpertsCPU.gguf_loader = GGUFLoader("/mnt/data/model/DeepseekV3-q4km-gguf")
self.gguf_loader = gguf_loader
assert device.lower() == "cpu", "KExpertsCPU can only be loaded on CPU"
self.n_routed_experts = n_routed_experts
self.out_device = out_device
@@ -154,7 +159,7 @@ class KExpertsCPU(KExpertsBase):
down_ptr = ctypes.addressof(
ctypes.cast(self.down.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents
)
# print(self.gate_qtype, self.up_qtype, self.down_qtype)
#print(self.gate_type, self.up_type, self.down_type)
n_routed_experts = self.n_routed_experts
# n_routed_experts = len(self.orig_module)
moe_config = MOEConfig(
@@ -225,6 +230,7 @@ class KExpertsCPU(KExpertsBase):
return
def load_weights(self, override_key: str | None = None, device: str = "cpu"):
# TODO: support Bias
res = {}
if override_key is not None:
keys = override_key
@@ -239,7 +245,16 @@ class KExpertsCPU(KExpertsBase):
down_type = None
for key in keys:
if key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
if self.gguf_loader.safetensor_loader is not None:
# using a temporary (ugly) way to load the tensor
gate = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_gate_exps.weight").numpy()
up = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_up_exps.weight").numpy()
down = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_down_exps.weight").numpy()
gate_type = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_gate_exps.ggml_type").item()
up_type = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_up_exps.ggml_type").item()
down_type = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_down_exps.ggml_type").item()
elif key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight")
down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight")
@@ -288,6 +303,8 @@ class KExpertsMarlin(KExpertsBase):
self.act_fn = ACT2FN[config.hidden_act]
assert device.lower() != "cpu", "Marlin experts can only be loaded on GPU"
self.device = device
self.elements_per_tensor = config.moe_intermediate_size * config.hidden_size
# create empty marlin experts according to the number of experts per token
# up
self.up_projs = [KLinearMarlin(key+ "." + "ffn_up_exps", gguf_loader, config, device=device) for i in range(self.expert_num)]
@@ -299,8 +316,25 @@ class KExpertsMarlin(KExpertsBase):
def load(self, w: dict | nn.Parameter | tuple | None = None, device: str | None = None, warmup: bool = False):
if device is None: device = self.device
assert device.lower() != "cpu", "Marlin experts can only be loaded on GPU"
if w is None: w = self.load_weights()[self.key]
if w is None:
w = self.load_weights()
load_by_experts = True
if load_by_experts:
if isinstance(w, dict):
self.gate = w["gate"]
self.up = (w["up"])
self.down = (w["down"])
for i in tqdm(range(self.expert_num), desc=f"Dequanting and quanting for KExpertsMarlin {self.key}"):
up_weights = self.gguf_loader.load_expert_tensor(self.key + ".ffn_up_exps.weight", self.up, i, self.elements_per_tensor, device=self.device)
gate_weights = self.gguf_loader.load_expert_tensor(self.key + ".ffn_gate_exps.weight", self.gate, i, self.elements_per_tensor, device=self.device)
down_weights = self.gguf_loader.load_expert_tensor(self.key + ".ffn_down_exps.weight", self.down, i, self.elements_per_tensor, device=self.device)
self.up_projs[i].load(nn.Parameter(up_weights), device=device)
self.gate_projs[i].load(nn.Parameter(gate_weights), device=device)
self.down_projs[i].load(nn.Parameter(down_weights), device=device)
self.loaded_experts_idx.append(i)
else:
if isinstance(w, dict):
self.gate = w["gate"]
self.up = (w["up"])
@@ -329,20 +363,13 @@ class KExpertsMarlin(KExpertsBase):
gate = None
up = None
down = None
gate_type = None
up_type = None
down_type = None
for key in keys:
if key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
gate = self.gguf_loader.load_gguf_tensor(key + ".ffn_gate_exps.weight")
up = self.gguf_loader.load_gguf_tensor(key + ".ffn_up_exps.weight")
down = self.gguf_loader.load_gguf_tensor(key + ".ffn_down_exps.weight")
gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate_exps.weight"]["ggml_type"]
up_type = self.gguf_loader.tensor_info[key + ".ffn_up_exps.weight"]["ggml_type"]
down_type = self.gguf_loader.tensor_info[key + ".ffn_down_exps.weight"]["ggml_type"]
# tensors = self.load_multi(key, [".ffn_gate_exps.weight", ".ffn_up_exps.weight", ".ffn_down_exps.weight"])
res = {key:{"gate": nn.Parameter(gate), "up": nn.Parameter(up), "down": nn.Parameter(down), "gate_type": gate_type, "up_type": up_type, "down_type": down_type}}
gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight")
down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight")
res = {"gate": gate, "up": up, "down": down}
return res
def forward(self, hidden_states_cpu: torch.Tensor, selected_experts_cpu: torch.Tensor, routing_weights_cpu: torch.Tensor) -> torch.Tensor:
@@ -381,6 +408,7 @@ class KExpertsMarlin(KExpertsBase):
return final_hidden_states.to(dtype=org_dtype, device=org_device)
# untested, CUDA OOM
class KExpertsTorch(KExpertsBase):
expert_num: int
loaded_experts_idx: list[int]
@@ -402,19 +430,39 @@ class KExpertsTorch(KExpertsBase):
# self.loaded_experts_idx = []
self.act_fn = ACT2FN[config.hidden_act]
self.device = device
self.gate = None
self.up = None
self.donw = None
self.elements_per_tensor = config.moe_intermediate_size * config.hidden_size
self.gate = [None for _ in range(self.expert_num)]
self.up = [None for _ in range(self.expert_num)]
self.down = [None for _ in range(self.expert_num)]
self.dtype = torch.get_default_dtype()
def load(self, w: dict | nn.Parameter | tuple | None = None, device: str | None = None, warmup: bool = False):
if device is None: device = self.device
if w is None: w = self.load_weights(device=device)[self.key]
if w is None:
w = self.load_weights()
load_by_experts = True
if load_by_experts:
if isinstance(w, dict):
self.gate = w["gate"].to(device=device, dtype=self.dtype)
self.up = w["up"].to(device=device, dtype=self.dtype)
self.down = w["down"].to(device=device, dtype=self.dtype)
for i in tqdm(range(self.expert_num), desc=f"Dequanting for KExpertsTorch {self.key}"):
up_weights = self.gguf_loader.load_expert_tensor(self.key + ".ffn_up_exps.weight", w["up"], i, self.elements_per_tensor, device=self.device)
gate_weights = self.gguf_loader.load_expert_tensor(self.key + ".ffn_gate_exps.weight", w["gate"], i, self.elements_per_tensor, device=self.device)
down_weights = self.gguf_loader.load_expert_tensor(self.key + ".ffn_down_exps.weight", w["down"], i, self.elements_per_tensor, device=self.device)
self.up[i] = up_weights
self.gate[i] = gate_weights
self.down[i] = down_weights
else:
if isinstance(w, dict):
for i in range(self.expert_num):
self.gate[i] = w["gate"][i, ...].to(device=device, dtype=self.dtype)
self.up[i] = w["up"][i, ...].to(device=device, dtype=self.dtype)
self.down[i] = w["down"][i, ...].to(device=device, dtype=self.dtype)
self.up = torch.stack(self.up, dim=0)
self.gate = torch.stack(self.gate, dim=0)
self.down = torch.stack(self.down, dim=0)
return
def unload(self):
if self.gate is not None:
@ -422,6 +470,25 @@ class KExpertsTorch(KExpertsBase):
self.up = None
self.down = None
def load_weights(self, override_key: str | None = None):
res = {}
if override_key is not None:
keys = override_key
else:
keys = [self.key]
gate = None
up = None
down = None
for key in keys:
if key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight")
down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight")
res = {"gate": gate, "up": up, "down": down}
return res
def forward(self, hidden_states_cpu: torch.Tensor, selected_experts_cpu: torch.Tensor, routing_weights_cpu: torch.Tensor) -> torch.Tensor:
org_device = hidden_states_cpu.device
@ -478,7 +545,7 @@ class KTransformersExperts(BaseInjectedModule, KExpertsBase):
generate_device: str = "cpu",
generate_op: str | None = "KExpertsCPU",
**kwargs):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs)
KExpertsBase.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
if generate_op is not None:
self.generate_experts = EXPERTS_MAP[generate_op](key, gguf_loader, config, len(orig_module), device=generate_device, **kwargs)
@ -582,7 +649,7 @@ class KQwen2MoeSparseMoeBlock(BaseInjectedModule, Qwen2MoeSparseMoeBlock):
if isinstance(self.experts, KExpertsBase):
y = (
self.moe_on_cpuinfer(
self.moe_kexperts(
hidden_states_expert, selected_experts_expert, routing_weights_expert
)
.view(*orig_shape)
@ -601,8 +668,7 @@ class KQwen2MoeSparseMoeBlock(BaseInjectedModule, Qwen2MoeSparseMoeBlock):
return y, router_logits
@torch.no_grad()
def moe_on_cpuinfer(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor:
outs = torch.empty_like(x)
def moe_kexperts(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor:
outs = self.experts(x, topk_ids, topk_weight)
return outs
@ -672,7 +738,7 @@ class KDeepseekV2MoE(BaseInjectedModule, DeepseekV2MoE):
y_ = self.shared_experts(identity).squeeze(0)
if isinstance(self.experts, KExpertsBase):
y = self.moe_on_cpuinfer(hidden_states, topk_idx, topk_weight).view(*orig_shape).to(device=hidden_states.device)
y = self.moe_kexperts(hidden_states, topk_idx, topk_weight).view(*orig_shape).to(device=hidden_states.device)
elif hidden_states.size(0) > 10:
# TODO: may have bugs here
y = (
@ -692,8 +758,7 @@ class KDeepseekV2MoE(BaseInjectedModule, DeepseekV2MoE):
return y
@torch.no_grad()
def moe_on_cpuinfer(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor:
outs = torch.empty_like(x)
def moe_kexperts(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor:
outs = self.experts(x, topk_ids, topk_weight)
return outs
@ -773,7 +838,7 @@ class KDeepseekV3MoE(BaseInjectedModule, DeepseekV3MoE):
y_ = self.shared_experts(identity).squeeze(0)
if isinstance(self.experts, KExpertsBase):
y = self.moe_on_cpuinfer(hidden_states, topk_idx, topk_weight).view(*orig_shape).to(device=hidden_states.device)
y = self.moe_kexperts(hidden_states, topk_idx, topk_weight).view(*orig_shape).to(device=hidden_states.device)
elif hidden_states.size(0) > 10:
# TODO: may have bugs here
y = (
@ -793,8 +858,7 @@ class KDeepseekV3MoE(BaseInjectedModule, DeepseekV3MoE):
return y
@torch.no_grad()
def moe_on_cpuinfer(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor:
outs = torch.empty_like(x)
def moe_kexperts(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor:
outs = self.experts(x, topk_ids, topk_weight)
return outs
@ -881,7 +945,7 @@ class KMistralSparseMoEBlock(BaseInjectedModule, MixtralSparseMoeBlock):
if isinstance(self.experts, KExpertsBase):
y = (
self.moe_on_cpuinfer(
self.moe_kexperts(
hidden_states_expert, selected_experts_expert, routing_weights_expert
)
.view(*orig_shape)
@ -900,8 +964,7 @@ class KMistralSparseMoEBlock(BaseInjectedModule, MixtralSparseMoeBlock):
return y, router_logits
@torch.no_grad()
def moe_on_cpuinfer(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor:
outs = torch.empty_like(x)
def moe_kexperts(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor:
outs = self.experts(x, topk_ids, topk_weight)
return outs

View file

@ -0,0 +1,380 @@
'''
Description : flashinfer MLA wrapper
Author : Boxin Zhang
Version : 0.2.3
'''
import torch
import os
from ktransformers.operators.triton_attention import decode_attention_fwd_grouped
flashinfer_enabled = False
try:
import flashinfer
flashinfer_enabled = True
print("found flashinfer")
except ImportError:
print("flashinfer not found, use triton for linux")
import math
def attention_ref_torch(
batch_size,
q: torch.Tensor,
k: torch.Tensor,
v: torch.Tensor,
causal: bool,
sm_scale: float,
) -> torch.Tensor:
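# Plain-PyTorch reference attention used below to cross-check the flashinfer and
# Triton paths; it returns both the attention output and the per-query
# log-sum-exp, scaled by log2(e) to match kernels that accumulate in base 2.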
qo_len = q.shape[0] // batch_size
kv_len = k.shape[0] // batch_size
num_qo_heads = q.shape[1]
head_dim_qk = q.shape[2]
head_dim_vo = v.shape[2]
logits = (
torch.einsum(
"bmhd,bnhd->bhmn",
q.view(batch_size, qo_len, num_qo_heads, head_dim_qk).float(),
k.view(batch_size, kv_len, num_qo_heads, head_dim_qk).float(),
)
* sm_scale
)
#print("attn weights", logits)
if causal:
mask = (
torch.arange(kv_len - qo_len, kv_len).unsqueeze(1)
>= torch.arange(0, kv_len).unsqueeze(0)
).to(q.device)
else:
mask = torch.ones(qo_len, kv_len).to(q.device)
logits = logits.masked_fill(mask.unsqueeze(0).unsqueeze(0) == 0, float("-inf"))
lse_ref = torch.logsumexp(logits, -1).transpose(-1, -2)
p = torch.softmax(logits, dim=-1)
o_ref = (
torch.einsum(
"bhmn,bnhd->bmhd",
p,
v.view(batch_size, kv_len, num_qo_heads, head_dim_vo).float(),
)
.contiguous()
.view(batch_size * qo_len, num_qo_heads, head_dim_vo)
.to(q)
)
return o_ref, lse_ref * math.log2(math.e)
class MLAWrapper():
def __init__(self,
max_batch_size,
max_pages,
use_cuda_graph = True,
device = "cuda",
):
self.float_workspace_buffer = torch.empty(128*1024*1024, dtype=torch.int8, device=device)
self.max_batch_size = max_batch_size
self.max_pages = max_pages
if use_cuda_graph:
if self.max_batch_size == 1:
self.qo_indptr_buf = torch.arange(0, max_batch_size+1, dtype=torch.int32, device=device)
self.kv_indptr_buf = torch.tensor([0, max_pages], dtype=torch.int32, device=device)
self.kv_indices_buf = torch.arange(0, max_pages, dtype=torch.int32, device=device)
else:
self.qo_indptr_buf = torch.empty(max_batch_size+1, dtype=torch.int32, device=device)
self.kv_indptr_buf = torch.empty(max_batch_size+1, dtype=torch.int32, device=device)
self.kv_indices_buf = torch.empty(max_pages, dtype=torch.int32, device=device)
self.kv_len_arr_buf = torch.empty(max_batch_size, dtype=torch.int32, device=device)
else:
self.qo_indptr_buf = None
self.kv_indptr_buf = None
self.kv_indices_buf = None
self.kv_len_arr_buf = None
self.wrapper = flashinfer.mla.BatchMLAPagedAttentionWrapper(
self.float_workspace_buffer,
use_cuda_graph=False,
qo_indptr=self.qo_indptr_buf,
kv_indptr=self.kv_indptr_buf,
kv_indices=self.kv_indices_buf,
kv_len_arr=self.kv_len_arr_buf,
)
self.need_plan = True
def plan(self,
qo_indptr,
kv_indptr,
kv_indices,
kv_len_arr,
num_heads,
head_dim_ckv,
head_dim_kpe,
page_size,
sm_scale,
q_data_type,
kv_data_type,
):
if qo_indptr is None:
assert self.max_batch_size == 1
qo_indptr = self.qo_indptr_buf
if kv_indptr is None:
assert self.max_batch_size == 1
kv_indptr = self.kv_indptr_buf
if kv_indices is None:
assert self.max_batch_size == 1
kv_indices = self.kv_indices_buf
self.wrapper.plan(
qo_indptr,
kv_indptr,
kv_indices,
kv_len_arr,
num_heads,
head_dim_ckv,
head_dim_kpe,
page_size,
True, # causal
sm_scale,
q_data_type,
kv_data_type,
)
def run(self, q_nope, q_pe, ckv, k_pe, return_lse = False):
return self.wrapper.run(q_nope, q_pe, ckv, k_pe, return_lse = return_lse)
class MLAWrapperSingleton():
wrappers:dict = {}
@classmethod
def get_instance(cls, device, *args, **kwargs)->MLAWrapper:
if device not in cls.wrappers:
cls.make_instance(device, *args, **kwargs)
return cls.wrappers[device]
@classmethod
def make_instance(cls, device, *args, **kwargs):
cls.wrappers[device] = MLAWrapper(*args, **kwargs, device=device)
@classmethod
def plan_all(cls, qo_indptr,
kv_indptr,
kv_indices,
kv_len_arr,
num_heads,
head_dim_ckv,
head_dim_kpe,
page_size,
sm_scale,
q_data_type,
kv_data_type,):
for device, wrapper in cls.wrappers.items():
kv_len_arr_cur_device = kv_len_arr.to(device)
wrapper.plan(qo_indptr,
kv_indptr,
kv_indices,
kv_len_arr_cur_device,
num_heads,
head_dim_ckv,
head_dim_kpe,
page_size,
sm_scale,
q_data_type,
kv_data_type,)
wrapper.need_plan = False
@classmethod
def need_plan_all(cls):
for device, wrapper in cls.wrappers.items():
wrapper.need_plan = True
@classmethod
def reset_buffer(cls):
for device, wrapper in cls.wrappers.items():
wrapper.qo_indptr_buf[1] = 1 # assert max_batch_size=1 here.
@classmethod
def update_buffer(cls, max_pages):
for device, wrapper in cls.wrappers.items():
wrapper.kv_indptr_buf[1] = max_pages # assert max_batch_size=1 here.
wrapper.kv_indices_buf = torch.arange(0, max_pages, dtype=torch.int32, device=device)
wrapper.wrapper._kv_indices_buf = wrapper.kv_indices_buf
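# Usage sketch for the singleton (illustrative values only; real call sites pass
# model-specific head counts, head dims, page size and sm_scale, as in the
# self-test below):
#   wrapper = MLAWrapperSingleton.get_instance("cuda", max_batch_size, max_pages)
#   wrapper.plan(qo_indptr, None, None, kv_len_arr, num_heads, head_dim_ckv,
#                head_dim_kpe, page_size, sm_scale, q_dtype, kv_dtype)
#   attn_output = wrapper.run(q_nope_buf, q_pe_buf, ckv, k_pe)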
def checksame():
flashinfer_folder = "./flashinfer_output"
flashinfer_folder = "./kv_cache_flashinfer"
triton_folder = "./triton_output"
triton_folder = "./kv_cache_triton"
max_layer_id = 1
max_forward_id = 2
for forward_id in range(0, 19):
print("forward_id", forward_id)
for layer_id in range(max_layer_id):
print(layer_id)
#file_name = f"layer_{layer_id}_forward_{forward_id}_attn_output.pt"
#file_name = f"layer_{layer_id}_forward_{forward_id}_q_pe.pt"
file_name = f"layer_{layer_id}.pt"
flashinfer_path = os.path.join(flashinfer_folder, file_name)
triton_path = os.path.join(triton_folder, file_name)
if not os.path.exists(triton_path):
print(f"{file_name} not exist in {triton_folder}")
continue
if not os.path.exists(flashinfer_path):
print(f"{file_name} not exist in {flashinfer_folder}")
continue
flashinfer_tensor = torch.load(flashinfer_path)[1:2, :62]#
triton_tensor = torch.load(triton_path)[1:2, :62]#.squeeze(1)#
try:
torch.testing.assert_close(flashinfer_tensor, triton_tensor, rtol=1e-9, atol=1e-9)
except AssertionError as e:
print(e)
if __name__ == "__main__":
#checksame()
#exit(0)
max_batch_size = 1
max_pages = 64
page_size = 64
num_heads = 128
# warm-up
kv_len = 4023
q_len = 1
q_nope_buf = torch.randn((q_len, num_heads, 512), dtype=torch.bfloat16, device="cuda")
q_pe_buf = torch.randn((q_len, num_heads, 64), dtype=torch.bfloat16, device="cuda")
kv_buf = torch.randn((max_pages, page_size, 576), dtype=torch.bfloat16, device="cuda")
ckv, k_pe = torch.split(kv_buf, [512, 64], dim=-1)
wrapper = MLAWrapperSingleton.get_instance(
"cuda",
max_batch_size,
max_pages,
)
kv_len_arr = torch.tensor([kv_len], dtype=torch.int32, device="cuda")
qo_indptr = torch.tensor([0, q_len], dtype=torch.int32, device="cuda")
wrapper.plan(
qo_indptr,
None,
None,
kv_len_arr,
128,
512,
64,
page_size,
192 ** (-0.5),
torch.bfloat16,
torch.bfloat16,
)
attn_output = wrapper.run(q_nope_buf, q_pe_buf, ckv, k_pe)
print(attn_output.shape)
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
attn_output = wrapper.run(q_nope_buf, q_pe_buf, ckv, k_pe)
# warm-up finished
for forward_id in range(0, 1):
print("forward_id", forward_id)
for layer_id in range(1):
print(layer_id)
flashinfer_folder = "./kv_cache_flashinfer"
forward_id = 17
layer_id = 0
file_name = f"layer_{layer_id}.pt"
kv_cache_path = os.path.join(flashinfer_folder, file_name)
flashinfer_folder = "./flashinfer_output"
q_len = 1
kv_len = 126
file_name = f"layer_{layer_id}_forward_{forward_id}_q_nope.pt"
q_nope = torch.load(os.path.join(flashinfer_folder, file_name)).view(q_len,128,512).to(device="cuda")
file_name = f"layer_{layer_id}_forward_{forward_id}_q_pe.pt"
q_pe = torch.load(os.path.join(flashinfer_folder, file_name)).view(q_len,128,64).to(device="cuda")
q = torch.cat([q_nope, q_pe], dim=-1)
kv_cache = torch.load(kv_cache_path).to(device="cuda")
pages, page_size, _, head_dim = kv_cache.shape
kv_cache = kv_cache.view(pages, page_size, head_dim)
ckv, k_pe = torch.split(kv_cache, [512, 64], dim=-1)
kv_len_arr = torch.tensor([kv_len], dtype=torch.int32, device="cuda")
qo_indptr = torch.tensor([0, q_len], dtype=torch.int32, device="cuda")
wrapper.plan(
None,
None,
None,
kv_len_arr,
128,
512,
64,
page_size,
192 ** (-0.5),
torch.bfloat16,
torch.bfloat16,
)
q_nope_buf.copy_(q_nope)
q_pe_buf.copy_(q_pe)
kv_buf[:pages].copy_(kv_cache)
torch.cuda.synchronize()
graph.replay()
torch.cuda.synchronize()
# ref_torch
k = (
torch.cat([ckv, k_pe], dim=-1)
.view(-1, 1, 512 + 64)
.repeat_interleave(num_heads, dim=1)
)
v = ckv.view(-1, 1, 512).repeat_interleave(num_heads, dim=1)
attn_ref, lse_ref = attention_ref_torch(
max_batch_size,
q,
k[:kv_len],
v[:kv_len],
False,
192 ** (-0.5)
)
torch.testing.assert_close(attn_output, attn_ref, rtol=1e-3, atol=1e-3)
# ref_triton
attn_logits = torch.empty(
(
max_batch_size,
num_heads,
4, # num_kv_splits, follows vLLM; TODO: fix
512 + 1,
),
dtype=torch.float32,
device = "cuda"
)
triton_ref = torch.zeros_like(q_nope)
page_table = torch.arange(max_pages, dtype=torch.int32, device="cuda")
ckv_with_pe = torch.cat([ckv, k_pe], dim=-1).contiguous().view(pages, page_size, 1, 576)
ckv = ckv.view(pages, page_size, 1, 512)
decode_attention_fwd_grouped(q, ckv_with_pe, ckv, triton_ref,
page_table,
kv_len_arr, attn_logits,
4, # num_kv_splits, follows vLLM; TODO: fix
192 ** (-0.5),
page_size)
torch.testing.assert_close(attn_output, triton_ref, rtol=1e-3, atol=1e-3)
#file_name = f"./flashinfer_output/layer_{layer_id}_forward_{forward_id}_attn_output.pt"
#ktrans_output = torch.load(file_name)
#torch.testing.assert_close(attn_output, ktrans_output.squeeze(1), rtol=1e-3, atol=1e-3)
print("test past")

View file

@ -67,7 +67,14 @@ class KMoEGateBase(ABC):
for key in keys:
key = ".".join(key.split(".")[:-1])
if key + ".ffn_gate_inp.weight" in self.gguf_loader.tensor_info:
if self.gguf_loader.safetensor_loader is not None:
targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]
weight = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_gate_inp.weight")
e_score_correction_bias = self.gguf_loader.safetensor_loader.load_tensor(key + ".exp_probs_b.bias")
weight_type = weight.dtype
e_score_correction_bias_type = e_score_correction_bias.dtype
res = {"weight": weight, "e_score_correction_bias": e_score_correction_bias, "weight_type": weight_type, "e_score_correction_bias_type": e_score_correction_bias_type}
elif key + ".ffn_gate_inp.weight" in self.gguf_loader.tensor_info:
targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]
tensors = self.load_multi(key, targets, device=device)
weight = tensors[".ffn_gate_inp.weight"]
@ -93,11 +100,11 @@ class KMoEGate(BaseInjectedModule, KMoEGateBase):
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module = None,
generate_device: str = "cuda",
prefill_device: str = "cuda",
generate_device: str = "cuda",
**kwargs,
):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs)
KMoEGateBase.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
self.generate_device = generate_device
self.prefill_device = prefill_device
@ -116,8 +123,8 @@ class KMoEGate(BaseInjectedModule, KMoEGateBase):
self.orig_module.e_score_correction_bias = nn.Parameter(w["e_score_correction_bias"])
else:
raise ValueError("Invalid weight type")
self.orig_module.weight = self.orig_module.weight.to(device)
self.orig_module.e_score_correction_bias = self.orig_module.e_score_correction_bias.to(device)
self.orig_module.weight = nn.Parameter(self.orig_module.weight.to(device))
self.orig_module.e_score_correction_bias = nn.Parameter(self.orig_module.e_score_correction_bias.to(device))
def unload(self):
if self.weight is not None:

View file

@ -21,10 +21,12 @@ from ktransformers.ktransformers_ext.operators.custom_marlin.quantize.utils.marl
MarlinWorkspace,
marlin_quantize,
GPTQ_MARLIN_MIN_THREAD_N,
GPTQ_MARLIN_MIN_THREAD_K,
GPTQ_MARLIN_MAX_PARALLEL,
)
from ktransformers.operators.base_operator import BaseInjectedModule
from transformers.configuration_utils import PretrainedConfig
from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant
from abc import ABC, abstractmethod
import sys, os
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build"))
@ -54,16 +56,18 @@ class KLinearBase(ABC):
self.has_bias = False
self.dtype = torch.get_default_dtype()
# if orig_module is not None:
# self.in_features = orig_module.in_features
# self.out_features = orig_module.out_features
# else:
if orig_module is not None:
self.in_features = orig_module.in_features
self.out_features = orig_module.out_features
else:
shape = self.gguf_loader.tensor_info[key + ".weight"]["shape"]
if len(shape) == 1:
print("Warning: orig_module is not set, but has in_features or out_features equals to 1, can't get in_features and out_features from GGUF")
self.in_features = self.gguf_loader.tensor_info[key + ".weight"]["shape"][0]
self.out_features = self.gguf_loader.tensor_info[key + ".weight"]["shape"][1]
self.loaded = False # for lm_head pre-load; TODO: use a new way to pre-load lm_head when doing layer-wise prefill.
@abstractmethod
def forward(self, x: torch.Tensor) -> torch.Tensor:
pass
@ -75,7 +79,13 @@ class KLinearBase(ABC):
keys = [self.key]
for key in keys:
if key + ".weight" in self.gguf_loader.tensor_file_map:
if self.gguf_loader.safetensor_loader is not None:
# using safetensor_loader
tensor = self.gguf_loader.safetensor_loader.load_tensor(key+'.weight')
weight_scale_inv = self.gguf_loader.safetensor_loader.load_tensor(key+'.weight_scale_inv')
return nn.Parameter(tensor), nn.Parameter(weight_scale_inv)
elif key + ".weight" in self.gguf_loader.tensor_file_map:
if key + ".bias" in self.gguf_loader.tensor_file_map:
tensors = self.load_multi(key, ["weight", "bias"], device=device)
tensor = tensors["weight"]
@ -119,7 +129,7 @@ class KLinearTorch(KLinearBase):
super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
self.has_bias = False
self.dtype = torch.get_default_dtype()
self.w = None
self.weight = None
self.has_bias = False
def forward(self, x: torch.Tensor) -> torch.Tensor:
@ -127,33 +137,96 @@ class KLinearTorch(KLinearBase):
out_device = x.device
# TODO: support CUDA Graph when using cpu, but CPUInfer is recommended.
x = x.to(device=self.device, dtype=self.dtype)
x = x @ self.w
x = x @ self.weight
if self.has_bias:
x = x + self.bias
x = x.to(dtype=dtype, device=out_device)
return x
def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None):
if self.loaded: return
if device is None: device = self.device
if w is None: w = self.load_weight(device=device)
# else: self.out_features = w.shape[0], self.in_features = w.shape[1]
if isinstance(w, nn.Parameter):
self.w = w.to(dtype=self.dtype).T
try:
self.weight = w.to(dtype=self.dtype).view(self.out_features, self.in_features).T
except:
self.weight = w.to(dtype=self.dtype).T
self.has_bias = False
elif isinstance(w, tuple):
self.w = w[0].to(dtype=self.dtype).T
try:
self.weight = w[0].to(dtype=self.dtype).view(self.out_features, self.in_features).T
except:
self.weight = w[0].to(dtype=self.dtype).T
self.bias = w[1].to(dtype=self.dtype)
self.has_bias = True
else:
raise ValueError("Invalid weight type")
# self.linear = self.linear.to(device)
self.w = self.w.to(device)
self.weight = self.weight.to(device)
if self.has_bias:
self.bias = self.bias.to(device)
self.loaded = True
def unload(self):
if self.weight is not None:
self.weight = None
if self.has_bias:
self.bias = None
class KLinearFP8(KLinearBase):
# this kernel requires special handling for weight
# Please load the weight file downloaded from KVCache.AI
marlin_q_w: torch.Tensor
marlin_s: torch.Tensor
g_idx: torch.Tensor
sort_indices: torch.Tensor
has_bias: bool
weight: torch.Tensor
scale_w: torch.Tensor
bias: torch.Tensor
def __init__(
self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module = None,
device: str = "cuda",
block_size: int = 128,
**kwargs,
):
super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
self.has_bias = False
self.dtype = torch.get_default_dtype()
self.block_size = block_size
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = x.to(self.device)
orig_dtype = x.dtype
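# block-wise FP8 path: act_quant quantizes the activations in blocks of
# block_size and returns the FP8 tensor plus its per-block scales; fp8_gemm then
# multiplies them against the pre-quantized weight and its stored inverse scales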
x_quantized, scale_x = act_quant(x, self.block_size)
y = fp8_gemm(x_quantized, scale_x, self.weight, self.weight_scale_inv)
return y.to(dtype=orig_dtype)
def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None):
if device is None: device = self.device
if w is None:
w = self.load_weight(device=device)
### TODO: adapt to the weight_scale_inv format
if isinstance(w, tuple):
self.weight = w[0].to(device)
self.weight_scale_inv = w[1].to(device)
self.has_bias = False
else:
raise ValueError("Invalid weight type")
self.weight = self.weight.to(device)
if self.has_bias:
self.bias = self.bias.to(device)
def unload(self):
if self.w is not None:
self.w = None
if self.weight is not None:
self.weight = None
if self.has_bias:
self.bias = None
@ -183,19 +256,36 @@ class KLinearMarlin(KLinearBase):
self.group_size = group_size
self.act_order = act_order
self.is_k_full = is_k_full
self.padding = False
self.orin_in_features = self.in_features
self.orin_out_features = self.out_features
if self.in_features%GPTQ_MARLIN_MIN_THREAD_K!=0 or self.out_features%GPTQ_MARLIN_MIN_THREAD_K!=0:
#print(f"warning!, in_features={in_features} or out_features={out_features} is undivisible by GPTQ_MARLIN_MIN_THREAD_K={GPTQ_MARLIN_MIN_THREAD_K} and GPTQ_MARLIN_MIN_THREAD_N={GPTQ_MARLIN_MIN_THREAD_N}, padding")
self.padding = True
self.in_features = (self.in_features+GPTQ_MARLIN_MIN_THREAD_K-1)//GPTQ_MARLIN_MIN_THREAD_K*GPTQ_MARLIN_MIN_THREAD_K
self.out_features = (self.out_features+GPTQ_MARLIN_MIN_THREAD_N-1)//GPTQ_MARLIN_MIN_THREAD_N*GPTQ_MARLIN_MIN_THREAD_N
#print(f"After padding: in_features={in_features}, out_features={out_features}")
self.k = self.in_features
self.n = self.out_features
def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None):
if self.loaded: return
if device is None: device = self.device
assert device.lower() != "cpu", "Marlin quantized linear only supports GPU device"
if w is None: w = self.load_weight(device=device)
#if self.in_features * self.out_features:
if w is None:
w = self.load_weight(device=device)
if isinstance(w, nn.Parameter):
# pad weight
weight = w.view(self.out_features, self.in_features).T
weight = w.view(self.orin_out_features, self.orin_in_features).T
self.has_bias = False
elif isinstance(w, tuple):
w = list(w)
weight = w[0].view(self.out_features, self.in_features).T
weight = w[0].view(self.orin_out_features, self.orin_in_features).T
self.bias = w[1].view(self.orin_out_features)
self.bias = w[1]
self.has_bias = True
else:
@ -203,19 +293,27 @@ class KLinearMarlin(KLinearBase):
weight = weight.to(device)
if self.has_bias:
self.bias = self.bias.to(device)
if self.padding:
padded_weight = torch.zeros(self.in_features, self.out_features, device=self.device)
padded_weight[:self.orin_in_features, :self.orin_out_features] = weight
weight = padded_weight
# Pack Marlin linear
w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, _ = marlin_quantize(
marlin_q_w, marlin_s, g_idx, sort_indices, _ = marlin_quantize(
weight, self.num_bits, self.group_size, self.act_order
)
self.workspace = MarlinWorkspace(
self.out_features, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL,self.device
)
self.weight = marlin_q_w # modeling_xxx.py may use linear.weight
self.marlin_q_w = marlin_q_w
self.marlin_s = marlin_s
self.g_idx = g_idx
self.sort_indices = sort_indices
self.k = weight.shape[0]
self.n = weight.shape[1]
self.loaded = True
def forward(self, x: torch.Tensor) -> torch.Tensor:
# Only BF16 and FP16 inputs are supported
@ -223,6 +321,11 @@ class KLinearMarlin(KLinearBase):
orig_shape = list(x.shape)
orig_dtype = x.dtype
x = x.reshape(-1, orig_shape[-1])
x = x.reshape(-1, x.shape[-1])
if self.padding:
padding_input=torch.empty(x.shape[0], self.in_features, device=x.device, dtype=x.dtype)
padding_input[:,:self.orin_in_features] = x
x = padding_input
marlin_s = self.marlin_s.to(x.dtype)
x = KTransformersOps.gptq_marlin_gemm(
x,
@ -237,9 +340,13 @@ class KLinearMarlin(KLinearBase):
x.shape[-1],
self.is_k_full,
)
if self.padding:
x = x[:,:self.orin_out_features]
orig_shape[-1] = self.orin_out_features
else:
orig_shape[-1] = self.out_features
if self.has_bias:
x = x + self.bias
orig_shape[-1] = self.n
return x.reshape(orig_shape).to(orig_dtype)
def unload(self):
@ -357,7 +464,8 @@ class KLinearCPUInfer(KLinearBase):
LINEAR_MAP = {
"KLinearMarlin": KLinearMarlin,
"KLinearTorch": KLinearTorch,
"KLinearCPUInfer": KLinearCPUInfer
"KLinearCPUInfer": KLinearCPUInfer,
"KLinearFP8": KLinearFP8,
}
class KTransformersLinear(BaseInjectedModule, KLinearBase):
@ -374,28 +482,17 @@ class KTransformersLinear(BaseInjectedModule, KLinearBase):
prefill_op: str| None = "KLinearTorch",
**kwargs,
):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs)
KLinearBase.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
# build all the linear operators
if prefill_op is not None:
assert prefill_op in LINEAR_MAP, f"linear_type {prefill_op} not supported"
if prefill_op == "KLinearMarlin" and (orig_module.in_features%GPTQ_MARLIN_MIN_THREAD_N!=0 or orig_module.out_features%GPTQ_MARLIN_MIN_THREAD_N!=0):
print(f"This linear module's in_features or out_features is not divisible by GPTQ_MARLIN_MIN_THREAD_N({GPTQ_MARLIN_MIN_THREAD_N}), using KLinearTorch instead.")
print(f"module info: key:{key} orig_module:{orig_module}")
self.prefill_linear = KLinearTorch(key, gguf_loader, config, orig_module, prefill_device, **kwargs)
else:
self.prefill_linear = LINEAR_MAP[prefill_op](key, gguf_loader, config, orig_module, prefill_device, **kwargs)
else:
self.prefill_linear = None
if generate_op is not None:
assert generate_op in LINEAR_MAP, f"linear_type {generate_op} not supported"
if generate_op == "KLinearMarlin" and (orig_module.in_features%GPTQ_MARLIN_MIN_THREAD_N!=0 or orig_module.out_features%GPTQ_MARLIN_MIN_THREAD_N!=0):
print(f"This linear module's in_features or out_features is not divisible by GPTQ_MARLIN_MIN_THREAD_N({GPTQ_MARLIN_MIN_THREAD_N}), using KLinearTorch instead.")
print(f"module info: key:{key} orig_module:{orig_module}")
self.generate_op = "KLinearTorch"
self.generate_linear = KLinearTorch(key, gguf_loader, config, orig_module, generate_device, **kwargs)
else:
self.generate_linear = LINEAR_MAP[generate_op](key, gguf_loader, config, orig_module, generate_device, **kwargs)
else:
self.generate_linear = None
@ -404,10 +501,11 @@ class KTransformersLinear(BaseInjectedModule, KLinearBase):
def forward(self, x):
if self.mode == InferenceState.PREFILL:
assert self.prefill_linear is not None, "cpu linear is not initialized"
return self.prefill_linear.forward(x)
y = self.prefill_linear.forward(x)
else:
assert self.generate_linear is not None, "gpu linear is not initialized"
return self.generate_linear.forward(x)
y = self.generate_linear.forward(x)
return y
def load(self, w: dict | nn.Parameter | tuple | None = None, mode: InferenceState = InferenceState.GENERATE):
if not mode:
@ -417,10 +515,12 @@ class KTransformersLinear(BaseInjectedModule, KLinearBase):
self.generate_linear.unload()
self.prefill_linear.load(w=w)
self.device = self.prefill_linear.device
self.weight = self.prefill_linear.weight # modeling_xxx.py may use linear.weight
elif mode == InferenceState.GENERATE:
self.prefill_linear.unload()
self.generate_linear.load(w=w)
self.device = self.generate_linear.device
self.weight = self.generate_linear.weight # modeling_xxx.py may use linear.weight
elif mode == InferenceState.UNLOAD:
self.prefill_linear.unload()
self.generate_linear.unload()

View file

@ -56,7 +56,7 @@ from ktransformers.models.modeling_deepseek import (
from transformers.models.qwen2_moe.configuration_qwen2_moe import Qwen2MoeConfig
from ktransformers.models.configuration_llama import LlamaConfig
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.utils import InferenceState
from ktransformers.util.utils import InferenceState, get_compute_capability
from ktransformers.util.custom_gguf import GGUFLoader
from transformers.configuration_utils import PretrainedConfig
from ktransformers.models.modeling_llama import (
@ -649,9 +649,14 @@ class KDeepseekV2Model(BaseInjectedModule):
if per_layer_prefill_flag:
causal_mask = None
else:
if os.name == 'nt' or get_compute_capability()<8:
print("for Windows or GPU before ampere, use forward_windows")
# only use mask in forward windows or can't flash attn
causal_mask = self._update_causal_mask(
attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
)
else:
causal_mask = None
# embed positions
hidden_states = inputs_embeds

View file

@ -0,0 +1,385 @@
# Adapted from
# https://github.com/sgl-project/sglang/blob/9f635ea50de920aa507f486daafba26a5b837574/python/sglang/srt/layers/attention/triton_ops/decode_attention.py
# which was originally adapted from
# https://github.com/ModelTC/lightllm/blob/96353e868a840db4d103138caf15ed9dbea8c186/lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding_stage1.py
# https://github.com/ModelTC/lightllm/blob/96353e868a840db4d103138caf15ed9dbea8c186/lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding_stage2.py
import triton
import triton.language as tl
@triton.jit
def tanh(x):
# Tanh is just a scaled sigmoid
return 2 * tl.sigmoid(2 * x) - 1
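# The decode path below uses the usual split-KV ("flash decoding") scheme:
# stage 1 (_fwd_grouped_kernel_stage1) has each program attend to one slice of
# the paged KV cache, keeping a running max/sum for a numerically stable softmax
# and storing partial outputs plus their log-sum-exp per split; stage 2
# (_fwd_kernel_stage2) then merges the splits into the final output.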
@triton.jit
def _fwd_grouped_kernel_stage1(
Q,
K_Buffer,
V_Buffer,
sm_scale,
Req_to_tokens,
B_Seqlen,
Att_Out,
stride_req_to_tokens_b,
stride_qbs,
stride_qh,
stride_buf_kbs,
stride_buf_kh,
stride_buf_vbs,
stride_buf_vh,
stride_mid_ob,
stride_mid_oh,
stride_mid_os,
kv_group_num: tl.constexpr,
q_head_num: tl.constexpr,
BLOCK_DMODEL: tl.constexpr,
BLOCK_DPE: tl.constexpr,
BLOCK_DV: tl.constexpr,
BLOCK_N: tl.constexpr,
BLOCK_H: tl.constexpr,
NUM_KV_SPLITS: tl.constexpr,
PAGE_SIZE: tl.constexpr,
logit_cap: tl.constexpr,
Lk: tl.constexpr,
Lv: tl.constexpr,
):
cur_batch = tl.program_id(0)
cur_head_id = tl.program_id(1)
cur_kv_head = cur_head_id // tl.cdiv(kv_group_num, BLOCK_H)
split_kv_id = tl.program_id(2)
if kv_group_num > BLOCK_H:
VALID_BLOCK_H: tl.constexpr = BLOCK_H
else:
VALID_BLOCK_H: tl.constexpr = kv_group_num
cur_head = cur_head_id * VALID_BLOCK_H + tl.arange(0, BLOCK_H)
mask_h = cur_head < (cur_head_id + 1) * VALID_BLOCK_H
mask_h = mask_h & (cur_head < q_head_num)
offs_d = tl.arange(0, BLOCK_DMODEL)
offs_dv = tl.arange(0, BLOCK_DV)
mask_d = offs_d < Lk
mask_dv = offs_dv < Lv
cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
cur_batch_req_idx = cur_batch
offs_q = cur_batch * stride_qbs + cur_head[:, None] * stride_qh + offs_d[
None, :]
q = tl.load(Q + offs_q,
mask=(mask_h[:, None]) & (mask_d[None, :]),
other=0.0)
if BLOCK_DPE > 0:
offs_dpe = BLOCK_DMODEL + tl.arange(0, BLOCK_DPE)
mask_dpe = offs_dpe < Lk
off_qpe = (cur_batch * stride_qbs + cur_head[:, None] * stride_qh +
offs_dpe[None, :])
qpe = tl.load(Q + off_qpe,
mask=(mask_h[:, None]) & (mask_dpe[None, :]),
other=0.0)
kv_len_per_split = tl.cdiv(cur_batch_seq_len, NUM_KV_SPLITS)
split_kv_start = kv_len_per_split * split_kv_id
split_kv_end = tl.minimum(split_kv_start + kv_len_per_split,
cur_batch_seq_len)
e_max = tl.zeros([BLOCK_H], dtype=tl.float32) - float("inf")
e_sum = tl.zeros([BLOCK_H], dtype=tl.float32)
acc = tl.zeros([BLOCK_H, BLOCK_DV], dtype=tl.float32)
if split_kv_end > split_kv_start:
for start_n in range(split_kv_start, split_kv_end, BLOCK_N):
offs_n = start_n + tl.arange(0, BLOCK_N)
kv_page_number = tl.load(
Req_to_tokens + stride_req_to_tokens_b * cur_batch_req_idx +
offs_n // PAGE_SIZE,
mask=offs_n < split_kv_end,
other=0,
)
kv_loc = kv_page_number * PAGE_SIZE + offs_n % PAGE_SIZE
offs_buf_k = (kv_loc[None, :] * stride_buf_kbs +
cur_kv_head * stride_buf_kh + offs_d[:, None])
k = tl.load(
K_Buffer + offs_buf_k,
mask=(offs_n[None, :] < split_kv_end) & (mask_d[:, None]),
other=0.0,
)
qk = tl.dot(q, k.to(q.dtype))
if BLOCK_DPE > 0:
offs_buf_kpe = (kv_loc[None, :] * stride_buf_kbs +
cur_kv_head * stride_buf_kh +
offs_dpe[:, None])
kpe = tl.load(
K_Buffer + offs_buf_kpe,
mask=(offs_n[None, :] < split_kv_end) &
(mask_dpe[:, None]),
other=0.0,
)
qk += tl.dot(qpe, kpe.to(qpe.dtype))
qk *= sm_scale
if logit_cap > 0:
qk = logit_cap * tanh(qk / logit_cap)
qk = tl.where(mask_h[:, None] & (offs_n[None, :] < split_kv_end),
qk, float("-inf"))
offs_buf_v = (kv_loc[:, None] * stride_buf_vbs +
cur_kv_head * stride_buf_vh + offs_dv[None, :])
v = tl.load(
V_Buffer + offs_buf_v,
mask=(offs_n[:, None] < split_kv_end) & (mask_dv[None, :]),
other=0.0,
)
n_e_max = tl.maximum(tl.max(qk, 1), e_max)
re_scale = tl.exp(e_max - n_e_max)
p = tl.exp(qk - n_e_max[:, None])
acc *= re_scale[:, None]
acc += tl.dot(p.to(v.dtype), v)
e_sum = e_sum * re_scale + tl.sum(p, 1)
e_max = n_e_max
offs_mid_o = (cur_batch * stride_mid_ob +
cur_head[:, None] * stride_mid_oh +
split_kv_id * stride_mid_os + offs_dv[None, :])
tl.store(
Att_Out + offs_mid_o,
acc / e_sum[:, None],
mask=(mask_h[:, None]) & (mask_dv[None, :]),
)
offs_mid_o_1 = (cur_batch * stride_mid_ob + cur_head * stride_mid_oh +
split_kv_id * stride_mid_os + Lv)
tl.store(
Att_Out + offs_mid_o_1,
e_max + tl.log(e_sum),
mask=mask_h,
)
def _decode_grouped_att_m_fwd(
q,
k_buffer,
v_buffer,
att_out,
Req_to_tokens,
B_Seqlen,
num_kv_splits,
sm_scale,
page_size,
logit_cap,
):
BLOCK = 32
Lk = k_buffer.shape[-1]
Lv = v_buffer.shape[-1]
# [TODO] work around shmem limit on MI3xx
# TODO: support hip
#if is_hip_ and Lk >= 576:
# BLOCK = 16
if Lk == 576:
BLOCK_DMODEL = 512
BLOCK_DPE = 64
elif Lk == 288:
BLOCK_DMODEL = 256
BLOCK_DPE = 32
else:
BLOCK_DMODEL = triton.next_power_of_2(Lk)
BLOCK_DPE = 0
BLOCK_DV = triton.next_power_of_2(Lv)
batch, head_num = q.shape[0], q.shape[1]
kv_group_num = q.shape[1] // k_buffer.shape[-2]
BLOCK_H = 16
NUM_KV_SPLITS = num_kv_splits
grid = (
batch,
triton.cdiv(head_num, min(BLOCK_H, kv_group_num)),
NUM_KV_SPLITS,
)
extra_kargs = {}
# TODO: support hip
"""
if is_hip_:
# https://rocm.docs.amd.com/en/docs-6.2.0/how-to/llm-fine-tuning-optimization/optimizing-triton-kernel.html
# https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py
extra_kargs = {
"waves_per_eu": 4,
"matrix_instr_nonkdim": 16,
"kpack": 2
}
"""
_fwd_grouped_kernel_stage1[grid](
q,
k_buffer,
v_buffer,
sm_scale,
Req_to_tokens,
B_Seqlen,
att_out,
Req_to_tokens.stride(0),
q.stride(0),
q.stride(1),
k_buffer.stride(-3), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM)
k_buffer.stride(-2), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM)
v_buffer.stride(-3), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM)
v_buffer.stride(-2), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM)
att_out.stride(0),
att_out.stride(1),
att_out.stride(2),
kv_group_num=kv_group_num,
q_head_num=head_num,
BLOCK_DMODEL=BLOCK_DMODEL,
BLOCK_DPE=BLOCK_DPE,
BLOCK_DV=BLOCK_DV,
BLOCK_N=BLOCK,
BLOCK_H=BLOCK_H,
NUM_KV_SPLITS=NUM_KV_SPLITS,
PAGE_SIZE=page_size,
logit_cap=logit_cap,
num_warps=4,
num_stages=2,
Lk=Lk,
Lv=Lv,
**extra_kargs,
)
@triton.jit
def _fwd_kernel_stage2(
Mid_O,
o,
B_Seqlen,
stride_mid_ob,
stride_mid_oh,
stride_mid_os,
stride_obs,
stride_oh,
NUM_KV_SPLITS: tl.constexpr,
BLOCK_DV: tl.constexpr,
Lv: tl.constexpr,
):
cur_batch = tl.program_id(0)
cur_head = tl.program_id(1)
cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
offs_d = tl.arange(0, BLOCK_DV)
mask_d = offs_d < Lv
e_sum = 0.0
e_max = -float("inf")
acc = tl.zeros([BLOCK_DV], dtype=tl.float32)
offs_v = cur_batch * stride_mid_ob + cur_head * stride_mid_oh + offs_d
offs_logic = cur_batch * stride_mid_ob + cur_head * stride_mid_oh + Lv
for split_kv_id in range(0, NUM_KV_SPLITS):
kv_len_per_split = tl.cdiv(cur_batch_seq_len, NUM_KV_SPLITS)
split_kv_start = kv_len_per_split * split_kv_id
split_kv_end = tl.minimum(split_kv_start + kv_len_per_split,
cur_batch_seq_len)
if split_kv_end > split_kv_start:
tv = tl.load(Mid_O + offs_v + split_kv_id * stride_mid_os,
mask=mask_d,
other=0.0)
tlogic = tl.load(Mid_O + offs_logic + split_kv_id * stride_mid_os)
n_e_max = tl.maximum(tlogic, e_max)
old_scale = tl.exp(e_max - n_e_max)
acc *= old_scale
exp_logic = tl.exp(tlogic - n_e_max)
acc += exp_logic * tv
e_sum = e_sum * old_scale + exp_logic
e_max = n_e_max
tl.store(
o + cur_batch * stride_obs + cur_head * stride_oh + offs_d,
acc / e_sum,
mask=mask_d,
)
def _decode_softmax_reducev_fwd(
logits,
q,
o,
v_buffer,
b_seq_len,
num_kv_splits,
):
batch, head_num = q.shape[0], q.shape[1]
Lv = v_buffer.shape[-1]
BLOCK_DV = triton.next_power_of_2(Lv)
NUM_KV_SPLITS = num_kv_splits
extra_kargs = {}
# TODO: support hip
"""
if is_hip_:
# https://rocm.docs.amd.com/en/docs-6.2.0/how-to/llm-fine-tuning-optimization/optimizing-triton-kernel.html
# https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py
extra_kargs = {
"waves_per_eu": 4,
"matrix_instr_nonkdim": 16,
"kpack": 2
}
"""
grid = (batch, head_num)
_fwd_kernel_stage2[grid](
logits,
o,
b_seq_len,
logits.stride(0),
logits.stride(1),
logits.stride(2),
o.stride(0),
o.stride(1),
NUM_KV_SPLITS=NUM_KV_SPLITS,
BLOCK_DV=BLOCK_DV,
Lv=Lv,
num_warps=4,
num_stages=2,
**extra_kargs,
)
def decode_attention_fwd_grouped(
q,
k_buffer,
v_buffer,
o,
req_to_token,
b_seq_len,
attn_logits,
num_kv_splits,
sm_scale,
page_size,
logit_cap=0.0,
):
_decode_grouped_att_m_fwd(
q,
k_buffer,
v_buffer,
attn_logits,
req_to_token,
b_seq_len,
num_kv_splits,
sm_scale,
page_size,
logit_cap,
)
_decode_softmax_reducev_fwd(attn_logits, q, o, v_buffer, b_seq_len,
num_kv_splits)
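# Shape sketch for decode_attention_fwd_grouped, inferred from the kernels above
# (not a documented API): q is (batch, num_q_heads, head_dim); k_buffer/v_buffer
# are paged caches laid out as (..., page_size, num_kv_heads, head_dim); o matches
# q but with the value head_dim; attn_logits is a float32 scratch buffer of shape
# (batch, num_q_heads, num_kv_splits, value_head_dim + 1) holding partial outputs
# and their log-sum-exp; req_to_token maps each sequence to its page ids and
# b_seq_len holds the per-sequence KV lengths.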

View file

@ -126,6 +126,8 @@ def optimize_and_load_gguf(module: nn.Module, rule_file: str, gguf_path: str, mo
gguf_loader=GGUFLoader(gguf_path)
with torch.device("meta"):
inject(module, optimize_config, model_config, gguf_loader)
# pre-load lm_head because of its large intermediate result
load_weights(module.lm_head, gguf_loader, "lm_head.")
load_weights(module, gguf_loader)
module.gguf_loader = gguf_loader
del_meta(module)

View file

@ -219,8 +219,20 @@
kwargs:
generate_device: "cuda:2"
prefill_device: "cuda:2"
- match:
name: "(^model\\.layers\\.([5][0-9]|[4][5-9])\\.)|(^model.norm)|(^lm_head)"
name: "^lm_head"
class: torch.nn.Linear
replace:
class: ktransformers.operators.linear.KTransformersLinear
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
- match:
name: "(^model\\.layers\\.([5][0-9]|[4][5-9])\\.)|(^model.norm)"
replace:
class: "default"
kwargs:

View file

@ -118,7 +118,18 @@
prefill_device: "cuda:0"
- match:
name: "(^model\\.layers\\.([345][0-9])\\.)|(model.norm)|(lm_head)"
name: "^lm_head"
class: torch.nn.Linear
replace:
class: ktransformers.operators.linear.KTransformersLinear
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
- match:
name: "(^model\\.layers\\.([345][0-9])\\.)|(model.norm)"
replace:
class: "default"
kwargs:

View file

@ -15,6 +15,18 @@
prefill_device: "cuda"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
- match:
name: "^lm_head"
class: torch.nn.Linear
replace:
class: ktransformers.operators.linear.KTransformersLinear
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
- match:
name: "^model\\.layers\\..*\\.mlp$"
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE

View file

@ -118,7 +118,18 @@
prefill_device: "cuda:0"
- match:
name: "(^model\\.layers\\.([12][0-9])\\.)|(model.norm)|(lm_head)"
name: "^lm_head"
class: torch.nn.Linear
replace:
class: ktransformers.operators.linear.KTransformersLinear
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
- match:
name: "(^model\\.layers\\.([12][0-9])\\.)|(model.norm)"
replace:
class: "default"
kwargs:

View file

@ -15,6 +15,18 @@
prefill_device: "cuda"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
- match:
name: "^lm_head"
class: torch.nn.Linear
replace:
class: ktransformers.operators.linear.KTransformersLinear
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
- match:
name: "^model\\.layers\\..*\\.mlp$"
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE

View file

@ -0,0 +1,63 @@
- match:
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- match:
name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
generate_op: "KLinearFP8"
prefill_op: "KLinearTorch"
- match:
name: "^model\\.layers\\..*\\.mlp$"
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
replace:
class: ktransformers.operators.experts.KDeepseekV3MoE # mlp module with custom forward function
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- match:
class: ktransformers.models.modeling_deepseek_v3.MoEGate
replace:
class: ktransformers.operators.gate.KMoEGate
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "^model\\.layers\\..*\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert parallelism
kwargs:
prefill_device: "cuda"
prefill_op: "KExpertsTorch"
generate_device: "cpu"
generate_op: "KExpertsCPU"
out_device: "cuda"
recursive: False # don't recursively inject submodules of this module
- match:
name: "^model\\.layers\\..*\\.self_attn$"
replace:
class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- match:
name: "^model$"
replace:
class: "ktransformers.operators.models.KDeepseekV2Model"
kwargs:
per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
- match:
name: "^model.embed_tokens"
replace:
class: "default"
kwargs:
generate_device: "cpu"
prefill_device: "cpu"

View file

@ -0,0 +1,388 @@
- match:
name: "^model.embed_tokens"
replace:
class: "default"
kwargs:
generate_device: "cpu"
prefill_device: "cpu"
# === Rotary Embedding Replacement ===
# GPU 0: layers 0–14
- match:
name: "^model\\.layers\\.([0-9]|1[0-4])\\."
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
# GPU 1: layers 15–29
- match:
name: "^model\\.layers\\.(1[5-9]|2[0-9])\\."
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
# GPU 2: layers 30–44
- match:
name: "^model\\.layers\\.(3[0-9]|4[0-4])\\."
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
kwargs:
generate_device: "cuda:2"
prefill_device: "cuda:2"
# GPU 3: layers 45–60
- match:
name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\."
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"
# === Linear Layers Replacement (excluding self_attn.kv_b_proj) ===
# GPU 0: layers 0–14
- match:
name: "^model\\.layers\\.([0-9]|1[0-4])\\.(?!self_attn\\.kv_b_proj).*$"
class: torch.nn.Linear
replace:
class: ktransformers.operators.linear.KTransformersLinear
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
# GPU 1: layers 15–29
- match:
name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.(?!self_attn\\.kv_b_proj).*$"
class: torch.nn.Linear
replace:
class: ktransformers.operators.linear.KTransformersLinear
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
# GPU 2: layers 30–44
- match:
name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.(?!self_attn\\.kv_b_proj).*$"
class: torch.nn.Linear
replace:
class: ktransformers.operators.linear.KTransformersLinear
kwargs:
generate_device: "cuda:2"
prefill_device: "cuda:2"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
# GPU 3: layers 45–60
- match:
name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.(?!self_attn\\.kv_b_proj).*$"
class: torch.nn.Linear
replace:
class: ktransformers.operators.linear.KTransformersLinear
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
# === MLP (MoE) Replacement ===
# GPU 0: layers 0–14
- match:
name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp$"
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
replace:
class: ktransformers.operators.experts.KDeepseekV3MoE
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
# GPU 1: layers 15–29
- match:
name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp$"
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
replace:
class: ktransformers.operators.experts.KDeepseekV3MoE
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
# GPU 2: layers 30–44
- match:
name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp$"
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
replace:
class: ktransformers.operators.experts.KDeepseekV3MoE
kwargs:
generate_device: "cuda:2"
prefill_device: "cuda:2"
# GPU 3: layers 45–60
- match:
name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp$"
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
replace:
class: ktransformers.operators.experts.KDeepseekV3MoE
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"
# === MLP Gate Replacement ===
# GPU 0: layers 0–14
- match:
name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp\\.gate$"
class: ktransformers.models.modeling_deepseek_v3.MoEGate
replace:
class: ktransformers.operators.gate.KMoEGate
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
# GPU 1: layers 15–29
- match:
name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp\\.gate$"
class: ktransformers.models.modeling_deepseek_v3.MoEGate
replace:
class: ktransformers.operators.gate.KMoEGate
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
# GPU 2: layers 30–44
- match:
name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp\\.gate$"
class: ktransformers.models.modeling_deepseek_v3.MoEGate
replace:
class: ktransformers.operators.gate.KMoEGate
kwargs:
generate_device: "cuda:2"
prefill_device: "cuda:2"
# GPU 3: layers 45–60
- match:
name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp\\.gate$"
class: ktransformers.models.modeling_deepseek_v3.MoEGate
replace:
class: ktransformers.operators.gate.KMoEGate
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"
# === MLP Experts Replacement ===
# To use Marlin experts instead, uncomment the rules below and adjust the layer numbers as needed.
# Each layer of Marlin experts takes about 6 GB of GPU memory.
# !!! Remember to disable CUDA graph when using Marlin experts. !!!
# !!! KExpertsTorch is untested; we don't have enough VRAM. !!!
# GPU 0: layers 3–4
# - match:
# name: "^model\\.layers\\.([3-4])\\.mlp\\.experts$"
# replace:
# class: ktransformers.operators.experts.KTransformersExperts
# kwargs:
# generate_device: "cuda:0"
# generate_op: "KExpertsMarlin"
# recursive: False
# # GPU 1: layers 15–17
# - match:
# name: "^model\\.layers\\.(1[5-7])\\.mlp\\.experts$"
# replace:
# class: ktransformers.operators.experts.KTransformersExperts
# kwargs:
# generate_device: "cuda:1"
# generate_op: "KExpertsMarlin"
# recursive: False
# # GPU 2: layers 30–32
# - match:
# name: "^model\\.layers\\.(3[0-2])\\.mlp\\.experts$"
# replace:
# class: ktransformers.operators.experts.KTransformersExperts
# kwargs:
# generate_device: "cuda:2"
# generate_op: "KExpertsMarlin"
# recursive: False
# # GPU 3: layers 45–46
# - match:
# name: "^model\\.layers\\.(4[5-6])\\.mlp\\.experts$"
# replace:
# class: ktransformers.operators.experts.KTransformersExperts
# kwargs:
# generate_device: "cuda:3"
# generate_op: "KExpertsMarlin"
# recursive: False
# === MLP Experts Replacement ===
# GPU 0: layers 0–14
- match:
name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersExperts
kwargs:
prefill_device: "cuda:0"
prefill_op: "KExpertsTorch"
generate_device: "cpu"
generate_op: "KExpertsCPU"
out_device: "cuda:0"
recursive: False
# GPU 1: layers 15–29
- match:
name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersExperts
kwargs:
prefill_device: "cuda:1"
prefill_op: "KExpertsTorch"
generate_device: "cpu"
generate_op: "KExpertsCPU"
out_device: "cuda:1"
recursive: False
# GPU 2: layers 30–44
- match:
name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersExperts
kwargs:
prefill_device: "cuda:2"
prefill_op: "KExpertsTorch"
generate_device: "cpu"
generate_op: "KExpertsCPU"
out_device: "cuda:2"
recursive: False
# GPU 3: layers 45–60
- match:
name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersExperts
kwargs:
prefill_device: "cuda:3"
prefill_op: "KExpertsTorch"
generate_device: "cpu"
generate_op: "KExpertsCPU"
out_device: "cuda:3"
recursive: False
# === Self-Attention Replacement ===
# GPU 0: layers 0–14
- match:
name: "^model\\.layers\\.([0-9]|1[0-4])\\.self_attn$"
replace:
class: ktransformers.operators.attention.KDeepseekV2Attention
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
absorb_for_prefill: False
# GPU 1: layers 15–29
- match:
name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.self_attn$"
replace:
class: ktransformers.operators.attention.KDeepseekV2Attention
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
absorb_for_prefill: False
# GPU 2: layers 30–44
- match:
name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.self_attn$"
replace:
class: ktransformers.operators.attention.KDeepseekV2Attention
kwargs:
generate_device: "cuda:2"
prefill_device: "cuda:2"
absorb_for_prefill: False
# GPU 3: layers 45–60
- match:
name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.self_attn$"
replace:
class: ktransformers.operators.attention.KDeepseekV2Attention
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"
absorb_for_prefill: False
# === Overall Model Replacement with Transfer Map ===
- match:
name: "^model$"
replace:
class: "ktransformers.operators.models.KDeepseekV2Model"
kwargs:
per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
transfer_map:
15: "cuda:1" # Layers 15+ on GPU 1
30: "cuda:2" # Layers 30+ on GPU 2
45: "cuda:3" # Layers 45+ on GPU 3
# === Default Catch-All for Other Modules ===
# GPU 0: layers 0–14
- match:
name: "^model\\.layers\\.([0-9]|1[0-4])\\."
replace:
class: "default"
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
# GPU 1: layers 15–29
- match:
name: "^model\\.layers\\.(1[5-9]|2[0-9])\\."
replace:
class: "default"
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
# GPU 2: layers 30–44
- match:
name: "^model\\.layers\\.(3[0-9]|4[0-4])\\."
replace:
class: "default"
kwargs:
generate_device: "cuda:2"
prefill_device: "cuda:2"
- match:
name: "^lm_head"
class: torch.nn.Linear
replace:
class: ktransformers.operators.linear.KTransformersLinear
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
# Keep the final modules (model.norm and the last decoder layers) on GPU 3
- match:
name: "(^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.)|(^model\\.norm)"
replace:
class: "default"
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"

View file

@ -0,0 +1,734 @@
- match:
name: "^model.embed_tokens"
replace:
class: "default"
kwargs:
generate_device: "cpu"
prefill_device: "cpu"
# === Rotary Embedding Replacement ===
# GPU 0: layers 0–7
- match:
name: "^model\\.layers\\.([0-7])\\."
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
# GPU 1: layers 8–15
- match:
name: "^model\\.layers\\.(8|9|1[0-5])\\."
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
# GPU 2: layers 16–23
- match:
name: "^model\\.layers\\.(1[6-9]|2[0-3])\\."
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
kwargs:
generate_device: "cuda:2"
prefill_device: "cuda:2"
# GPU 3: layers 24–31
- match:
name: "^model\\.layers\\.(2[4-9]|3[0-1])\\."
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"
# GPU 4: layers 32–39
- match:
name: "^model\\.layers\\.([3][2-9])\\."
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
kwargs:
generate_device: "cuda:4"
prefill_device: "cuda:4"
# GPU 5: layers 40–47
- match:
name: "^model\\.layers\\.(4[0-7])\\."
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
kwargs:
generate_device: "cuda:5"
prefill_device: "cuda:5"
# GPU 6: layers 48–55
- match:
name: "^model\\.layers\\.(4[8-9]|5[0-5])\\."
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
kwargs:
generate_device: "cuda:6"
prefill_device: "cuda:6"
# GPU 7: layers 56–60
- match:
name: "^model\\.layers\\.(5[6-9]|60)\\."
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
kwargs:
generate_device: "cuda:7"
prefill_device: "cuda:7"
# === Linear Layers Replacement (excluding self_attn.kv_b_proj) ===
# GPU 0: layers 0–7
- match:
name: "^model\\.layers\\.([0-7])\\.(?!self_attn\\.kv_b_proj).*$"
class: torch.nn.Linear
replace:
class: ktransformers.operators.linear.KTransformersLinear
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
# GPU 1: layers 8–15
- match:
name: "^model\\.layers\\.(8|9|1[0-5])\\.(?!self_attn\\.kv_b_proj).*$"
class: torch.nn.Linear
replace:
class: ktransformers.operators.linear.KTransformersLinear
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
# GPU 2: layers 16–23
- match:
name: "^model\\.layers\\.(1[6-9]|2[0-3])\\.(?!self_attn\\.kv_b_proj).*$"
class: torch.nn.Linear
replace:
class: ktransformers.operators.linear.KTransformersLinear
kwargs:
generate_device: "cuda:2"
prefill_device: "cuda:2"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
# GPU 3: layers 24–31
- match:
name: "^model\\.layers\\.(2[4-9]|3[0-1])\\.(?!self_attn\\.kv_b_proj).*$"
class: torch.nn.Linear
replace:
class: ktransformers.operators.linear.KTransformersLinear
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
# GPU 4: layers 32–39
- match:
name: "^model\\.layers\\.(3[2-9])\\.(?!self_attn\\.kv_b_proj).*$"
class: torch.nn.Linear
replace:
class: ktransformers.operators.linear.KTransformersLinear
kwargs:
generate_device: "cuda:4"
prefill_device: "cuda:4"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
# GPU 5: layers 40-47
- match:
name: "^model\\.layers\\.(4[0-7])\\.(?!self_attn\\.kv_b_proj).*$"
class: torch.nn.Linear
replace:
class: ktransformers.operators.linear.KTransformersLinear
kwargs:
generate_device: "cuda:5"
prefill_device: "cuda:5"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
# GPU 6: layers 48-55
- match:
name: "^model\\.layers\\.(4[8-9]|5[0-5])\\.(?!self_attn\\.kv_b_proj).*$"
class: torch.nn.Linear
replace:
class: ktransformers.operators.linear.KTransformersLinear
kwargs:
generate_device: "cuda:6"
prefill_device: "cuda:6"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
# GPU 7: layers 56-60
- match:
name: "^model\\.layers\\.(5[6-9]|60)\\.(?!self_attn\\.kv_b_proj).*$"
class: torch.nn.Linear
replace:
class: ktransformers.operators.linear.KTransformersLinear
kwargs:
generate_device: "cuda:7"
prefill_device: "cuda:7"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
# === MLP (MoE) Replacement ===
# GPU 0: layers 0-7
- match:
name: "^model\\.layers\\.([0-7])\\.mlp$"
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
replace:
class: ktransformers.operators.experts.KDeepseekV3MoE
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
# GPU 1: layers 8-15
- match:
name: "^model\\.layers\\.(8|9|1[0-5])\\.mlp$"
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
replace:
class: ktransformers.operators.experts.KDeepseekV3MoE
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
# GPU 2: layers 16-23
- match:
name: "^model\\.layers\\.(1[6-9]|2[0-3])\\.mlp$"
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
replace:
class: ktransformers.operators.experts.KDeepseekV3MoE
kwargs:
generate_device: "cuda:2"
prefill_device: "cuda:2"
# GPU 3: layers 24-31
- match:
name: "^model\\.layers\\.(2[4-9]|3[0-1])\\.mlp$"
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
replace:
class: ktransformers.operators.experts.KDeepseekV3MoE
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"
# GPU 4: layers 32-39
- match:
name: "^model\\.layers\\.(3[2-9])\\.mlp$"
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
replace:
class: ktransformers.operators.experts.KDeepseekV3MoE
kwargs:
generate_device: "cuda:4"
prefill_device: "cuda:4"
# GPU 5: layers 40-47
- match:
name: "^model\\.layers\\.(4[0-7])\\.mlp$"
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
replace:
class: ktransformers.operators.experts.KDeepseekV3MoE
kwargs:
generate_device: "cuda:5"
prefill_device: "cuda:5"
# GPU 6: layers 48-55
- match:
name: "^model\\.layers\\.(4[8-9]|5[0-5])\\.mlp$"
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
replace:
class: ktransformers.operators.experts.KDeepseekV3MoE
kwargs:
generate_device: "cuda:6"
prefill_device: "cuda:6"
# GPU 7: layers 56-60
- match:
name: "^model\\.layers\\.(5[6-9]|60)\\.mlp$"
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
replace:
class: ktransformers.operators.experts.KDeepseekV3MoE
kwargs:
generate_device: "cuda:7"
prefill_device: "cuda:7"
# === MLP Gate Replacement ===
# GPU 0: layers 0-7
- match:
name: "^model\\.layers\\.([0-7])\\.mlp\\.gate$"
class: ktransformers.models.modeling_deepseek_v3.MoEGate
replace:
class: ktransformers.operators.gate.KMoEGate
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
# GPU 1: layers 8-15
- match:
name: "^model\\.layers\\.(8|9|1[0-5])\\.mlp\\.gate$"
class: ktransformers.models.modeling_deepseek_v3.MoEGate
replace:
class: ktransformers.operators.gate.KMoEGate
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
# GPU 2: layers 16-23
- match:
name: "^model\\.layers\\.(1[6-9]|2[0-3])\\.mlp\\.gate$"
class: ktransformers.models.modeling_deepseek_v3.MoEGate
replace:
class: ktransformers.operators.gate.KMoEGate
kwargs:
generate_device: "cuda:2"
prefill_device: "cuda:2"
# GPU 3: layers 24-31
- match:
name: "^model\\.layers\\.(2[4-9]|3[0-1])\\.mlp\\.gate$"
class: ktransformers.models.modeling_deepseek_v3.MoEGate
replace:
class: ktransformers.operators.gate.KMoEGate
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"
# GPU 4: layers 32-39
- match:
name: "^model\\.layers\\.(3[2-9])\\.mlp\\.gate$"
class: ktransformers.models.modeling_deepseek_v3.MoEGate
replace:
class: ktransformers.operators.gate.KMoEGate
kwargs:
generate_device: "cuda:4"
prefill_device: "cuda:4"
# GPU 5: layers 40-47
- match:
name: "^model\\.layers\\.(4[0-7])\\.mlp\\.gate$"
class: ktransformers.models.modeling_deepseek_v3.MoEGate
replace:
class: ktransformers.operators.gate.KMoEGate
kwargs:
generate_device: "cuda:5"
prefill_device: "cuda:5"
# GPU 6: layers 48-55
- match:
name: "^model\\.layers\\.(4[8-9]|5[0-5])\\.mlp\\.gate$"
class: ktransformers.models.modeling_deepseek_v3.MoEGate
replace:
class: ktransformers.operators.gate.KMoEGate
kwargs:
generate_device: "cuda:6"
prefill_device: "cuda:6"
# GPU 7: layers 56-60
- match:
name: "^model\\.layers\\.(5[6-9]|60)\\.mlp\\.gate$"
class: ktransformers.models.modeling_deepseek_v3.MoEGate
replace:
class: ktransformers.operators.gate.KMoEGate
kwargs:
generate_device: "cuda:7"
prefill_device: "cuda:7"
# === MLP Experts Replacement ===
# Replace with Marlin experts. Uncomment and adjust the layer ranges as needed.
# Each layer of Marlin experts takes about 6GB of GPU memory.
# !!!Do remember to disable CUDA Graph if you are using Marlin experts.!!!
# !!!Loading Marlin experts will take significant time.!!!
# GPU 0: layers 0-7
# - match:
# name: "^model\\.layers\\.([0-7])\\.mlp\\.experts$" # inject experts in layers 0-7 as Marlin experts
# replace:
# class: ktransformers.operators.experts.KTransformersExperts
# kwargs:
# generate_device: "cuda:0"
# generate_op: "KExpertsMarlin"
# recursive: False
# # GPU 1: layers 8-15
# - match:
# name: "^model\\.layers\\.([8-9]|1[0-5])\\.mlp\\.experts$" # inject experts in layers 8-15 as Marlin experts
# replace:
# class: ktransformers.operators.experts.KTransformersExperts
# kwargs:
# generate_device: "cuda:1"
# generate_op: "KExpertsMarlin"
# recursive: False
# # GPU 2: layers 16-23
# - match:
# name: "^model\\.layers\\.(1[6-9]|2[0-3])\\.mlp\\.experts$" # inject experts in layers 16-23 as Marlin experts
# replace:
# class: ktransformers.operators.experts.KTransformersExperts
# kwargs:
# generate_device: "cuda:2"
# generate_op: "KExpertsMarlin"
# recursive: False
# # GPU 3: layers 24-31
# - match:
# name: "^model\\.layers\\.(2[4-9]|3[0-1])\\.mlp\\.experts$" # inject experts in layers 24-31 as Marlin experts
# replace:
# class: ktransformers.operators.experts.KTransformersExperts
# kwargs:
# generate_device: "cuda:3"
# generate_op: "KExpertsMarlin"
# recursive: False
# # GPU 4: layers 32-39
# - match:
# name: "^model\\.layers\\.(3[2-9])\\.mlp\\.experts$" # inject experts in layers 32-39 as Marlin experts
# replace:
# class: ktransformers.operators.experts.KTransformersExperts
# kwargs:
# generate_device: "cuda:4"
# generate_op: "KExpertsMarlin"
# recursive: False
# # GPU 5: layers 40-47
# - match:
# name: "^model\\.layers\\.(4[0-7])\\.mlp\\.experts$" # inject experts in layers 40-47 as Marlin experts
# replace:
# class: ktransformers.operators.experts.KTransformersExperts
# kwargs:
# generate_device: "cuda:5"
# generate_op: "KExpertsMarlin"
# recursive: False
# # GPU 6: layers 48-55
# - match:
# name: "^model\\.layers\\.(4[8-9]|5[0-5])\\.mlp\\.experts$" # inject experts in layers 48-55 as Marlin experts
# replace:
# class: ktransformers.operators.experts.KTransformersExperts
# kwargs:
# generate_device: "cuda:6"
# generate_op: "KExpertsMarlin"
# recursive: False
# # GPU 7: layers 56-60
# - match:
# name: "^model\\.layers\\.(5[6-9]|60)\\.mlp\\.experts$" # inject experts in layers 56-60 as Marlin experts
# replace:
# class: ktransformers.operators.experts.KTransformersExperts
# kwargs:
# generate_device: "cuda:7"
# generate_op: "KExpertsMarlin"
# recursive: False
# === MLP Experts Replacement ===
# GPU 0: layers 0-7
- match:
name: "^model\\.layers\\.([0-7])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersExperts
kwargs:
prefill_device: "cuda:0"
prefill_op: "KExpertsTorch"
generate_device: "cpu"
generate_op: "KExpertsCPU"
out_device: "cuda:0"
recursive: False
# GPU 1: layers 8-15
- match:
name: "^model\\.layers\\.(8|9|1[0-5])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersExperts
kwargs:
prefill_device: "cuda:1"
prefill_op: "KExpertsTorch"
generate_device: "cpu"
generate_op: "KExpertsCPU"
out_device: "cuda:1"
recursive: False
# GPU 2: layers 16-23
- match:
name: "^model\\.layers\\.(1[6-9]|2[0-3])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersExperts
kwargs:
prefill_device: "cuda:2"
prefill_op: "KExpertsTorch"
generate_device: "cpu"
generate_op: "KExpertsCPU"
out_device: "cuda:2"
recursive: False
# GPU 3: layers 24-31
- match:
name: "^model\\.layers\\.(2[4-9]|3[0-1])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersExperts
kwargs:
prefill_device: "cuda:3"
prefill_op: "KExpertsTorch"
generate_device: "cpu"
generate_op: "KExpertsCPU"
out_device: "cuda:3"
recursive: False
# GPU 4: layers 32-39
- match:
name: "^model\\.layers\\.(3[2-9])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersExperts
kwargs:
prefill_device: "cuda:4"
prefill_op: "KExpertsTorch"
generate_device: "cpu"
generate_op: "KExpertsCPU"
out_device: "cuda:4"
recursive: False
# GPU 5: layers 40-47
- match:
name: "^model\\.layers\\.(4[0-7])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersExperts
kwargs:
prefill_device: "cuda:5"
prefill_op: "KExpertsTorch"
generate_device: "cpu"
generate_op: "KExpertsCPU"
out_device: "cuda:5"
recursive: False
# GPU 6: layers 48-55
- match:
name: "^model\\.layers\\.(4[8-9]|5[0-5])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersExperts
kwargs:
prefill_device: "cuda:6"
prefill_op: "KExpertsTorch"
generate_device: "cpu"
generate_op: "KExpertsCPU"
out_device: "cuda:6"
recursive: False
# GPU 7: layers 56-60
- match:
name: "^model\\.layers\\.(5[6-9]|60)\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersExperts
kwargs:
prefill_device: "cuda:7"
prefill_op: "KExpertsTorch"
generate_device: "cpu"
generate_op: "KExpertsCPU"
out_device: "cuda:7"
recursive: False
# === Self-Attention Replacement ===
# GPU 0: layers 0-7
- match:
name: "^model\\.layers\\.([0-7])\\.self_attn$"
replace:
class: ktransformers.operators.attention.KDeepseekV2Attention
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
# GPU 1: layers 8-15
- match:
name: "^model\\.layers\\.(8|9|1[0-5])\\.self_attn$"
replace:
class: ktransformers.operators.attention.KDeepseekV2Attention
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
# GPU 2: layers 16-23
- match:
name: "^model\\.layers\\.(1[6-9]|2[0-3])\\.self_attn$"
replace:
class: ktransformers.operators.attention.KDeepseekV2Attention
kwargs:
generate_device: "cuda:2"
prefill_device: "cuda:2"
# GPU 3: layers 24-31
- match:
name: "^model\\.layers\\.(2[4-9]|3[0-1])\\.self_attn$"
replace:
class: ktransformers.operators.attention.KDeepseekV2Attention
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"
# GPU 4: layers 32-39
- match:
name: "^model\\.layers\\.(3[2-9])\\.self_attn$"
replace:
class: ktransformers.operators.attention.KDeepseekV2Attention
kwargs:
generate_device: "cuda:4"
prefill_device: "cuda:4"
# GPU 5: layers 40-47
- match:
name: "^model\\.layers\\.(4[0-7])\\.self_attn$"
replace:
class: ktransformers.operators.attention.KDeepseekV2Attention
kwargs:
generate_device: "cuda:5"
prefill_device: "cuda:5"
# GPU 6: layers 48-55
- match:
name: "^model\\.layers\\.(4[8-9]|5[0-5])\\.self_attn$"
replace:
class: ktransformers.operators.attention.KDeepseekV2Attention
kwargs:
generate_device: "cuda:6"
prefill_device: "cuda:6"
# GPU 7: layers 56-60
- match:
name: "^model\\.layers\\.(5[6-9]|60)\\.self_attn$"
replace:
class: ktransformers.operators.attention.KDeepseekV2Attention
kwargs:
generate_device: "cuda:7"
prefill_device: "cuda:7"
# === Overall Model Replacement with Transfer Map ===
- match:
name: "^model$"
replace:
class: "ktransformers.operators.models.KDeepseekV2Model"
kwargs:
per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
transfer_map:
8: "cuda:1"
16: "cuda:2"
24: "cuda:3"
32: "cuda:4"
40: "cuda:5"
48: "cuda:6"
56: "cuda:7"
# === Default Catch-All for Other Modules ===
# GPU 0: layers 0-7
- match:
name: "^model\\.layers\\.([0-7])\\."
replace:
class: "default"
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
# GPU 1: layers 8-15
- match:
name: "^model\\.layers\\.(8|9|1[0-5])\\."
replace:
class: "default"
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
# GPU 2: layers 16-23
- match:
name: "^model\\.layers\\.(1[6-9]|2[0-3])\\."
replace:
class: "default"
kwargs:
generate_device: "cuda:2"
prefill_device: "cuda:2"
# GPU 3: layers 24-31
- match:
name: "^model\\.layers\\.(2[4-9]|3[0-1])\\."
replace:
class: "default"
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"
# GPU 4: layers 32-39
- match:
name: "^model\\.layers\\.(3[2-9])\\."
replace:
class: "default"
kwargs:
generate_device: "cuda:4"
prefill_device: "cuda:4"
# GPU 5: layers 40-47
- match:
name: "^model\\.layers\\.(4[0-7])\\."
replace:
class: "default"
kwargs:
generate_device: "cuda:5"
prefill_device: "cuda:5"
# GPU 6: layers 48-55
- match:
name: "^model\\.layers\\.(4[8-9]|5[0-5])\\."
replace:
class: "default"
kwargs:
generate_device: "cuda:6"
prefill_device: "cuda:6"
# GPU 7: layers 56-60
- match:
name: "^model\\.layers\\.(5[6-9]|60)\\."
replace:
class: "default"
kwargs:
generate_device: "cuda:7"
prefill_device: "cuda:7"
- match:
name: "^lm_head"
class: torch.nn.Linear
replace:
class: ktransformers.operators.linear.KTransformersLinear
kwargs:
generate_device: "cuda:7"
prefill_device: "cuda:7"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
# Ensure the final modules (model.norm) are placed on GPU 7
- match:
name: "(^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.)|(^model\\.norm)"
replace:
class: "default"
kwargs:
generate_device: "cuda:7"
prefill_device: "cuda:7"
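A quick way to sanity-check a placement file like the one above is to replay its name regexes in plain Python. The sketch below is illustrative only (the helper names are hypothetical, and it assumes, as the rule ordering suggests, that the first matching rule wins); it mirrors the layer-to-GPU split and transfer_map of this config:

import re

# (pattern, device) pairs mirroring the per-layer rules in this config
layer_rules = [
    (r"^model\.layers\.([0-7])\.", "cuda:0"),
    (r"^model\.layers\.(8|9|1[0-5])\.", "cuda:1"),
    (r"^model\.layers\.(1[6-9]|2[0-3])\.", "cuda:2"),
    (r"^model\.layers\.(2[4-9]|3[0-1])\.", "cuda:3"),
    (r"^model\.layers\.(3[2-9])\.", "cuda:4"),
    (r"^model\.layers\.(4[0-7])\.", "cuda:5"),
    (r"^model\.layers\.(4[8-9]|5[0-5])\.", "cuda:6"),
    (r"^model\.layers\.(5[6-9]|60)\.", "cuda:7"),
]

def device_for(module_name: str) -> str:
    """Return the device the per-layer rules above would assign to a module name."""
    for pattern, device in layer_rules:
        if re.match(pattern, module_name):
            return device
    return "cuda:7"  # lm_head and model.norm fall through to the last GPU

# transfer_map: before the given layer index runs, hidden states move to the mapped device
transfer_map = {8: "cuda:1", 16: "cuda:2", 24: "cuda:3", 32: "cuda:4",
                40: "cuda:5", 48: "cuda:6", 56: "cuda:7"}

print(device_for("model.layers.47.self_attn"))  # -> cuda:5
print(transfer_map[48])                         # -> cuda:6 (handoff before layer 48)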

View file

@ -0,0 +1,157 @@
- match:
name: "^model.embed_tokens"
replace:
class: "default"
kwargs:
generate_device: "cpu"
prefill_device: "cpu"
- match:
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "^model\\.layers\\.([3456][0-9])\\."
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.(?!self_attn\\.kv_b_proj).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
generate_op: "KLinearFP8"
prefill_op: "KLinearTorch"
- match:
name: "^model\\.layers\\.([3456][0-9])\\.(?!self_attn\\.kv_b_proj).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
generate_op: "KLinearFP8"
prefill_op: "KLinearTorch"
- match:
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp$"
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
replace:
class: ktransformers.operators.experts.KDeepseekV3MoE # mlp module with custom forward function
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "^model\\.layers\\.([3456][0-9])\\.mlp$"
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
replace:
class: ktransformers.operators.experts.KDeepseekV3MoE # mlp module with custom forward function
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.gate$"
class: ktransformers.models.modeling_deepseek_v3.MoEGate
replace:
class: ktransformers.operators.gate.KMoEGate
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.gate$"
class: ktransformers.models.modeling_deepseek_v3.MoEGate
replace:
class: ktransformers.operators.gate.KMoEGate # gate module with custom forward function
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert parallelism
kwargs:
prefill_device: "cuda:0"
prefill_op: "KExpertsTorch"
generate_device: "cpu"
generate_op: "KExpertsCPU"
out_device: "cuda:0"
recursive: False # don't recursively inject submodules of this module
- match:
name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert parallelism
kwargs:
prefill_device: "cuda:1"
prefill_op: "KExpertsTorch"
generate_device: "cpu"
generate_op: "KExpertsCPU"
out_device: "cuda:1"
recursive: False # don't recursively inject submodules of this module
- match:
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.self_attn$"
replace:
class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
absorb_for_prefill: False # change this to True to enable long context (prefill may be slower).
- match:
name: "^model\\.layers\\.([3456][0-9])\\.self_attn$"
replace:
class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
absorb_for_prefill: False # change this to True to enable long context (prefill may be slower).
- match:
name: "^model$"
replace:
class: "ktransformers.operators.models.KDeepseekV2Model"
kwargs:
per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
transfer_map:
30: "cuda:1"
- match:
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
replace:
class: "default"
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "^lm_head"
class: torch.nn.Linear
replace:
class: "default"
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "(^model\\.layers\\.([3456][0-9])\\.)|(model.norm)"
replace:
class: "default"
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"

View file

@ -153,9 +153,20 @@
prefill_device: "cuda:0"
- match:
name: "(^model\\.layers\\.([3456][0-9])\\.)|(model.norm)|(lm_head)"
name: "^lm_head"
class: torch.nn.Linear
replace:
class: "default"
class: ktransformers.operators.linear.KTransformersLinear
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
- match:
name: "(^model\\.layers\\.([3456][0-9])\\.)|(model.norm)"
replace:
class: "default"
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"

View file

@ -135,7 +135,18 @@
prefill_device: "cuda:0"
- match:
name: "(^model\\.layers\\.([3456][0-9])\\.)|(model.norm)|(lm_head)"
name: "^lm_head"
class: torch.nn.Linear
replace:
class: ktransformers.operators.linear.KTransformersLinear
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
- match:
name: "(^model\\.layers\\.([3456][0-9])\\.)|(model.norm)"
replace:
class: "default"
kwargs:

View file

@ -5,6 +5,18 @@
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- match:
name: "^lm_head$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
- match:
name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
@ -48,6 +60,7 @@
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
absorb_for_prefill: False # change this to True to enable long context (prefill may be slower).
- match:
name: "^model$"
replace:

View file

@ -15,6 +15,16 @@
prefill_device: "cuda"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
- match:
name: "^lm_head"
class: torch.nn.Linear
replace:
class: ktransformers.operators.linear.KTransformersLinear
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
- match:
name: "^model\\.layers\\..*\\.block_sparse_moe$"
class: ktransformers.models.modeling_mixtral.MixtralSparseMoeBlock

View file

@ -0,0 +1,86 @@
- match:
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
replace:
class: ktransformers.operators.RoPE.RotaryEmbeddingV3
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- match:
name: "^lm_head$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
- match:
name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
- match:
name: "^model\\.layers\\..*\\.mlp$"
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
replace:
class: ktransformers.operators.experts.KDeepseekV3MoE # mlp module with custom forward function
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- match:
class: ktransformers.models.modeling_deepseek_v3.MoEGate
replace:
class: ktransformers.operators.gate.KMoEGate
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "^model\\.layers\\..*\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert parallelism
kwargs:
prefill_device: "cuda"
prefill_op: "KExpertsTorch"
generate_device: "cpu"
generate_op: "KExpertsCPU"
out_device: "cuda"
recursive: False # don't recursively inject submodules of this module
# To use more VRAM, switch the experts to Marlin and disable CUDA Graph (disabling CUDA Graph may reduce performance)
#- match:
# name: "^model\\.layers\\..*\\.mlp\\.experts$"
# replace:
# class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert parallelism
# kwargs:
# prefill_device: "cuda"
# prefill_op: "KExpertsTorch"
# generate_device: "cuda"
# generate_op: "KExpertsMarlin"
# recursive: False # don't recursively inject submodules of this module
- match:
name: "^model\\.layers\\..*\\.self_attn$"
replace:
class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- match:
name: "^model$"
replace:
class: "ktransformers.operators.models.KDeepseekV2Model"
kwargs:
per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
- match:
name: "^model.embed_tokens"
replace:
class: "default"
kwargs:
generate_device: "cpu"
prefill_device: "cpu"

View file

@ -77,9 +77,19 @@
kwargs:
generate_device: "cpu"
prefill_device: "cpu"
- match:
name: "^lm_head"
class: torch.nn.Linear
replace:
class: ktransformers.operators.linear.KTransformersLinear
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
- match:
name: "(^model.norm)|(^lm_head)"
name: "(^model.norm)"
replace:
class: "default"
kwargs:

View file

@ -15,6 +15,16 @@
prefill_device: "cuda"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
- match:
name: "^lm_head"
class: torch.nn.Linear
replace:
class: ktransformers.operators.linear.KTransformersLinear
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
- match:
name: "^model\\.layers\\..*\\.mlp$"
class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock

View file

@ -12,8 +12,10 @@ from ktransformers.server.config.config import Config
from ktransformers.server.utils.create_interface import get_interface
from ktransformers.server.schemas.assistants.streaming import check_link_response
from ktransformers.server.backend.base import BackendInterfaceBase
router = APIRouter(prefix='/api')
from ktransformers.server.schemas.endpoints.chat import RawUsage
router = APIRouter(prefix='/api')
# https://github.com/ollama/ollama/blob/main/docs/api.md#generate-a-completion
class OllamaGenerateCompletionRequest(BaseModel):
@ -40,61 +42,129 @@ class OllamaGenerateCompletionRequest(BaseModel):
keep_alive: Optional[str] = Field(
"5m", description="Controls how long the model will stay loaded into memory following the request.")
class OllamaGenerationStreamResponse(BaseModel):
model: str
created_at: str
response: str
done: bool = Field(...)
class OllamaGenerationResponse(BaseModel):
pass
@router.post("/generate", tags=['ollama'])
async def generate(request: Request, input: OllamaGenerateCompletionRequest):
id = str(uuid4())
interface: BackendInterfaceBase = get_interface()
print(f'COMPLETION INPUT:----\n{input.prompt}\n----')
config = Config()
if input.stream:
async def inner():
async for token in interface.inference(input.prompt,id):
d = OllamaGenerationStreamResponse(model=config.model_name,created_at=str(datetime.now()),response=token,done=False)
async for res in interface.inference(input.prompt, id):
if isinstance(res, RawUsage):
raw_usage = res
else:
token, finish_reason = res
d = OllamaGenerationStreamResponse(
model=config.model_name,
created_at=str(datetime.now()),
response=token,
done=False
)
yield d.model_dump_json() + '\n'
# d = {'model':config.model_name,'created_at':"", 'response':token,'done':False}
# yield f"{json.dumps(d)}\n"
# d = {'model':config.model_name,'created_at':"", 'response':'','done':True}
# yield f"{json.dumps(d)}\n"
d = OllamaGenerationStreamResponse(model=config.model_name,created_at=str(datetime.now()),response='',done=True)
d = OllamaGenerationStreamResponse(
model=config.model_name,
created_at=str(datetime.now()),
response='',
done=True
)
yield d.model_dump_json() + '\n'
return check_link_response(request, inner())
else:
raise NotImplementedError
# https://github.com/ollama/ollama/blob/main/docs/api.md#generate-a-chat-completion
class OllamaChatCompletionMessage(BaseModel):
role: str
content: str
class OllamaChatCompletionRequest(BaseModel):
pass
model: str = Field(..., description="The model name, which is required.")
messages: List[OllamaChatCompletionMessage] = Field(
..., description="A list of messages to generate a response for.")
stream: bool = Field(True, description="If true, the response will be streamed.")
class OllamaChatCompletionStreamResponse(BaseModel):
pass
model: str
created_at: str
message: dict
done: bool = Field(...)
total_duration: Optional[int] = Field(None, description="Total time spent in nanoseconds")
load_duration: Optional[int] = Field(None, description="Time spent loading model in nanoseconds")
prompt_eval_count: Optional[int] = Field(None, description="Number of tokens in prompt")
prompt_eval_duration: Optional[int] = Field(None, description="Time spent evaluating prompt in nanoseconds")
eval_count: Optional[int] = Field(None, description="Number of tokens generated")
eval_duration: Optional[int] = Field(None, description="Time spent generating response in nanoseconds")
class OllamaChatCompletionResponse(BaseModel):
pass
@router.post("/chat", tags=['ollama'])
async def chat(request: Request, input: OllamaChatCompletionRequest):
raise NotImplementedError
id = str(uuid4())
interface: BackendInterfaceBase = get_interface()
config = Config()
# Convert the messages into a single prompt string
prompt = ""
for msg in input.messages:
prompt += f"{msg.role}: {msg.content}\n"
prompt += "assistant:"
if input.stream:
async def inner():
start_time = time() # record the start time in seconds
eval_count = 0 # number of tokens generated so far
tokens = []
async for res in interface.inference(prompt, id):
if isinstance(res, RawUsage):
raw_usage = res
else:
token, finish_reason = res
eval_count += 1 # count generated tokens for the final statistics chunk
d = OllamaChatCompletionStreamResponse(
model=config.model_name,
created_at=str(datetime.now()),
message={"role": "assistant", "content": token},
done=False
)
yield d.model_dump_json() + '\n'
# compute performance statistics
end_time = time()
total_duration = int((end_time - start_time) * 1_000_000_000) # convert to nanoseconds
prompt_eval_count = len(prompt.split()) # rough estimate of the prompt token count
eval_duration = total_duration # simplification: attribute all time to generation
prompt_eval_duration = 0 # assume no separate prompt-evaluation time
load_duration = 0 # assume the load time is unknown
d = OllamaChatCompletionStreamResponse(
model=config.model_name,
created_at=str(datetime.now()),
message={},
done=True,
total_duration=total_duration,
load_duration=load_duration,
prompt_eval_count=prompt_eval_count,
prompt_eval_duration=prompt_eval_duration,
eval_count=eval_count,
eval_duration=eval_duration
)
yield d.model_dump_json() + '\n'
return check_link_response(request, inner())
else:
raise NotImplementedError("Non-streaming chat is not implemented.")
# https://github.com/ollama/ollama/blob/main/docs/api.md#list-local-models
class OllamaModel(BaseModel):
@ -103,7 +173,6 @@ class OllamaModel(BaseModel):
size: int
# TODO: fill the rest correctly
# mock ollama
@router.get("/tags", tags=['ollama'])
async def tags():
@ -138,8 +207,6 @@ class OllamaShowResponse(BaseModel):
class Config:
protected_namespaces = ()
@router.post("/show", tags=['ollama'])
async def show(request: Request, input: OllamaShowRequest):
config = Config()
@ -152,9 +219,7 @@ async def show(request: Request, input: OllamaShowRequest):
parent_model=" ",
format="gguf",
family=" ",
families = [
" "
],
families=[" "],
parameter_size=" ",
quantization_level=" "
),
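For reference, a minimal streaming client for the /api/chat endpoint added above. This is an illustrative sketch, not part of the server: it assumes the server listens on http://localhost:9016 (the default port in Config) and that the model field may carry any placeholder name.

import json
import requests

payload = {
    "model": "ktranformers-model",  # placeholder; the server replies with its configured model name
    "messages": [{"role": "user", "content": "Hello!"}],
    "stream": True,
}
with requests.post("http://localhost:9016/api/chat", json=payload, stream=True) as r:
    for line in r.iter_lines():
        if not line:
            continue
        chunk = json.loads(line)
        if chunk["done"]:
            print(f"\n[eval_count={chunk['eval_count']}]")
        else:
            print(chunk["message"]["content"], end="", flush=True)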

View file

@ -5,18 +5,21 @@ from fastapi import APIRouter
from fastapi.requests import Request
from ktransformers.server.utils.create_interface import get_interface
from ktransformers.server.schemas.assistants.streaming import chat_stream_response
from ktransformers.server.schemas.endpoints.chat import ChatCompletionCreate,ChatCompletionChunk,ChatCompletionObject
from ktransformers.server.schemas.endpoints.chat import ChatCompletionCreate
from ktransformers.server.schemas.endpoints.chat import RawUsage
from ktransformers.server.backend.base import BackendInterfaceBase
from ktransformers.server.config.config import Config
from ktransformers.server.schemas.endpoints.chat import ChatCompletionChunk
from openai.types.chat import ChatCompletion
from openai.types.completion_usage import CompletionUsage
router = APIRouter()
models = [
{"id": "0", "name": "ktranformers-model"},
]
@router.get('/models', tags=['openai'])
async def list_models():
return models
return {"data": [{"id": Config().model_name, "name": Config().model_name}], "object": "list"}
@router.post('/chat/completions', tags=['openai'])
@ -28,15 +31,80 @@ async def chat_completion(request:Request,create:ChatCompletionCreate):
input_message = [json.loads(m.model_dump_json()) for m in create.messages]
if Config().api_key != '':
assert request.headers.get('Authorization', '').split()[-1] == Config().api_key
if create.stream:
from openai.types.chat.chat_completion_chunk import Choice, ChoiceDelta
async def inner():
chunk = ChatCompletionChunk(id=id,object='chat.completion.chunk',created=int(time()))
async for token in interface.inference(input_message,id):
chunk.set_token(token)
chunk = ChatCompletionChunk(
id = id,
choices = [],
object = 'chat.completion.chunk',
created = int(time()),
model = Config().model_name,
)
async for res in interface.inference(input_message,id, create.temperature, create.top_p):
if isinstance(res, RawUsage):
# at the end of inference, interface.inference() will return the usage of inference
raw_usage = res
chunk.choices = []
chunk.usage = CompletionUsage(
prompt_tokens = raw_usage.prefill_count,
completion_tokens = raw_usage.decode_count,
total_tokens = raw_usage.prefill_count + raw_usage.decode_count
)
yield chunk
else:
token, finish_reason = res
choice = Choice(
index = 0,
delta = ChoiceDelta(content=token, role=None, tool_calls=None),
finish_reason = finish_reason,
logprobs = None,
)
chunk.choices = [choice]
yield chunk
return chat_stream_response(request, inner())
else:
comp = ChatCompletionObject(id=id,object='chat.completion.chunk',created=int(time()))
async for token in interface.inference(input_message,id):
comp.append_token(token)
return comp
from openai.types.chat.chat_completion import Choice
from openai.types.chat.chat_completion_message import ChatCompletionMessage
content = ""
finish_reason = None
async for res in interface.inference(input_message,id,create.temperature,create.top_p):
if isinstance(res, RawUsage):
raw_usage = res
usage = CompletionUsage(
prompt_tokens = raw_usage.prefill_count,
completion_tokens = raw_usage.decode_count,
total_tokens = raw_usage.prefill_count + raw_usage.decode_count
)
else:
token, finish_reason = res
content = content + token
finish_reason = finish_reason
choice = Choice(
index = 0,
finish_reason = finish_reason,
message = ChatCompletionMessage(
content=content,
role="assistant"
))
chat_completion = ChatCompletion(
id = id,
choices = [choice],
created = int(time()),
model = Config().model_name,
object = 'chat.completion',
usage = usage
)
return chat_completion
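The streaming branch now emits OpenAI-style chunks plus a final usage chunk built from RawUsage, so the stock openai client can consume it. A hedged usage sketch, assuming the OpenAI-compatible routes are served under /v1 on localhost:9016 (adjust base_url, api_key, and model to your deployment):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:9016/v1", api_key="placeholder")

stream = client.chat.completions.create(
    model="ktranformers-model",
    messages=[{"role": "user", "content": "Hello!"}],
    stream=True,
)
for chunk in stream:
    if chunk.choices:  # token chunks
        print(chunk.choices[0].delta.content or "", end="", flush=True)
    elif chunk.usage:  # the final chunk carries the token counts
        print(f"\n[prompt={chunk.usage.prompt_tokens}, completion={chunk.usage.completion_tokens}]")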

View file

@ -6,6 +6,7 @@ from fastapi.requests import Request
from ktransformers.server.utils.create_interface import get_interface
from ktransformers.server.schemas.assistants.streaming import stream_response
from ktransformers.server.schemas.legacy.completions import CompletionCreate,CompletionObject
from ktransformers.server.schemas.endpoints.chat import RawUsage
router = APIRouter()
@ -17,10 +18,13 @@ async def create_completion(request:Request,create:CompletionCreate):
print(f'COMPLETION INPUT:----\n{create.prompt}\n----')
if create.stream:
async def inner():
async for token in interface.inference(create.prompt,id):
async for res in interface.inference(create.prompt,id,create.temperature,create.top_p):
if isinstance(res, RawUsage):
raw_usage = res
else:
token, finish_reason = res
d = {'choices':[{'delta':{'content':token}}]}
yield f"data:{json.dumps(d)}\n\n"
d = {'choices':[{'delta':{'content':''},'finish_reason':''}]}
@ -28,6 +32,10 @@ async def create_completion(request:Request,create:CompletionCreate):
return stream_response(request,inner())
else:
comp = CompletionObject(id=id,object='text_completion',created=int(time()))
async for token in interface.inference(create.prompt,id):
async for res in interface.inference(create.prompt,id,create.temperature,create.top_p):
if isinstance(res, RawUsage):
raw_usage = res
else:
token, finish_reason = res
comp.append_token(token)
return comp

View file

@ -10,6 +10,7 @@ class ArgumentParser:
parser = argparse.ArgumentParser(prog="kvcache.ai", description="Ktransformers")
parser.add_argument("--host", type=str, default=self.cfg.server_ip)
parser.add_argument("--port", type=int, default=self.cfg.server_port)
parser.add_argument("--api_key", type=str, default=self.cfg.api_key)
parser.add_argument("--ssl_keyfile", type=str)
parser.add_argument("--ssl_certfile", type=str)
parser.add_argument("--web", type=bool, default=self.cfg.mount_web)
@ -23,13 +24,13 @@ class ArgumentParser:
parser.add_argument("--optimize_config_path", default=self.cfg.optimize_config_path, type=str, required=False)
parser.add_argument("--cpu_infer", type=int, default=self.cfg.cpu_infer)
parser.add_argument("--type", type=str, default=self.cfg.backend_type)
parser.add_argument("--chunk_prefill_size", type=int, default=8192)
# model configs
# parser.add_argument("--model_cache_lens", type=int, default=self.cfg.cache_lens) # int?
parser.add_argument("--paged", type=bool, default=self.cfg.paged)
parser.add_argument("--total_context", type=int, default=self.cfg.total_context)
parser.add_argument("--max_batch_size", type=int, default=self.cfg.max_batch_size)
parser.add_argument("--max_chunk_size", type=int, default=self.cfg.max_chunk_size)
parser.add_argument("--max_new_tokens", type=int, default=self.cfg.max_new_tokens)
parser.add_argument("--json_mode", type=bool, default=self.cfg.json_mode)
parser.add_argument("--healing", type=bool, default=self.cfg.healing)
@ -90,7 +91,8 @@ class ArgumentParser:
# user config
parser.add_argument("--user_secret_key", type=str, default=self.cfg.user_secret_key)
parser.add_argument("--user_algorithm", type=str, default=self.cfg.user_algorithm)
parser.add_argument("--force_think", type=bool, default=self.cfg.user_force_think)
parser.add_argument("--force_think", action=argparse.BooleanOptionalAction, type=bool, default=self.cfg.user_force_think)
parser.add_argument("--use_cuda_graph", action=argparse.BooleanOptionalAction, type=bool, default=self.cfg.use_cuda_graph)
# web config
parser.add_argument("--web_cross_domain", type=bool, default=self.cfg.web_cross_domain)

View file

@ -23,7 +23,7 @@ class ConfigArgs(BaseModel):
max_batch_size: int = Field(
None, description="Max number of batches to run at once, assuming the sequences will fit within total_context"
)
max_chunk_size: int = Field(
chunk_prefill_size: int = Field(
None,
description=(
"Max chunk size. Determines the size of prefill operations. Can be reduced to reduce pauses whenever a new"

View file

@ -15,6 +15,7 @@ from ktransformers.server.schemas.assistants.assistants import AssistantObject
from ktransformers.server.schemas.assistants.messages import MessageCreate, MessageObject, Role
from ktransformers.server.schemas.assistants.runs import RunObject
from ktransformers.server.schemas.assistants.threads import ThreadObject
from ktransformers.server.schemas.endpoints.chat import RawUsage
from ktransformers.server.schemas.base import ObjectID, Order
from ktransformers.server.utils.multi_timer import Profiler
@ -142,7 +143,11 @@ class ThreadContext:
yield reply_message.stream_response_with_event(MessageObject.Status.in_progress)
yield self.run.stream_response_with_event(RunObject.Status.in_progress)
async for token in self.interface.inference(local_messages,self.thread.id):
async for res in self.interface.inference(local_messages,self.thread.id):
if isinstance(res, RawUsage):
raw_usage = res
else:
token, finish_reason = res
if self.run.status == RunObject.Status.cancelling:
logger.warn(f'Run {self.run.id} cancelling')
break

View file

@ -1,4 +1,5 @@
import torch
import asyncio
from transformers import AutoTokenizer, AutoConfig, GenerationConfig
from ktransformers.server.backend.interfaces.transformers import (
TransformersInterface,
@ -13,7 +14,11 @@ from ktransformers.models.custom_cache import StaticCache
from ktransformers.util.cuda_graph_runner import CUDAGraphRunner
from ktransformers.local_chat import custom_models, default_optimize_rules
from ktransformers.util.utils import get_device
from typing import Optional
from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled, MLAWrapperSingleton
from ktransformers.server.schemas.endpoints.chat import RawUsage
warm_uped = False
class KTransformersThreadContext(TransformersThreadContext):
pass
@ -22,19 +27,29 @@ class KTransformersThreadContext(TransformersThreadContext):
class KTransformersInterface(TransformersInterface):
def __init__(self, args: ConfigArgs = default_args):
self.args = args
torch.set_default_dtype(torch.bfloat16)
torch.set_grad_enabled(False)
self.tokenizer = AutoTokenizer.from_pretrained(args.model_dir, device=args.device, trust_remote_code=args.trust_remote_code)
config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=args.trust_remote_code)
try:
generation_config = GenerationConfig.from_pretrained(args.model_dir)
except:
generation_config = GenerationConfig(
max_length=args.max_new_tokens,
temperature=args.temperature,
top_p=args.top_p,
do_sample=True
)
torch.set_default_dtype(config.torch_dtype)
if config.architectures[0] == "Qwen2MoeForCausalLM":
config._attn_implementation = "flash_attention_2"
with torch.device("meta"):
self.model = custom_models[config.architectures[0]](config)
if default_args.optimize_config_path is None:
optimize_rule_path = default_optimize_rules[config.architectures[0]]
optimize_config_path = default_optimize_rules[config.architectures[0]]
else:
optimize_rule_path = args.optimize_config_path
optimize_config_path = args.optimize_config_path
# print(optimize_config)
@ -44,8 +59,8 @@ class KTransformersInterface(TransformersInterface):
"please input the path of your gguf file(gguf file in the dir containing input gguf file must all"
" belong to current model):"
)
optimize_and_load_gguf(self.model, optimize_rule_path, gguf_path, config)
optimize_and_load_gguf(self.model, optimize_config_path, gguf_path, config)
self.model.generation_config = generation_config
self.device_map = self.model.gguf_loader.tensor_device_map
# logger.info(f"{args.model_name} loaded from {args.model_dir} to {self.device_map}")
self.cache = StaticCache(
@ -56,25 +71,21 @@ class KTransformersInterface(TransformersInterface):
dtype=self.model.dtype,
)
# logger.info(f"StaticCache (length={args.cache_lens}), batch size:{args.batch_size}")
try:
self.model.generation_config = GenerationConfig.from_pretrained(args.model_dir)
except:
gen_config = GenerationConfig(
max_length=128,
temperature=0.7,
top_p=0.9,
do_sample=True
)
self.model.generation_config = gen_config
if self.model.generation_config.pad_token_id is None:
self.model.generation_config.pad_token_id = self.model.generation_config.eos_token_id
self.streamer = TextStreamer(self.tokenizer)
self._infer_lock = asyncio.Lock()
def decode_one_tokens(self):
global warm_uped
device_map = self.model.gguf_loader.tensor_device_map
torch_device = get_device("blk.0.self_attn", device_map)
torch_device = "cuda:0" if torch_device == "cuda" else torch_device
if self.args.use_cuda_graph:
torch.cuda.set_device(torch_device)
if warm_uped and self.args.use_cuda_graph:
if not hasattr(self, "cuda_graph_runner"):
self.cuda_graph_runner = CUDAGraphRunner()
self.cuda_graph_runner.capture(
@ -97,13 +108,14 @@ class KTransformersInterface(TransformersInterface):
logits = logits[0, -1, :]
return self.logits_to_token(logits)
if self.args.use_cuda_graph:
warm_uped = True
if self.use_static_cache:
mask = torch.ones((1, self.seq_length)).to(torch_device)
logits = self.model(
self.current_ids.to(torch_device),
cache_position=self.active_cache_position,
past_key_values=self.cache,
attention_mask=mask,
return_dict=False,
use_cache=True,
)[0]
@ -116,43 +128,74 @@ class KTransformersInterface(TransformersInterface):
@torch.no_grad
def prefill(self, input_ids: torch.Tensor, is_new: bool):
def prefill(self, input_ids: torch.Tensor, is_new: bool, temperature: Optional[float], top_p: Optional[float]):
input_ids_length = input_ids.shape[-1]
self.profiler.set_counter("prefill", input_ids_length)
if(input_ids_length >= self.args.cache_lens):
logger.warning(f"input_ids_length {input_ids_length} > cache_lens {self.args.cache_lens}")
self.seq_length = input_ids_length
return
logger.debug(f"input_ids: {input_ids.shape}")
device = self.device_map.get("blk.0.self_attn", {}).get("generate_device", "cuda:0")
device = "cuda:0" if device == "cuda" else device
if is_new:
self.cache.reset()
self.ever_generated_ids.clear()
former_seq_length = 0
self.seq_length = input_ids_length
same_prefix = 0
flat_input_ids = input_ids.flatten()
if getattr(self, 'generated_ids', None) is None:
self.generated_ids = torch.zeros(
self.args.batch_size,
self.seq_length + self.args.max_new_tokens + 1,
input_ids.shape[-1] + self.args.max_new_tokens + 1,
dtype=torch.int,
device=self.args.device,
)
self.seq_length = 1
flat_prev_ids = self.generated_ids.flatten()
for i in range(min(self.seq_length, flat_input_ids.shape[0]) - 1):
if flat_input_ids[i] == flat_prev_ids[i]:
same_prefix += 1
else:
break
logger.debug(f"same prefix len: {same_prefix}")
self.cache.remove_suffix(same_prefix)
self.seq_length = same_prefix
self.generated_ids = self.generated_ids[..., :same_prefix]
input_ids = input_ids[..., same_prefix:]
input_ids_length = input_ids.shape[-1]
self.ever_generated_ids.clear()
self.profiler.set_counter("prefill", input_ids_length)
logger.debug(f"input_ids: {input_ids.shape}")
logger.debug(f"generate_ids: {self.generated_ids.shape}")
former_seq_length = self.seq_length
self.seq_length += input_ids_length
expected_length = self.seq_length + self.args.max_new_tokens + 1
expected_length = min(self.seq_length + self.args.max_new_tokens + 1, self.args.cache_lens)
delta_length = expected_length - self.generated_ids.shape[-1]
if delta_length > 0:
new_generate_ids = torch.zeros(
self.args.batch_size, delta_length, dtype=torch.int, device=self.args.device
)
self.generated_ids = torch.cat([self.generated_ids, new_generate_ids], dim=-1)
else:
logger.warning(f"seq_length bigger than cache_lens, killed")
exit(0)
logger.debug(f"cache position: {former_seq_length} to {self.seq_length}")
cache_position = torch.arange(former_seq_length, self.seq_length, device=device)
self.generated_ids[:, cache_position] = input_ids.to(self.args.device).to(torch.int)
mask = torch.ones((1, self.seq_length)).to(device)
if not (type(self) is TransformersInterface):
input_ids = input_ids.to("cpu")
def chunk_prefill(input_ids, cache_position):
inputs_embeds = self.model.model.embed_tokens(input_ids).to(device)
torch.cuda.set_device(device)
if flashinfer_enabled:
MLAWrapperSingleton.need_plan_all()
if self.use_static_cache:
logits = self.model(
inputs_embeds=inputs_embeds,
@ -160,11 +203,23 @@ class KTransformersInterface(TransformersInterface):
past_key_values=self.cache,
return_dict=False,
use_cache=True,
attention_mask=mask,
)[0]
else:
logits = self.model(inputs_embeds=inputs_embeds, return_dict=False)[0]
return logits
chunk_start = 0
while chunk_start < input_ids_length:
chunk_end = min(chunk_start + self.args.chunk_prefill_size, input_ids_length)
if self.cache != None:
self.cache.cur_idx=cache_position[chunk_start:chunk_end]
logits = chunk_prefill(input_ids[:, chunk_start:chunk_end], cache_position[chunk_start:chunk_end])
chunk_start += self.args.chunk_prefill_size
if flashinfer_enabled:
MLAWrapperSingleton.reset_buffer()
self.prepare_logits_wrapper(input_ids, device, temperature, top_p)
next_token = self.logits_to_token(logits[0, -1, :])
yield self.append_new_tokens(next_token)
@ -172,3 +227,17 @@ class KTransformersInterface(TransformersInterface):
def active_cache_position(self):
device = self.device_map.get("blk.0.self_attn", {}).get("generate_device", "cuda:0")
return torch.tensor([self.seq_length - 1], device=device)
async def inference(self, local_messages, thread_id: str, temperature: Optional[float] = None, top_p: Optional[float] = None):
async with self._infer_lock:
async for v in super().inference(local_messages, thread_id, temperature, top_p):
yield v
# finally, yield the raw usage statistics of this inference
yield RawUsage(
tokenize_time = self.profiler.get_timer_sec('tokenize'),
prefill_time = self.profiler.get_timer_sec('prefill'),
decode_time = self.profiler.get_timer_sec('decode'),
prefill_count = self.profiler.get_counter('prefill'),
decode_count = self.profiler.get_counter('decode'),
)
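With this change, inference() yields (token, finish_reason) tuples while generating and a single RawUsage object at the very end. A minimal consumer sketch (collect_reply is a hypothetical helper; interface would come from get_interface()):

import asyncio
from ktransformers.server.schemas.endpoints.chat import RawUsage

async def collect_reply(interface, messages, thread_id: str) -> str:
    text = ""
    async for res in interface.inference(messages, thread_id):
        if isinstance(res, RawUsage):
            # usage statistics arrive once, after generation has finished
            print(f"prefill={res.prefill_count} tokens, decode={res.decode_count} tokens")
        else:
            token, finish_reason = res
            text += token
    return text

# asyncio.run(collect_reply(interface, "Hello!", "thread-0"))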

View file

@ -13,12 +13,13 @@ from transformers import (
from ktransformers.server.config.config import Config
from ktransformers.server.schemas.base import ObjectID
from ktransformers.server.utils.multi_timer import Profiler
from torch.nn.attention import SDPBackend
import torch
import sys, os
from ..base import ThreadContext, BackendInterfaceBase
from ktransformers.server.config.log import logger
from ..args import ConfigArgs, default_args
from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled, MLAWrapperSingleton
# This TextStreamer is a modified version from https://github.com/huggingface/transformers/blob/main/src/transformers/generation/streamers.py
class TextStreamer:
@ -170,7 +171,7 @@ class TransformersInterface(BackendInterfaceBase):
for m in messages[1:]:
if m["role"] == "user" and new_messages[-1]["role"] == "user":
logger.warning("merge two adjacent user messages")
new_messages[-1]["content"] += m["content"]
new_messages[-1]["content"] += '\n' + m["content"]
else:
new_messages.append(m)
# if (self.last_request_id is not None) and self.last_request_id == thread_id:
@ -179,7 +180,11 @@ class TransformersInterface(BackendInterfaceBase):
# input_ids = self.tokenizer.apply_chat_template(
# new_messages, return_tensors="pt", add_generation_prompt=True
# ).to(self.args.device)
input_ids = self.tokenizer.apply_chat_template(new_messages,return_tensors='pt',add_generation_prompt=True).to(self.args.device)
input_str: str = self.tokenizer.apply_chat_template(new_messages,tokenize=False,add_generation_prompt=True)
# drop <think> token in chat template
if input_str.endswith('<think>\n'):
input_str = input_str[:-len('<think>\n')]
input_ids = self.tokenizer.encode(input_str, return_tensors="pt").to(self.args.device)
if (self.last_request_id is not None) and self.last_request_id == thread_id:
x = self.generated_ids[:,:self.seq_length]
y = input_ids[:,:self.seq_length]
@ -198,14 +203,31 @@ class TransformersInterface(BackendInterfaceBase):
self.seq_length += 1
return self.streamer.put(new_tokens)
def logits_to_token(self, logits: torch.Tensor):
logits = logits / self.args.temperature if self.args.temperature!=0 else logits
def prepare_logits_wrapper(self, inputs, device, temperature: Optional[float] = None, top_p: Optional[float] = None):
if temperature is None or temperature == 0:
temperature = self.model.generation_config.temperature
if top_p is None:
top_p = self.model.generation_config.top_p
generation_config, model_kwargs = self.model._prepare_generation_config(
None, max_length=self.args.max_new_tokens,
do_sample=True,
top_k=self.args.top_k,
top_p=top_p,
temperature=temperature,
repetition_penalty=self.args.repetition_penalty # change this to modify generate config
)
self.inputs = inputs
try: # transformers==4.43
self.logits_warper = (
self.model._get_logits_warper(generation_config, device=device)
)
except:
self.logits_warper = (
self.model._get_logits_warper(generation_config)
)
for token_idx in self.ever_generated_ids:
if logits[token_idx] < 0:
logits[token_idx] *= self.args.repetition_penalty
else:
logits[token_idx] /= self.args.repetition_penalty
def logits_to_token(self, logits: torch.Tensor):
logits = self.logits_warper(self.inputs.view(1, -1), logits.view(1, -1))
probs = torch.nn.functional.softmax(logits, dim=-1)
@ -221,12 +243,10 @@ class TransformersInterface(BackendInterfaceBase):
def decode_one_tokens(self):
if self.use_static_cache:
mask = torch.ones((1, self.seq_length)).to(self.args.device)
logits = self.model(
self.current_ids,
cache_position=self.active_cache_position,
past_key_values=self.cache,
attention_mask=mask,
return_dict=False,
use_cache=True,
)[0]
@ -237,23 +257,42 @@ class TransformersInterface(BackendInterfaceBase):
return self.logits_to_token(logits)
@torch.no_grad
def prefill(self, input_ids: torch.Tensor, is_new: bool):
def prefill(self, input_ids: torch.Tensor, is_new: bool, temperature: Optional[float] = None, top_p: Optional[float] = None):
input_ids_length = input_ids.shape[-1]
self.profiler.set_counter("prefill", input_ids_length)
logger.debug(f"input_ids: {input_ids.shape}")
if is_new:
self.cache.reset()
self.ever_generated_ids.clear()
former_seq_length = 0
self.seq_length = input_ids_length
same_prefix = 0
flat_input_ids = input_ids.flatten()
if getattr(self, 'generated_ids', None) is None:
self.generated_ids = torch.zeros(
self.args.batch_size,
self.seq_length + self.args.max_new_tokens + 1,
input_ids.shape[-1] + self.args.max_new_tokens + 1,
dtype=torch.int,
device=self.args.device,
)
self.seq_length = 1
flat_prev_ids = self.generated_ids.flatten()
for i in range(min(self.seq_length, flat_input_ids.shape[0]) - 1):
if flat_input_ids[i] == flat_prev_ids[i]:
same_prefix += 1
else:
break
logger.debug(f"same prefix len: {same_prefix}")
self.cache.remove_suffix(same_prefix)
self.seq_length = same_prefix
self.generated_ids = self.generated_ids[..., :same_prefix]
input_ids = input_ids[..., same_prefix:]
input_ids_length = input_ids.shape[-1]
self.ever_generated_ids.clear()
self.profiler.set_counter("prefill", input_ids_length)
logger.debug(f"input_ids: {input_ids.shape}")
logger.debug(f"generate_ids: {self.generated_ids.shape}")
former_seq_length = self.seq_length
self.seq_length += input_ids_length
@ -264,11 +303,11 @@ class TransformersInterface(BackendInterfaceBase):
self.args.batch_size, delta_length, dtype=torch.int, device=self.args.device
)
self.generated_ids = torch.cat([self.generated_ids, new_generate_ids], dim=-1)
logger.debug(f"cache position: {former_seq_length} to {self.seq_length}")
cache_position = torch.arange(former_seq_length, self.seq_length, device=self.args.device)
self.generated_ids[:, cache_position] = input_ids.to(self.args.device).to(torch.int)
mask = torch.ones((1, self.seq_length)).to(self.args.device)
device = input_ids.device
if not (type(self) is TransformersInterface):
input_ids = input_ids.to("cpu")
@ -280,26 +319,46 @@ class TransformersInterface(BackendInterfaceBase):
past_key_values=self.cache,
return_dict=False,
use_cache=True,
attention_mask=mask,
)[0]
else:
logits = self.model(inputs_embeds=inputs_embeds, return_dict=False)[0]
self.prepare_logits_wrapper(input_ids, device, temperature, top_p)
next_token = self.logits_to_token(logits[0, -1, :])
yield self.append_new_tokens(next_token)
@torch.no_grad
def generate(self):
self.max_new_tokens = min(self.args.max_new_tokens, self.args.cache_lens - self.seq_length) - 1
logger.info(f"args.max_new_tokens: {self.args.max_new_tokens}, cache_lens: {self.args.cache_lens}, seq_length: {self.seq_length}")
if(self.max_new_tokens <= 0):
logger.warning("max_new_tokens is less than 0")
yield self.streamer.end(), "length"
return
logger.info(f"max_new_tokens: {self.max_new_tokens}")
self.profiler.set_counter("decode", 0)
for _ in range(1, self.args.max_new_tokens):
with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_mem_efficient=False, enable_math=True):
for i in range(1, self.max_new_tokens):
with torch.nn.attention.sdpa_kernel(backends=[SDPBackend.FLASH_ATTENTION, SDPBackend.MATH, SDPBackend.EFFICIENT_ATTENTION]):
if flashinfer_enabled:
MLAWrapperSingleton.plan_all(None,None,None,self.active_cache_position.to(torch.int32)+1,
num_heads=self.model.config.num_attention_heads, head_dim_ckv=self.model.config.kv_lora_rank,
head_dim_kpe=self.model.config.qk_rope_head_dim, page_size=self.cache.page_size,
sm_scale=self.model.model.layers[0].self_attn.softmax_scale, q_data_type=torch.bfloat16, kv_data_type=torch.bfloat16)
next_token = self.decode_one_tokens()
self.profiler.inc("decode")
if next_token == self.tokenizer.eos_token_id:
if next_token == self.tokenizer.eos_token_id or "<|im_end|>" == self.tokenizer.decode(next_token):
yield self.streamer.end(), None
yield "", "stop"
assert self.args.batch_size == 1
break
yield self.append_new_tokens(next_token)
yield self.streamer.end()
yield self.append_new_tokens(next_token), None
else: # for-else: we generated max_new_tokens without hitting an EOS token
yield self.streamer.end(), None
yield "", "length"
def check_is_new(self, thread_id: str):
if not self.use_static_cache:
@ -314,7 +373,8 @@ class TransformersInterface(BackendInterfaceBase):
self.last_request_id = thread_id
return True
async def inference(self, local_messages, thread_id: str):
async def inference(self, local_messages, thread_id: str, temperature: Optional[float] = None, top_p: Optional[float] = None):
self.streamer.reset()
self.profiler.create_and_start_timer("tokenize")
if isinstance(local_messages, List):
input_ids = self.format_and_tokenize_input_ids(thread_id, local_messages)
@ -324,8 +384,9 @@ class TransformersInterface(BackendInterfaceBase):
#input_ids = torch.tensor([[6366]], device=input_ids.device)
else:
raise ValueError("local_messages should be List or str")
if Config().user_force_think:
token_thinks = torch.tensor([self.tokenizer.encode("<think>\\n",add_special_tokens=False)],device=input_ids.device)
token_thinks = torch.tensor([self.tokenizer.encode("<think>\n",add_special_tokens=False)],device=input_ids.device)
input_ids = torch.cat(
[input_ids, token_thinks], dim=1
)
@ -333,21 +394,24 @@ class TransformersInterface(BackendInterfaceBase):
self.profiler.pause_timer("tokenize")
self.profiler.create_and_start_timer("prefill")
if Config().user_force_think:
t = "<think>\n"
print(t,end="",flush=True)
yield t
for t in self.prefill(input_ids, self.check_is_new(thread_id)):
think = '<think>\n'
print(think, end="",flush=True)
yield think, None
for t in self.prefill(input_ids, self.check_is_new(thread_id), temperature, top_p):
# output think token after prefill done
if t is not None:
print(t, end="",flush=True)
yield t
yield t, None
self.profiler.pause_timer("prefill")
self.profiler.create_and_start_timer("decode")
for t in self.generate():
for t, finish_reason in self.generate():
if t is not None:
print(t, end="",flush=True)
yield t
yield t, finish_reason
print("")
self.profiler.pause_timer("decode")
self.report_last_time_performance()
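prepare_logits_wrapper above delegates temperature and top-p handling to the Hugging Face logits warpers. The sketch below is a self-contained approximation of that behaviour (not the library's implementation) for readers who want to see the effect in isolation:

import torch

def sample_temperature_top_p(logits: torch.Tensor, temperature: float = 0.6, top_p: float = 0.95) -> int:
    """Temperature scaling, then top-p (nucleus) filtering, then multinomial sampling."""
    logits = logits / max(temperature, 1e-5)
    probs = torch.softmax(logits, dim=-1)
    sorted_probs, sorted_idx = torch.sort(probs, descending=True)
    cumulative = torch.cumsum(sorted_probs, dim=-1)
    sorted_probs[cumulative - sorted_probs > top_p] = 0.0  # drop tokens outside the nucleus
    sorted_probs = sorted_probs / sorted_probs.sum()
    return int(sorted_idx[torch.multinomial(sorted_probs, num_samples=1)])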

View file

@ -69,6 +69,7 @@ class Config(metaclass=Singleton):
self.server: dict = cfg.get("server", {})
self.server_ip = self.server.get("ip", "0.0.0.0")
self.server_port = self.server.get("port", 9016)
self.api_key = self.server.get("api_key", "")
# db configs
self.db_configs: dict = cfg.get("db", {})
@ -104,7 +105,8 @@ class Config(metaclass=Singleton):
self.total_context = self.model.get("total_context", 2**18)
self.max_batch_size = self.model.get("max_batch_size", 20 if self.paged else 1)
self.max_chunk_size = self.model.get("max_chunk_size", 2048)
self.chunk_prefill_size = self.model.get("chunk_prefill_size", 8192)
self.max_new_tokens = self.model.get("max_new_tokens", 2000)
self.json_mode = self.model.get("json_mode", False)
self.healing = self.model.get("healing", False)
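The new chunk_prefill_size setting (default 8192) bounds how many prompt tokens are prefilled per forward pass. A minimal, illustrative sketch of the chunking pattern the backend loop follows (iter_prefill_chunks is a hypothetical helper):

def iter_prefill_chunks(total_len: int, chunk_size: int = 8192):
    """Yield (start, end) pairs covering [0, total_len) in steps of chunk_size tokens."""
    start = 0
    while start < total_len:
        end = min(start + chunk_size, total_len)
        yield start, end
        start = end

# A 20000-token prompt with the default chunk size is prefilled in three passes:
# list(iter_prefill_chunks(20000)) -> [(0, 8192), (8192, 16384), (16384, 20000)]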

View file

@ -105,6 +105,7 @@ def custom_openapi(app):
def main():
cfg = Config()
arg_parser = ArgumentParser(cfg)
# initialization message

View file

@ -5,6 +5,7 @@ langchain >= 0.2.0
blessed >= 1.20.0
accelerate >= 0.31.0
sentencepiece >= 0.1.97
openai
setuptools
build
ninja

View file

@ -73,7 +73,7 @@ class RunStepDelta(Object):
class Done():
def to_stream_reply(self):
return f"event: done\ndata: [DONE]\n\n"
return f"data: [DONE]\n\n"
async def check_client_link(request: Request, async_events: AsyncIterable):

Some files were not shown because too many files have changed in this diff.