diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 0000000..ef4556e --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1,19 @@ +FROM pytorch/pytorch:2.5.1-cuda12.1-cudnn9-devel as compile_server +WORKDIR /workspace +ENV CUDA_HOME /usr/local/cuda +RUN <🔥 Updates -* **Feb 10, 2025**: Support Deepseek-R1 and V3 on single (24GB VRAM)/multi gpu and 382G DRAM, up to 3~28x speedup. The detailed tutorial is [here](./doc/en/DeepseekR1_V3_tutorial.md). -* **Aug 28, 2024**: Support 1M context under the InternLM2.5-7B-Chat-1M model, utilizing 24GB of VRAM and 150GB of DRAM. The detailed tutorial is [here](./doc/en/long_context_tutorial.md). +* **Mar 5, 2025**: Support unsloth 1.58/2.51 bits weights and [IQ1_S/FP8 hybrid](./doc/en/fp8_kernel.md) weights. Support 139K [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context) for DeepSeek-V3 and R1 in 24GB VRAM. +* **Feb 25, 2025**: Support [FP8 GPU kernel](./doc/en/fp8_kernel.md) for DeepSeek-V3 and R1; [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context). +* **Feb 15, 2025**: Longer Context (from 4K to 8K for 24GB VRAM) & Slightly Faster Speed (+15%, up to 16 Tokens/s), update [docs](./doc/en/DeepseekR1_V3_tutorial.md) and [online books](https://kvcache-ai.github.io/ktransformers/). +* **Feb 10, 2025**: Support Deepseek-R1 and V3 on single (24GB VRAM)/multi gpu and 382G DRAM, up to 3~28x speedup. For detailed show case and reproduction tutorial, see [here](./doc/en/DeepseekR1_V3_tutorial.md). * **Aug 28, 2024**: Decrease DeepseekV2's required VRAM from 21G to 11G. -* **Aug 15, 2024**: Update detailed [TUTORIAL](doc/en/injection_tutorial.md) for injection and multi-GPU. +* **Aug 15, 2024**: Update detailed [tutorial](doc/en/injection_tutorial.md) for injection and multi-GPU. * **Aug 14, 2024**: Support llamfile as linear backend. * **Aug 12, 2024**: Support multiple GPU; Support new model: mixtral 8\*7B and 8\*22B; Support q2k, q3k, q5k dequant on gpu. * **Aug 9, 2024**: Support windows native. - +

🌟 Show Cases

@@ -43,10 +45,10 @@ https://github.com/user-attachments/assets/ebd70bfa-b2c1-4abb-ae3b-296ed38aa285 - **[NEW!!!] Local 671B DeepSeek-Coder-V3/R1:** Running its Q4_K_M version using only 14GB VRAM and 382GB DRAM([Tutorial](./doc/en/DeepseekR1_V3_tutorial.md)). - Prefill Speed (tokens/s): - - KTransfermor: 54.21 (32 cores) → 74.362 (dual-socket, 2×32 cores) → 255.26 (optimized AMX-based MoE kernel, V0.3 only) → 286.55 (selectively using 6 experts, V0.3 only) + - KTransformers: 54.21 (32 cores) → 74.362 (dual-socket, 2×32 cores) → 255.26 (optimized AMX-based MoE kernel, V0.3 only) → 286.55 (selectively using 6 experts, V0.3 only) - Compared to 10.31 tokens/s in llama.cpp with 2×32 cores, achieving up to **27.79× speedup**. - Decode Speed (tokens/s): - - KTransfermor: 8.73 (32 cores) → 11.26 (dual-socket, 2×32 cores) → 13.69 (selectively using 6 experts, V0.3 only) + - KTransformers: 8.73 (32 cores) → 11.26 (dual-socket, 2×32 cores) → 13.69 (selectively using 6 experts, V0.3 only) - Compared to 4.51 tokens/s in llama.cpp with 2×32 cores, achieving up to **3.03× speedup**. - Upcoming Open Source Release: - AMX optimizations and selective expert activation will be open-sourced in V0.3. @@ -69,7 +71,7 @@ https://github.com/user-attachments/assets/4c6a8a38-05aa-497d-8eb1-3a5b3918429c

-

1M Context Local Inference on a Desktop with Only 24GB VRAM

+ More advanced features are coming soon, so stay tuned!

🚀 Quick Start

-

Preparation

-Some preparation: -- CUDA 12.1 and above, if you didn't have it yet, you may install from [here](https://developer.nvidia.com/cuda-downloads). - - ```sh - # Adding CUDA to PATH - export PATH=/usr/local/cuda/bin:$PATH - export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH - export CUDA_PATH=/usr/local/cuda - ``` +Getting started with KTransformers is simple! Follow the steps below to set up and start using it. -- Linux-x86_64 with gcc, g++ and cmake - - ```sh - sudo apt-get update - sudo apt-get install gcc g++ cmake ninja-build - ``` +### 📥 Installation -- We recommend using [Conda](https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh) to create a virtual environment with Python=3.11 to run our program. - - ```sh - conda create --name ktransformers python=3.11 - conda activate ktransformers # you may need to run ‘conda init’ and reopen shell first - ``` +To install KTransformers, follow the official [Installation Guide](https://kvcache-ai.github.io/ktransformers/en/install.html). -- Make sure that PyTorch, packaging, ninja is installed - - ``` - pip install torch packaging ninja cpufeature numpy - ``` - -
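Before running the installer, it can help to confirm that the CUDA-enabled PyTorch toolchain described above is actually visible to Python. Below is a minimal sanity-check sketch; the `cpufeature` package and the `AVX512f` flag name are assumptions carried over from the preparation list above, not requirements of the installer.

```python
# Quick pre-install environment check (a sketch, not part of KTransformers itself).
import shutil

import torch

print("torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA version seen by torch:", torch.version.cuda)
print("nvcc on PATH:", shutil.which("nvcc"))  # expected to point into /usr/local/cuda/bin

try:
    import cpufeature  # listed in the preparation step above; the flag name is an assumption
    print("AVX512f support:", cpufeature.CPUFeature.get("AVX512f"))
except ImportError:
    print("cpufeature not installed; skipping CPU flag check")
```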

Installation

- -1. Use a Docker image, see [documentation for Docker](./doc/en/Docker.md) - -2. You can install using Pypi (for linux): - - ``` - pip install ktransformers --no-build-isolation - ``` - - for windows we prepare a pre compiled whl package on [ktransformers-0.2.0+cu125torch24avx2-cp312-cp312-win_amd64.whl](https://github.com/kvcache-ai/ktransformers/releases/download/v0.2.0/ktransformers-0.2.0+cu125torch24avx2-cp312-cp312-win_amd64.whl), which require cuda-12.5, torch-2.4, python-3.11, more pre compiled package are being produced. - -3. Or you can download source code and compile: - - - init source code - - ```sh - git clone https://github.com/kvcache-ai/ktransformers.git - cd ktransformers - git submodule init - git submodule update - ``` - - - [Optional] If you want to run with website, please [compile the website](./doc/en/api/server/website.md) before execute ```bash install.sh``` - - - Compile and install (for Linux) - - ``` - bash install.sh - ``` - - - Compile and install(for Windows) - - ``` - install.bat - ``` -4. If you are developer, you can make use of the makefile to compile and format the code.
the detailed usage of makefile is [here](./doc/en/makefile_usage.md) -

Local Chat

-We provide a simple command-line local chat Python script that you can run for testing. - -> Note that this is a very simple test tool only support one round chat without any memory about last input, if you want to try full ability of the model, you may go to [RESTful API and Web UI](#id_666). We use the DeepSeek-V2-Lite-Chat-GGUF model as an example here. But we also support other models, you can replace it with any other model that you want to test. - -

Run Example

- -```shell -# Begin from root of your cloned repo! -# Begin from root of your cloned repo!! -# Begin from root of your cloned repo!!! - -# Download mzwing/DeepSeek-V2-Lite-Chat-GGUF from huggingface -mkdir DeepSeek-V2-Lite-Chat-GGUF -cd DeepSeek-V2-Lite-Chat-GGUF - -wget https://huggingface.co/mzwing/DeepSeek-V2-Lite-Chat-GGUF/resolve/main/DeepSeek-V2-Lite-Chat.Q4_K_M.gguf -O DeepSeek-V2-Lite-Chat.Q4_K_M.gguf - -cd .. # Move to repo's root dir - -# Start local chat -python -m ktransformers.local_chat --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF - -# If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try: -# GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite -# python ktransformers.local_chat --model_path ./DeepSeek-V2-Lite --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF -``` - -It features the following arguments: - -- `--model_path` (required): Name of the model (such as "deepseek-ai/DeepSeek-V2-Lite-Chat" which will automatically download configs from [Hugging Face](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite)). Or if you already got local files you may directly use that path to initialize the model. - - > Note: .safetensors files are not required in the directory. We only need config files to build model and tokenizer. - -- `--gguf_path` (required): Path of a directory containing GGUF files which could that can be downloaded from [Hugging Face](https://huggingface.co/mzwing/DeepSeek-V2-Lite-Chat-GGUF/tree/main). Note that the directory should only contains GGUF of current model, which means you need one separate directory for each model. - -- `--optimize_rule_path` (required except for Qwen2Moe and DeepSeek-V2): Path of YAML file containing optimize rules. There are two rule files pre-written in the [ktransformers/optimize/optimize_rules](ktransformers/optimize/optimize_rules) directory for optimizing DeepSeek-V2 and Qwen2-57B-A14, two SOTA MoE models. - -- `--max_new_tokens`: Int (default=1000). Maximum number of new tokens to generate. - -- `--cpu_infer`: Int (default=10). The number of CPUs used for inference. Should ideally be set to the (total number of cores - 2). - -

Suggested Model

- -| Model Name | Model Size | VRAM | Minimum DRAM | Recommended DRAM | -| ------------------------------ | ---------- | ----- | --------------- | ----------------- | -| DeepSeek-R1-q4_k_m | 377G | 14G | 382G | 512G | -| DeepSeek-V3-q4_k_m | 377G | 14G | 382G | 512G | -| DeepSeek-V2-q4_k_m | 133G | 11G | 136G | 192G | -| DeepSeek-V2.5-q4_k_m | 133G | 11G | 136G | 192G | -| DeepSeek-V2.5-IQ4_XS | 117G | 10G | 107G | 128G | -| Qwen2-57B-A14B-Instruct-q4_k_m | 33G | 8G | 34G | 64G | -| DeepSeek-V2-Lite-q4_k_m | 9.7G | 3G | 13G | 16G | -| Mixtral-8x7B-q4_k_m | 25G | 1.6G | 51G | 64G | -| Mixtral-8x22B-q4_k_m | 80G | 4G | 86.1G | 96G | -| InternLM2.5-7B-Chat-1M | 15.5G | 15.5G | 8G(32K context) | 150G (1M context) | - - -More will come soon. Please let us know which models you are most interested in. - -Be aware that you need to be subject to their corresponding model licenses when using [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V2/blob/main/LICENSE) and [QWen](https://huggingface.co/Qwen/Qwen2-72B-Instruct/blob/main/LICENSE). - -
- Click To Show how to run other examples - -* Qwen2-57B - - ```sh - pip install flash_attn # For Qwen2 - - mkdir Qwen2-57B-GGUF && cd Qwen2-57B-GGUF - - wget https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct-GGUF/resolve/main/qwen2-57b-a14b-instruct-q4_k_m.gguf?download=true -O qwen2-57b-a14b-instruct-q4_k_m.gguf - - cd .. - - python -m ktransformers.local_chat --model_name Qwen/Qwen2-57B-A14B-Instruct --gguf_path ./Qwen2-57B-GGUF - - # If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try: - # GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct - # python ktransformers/local_chat.py --model_path ./Qwen2-57B-A14B-Instruct --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF - ``` - -* DeepseekV2 - - ```sh - mkdir DeepSeek-V2-Chat-0628-GGUF && cd DeepSeek-V2-Chat-0628-GGUF - # Download weights - wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00001-of-00004.gguf -o DeepSeek-V2-Chat-0628-Q4_K_M-00001-of-00004.gguf - wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00002-of-00004.gguf -o DeepSeek-V2-Chat-0628-Q4_K_M-00002-of-00004.gguf - wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00003-of-00004.gguf -o DeepSeek-V2-Chat-0628-Q4_K_M-00003-of-00004.gguf - wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00004-of-00004.gguf -o DeepSeek-V2-Chat-0628-Q4_K_M-00004-of-00004.gguf - - cd .. - - python -m ktransformers.local_chat --model_name deepseek-ai/DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF - - # If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try: - - # GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-V2-Chat-0628 - - # python -m ktransformers.local_chat --model_path ./DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF - ``` - -| model name | weights download link | -|----------|----------| -| Qwen2-57B | [Qwen2-57B-A14B-gguf-Q4K-M](https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct-GGUF/tree/main) | -| DeepseekV2-coder |[DeepSeek-Coder-V2-Instruct-gguf-Q4K-M](https://huggingface.co/LoneStriker/DeepSeek-Coder-V2-Instruct-GGUF/tree/main) | -| DeepseekV2-chat |[DeepSeek-V2-Chat-gguf-Q4K-M](https://huggingface.co/bullerwins/DeepSeek-V2-Chat-0628-GGUF/tree/main) | -| DeepseekV2-lite | [DeepSeek-V2-Lite-Chat-GGUF-Q4K-M](https://huggingface.co/mzwing/DeepSeek-V2-Lite-Chat-GGUF/tree/main) | - -
- - - - -

RESTful API and Web UI

- - -Start without website: - -```sh -ktransformers --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path /path/to/DeepSeek-V2-Lite-Chat-GGUF --port 10002 -``` - -Start with website: - -```sh -ktransformers --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path /path/to/DeepSeek-V2-Lite-Chat-GGUF --port 10002 --web True -``` - -Or you want to start server with transformers, the model_path should include safetensors - -```bash -ktransformers --type transformers --model_path /mnt/data/model/Qwen2-0.5B-Instruct --port 10002 --web True -``` - -Access website with url [http://localhost:10002/web/index.html#/chat](http://localhost:10002/web/index.html#/chat) : - -

- - Web UI - -

- -More information about the RESTful API server can be found [here](doc/en/api/server/server.md). You can also find an example of integrating with Tabby [here](doc/en/api/server/tabby.md).
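Once the server above is running, its OpenAI-style endpoint can be exercised with a short client sketch like the one below. The `/v1/chat/completions` route, the payload fields, and the model name are assumptions based on the commands above; see the linked server documentation for the authoritative API.

```python
# Sketch of calling the OpenAI-compatible endpoint exposed by `ktransformers ... --port 10002`.
# Route, fields, and model name below are assumptions; check doc/en/api/server/server.md.
import requests

payload = {
    "model": "DeepSeek-V2-Lite-Chat",  # hypothetical model identifier
    "messages": [{"role": "user", "content": "Write a quicksort function in Python."}],
    "max_tokens": 256,
}
response = requests.post(
    "http://localhost:10002/v1/chat/completions", json=payload, timeout=600
)
response.raise_for_status()
print(response.json()["choices"][0]["message"]["content"])
```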

📃 Brief Injection Tutorial

At the heart of KTransformers is a user-friendly, template-based injection framework. @@ -333,7 +127,7 @@ To utilize the provided kernels, users only need to create a YAML-based injectio ```python with torch.device("meta"): model = AutoModelForCausalLM.from_config(config, trust_remote_code=True) -optimize_and_load_gguf(model, optimize_rule_path, gguf_path, config) +optimize_and_load_gguf(model, optimize_config_path, gguf_path, config) ... generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens=1000) ``` @@ -368,14 +162,14 @@ If you are interested in our design principles and the implementation of the inj
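For orientation, the fragment above can be read as part of a fuller script roughly like the sketch below. The import paths, the rule-file name, and the prompt handling are assumptions for illustration; `ktransformers/local_chat.py` in the repository is the authoritative version.

```python
# Hedged sketch of how the snippet above is usually wired together.
# Import paths and the YAML file name are assumptions; see ktransformers/local_chat.py.
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

from ktransformers.optimize.optimize import optimize_and_load_gguf  # assumed module path
from ktransformers.util.utils import prefill_and_generate  # assumed module path

model_path = "deepseek-ai/DeepSeek-V2-Lite-Chat"  # only config/tokenizer files are needed
gguf_path = "./DeepSeek-V2-Lite-Chat-GGUF"
optimize_config_path = "ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat.yaml"  # hypothetical rule file

config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

with torch.device("meta"):  # build the module tree without allocating real weights
    model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)

# Walk the model, apply the YAML rules, and load the matching GGUF tensors.
optimize_and_load_gguf(model, optimize_config_path, gguf_path, config)

input_tensor = tokenizer("Hello, who are you?", return_tensors="pt").input_ids
generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens=1000)
# `generated` holds the produced tokens; local_chat.py also streams the decoded text to stdout.
```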

Acknowledgment and Contributors

-The development of KTransformer is based on the flexible and versatile framework provided by Transformers. We also benefit from advanced kernels such as GGUF/GGML, Llamafile, and Marlin. We are planning to contribute back to the community by upstreaming our modifications. +The development of KTransformers is based on the flexible and versatile framework provided by Transformers. We also benefit from advanced kernels such as GGUF/GGML, Llamafile, Marlin, sglang, and flashinfer. We are planning to contribute back to the community by upstreaming our modifications. KTransformers is actively maintained and developed by contributors from the MADSys group at Tsinghua University and members from Approaching.AI. We welcome new contributors to join us in making KTransformers faster and easier to use.

Discussion

-If you have any questions, feel free to open an issue. Alternatively, you can join our WeChat group for further discussion. QR Code: [WeChat Group](WeChatGrouop.jpg) +If you have any questions, feel free to open an issue. Alternatively, you can join our WeChat group for further discussion. QR Code: [WeChat Group](WeChatGroup.png)

🙋 FAQ

diff --git a/README_ZH.md b/README_ZH.md new file mode 100644 index 0000000..6c82805 --- /dev/null +++ b/README_ZH.md @@ -0,0 +1,166 @@ +
+ +

+ + + KTransformers + + + +

+

一个用于体验尖端 LLM 推理优化的灵活框架

+ 🌟 案例展示 | 🚀 快速入门 | 📃 教程 | 💬 讨论 | 🙋 常见问题 +
+ +

🎉 介绍

+KTransformers(发音为 Quick Transformers)旨在通过先进的内核优化和放置/并行策略来增强您对 🤗 [Transformers](https://github.com/huggingface/transformers) 的体验。 +

+KTransformers 是一个以 Python 为中心的灵活框架,其核心是可扩展性。通过用一行代码实现并注入优化模块,用户可以获得与 Transformers 兼容的接口、符合 OpenAI 和 Ollama 的 RESTful API,甚至是一个简化的类似 ChatGPT 的 Web 界面。 +

+我们对 KTransformers 的愿景是成为一个用于实验创新 LLM 推理优化的灵活平台。如果您需要任何其他功能,请告诉我们。 + +

🔥 更新

+ +* **2025 年 2 月 25 日**：为 DeepSeek-V3/R1 支持 [FP8 GPU 内核](./doc/en/fp8_kernel.md)；支持更长的上下文（[教程](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context)）。 +* **2025 年 2 月 15 日**：长上下文（从 4K 到 8K，24GB VRAM）& 稍快的速度（+15%，最快 16 Tokens/s），文档请参见 [这里](./doc/en/DeepseekR1_V3_tutorial.md) 和 [在线指南](https://kvcache-ai.github.io/ktransformers/)。 +* **2025 年 2 月 10 日**：支持 DeepSeek-R1 和 V3 在单个（24GB VRAM）/多 GPU 和 382G DRAM 上运行，速度提升高达 3~28 倍。详细教程请参见 [这里](./doc/en/DeepseekR1_V3_tutorial.md)。 +* **2024 年 8 月 28 日**：支持 InternLM2.5-7B-Chat-1M 模型下的 1M 上下文，使用 24GB 的 VRAM 和 150GB 的 DRAM。详细教程请参见 [这里](./doc/en/long_context_tutorial.md)。 +* **2024 年 8 月 28 日**：将 DeepseekV2 所需的 VRAM 从 21G 降低到 11G。 +* **2024 年 8 月 15 日**：更新了详细的 [教程](doc/en/injection_tutorial.md)，介绍注入和多 GPU 的使用。 +* **2024 年 8 月 14 日**：支持 llamafile 作为线性后端。 +* **2024 年 8 月 12 日**：支持多 GPU；支持新模型：Mixtral 8\*7B 和 8\*22B；支持 q2k、q3k、q5k 在 GPU 上的去量化。 +* **2024 年 8 月 9 日**：支持 Windows。 + +

🌟 案例展示

+ +
+

在仅 24GB VRAM 的桌面上运行 GPT-4/o1 级别的本地 VSCode Copilot

+
+ +https://github.com/user-attachments/assets/ebd70bfa-b2c1-4abb-ae3b-296ed38aa285 + +

+ +- **[NEW!!!] 本地 671B DeepSeek-Coder-V3/R1**:使用其 Q4_K_M 版本,仅需 14GB VRAM 和 382GB DRAM 即可运行(教程请参见 [这里](./doc/en/DeepseekR1_V3_tutorial.md))。 + - 预填充速度(tokens/s): + - KTransformers:54.21(32 核)→ 74.362(双插槽,2×32 核)→ 255.26(优化的 AMX 基 MoE 内核,仅 V0.3)→ 286.55(选择性使用 6 个专家,仅 V0.3) + - 与 llama.cpp 在 2×32 核下相比,达到 **27.79× 速度提升**。 + - 解码速度(tokens/s): + - KTransformers:8.73(32 核)→ 11.26(双插槽,2×32 核)→ 13.69(选择性使用 6 个专家,仅 V0.3) + - 与 llama.cpp 在 2×32 核下相比,达到 **3.03× 速度提升**。 + - 即将开源发布: + - AMX 优化和选择性专家激活将在 V0.3 中开源。 + - 目前仅在预览二进制分发中可用,可从 [这里](./doc/en/DeepseekR1_V3_tutorial.md) 下载。 + +- **本地 236B DeepSeek-Coder-V2**:使用其 Q4_K_M 版本,仅需 21GB VRAM 和 136GB DRAM 即可运行,甚至在 [BigCodeBench](https://huggingface.co/blog/leaderboard-bigcodebench) 中得分超过 GPT4-0613。 + +

+ + DeepSeek-Coder-V2 Score + +

+ +- **更快的速度**:通过 MoE 卸载和注入来自 [Llamafile](https://github.com/Mozilla-Ocho/llamafile/tree/main) 和 [Marlin](https://github.com/IST-DASLab/marlin) 的高级内核,实现了 2K 提示预填充 126 tokens/s 和生成 13.6 tokens/s 的速度。 +- **VSCode 集成**:封装成符合 OpenAI 和 Ollama 的 API,可无缝集成到 [Tabby](https://github.com/TabbyML/tabby) 和其他前端的后端。 + +

+ +https://github.com/user-attachments/assets/4c6a8a38-05aa-497d-8eb1-3a5b3918429c + +

+ + + + + + +更多高级功能即将推出,敬请期待! + +

🚀 快速入门

+ + +KTransformers 的入门非常简单！请参考我们的[安装指南](https://kvcache-ai.github.io/ktransformers/)进行安装。 + 

📃 简要注入教程

+KTransformers 的核心是一个用户友好的、基于模板的注入框架。这使得研究人员可以轻松地将原始 torch 模块替换为优化的变体。它还简化了多种优化的组合过程,允许探索它们的协同效应。 +
+

+ + Inject-Struction + +

+ +鉴于 vLLM 已经是一个用于大规模部署优化的优秀框架，KTransformers 特别关注受资源限制的本地部署。我们特别关注异构计算的机会，例如量化模型的 GPU/CPU 卸载。例如，我们支持高效的 Llamafile 和 Marlin 内核，分别用于 CPU 和 GPU。更多详细信息可以在这里找到。 + + 

示例用法

+要使用提供的内核,用户只需创建一个基于 YAML 的注入模板,并在使用 Transformers 模型之前添加对 `optimize_and_load_gguf` 的调用。 + +```python +with torch.device("meta"): + model = AutoModelForCausalLM.from_config(config, trust_remote_code=True) +optimize_and_load_gguf(model, optimize_config_path, gguf_path, config) +... +generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens=1000) +``` + +在这个示例中,首先在 meta 设备上初始化 AutoModel,以避免占用任何内存资源。然后,`optimize_and_load_gguf` 遍历模型的所有子模块,匹配您的 YAML 规则文件中指定的规则,并将它们替换为指定的高级模块。 + +注入后,原始的 `generate` 接口仍然可用,但我们还提供了一个兼容的 `prefill_and_generate` 方法,这使得可以进一步优化,例如使用 CUDAGraph 提高生成速度。 + +

如何自定义您的模型

+ +一个详细的使用 DeepSeek-V2 作为示例的注入和 multi-GPU 教程在 [这里](doc/en/injection_tutorial.md)。 + +以下是一个将所有原始 Linear 模块替换为 Marlin 的 YAML 模板示例,Marlin 是一个高级的 4 位量化内核。 + +```yaml +- match: + name: "^model\\.layers\\..*$" # 正则表达式 + class: torch.nn.Linear # 仅匹配同时符合名称和类的模块 + replace: + class: ktransformers.operators.linear.KTransformerLinear # 量化数据类型的优化内核 + device: "cpu" # 初始化时加载该模块的 device + kwargs: + generate_device: "cuda" + generate_linear_type: "QuantizedLinearMarlin" +``` + +YAML 文件中的每个规则都有两部分:`match` 和 `replace`。`match` 部分指定应替换的模块,`replace` 部分指定要注入到模型中的模块以及初始化关键字。 + +您可以在 [ktransformers/optimize/optimize_rules](ktransformers/optimize/optimize_rules) 目录中找到用于优化 DeepSeek-V2 和 Qwen2-57B-A14 的示例规则模板。这些模板用于为 `local_chat.py` 示例提供支持。 + +如果您对我们的设计原则和注入框架的实现感兴趣,请参考 [设计文档](doc/en/deepseek-v2-injection.md)。 + +

致谢和贡献者

+ +KTransformers 的开发基于 Transformers 提供的灵活和多功能框架。我们还受益于 GGUF/GGML、Llamafile、Marlin、sglang 和 flashinfer 等高级内核。我们计划通过向上游贡献我们的修改来回馈社区。 + +KTransformers 由清华大学 MADSys 小组的成员以及 Approaching.AI 的成员积极维护和开发。我们欢迎新的贡献者加入我们，使 KTransformers 更快、更易于使用。 + + 

讨论

+ +如果您有任何问题,欢迎随时提出 issue。或者,您可以加入我们的微信群进行进一步讨论。二维码: [微信群](WeChatGroup.png) + +

🙋 常见问题

+ +一些常见问题的答案可以在 [FAQ](doc/en/FAQ.md) 中找到。 diff --git a/WeChatGrouop.jpg b/WeChatGrouop.jpg deleted file mode 100644 index cffd909..0000000 Binary files a/WeChatGrouop.jpg and /dev/null differ diff --git a/WeChatGroup.png b/WeChatGroup.png new file mode 100644 index 0000000..f9ea271 Binary files /dev/null and b/WeChatGroup.png differ diff --git a/book.toml b/book.toml new file mode 100644 index 0000000..c88d9b7 --- /dev/null +++ b/book.toml @@ -0,0 +1,18 @@ +[book] +authors = ["kvcache-ai"] +language = "zh-CN" +title = "Ktransformers" +src = "doc" + +[output.html] +git-repository-url = "https://github.com/kvcache-ai/ktransformers" +edit-url-template = "https://github.com/kvcache-ai/ktransformers/edit/main/{path}" + +[output.html.playground] +editable = true +copy-js = true +# line-numbers = true + +[output.html.fold] +enable = true +level = 0 \ No newline at end of file diff --git a/doc/README.md b/doc/README.md new file mode 100644 index 0000000..8bd94a0 --- /dev/null +++ b/doc/README.md @@ -0,0 +1,33 @@ +
+ +

+ + + KTransformers + + + +

+ +
+ +

🎉 Introduction

+KTransformers, pronounced as Quick Transformers, is designed to enhance your 🤗 Transformers experience with advanced kernel optimizations and placement/parallelism strategies. +

+KTransformers is a flexible, Python-centric framework designed with extensibility at its core. +By implementing and injecting an optimized module with a single line of code, users gain access to a Transformers-compatible +interface, RESTful APIs compliant with OpenAI and Ollama, and even a simplified ChatGPT-like web UI. +

+Our vision for KTransformers is to serve as a flexible platform for experimenting with innovative LLM inference optimizations. Please let us know if you need any other features. + +

🔥 Updates

+ +* **Mar 5, 2025**: Support unsloth 1.58/2.51 bits weights and [IQ1_S/FP8 hybrid](./doc/en/fp8_kernel.md) weights. Support 139K [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context) for DeepSeek-V3 and R1 in 24GB VRAM. +* **Feb 25, 2025**: Support [FP8 GPU kernel](./doc/en/fp8_kernel.md) for DeepSeek-V3 and R1; [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context). +* **Feb 10, 2025**: Support Deepseek-R1 and V3 on single (24GB VRAM)/multi gpu and 382G DRAM, up to 3~28x speedup. The detailed tutorial is [here](./en/DeepseekR1_V3_tutorial.md). +* **Aug 28, 2024**: Support 1M context under the InternLM2.5-7B-Chat-1M model, utilizing 24GB of VRAM and 150GB of DRAM. The detailed tutorial is [here](./en/long_context_tutorial.md). +* **Aug 28, 2024**: Decrease DeepseekV2's required VRAM from 21G to 11G. +* **Aug 15, 2024**: Update detailed [TUTORIAL](./en/injection_tutorial.md) for injection and multi-GPU. +* **Aug 14, 2024**: Support llamfile as linear backend. +* **Aug 12, 2024**: Support multiple GPU; Support new model: mixtral 8\*7B and 8\*22B; Support q2k, q3k, q5k dequant on gpu. +* **Aug 9, 2024**: Support windows native. diff --git a/doc/SUMMARY.md b/doc/SUMMARY.md new file mode 100644 index 0000000..d9fa9b8 --- /dev/null +++ b/doc/SUMMARY.md @@ -0,0 +1,25 @@ +# Ktransformer + +[Introduction](./README.md) +# Install +- [Installation Guide](en/install.md) + +# Tutorial +- [Deepseek-R1/V3 Show Case/Tutorial](en/DeepseekR1_V3_tutorial.md) +- [Why KTransformers So Fast](en/deepseek-v2-injection.md) +- [Injection Tutorial](en/injection_tutorial.md) +- [Multi-GPU Tutorial](en/multi-gpu-tutorial.md) +- [Use FP8 GPU Kernel](en/fp8_kernel.md) +# Server + - [Server](en/api/server/server.md) + - [Website](en/api/server/website.md) + - [Tabby](en/api/server/tabby.md) +# For Developer +- [Makefile Usage](en/makefile_usage.md) + +# FAQ +- [FAQ](en/FAQ.md) +# V3 Reproduction +- [Success List](en/V3-success.md) +# Benchmark +- [Benchmark](en/benchmark.md) \ No newline at end of file diff --git a/doc/assets/DeepSeek-on-KTransformers.PNG b/doc/assets/DeepSeek-on-KTransformers.PNG deleted file mode 100644 index 455f210..0000000 Binary files a/doc/assets/DeepSeek-on-KTransformers.PNG and /dev/null differ diff --git a/doc/assets/DeepSeek-on-KTransformers.png b/doc/assets/DeepSeek-on-KTransformers.png new file mode 100644 index 0000000..e55f8cb Binary files /dev/null and b/doc/assets/DeepSeek-on-KTransformers.png differ diff --git a/doc/basic/note1.md b/doc/basic/note1.md new file mode 100644 index 0000000..daa3dba --- /dev/null +++ b/doc/basic/note1.md @@ -0,0 +1 @@ +# basic-first20 diff --git a/doc/basic/note2.md b/doc/basic/note2.md new file mode 100644 index 0000000..b73e982 --- /dev/null +++ b/doc/basic/note2.md @@ -0,0 +1 @@ +# basic-data_structure diff --git a/doc/en/DeepseekR1_V3_tutorial.md b/doc/en/DeepseekR1_V3_tutorial.md index f5c4972..082078d 100644 --- a/doc/en/DeepseekR1_V3_tutorial.md +++ b/doc/en/DeepseekR1_V3_tutorial.md @@ -1,30 +1,40 @@ # GPT-4/o1-level Local VSCode Copilot on a Desktop with only 24GB VRAM - [SUMMARY](#summary) - - [Prerequisites](#prerequisites) + - [Show Case Environment](#show-case-environment) - [Bench Result](#bench-result) + - [V0.2.1](#v021) + - [Memory consumption:](#memory-consumption) + - [Change Log](#change-log) + - [Benchmark Results](#benchmark-results) - [V0.2](#v02) - [Settings](#settings) - - [Memory consumption:](#memory-consumption) - - [Benchmark Results](#benchmark-results) + - [Memory 
consumption:](#memory-consumption-1) + - [Benchmark Results](#benchmark-results-1) - [V0.3-Preview](#v03-preview) - [Settings](#settings-1) - [Memory consumptions:](#memory-consumptions) - - [Benchmark results](#benchmark-results-1) + - [Benchmark results](#benchmark-results-2) - [How to Run](#how-to-run) - - [V0.2 Showcase](#v02-showcase) + - [v0.2.2 \& v0.2.3 longer context \& FP8 kernel](#v022--v023-longer-context--fp8-kernel) + - [longer context](#longer-context) + - [FP8 kernel](#fp8-kernel) + - [V0.2 \& V0.2.1 Showcase](#v02--v021-showcase) - [Single socket version (32 cores)](#single-socket-version-32-cores) - [Dual socket version (64 cores)](#dual-socket-version-64-cores) - [V0.3 Showcase](#v03-showcase) - [Dual socket version (64 cores)](#dual-socket-version-64-cores-1) - [Some Explanations](#some-explanations) + - [Next](#next) + - [Faster](#faster) + - [Easier](#easier) - [FAQ](#faq) - [R1 No Thinking](#r1-no-thinking) - [More FAQ](#more-faq) # SUMMARY -> **Fed 10, 2025**: Support DeepseekR1 and V3 on single (24GB VRAM)/multi gpu and 382G DRAM, up to 3~28x speedup.
+> **Feb 10, 2025**: Support DeepSeek-R1 and V3 on single (24GB VRAM)/multi-GPU and 382G DRAM, up to 3~28x speedup.
Hi, we're the KTransformers team (formerly known for our local CPU/GPU hybrid inference open source project with DeepSeek-V2). @@ -39,23 +49,64 @@ https://github.com/user-attachments/assets/ebd70bfa-b2c1-4abb-ae3b-296ed38aa285 - **[NEW!!!] Local 671B DeepSeek-Coder-V3/R1:** Running its Q4_K_M version using only 14GB VRAM and 382GB DRAM. - Prefill Speed (tokens/s): - - KTransfermor: 54.21 (32 cores) → 74.362 (dual-socket, 2×32 cores) → 255.26 (optimized AMX-based MoE kernel, V0.3 only) → 286.55 (selectively using 6 experts, V0.3 only) + - KTransformers: 54.21 (32 cores) → 74.362 (dual-socket, 2×32 cores) → 255.26 (optimized AMX-based MoE kernel, V0.3 only) → 286.55 (selectively using 6 experts, V0.3 only) - Compared to 10.31 tokens/s in llama.cpp with 2×32 cores, achieving up to **27.79× speedup**. - Decode Speed (tokens/s): - - KTransfermor: 8.73 (32 cores) → 11.26 (dual-socket, 2×32 cores) → 13.69 (selectively using 6 experts, V0.3 only) + - KTransformers: 8.73 (32 cores) → 11.26 (dual-socket, 2×32 cores) → 13.69 (selectively using 6 experts, V0.3 only) - Compared to 4.51 tokens/s in llama.cpp with 2×32 cores, achieving up to **3.03× speedup**. We also give our upcoming optimizations previews, including an Intel AMX-accelerated kernel and a selective expert activation method, which will significantly enhance performance. With V0.3-preview, we achieve up to 286 tokens/s for prefill, making it up to **28× faster than llama.cpp** for local inference. The binary distribution is available now and the source code will come ASAP! Check out the wheel package [here](https://github.com/kvcache-ai/ktransformers/releases/download/v0.1.4/ktransformers-0.3.0rc0+cu126torch26fancy-cp311-cp311-linux_x86_64.whl) +> **Feb 15, 2025**: KTransformers V0.2.1: Longer Context (from 4K to 8K for 24GB VRAM) & Slightly Faster Speed (+15%) (Up to 16 Tokens/s), update docs [here](./doc/en/DeepseekR1_V3_tutorial.md) and [online books](https://kvcache-ai.github.io/ktransformers/). -## Prerequisites +We have sped up the decode and prefill speed a little bit. The reason for the limited performance improvement mainly lies in the fact that the inference process is still constrained by the CPU's computational speed and memory bandwidth. The MLA part handled by the GPU accounts for a relatively small proportion. + +Besides the improvements in speed, we've also significantly updated the documentation to enhance usability, including:
+- Added Multi-GPU configuration tutorial. +- Consolidated installation guide. +- Added a detailed tutorial on registering extra GPU memory with ExpertMarlin. + + +## Show Case Environment We run our best performance tests (V0.2) on
CPU: Intel (R) Xeon (R) Gold 6454S 1T DRAM (2 NUMA nodes)
GPU: 4090D 24G VRAM
-Memory: standard DDR5-4800 server DRAM (1 TB) +Memory: standard DDR5-4800 server DRAM (1 TB), each socket with 8×DDR5-4800 ## Bench Result +### V0.2.1 +- Model: DeepseekV3-q4km (int4)
+- CPU: Intel (R) Xeon (R) Gold 6454S, 32 cores per socket, 2 sockets, 2 NUMA nodes +- GPU: 4090 24G VRAM +- We test after sufficient warm-up +#### Memory consumption: + - Single socket: 382G DRAM, at least 14GB VRAM + - Dual socket: 1T DRAM, at least 14GB VRAM +#### Change Log +- Longer Context (from 4K to 8K for 24GB VRAM) and Slightly Faster Speed (+15%):
+Integrated the highly efficient Triton MLA Kernel from the fantastic sglang project, enabling much longer context length and slightly faster prefill/decode speed +- We suspect that some of the improvements come from the change of hardware platform (4090D->4090) +#### Benchmark Results + + +The "6 experts" case is part of V0.3's preview + + +| Prompt | hi (2) | 1K (969) | 2K (1930) | 4K (3846) | 8K (7678) | +| --- | --- | --- | --- | --- | --- | +| Output length | 10 tokens | 300 tokens | 300 tokens | 300 tokens | 300 tokens | +| **6 experts V0.2.0** | | | | | | +| Prefill token/s | 13 | 105 | 102 | 88 | CUDA OOM | +| Decode token/s | 16.8 | 15.4 | 14.2 | 13.0 | CUDA OOM | +| **6 experts V0.2.1** | | | | | | +| Prefill token/s | 13 | 111 | 112.5 | 102 **(1.16x speedup)** | 101 | +| Decode token/s | 16.8 | 15.9 | 15.4 | 14.9 **(1.15x speedup)** | 13.9 | +| **8 experts V0.2.1** | | | | | | +| Prefill token/s | 12.2 | 88.2 | 88.5 | 81.9 | 80 | +| Decode token/s | 13.4 | 13.5 | 13.4 | 13.2 | 12.4 | + + ### V0.2 #### Settings - Model: DeepseekV3-q4km (int4)
@@ -106,14 +157,41 @@ the output quality doesn't change. But the speed of decoding and prefill is speed up which is inspiring. So our showcase makes use of this finding* ## How to Run -### V0.2 Showcase +### v0.2.2 & v0.2.3 longer context & FP8 kernel +#### longer context +To use this feature, [install flashinfer](https://github.com/flashinfer-ai/flashinfer) first. + +Note: The latest MLA kernel in FlashInfer still has a few minor issues. They are continuously fixing them on the main branch. If you are using FlashInfer, please install it from the main source code. + +If you want to use long context(longer than 20K) for prefill, enable the matrix absorption MLA during the prefill phase, which will significantly reduce the size of the kv cache. Modify yaml file like this: +``` +- match: + name: "^model\\.layers\\..*\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation + kwargs: + generate_device: "cuda" + prefill_device: "cuda" + absorb_for_prefill: True # change this to True to enable long context(prefill may slower). +``` + +If the VRAM is still insufficient, try reducing the `chunk_prefill_size` parameter (default is 8192) to further decrease the intermediate results during chunk prefill. +#### FP8 kernel + +The DeepSeek-AI team provides FP8 safetensors for DeepSeek-R1/V3 models. We achieve performance optimization through the following works: +- **FP8 GPU Kernel Integration**: FP8 linear layer acceleration kernels integrated in KTransformers +- **Hybrid Quantization Architecture**: + - Attention and Shared-Expert modules use FP8 precision (enhances computational accuracy) + - Experts modules retain GGML quantization (GGUF format, reside in CPU to save GPU memory) + +So those who are persuing the best performance can use the FP8 linear kernel for DeepSeek-V3/R1. + +The detailed guide is [here](./fp8_kernel.md). + +### V0.2 & V0.2.1 Showcase #### Single socket version (32 cores) Our local_chat test command is: ``` shell -git clone https://github.com/kvcache-ai/ktransformers.git -cd ktransformers -git submodule init -git submodule update numactl -N 1 -m 1 python ./ktransformers/local_chat.py --model_path --gguf_path --prompt_file --cpu_infer 33 --max_new_tokens 1000 ``` @@ -121,24 +199,28 @@ numactl -N 1 -m 1 python ./ktransformers/local_chat.py --model_path ` can also be online, but as its large we recommend you download it and quantize the model to what you want (notice it's the dir path)
`--max_new_tokens 1000` is the max output token length. If you find the answer is truncated, you can increase the number for longer answer (But be aware of OOM, and increase it will slow down the generation rate.). -
-The command numactl -N 1 -m 1 aims to advoid data transfer between numa nodes
+ +The command `numactl -N 1 -m 1` aims to advoid data transfer between numa nodes
Attention! If you are testing R1 and it may skip thinking. So you can add arg: `--force_think true`. This is explained in [FAQ](#faq) part #### Dual socket version (64 cores) -Make suer before you install (use install.sh or `make dev_install`), setting the env var `USE_NUMA=1` by `export USE_NUMA=1` (if already installed, reinstall it with this env var set)
-Our local_chat test command is: + +Make sure before you install (use install.sh or `make dev_install`), setting the env var `USE_NUMA=1` by `export USE_NUMA=1` (if already installed, reinstall it with this env var set). You may check the doc [here](./install.md) for install details.
+ +Test Command: ``` shell -git clone https://github.com/kvcache-ai/ktransformers.git -cd ktransformers -git submodule init -git submodule update -export USE_NUMA=1 -make dev_install # or sh ./install.sh +# ---For those who have not installed ktransformers--- +# git clone https://github.com/kvcache-ai/ktransformers.git +# cd ktransformers +# git submodule init +# git submodule update +# export USE_NUMA=1 +# make dev_install # or sh ./install.sh +# ---------------------------------------------------- python ./ktransformers/local_chat.py --model_path --gguf_path --prompt_file --cpu_infer 65 --max_new_tokens 1000 ``` -The parameters' meaning is the same. But As we use dual socket, we set cpu_infer to 65 +The parameters' meaning is the same. But As we use dual socket, we set cpu_infer to 65 ### V0.3 Showcase #### Dual socket version (64 cores) @@ -170,9 +252,21 @@ DeepSeek's MLA operators are highly computationally intensive. While running eve 5. Why Intel CPUs? Intel is currently the only CPU vendor that supports AMX-like instructions, which delivers significantly better performance compared to AVX-only alternatives. +## Next +### Faster +* The FlashInfer (https://github.com/flashinfer-ai/flashinfer) project is releasing an even more efficient fused MLA operator, promising further speedups +* vLLM has explored multi-token prediction in DeepSeek-V3, and support is on our roadmap for even better performance +* We are collaborating with Intel to enhance the AMX kernel (v0.3) and optimize for Xeon6/MRDIMM +### Easier +* Official Docker images to simplify installation +* Fix the server integration for web API access +* Fix the local chat only accepting a single line prompt (currently \n begins generating prompt) +* Support for more quantization types, including the highly requested dynamic quantization from unsloth + +Stay tuned for more updates! ## FAQ ### R1 No Thinking Attention! If you are testing R1 and it may skip thinking. So you can add arg: `--force_think true`. The detail is in [FAQ](./FAQ.md) part
### More FAQ -[See detail](./FAQ.md) \ No newline at end of file +[See detail](./FAQ.md) diff --git a/doc/en/Docker.md b/doc/en/Docker.md index 56b2520..f31c3b5 100644 --- a/doc/en/Docker.md +++ b/doc/en/Docker.md @@ -7,7 +7,7 @@ ## Images There is a Docker image available for our project, you can pull the docker image by: ``` -docker pull approachingai/ktransformers:0.1.1 +docker pull approachingai/ktransformers:0.2.1 ``` **Notice**: In this image, we compile the ktransformers in AVX512 instuction CPUs, if your cpu not support AVX512, it is suggested to recompile and install ktransformer in the /workspace/ktransformers directory within the container. @@ -16,14 +16,16 @@ docker pull approachingai/ktransformers:0.1.1 - finish, execute ```bash - docker build -t approachingai/ktransformers:v0.1.1 . + docker build -t approachingai/ktransformers:0.2.1 . ``` ## Usage Assuming you have the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) that you can use the GPU in a Docker container. ``` -docker run --gpus all -v /path/to/models:/models -p 10002:10002 approachingai/ktransformers:v0.1.1 --port 10002 --gguf_path /models/path/to/gguf_path --model_path /models/path/to/model_path --web True +docker run --gpus all -v /path/to/models:/models --name ktransformers -itd approachingai/ktransformers:0.2.1 +docker exec -it ktransformers /bin/bash +python -m ktransformers.local_chat --gguf_path /models/path/to/gguf_path --model_path /models/path/to/model_path --cpu_infer 33 ``` More operators you can see in the [readme](../../README.md) \ No newline at end of file diff --git a/doc/en/FAQ.md b/doc/en/FAQ.md index 75e5e10..e001153 100644 --- a/doc/en/FAQ.md +++ b/doc/en/FAQ.md @@ -1,4 +1,18 @@ + # FAQ +- [Install](#install) + - [Q: ImportError: /lib/x86\_64-linux-gnu/libstdc++.so.6: version GLIBCXX\_3.4.32' not found](#q-importerror-libx86_64-linux-gnulibstdcso6-version-glibcxx_3432-not-found) + - [Q: DeepSeek-R1 not outputting initial token](#q-deepseek-r1-not-outputting-initial--token) +- [Usage](#usage) + - [Q: If I got more VRAM than the model's requirement, how can I fully utilize it?](#q-if-i-got-more-vram-than-the-models-requirement-how-can-i-fully-utilize-it) + - [Q: If I don't have enough VRAM, but I have multiple GPUs, how can I utilize them?](#q-if-i-dont-have-enough-vram-but-i-have-multiple-gpus-how-can-i-utilize-them) + - [Q: How to get the best performance?](#q-how-to-get-the-best-performance) + - [Q: My DeepSeek-R1 model is not thinking.](#q-my-deepseek-r1-model-is-not-thinking) + - [Q: Loading gguf error](#q-loading-gguf-error) + - [Q: Version \`GLIBCXX\_3.4.30' not found](#q-version-glibcxx_3430-not-found) + - [Q: When running the bfloat16 moe model, the data shows NaN](#q-when-running-the-bfloat16-moe-model-the-data-shows-nan) + - [Q: Using fp8 prefill very slow.](#q-using-fp8-prefill-very-slow) + - [Q: Possible ways to run graphics cards using volta and turing architectures](#q-possible-ways-to-run-graphics-cards-using-volta-and-turing-architectures) ## Install ### Q: ImportError: /lib/x86_64-linux-gnu/libstdc++.so.6: version GLIBCXX_3.4.32' not found ``` @@ -25,7 +39,7 @@ from-https://github.com/kvcache-ai/ktransformers/issues/129#issue-2842799552 1. local_chat.py: You can increase the context window size by setting `--max_new_tokens` to a larger value. 2. server: Increase the `--cache_lens' to a larger value. 2. Move more weights to the GPU. 
- Refer to the ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-marlin.yaml + Refer to the ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml ```yaml - match: name: "^model\\.layers\\.([4-10])\\.mlp\\.experts$" # inject experts in layer 4~10 as marlin expert @@ -39,11 +53,13 @@ from-https://github.com/kvcache-ai/ktransformers/issues/129#issue-2842799552 You can modify layer as you want, eg. `name: "^model\\.layers\\.([4-10])\\.mlp\\.experts$"` to `name: "^model\\.layers\\.([4-12])\\.mlp\\.experts$"` to move more weights to the GPU. > Note: The first matched rule in yaml will be applied. For example, if you have two rules that match the same layer, only the first rule's replacement will be valid. + > Note:Currently, executing experts on the GPU will conflict with CUDA Graph. Without CUDA Graph, there will be a significant slowdown. Therefore, unless you have a substantial amount of VRAM (placing a single layer of experts for DeepSeek-V3/R1 on the GPU requires at least 5.6GB of VRAM), we do not recommend enabling this feature. We are actively working on optimization. + > Note KExpertsTorch is untested. ### Q: If I don't have enough VRAM, but I have multiple GPUs, how can I utilize them? -Use the `--optimize_rule_path ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml` to load the two optimized rule yaml file. You may also use it as an example to write your own 4/8 gpu optimized rule yaml file. +Use the `--optimize_config_path ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml` to load the two optimized rule yaml file. You may also use it as an example to write your own 4/8 gpu optimized rule yaml file. > Note: The ktransformers' multi-gpu stratigy is pipline, which is not able to speed up the model's inference. It's only for the model's weight distribution. @@ -53,7 +69,7 @@ You have to set `--cpu_infer` to the number of cores you want to use. The more c ### Q: My DeepSeek-R1 model is not thinking. -According to DeepSeek, you need to enforce the model to initiate its response with "\\n" at the beginning of every output by passing the arg `--force_think true `. +According to DeepSeek, you need to enforce the model to initiate its response with "\\n" at the beginning of every output by passing the arg `--force_think True `. ### Q: Loading gguf error @@ -61,9 +77,91 @@ Make sure you: 1. Have the `gguf` file in the `--gguf_path` directory. 2. The directory only contains gguf files from one model. If you have multiple models, you need to separate them into different directories. 3. The folder name it self should not end with `.gguf`, eg. `Deep-gguf` is correct, `Deep.gguf` is wrong. +4. The file itself is not corrupted; you can verify this by checking that the sha256sum matches the one from huggingface, modelscope, or hf-mirror. ### Q: Version `GLIBCXX_3.4.30' not found The detailed error: >ImportError: /mnt/data/miniconda3/envs/xxx/bin/../lib/libstdc++.so.6: version `GLIBCXX_3.4.30' not found (required by /home/xxx/xxx/ktransformers/./cpuinfer_ext.cpython-312-x86_64-linux-gnu.so) -It may because of your conda env have no this version. Your can first exit your conda env by `conda deactivate` and use `whereis libstdc++.so.6` to find the path. And re enter your conda env and copy the .so by `cp ` +Running `conda install -c conda-forge libstdcxx-ng` can solve the problem. 
+ + +### Q: When running the bfloat16 moe model, the data shows NaN +The detailed error: +```shell +Traceback (most recent call last): + File "/root/ktransformers/ktransformers/local_chat.py", line 183, in + fire.Fire(local_chat) + File "/usr/local/lib/python3.10/dist-packages/fire/core.py", line 135, in Fire + component_trace = _Fire(component, args, parsed_flag_args, context, name) + File "/usr/local/lib/python3.10/dist-packages/fire/core.py", line 468, in _Fire + component, remaining_args = _CallAndUpdateTrace( + File "/usr/local/lib/python3.10/dist-packages/fire/core.py", line 684, in _CallAndUpdateTrace + component = fn(*varargs, **kwargs) + File "/root/ktransformers/ktransformers/local_chat.py", line 177, in local_chat + generated = prefill_and_generate( + File "/root/ktransformers/ktransformers/util/utils.py", line 204, in prefill_and_generate + next_token = decode_one_tokens(cuda_graph_runner, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, use_cuda_graph).to(torch_device) + File "/root/ktransformers/ktransformers/util/utils.py", line 128, in decode_one_tokens + next_token = torch.multinomial(probs, num_samples=1).squeeze(1) +RuntimeError: probability tensor contains either `inf`, `nan` or element < 0 +``` +**SOLUTION**: The issue of running ktransformers on Ubuntu 22.04 is caused by the current system's g++ version being too old, and the pre-defined macros do not include avx_bf16. We have tested and confirmed that it works on g++ 11.4 in Ubuntu 22.04. + +### Q: Using fp8 prefill very slow. + +The FP8 kernel is build by JIT, so the first run will be slow. The subsequent runs will be faster. + +### Q: Possible ways to run graphics cards using volta and turing architectures + +From: https://github.com/kvcache-ai/ktransformers/issues/374 + +1. First, download the latest source code using git. +2. Then, modify the DeepSeek-V3-Chat-multi-gpu-4.yaml in the source code and all related yaml files, replacing all instances of KLinearMarlin with KLinearTorch. +3. Next, you need to compile from the ktransformer source code until it successfully compiles on your local machine. +4. Then, install flash-attn. It won't be used, but not installing it will cause an error. +5. Then, modify local_chat.py, replacing all instances of flash_attention_2 with eager. +6. Then, run local_chat.py. Be sure to follow the official tutorial's commands and adjust according to your local machine's parameters. +7. During the running process, check the memory usage. Observe its invocation through the top command. The memory capacity on a single CPU must be greater than the complete size of the model. (For multiple CPUs, it's just a copy.) +Finally, confirm that the model is fully loaded into memory and specific weight layers are fully loaded into the GPU memory. Then, try to input content in the chat interface and observe if there are any errors. 
+ +Attention, for better perfomance, you can check this [method](https://github.com/kvcache-ai/ktransformers/issues/374#issuecomment-2667520838) in the issue +> +>https://github.com/kvcache-ai/ktransformers/blob/89f8218a2ab7ff82fa54dbfe30df741c574317fc/ktransformers/operators/attention.py#L274-L279 +> +>```diff +>+ original_dtype = query_states.dtype +>+ target_dtype = torch.half +>+ query_states = query_states.to(target_dtype) +>+ compressed_kv_with_k_pe = compressed_kv_with_k_pe.to(target_dtype) +>+ compressed_kv = compressed_kv.to(target_dtype) +>+ attn_output = attn_output.to(target_dtype) +> +>decode_attention_fwd_grouped(query_states, compressed_kv_with_k_pe, compressed_kv, attn_output, +> page_table, +> position_ids.squeeze(0).to(torch.int32)+1, attn_logits, +> 4, #num_kv_splits # follow vLLM, fix it TODO +> self.softmax_scale, +> past_key_value.page_size) +> +>+ attn_output = attn_output.to(original_dtype) +>``` +> +>https://github.com/kvcache-ai/ktransformers/blob/89f8218a2ab7ff82fa54dbfe30df741c574317fc/ktransformers/operators/attention.py#L320-L326 +> +>```diff +>- attn_output = flash_attn_func( +>- query_states, +>- key_states, +>- value_states_padded, +>- softmax_scale=self.softmax_scale, +>- causal=True, +>- ) +>+ attn_output = F.scaled_dot_product_attention( +>+ query_states.transpose(1, 2), +>+ key_states.transpose(1, 2), +>+ value_states_padded.transpose(1, 2), +>+ scale=self.softmax_scale, +>+ is_causal=True +>+ ).transpose(1, 2) +>``` \ No newline at end of file diff --git a/doc/en/V3-success.md b/doc/en/V3-success.md new file mode 100644 index 0000000..fed1664 --- /dev/null +++ b/doc/en/V3-success.md @@ -0,0 +1,11 @@ +## Hello everyone, here is the successfully reproduced environment configuration for your reference: +### Case 1 +- Configuration: l40s 48G + 9654 x2 (192 cores) + 768G DDR5 12-channel +- Performance: prefill 108 tokens/s, decode 10.8 tokens/s +- Used version: main source code compiled +### Case 2 +- Configuration: Dual Xeon 6430 32C processors, totaling 64 cores and 128 threads, 480GB DDR5 memory, single 4090 24G graphics card +- Performance: Running speed approximately 6-8 tokens per second +## NOTE +If there are any other configurations that have been successfully run, please feel free to let us know. We will keep updating for everyone to refer to when reproducing. (It has been found that it also works on 2080, AMD, etc. (doge : ) +[click here](https://docs.qq.com/smartsheet/form/AVxgQOYhhNfl%2FBB08J2%2Fv3rnnq?tab=BB08J2) \ No newline at end of file diff --git a/doc/en/api/server/website.md b/doc/en/api/server/website.md index bd380cd..a057898 100644 --- a/doc/en/api/server/website.md +++ b/doc/en/api/server/website.md @@ -8,6 +8,20 @@ This document provides the necessary steps to set up and run the web service for Before you can compile the web code, make sure you have installed [Node.js](https://nodejs.org) version 18.3 or higher +Note: The version of Node.js in the Ubuntu or Debian GNU/Linux software repository is too low, causing compilation errors. Users can also install Node.js through the Nodesource repository, provided they uninstall the outdated version first. 
+ +```bash + + # sudo apt-get remove nodejs npm -y && sudo apt-get autoremove -y + sudo apt-get update -y && sudo apt-get install -y apt-transport-https ca-certificates curl gnupg + curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | sudo gpg --dearmor -o /usr/share/keyrings/nodesource.gpg + sudo chmod 644 /usr/share/keyrings/nodesource.gpg + echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/nodesource.gpg] https://deb.nodesource.com/node_23.x nodistro main" | sudo tee /etc/apt/sources.list.d/nodesource.list + sudo apt-get update -y + sudo apt-get install nodejs -y + +``` + Once npm is installed, navigate to the `ktransformers/website` directory: ```bash diff --git a/doc/en/benchmark.md b/doc/en/benchmark.md new file mode 100644 index 0000000..c498d4d --- /dev/null +++ b/doc/en/benchmark.md @@ -0,0 +1,63 @@ +## Benchmark + +To conduct a quick and convenient check, we have employed a simple Python script available [here](https://github.com/kvcache-ai/ktransformers/tree/main/ktransformers/tests) to assess the precision of our **[ktransformers](https://github.com/kvcache-ai/ktransformers)** project. For this evaluation, we utilized the same dataset, which was shuffled in a consistent manner and limited to the first 1,000 data points, to test our implementation across a variety of CPU kernels, MLA kernels, and quantization formats. + +We selected the DeepSeek-V3 model in its bf16, int8, and q4km versions for this test. The MMLU dataset, which can be found [here](https://huggingface.co/datasets/cais/mmlu), was used (we selected all datasets and shuffled them with a fixed random seed). + +**!!! However, we skipped the few-shot part and only chose the first 1,000 data points for a quick check.** Please note that this approach may result in results that are not consistent with the technical report of DeepSeek-V3. And the test of R1 and further more tests are on going. + +To verify our results, we chose [cloud service platform](https://cloud.siliconflow.cn/models) as baseline. All tests were conducted using the same script and datasets, allowing us to make a preliminary assessment of our project's precision. + +We set the argument `temperature=0.6`, and to simplify the test process, we skipped the few-shot part and used the following prompt: `There is a single choice question. Answer the question by replying A, B, C, D. No other answers are accepted. Just the letter. \nQuestion: {question}\nA. {option_a}\nB. {option_b}\nC. {option_c}\nD. {option_d}\nAnswer: '`. For more details, please refer to the [script](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/tests/mmlu_test.py). + +Given that we have only tested 1,000 cases, which provides only a preliminary judgment, some fluctuations in the results are reasonable. We selected all datasets and shuffled them with a fixed random seed to ensure consistency. + +## Some Details + +- The bf16 model of DeepSeek-V3 is available [here](https://huggingface.co/opensourcerelease/DeepSeek-V3-bf16/tree/main) (you may convert it to gguf by llama.cpp). The q4km model can be found [here](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q4_K_M). + +- The optimization YAML file is located [here](https://github.com/kvcache-ai/ktransformers/tree/main/ktransformers/optimize/optimize_rules). For the GEMM Kernel, you can change `KLinearMarlin` to `KLinearTorch`. 
+ +- To switch the MLA Kernel from Triton to Torch, you can check and modify [this file](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/attention.py), specifically by using the `forward_windows` method. + +- When attempting to conduct the bf16 test (both CPU Weight and GPU Weight), you may encounter issues stemming from older versions of g++ and as, particularly when using Ubuntu 20 or earlier versions. To facilitate a smoother experience and enable you to reproduce our results, we have provided a development container. This container offers a pre-configured environment tailored for this purpose. However, please note that the container does not have the ktrans package installed. Therefore, you may still need to manually install certain packages to ensure everything runs smoothly. + + - You may config the model mount dir in `devcontainer/devcontainer.json`, check the `"mouts":` config. + + +## The Result Table +Uses DeepSeek-V3 model (Some specific cases are R1) +| | | | | | | | | +| ------------------------ | ----------------- | ---------- | ----------------- | ------- | ---------- | ------------------------------------------------------ | ------------ | +| DataSet | CPU Weight Format | CPU Kernel | GPU Weight Format | GEMM Kernel | MLA Kernel | [Siliconflow](https://cloud.siliconflow.cn/models)
| Ktrans Point | +| MMLU

(shuffle 1k) | | | | | | | | +| 1 | bf16 | cpuinfer | bf16 | torch | torch | 81.6 | 81.9 | +| 2 | q8_0 | cpuinfer | bf16 | torch | torch | 81.6 | 83.1 | +| 3 | q4km | cpuinfer | bf16 | torch | triton | 81.6 | 81.4 | +| 4 | q4km | cpuinfer | q4km->marlin 8 | marlin | triton | 81.6 | 81.1 | +| 5 | q4km | cpuinfer | q4km->marlin 4 | marlin | triton | 81.6 | 81 | +| 6 | q4km | cpuinfer | fp8 | fp8gemm | triton | 81.6 | 81.5 | +| 7 (DeepSeek-R1) | iq1 | cpuinfer | fp8 | fp8gemm | triton | 78.6 | 83.6 | +| MMLU-pro
(shuffle 1k) | | | | | | | | +| 1 | q4km | cpuinfer | fp8 | fp8gemm | triton | 57.7 | 57.6 | +| 2 | q4km | cpuinfer | q4km->marlin 4 | marlin | triton | 57.7 | 57.5 | +| 3 (DeepSeek-R1) | iq1 | cpuinfer | fp8 | fp8gem | triton | 71.9 | tbd | +| HumanEval | tbd | tbd | tbd | tbd | tbd | tbd | tbd | +| GSM8K | tbd | tbd | tbd | tbd | tbd | tbd | tbd | + +**The details for each case are listed below**: + +By default, The MLA kernel uses triton in linux and torch in windows. But we need to test torch in linux, so we manually modify the [file](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/attention.py#L592). Just get rid of all the if branch and force it to use `self.forward_windows` + +- MMLU test + 1. [v3-chat_yaml](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml) change all the `KLinearMarlin` to `KLinearTorch` (just find all the usage in this file). The source weight comes from [there](https://huggingface.co/opensourcerelease/DeepSeek-V3-bf16) (you need to use llama.cpp to convert it to gguf) + 2. [v3-chat_yaml](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml). You need to modify the code to separately load cpu's expert weight. We leave this as comment in these places: [1](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/experts.py#L122), [2](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/experts.py#L136), [3](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/experts.py#L137) (note in 3, change the path to your local weight file path). The weight file for q8_0 is [here](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q8_0) + 3. [v3-chat_yaml](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml). You need to modify the code to separately load cpu's expert weight. We leave this as comment in these places: [1](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/experts.py#L122), [2](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/experts.py#L136), [3](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/experts.py#L137) (note in 3, change the path to your local weight file path). The weight file for q4km is [here](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q4_K_M) + 4. [v3-chat_yaml](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml). You don't need to change the source code as they both use q4km. But note the yaml file [here](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml#L29) and [here](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml#L18), below these lines you need to add `num_bits: 8` (in other words: add this kwargs to all that use `KLinearMarlin`). The weight file for q4km is [here](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q4_K_M) + 5. [v3-chat_yaml](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml). No need to change yaml, just use the default. The weight file for q4km is [here](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q4_K_M) + 6. 
You should check the [doc](./fp8_kernel.md) to learn how to test this case. This is a mixture tensor case. + 7. You should check the [doc](./fp8_kernel.md) to learn how to test this case. This is a mixture tensor case. +- MMLU-pro test + 1. You should check the [doc](./fp8_kernel.md) to learn how to test this case. This is a mixture tensor case. + 2. [v3-chat_yaml](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml). No need to change yaml, just use the default. The weight file for q4km is [here](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q4_K_M) + 3. You should check the [doc](./fp8_kernel.md) to learn how to test this case. This is a mixture tensor case. \ No newline at end of file diff --git a/doc/en/deepseek-v2-injection.md b/doc/en/deepseek-v2-injection.md index e5dc1c2..fcd5abe 100644 --- a/doc/en/deepseek-v2-injection.md +++ b/doc/en/deepseek-v2-injection.md @@ -1,6 +1,6 @@ -# Tutorial: Heterogeneous and Local DeepSeek-V2 Inference +# Tutorial: Heterogeneous and Local MoE Inference -DeepSeek-(Code)-V2 is a series of strong mixture-of-experts (MoE) models, featuring a total of 236 billion parameters, with 21 billion parameters activated per token. This model has demonstrated remarkable reasoning capabilities across various benchmarks, positioning it as one of the SOTA open models and nearly comparable in performance to GPT-4. +DeepSeek-(Code)-V2 is a series of strong mixture-of-experts (MoE) models, featuring a total of 236 billion parameters, with 21 billion parameters activated per token. This model has demonstrated remarkable reasoning capabilities across various benchmarks, positioning it as one of the SOTA open models and nearly comparable in performance to GPT-4. DeepSeek-R1 uses a similar architecture to DeepSeek-V2, but with a bigger number of parameters.

@@ -24,7 +24,7 @@ The following figure provides a brief overview of DeepSeek-V2 architecture. At t

- DeepSeek on KTransformers + DeepSeek on KTransformers

diff --git a/doc/en/fp8_kernel.md b/doc/en/fp8_kernel.md
new file mode 100644
index 0000000..e76bae5
--- /dev/null
+++ b/doc/en/fp8_kernel.md
@@ -0,0 +1,76 @@
+# FP8 Linear Kernel for DeepSeek-V3/R1
+
+## Overview
+The DeepSeek-AI team provides FP8 safetensors for the DeepSeek-R1/V3 models. We achieve performance optimization through the following work:
+- **FP8 GPU Kernel Integration**: FP8 linear layer acceleration kernels integrated into KTransformers
+- **Hybrid Quantization Architecture**:
+  - Attention and Shared-Expert modules use FP8 precision (enhances computational accuracy)
+  - Experts modules retain GGML quantization (GGUF format, residing on the CPU to save GPU memory)
+
+So those who are pursuing the best performance can use the FP8 linear kernel for DeepSeek-V3/R1.
+
+## Key Features
+
+✅ Hybrid Precision Architecture (FP8 + GGML)
+✅ Memory Optimization (~19GB VRAM usage) + +## Quick Start +### Using Pre-Merged Weights + +Pre-merged weights are available on Hugging Face:
+[KVCache-ai/DeepSeek-V3-GGML-FP8-Hybrid](https://huggingface.co/KVCache-ai/DeepSeek-V3)
+[KVCache-ai/DeepSeek-R1-GGML-FP8-Hybrid](https://huggingface.co/KVCache-ai/DeepSeek-R1)
+
+> Please confirm the weights are fully uploaded before downloading. The large file size may extend the Hugging Face upload time.
+
+
+Download the pre-merged weights:
+```shell
+pip install -U huggingface_hub
+
+# Optional: use the HF mirror for faster downloads in some regions.
+# export HF_ENDPOINT=https://hf-mirror.com
+
+huggingface-cli download --resume-download KVCache-ai/DeepSeek-V3-GGML-FP8-Hybrid --local-dir <local_dir>
+```
+### Using merge scripts
+If you have local DeepSeek-R1/V3 FP8 safetensors and GGUF weights (e.g. q4km), you can merge them using the following script.
+
+```shell
+python merge_tensors/merge_safetensor_gguf.py \
+  --safetensor_path <safetensor_path> \
+  --gguf_path <gguf_path> \
+  --output_path <output_path>
+```
+
+* `--safetensor_path`: input path of the safetensors file ([Download](https://huggingface.co/deepseek-ai/DeepSeek-V3/tree/main)).
+* `--gguf_path`: input path of the GGUF folder ([Download](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q4_K_M)).
+* `--output_path`: output path of the merged file.
+
+
+### Execution Notes
+
+Launch local_chat.py with the custom quantized experts:
+```shell
+python ktransformers/local_chat.py \
+  --model_path deepseek-ai/DeepSeek-V3 \
+  --gguf_path <merged_weights_folder> \
+  --optimize_config_path ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-fp8-linear-ggml-experts.yaml \
+  --cpu_infer <cpu_count>
+```
+
+
+## Notes
+
+⚠️ Hardware Requirements
+* Recommended minimum of 19GB available VRAM for the FP8 kernel.
+* Requires a GPU with FP8 support (e.g., 4090)
+
+⏳ First-Run Optimization
+JIT compilation makes the first run slower (subsequent runs retain the optimized speed).
+
+🔄 Temporary Interface
+The current weight-loading implementation is provisional and will be refined in future versions.
+
+📁 Path Specification
+Despite the hybrid quantization, merged weights are stored as `.safetensors`; pass the containing folder path to `--gguf_path`.
\ No newline at end of file
diff --git a/doc/en/injection_tutorial.md b/doc/en/injection_tutorial.md
index 5ebb327..4518836 100644
--- a/doc/en/injection_tutorial.md
+++ b/doc/en/injection_tutorial.md
@@ -59,6 +59,7 @@ Supported operators and their corresponding classes are as follows:
 | Linear | KTransformersLinear | KLinearMarlin | Marlin as backend |
 | | | KLinearTorch | pytorch as backend |
 | | | KLinearCPUInfer | llamafile as backend |
+| | | KLinearFP8 | Triton fp8_gemm kernel. Requires a GPU that can compute fp8 data |
 | experts | KTransformersExperts | KExpertsTorch | pytorch as backend |
 | | | KExpertsMarlin | Marlin as backend |
 | | | KExpertsCPU | llamafile as backend |
diff --git a/doc/en/install.md b/doc/en/install.md
new file mode 100644
index 0000000..8752fe4
--- /dev/null
+++ b/doc/en/install.md
@@ -0,0 +1,295 @@
+
+# How to Run DeepSeek-R1
+- [Preparation](#preparation)
+- [Installation](#installation)
+  - [Attention](#attention)
+  - [Supported models include:](#supported-models-include)
+  - [Support quantize format:](#support-quantize-format)
+
+In this document, we will show you how to install and run KTransformers on your local machine. There are two versions:
+* V0.2 is the current main branch.
+* V0.3 is a preview version that only provides a binary distribution for now.
+* To reproduce our DeepSeek-R1/V3 results, please refer to the [Deepseek-R1/V3 Tutorial](./DeepseekR1_V3_tutorial.md) for more detailed settings after installation.
+## Preparation
+Some preparation:
+
+- CUDA 12.1 or above; if you don't have it yet, you may install it from [here](https://developer.nvidia.com/cuda-downloads).
+
+  ```sh
+  # Adding CUDA to PATH
+  if [ -d "/usr/local/cuda/bin" ]; then
+      export PATH=$PATH:/usr/local/cuda/bin
+  fi
+
+  if [ -d "/usr/local/cuda/lib64" ]; then
+      export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64
+      # Or you can add it to /etc/ld.so.conf and run ldconfig as root:
+      # echo "/usr/local/cuda-12.x/lib64" | sudo tee -a /etc/ld.so.conf
+      # sudo ldconfig
+  fi
+
+  if [ -d "/usr/local/cuda" ]; then
+      export CUDA_PATH=$CUDA_PATH:/usr/local/cuda
+  fi
+  ```
+
+- Linux-x86_64 with gcc, g++ and cmake (using Ubuntu as an example)
+
+  ```sh
+  sudo apt-get update
+  sudo apt-get install build-essential cmake ninja-build
+  ```
+
+- We recommend using [Miniconda3](https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh) or [Anaconda3](https://repo.anaconda.com/archive/Anaconda3-2024.10-1-Linux-x86_64.sh) to create a virtual environment with Python=3.11 to run our program. Assuming your Anaconda installation directory is `~/anaconda3`, you should ensure that the version identifier of the GNU C++ standard library used by Anaconda includes `GLIBCXX-3.4.32`.
+
+
+  ```sh
+  conda create --name ktransformers python=3.11
+  conda activate ktransformers # you may need to run ‘conda init’ and reopen shell first
+
+  conda install -c conda-forge libstdcxx-ng # Anaconda provides a package called `libstdcxx-ng` that includes a newer version of `libstdc++`, which can be installed via `conda-forge`.
+
+  strings ~/anaconda3/envs/ktransformers/lib/libstdc++.so.6 | grep GLIBCXX
+  ```
+
+- Make sure that PyTorch, packaging and ninja are installed. You can also [install previous versions of PyTorch](https://pytorch.org/get-started/previous-versions/).
+
+  ```
+  pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
+  pip3 install packaging ninja cpufeature numpy
+  ```
+
+  - At the same time, you should download and install the corresponding version of flash-attention from https://github.com/Dao-AILab/flash-attention/releases.
+
+## Installation
+### Attention
+If you want to use NUMA support, you not only need to set USE_NUMA=1, but also need to make sure you have installed libnuma-dev (`sudo apt-get install libnuma-dev` may help you).
+
+
+
+* Download source code and compile:
+
+  - Init source code:
+
+    ```sh
+    git clone https://github.com/kvcache-ai/ktransformers.git
+    cd ktransformers
+    git submodule init
+    git submodule update
+    ```
+
+  - [Optional] If you want to run with the website, please [compile the website](./api/server/website.md) before executing `bash install.sh`
+
+  - For Linux
+    - For simple install:
+
+      ```shell
+      bash install.sh
+      ```
+    - For those who have two CPUs and 1TB of RAM:
+
+      ```shell
+      # Make sure your system has dual sockets and at least double the RAM of the model's size (e.g. 1TB RAM for a 512GB model)
+      apt install libnuma-dev
+      export USE_NUMA=1
+      bash install.sh # or `make dev_install`
+      ```
+
+  - For Windows
+
+    ```shell
+    install.bat
+    ```
+
+* If you are a developer, you can make use of the makefile to compile and format the code.
The detailed usage of the makefile is [here](./makefile_usage.md).
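+
+After installation, a quick optional sanity check is to import the package and print its version (the `__version__` attribute is set in `ktransformers/__init__.py`):
+
+```shell
+# Should print the installed ktransformers version, e.g. 0.2.3.post1
+python -c "import ktransformers; print(ktransformers.__version__)"
+```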

+## Local Chat

+We provide a simple command-line local chat Python script that you can run for testing.
+
+> Note: this is a very simple test tool that only supports single-round chat, without any memory of the previous input. If you want to try the full ability of the model, please go to [RESTful API and Web UI](#id_666).
+
+

+### Run Example

+
+```shell
+# Begin from root of your cloned repo!
+# Begin from root of your cloned repo!!
+# Begin from root of your cloned repo!!!
+
+# Download a DeepSeek-V2-Lite-Chat GGUF from huggingface
+mkdir DeepSeek-V2-Lite-Chat-GGUF
+cd DeepSeek-V2-Lite-Chat-GGUF
+
+wget https://huggingface.co/mradermacher/DeepSeek-V2-Lite-GGUF/resolve/main/DeepSeek-V2-Lite.Q4_K_M.gguf -O DeepSeek-V2-Lite-Chat.Q4_K_M.gguf
+
+cd .. # Move to repo's root dir
+
+# Start local chat
+python -m ktransformers.local_chat --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
+
+# If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try:
+# GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite
+# python -m ktransformers.local_chat --model_path ./DeepSeek-V2-Lite --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
+```
+
+It features the following arguments:
+
+- `--model_path` (required): Name of the model (such as "deepseek-ai/DeepSeek-V2-Lite-Chat", which will automatically download configs from [Hugging Face](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite)). If you already have local files, you may use that path directly to initialize the model.
+
+  > Note: .safetensors files are not required in the directory. We only need the config files to build the model and tokenizer.
+
+- `--gguf_path` (required): Path of a directory containing GGUF files, which can be downloaded from [Hugging Face](https://huggingface.co/mzwing/DeepSeek-V2-Lite-Chat-GGUF/tree/main). Note that the directory should only contain the GGUF of the current model, which means you need one separate directory for each model.
+
+- `--optimize_config_path` (required except for Qwen2Moe and DeepSeek-V2): Path of a YAML file containing optimize rules. There are two rule files pre-written in the [ktransformers/optimize/optimize_rules](ktransformers/optimize/optimize_rules) directory for optimizing DeepSeek-V2 and Qwen2-57B-A14B, two SOTA MoE models.
+
+- `--max_new_tokens`: Int (default=1000). Maximum number of new tokens to generate.
+
+- `--cpu_infer`: Int (default=10). The number of CPUs used for inference. Ideally it should be set to (total number of cores - 2).
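+
+For example, combining the flags above with the DeepSeek-V2-Lite-Chat example (the token limit and core count below are illustrative values, not recommendations):
+
+```shell
+python -m ktransformers.local_chat \
+  --model_path deepseek-ai/DeepSeek-V2-Lite-Chat \
+  --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF \
+  --max_new_tokens 500 \
+  --cpu_infer 30
+```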
+Supported Models/quantization + +### Supported models include: + +| ✅ **Supported Models** | ❌ **Deprecated Models** | +|------------------------|------------------------| +| DeepSeek-R1 | ~~InternLM2.5-7B-Chat-1M~~ | +| DeepSeek-V3 | | +| DeepSeek-V2 | | +| DeepSeek-V2.5 | | +| Qwen2-57B | | +| DeepSeek-V2-Lite | | +| Mixtral-8x7B | | +| Mixtral-8x22B | | + +### Support quantize format: + +| ✅ **Supported Formats** | ❌ **Deprecated Formats** | +|--------------------------|--------------------------| +| Q2_K_L | ~~IQ2_XXS~~ | +| Q2_K_XS | | +| Q3_K_M | | +| Q4_K_M | | +| Q5_K_M | | +| Q6_K | | +| Q8_0 | | +
+ +
+Suggested Model + +| Model Name | Model Size | VRAM | Minimum DRAM | Recommended DRAM | +| ------------------------------ | ---------- | ----- | --------------- | ----------------- | +| DeepSeek-R1-q4_k_m | 377G | 14G | 382G | 512G | +| DeepSeek-V3-q4_k_m | 377G | 14G | 382G | 512G | +| DeepSeek-V2-q4_k_m | 133G | 11G | 136G | 192G | +| DeepSeek-V2.5-q4_k_m | 133G | 11G | 136G | 192G | +| DeepSeek-V2.5-IQ4_XS | 117G | 10G | 107G | 128G | +| Qwen2-57B-A14B-Instruct-q4_k_m | 33G | 8G | 34G | 64G | +| DeepSeek-V2-Lite-q4_k_m | 9.7G | 3G | 13G | 16G | +| Mixtral-8x7B-q4_k_m | 25G | 1.6G | 51G | 64G | +| Mixtral-8x22B-q4_k_m | 80G | 4G | 86.1G | 96G | +| InternLM2.5-7B-Chat-1M | 15.5G | 15.5G | 8G(32K context) | 150G (1M context) | + + +More will come soon. Please let us know which models you are most interested in. + +Be aware that you need to be subject to their corresponding model licenses when using [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V2/blob/main/LICENSE) and [QWen](https://huggingface.co/Qwen/Qwen2-72B-Instruct/blob/main/LICENSE). +
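+
+Before picking a model from the table above, it can help to confirm how much DRAM and VRAM your machine actually has, for example:
+
+```shell
+free -h                                               # total system DRAM
+nvidia-smi --query-gpu=name,memory.total --format=csv # per-GPU VRAM
+```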
+ + +
+  Click To Show how to run other examples
+
+* Qwen2-57B
+
+  ```sh
+  pip install flash_attn # For Qwen2
+
+  mkdir Qwen2-57B-GGUF && cd Qwen2-57B-GGUF
+
+  wget https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct-GGUF/resolve/main/qwen2-57b-a14b-instruct-q4_k_m.gguf?download=true -O qwen2-57b-a14b-instruct-q4_k_m.gguf
+
+  cd ..
+
+  python -m ktransformers.local_chat --model_path Qwen/Qwen2-57B-A14B-Instruct --gguf_path ./Qwen2-57B-GGUF
+
+  # If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try:
+  # GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct
+  # python ktransformers/local_chat.py --model_path ./Qwen2-57B-A14B-Instruct --gguf_path ./Qwen2-57B-GGUF
+  ```
+
+* Deepseek-V2
+
+  ```sh
+  mkdir DeepSeek-V2-Chat-0628-GGUF && cd DeepSeek-V2-Chat-0628-GGUF
+  # Download weights
+  wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00001-of-00004.gguf -O DeepSeek-V2-Chat-0628-Q4_K_M-00001-of-00004.gguf
+  wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00002-of-00004.gguf -O DeepSeek-V2-Chat-0628-Q4_K_M-00002-of-00004.gguf
+  wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00003-of-00004.gguf -O DeepSeek-V2-Chat-0628-Q4_K_M-00003-of-00004.gguf
+  wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00004-of-00004.gguf -O DeepSeek-V2-Chat-0628-Q4_K_M-00004-of-00004.gguf
+
+  cd ..
+
+  python -m ktransformers.local_chat --model_path deepseek-ai/DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF
+
+  # If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try:
+
+  # GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-V2-Chat-0628
+
+  # python -m ktransformers.local_chat --model_path ./DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF
+  ```
+
+| model name | weights download link |
+|----------|----------|
+| Qwen2-57B | [Qwen2-57B-A14B-gguf-Q4K-M](https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct-GGUF/tree/main) |
+| DeepseekV2-coder | [DeepSeek-Coder-V2-Instruct-gguf-Q4K-M](https://huggingface.co/LoneStriker/DeepSeek-Coder-V2-Instruct-GGUF/tree/main) |
+| DeepseekV2-chat | [DeepSeek-V2-Chat-gguf-Q4K-M](https://huggingface.co/bullerwins/DeepSeek-V2-Chat-0628-GGUF/tree/main) |
+| DeepseekV2-lite | [DeepSeek-V2-Lite-Chat-GGUF-Q4K-M](https://huggingface.co/mzwing/DeepSeek-V2-Lite-Chat-GGUF/tree/main) |
+| DeepSeek-R1 | [DeepSeek-R1-gguf-Q4K-M](https://huggingface.co/unsloth/DeepSeek-R1-GGUF/tree/main/DeepSeek-R1-Q4_K_M) |
+
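+
+For multi-part GGUF releases such as DeepSeek-R1 Q4_K_M, it is usually easier to fetch all shards at once with `huggingface-cli` than with individual wget calls. A sketch, assuming the unsloth repository layout linked in the table above:
+
+```shell
+pip install -U huggingface_hub
+huggingface-cli download --resume-download unsloth/DeepSeek-R1-GGUF \
+  --include "DeepSeek-R1-Q4_K_M/*" \
+  --local-dir ./DeepSeek-R1-GGUF
+```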
+ + + + +

+<h2 id="id_666">RESTful API and Web UI</h2>

+
+
+Start without website:
+
+```sh
+ktransformers --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path /path/to/DeepSeek-V2-Lite-Chat-GGUF --port 10002
+```
+
+Start with website:
+
+```sh
+ktransformers --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path /path/to/DeepSeek-V2-Lite-Chat-GGUF --port 10002 --web True
+```
+
+Or, if you want to start the server with transformers, the model_path should contain the safetensors files:
+
+```bash
+ktransformers --type transformers --model_path /mnt/data/model/Qwen2-0.5B-Instruct --port 10002 --web True
+```
+
+Access the web UI at [http://localhost:10002/web/index.html#/chat](http://localhost:10002/web/index.html#/chat):
+

+*(Web UI screenshot)*

+
+More information about the RESTful API server can be found [here](doc/en/api/server/server.md). You can also find an example of integrating with Tabby [here](doc/en/api/server/tabby.md).
diff --git a/doc/en/multi-gpu-tutorial.md b/doc/en/multi-gpu-tutorial.md
new file mode 100644
index 0000000..29bd496
--- /dev/null
+++ b/doc/en/multi-gpu-tutorial.md
@@ -0,0 +1,118 @@
+
+# Multi-GPU
+
+Assume you have read the [Injection Tutorial](./injection_tutorial.md) and have a basic understanding of how to inject a model. In this tutorial, we will show you how to use KTransformers to run a model on multiple GPUs.
+
+If you have multiple GPUs, you can set the device for each module to a different GPU.
+DeepseekV2-Chat has 60 layers; if we have 2 GPUs, we can allocate 30 layers to each GPU. Complete multi-GPU rule examples are [here](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml).
+
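+
+Before deciding how to split the layers, it can help to confirm how many GPUs PyTorch actually sees:
+
+```shell
+python -c "import torch; print(torch.cuda.device_count())"
+```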

+*(figure: injection structure)*

+ +First of all, for multi-GPU, we have to inject an new operator `KDeepseekV2Model`. And set division of the layers to different GPUs. For our case, we have to set the `transfer_map` in the `KDeepseekV2Model` operatoras as follows: + +```yaml +- match: + name: "^model$" + replace: + class: "ktransformers.operators.models.KDeepseekV2Model" + kwargs: + transfer_map: + 30: "cuda:1" +``` + +And we have to set the device for each module in the model. + +For example, for `routed experts`, the yaml for one GPU is: +```yaml +- match: + name: "^model\\.layers\\..*\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts # Custom MoE kernel with expert parallelism + kwargs: + generate_device: "cuda:0" + generate_op: "MLPCUDAExperts" + out_device: "cuda:0" + recursive: False # Don't recursively inject submodules of this module +``` +But for two GPUs, we need to set the device for each module in the model. + +```yaml +# allcate 0-29 layers‘s out_device to cuda:0 +- match: + name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism + kwargs: + generate_device: "cpu" + generate_op: "KExpertsCPU" + out_device: "cuda:0" + recursive: False # don't recursively inject submodules of this module + +# allocate 30-59 layers‘s out_device to cuda:1 +- match: + name: "^model\\.layers\\.([345][0-9])\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism + kwargs: + generate_device: "cpu" + generate_op: "KExpertsCPU" + out_device: "cuda:1" + recursive: False # don't recursively inject submodules of this module +``` +For other modules, we can set the device in the same way. + +# How to fully utilize multi-GPU's VRAM + +When you have multiple GPUs, you can fully utilize the VRAM of each GPU by moving more weights to the GPU. + +For example, for DeepSeekV2-Chat, we can move the weights of the experts to the GPU. + +For example, the yaml for two GPUs is: +```yaml +- match: + name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts + kwargs: + generate_device: "cpu" + generate_op: "KExpertsCPU" + out_device: "cuda:0" + recursive: False +``` + +But we got extra 60GB VRAM on cuda:0, we can move experts in layer 4~8 to cuda:0. + +```yaml +# Add new rule before old rule. +- match: + name: "^model\\.layers\\.([4-8])\\.mlp\\.experts$" # inject experts in layer 4~8 as marlin expert + replace: + class: ktransformers.operators.experts.KTransformersExperts + kwargs: + generate_device: "cuda:0" + generate_op: "KExpertsMarlin" + recursive: False + +- match: + name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts + kwargs: + generate_device: "cpu" + generate_op: "KExpertsCPU" + out_device: "cuda:0" + recursive: False +``` + +Adjust the layer range as you want. Note that: +* The loading speed will be significantly slower for each expert moved to the GPU. +* You have to close the cuda graph if you want to move the experts to the GPU. +* For DeepSeek-R1/V3, each expert moved to the GPU will consume approximately 6GB of VRAM. +* The first matched rule in yaml will be applied. For example, if you have two rules that match the same layer, only the first rule's replacement will be valid. 
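+
+As a reference, such a multi-GPU configuration can then be used by pointing `--optimize_config_path` at the rule file linked above (a minimal sketch; the model and GGUF paths below are placeholders for your own):
+
+```shell
+python -m ktransformers.local_chat \
+  --model_path deepseek-ai/DeepSeek-V3 \
+  --gguf_path ./DeepSeek-V3-GGUF \
+  --optimize_config_path ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml
+```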
+ + diff --git a/doc/zh/DeepseekR1_V3_tutorial_zh.md b/doc/zh/DeepseekR1_V3_tutorial_zh.md new file mode 100644 index 0000000..ba9d7e8 --- /dev/null +++ b/doc/zh/DeepseekR1_V3_tutorial_zh.md @@ -0,0 +1,173 @@ + +# GPT-4/o1 级别本地 VSCode Copilot 在仅 24GB 显存的台式机上的表现 +- [摘要](#摘要) + - [先决条件](#先决条件) + - [基准测试结果](#基准测试结果) + - [V0.2](#v02) + - [设置](#设置) + - [内存占用](#内存占用) + - [基准测试结果](#基准测试结果) + - [V0.3-Preview](#V0.3-Preview) + - [设置](#设置-1) + - [内存占用](#内存占用-1) + - [基准测试结果](#基准测试结果-1) + - [如何运行](#如何运行) + - [V0.2 展示](#v02-展示) + - [单插槽版本 (32 核心)](#单插槽版本(32 核心)) + - [双插槽版本 (64 核心)](#双插槽版本(64 核心)) + - [V0.3 展示](#v03-展示) + - [双插槽版本 (64 核心)](#双插槽版本(64 核心)-1) + - [一些解释](#一些解释) + - [常见问题解答](#常见问题解答) + - [R1 不思考](#R1 不返回思考过程) + - [更多常见问题解答](#更多常见问题解答) + +# 摘要 + +> **2025年2月10日**: 支持在单个(24GB 显存)/多个 GPU 和 382GB 内存上运行 DeepseekR1 和 V3,速度提升高达 3~28 倍。
+ +嗨,我们是 KTransformers 团队(以前因本地 CPU/GPU 混合推理开源项目 DeepSeek-V2 而闻名)。 + +我们听到了您对 DeepSeek-R1/V3 支持的请求——我们很高兴终于可以交付了!很抱歉让您久等了,但我们一直在酝酿一些真正令人惊叹的东西! + +今天,我们自豪地宣布,我们不仅支持 DeepSeek-R1/V3,如下视频所示: + +https://github.com/user-attachments/assets/ebd70bfa-b2c1-4abb-ae3b-296ed38aa285 + +

+ +- **[NEW!!!] 本地 671B DeepSeek-Coder-V3/R1:** 仅使用 14GB 显存和 382GB 内存运行其 Q4_K_M 版本。 + - 预填充(Prefill)速度 (tokens/s): + - KTransformers: 54.21 (32 核心) → 74.362 (双插槽,2×32 核心) → 255.26 (优化的 AMX 基 MoE 内核,仅 V0.3) → 286.55 (选择性使用 6 个专家,仅 V0.3) + - 与 llama.cpp 在 2×32 核心下 10.31 tokens/s 相比,速度提升高达 **27.79 倍** + - 解码(Decode)速度 (tokens/s): + - KTransformers: 8.73 (32 核心) → 11.26 (双插槽, 2×32 核心) → 13.69 (选择性使用 6 个专家,仅 V0.3) + - 与 llama.cpp 在 2×32 核心下 4.51 tokens/s 相比,速度提升高达 **3.03 倍** + + +我们还提供了即将推出的优化预览,包括英特尔 AMX 加速内核和选择性专家激活方法,这将显著提升性能。通过 V0.3 预览版,我们在预填充方面实现了高达 286 tokens/s 的速度,比本地推理的 llama.cpp **快 28 倍**。二进制发行版现已可用,源代码即将推出!请查看 wheel 包 [此处](https://github.com/kvcache-ai/ktransformers/releases/download/v0.1.4/ktransformers-0.3.0rc0+cu126torch26fancy-cp311-cp311-linux_x86_64.whl) 。 + + +## 先决条件 +我们在以下配置下进行了最佳性能测试(V0.2):
+CPU: Intel (R) Xeon (R) Gold 6454S 1T 内存 (2 NUMA 节点)
+GPU: 4090D 24G 显存
+内存: 标准 DDR5-4800 服务器内存 (1 TB) +## 基准测试结果 +### V0.2 +#### 设置 +- Model: DeepseekV3-q4km (int4)
+- CPU: cpu_model_name: Intel (R) Xeon (R) Gold 6454S,每个插槽 32 核心,2 个插槽,2 个 NUMA 节点 +- GPU: 4090D 24G 显存 +- 我们在充分预热后进行测试 +#### 内存占用: + - 单插槽: 382G 内存,至少 14GB 显存 + - 双插槽: 1T 内存,至少 14GB 显存 + +#### 基准测试结果 + +“6 个专家” 情况是 V0.3 预览版中内容 + +| Prompt
(500 tokens) | 双插槽 Ktrans (6 个专家) | 双插槽 Ktrans (8 个专家) | Single socket Ktrans (6 个专家) | Single socket Ktrans (8 个专家)| llama.cpp (8 个专家) | +|------------------------| --- | --- | --- | --- | --- | +| 预填充(Prefill) token/s | 97.32 | 82.94 | 65.14 | 54.21 | 10.31 | +| 解码(Decode) token/s | 13.69 | 12.208 | 10.303 | 8.73 |4.51 | + +**最高加速比在解码方面达到 3.03x 倍,在预填充方面达到 9.44x 倍。** + +### V0.3-Preview +#### 设置 +- Model: DeepseekV3-BF16 (在线量化为 CPU 的 int8 和 GPU 的 int4) +- CPU: cpu_model_name: Intel (R) Xeon (R) Gold 6454S,每个插槽 32 核心,2 个插槽,2 个 NUMA 节点 +- GPU: (1~4)x 4090D 24G 显存 (更长的 prompt 需要更多显存) + +#### 内存占用: +- 644GB 内存,至少 14GB 显存 + +#### 基准测试结果 +| Prompt length | 1K | 2K | 4K | 8K | +|---------------|-----|-----|-----|-----| +| KTrans (8 个专家) Prefill token/s | 185.96 | 255.26 | 252.58 | 195.62 | +| KTrans (6 个专家) Prefill token/s | 203.70 | 286.55 | 271.08 | 207.20 | + +**KTrans V0.3 的预填充速度比 KTrans V0.2 快 3.45x 倍,比 llama.cpp 快 27.79x 倍。** +**解码速度与 KTrans V0.2(6 个专家版本)相同,因此省略。** + +主要加速来自于 +- 英特尔 AMX 指令集和我们专门设计的缓存友好内存布局 +- 专家选择策略,根据离线配置文件结果选择更少的专家 + + +*从我们对 DeepSeekV2、DeepSeekV3 和 DeepSeekR1 的研究中,当我们略微减少推理中的激活专家数量时,输出质量没有变化。但解码和预填充的速度加快了,这令人鼓舞。因此,我们的展示利用了这一发现。* + +## 如何运行 +### V0.2 展示 +#### 单插槽版本(32 核心) +我们的 local_chat 测试命令是: +``` shell +git clone https://github.com/kvcache-ai/ktransformers.git +cd ktransformers +git submodule init +git submodule update +numactl -N 1 -m 1 python ./ktransformers/local_chat.py --model_path --gguf_path --prompt_file --cpu_infer 33 --max_new_tokens 1000 +<当您看到聊天时,按回车键加载文本提示文件> +``` +`` 可以是本地路径,也可以是在线路径,例如 deepseek-ai/DeepSeek-V3。如果在线连接出现问题,可以尝试使用镜像(hf-mirror.com)
+`` 也可以是在线路径,但由于其体积较大,我们建议您下载并量化模型(注意这是目录路径)
+`--max_new_tokens 1000` 是最大输出 token 长度。如果发现答案被截断,可以增加此数字以获得更长的答案(但要注意内存不足问题,增加此数字会降低生成速度). +
+命令 numactl -N 1 -m 1 的目的是避免 NUMA 节点之间的数据传输
+注意!如果测试 R1 可能会跳过思考。因此,可以添加参数:`--force_think true`,这在 [常见问题解答](#常见问题解答) 部分中解释。 + +#### 双插槽版本(64 核心) +在安装之前(使用 install.sh 或 `make dev_install`),请确保设置环境变量 `USE_NUMA=1`,方法是 `export USE_NUMA=1`(如果已经安装,请重新安装并设置此环境变量)
+我们的 local_chat 测试命令是: +``` shell +git clone https://github.com/kvcache-ai/ktransformers.git +cd ktransformers +git submodule init +git submodule update +export USE_NUMA=1 +make dev_install # or sh ./install.sh +python ./ktransformers/local_chat.py --model_path --gguf_path --prompt_file --cpu_infer 65 --max_new_tokens 1000 +<当您看到聊天时,按回车键加载文本提示文件> +``` +参数的含义相同。但因为我们使用双插槽,所以将 cpu_infer 设置为 65。 + +### V0.3 展示 +#### 双插槽版本(64 核心) +我们的 local_chat 测试命令是: +``` shell +wget https://github.com/kvcache-ai/ktransformers/releases/download/v0.1.4/ktransformers-0.3.0rc0+cu126torch26fancy-cp311-cp311-linux_x86_64.whl +pip install ./ktransformers-0.3.0rc0+cu126torch26fancy-cp311-cp311-linux_x86_64.whl +python -m ktransformers.local_chat --model_path --gguf_path --prompt_file --cpu_infer 65 --max_new_tokens 1000 +<当您看到聊天时,按回车键加载文本提示文件> +``` +参数的含义与 V0.2 相同。但因为我们使用双插槽,所以将 cpu_infer 设置为 65。 + +## 一些解释 +1. 我们还想进一步利用 Xeon Gold CPU 上的两个 NUMA 节点。为了避免节点之间的数据传输成本,我们在两个节点上 "copy" 了关键矩阵,这会增加内存占用,但会加速预填充和解码过程。但这种方法占用大量内存,加载权重时速度较慢,因此加载时请耐心等待并监控内存使用情况。我们计划优化这一巨大的内存开销。敬请期待。 + +2. 命令参数 `--cpu_infer 65` 指定使用多少核心(超过物理核心数量是可以的,但并不是越多越好。根据实际核心数量适当降低此值)。
+ +3. 为什么使用 CPU/GPU 混合推理? +DeepSeek 的 MLA 操作符计算密集。虽然全部在 CPU 上运行是可行的,但将繁重的计算任务卸载到 GPU 上能带来巨大的性能提升。 + +4. 加速来自哪里? + + - 专家卸载:与传统的基于层或 KVCache 卸载(如 llama.cpp 中的)不同,我们将专家计算卸载到 CPU,将 MLA/KVCache 卸载到 GPU,与 DeepSeek 的架构完美对齐,实现最佳效率。 + - 英特尔 AMX 优化 – 我们的 AMX 加速内核经过精心调优,运行速度是现有 llama.cpp 实现的数倍。我们计划在清理后开源此内核,并考虑向 llama.cpp 上游贡献代码。 + +5. 为什么选择英特尔 CPU? +英特尔目前是唯一支持 AMX 类似指令的 CPU 供应商,与仅支持 AVX 的替代方案相比,性能显著更好。 + +## 常见问题解答 +### R1 不返回思考过程 +注意!如果测试 R1 可能会跳过思考。因此,可以添加参数:`--force_think true`。详细信息在 [常见问题解答](./FAQ.md) 部分中。
+ +## 问题 +* 修复服务器集成功能以实现网络API访问支持 +* 修复本地聊天功能仅支持单行提示输入的问题(目前输入换行符(\n)即开始生成提示) + +### 更多常见问题解答 +[详见](./FAQ.md) diff --git a/install.sh b/install.sh index ffb7aca..c5773ec 100644 --- a/install.sh +++ b/install.sh @@ -2,6 +2,8 @@ set -e # clear build dirs +rm -rf build +rm -rf *.egg-info rm -rf ktransformers/ktransformers_ext/build rm -rf ktransformers/ktransformers_ext/cuda/build rm -rf ktransformers/ktransformers_ext/cuda/dist diff --git a/ktransformers/__init__.py b/ktransformers/__init__.py index 8c5108b..b100dcb 100644 --- a/ktransformers/__init__.py +++ b/ktransformers/__init__.py @@ -5,7 +5,7 @@ Description : Author : kkk1nak0 Date : 2024-08-15 07:34:46 Version : 1.0.0 -LastEditors : unicornchan -LastEditTime : 2025-02-10 00:59:53 +LastEditors : chenxl +LastEditTime : 2025-02-15 03:53:02 ''' -__version__ = "0.2.0" \ No newline at end of file +__version__ = "0.2.3.post1" diff --git a/ktransformers/ktransformers_ext/CMakeLists.txt b/ktransformers/ktransformers_ext/CMakeLists.txt index 60cf721..eefcadf 100644 --- a/ktransformers/ktransformers_ext/CMakeLists.txt +++ b/ktransformers/ktransformers_ext/CMakeLists.txt @@ -30,6 +30,9 @@ if (NOT MSVC) option(LLAMA_F16C "llama: enable F16C" OFF) endif() option(LLAMA_AVX512_FANCY_SIMD "llama: enable AVX512-VL, AVX512-BW, AVX512-DQ, AVX512-VNNI" OFF) +option(KTRANSFORMERS_USE_CUDA "ktransformers: use CUDA" OFF) +option(KTRANSFORMERS_USE_MUSA "ktransformers: use MUSA" OFF) +option(KTRANSFORMERS_USE_ROCM "ktransformers: use ROCM" OFF) # Architecture specific # TODO: probably these flags need to be tweaked on some architectures @@ -173,6 +176,7 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW list(APPEND ARCH_FLAGS -mavx512bw) list(APPEND ARCH_FLAGS -mavx512dq) list(APPEND ARCH_FLAGS -mavx512vnni) + list(APPEND ARCH_FLAGS -mavx512vpopcntdq) endif() if (LLAMA_AVX512_BF16) list(APPEND ARCH_FLAGS -mavx512bf16) @@ -232,18 +236,40 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/llama.cpp ${CMAKE include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party) if (WIN32) include_directories("$ENV{CUDA_PATH}/include") + add_compile_definitions(KTRANSFORMERS_USE_CUDA=1) elseif (UNIX) - find_package(CUDA) - find_package(HIP) - find_package(MUSAToolkit) - if(CUDA_FOUND) + if (KTRANSFORMERS_USE_CUDA) + find_package(CUDA REQUIRED) include_directories("${CUDA_INCLUDE_DIRS}") + add_compile_definitions(KTRANSFORMERS_USE_CUDA=1) endif() - if(HIP_FOUND) - include_directories("${HIP_INCLUDE_DIRS}") + + if (KTRANSFORMERS_USE_ROCM) + find_package(HIP REQUIRED) + if(HIP_FOUND) + include_directories("${HIP_INCLUDE_DIRS}") + add_compile_definitions(KTRANSFORMERS_USE_ROCM=1) + endif() endif() - if(MUSAToolkit_FOUND) - include_directories("${MUSA_INCLUDE_DIRS}") + + if (KTRANSFORMERS_USE_MUSA) + if (NOT EXISTS $ENV{MUSA_PATH}) + if (NOT EXISTS /opt/musa) + set(MUSA_PATH /usr/local/musa) + else() + set(MUSA_PATH /opt/musa) + endif() + else() + set(MUSA_PATH $ENV{MUSA_PATH}) + endif() + + list(APPEND CMAKE_MODULE_PATH "${MUSA_PATH}/cmake") + + find_package(MUSAToolkit) + if (MUSAToolkit_FOUND) + message(STATUS "MUSA Toolkit found") + add_compile_definitions(KTRANSFORMERS_USE_MUSA=1) + endif() endif() endif() @@ -260,22 +286,19 @@ target_link_libraries(${PROJECT_NAME} PRIVATE llama) if(WIN32) target_link_libraries(${PROJECT_NAME} PRIVATE "$ENV{CUDA_PATH}/lib/x64/cudart.lib")#CUDA::cudart elseif(UNIX) - if(NOT DEFINED ENV{CUDA_HOME} OR "$ENV{CUDA_HOME}" STREQUAL "") - set(ENV{CUDA_HOME} "/usr/local/cuda") - endif() - 
if(CUDA_FOUND) - add_compile_definitions(USE_CUDA=1) + if(KTRANSFORMERS_USE_CUDA) + if(NOT DEFINED ENV{CUDA_HOME} OR "$ENV{CUDA_HOME}" STREQUAL "") + set(ENV{CUDA_HOME} "/usr/local/cuda") + endif() target_link_libraries(${PROJECT_NAME} PRIVATE "$ENV{CUDA_HOME}/lib64/libcudart.so") - message(STATUS "Building for CUDA") endif() - if(HIP_FOUND) + if (KTRANSFORMERS_USE_ROCM) add_compile_definitions(USE_HIP=1) target_link_libraries(${PROJECT_NAME} PRIVATE "${ROCM_PATH}/lib/libamdhip64.so") message(STATUS "Building for HIP") endif() - if(MUSAToolkit_FOUND) - add_compile_definitions(USE_MUSA=1) - message(STATUS "Building for MUSA") + if(KTRANSFORMERS_USE_MUSA) + target_link_libraries(${PROJECT_NAME} PRIVATE MUSA::musart) endif() endif() diff --git a/ktransformers/ktransformers_ext/cpu_backend/backend.cpp b/ktransformers/ktransformers_ext/cpu_backend/backend.cpp index 5980ba3..a254db9 100644 --- a/ktransformers/ktransformers_ext/cpu_backend/backend.cpp +++ b/ktransformers/ktransformers_ext/cpu_backend/backend.cpp @@ -54,7 +54,12 @@ void Backend::do_work_stealing_job(int task_num, init_func_ = init_func; compute_func_ = compute_func; finalize_func_ = finalize_func; +#ifdef USE_NUMA + // numa node location will be calculated based on the number of threads + thread_num_ = max_thread_num_; +#else thread_num_ = std::min(max_thread_num_, task_num); +#endif int base = task_num / thread_num_; int remain = task_num % thread_num_; thread_state_[0].end = base + (0 < remain); @@ -146,4 +151,4 @@ void Backend::worker_thread(int thread_id) { return; } } -} \ No newline at end of file +} diff --git a/ktransformers/ktransformers_ext/cpu_backend/cpuinfer.h b/ktransformers/ktransformers_ext/cpu_backend/cpuinfer.h index 180eb1d..d0f7b11 100644 --- a/ktransformers/ktransformers_ext/cpu_backend/cpuinfer.h +++ b/ktransformers/ktransformers_ext/cpu_backend/cpuinfer.h @@ -7,75 +7,83 @@ * @LastEditTime : 2024-08-07 09:47:43 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. **/ -#ifndef CPUINFER_CPUINFER_H -#define CPUINFER_CPUINFER_H - -#include -#include -#include -#include -#include -#include -#include - -#include "backend.h" -#include "task_queue.h" -#include "../vendors/vendor.h" - -#include "llama.cpp/ggml-impl.h" - -class CPUInfer { - public: - CPUInfer(int thread_num) { - backend_ = new Backend(thread_num - 1); - task_queue_ = new TaskQueue(); - for (int i = 0; i < (1 << 16); ++i) { - ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(i); - } - } - - ~CPUInfer() { - delete backend_; - delete task_queue_; - } - - template - void enqueue(Func f, Obj* obj, Args... 
args) { - task_queue_->enqueue([=]() { - std::invoke(f, *obj, args..., backend_); - }); - } - - void submit(std::pair params) { - void (*func)(void*) = (void (*)(void*))params.first; - void* args = (void*)params.second; - *((CPUInfer**)args) = this; - func(args); - } - - void sync() { - task_queue_->sync(); - } - - void submit_with_cuda_stream(intptr_t user_cuda_stream, std::pair params) { - void (*func)(void*) = (void (*)(void*))params.first; - void* args = (void*)params.second; - *((CPUInfer**)args) = this; - cudaLaunchHostFunc((cudaStream_t)user_cuda_stream, (cudaHostFn_t)func, args); - } - - static void sync_(void* cpu_infer_ptr) { - CPUInfer* cpuinfer = (CPUInfer*)cpu_infer_ptr; - cpuinfer->sync(); - } - - void sync_with_cuda_stream(intptr_t user_cuda_stream) { - cudaLaunchHostFunc((cudaStream_t)user_cuda_stream, (cudaHostFn_t)&sync_, (void*)this); - } - - public: - Backend* backend_; - TaskQueue* task_queue_; -}; - -#endif \ No newline at end of file + #ifndef CPUINFER_CPUINFER_H + #define CPUINFER_CPUINFER_H + + #include + #include + #include + #include + #include + #include + #include + #ifdef KTRANSFORMERS_USE_CUDA + #include "vendors/cuda.h" + #elif KTRANSFORMERS_USE_MUSA + #include "vendors/musa.h" + #elif KTRANSFORMERS_USE_ROCM + #define __HIP_PLATFORM_AMD__ + #include "vendors/hip.h" + #endif + + #include "backend.h" + #include "task_queue.h" + #include "../vendors/vendor.h" + + #include "llama.cpp/ggml-impl.h" + + class CPUInfer { + public: + CPUInfer(int thread_num) { + backend_ = new Backend(thread_num - 1); + task_queue_ = new TaskQueue(); + for (int i = 0; i < (1 << 16); ++i) { + ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(i); + } + } + + ~CPUInfer() { + delete backend_; + delete task_queue_; + } + + template + void enqueue(Func f, Obj* obj, Args... args) { + task_queue_->enqueue([=]() { + std::invoke(f, *obj, args..., backend_); + }); + } + + void submit(std::pair params) { + void (*func)(void*) = (void (*)(void*))params.first; + void* args = (void*)params.second; + *((CPUInfer**)args) = this; + func(args); + } + + void sync() { + task_queue_->sync(); + } + + void submit_with_cuda_stream(intptr_t user_cuda_stream, std::pair params) { + void (*func)(void*) = (void (*)(void*))params.first; + void* args = (void*)params.second; + *((CPUInfer**)args) = this; + cudaLaunchHostFunc((cudaStream_t)user_cuda_stream, (cudaHostFn_t)func, args); + } + + static void sync_(void* cpu_infer_ptr) { + CPUInfer* cpuinfer = (CPUInfer*)cpu_infer_ptr; + cpuinfer->sync(); + } + + void sync_with_cuda_stream(intptr_t user_cuda_stream) { + cudaLaunchHostFunc((cudaStream_t)user_cuda_stream, (cudaHostFn_t)&sync_, (void*)this); + } + + public: + Backend* backend_; + TaskQueue* task_queue_; + }; + + #endif \ No newline at end of file diff --git a/ktransformers/ktransformers_ext/cpu_backend/vendors/README.md b/ktransformers/ktransformers_ext/cpu_backend/vendors/README.md new file mode 100644 index 0000000..d179f66 --- /dev/null +++ b/ktransformers/ktransformers_ext/cpu_backend/vendors/README.md @@ -0,0 +1,3 @@ +## TODO + +This directory can be removed after updating the version of `llama.cpp`. 
\ No newline at end of file diff --git a/ktransformers/ktransformers_ext/cpu_backend/vendors/cuda.h b/ktransformers/ktransformers_ext/cpu_backend/vendors/cuda.h new file mode 100644 index 0000000..1746b07 --- /dev/null +++ b/ktransformers/ktransformers_ext/cpu_backend/vendors/cuda.h @@ -0,0 +1,15 @@ +#pragma once + +#include +#include +#include +#include +#include + +#if CUDART_VERSION < 11020 +#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED +#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH +#define CUBLAS_COMPUTE_16F CUDA_R_16F +#define CUBLAS_COMPUTE_32F CUDA_R_32F +#define cublasComputeType_t cudaDataType_t +#endif // CUDART_VERSION < 11020 diff --git a/ktransformers/ktransformers_ext/cpu_backend/vendors/hip.h b/ktransformers/ktransformers_ext/cpu_backend/vendors/hip.h new file mode 100644 index 0000000..abbc1e8 --- /dev/null +++ b/ktransformers/ktransformers_ext/cpu_backend/vendors/hip.h @@ -0,0 +1,172 @@ +#pragma once + +#define HIP_ENABLE_WARP_SYNC_BUILTINS 1 +#include +#include +#include +#include +#ifdef __HIP_PLATFORM_AMD__ +// for rocblas_initialize() +#include "rocblas/rocblas.h" +#endif // __HIP_PLATFORM_AMD__ + +#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_TF32_TENSOR_OP_MATH 0 +#define CUDA_R_16F HIPBLAS_R_16F +#define CUDA_R_32F HIPBLAS_R_32F +#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported +#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended +#define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned +#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice +#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite +#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }} +#define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width) +#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) +#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6 +#define cublasCreate hipblasCreate +#define cublasDestroy hipblasDestroy +#define cublasGemmEx hipblasGemmEx +#define cublasGemmBatchedEx hipblasGemmBatchedEx +#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx +#define cublasHandle_t hipblasHandle_t +#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS +#define cublasSetStream hipblasSetStream +#define cublasSgemm hipblasSgemm +#define cublasStatus_t hipblasStatus_t +#define cublasOperation_t hipblasOperation_t +#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6 +#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer +#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess +#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess +#define cudaDeviceProp hipDeviceProp_t +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaError_t hipError_t +#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled +#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled +#define cudaEventCreateWithFlags 
hipEventCreateWithFlags +#define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventRecord hipEventRecord +#define cudaEventSynchronize hipEventSynchronize +#define cudaEvent_t hipEvent_t +#define cudaEventDestroy hipEventDestroy +#define cudaFree hipFree +#define cudaFreeHost hipHostFree +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#define cudaHostRegister hipHostRegister +#define cudaHostRegisterPortable hipHostRegisterPortable +#define cudaHostRegisterReadOnly hipHostRegisterReadOnly +#define cudaHostUnregister hipHostUnregister +#define cudaLaunchHostFunc hipLaunchHostFunc +#define cudaMalloc hipMalloc +#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) +#define cudaMemcpy hipMemcpy +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyPeerAsync hipMemcpyPeerAsync +#define cudaMemcpy2DAsync hipMemcpy2DAsync +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyKind hipMemcpyKind +#define cudaMemset hipMemset +#define cudaMemsetAsync hipMemsetAsync +#define cudaMemGetInfo hipMemGetInfo +#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize +#define cudaSetDevice hipSetDevice +#define cuDeviceGet hipDeviceGet +#define CUdevice hipDevice_t +#define CUdeviceptr hipDeviceptr_t +#define cuMemUnmap hipMemUnmap +#define CUmemAccessDesc hipMemAccessDesc +#define cuMemAddressFree hipMemAddressFree +#define cuMemRelease hipMemRelease +#define CUmemGenericAllocationHandle hipMemGenericAllocationHandle_t +#define cuMemCreate hipMemCreate +#define cuMemAddressReserve hipMemAddressReserve +#define cuMemMap hipMemMap +#define cuMemSetAccess hipMemSetAccess +#define cuMemGetAllocationGranularity hipMemGetAllocationGranularity +#define CUmemAllocationProp hipMemAllocationProp +#define cuDeviceGetAttribute hipDeviceGetAttribute +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags +#define cudaStreamDestroy hipStreamDestroy +#define cudaStreamFireAndForget hipStreamFireAndForget +#define cudaStreamNonBlocking hipStreamNonBlocking +#define cudaStreamPerThread hipStreamPerThread +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags) +#define cudaGraphExec_t hipGraphExec_t +#define cudaGraphNode_t hipGraphNode_t +#define cudaKernelNodeParams hipKernelNodeParams +#define cudaKernelNodeParams hipKernelNodeParams +#define cudaGraphExecDestroy hipGraphExecDestroy +#define cudaGraphLaunch hipGraphLaunch +#define cudaErrorGraphExecUpdateFailure hipErrorGraphExecUpdateFailure +#define cudaGraphExecUpdateResultInfo hipGraphExecUpdateResult +#define cudaGraphNodeType hipGraphNodeType +#define cudaGraphNodeTypeKernel hipGraphNodeTypeKernel +#define cudaGraphInstantiate hipGraphInstantiate +#define cudaStreamEndCapture hipStreamEndCapture +#define cudaGraphDestroy hipGraphDestroy +#define cudaGraphKernelNodeSetParams hipGraphKernelNodeSetParams +#define cudaErrorInvalidDeviceFunction hipErrorInvalidDeviceFunction +#define cudaGraphKernelNodeGetParams hipGraphKernelNodeGetParams +#define cudaGraphNodeGetType hipGraphNodeGetType +#define cudaGraphGetNodes hipGraphGetNodes +#define cudaGraphExecUpdate hipGraphExecUpdate +#define 
cudaStreamCaptureModeRelaxed hipStreamCaptureModeRelaxed +#define cudaStreamBeginCapture hipStreamBeginCapture +#define cudaGraph_t hipGraph_t +#define cudaStream_t hipStream_t +#define cudaSuccess hipSuccess +#define cudaHostFn_t hipHostFn_t +#define __trap() do { abort(); __builtin_unreachable(); } while(0) +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED +#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED +#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE +#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH +#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR +#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED +#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR +#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED + +#define __CUDA_ARCH__ 1300 + +#if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) +#define GCN +#endif + +#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__) +#define CDNA +#endif + +#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \ + defined(__gfx1150__) || defined(__gfx1151__) +#define RDNA3 +#endif + +#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \ + defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__) +#define RDNA2 +#endif + +#if defined(__gfx1010__) || defined(__gfx1012__) +#define RDNA1 +#endif + +#ifndef __has_builtin + #define __has_builtin(x) 0 +#endif + +typedef hip_bfloat16 nv_bfloat16; diff --git a/ktransformers/ktransformers_ext/cpu_backend/vendors/musa.h b/ktransformers/ktransformers_ext/cpu_backend/vendors/musa.h new file mode 100644 index 0000000..6cc1b69 --- /dev/null +++ b/ktransformers/ktransformers_ext/cpu_backend/vendors/musa.h @@ -0,0 +1,137 @@ +#pragma once + +#include +#include +#include +#include +#include +#define CUBLAS_COMPUTE_16F CUDA_R_16F +#define CUBLAS_COMPUTE_32F CUDA_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F +#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT +#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT +#define CUBLAS_OP_N MUBLAS_OP_N +#define CUBLAS_OP_T MUBLAS_OP_T +#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS +#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT +#define CUDA_R_16F MUSA_R_16F +#define CUDA_R_32F MUSA_R_32F +#define cublasComputeType_t cudaDataType_t +#define cublasCreate mublasCreate +#define cublasDestroy mublasDestroy +#define cublasGemmEx mublasGemmEx +#define cublasGemmBatchedEx mublasGemmBatchedEx +#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx +#define cublasHandle_t mublasHandle_t +#define cublasSetMathMode mublasSetMathMode +#define cublasSetStream mublasSetStream +#define cublasSgemm mublasSgemm +#define cublasStatus_t mublasStatus_t +#define cublasOperation_t mublasOperation_t +#define cublasGetStatusString mublasStatus_to_string +#define cudaDataType_t musaDataType_t +#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer +#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess +#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess +#define cudaDeviceProp musaDeviceProp +#define cudaDeviceSynchronize musaDeviceSynchronize +#define cudaError_t musaError_t +#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled +#define cudaErrorPeerAccessNotEnabled 
musaErrorPeerAccessNotEnabled +#define cudaEventCreateWithFlags musaEventCreateWithFlags +#define cudaEventDisableTiming musaEventDisableTiming +#define cudaEventRecord musaEventRecord +#define cudaEventSynchronize musaEventSynchronize +#define cudaEvent_t musaEvent_t +#define cudaEventDestroy musaEventDestroy +#define cudaFree musaFree +#define cudaFreeHost musaFreeHost +#define cudaGetDevice musaGetDevice +#define cudaGetDeviceCount musaGetDeviceCount +#define cudaGetDeviceProperties musaGetDeviceProperties +#define cudaGetErrorString musaGetErrorString +#define cudaGetLastError musaGetLastError +#define cudaHostRegister musaHostRegister +#define cudaHostRegisterPortable musaHostRegisterPortable +#define cudaHostRegisterReadOnly musaHostRegisterReadOnly +#define cudaHostUnregister musaHostUnregister +#define cudaLaunchHostFunc musaLaunchHostFunc +#define cudaMalloc musaMalloc +#define cudaMallocHost musaMallocHost +#define cudaMallocManaged musaMallocManaged +#define cudaMemcpy musaMemcpy +#define cudaMemcpyAsync musaMemcpyAsync +#define cudaMemcpyPeerAsync musaMemcpyPeerAsync +#define cudaMemcpy2DAsync musaMemcpy2DAsync +#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost +#define cudaMemcpyHostToDevice musaMemcpyHostToDevice +#define cudaMemcpyKind musaMemcpyKind +#define cudaMemset musaMemset +#define cudaMemsetAsync musaMemsetAsync +#define cudaMemGetInfo musaMemGetInfo +#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize +#define cudaSetDevice musaSetDevice +#define cudaStreamCreateWithFlags musaStreamCreateWithFlags +#define cudaStreamDestroy musaStreamDestroy +#define cudaStreamFireAndForget musaStreamFireAndForget +#define cudaStreamNonBlocking musaStreamNonBlocking +#define cudaStreamPerThread musaStreamPerThread +#define cudaStreamSynchronize musaStreamSynchronize +#define cudaStreamWaitEvent musaStreamWaitEvent +#define cudaStream_t musaStream_t +#define cudaSuccess musaSuccess + +// Additional mappings for MUSA virtual memory pool +#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED +#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE +#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED +#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED +#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE +#define CUdevice MUdevice +#define CUdeviceptr MUdeviceptr +#define CUmemAccessDesc MUmemAccessDesc +#define CUmemAllocationProp MUmemAllocationProp +#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle +#define cuDeviceGet muDeviceGet +#define cuDeviceGetAttribute muDeviceGetAttribute +#define cuMemAddressFree muMemAddressFree +#define cuMemAddressReserve muMemAddressReserve +#define cuMemCreate muMemCreate +#define cuMemGetAllocationGranularity muMemGetAllocationGranularity +#define cuMemMap muMemMap +#define cuMemRelease muMemRelease +#define cuMemSetAccess muMemSetAccess +#define cuMemUnmap muMemUnmap +#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize +#define cudaFuncSetAttribute musaFuncSetAttribute +#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms +#define make_cudaExtent make_musaExtent +#define make_cudaPitchedPtr make_musaPitchedPtr + +// Additional mappings for MUSA graphs +#define CUDA_SUCCESS MUSA_SUCCESS +#define CUresult MUresult +#define cuGetErrorString muGetErrorString 
+#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure +#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction +#define cudaGraphDestroy musaGraphDestroy +#define cudaGraphExecDestroy musaGraphExecDestroy +#define cudaGraphExec_t musaGraphExec_t +#define cudaGraphExecUpdate musaGraphExecUpdate +#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult +#define cudaGraphGetNodes musaGraphGetNodes +#define cudaGraphInstantiate musaGraphInstantiate +#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams +#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams +#define cudaGraphLaunch musaGraphLaunch +#define cudaGraphNodeGetType musaGraphNodeGetType +#define cudaGraphNode_t musaGraphNode_t +#define cudaGraphNodeType musaGraphNodeType +#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel +#define cudaGraph_t musaGraph_t +#define cudaKernelNodeParams musaKernelNodeParams +#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed +#define cudaStreamEndCapture musaStreamEndCapture + +typedef mt_bfloat16 nv_bfloat16; diff --git a/ktransformers/ktransformers_ext/cpu_backend/vendors/vendor.h b/ktransformers/ktransformers_ext/cpu_backend/vendors/vendor.h new file mode 100644 index 0000000..8470438 --- /dev/null +++ b/ktransformers/ktransformers_ext/cpu_backend/vendors/vendor.h @@ -0,0 +1,13 @@ +#ifndef CPUINFER_VENDOR_VENDOR_H +#define CPUINFER_VENDOR_VENDOR_H + +#ifdef USE_CUDA +#include "cuda.h" +#elif USE_HIP +#define __HIP_PLATFORM_AMD__ +#include "hip.h" +#elif USE_MUSA +#include "musa.h" +#endif + +#endif // CPUINFER_VENDOR_VENDOR_H \ No newline at end of file diff --git a/ktransformers/ktransformers_ext/cuda/binding.cpp b/ktransformers/ktransformers_ext/cuda/binding.cpp index 65c8bc4..5bba873 100644 --- a/ktransformers/ktransformers_ext/cuda/binding.cpp +++ b/ktransformers/ktransformers_ext/cuda/binding.cpp @@ -1,15 +1,15 @@ /** - * @Description : - * @Author : Azure-Tang + * @Description : + * @Author : Azure-Tang, Boxin Zhang * @Date : 2024-07-25 13:38:30 - * @Version : 1.0.0 - * @LastEditors : kkk1nak0 - * @LastEditTime : 2024-08-12 03:05:04 - * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. + * @Version : 0.2.2 + * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
**/ #include "custom_gguf/ops.h" +#ifdef KTRANSFORMERS_USE_CUDA #include "gptq_marlin/ops.h" +#endif // Python bindings #include #include @@ -19,22 +19,53 @@ // namespace py = pybind11; PYBIND11_MODULE(KTransformersOps, m) { - m.def("dequantize_q8_0", &dequantize_q8_0, "Function to dequantize q8_0 data.", - py::arg("data"), py::arg("blk_size"), py::arg("device")); - m.def("dequantize_q6_k", &dequantize_q6_k, "Function to dequantize q6_k data.", - py::arg("data"), py::arg("blk_size"), py::arg("device")); - m.def("dequantize_q5_k", &dequantize_q5_k, "Function to dequantize q5_k data.", - py::arg("data"), py::arg("blk_size"), py::arg("device")); - m.def("dequantize_q4_k", &dequantize_q4_k, "Function to dequantize q4_k data.", - py::arg("data"), py::arg("blk_size"), py::arg("device")); - m.def("dequantize_q3_k", &dequantize_q3_k, "Function to dequantize q3_k data.", - py::arg("data"), py::arg("blk_size"), py::arg("device")); - m.def("dequantize_q2_k", &dequantize_q2_k, "Function to dequantize q2_k data.", - py::arg("data"), py::arg("blk_size"), py::arg("device")); - m.def("dequantize_iq4_xs", &dequantize_iq4_xs, "Function to dequantize iq4_xs data.", - py::arg("data"), py::arg("blk_size"), py::arg("device")); - m.def("gptq_marlin_gemm", &gptq_marlin_gemm, "Function to perform GEMM using Marlin quantization.", - py::arg("a"), py::arg("b_q_weight"), py::arg("b_scales"), py::arg("g_idx"), - py::arg("perm"), py::arg("workspace"), py::arg("num_bits"), py::arg("size_m"), - py::arg("size_n"), py::arg("size_k"), py::arg("is_k_full")); + + m.def("dequantize_q8_0", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) { + torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype); + return dequantize_q8_0((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype); + }, "Function to dequantize q8_0 data.", + py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype")); + + m.def("dequantize_q6_k", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) { + torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype); + return dequantize_q6_k((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype); + }, "Function to dequantize q6_k data.", + py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype")); + + m.def("dequantize_q5_k", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) { + torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype); + return dequantize_q5_k((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype); + }, "Function to dequantize q5_k data.", + py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype")); + + m.def("dequantize_q4_k", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) { + torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype); + return dequantize_q4_k((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype); + }, "Function to dequantize q4_k data.", + py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype")); + + 
m.def("dequantize_q3_k", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) { + torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype); + return dequantize_q3_k((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype); + }, "Function to dequantize q3_k data.", + py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype")); + + m.def("dequantize_q2_k", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) { + torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype); + return dequantize_q2_k((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype); + }, "Function to dequantize q2_k data.", + py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype")); + + m.def("dequantize_iq4_xs", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) { + torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype); + return dequantize_iq4_xs((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype); + }, "Function to dequantize iq4_xs data.", + py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype")); + +#ifdef KTRANSFORMERS_USE_CUDA + m.def("gptq_marlin_gemm", &gptq_marlin_gemm, "Function to perform GEMM using Marlin quantization.", + py::arg("a"), py::arg("b_q_weight"), py::arg("b_scales"), py::arg("g_idx"), + py::arg("perm"), py::arg("workspace"), py::arg("num_bits"), py::arg("size_m"), + py::arg("size_n"), py::arg("size_k"), py::arg("is_k_full")); +#endif } diff --git a/ktransformers/ktransformers_ext/cuda/custom_gguf/binding.cpp b/ktransformers/ktransformers_ext/cuda/custom_gguf/binding.cpp deleted file mode 100644 index 99069d8..0000000 --- a/ktransformers/ktransformers_ext/cuda/custom_gguf/binding.cpp +++ /dev/null @@ -1,35 +0,0 @@ -#include "ops.h" -// Python bindings -#include -#include -#include -#include -#include -// namespace py = pybind11; - -int test(){ - return 5; -} - -torch::Tensor dequantize_q6_k(torch::Tensor data, int blk_size, torch::Device device); -torch::Tensor dequantize_q5_k(torch::Tensor data, int blk_size, torch::Device device); -torch::Tensor dequantize_q2_k(torch::Tensor data, int blk_size, torch::Device device); - -PYBIND11_MODULE(cudaops, m) { - m.def("dequantize_q8_0", &dequantize_q8_0, "Function to dequantize q8_0 data.", - py::arg("data"), py::arg("blk_size"), py::arg("device")); - m.def("dequantize_q6_k", &dequantize_q6_k, "Function to dequantize q6_k data.", - py::arg("data"), py::arg("blk_size"), py::arg("device")); - m.def("dequantize_q5_k", &dequantize_q5_k, "Function to dequantize q5_k data.", - py::arg("data"), py::arg("blk_size"), py::arg("device")); - m.def("dequantize_q4_k", &dequantize_q4_k, "Function to dequantize q4_k data.", - py::arg("data"), py::arg("blk_size"), py::arg("device")); - m.def("dequantize_q3_k", &dequantize_q3_k, "Function to dequantize q3_k data.", - py::arg("data"), py::arg("blk_size"), py::arg("device")); - m.def("dequantize_q2_k", &dequantize_q2_k, "Function to dequantize q2_k data.", - py::arg("data"), py::arg("blk_size"), py::arg("device")); - m.def("dequantize_iq4_xs", &dequantize_iq4_xs, "Function to dequantize iq4_xs data.", - py::arg("data"), 
py::arg("blk_size"), py::arg("device")); - m.def("test", &test, "Function to test."); - -} diff --git a/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu b/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu index 0c49fa7..e80efc4 100644 --- a/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu +++ b/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu @@ -2,26 +2,55 @@ * @Description : * @Author : Azure-Tang, Boxin Zhang * @Date : 2024-07-25 13:38:30 - * @Version : 1.0.0 - * @LastEditors : kkk1nak0 - * @LastEditTime : 2024-08-12 04:18:04 + * @Version : 0.2.2 * Adapted from https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c * Copyright (c) 2023-2024 The ggml authors * Copyright (c) 2024 by KVCache.AI, All Rights Reserved. */ #include +#include +#include #include #include #include #include #include -__global__ void dequantize_q8_0_kernel(float* output, const float* scales, const int8_t* qs, int num_blocks, int blk_size) { - int global_idx = blockIdx.x * blockDim.x + threadIdx.x; - for (auto block_id=global_idx; block_id(data + block_id * blk_size + 80))); - const float min = __half2float(*(reinterpret_cast(data + block_id * blk_size + 82))); + const float d = __half2float(*(reinterpret_cast(data + block_id * blk_size + 80))); + const float min = __half2float(*(reinterpret_cast(data + block_id * blk_size + 82))); const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 16); @@ -70,17 +99,85 @@ __global__ void dequantize_q2_k_kernel(int8_t* data, float* output, int blk_size } } -__global__ void dequantize_q3_k_kernel(int8_t* data, float* output, int blk_size, int num_blocks) { +__global__ void dequantize_q2_k_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) { + long long global_idx = blockIdx.x * blockDim.x + threadIdx.x; + for (long long block_id=global_idx; block_id(data + block_id * blk_size + 80))); + const float min = __half2float(*(reinterpret_cast(data + block_id * blk_size + 82))); + + const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 16); + + int is = 0; + float dl, ml; + + for (int n = 0; n < 256; n += 128) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + uint8_t* scales = (uint8_t*)(data + block_id * blk_size + (is++)); + uint8_t sc = *scales; + dl = d * (sc & 0xF); ml = min * (sc >> 4); + for (int l = 0; l < 16; ++l) *output_blk++ = __float2half(dl * ((int8_t)((q[l] >> shift) & 3)) - ml); + + scales = (uint8_t*)(data + block_id * blk_size + (is++)); + sc = *scales; + + dl = d * (sc & 0xF); ml = min * (sc >> 4); + for (int l = 0; l < 16; ++l) *output_blk++ = __float2half(dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml); + + shift += 2; + } + q += 32; + } + } +} + +__global__ void dequantize_q2_k_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) { + long long global_idx = blockIdx.x * blockDim.x + threadIdx.x; + for (long long block_id=global_idx; block_id(data + block_id * blk_size + 80))); + const float min = __half2float(*(reinterpret_cast(data + block_id * blk_size + 82))); + + const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 16); + + int is = 0; + float dl, ml; + + for (int n = 0; n < 256; n += 128) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + uint8_t* scales = (uint8_t*)(data + block_id * blk_size + (is++)); + uint8_t sc = *scales; + dl = d * (sc & 0xF); ml = min * (sc >> 4); + for (int l = 0; l < 
16; ++l) *output_blk++ = __float2bfloat16(dl * ((int8_t)((q[l] >> shift) & 3)) - ml); + + scales = (uint8_t*)(data + block_id * blk_size + (is++)); + sc = *scales; + + dl = d * (sc & 0xF); ml = min * (sc >> 4); + for (int l = 0; l < 16; ++l) *output_blk++ = __float2bfloat16(dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml); + + shift += 2; + } + q += 32; + } + } +} + +__global__ void dequantize_q3_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) { - int global_idx = blockIdx.x * blockDim.x + threadIdx.x; + long long global_idx = blockIdx.x * blockDim.x + threadIdx.x; const uint32_t kmask1 = 0x03030303; const uint32_t kmask2 = 0x0f0f0f0f; - for (auto block_id=global_idx; block_id(data + block_id * blk_size + 108))); + const float d_all = __half2float(*(reinterpret_cast(data + block_id * blk_size + 108))); const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 32); const uint8_t * __restrict__ hm = (uint8_t*)(data + block_id * blk_size + 0); @@ -126,19 +223,131 @@ __global__ void dequantize_q3_k_kernel(int8_t* data, float* output, int blk_size } } +__global__ void dequantize_q3_k_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) { + + long long global_idx = blockIdx.x * blockDim.x + threadIdx.x; + const uint32_t kmask1 = 0x03030303; + const uint32_t kmask2 = 0x0f0f0f0f; + for (long long block_id=global_idx; block_id(data + block_id * blk_size + 108))); + + const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 32); + const uint8_t * __restrict__ hm = (uint8_t*)(data + block_id * blk_size + 0); + uint8_t m = 1; + + + uint8_t* block_scales = (uint8_t*)(data + block_id * blk_size + 96); + + for (int i = 0; i < 3; i++) { + aux[i] = 0; + for (int j = 0; j < 4; j++) { + aux[i] |= ((uint32_t)block_scales[i * 4 + j]) << (j * 8); + } + } + + uint32_t tmp = aux[2]; + aux[2] = ((aux[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + aux[3] = ((aux[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + aux[0] = (aux[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + aux[1] = (aux[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + + int is = 0; + float dl; + for (int n = 0; n < 256; n += 128) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + + dl = d_all * (scales[is++] - 32); + for (int l = 0; l < 16; ++l) { + *output_blk++ = __float2half(dl * ((int8_t)((q[l+ 0] >> shift) & 3) - ((hm[l+ 0] & m) ? 0 : 4))); + } + + dl = d_all * (scales[is++] - 32); + for (int l = 0; l < 16; ++l) { + *output_blk++ = __float2half(dl * ((int8_t)((q[l+16] >> shift) & 3) - ((hm[l+16] & m) ? 
0 : 4))); + } + + shift += 2; + m <<= 1; + } + q += 32; + } + } +} + +__global__ void dequantize_q3_k_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) { + + long long global_idx = blockIdx.x * blockDim.x + threadIdx.x; + const uint32_t kmask1 = 0x03030303; + const uint32_t kmask2 = 0x0f0f0f0f; + for (long long block_id=global_idx; block_id(data + block_id * blk_size + 108))); + + const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 32); + const uint8_t * __restrict__ hm = (uint8_t*)(data + block_id * blk_size + 0); + uint8_t m = 1; + + + uint8_t* block_scales = (uint8_t*)(data + block_id * blk_size + 96); + + for (int i = 0; i < 3; i++) { + aux[i] = 0; + for (int j = 0; j < 4; j++) { + aux[i] |= ((uint32_t)block_scales[i * 4 + j]) << (j * 8); + } + } + + uint32_t tmp = aux[2]; + aux[2] = ((aux[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + aux[3] = ((aux[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + aux[0] = (aux[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + aux[1] = (aux[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + + int is = 0; + float dl; + for (int n = 0; n < 256; n += 128) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + + dl = d_all * (scales[is++] - 32); + for (int l = 0; l < 16; ++l) { + *output_blk++ = __float2bfloat16(dl * ((int8_t)((q[l+ 0] >> shift) & 3) - ((hm[l+ 0] & m) ? 0 : 4))); + } + + dl = d_all * (scales[is++] - 32); + for (int l = 0; l < 16; ++l) { + *output_blk++ = __float2bfloat16(dl * ((int8_t)((q[l+16] >> shift) & 3) - ((hm[l+16] & m) ? 0 : 4))); + } + + shift += 2; + m <<= 1; + } + q += 32; + } + } +} + + +__global__ void dequantize_q4_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) { + long long global_idx = blockIdx.x * blockDim.x + threadIdx.x; + for (long long block_id=global_idx; block_id(data + block_id * 144 + 0))); - const float min = __half2float(*(reinterpret_cast(data + block_id * 144 + 2))); + const float d = __half2float(*(reinterpret_cast(data + block_id * 144 + 0))); + const float min = __half2float(*(reinterpret_cast(data + block_id * 144 + 2))); int is = 0; uint8_t sc, m; - for (int j = 0; j < blk_size; j += 64) { + for (int j = 0; j < ele_per_blk; j += 64) { uint8_t* scales = (uint8_t*)(data + block_id * 144 + 4); get_scale_min_k4(is + 0, scales, &sc, &m); const float d1 = d * sc; const float m1 = min * m; @@ -151,13 +360,61 @@ __global__ void dequantize_q4_k_kernel(int8_t* data, float* output, int blk_size } } -__global__ void dequantize_q5_k_kernel(int8_t* data, float* output, int blk_size, int num_blocks) { - int global_idx = blockIdx.x * blockDim.x + threadIdx.x; - for (auto block_id=global_idx; block_id(data + block_id * blk_size + 0))); - const float min = __half2float(*(reinterpret_cast(data + block_id * blk_size + 2))); + const float d = __half2float(*(reinterpret_cast(data + block_id * 144 + 0))); + const float min = __half2float(*(reinterpret_cast(data + block_id * 144 + 2))); + int is = 0; + uint8_t sc, m; + for (int j = 0; j < ele_per_blk; j += 64) { + uint8_t* scales = (uint8_t*)(data + block_id * 144 + 4); + get_scale_min_k4(is + 0, scales, &sc, &m); + const float d1 = d * sc; const float m1 = min * m; + get_scale_min_k4(is + 1, scales, &sc, &m); + const float d2 = d * sc; const float m2 = min * m; + for (int l = 0; l < 32; ++l) *output_blk++ = __float2half(d1 * (q[l] & 0xF) - m1); + for (int l = 0; l < 32; ++l) *output_blk++ = __float2half(d2 * (q[l] >> 4) - 
m2); + q += 32; is += 2; + } + } +} + +__global__ void dequantize_q4_k_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) { + long long global_idx = blockIdx.x * blockDim.x + threadIdx.x; + for (long long block_id=global_idx; block_id(data + block_id * 144 + 0))); + const float min = __half2float(*(reinterpret_cast(data + block_id * 144 + 2))); + int is = 0; + uint8_t sc, m; + for (int j = 0; j < ele_per_blk; j += 64) { + uint8_t* scales = (uint8_t*)(data + block_id * 144 + 4); + get_scale_min_k4(is + 0, scales, &sc, &m); + const float d1 = d * sc; const float m1 = min * m; + get_scale_min_k4(is + 1, scales, &sc, &m); + const float d2 = d * sc; const float m2 = min * m; + for (int l = 0; l < 32; ++l) *output_blk++ = __float2bfloat16(d1 * (q[l] & 0xF) - m1); + for (int l = 0; l < 32; ++l) *output_blk++ = __float2bfloat16(d2 * (q[l] >> 4) - m2); + q += 32; is += 2; + } + } +} + +__global__ void dequantize_q5_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) { + long long global_idx = blockIdx.x * blockDim.x + threadIdx.x; + for (long long block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x){ + float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk); + + const float d = __half2float(*(reinterpret_cast(data + block_id * blk_size + 0))); + const float min = __half2float(*(reinterpret_cast(data + block_id * blk_size + 2))); const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 16); const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size + 48); @@ -180,46 +437,165 @@ __global__ void dequantize_q5_k_kernel(int8_t* data, float* output, int blk_size } } -__global__ void dequantize_q6_k_kernel(int8_t* data, float* output, int blk_size, int num_blocks) { - int global_idx = blockIdx.x * blockDim.x + threadIdx.x; - for (auto block_id=global_idx; block_id(data + block_id * blk_size + 208))); +__global__ void dequantize_q5_k_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) { + long long global_idx = blockIdx.x * blockDim.x + threadIdx.x; + for (long long block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x){ + __half* __restrict__ output_blk = (__half*)(output + block_id * ele_per_blk); + + const float d = __half2float(*(reinterpret_cast(data + block_id * blk_size + 0))); + const float min = __half2float(*(reinterpret_cast(data + block_id * blk_size + 2))); + + const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 16); + const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size + 48); + + int is = 0; + uint8_t sc, m; + uint8_t u1 = 1, u2 = 2; + uint8_t* scales = (uint8_t*)(data + block_id * blk_size + 4); + + for (int j = 0; j < 256; j += 64) { + get_scale_min_k4(is + 0, scales, &sc, &m); + const float d1 = d * sc; const float m1 = min * m; + get_scale_min_k4(is + 1, scales, &sc, &m); + const float d2 = d * sc; const float m2 = min * m; + for (int l = 0; l < 32; ++l) *output_blk++ = __float2half(d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1); + for (int l = 0; l < 32; ++l) *output_blk++ = __float2half(d2 * ((ql[l] >> 4) + (qh[l] & u2 ? 
16 : 0)) - m2); + ql += 32; is += 2; + u1 <<= 2; u2 <<= 2; + } + } +} + +__global__ void dequantize_q5_k_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) { + long long global_idx = blockIdx.x * blockDim.x + threadIdx.x; + for (long long block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x){ + nv_bfloat16* __restrict__ output_blk = (nv_bfloat16*)(output + block_id * ele_per_blk); + + const float d = __half2float(*(reinterpret_cast(data + block_id * blk_size + 0))); + const float min = __half2float(*(reinterpret_cast(data + block_id * blk_size + 2))); + + const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 16); + const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size + 48); + + int is = 0; + uint8_t sc, m; + uint8_t u1 = 1, u2 = 2; + uint8_t* scales = (uint8_t*)(data + block_id * blk_size + 4); + + for (int j = 0; j < 256; j += 64) { + get_scale_min_k4(is + 0, scales, &sc, &m); + const float d1 = d * sc; const float m1 = min * m; + get_scale_min_k4(is + 1, scales, &sc, &m); + const float d2 = d * sc; const float m2 = min * m; + for (int l = 0; l < 32; ++l) *output_blk++ = __float2bfloat16(d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1); + for (int l = 0; l < 32; ++l) *output_blk++ = __float2bfloat16(d2 * ((ql[l] >> 4) + (qh[l] & u2 ? 16 : 0)) - m2); + ql += 32; is += 2; + u1 <<= 2; u2 <<= 2; + } + } +} + +__global__ void dequantize_q6_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) { + long long global_idx = blockIdx.x * blockDim.x + threadIdx.x; + for (long long block_id=global_idx; block_id(data + block_id * blk_size + 208))); const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size); const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 128); const int8_t * __restrict__ sc = (int8_t*)(data + block_id * blk_size + 192); - //if (blk_size == 256){ - for (int n = 0; n < blk_size; n += 128) { - for (int l = 0; l < 32; ++l) { - int is = l/16; - const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; - const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; - const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; - const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; - output_blk[l + 0] = d * sc[is + 0] * q1; - output_blk[l + 32] = d * sc[is + 2] * q2; - output_blk[l + 64] = d * sc[is + 4] * q3; - output_blk[l + 96] = d * sc[is + 6] * q4; - } - output_blk += 128; - ql += 64; - qh += 32; - sc += 8; + for (int n = 0; n < ele_per_blk; n += 128) { + for (int l = 0; l < 32; ++l) { + int is = l/16; + const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + output_blk[l + 0] = d * sc[is + 0] * q1; + output_blk[l + 32] = d * sc[is + 2] * q2; + output_blk[l + 64] = d * sc[is + 4] * q3; + output_blk[l + 96] = d * sc[is + 6] * q4; } + output_blk += 128; + ql += 64; + qh += 32; + sc += 8; + } + } +} + +__global__ void dequantize_q6_k_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) { + long long global_idx = blockIdx.x * blockDim.x + 
threadIdx.x; + for (long long block_id=global_idx; block_id(data + block_id * blk_size + 208))); + + const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size); + const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 128); + const int8_t * __restrict__ sc = (int8_t*)(data + block_id * blk_size + 192); + + + for (int n = 0; n < ele_per_blk; n += 128) { + for (int l = 0; l < 32; ++l) { + int is = l/16; + const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + output_blk[l + 0] = __float2half(d * sc[is + 0] * q1); + output_blk[l + 32] = __float2half(d * sc[is + 2] * q2); + output_blk[l + 64] = __float2half(d * sc[is + 4] * q3); + output_blk[l + 96] = __float2half(d * sc[is + 6] * q4); + } + output_blk += 128; + ql += 64; + qh += 32; + sc += 8; + } + } +} + +__global__ void dequantize_q6_k_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) { + long long global_idx = blockIdx.x * blockDim.x + threadIdx.x; + for (long long block_id=global_idx; block_id(data + block_id * blk_size + 208))); + + const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size); + const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 128); + const int8_t * __restrict__ sc = (int8_t*)(data + block_id * blk_size + 192); + + + for (int n = 0; n < ele_per_blk; n += 128) { + for (int l = 0; l < 32; ++l) { + int is = l/16; + const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + output_blk[l + 0] = __float2bfloat16(d * sc[is + 0] * q1); + output_blk[l + 32] = __float2bfloat16(d * sc[is + 2] * q2); + output_blk[l + 64] = __float2bfloat16(d * sc[is + 4] * q3); + output_blk[l + 96] = __float2bfloat16(d * sc[is + 6] * q4); + } + output_blk += 128; + ql += 64; + qh += 32; + sc += 8; + } } } static constexpr __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; -__global__ void dequantize_iq4_xs_kernel(int8_t* data, float* output, int blk_size, int num_blocks) { - int global_idx = blockIdx.x * blockDim.x + threadIdx.x; - for (auto block_id=global_idx; block_id(data + block_id * blk_size))); - const uint16_t scales_h = *(reinterpret_cast(data + block_id * blk_size + 2)); +__global__ void dequantize_iq4_xs_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) { + long long global_idx = blockIdx.x * blockDim.x + threadIdx.x; + for (long long block_id=global_idx; block_id(data + block_id * blk_size))); + const uint16_t scales_h = *(reinterpret_cast(data + block_id * blk_size + 2)); const uint8_t* scales_l = (uint8_t*)(data + block_id * blk_size + 2 + 2); const uint8_t* qs = (uint8_t*)(data + block_id * blk_size + 2 + 2 + 4); @@ -236,152 +612,267 @@ __global__ void dequantize_iq4_xs_kernel(int8_t* data, float* output, int blk_si } } -torch::Tensor dequantize_q8_0(torch::Tensor data, int blk_size, torch::Device device) { - int num_blocks = data.numel() / 
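As a side note (not part of the diff), the Q6_K kernels above all walk the same block layout: 128 bytes of low 4-bit quants `ql`, 64 bytes of high 2-bit quants `qh` (offset 128), 16 signed scales (offset 192), and an fp16 super-scale `d` (offset 208); they differ only in the output conversion (`float`, `__float2half`, `__float2bfloat16`). A NumPy reference for a single block, mirroring `dequantize_q6_k_fp32_kernel`, may make the bit manipulation easier to follow; the offsets are exactly those read by the kernel.

```python
# Reference sketch of Q6_K dequantization for one block, mirroring
# dequantize_q6_k_fp32_kernel above (offsets as read by that kernel).
import numpy as np

def dequantize_q6_k_block(block: bytes) -> np.ndarray:
    ql = np.frombuffer(block, dtype=np.uint8, count=128, offset=0).astype(np.int32)
    qh = np.frombuffer(block, dtype=np.uint8, count=64, offset=128).astype(np.int32)
    sc = np.frombuffer(block, dtype=np.int8, count=16, offset=192).astype(np.float32)
    d = float(np.frombuffer(block, dtype=np.float16, count=1, offset=208)[0])

    out = np.empty(256, dtype=np.float32)
    for half in range(2):                     # two 128-element halves per block
        base = half * 128
        ql_h, qh_h, sc_h = ql[half * 64:], qh[half * 32:], sc[half * 8:]
        for l in range(32):
            isc = l // 16
            q1 = ((ql_h[l]      & 0xF) | (((qh_h[l] >> 0) & 3) << 4)) - 32
            q2 = ((ql_h[l + 32] & 0xF) | (((qh_h[l] >> 2) & 3) << 4)) - 32
            q3 = ((ql_h[l]      >> 4)  | (((qh_h[l] >> 4) & 3) << 4)) - 32
            q4 = ((ql_h[l + 32] >> 4)  | (((qh_h[l] >> 6) & 3) << 4)) - 32
            out[base + l]      = d * sc_h[isc + 0] * q1
            out[base + l + 32] = d * sc_h[isc + 2] * q2
            out[base + l + 64] = d * sc_h[isc + 4] * q3
            out[base + l + 96] = d * sc_h[isc + 6] * q4
    return out
```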
blk_size; +__global__ void dequantize_iq4_xs_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) { + long long global_idx = blockIdx.x * blockDim.x + threadIdx.x; + for (long long block_id=global_idx; block_id(data + block_id * blk_size))); + const uint16_t scales_h = *(reinterpret_cast(data + block_id * blk_size + 2)); + const uint8_t* scales_l = (uint8_t*)(data + block_id * blk_size + 2 + 2); + const uint8_t* qs = (uint8_t*)(data + block_id * blk_size + 2 + 2 + 4); + + for (int ib = 0; ib < 8; ++ib) { + const int ls = ((scales_l[ib / 2] >> 4 * (ib % 2)) & 0xf) | (((scales_h >> 2 * ib) & 3) << 4); + const float dl = d * (ls - 32); + for (int j = 0; j < 16; ++j) { + output_blk[j + 0] = __float2half(dl * kvalues_iq4nl[qs[j] & 0xf]); + output_blk[j + 16] = __float2half(dl * kvalues_iq4nl[qs[j] >> 4]); + } + output_blk += 32; + qs += 16; + } + } +} + +__global__ void dequantize_iq4_xs_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) { + long long global_idx = blockIdx.x * blockDim.x + threadIdx.x; + for (long long block_id=global_idx; block_id(data + block_id * blk_size))); + const uint16_t scales_h = *(reinterpret_cast(data + block_id * blk_size + 2)); + const uint8_t* scales_l = (uint8_t*)(data + block_id * blk_size + 2 + 2); + const uint8_t* qs = (uint8_t*)(data + block_id * blk_size + 2 + 2 + 4); + + for (int ib = 0; ib < 8; ++ib) { + const int ls = ((scales_l[ib / 2] >> 4 * (ib % 2)) & 0xf) | (((scales_h >> 2 * ib) & 3) << 4); + const float dl = d * (ls - 32); + for (int j = 0; j < 16; ++j) { + output_blk[j + 0] = __float2bfloat16(dl * kvalues_iq4nl[qs[j] & 0xf]); + output_blk[j + 16] = __float2bfloat16(dl * kvalues_iq4nl[qs[j] >> 4]); + } + output_blk += 32; + qs += 16; + } + } +} + +torch::Tensor dequantize_q8_0(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) { + int num_blocks = num_bytes / blk_size; const at::cuda::OptionalCUDAGuard device_guard(device); - // create gpu - auto options_scales = torch::TensorOptions().dtype(torch::kFloat32).device(device).memory_format(torch::MemoryFormat::Contiguous); - auto options_qs = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous); - auto scales_gpu = torch::empty({{num_blocks, 1}}, options_scales); - auto qs_gpu = torch::empty({num_blocks, 32}, options_qs); - // read on cpu - options_scales = torch::TensorOptions().dtype(torch::kFloat16).device(torch::kCPU); - options_qs = torch::TensorOptions().dtype(torch::kInt8).device(torch::kCPU); + auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous); + auto data_gpu = torch::empty({ num_bytes }, options); - // // reinterpret - auto scales = torch::from_blob(data.data_ptr(), {num_blocks, 1 + 16}, options_scales).slice(1, 0, 1); - auto qs = torch::from_blob(data.data_ptr(), {num_blocks, 2 + 32}, options_qs).slice(1, 2); - - auto scales_f32 = scales.to(torch::kFloat32); - scales_gpu.copy_(scales_f32, false); - qs_gpu.copy_(qs, false); + cudaMemcpy(data_gpu.data_ptr(), data, num_bytes, cudaMemcpyHostToDevice); + //data_gpu.copy_(data, false); // Create output tensor - auto output = torch::zeros_like(qs, torch::dtype(torch::kFloat32).device(device)); + auto output = torch::zeros({ num_blocks, 32 }, torch::dtype(target_dtype).device(device)); - // Launch kernel - 
dequantize_q8_0_kernel<<< 512, 256 >>>( - output.data_ptr(), scales_gpu.data_ptr(), qs_gpu.data_ptr(), num_blocks, 32); + switch (target_dtype) { + case torch::kFloat16: + dequantize_q8_0_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks); + break; + case torch::kBFloat16: + dequantize_q8_0_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks); + break; + case torch::kFloat32: + dequantize_q8_0_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr(), output.data_ptr(), blk_size, ele_per_blk, num_blocks); + break; + default: + printf("target type not support\n"); + exit(0); + } cudaDeviceSynchronize(); return output; } -torch::Tensor dequantize_q6_k(torch::Tensor data, int blk_size, torch::Device device) { +torch::Tensor dequantize_q6_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) { // data.numel%blk_size should be 0, else raise err - int num_blocks = data.numel() / blk_size; + int num_blocks = num_bytes / blk_size; const at::cuda::OptionalCUDAGuard device_guard(device); auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous); - auto data_gpu = torch::empty({data.numel()}, options); + auto data_gpu = torch::empty({num_bytes}, options); - data_gpu.copy_(data, false); + cudaMemcpy(data_gpu.data_ptr(), data, num_bytes, cudaMemcpyHostToDevice); + //data_gpu.copy_(data, false); // Create output tensor - auto output = torch::zeros({num_blocks, 256}, torch::dtype(torch::kFloat32).device(device)); - - // Launch kernel - dequantize_q6_k_kernel<<< 512, 256 >>>(data_gpu.data_ptr(), output.data_ptr(), blk_size, num_blocks); - // dequantize_q6_k_kernel<<< 512, 256 >>>(data_gpu.data_ptr(), output.data_ptr(), 256, num_blocks); + auto output = torch::zeros({num_blocks, 256}, torch::dtype(target_dtype).device(device)); + switch (target_dtype) { + case torch::kFloat16: + dequantize_q6_k_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks); + break; + case torch::kBFloat16: + dequantize_q6_k_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks); + break; + case torch::kFloat32: + dequantize_q6_k_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr(), output.data_ptr(), blk_size, ele_per_blk, num_blocks); + break; + default: + printf("target type not support\n"); + exit(0); + } cudaDeviceSynchronize(); return output; } -torch::Tensor dequantize_q5_k(torch::Tensor data, int blk_size, torch::Device device) { - int num_blocks = data.numel() / blk_size; +torch::Tensor dequantize_q5_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) { + int num_blocks = num_bytes / blk_size; const at::cuda::OptionalCUDAGuard device_guard(device); auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous); - auto data_gpu = torch::empty({data.numel()}, options); + auto data_gpu = torch::empty({num_bytes}, options); - data_gpu.copy_(data, false); + cudaMemcpy(data_gpu.data_ptr(), data, num_bytes, cudaMemcpyHostToDevice); + //data_gpu.copy_(data, false); // Create output tensor - auto output = torch::zeros({num_blocks, 256}, torch::dtype(torch::kFloat32).device(device)); - - // Launch kernel - 
dequantize_q5_k_kernel<<< 512, 256 >>>(data_gpu.data_ptr(), output.data_ptr(), blk_size, num_blocks); + auto output = torch::zeros({num_blocks, 256}, torch::dtype(target_dtype).device(device)); + switch (target_dtype) { + case torch::kFloat16: + dequantize_q5_k_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks); + break; + case torch::kBFloat16: + dequantize_q5_k_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks); + break; + case torch::kFloat32: + dequantize_q5_k_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr(), output.data_ptr(), blk_size, ele_per_blk, num_blocks); + break; + default: + printf("target type not support\n"); + exit(0); + } cudaDeviceSynchronize(); return output; } -torch::Tensor dequantize_q4_k(torch::Tensor data, int blk_size, torch::Device device) { +torch::Tensor dequantize_q4_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) { // data.numel%blk_size should be 0, else raise err - int num_blocks = data.numel() / blk_size; + int num_blocks = num_bytes / blk_size; const at::cuda::OptionalCUDAGuard device_guard(device); auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous); - auto data_gpu = torch::empty({data.numel()}, options); + auto data_gpu = torch::empty({num_bytes}, options); - data_gpu.copy_(data, false); + cudaMemcpy(data_gpu.data_ptr(), data, num_bytes, cudaMemcpyHostToDevice); + //data_gpu.copy_(data, false); // Create output tensor - auto output = torch::zeros({num_blocks, 256}, torch::dtype(torch::kFloat32).device(device)); - - // Launch kernel - dequantize_q4_k_kernel<<< 512, 256 >>>(data_gpu.data_ptr(), output.data_ptr(), 256, num_blocks); + auto output = torch::zeros({num_blocks, 256}, torch::dtype(target_dtype).device(device)); + switch (target_dtype) { + case torch::kFloat16: + dequantize_q4_k_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks); + break; + case torch::kBFloat16: + dequantize_q4_k_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks); + break; + case torch::kFloat32: + dequantize_q4_k_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr(), output.data_ptr(), blk_size, ele_per_blk, num_blocks); + break; + default: + printf("target type not support\n"); + exit(0); + } cudaDeviceSynchronize(); return output; } -torch::Tensor dequantize_q3_k(torch::Tensor data, int blk_size, torch::Device device) { - int num_blocks = data.numel() / blk_size; +torch::Tensor dequantize_q3_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) { + int num_blocks = num_bytes / blk_size; const at::cuda::OptionalCUDAGuard device_guard(device); auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous); - auto data_gpu = torch::empty({data.numel()}, options); + auto data_gpu = torch::empty({num_bytes}, options); - data_gpu.copy_(data, false); + cudaMemcpy(data_gpu.data_ptr(), data, num_bytes, cudaMemcpyHostToDevice); + //data_gpu.copy_(data, false); // Create output tensor - auto output = torch::zeros({num_blocks, 256}, torch::dtype(torch::kFloat32).device(device)); - - // Launch kernel - dequantize_q3_k_kernel<<< 512, 256 
>>>(data_gpu.data_ptr(), output.data_ptr(), blk_size, num_blocks); + auto output = torch::zeros({num_blocks, 256}, torch::dtype(target_dtype).device(device)); + switch (target_dtype) { + case torch::kFloat16: + dequantize_q3_k_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks); + break; + case torch::kBFloat16: + dequantize_q3_k_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks); + break; + case torch::kFloat32: + dequantize_q3_k_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr(), output.data_ptr(), blk_size, ele_per_blk, num_blocks); + break; + default: + printf("target type not support\n"); + exit(0); + } cudaDeviceSynchronize(); return output; } -torch::Tensor dequantize_q2_k(torch::Tensor data, int blk_size, torch::Device device) { - int num_blocks = data.numel() / blk_size; +torch::Tensor dequantize_q2_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) { + int num_blocks = num_bytes / blk_size; const at::cuda::OptionalCUDAGuard device_guard(device); auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous); - auto data_gpu = torch::empty({data.numel()}, options); + auto data_gpu = torch::empty({num_bytes}, options); - data_gpu.copy_(data, false); + cudaMemcpy(data_gpu.data_ptr(), data, num_bytes, cudaMemcpyHostToDevice); + //data_gpu.copy_(data, false); // Create output tensor - auto output = torch::zeros({num_blocks, 256}, torch::dtype(torch::kFloat32).device(device)); - - // Launch kernel - dequantize_q2_k_kernel<<< 512, 256 >>>(data_gpu.data_ptr(), output.data_ptr(), blk_size, num_blocks); + auto output = torch::zeros({num_blocks, 256}, torch::dtype(target_dtype).device(device)); + switch (target_dtype) { + case torch::kFloat16: + dequantize_q2_k_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks); + break; + case torch::kBFloat16: + dequantize_q2_k_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks); + break; + case torch::kFloat32: + dequantize_q2_k_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr(), output.data_ptr(), blk_size, ele_per_blk, num_blocks); + break; + default: + printf("target type not support\n"); + exit(0); + } cudaDeviceSynchronize(); return output; } -torch::Tensor dequantize_iq4_xs(torch::Tensor data, int blk_size, torch::Device device) { - int num_blocks = data.numel() / blk_size; +torch::Tensor dequantize_iq4_xs(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) { + int num_blocks = num_bytes / blk_size; const at::cuda::OptionalCUDAGuard device_guard(device); auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous); - auto data_gpu = torch::empty({data.numel()}, options); + auto data_gpu = torch::empty({num_bytes}, options); - data_gpu.copy_(data, false); + cudaMemcpy(data_gpu.data_ptr(), data, num_bytes, cudaMemcpyHostToDevice); + //data_gpu.copy_(data, false); // Create output tensor - auto output = torch::zeros({num_blocks, 256}, torch::dtype(torch::kFloat32).device(device)); - - // Launch kernel - dequantize_iq4_xs_kernel<<< 512, 256 >>>(data_gpu.data_ptr(), output.data_ptr(), blk_size, num_blocks); + auto output = 
torch::zeros({num_blocks, 256}, torch::dtype(target_dtype).device(device)); + switch (target_dtype) { + case torch::kFloat16: + dequantize_iq4_xs_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks); + break; + case torch::kBFloat16: + dequantize_iq4_xs_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks); + break; + case torch::kFloat32: + dequantize_iq4_xs_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr(), output.data_ptr(), blk_size, ele_per_blk, num_blocks); + break; + default: + printf("target type not support\n"); + exit(0); + } cudaDeviceSynchronize(); return output; } diff --git a/ktransformers/ktransformers_ext/cuda/custom_gguf/ops.h b/ktransformers/ktransformers_ext/cuda/custom_gguf/ops.h index 666d455..1740cbf 100644 --- a/ktransformers/ktransformers_ext/cuda/custom_gguf/ops.h +++ b/ktransformers/ktransformers_ext/cuda/custom_gguf/ops.h @@ -1,11 +1,11 @@ /** - * @Description : + * @Description : * @Author : Azure-Tang * @Date : 2024-07-22 09:27:55 * @Version : 1.0.0 * @LastEditors : kkk1nak0 * @LastEditTime : 2024-08-12 03:48:46 - * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. + * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. **/ #pragma once @@ -13,10 +13,10 @@ #include #include -torch::Tensor dequantize_q8_0(torch::Tensor data, int blk_size, torch::Device device); -torch::Tensor dequantize_q6_k(torch::Tensor data, int blk_size, torch::Device device); -torch::Tensor dequantize_q5_k(torch::Tensor data, int blk_size, torch::Device device); -torch::Tensor dequantize_q4_k(torch::Tensor data, int blk_size, torch::Device device); -torch::Tensor dequantize_q3_k(torch::Tensor data, int blk_size, torch::Device device); -torch::Tensor dequantize_q2_k(torch::Tensor data, int blk_size, torch::Device device); -torch::Tensor dequantize_iq4_xs(torch::Tensor data, int blk_size, torch::Device device); +torch::Tensor dequantize_q8_0(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype); +torch::Tensor dequantize_q6_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype); +torch::Tensor dequantize_q5_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype); +torch::Tensor dequantize_q4_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype); +torch::Tensor dequantize_q3_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype); +torch::Tensor dequantize_q2_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype); +torch::Tensor dequantize_iq4_xs(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype); diff --git a/ktransformers/ktransformers_ext/cuda/test_dequant.py b/ktransformers/ktransformers_ext/cuda/test_dequant.py new file mode 100644 index 0000000..abca745 --- /dev/null +++ b/ktransformers/ktransformers_ext/cuda/test_dequant.py @@ -0,0 +1,16 @@ +import os +import sys +sys.path.insert(0,"/home/zbx/ktransformers") +from 
ktransformers.util.custom_gguf import GGUFLoader +import torch + +gguf_loader_1 = GGUFLoader("/mnt/data/model/DeepseekV3-q4km-gguf") +gguf_loader_2 = GGUFLoader("/mnt/data/chenht/model/gguf_for_ktransformers/DeepSeek-V3-bf16/") + +torch.set_default_dtype(torch.bfloat16) + +tensor_1 = gguf_loader_1.load_gguf_tensor("blk.0.attn_kv_a_mqa.weight", "cuda") +tensor_2 = gguf_loader_2.load_gguf_tensor("blk.0.attn_kv_a_mqa.weight", "cuda") + +print(tensor_1[0, -64:]) +print(tensor_2[0, -64:]) \ No newline at end of file diff --git a/ktransformers/ktransformers_ext/operators/custom_marlin/quantize/utils/marlin_utils.py b/ktransformers/ktransformers_ext/operators/custom_marlin/quantize/utils/marlin_utils.py index accbc00..fadfb11 100644 --- a/ktransformers/ktransformers_ext/operators/custom_marlin/quantize/utils/marlin_utils.py +++ b/ktransformers/ktransformers_ext/operators/custom_marlin/quantize/utils/marlin_utils.py @@ -90,7 +90,7 @@ def marlin_quantize( assert group_size <= size_k # Quantize (and apply act_order if provided) - w_ref, q_w, s, g_idx, rand_perm = quantize_weights(w, num_bits, group_size, + q_w, s, g_idx, rand_perm = quantize_weights(w, num_bits, group_size, act_order) # For act_order, sort the "weights" and "g_idx" so that group ids are @@ -107,7 +107,7 @@ def marlin_quantize( marlin_scale_perm_single[num_bits]) # Create result - res_list = [w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, rand_perm] + res_list = [marlin_q_w, marlin_s, g_idx, sort_indices, rand_perm] for i in range(len(res_list)): res_list[i] = res_list[i].to(w.device) diff --git a/ktransformers/ktransformers_ext/operators/custom_marlin/quantize/utils/quant_utils.py b/ktransformers/ktransformers_ext/operators/custom_marlin/quantize/utils/quant_utils.py index b3a0ba5..de73667 100644 --- a/ktransformers/ktransformers_ext/operators/custom_marlin/quantize/utils/quant_utils.py +++ b/ktransformers/ktransformers_ext/operators/custom_marlin/quantize/utils/quant_utils.py @@ -11,8 +11,7 @@ def get_pack_factor(num_bits): return 32 // num_bits -def permute_rows(q_w: torch.Tensor, w_ref: torch.Tensor, group_size: int): - assert q_w.shape == w_ref.shape +def permute_rows(q_w: torch.Tensor, group_size: int): orig_device = q_w.device k_size, _ = q_w.shape @@ -26,10 +25,8 @@ def permute_rows(q_w: torch.Tensor, w_ref: torch.Tensor, group_size: int): g_idx = g_idx[rand_perm].contiguous() q_w = q_w[rand_perm, :].contiguous() - w_ref = w_ref[rand_perm, :].contiguous() return ( - w_ref.to(device=orig_device), q_w.to(device=orig_device), g_idx.to(device=orig_device), rand_perm.to(device=orig_device), @@ -69,9 +66,6 @@ def quantize_weights(w: torch.Tensor, num_bits: int, group_size: int, q_w += half_q_val q_w = torch.clamp(q_w, 0, max_q_val) - # Compute ref (dequantized) - w_ref = (q_w - half_q_val).half() * s - # Restore original shapes if group_size < size_k: @@ -82,7 +76,6 @@ def quantize_weights(w: torch.Tensor, num_bits: int, group_size: int, return w q_w = reshape_w(q_w) - w_ref = reshape_w(w_ref) s = s.reshape((-1, size_n)).contiguous() @@ -95,10 +88,9 @@ def quantize_weights(w: torch.Tensor, num_bits: int, group_size: int, ), "For act_order, groupsize = {} must be less than size_k = {}".format( group_size, size_k) - w_ref, q_w, g_idx, rand_perm = permute_rows(q_w, w_ref, group_size) + q_w, g_idx, rand_perm = permute_rows(q_w, group_size) return ( - w_ref.to(device=orig_device), q_w.to(device=orig_device), s.to(device=orig_device), g_idx.to(device=orig_device), diff --git 
a/ktransformers/ktransformers_ext/operators/kvcache/kvcache_attn.cpp b/ktransformers/ktransformers_ext/operators/kvcache/kvcache_attn.cpp index c59cb94..4190c03 100644 --- a/ktransformers/ktransformers_ext/operators/kvcache/kvcache_attn.cpp +++ b/ktransformers/ktransformers_ext/operators/kvcache/kvcache_attn.cpp @@ -10,6 +10,8 @@ #include "kvcache.h" +#include + void KVCache::attention_kvhead_(const uint16_t *q_in_data, ggml_fp16_t *output, float *attn_lse, int batch_size, Backend *backend) { diff --git a/ktransformers/ktransformers_ext/operators/kvcache/kvcache_load_dump.cpp b/ktransformers/ktransformers_ext/operators/kvcache/kvcache_load_dump.cpp index eadf90f..4de217f 100644 --- a/ktransformers/ktransformers_ext/operators/kvcache/kvcache_load_dump.cpp +++ b/ktransformers/ktransformers_ext/operators/kvcache/kvcache_load_dump.cpp @@ -9,6 +9,9 @@ **/ #include "kvcache.h" + +#include + void KVCache::load_kvcache(std::string tensor_file_path, Backend *backend) { // Timer start auto start = std::chrono::high_resolution_clock::now(); diff --git a/ktransformers/ktransformers_ext/operators/kvcache/kvcache_read_write.cpp b/ktransformers/ktransformers_ext/operators/kvcache/kvcache_read_write.cpp index 998f1b0..0104905 100644 --- a/ktransformers/ktransformers_ext/operators/kvcache/kvcache_read_write.cpp +++ b/ktransformers/ktransformers_ext/operators/kvcache/kvcache_read_write.cpp @@ -10,6 +10,8 @@ #include "kvcache.h" +#include + void KVCache::get_anchor_one_block(ggml_fp16_t *anchor, int layer_id, int block_idx, Backend *backend) { // Timer start diff --git a/ktransformers/ktransformers_ext/operators/kvcache/kvcache_utils.cpp b/ktransformers/ktransformers_ext/operators/kvcache/kvcache_utils.cpp index f1d6f7d..c57d475 100644 --- a/ktransformers/ktransformers_ext/operators/kvcache/kvcache_utils.cpp +++ b/ktransformers/ktransformers_ext/operators/kvcache/kvcache_utils.cpp @@ -10,6 +10,8 @@ #include "kvcache.h" +#include + std::string ggml_type_to_string(ggml_type type) { switch (type) { case GGML_TYPE_F32: diff --git a/ktransformers/ktransformers_ext/triton/fp8gemm.py b/ktransformers/ktransformers_ext/triton/fp8gemm.py new file mode 100644 index 0000000..d5c913d --- /dev/null +++ b/ktransformers/ktransformers_ext/triton/fp8gemm.py @@ -0,0 +1,193 @@ +# Adopted from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/inference/kernel.py +from typing import Tuple + +import torch +import triton +import triton.language as tl +from triton import Config + + +@triton.jit +def act_quant_kernel(x_ptr, y_ptr, s_ptr, BLOCK_SIZE: tl.constexpr): + """ + Quantizes the input tensor `x_ptr` and stores the result in `y_ptr` and the scaling factor in `s_ptr`. + + Args: + x_ptr (triton.Pointer): Pointer to the input tensor. + y_ptr (triton.Pointer): Pointer to the output tensor where quantized values will be stored. + s_ptr (triton.Pointer): Pointer to the output tensor where scaling factors will be stored. + BLOCK_SIZE (tl.constexpr): The size of the block to be processed by each program instance. + + Returns: + None + """ + pid = tl.program_id(axis=0) + offs = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + x = tl.load(x_ptr + offs).to(tl.float32) + s = tl.max(tl.abs(x)) / 448. + y = x / s + y = y.to(y_ptr.dtype.element_ty) + tl.store(y_ptr + offs, y) + tl.store(s_ptr + pid, s) + + +def act_quant(x: torch.Tensor, block_size: int = 128) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Quantizes the input tensor `x` using block-wise quantization. + + Args: + x (torch.Tensor): The input tensor to be quantized. 
Must be contiguous and its last dimension size must be divisible by `block_size`. + block_size (int, optional): The size of the blocks to be used for quantization. Default is 128. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: A tuple containing: + - The quantized tensor with dtype `torch.float8_e4m3fn`. + - A tensor of scaling factors with dtype `torch.float32`. + """ + assert x.is_contiguous(), 'Input tensor must be contiguous' + assert x.size(-1) % block_size == 0, f'Last dimension size must be divisible by block_size (block_size={block_size})' + y = torch.empty_like(x, dtype=torch.float8_e4m3fn) + s = x.new_empty(*x.size()[:-1], x.size(-1) // block_size, dtype=torch.float32) + grid = lambda meta: (triton.cdiv(x.numel(), meta['BLOCK_SIZE']), ) + act_quant_kernel[grid](x, y, s, BLOCK_SIZE=block_size) + return y, s + + +@triton.jit +def weight_dequant_kernel(x_ptr, s_ptr, y_ptr, M, N, BLOCK_SIZE: tl.constexpr): + """ + Dequantizes weights using the provided scaling factors and stores the result. + + Args: + x_ptr (tl.pointer): Pointer to the quantized weights. + s_ptr (tl.pointer): Pointer to the scaling factors. + y_ptr (tl.pointer): Pointer to the output buffer for dequantized weights. + M (int): Number of rows in the weight matrix. + N (int): Number of columns in the weight matrix. + BLOCK_SIZE (tl.constexpr): Size of the block for tiling. + + Returns: + None + """ + pid_m = tl.program_id(axis=0) + pid_n = tl.program_id(axis=1) + n = tl.cdiv(N, BLOCK_SIZE) + offs_m = pid_m * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + offs_n = pid_n * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + offs = offs_m[:, None] * N + offs_n[None, :] + mask = (offs_m[:, None] < M) & (offs_n[None, :] < N) + x = tl.load(x_ptr + offs, mask=mask).to(tl.float32) + s = tl.load(s_ptr + pid_m * n + pid_n) + y = x * s + tl.store(y_ptr + offs, y, mask=mask) + + +def weight_dequant(x: torch.Tensor, s: torch.Tensor, block_size: int = 128) -> torch.Tensor: + """ + Dequantizes the given weight tensor using the provided scale tensor. + + Args: + x (torch.Tensor): The quantized weight tensor of shape (M, N). + s (torch.Tensor): The scale tensor of shape (M, N). + block_size (int, optional): The block size to use for dequantization. Defaults to 128. + + Returns: + torch.Tensor: The dequantized weight tensor of the same shape as `x`. + + Raises: + AssertionError: If `x` or `s` are not contiguous or if their dimensions are not 2. + """ + assert x.is_contiguous() and s.is_contiguous(), 'Input tensors must be contiguous' + assert x.dim() == 2 and s.dim() == 2, 'Input tensors must have 2 dimensions' + M, N = x.size() + y = torch.empty_like(x, dtype=torch.get_default_dtype()) + grid = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE']), triton.cdiv(N, meta['BLOCK_SIZE'])) + with torch.cuda.device(x.device): + weight_dequant_kernel[grid](x, s, y, M, N, BLOCK_SIZE=block_size) + return y + + +fp8_gemm_configs = [ + Config({'BLOCK_SIZE_M': block_m, 'BLOCK_SIZE_N': block_n, 'BLOCK_SIZE_K': 128}, num_stages=num_stages, num_warps=8) + for block_m in [16, 32, 64] for block_n in [32, 64, 128] for num_stages in [3, 4, 5, 6] +] + +@triton.autotune(configs=fp8_gemm_configs, key=['N', 'K']) +@triton.jit +def fp8_gemm_kernel(a_ptr, b_ptr, c_ptr, + a_s_ptr, b_s_ptr, + M, N: tl.constexpr, K: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr): + """ + Performs a matrix multiplication operation on FP8 matrices with scaling factors. + + Args: + a_ptr (tl.tensor): Pointer to the first input matrix A. 
+ b_ptr (tl.tensor): Pointer to the second input matrix B. + c_ptr (tl.tensor): Pointer to the output matrix C. + a_s_ptr (tl.tensor): Pointer to the scaling factors for matrix A. + b_s_ptr (tl.tensor): Pointer to the scaling factors for matrix B. + M (int): Number of rows in matrix A and C. + N (tl.constexpr): Number of columns in matrix B and C. + K (tl.constexpr): Number of columns in matrix A and rows in matrix B. + BLOCK_SIZE_M (tl.constexpr): Block size for the M dimension. + BLOCK_SIZE_N (tl.constexpr): Block size for the N dimension. + BLOCK_SIZE_K (tl.constexpr): Block size for the K dimension. + + Returns: + None + """ + pid_m = tl.program_id(axis=0) + pid_n = tl.program_id(axis=1) + k = tl.cdiv(K, BLOCK_SIZE_K) + offs_m = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M + offs_n = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = a_ptr + offs_m[:, None] * K + offs_k[None, :] + b_ptrs = b_ptr + offs_n[None, :] * K + offs_k[:, None] + a_s_ptrs = a_s_ptr + offs_m * k + b_s_ptrs = b_s_ptr + (offs_n // BLOCK_SIZE_K) * k + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + for i in range(k): + a = tl.load(a_ptrs, mask=offs_k[None, :] < K - i * BLOCK_SIZE_K, other=0.0) + b = tl.load(b_ptrs, mask=offs_k[:, None] < K - i * BLOCK_SIZE_K, other=0.0) + a_s = tl.load(a_s_ptrs) + b_s = tl.load(b_s_ptrs) + accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :] + a_ptrs += BLOCK_SIZE_K + b_ptrs += BLOCK_SIZE_K + a_s_ptrs += 1 + b_s_ptrs += 1 + c = accumulator.to(c_ptr.dtype.element_ty) + offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = c_ptr + offs_m[:, None] * N + offs_n[None, :] + mask = (offs_m[:, None] < M) & (offs_n[None, :] < N) + tl.store(c_ptrs, c, mask=mask) + + +def fp8_gemm(a: torch.Tensor, a_s: torch.Tensor, b: torch.Tensor, b_s: torch.Tensor): + """ + Perform a matrix multiplication using FP8 precision. + + Args: + a (torch.Tensor): The first input matrix, must be contiguous. + a_s (torch.Tensor): The scaling factor for the first input matrix, must be contiguous. + b (torch.Tensor): The second input matrix, must be contiguous. + b_s (torch.Tensor): The scaling factor for the second input matrix, must be contiguous. + + Returns: + torch.Tensor: The result of the matrix multiplication. 
+ """ + assert a.is_contiguous() and b.is_contiguous(), 'Input tensors must be contiguous' + assert a_s.is_contiguous() and b_s.is_contiguous(), 'Scaling factor tensors must be contiguous' + K = a.size(-1) + M = a.numel() // K + N = b.size(0) + c = a.new_empty(*a.size()[:-1], N, dtype=torch.get_default_dtype()) + grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']), triton.cdiv(N, META['BLOCK_SIZE_N'])) + fp8_gemm_kernel[grid](a, b, c, a_s, b_s, M, N, K) + return c \ No newline at end of file diff --git a/ktransformers/local_chat.py b/ktransformers/local_chat.py index 676ea67..4acaf86 100644 --- a/ktransformers/local_chat.py +++ b/ktransformers/local_chat.py @@ -28,8 +28,9 @@ from ktransformers.models.modeling_qwen2_moe import Qwen2MoeForCausalLM from ktransformers.models.modeling_deepseek_v3 import DeepseekV3ForCausalLM from ktransformers.models.modeling_llama import LlamaForCausalLM from ktransformers.models.modeling_mixtral import MixtralForCausalLM -from ktransformers.util.utils import prefill_and_generate +from ktransformers.util.utils import prefill_and_generate, get_compute_capability from ktransformers.server.config.config import Config +from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled custom_models = { "DeepseekV2ForCausalLM": DeepseekV2ForCausalLM, @@ -53,7 +54,7 @@ default_optimize_rules = { def local_chat( model_path: str | None = None, - optimize_rule_path: str = None, + optimize_config_path: str = None, gguf_path: str | None = None, max_new_tokens: int = 300, cpu_infer: int = Config().cpu_infer, @@ -61,9 +62,9 @@ def local_chat( prompt_file : str | None = None, mode: str = "normal", force_think: bool = False, + chunk_prefill_size: int = 8192 ): - torch.set_grad_enabled(False) Config().cpu_infer = cpu_infer @@ -94,12 +95,12 @@ def local_chat( config, trust_remote_code=True, attn_implementation="flash_attention_2" ) - if optimize_rule_path is None: + if optimize_config_path is None: if config.architectures[0] in default_optimize_rules: print("using default_optimize_rule for", config.architectures[0]) - optimize_rule_path = default_optimize_rules[config.architectures[0]] + optimize_config_path = default_optimize_rules[config.architectures[0]] else: - optimize_rule_path = input( + optimize_config_path = input( "please input the path of your rule file(yaml file containing optimize rules):" ) @@ -107,18 +108,18 @@ def local_chat( gguf_path = input( "please input the path of your gguf file(gguf file in the dir containing input gguf file must all belong to current model):" ) - optimize_and_load_gguf(model, optimize_rule_path, gguf_path, config) + optimize_and_load_gguf(model, optimize_config_path, gguf_path, config) try: - model.generation_config = GenerationConfig.from_pretrained(model_path) - except: - gen_config = GenerationConfig( - max_length=128, - temperature=0.7, - top_p=0.9, - do_sample=True - ) - model.generation_config = gen_config + model.generation_config = GenerationConfig.from_pretrained(model_path) + except Exception as e: + print(f"generation config can't auto create, make default. 
Message: {e}") + gen_config = GenerationConfig( + temperature=0.6, + top_p=0.95, + do_sample=True + ) + model.generation_config = gen_config # model.generation_config = GenerationConfig.from_pretrained(model_path) if model.generation_config.pad_token_id is None: model.generation_config.pad_token_id = model.generation_config.eos_token_id @@ -167,13 +168,17 @@ def local_chat( if mode == 'long_context': assert Config().long_context_config['max_seq_len'] > input_tensor.shape[1] + max_new_tokens, \ "please change max_seq_len in ~/.ktransformers/config.yaml" - torch.set_default_dtype( - torch.bfloat16 - ) # TODO: Remove this, replace dtype using config - generated = prefill_and_generate( - model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode, force_think - ) + + if system != "Windows" and (config.architectures[0] == "DeepseekV2ForCausalLM" or config.architectures[0] == "DeepseekV3ForCausalLM") and flashinfer_enabled and get_compute_capability() >= 8: + generated = prefill_and_generate( + model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode = mode, force_think = force_think, chunk_prefill_size = chunk_prefill_size, + use_flashinfer_mla = True, num_heads = config.num_attention_heads, head_dim_ckv = config.kv_lora_rank, head_dim_kpe = config.qk_rope_head_dim, q_head_dim = config.qk_rope_head_dim + config.qk_nope_head_dim + ) + else: + generated = prefill_and_generate( + model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode = mode, force_think = force_think, chunk_prefill_size = chunk_prefill_size, + ) if __name__ == "__main__": - fire.Fire(local_chat) \ No newline at end of file + fire.Fire(local_chat) diff --git a/ktransformers/models/custom_cache.py b/ktransformers/models/custom_cache.py index e402506..434399f 100644 --- a/ktransformers/models/custom_cache.py +++ b/ktransformers/models/custom_cache.py @@ -51,13 +51,34 @@ class StaticCache(transformers.StaticCache): cache_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim) if config.architectures[0] == "DeepseekV2ForCausalLM" or config.architectures[0] == "DeepseekV3ForCausalLM": # TODO: for deepseek, cache_shape is different whether using Absorbed MLA, check it automatically - # key_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.qk_rope_head_dim + config.qk_nope_head_dim) - # value_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.v_head_dim) - key_shape = (max_batch_size, 1, self.max_cache_len, config.qk_rope_head_dim) - value_shape = (max_batch_size, 1, self.max_cache_len, config.kv_lora_rank) + self.page_size = 64 + self.max_pages = (self.max_cache_len + self.page_size - 1) // self.page_size + latent_shape = (self.max_pages, self.page_size, 1, config.kv_lora_rank + config.qk_rope_head_dim) + self.kv_lora_rank = config.kv_lora_rank + self.qk_rope_head_dim = config.qk_rope_head_dim + # TODO: support real page table + self.page_table_map = dict() + self.page_table_list = [] + for idx in range(config.num_hidden_layers): + if isinstance(device, dict): + target_device = device[f"blk.{idx}.self_attn"]["generate_device"] + else: + target_device = device + + if target_device not in self.page_table_map: + page_table = torch.zeros((max_batch_size, self.max_pages), dtype=torch.int32, device=target_device) + for seq_id in range(max_batch_size): + page_table[seq_id, :] = torch.arange(seq_id * self.max_pages, seq_id * self.max_pages + self.max_pages, dtype=torch.int32, device=target_device) + 
self.page_table_map[target_device] = page_table + + self.page_table_list.append(self.page_table_map[target_device]) + + self.is_MLA = True + self.is_page = True else: key_shape = cache_shape value_shape = cache_shape + self.is_MLA = False self.past_tokens = [] self.num_hidden_layers = config.num_hidden_layers @@ -68,10 +89,17 @@ class StaticCache(transformers.StaticCache): target_device = device[f"blk.{idx}.self_attn"]["generate_device"] else: target_device = device - new_layer_key_cache = torch.zeros(key_shape, dtype=self.dtype, device=target_device) - new_layer_value_cache = torch.zeros(value_shape, dtype=self.dtype, device=target_device) - torch._dynamo.mark_static_address(new_layer_key_cache) - torch._dynamo.mark_static_address(new_layer_value_cache) + + if self.is_MLA: + new_layer_key_cache = torch.zeros(latent_shape, dtype=self.dtype, device=target_device) + new_layer_value_cache = None + torch._dynamo.mark_static_address(new_layer_key_cache) + else: + new_layer_key_cache = torch.zeros(key_shape, dtype=self.dtype, device=target_device) + new_layer_value_cache = torch.zeros(value_shape, dtype=self.dtype, device=target_device) + torch._dynamo.mark_static_address(new_layer_key_cache) + torch._dynamo.mark_static_address(new_layer_value_cache) + self.key_cache.append(new_layer_key_cache) self.value_cache.append(new_layer_value_cache) self.past_tokens.append(0) @@ -104,11 +132,19 @@ class StaticCache(transformers.StaticCache): cache_position = cache_kwargs.get("cache_position") k_out = self.key_cache[layer_idx] v_out = self.value_cache[layer_idx] - #print(cache_position) - k_out[:, :, cache_position] = key_states - v_out[:, :, cache_position] = value_states self.past_tokens[layer_idx] += cache_position.size(0) - return k_out, v_out + #print(cache_position) + if self.is_MLA: + page_idx = cache_position // self.page_size + page_offset = cache_position % self.page_size + # key shape (self.max_pages, self.page_size, 1, config.kv_lora_rank + config.qk_rope_head_dim) + k_out[page_idx, page_offset, :, :self.kv_lora_rank] = key_states + k_out[page_idx, page_offset, :, self.kv_lora_rank:] = value_states + return k_out, self.page_table_list[layer_idx] + else: + k_out[:, :, cache_position] = key_states + v_out[:, :, cache_position] = value_states + return k_out, v_out def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: """Returns the sequence length of the cached states that were seen by the model.""" @@ -134,8 +170,21 @@ class StaticCache(transformers.StaticCache): for layer_idx in range(len(self.key_cache)): # In-place ops prevent breaking the static address self.key_cache[layer_idx].zero_() - self.value_cache[layer_idx].zero_() + if self.value_cache[layer_idx] is not None: + self.value_cache[layer_idx].zero_() + self.past_tokens[layer_idx] = 0 + + def remove_suffix(self, start_pos): + for layer_idx in range(len(self.key_cache)): + # In-place ops prevent breaking the static address + if self.is_MLA: + k_cache = self.key_cache[layer_idx] + k_cache.view(-1, k_cache.shape[-1])[start_pos:].zero_() + else: + self.key_cache[layer_idx][..., start_pos:, :].zero_() + self.value_cache[layer_idx][..., start_pos:, :].zero_() + self.past_tokens[layer_idx] = start_pos def get_max_cache_shape(self) -> Tuple[int, int, int, int]: """Returns the maximum shape of the cache.""" - return self.max_cache_len \ No newline at end of file + return self.max_cache_len diff --git a/ktransformers/models/modeling_deepseek.py b/ktransformers/models/modeling_deepseek.py index 692020d..e14a521 100644 --- 
a/ktransformers/models/modeling_deepseek.py +++ b/ktransformers/models/modeling_deepseek.py @@ -1742,8 +1742,7 @@ class DeepseekV2ForCausalLM(DeepseekV2PreTrainedModel): ) hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - logits = logits[:,-1,:].unsqueeze(0).float() + logits = self.lm_head(hidden_states[:,-1:,:]).float() loss = None if labels is not None: diff --git a/ktransformers/models/modeling_deepseek_v3.py b/ktransformers/models/modeling_deepseek_v3.py index 277258a..952eed7 100644 --- a/ktransformers/models/modeling_deepseek_v3.py +++ b/ktransformers/models/modeling_deepseek_v3.py @@ -1699,7 +1699,7 @@ class DeepseekV3ForCausalLM(DeepseekV3PreTrainedModel): ) hidden_states = outputs[0] - logits = self.lm_head(hidden_states.to(self.lm_head.weight.device)) + logits = self.lm_head(hidden_states[:,-1:,:]) logits = logits.float() loss = None diff --git a/ktransformers/operators/RoPE.py b/ktransformers/operators/RoPE.py index dc5902c..adc1c5f 100644 --- a/ktransformers/operators/RoPE.py +++ b/ktransformers/operators/RoPE.py @@ -42,7 +42,7 @@ class RotaryEmbedding(BaseInjectedModule, DeepseekV2RotaryEmbedding): **kwargs, ): BaseInjectedModule.__init__( - self, key, gguf_loader, config, orig_module, generate_device, **kwargs + self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs ) self.orig_module.__init__( orig_module.dim, orig_module.max_position_embeddings, orig_module.base @@ -72,7 +72,7 @@ class RotaryEmbeddingV3(BaseInjectedModule): **kwargs, ): BaseInjectedModule.__init__( - self, key, gguf_loader, config, orig_module, generate_device, **kwargs + self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs ) self.generate_device = generate_device self.prefill_device = prefill_device @@ -122,7 +122,7 @@ class RotaryEmbeddingV2(BaseInjectedModule, LlamaRotaryEmbedding): **kwargs, ): BaseInjectedModule.__init__( - self, key, gguf_loader, config, orig_module, generate_device, **kwargs + self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs ) self.orig_module.__init__( orig_module.dim, @@ -160,7 +160,7 @@ class YarnRotaryEmbedding(BaseInjectedModule, DeepseekV2YarnRotaryEmbedding): **kwargs, ): BaseInjectedModule.__init__( - self, key, gguf_loader, config, orig_module, generate_device, **kwargs + self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs ) self.orig_module.__init__( orig_module.dim, @@ -204,7 +204,7 @@ class YarnRotaryEmbedding(BaseInjectedModule, DeepseekV2YarnRotaryEmbedding): # **kwargs, # ): # BaseInjectedModule.__init__( -# self, key, gguf_loader, config, orig_module, generate_device, **kwargs +# self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs # ) # self.generate_device = generate_device # self.prefill_device = prefill_device @@ -230,7 +230,7 @@ class YarnRotaryEmbeddingV3(BaseInjectedModule): **kwargs, ): BaseInjectedModule.__init__( - self, key, gguf_loader, config, orig_module, generate_device, **kwargs + self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs ) self.generate_device = generate_device self.prefill_device = prefill_device @@ -332,11 +332,12 @@ class DynamicNTKScalingRotaryEmbedding( gguf_loader: GGUFLoader, config: PretrainedConfig, orig_module: nn.Module, - device: str = "cuda", + prefill_device: str = "cuda", + generate_device: str = "cuda", **kwargs, ): BaseInjectedModule.__init__( - self, key, gguf_loader, config, orig_module, device, 
**kwargs + self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs ) self.orig_module.__init__( orig_module.dim, diff --git a/ktransformers/operators/attention.py b/ktransformers/operators/attention.py index 9b47b89..a9bbea6 100644 --- a/ktransformers/operators/attention.py +++ b/ktransformers/operators/attention.py @@ -13,215 +13,30 @@ from ktransformers.models.configuration_deepseek import DeepseekV2Config from ktransformers.models.configuration_llama import LlamaConfig from ktransformers.models.modeling_llama import LlamaRotaryEmbedding from ktransformers.models.modeling_deepseek import DeepseekV2Attention, apply_rotary_pos_emb -from ktransformers.models.modeling_deepseek_v3 import DeepseekV3Attention -from ktransformers.models.modeling_deepseek_v3 import apply_rotary_pos_emb as apply_rotary_pos_emb_v3 from typing import Optional, Tuple from ktransformers.operators.base_operator import BaseInjectedModule from ktransformers.util.custom_gguf import GGUFLoader +from ktransformers.util.utils import get_compute_capability import logging from transformers.configuration_utils import PretrainedConfig from transformers.cache_utils import Cache +from flash_attn import flash_attn_func +from ktransformers.operators.triton_attention import decode_attention_fwd_grouped +import os +from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled +if flashinfer_enabled: + from ktransformers.operators.flashinfer_wrapper import MLAWrapperSingleton + logger = logging.getLogger("attention") -class KDeepseekV3Attention(BaseInjectedModule, DeepseekV3Attention): - """Multi-headed attention from 'Attention Is All You Need' paper""" - attn_mask: Optional[torch.Tensor] = None - - def __init__(self, - key: str, - gguf_loader : GGUFLoader, - config: PretrainedConfig, - orig_module: nn.Module, - device: str = "cuda", - chunck_size: int = 1000, - **kwargs): - BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs) - self.orig_module.__init__(orig_module.config, - orig_module.layer_idx) - self.chunck_size = chunck_size # TODO, generate chunck_size automatically. 
- self.softmax_scale = self.q_head_dim ** (-0.5) - - def get_absorbed(self) -> Tuple[torch.Tensor, torch.Tensor]: - if not (hasattr(self, 'q_absorb') and hasattr(self, 'out_absorb')): - kv_b_proj = self.kv_b_proj.weight.view(self.num_heads, -1, self.kv_lora_rank) - q_absorb = kv_b_proj[:, :self.qk_nope_head_dim, :].reshape(-1, self.kv_lora_rank) - out_absorb = kv_b_proj[:, self.qk_nope_head_dim:, :].reshape(-1, self.kv_lora_rank) - self.q_absorb = nn.Linear(self.kv_lora_rank, self.num_heads * self.qk_nope_head_dim, - bias=False, dtype=q_absorb.dtype, device=q_absorb.device) - self.q_absorb.weight.data = q_absorb - self.out_absorb = nn.Linear(self.kv_lora_rank, self.num_heads * self.v_head_dim, - bias=False, dtype=out_absorb.dtype, device=out_absorb.device) - self.out_absorb.weight.data = out_absorb - del self.orig_module.kv_b_proj - q_absorb = self.q_absorb.weight.view(self.num_heads, self.qk_nope_head_dim, self.kv_lora_rank) - out_absorb = self.out_absorb.weight.view(self.num_heads, self.v_head_dim, self.kv_lora_rank) - return q_absorb, out_absorb - - def forward_chunck( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, - **kwargs - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - bsz, q_len, _ = hidden_states.size() - if self.q_lora_rank is None: - q = self.q_proj(hidden_states) - else: - q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) - q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2) - q_nope, q_pe = torch.split( - q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1 - ) - - compressed_kv = self.kv_a_proj_with_mqa(hidden_states) - compressed_kv, k_pe = torch.split( - compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1 - ) - compressed_kv = self.kv_a_layernorm(compressed_kv) - k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2) - - kv_seq_len = k_pe.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
- ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - cos, sin = self.rotary_emb(q_pe, position_ids) - q_pe, k_pe = apply_rotary_pos_emb_v3(q_pe, k_pe, cos, sin) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models - compressed_kv = compressed_kv.unsqueeze(1) - k_pe, compressed_kv = past_key_value.update(k_pe, compressed_kv, self.layer_idx, cache_kwargs) - compressed_kv = compressed_kv.squeeze(1) - #if cache_position is not None: - # compressed_kv = compressed_kv[:,: cache_position[-1] + 1,:] - # k_pe = k_pe[:,:,: cache_position[-1] + 1,:] - q_absorb, out_absorb = self.get_absorbed() - - q_nope = torch.matmul(q_nope, q_absorb) - attn_weights = (torch.matmul(q_pe, k_pe.mT) + torch.matmul(q_nope, compressed_kv.unsqueeze(-3).mT)) * self.softmax_scale - """ - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - assert attention_mask is not None - """ - if attention_mask is not None: - """ - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - """ - #causal_mask = attention_mask[:, :, :, : kv_seq_len] - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax( - attn_weights, dim=-1, dtype=torch.float32 - ).to(q_pe.dtype) - attn_weights = nn.functional.dropout( - attn_weights, p=self.attention_dropout, training=self.training - ) - attn_output = torch.einsum('bhql,blc->bhqc', attn_weights, compressed_kv) - - attn_output = torch.matmul(attn_output, out_absorb.mT) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.v_head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.v_head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - - attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim) - - attn_output = self.o_proj(attn_output) - - return attn_output, attn_weights, past_key_value - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - bsz, q_len, _ = hidden_states.size() - - if q_len <= self.chunck_size: - return self.forward_chunck( - hidden_states, - attention_mask, - position_ids, - past_key_value, - output_attentions, - use_cache, - cache_position, - **kwargs - ) - - assert output_attentions == False, "output_attentions is not supported when using chunked attention" - attn_output = None - attn_weight = None - cur_idx = 0 - while cur_idx < q_len: - if attention_mask is not None: - chunk_mask = attention_mask[:, :, cur_idx:min(cur_idx + self.chunck_size, q_len), ...] - else: - # generate chunk_mask automatically. 
- self.attn_mask = \ - torch.zeros(1, 1, self.chunck_size, past_key_value.max_cache_len, device=hidden_states.device) \ - if self.attn_mask is None \ - else self.attn_mask - self.attn_mask[:, :, :, cur_idx:min(cur_idx+self.chunck_size, past_key_value.max_cache_len)] = \ - -1e+38 * torch.triu(torch.ones(self.chunck_size, self.chunck_size, device=hidden_states.device), diagonal=1)\ - [:,:min(self.chunck_size, min(past_key_value.max_cache_len-cur_idx, self.chunck_size))] - self.attn_mask[:, :, :, cur_idx+self.chunck_size:] = -1e+38 - self.attn_mask[:, :, :, :cur_idx] = 0 - chunk_mask = torch.narrow(self.attn_mask, 2, 0, min(self.chunck_size, q_len-cur_idx)) - - cur_output, cur_attn_weight = self.forward_chunck( - hidden_states[:, cur_idx:min(cur_idx + self.chunck_size, q_len), ...], - chunk_mask, - position_ids[:, cur_idx:min(cur_idx + self.chunck_size, q_len)], - past_key_value, - output_attentions, - use_cache, - cache_position[cur_idx:min(cur_idx + self.chunck_size, q_len)], - **kwargs - ) - cur_idx += self.chunck_size - if attn_output is None: - attn_output = cur_output - attn_weight = cur_attn_weight - else: - attn_output = torch.cat((attn_output, cur_output), dim=-2) - attn_weight = torch.cat((attn_weight, cur_attn_weight), dim=-2) - - return attn_output, attn_weight, past_key_value +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) +# V3 MLA is same to V2 class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention): """Multi-headed attention from 'Attention Is All You Need' paper""" attn_mask: Optional[torch.Tensor] = None @@ -231,29 +46,25 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention): gguf_loader : GGUFLoader, config: PretrainedConfig, orig_module: nn.Module, - device: str = "cuda", + prefill_device: str = "cuda", + generate_device: str = "cuda", chunck_size: int = 1000, + absorb_for_prefill: bool = False, **kwargs): - BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs) + BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs) self.orig_module.__init__(orig_module.config, orig_module.layer_idx) self.chunck_size = chunck_size # TODO, generate chunck_size automatically. 
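+        # mla_wrapper lazily holds the per-device flashinfer MLA wrapper (created in
+        # forward_linux_flashinfer); absorb_for_prefill additionally routes prefill
+        # through the absorbed (latent-space) attention path instead of materialising
+        # the full per-head K/V tensors.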
+ self.mla_wrapper = None + self.absorb_for_prefill = absorb_for_prefill def get_absorbed(self) -> Tuple[torch.Tensor, torch.Tensor]: if not (hasattr(self, 'q_absorb') and hasattr(self, 'out_absorb')): kv_b_proj = self.kv_b_proj.weight.view(self.num_heads, -1, self.kv_lora_rank) - q_absorb = kv_b_proj[:, :self.qk_nope_head_dim, :].reshape(-1, self.kv_lora_rank) - out_absorb = kv_b_proj[:, self.qk_nope_head_dim:, :].reshape(-1, self.kv_lora_rank) - self.q_absorb = nn.Linear(self.kv_lora_rank, self.num_heads * self.qk_nope_head_dim, - bias=False, dtype=q_absorb.dtype, device=q_absorb.device) - self.q_absorb.weight.data = q_absorb - self.out_absorb = nn.Linear(self.kv_lora_rank, self.num_heads * self.v_head_dim, - bias=False, dtype=out_absorb.dtype, device=out_absorb.device) - self.out_absorb.weight.data = out_absorb - del self.orig_module.kv_b_proj - q_absorb = self.q_absorb.weight.view(self.num_heads, self.qk_nope_head_dim, self.kv_lora_rank) - out_absorb = self.out_absorb.weight.view(self.num_heads, self.v_head_dim, self.kv_lora_rank) - return q_absorb, out_absorb + self.q_absorb = kv_b_proj[:, :self.qk_nope_head_dim, :].view(self.num_heads, self.qk_nope_head_dim, self.kv_lora_rank) + self.out_absorb = kv_b_proj[:, self.qk_nope_head_dim:, :].view(self.num_heads, self.v_head_dim, self.kv_lora_rank) + + return self.q_absorb, self.out_absorb def forward_chunck( self, @@ -275,6 +86,8 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention): q_nope, q_pe = torch.split( q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1 ) + # q_nope [bsz, self.num_heads, q_len, self.qk_nope_head_dim] + # q_pe [bsz, self.num_heads, q_len, self.qk_rope_head_dim] compressed_kv = self.kv_a_proj_with_mqa(hidden_states) compressed_kv, k_pe = torch.split( @@ -287,7 +100,7 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention): if past_key_value is not None: if self.layer_idx is None: raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + f"The cache structure has changed since transformer version v4.36. If you are using {self.__class__.__name__} " "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " "with a layer index." 
) @@ -298,16 +111,36 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention): if past_key_value is not None: cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models - compressed_kv = compressed_kv.unsqueeze(1) - k_pe, compressed_kv = past_key_value.update(k_pe, compressed_kv, self.layer_idx, cache_kwargs) - compressed_kv = compressed_kv.squeeze(1) - #if cache_position is not None: - # compressed_kv = compressed_kv[:,: cache_position[-1] + 1,:] - # k_pe = k_pe[:,:,: cache_position[-1] + 1,:] + + # compressed_kv [bsz, q_len, self.kv_lora_rank] + # k_pe [bsz, 1, q_len, self.qk_rope_head_dim] + k_pe = k_pe.transpose(1,2) + compressed_kv = compressed_kv.unsqueeze(2) + compressed_kv_with_k_pe, _ = past_key_value.update(compressed_kv, k_pe, self.layer_idx, cache_kwargs) + compressed_kv, k_pe = torch.split( + compressed_kv_with_k_pe, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1 + ) + # k_pe [pages, page_size, 1, self.qk_rope_head_dim] + # compressed_kv [pages, page_size, 1, self.kv_lora_rank] + q_absorb, out_absorb = self.get_absorbed() + # q_nope [bsz, self.num_heads, q_len, self.qk_nope_head_dim] + # q_pe [bsz, self.num_heads, q_len, self.qk_rope_head_dim] + k_pe = k_pe.view(bsz, 1, -1, self.qk_rope_head_dim)[:,:,:attention_mask.size(-1),:] + compressed_kv = compressed_kv.view(bsz, 1, -1, self.kv_lora_rank)[:,:,:attention_mask.size(-1),:] + # k_pe [bsz, 1, cache_len, self.qk_rope_head_dim] + # compressed_kv [bsz, 1, cache_len,self.kv_lora_rank] q_nope = torch.matmul(q_nope, q_absorb) - attn_weights = (torch.matmul(q_pe, k_pe.mT) + torch.matmul(q_nope, compressed_kv.unsqueeze(-3).mT)) * self.softmax_scale + #print(q_pe.shape) + #print(k_pe.shape) + #print(q_nope.shape) + #print(compressed_kv.shape) + + attn_weights = (torch.matmul(q_pe, k_pe.mT) + torch.matmul(q_nope, compressed_kv.mT)) * self.softmax_scale + + #attn_weights [bsz, self.num_heads, q_len, kv_seq_len] + compressed_kv = compressed_kv.squeeze(1) """ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): raise ValueError( @@ -333,8 +166,9 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention): attn_weights = nn.functional.dropout( attn_weights, p=self.attention_dropout, training=self.training ) + attn_output = torch.einsum('bhql,blc->bhqc', attn_weights, compressed_kv) - + attn_output = torch.matmul(attn_output, out_absorb.mT) if attn_output.size() != (bsz, self.num_heads, q_len, self.v_head_dim): @@ -344,14 +178,341 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention): ) attn_output = attn_output.transpose(1, 2).contiguous() - + attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim) attn_output = self.o_proj(attn_output) return attn_output, None, past_key_value - def forward( + def forward_linux_triton( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + + bsz, q_len, _ = hidden_states.size() + + if self.q_lora_rank is None: + q = self.q_proj(hidden_states) + else: + q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) + q = q.view(bsz, q_len, self.num_heads, self.q_head_dim) + q_nope, q_pe = torch.split( + q, [self.qk_nope_head_dim, 
self.qk_rope_head_dim], dim=-1 + ) + + compressed_kv = self.kv_a_proj_with_mqa(hidden_states) + compressed_kv, k_pe = torch.split( + compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1 + ) + compressed_kv = self.kv_a_layernorm(compressed_kv) + k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim) + compressed_kv = compressed_kv.view(bsz, q_len, 1, self.kv_lora_rank) + + kv_seq_len = q_len + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since transformer version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + cos, sin = self.rotary_emb(q_pe, position_ids) + q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, unsqueeze_dim=2) + # q_pe [bsz, q_len, self.num_heads, self.qk_rope_head_dim] k_pe [bsz, q_len, 1, self.qk_rope_head_dim] + + # decode + if q_len == 1: + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models + compressed_kv_with_k_pe, page_table = past_key_value.update(compressed_kv, k_pe, self.layer_idx, cache_kwargs) + compressed_kv = compressed_kv_with_k_pe [:, :, :, :self.kv_lora_rank] # for speed + # compressed_kv_with_k_pe [bsz, q_len, 1, self.kv_lora_rank + self.qk_rope_head_dim] + # compressed_kv [bsz, q_len, 1, self.kv_lora_rank] + + # q_nope [bsz, q_len, self.num_heads, self.qk_nope_head_dim] + # q_absorb [self.num_heads, self.qk_nope_head_dim, self.kv_lora_rank] + q_absorb, out_absorb = self.get_absorbed() + q_nope = q_nope.transpose(1, 2) # q_len is 1, no GPU overhead, same below + q_nope = torch.matmul(q_nope, q_absorb) # batched MM + q_nope = q_nope.transpose(1, 2) + #assert q_nope.is_contiguous() + + # q_nope [bsz, q_len, self.num_heads, self.kv_lora_rank] + # q_pe [bsz, q_len, self.num_heads, self.qk_rope_head_dim] + query_states = torch.cat([q_nope, q_pe], dim=-1) + + query_states = query_states.squeeze(1) + attn_output = torch.zeros_like(q_nope) # [bsz, q_len, self.num_heads, self.kv_lora_rank] + + attn_logits = torch.empty( + ( + bsz, + self.num_heads, + 4, #num_kv_splits # follow vLLM, fix it TODO + self.kv_lora_rank + 1, + ), + dtype=torch.float32, + device = attn_output.device + ) + + """ + print("query_states", torch.isnan(query_states).any()) + print("compressed_kv_with_k_pe", torch.isnan(compressed_kv_with_k_pe[:,:,0,:]).any()) + print("compressed_kv", torch.isnan(compressed_kv[:,:,0,:]).any()) + print("position_ids", torch.isnan(position_ids).any()) + """ + + # flash attn doesn't support head_dim bigger than 256 + # use triton attention kernel adapted from vLLM and SGLang for MQA + decode_attention_fwd_grouped(query_states, compressed_kv_with_k_pe, compressed_kv, attn_output, + page_table, + position_ids.squeeze(0).to(torch.int32)+1, attn_logits, + 4, #num_kv_splits # follow vLLM, fix it TODO + self.softmax_scale, + past_key_value.page_size) + + # attn_output [bsz, q_len, self.num_heads, self.kv_lora_rank] + # out_absorb [self.num_heads, self.v_head_dim, self.kv_lora_rank] + attn_output = attn_output.transpose(1, 2) + attn_output = torch.matmul(attn_output, out_absorb.mT) + attn_output = attn_output.transpose(1, 2) + + attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim) + attn_output = self.o_proj(attn_output) + + #print("attn_output", 
torch.isnan(attn_output).any()) + return attn_output, None, past_key_value + else: + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models + k_pe.squeeze(0) + compressed_kv.squeeze(0) + compressed_kv_with_k_pe, _ = past_key_value.update(compressed_kv, k_pe, self.layer_idx, cache_kwargs) + compressed_kv, k_pe = torch.split( + compressed_kv_with_k_pe, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1 + ) + k_pe = k_pe.view(bsz, -1, self.qk_rope_head_dim) + k_pe = k_pe[:, :kv_seq_len] + compressed_kv = compressed_kv.view(bsz, -1, self.kv_lora_rank) + compressed_kv = compressed_kv[:, :kv_seq_len] + kv = ( + self.kv_b_proj(compressed_kv) + .view(bsz, kv_seq_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim) + ) + k_nope, value_states = torch.split(kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1) + query_states = k_pe.new_empty(bsz, q_len, self.num_heads, self.q_head_dim) + query_states[:, :, :, : self.qk_nope_head_dim] = q_nope + query_states[:, :, :, self.qk_nope_head_dim :] = q_pe + + key_states = k_pe.new_empty(bsz, kv_seq_len, self.num_heads, self.q_head_dim) + key_states[:, :, :, :self.qk_nope_head_dim] = k_nope + key_states[:, :, :, self.qk_nope_head_dim:] = k_pe.view(bsz, kv_seq_len, 1, -1) + + value_states = value_states.view(bsz, kv_seq_len, self.num_heads, self.v_head_dim) + value_states_padded = torch.nn.functional.pad(value_states, [0, query_states.shape[-1] - value_states.shape[-1]], value=0) + + attn_output = flash_attn_func( + query_states, + key_states, + value_states_padded, + softmax_scale=self.softmax_scale, + causal=True, + ) + + if self.q_head_dim != self.v_head_dim: + attn_output = attn_output[:, :, :, : self.v_head_dim] + + attn_output = attn_output.reshape( + bsz, q_len, self.num_heads * self.v_head_dim + ).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, None, past_key_value + + def forward_linux_flashinfer( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.Tensor] = None, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + + bsz, q_len, _ = hidden_states.size() + + if self.q_lora_rank is None: + q = self.q_proj(hidden_states) + else: + q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) + q = q.view(bsz, q_len, self.num_heads, self.q_head_dim) + q_nope, q_pe = torch.split( + q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1 + ) + + compressed_kv = self.kv_a_proj_with_mqa(hidden_states) + compressed_kv, k_pe = torch.split( + compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1 + ) + compressed_kv = self.kv_a_layernorm(compressed_kv) + k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim) + compressed_kv = compressed_kv.view(bsz, q_len, 1, self.kv_lora_rank) + + kv_seq_len = q_len + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version transformer verision v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + cos, sin = self.rotary_emb(q_pe, position_ids) + q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, unsqueeze_dim=2) + # q_pe [bsz, q_len, self.num_heads, self.qk_rope_head_dim] k_pe [bsz, q_len, 1, self.qk_rope_head_dim] + + # decode + if q_len == 1 or self.absorb_for_prefill: + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models + compressed_kv_with_k_pe, page_table = past_key_value.update(compressed_kv, k_pe, self.layer_idx, cache_kwargs) + compressed_kv = compressed_kv_with_k_pe [:, :, :, :self.kv_lora_rank].view(-1, past_key_value.page_size, self.kv_lora_rank) + k_pe = compressed_kv_with_k_pe [:, :, :, self.kv_lora_rank:].view(-1, past_key_value.page_size, self.qk_rope_head_dim) + # k_pe [max_pages, page_size, self.qk_rope_head_dim] + # compressed_kv [max_pages, page_size, self.kv_lora_rank] + + # q_nope [bsz, q_len, self.num_heads, self.qk_nope_head_dim] + # q_absorb [self.num_heads, self.qk_nope_head_dim, self.kv_lora_rank] + q_absorb, out_absorb = self.get_absorbed() + q_nope = q_nope.transpose(1, 2) # q_len is 1, no GPU overhead, same below + q_nope = torch.matmul(q_nope, q_absorb) # batched MM + q_nope = q_nope.transpose(1, 2) + q_nope = q_nope.contiguous() + #assert q_nope.is_contiguous() + + # q_nope [bsz, q_len, self.num_heads, self.kv_lora_rank] + # q_pe [bsz, q_len, self.num_heads, self.qk_rope_head_dim] + q_nope.squeeze_(0) + q_pe.squeeze_(0) + + # flash attn doesn't support head_dim bigger than 256, use flashinfer + if self.mla_wrapper is None: + self.mla_wrapper = MLAWrapperSingleton.get_instance(self.device, 1, past_key_value.max_pages, use_cuda_graph = True) + if self.mla_wrapper.need_plan: + self.mla_wrapper.need_plan = False + if q_len == 1: + self.mla_wrapper.plan(None,None,None, + position_ids.squeeze(1)+1, + self.num_heads, + self.kv_lora_rank, + self.qk_rope_head_dim, + past_key_value.page_size, + self.softmax_scale, + q_nope.dtype, + compressed_kv.dtype) + else: + qo_indptr = torch.tensor([0, q_len], dtype=torch.int32, device=self.device) + kv_len_arr = torch.tensor([position_ids[0, -1].item()+1], dtype=torch.int32, device=self.device) + self.mla_wrapper.plan(qo_indptr,None,None, + kv_len_arr, + self.num_heads, + self.kv_lora_rank, + self.qk_rope_head_dim, + past_key_value.page_size, + self.softmax_scale, + q_nope.dtype, + compressed_kv.dtype) + attn_output = self.mla_wrapper.run(q_nope, q_pe, compressed_kv, k_pe).view(bsz, q_len, self.num_heads, self.kv_lora_rank) + """ + k = ( + torch.cat([compressed_kv, k_pe], dim=-1) + .view(-1, 1, 512 + 64) + .repeat_interleave(self.num_heads, dim=1) + ) + v = compressed_kv.view(-1, 1, 512).repeat_interleave(self.num_heads, dim=1) + lens = position_ids.item() + 1 + #print("lens", lens) + attn_ref, lse_ref = attention_ref( + 1, + torch.cat([q_nope, q_pe], dim=-1), + k[:lens], + v[:lens], + False, + self.softmax_scale + ) + attn_output = attn_ref.view(bsz, q_len, self.num_heads, self.kv_lora_rank) + """ + + # mla_wrapper run output: [tokens, self.num_heads, self.kv_lora_rank] + # attn_output [bsz, q_len, self.num_heads, self.kv_lora_rank] + # out_absorb [self.num_heads, self.v_head_dim, self.kv_lora_rank] + attn_output = attn_output.transpose(1, 2) # [bsz, self.num_heads, q_len, self.kv_lora_rank] + attn_output = torch.matmul(attn_output, out_absorb.mT) # [bsz, self.num_heads, q_len, self.v_head_dim] + attn_output = attn_output.transpose(1, 2).contiguous() # 
[bsz, q_len, self.num_heads, self.kv_lora_rank] + + attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim) # [bsz, q_len, self.num_heads * self.v_head_dim] + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + else: + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models + k_pe.squeeze(0) + compressed_kv.squeeze(0) + compressed_kv_with_k_pe, _ = past_key_value.update(compressed_kv, k_pe, self.layer_idx, cache_kwargs) + compressed_kv, k_pe = torch.split( + compressed_kv_with_k_pe, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1 + ) + k_pe = k_pe.view(bsz, -1, self.qk_rope_head_dim) + k_pe = k_pe[:, :kv_seq_len] + compressed_kv = compressed_kv.view(bsz, -1, self.kv_lora_rank) + compressed_kv = compressed_kv[:, :kv_seq_len] + kv = ( + self.kv_b_proj(compressed_kv) + .view(bsz, kv_seq_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim) + ) + k_nope, value_states = torch.split(kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1) + query_states = k_pe.new_empty(bsz, q_len, self.num_heads, self.q_head_dim) + query_states[:, :, :, : self.qk_nope_head_dim] = q_nope + query_states[:, :, :, self.qk_nope_head_dim :] = q_pe + + key_states = k_pe.new_empty(bsz, kv_seq_len, self.num_heads, self.q_head_dim) + key_states[:, :, :, :self.qk_nope_head_dim] = k_nope + key_states[:, :, :, self.qk_nope_head_dim:] = k_pe.view(bsz, kv_seq_len, 1, -1) + + value_states = value_states.view(bsz, kv_seq_len, self.num_heads, self.v_head_dim) + value_states_padded = torch.nn.functional.pad(value_states, [0, query_states.shape[-1] - value_states.shape[-1]], value=0) + + attn_output = flash_attn_func( + query_states, + key_states, + value_states_padded, + softmax_scale=self.softmax_scale, + causal=True, + ) + + if self.q_head_dim != self.v_head_dim: + attn_output = attn_output[:, :, :, : self.v_head_dim] + + attn_output = attn_output.reshape( + bsz, q_len, self.num_heads * self.v_head_dim + ).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, None, past_key_value + + def forward_windows( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, @@ -367,7 +528,7 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention): "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" ) bsz, q_len, _ = hidden_states.size() - + if q_len <= self.chunck_size: return self.forward_chunck( hidden_states, @@ -416,13 +577,53 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention): attn_output = torch.cat((attn_output, cur_output), dim=-2) return attn_output, None, past_key_value -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if os.name == 'nt' or get_compute_capability()<8: + print("for Windows or GPU before ampere, use forward_windows") + return self.forward_windows( + hidden_states, + attention_mask, + position_ids, + past_key_value, + output_attentions, + use_cache, + cache_position, + **kwargs, + ) + else: + if flashinfer_enabled: + return self.forward_linux_flashinfer( + hidden_states, + attention_mask, + position_ids, + past_key_value, + output_attentions, + use_cache, + cache_position, + **kwargs, + ) + else: + return self.forward_linux_triton( + hidden_states, + attention_mask, + position_ids, + past_key_value, + output_attentions, + use_cache, + cache_position, + **kwargs, + ) class KLlamaAttention(BaseInjectedModule): @@ -433,9 +634,10 @@ class KLlamaAttention(BaseInjectedModule): gguf_loader : GGUFLoader, config: PretrainedConfig, orig_module: nn.Module, - device: str = "cuda", + prefill_device: str = "cuda", + generate_device: str = "cuda", **kwargs): - BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs) + BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs) self.orig_module.__init__(orig_module.config, orig_module.layer_idx) def apply_rotary_pos_emb(self, q, k, cos, sin, position_ids=None, unsqueeze_dim=1): @@ -551,4 +753,4 @@ class KLlamaAttention(BaseInjectedModule): if not output_attentions: attn_weights = None - return attn_output, attn_weights, past_key_value \ No newline at end of file + return attn_output, attn_weights, past_key_value diff --git a/ktransformers/operators/base_operator.py b/ktransformers/operators/base_operator.py index 1cf1471..0fa2efd 100644 --- a/ktransformers/operators/base_operator.py +++ b/ktransformers/operators/base_operator.py @@ -16,14 +16,17 @@ class BaseInjectedModule(nn.Module): gguf_loader : GGUFLoader, config: PretrainedConfig, orig_module: nn.Module, - device: str = "cuda", + prefill_device: str = "cuda", + generate_device: str = "cuda", **kwargs): nn.Module.__init__(self) nn.Module.__setattr__(self, "orig_module", orig_module) object.__setattr__(self, "key", key) object.__setattr__(self, "gguf_loader", gguf_loader) object.__setattr__(self, "config", config) - object.__setattr__(self, "device", device) + object.__setattr__(self, "prefill_device", prefill_device) + object.__setattr__(self, "generate_device", generate_device) + object.__setattr__(self, "device", generate_device) def __getattr__(self, name: str) -> Any: # __getattr__ in nn.Module doesn't call super().__getattribute__ when name is not in nn.Module.__dict__, diff --git 
a/ktransformers/operators/experts.py b/ktransformers/operators/experts.py index 274a3ca..88960c7 100644 --- a/ktransformers/operators/experts.py +++ b/ktransformers/operators/experts.py @@ -18,6 +18,7 @@ import torch.nn.functional as F import torch import sys, os from ktransformers.operators.base_operator import BaseInjectedModule +from tqdm import tqdm sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build")) sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build", "Release")) @@ -118,6 +119,7 @@ class KExpertsCPU(KExpertsBase): output_cpu:Tensor = None output_gpu_map:dict = {} # Manage output tensor buffer on different gpu #stream_map:dict = {} # Manage cuda stream on different gpu + #gguf_loader:GGUFLoader = None CPU_INFER = CPUInfer(Config().cpu_infer) def __init__( self, @@ -131,6 +133,9 @@ class KExpertsCPU(KExpertsBase): **kwargs ): super().__init__(key, gguf_loader, config, orig_module, device, **kwargs) + #if KExpertsCPU.gguf_loader is None: + # KExpertsCPU.gguf_loader = GGUFLoader("/mnt/data/model/DeepseekV3-q4km-gguf") + self.gguf_loader = gguf_loader assert device.lower() == "cpu", "KExpertsCPU can only be loaded on CPU" self.n_routed_experts = n_routed_experts self.out_device = out_device @@ -154,7 +159,7 @@ class KExpertsCPU(KExpertsBase): down_ptr = ctypes.addressof( ctypes.cast(self.down.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents ) - # print(self.gate_qtype, self.up_qtype, self.down_qtype) + #print(self.gate_type, self.up_type, self.down_type) n_routed_experts = self.n_routed_experts # n_routed_experts = len(self.orig_module) moe_config = MOEConfig( @@ -225,6 +230,7 @@ class KExpertsCPU(KExpertsBase): return def load_weights(self, override_key: str | None = None, device: str = "cpu"): + # TODO: support Bias res = {} if override_key is not None: keys = override_key @@ -239,7 +245,16 @@ class KExpertsCPU(KExpertsBase): down_type = None for key in keys: - if key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info: + if self.gguf_loader.safetensor_loader is not None: + # using a temp ugly way to temprary load the tensor + gate = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_gate_exps.weight").numpy() + up = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_up_exps.weight").numpy() + down = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_down_exps.weight").numpy() + gate_type = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_gate_exps.ggml_type").item() + up_type = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_up_exps.ggml_type").item() + down_type = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_down_exps.ggml_type").item() + + elif key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info: gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight") up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight") down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight") @@ -288,6 +303,8 @@ class KExpertsMarlin(KExpertsBase): self.act_fn = ACT2FN[config.hidden_act] assert device.lower() != "cpu", "Marlin experts can only be loaded on GPU" self.device = device + self.elements_per_tensor = config.moe_intermediate_size * config.hidden_size + # create empty marlin experts according to the number of experts per token # up self.up_projs = [KLinearMarlin(key+ "." 
+ "ffn_up_exps", gguf_loader, config, device=device) for i in range(self.expert_num)] @@ -299,17 +316,34 @@ class KExpertsMarlin(KExpertsBase): def load(self, w: dict | nn.Parameter | tuple | None = None, device: str | None = None, warmup: bool = False): if device is None: device = self.device assert device.lower() != "cpu", "Marlin experts can only be loaded on GPU" - if w is None: w = self.load_weights()[self.key] + if w is None: + w = self.load_weights() + load_by_experts = True - if isinstance(w, dict): - self.gate = w["gate"] - self.up = (w["up"]) - self.down = (w["down"]) - for i in range(self.expert_num): - self.up_projs[i].load(nn.Parameter(self.up[i,...]), device=device) - self.gate_projs[i].load(nn.Parameter(self.gate[i,...]), device=device) - self.down_projs[i].load(nn.Parameter(self.down[i,...]), device=device) - self.loaded_experts_idx.append(i) + if load_by_experts: + if isinstance(w, dict): + self.gate = w["gate"] + self.up = (w["up"]) + self.down = (w["down"]) + for i in tqdm(range(self.expert_num), desc=f"Dequanting and quanting for KExpertsMarlin {self.key}"): + up_weights = self.gguf_loader.load_expert_tensor(self.key + ".ffn_up_exps.weight", self.up, i, self.elements_per_tensor, device=self.device) + gate_weights = self.gguf_loader.load_expert_tensor(self.key + ".ffn_gate_exps.weight", self.gate, i, self.elements_per_tensor, device=self.device) + down_weights = self.gguf_loader.load_expert_tensor(self.key + ".ffn_down_exps.weight", self.down, i, self.elements_per_tensor, device=self.device) + + self.up_projs[i].load(nn.Parameter(up_weights), device=device) + self.gate_projs[i].load(nn.Parameter(gate_weights), device=device) + self.down_projs[i].load(nn.Parameter(down_weights), device=device) + self.loaded_experts_idx.append(i) + else: + if isinstance(w, dict): + self.gate = w["gate"] + self.up = (w["up"]) + self.down = (w["down"]) + for i in range(self.expert_num): + self.up_projs[i].load(nn.Parameter(self.up[i,...]), device=device) + self.gate_projs[i].load(nn.Parameter(self.gate[i,...]), device=device) + self.down_projs[i].load(nn.Parameter(self.down[i,...]), device=device) + self.loaded_experts_idx.append(i) return def unload(self): @@ -329,20 +363,13 @@ class KExpertsMarlin(KExpertsBase): gate = None up = None down = None - gate_type = None - up_type = None - down_type = None for key in keys: if key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info: - gate = self.gguf_loader.load_gguf_tensor(key + ".ffn_gate_exps.weight") - up = self.gguf_loader.load_gguf_tensor(key + ".ffn_up_exps.weight") - down = self.gguf_loader.load_gguf_tensor(key + ".ffn_down_exps.weight") - gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate_exps.weight"]["ggml_type"] - up_type = self.gguf_loader.tensor_info[key + ".ffn_up_exps.weight"]["ggml_type"] - down_type = self.gguf_loader.tensor_info[key + ".ffn_down_exps.weight"]["ggml_type"] - # tensors = self.load_multi(key, [".ffn_gate_exps.weight", ".ffn_up_exps.weight", ".ffn_down_exps.weight"]) - res = {key:{"gate": nn.Parameter(gate), "up": nn.Parameter(up), "down": nn.Parameter(down), "gate_type": gate_type, "up_type": up_type, "down_type": down_type}} + gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight") + up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight") + down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight") + res = {"gate": gate, "up": up, "down": down} return res def forward(self, hidden_states_cpu: torch.Tensor, selected_experts_cpu: torch.Tensor, routing_weights_cpu: 
torch.Tensor) -> torch.Tensor: @@ -381,6 +408,7 @@ class KExpertsMarlin(KExpertsBase): return final_hidden_states.to(dtype=org_dtype, device=org_device) +# untested, CUDA OOM class KExpertsTorch(KExpertsBase): expert_num: int loaded_experts_idx: list[int] @@ -402,19 +430,39 @@ class KExpertsTorch(KExpertsBase): # self.loaded_experts_idx = [] self.act_fn = ACT2FN[config.hidden_act] self.device = device - self.gate = None - self.up = None - self.donw = None + self.elements_per_tensor = config.moe_intermediate_size * config.hidden_size + self.gate = [None for _ in range(self.expert_num)] + self.up = [None for _ in range(self.expert_num)] + self.down = [None for _ in range(self.expert_num)] self.dtype = torch.get_default_dtype() def load(self, w: dict | nn.Parameter | tuple | None = None, device: str | None = None, warmup: bool = False): if device is None: device = self.device - if w is None: w = self.load_weights(device=device)[self.key] + if w is None: + w = self.load_weights() + load_by_experts = True - if isinstance(w, dict): - self.gate = w["gate"].to(device=device, dtype=self.dtype) - self.up = w["up"].to(device=device, dtype=self.dtype) - self.down = w["down"].to(device=device, dtype=self.dtype) + if load_by_experts: + if isinstance(w, dict): + for i in tqdm(range(self.expert_num), desc=f"Dequanting for KExpertsTorch {self.key}"): + up_weights = self.gguf_loader.load_expert_tensor(self.key + ".ffn_up_exps.weight", w["up"], i, self.elements_per_tensor, device=self.device) + gate_weights = self.gguf_loader.load_expert_tensor(self.key + ".ffn_gate_exps.weight", w["gate"], i, self.elements_per_tensor, device=self.device) + down_weights = self.gguf_loader.load_expert_tensor(self.key + ".ffn_down_exps.weight", w["down"], i, self.elements_per_tensor, device=self.device) + + self.up[i] = up_weights + self.gate[i] = gate_weights + self.down[i] = down_weights + else: + if isinstance(w, dict): + for i in range(self.expert_num): + self.gate[i] = w["gate"][i, ...].to(device=device, dtype=self.dtype) + self.up[i] = w["up"][i, ...].to(device=device, dtype=self.dtype) + self.down[i] = w["down"][i, ...].to(device=device, dtype=self.dtype) + + self.up = torch.stack(self.up, dim=0) + self.gate = torch.stack(self.gate, dim=0) + self.down = torch.stack(self.down, dim=0) + return def unload(self): if self.gate is not None: @@ -422,6 +470,25 @@ class KExpertsTorch(KExpertsBase): self.up = None self.down = None + def load_weights(self, override_key: str | None = None): + res = {} + if override_key is not None: + keys = override_key + else: + keys = [self.key] + + gate = None + up = None + down = None + + for key in keys: + if key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info: + gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight") + up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight") + down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight") + res = {"gate": gate, "up": up, "down": down} + return res + def forward(self, hidden_states_cpu: torch.Tensor, selected_experts_cpu: torch.Tensor, routing_weights_cpu: torch.Tensor) -> torch.Tensor: org_device = hidden_states_cpu.device @@ -478,7 +545,7 @@ class KTransformersExperts(BaseInjectedModule, KExpertsBase): generate_device: str = "cpu", generate_op: str | None = "KExpertsCPU", **kwargs): - BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs) + BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs) 
KExpertsBase.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs) if generate_op is not None: self.generate_experts = EXPERTS_MAP[generate_op](key, gguf_loader, config, len(orig_module), device=generate_device, **kwargs) @@ -582,7 +649,7 @@ class KQwen2MoeSparseMoeBlock(BaseInjectedModule, Qwen2MoeSparseMoeBlock): if isinstance(self.experts, KExpertsBase): y = ( - self.moe_on_cpuinfer( + self.moe_kexperts( hidden_states_expert, selected_experts_expert, routing_weights_expert ) .view(*orig_shape) @@ -601,8 +668,7 @@ class KQwen2MoeSparseMoeBlock(BaseInjectedModule, Qwen2MoeSparseMoeBlock): return y, router_logits @torch.no_grad() - def moe_on_cpuinfer(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor: - outs = torch.empty_like(x) + def moe_kexperts(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor: outs = self.experts(x, topk_ids, topk_weight) return outs @@ -672,7 +738,7 @@ class KDeepseekV2MoE(BaseInjectedModule, DeepseekV2MoE): y_ = self.shared_experts(identity).squeeze(0) if isinstance(self.experts, KExpertsBase): - y = self.moe_on_cpuinfer(hidden_states, topk_idx, topk_weight).view(*orig_shape).to(device=hidden_states.device) + y = self.moe_kexperts(hidden_states, topk_idx, topk_weight).view(*orig_shape).to(device=hidden_states.device) elif hidden_states.size(0) > 10: # TODO may bugs here y = ( @@ -692,8 +758,7 @@ class KDeepseekV2MoE(BaseInjectedModule, DeepseekV2MoE): return y @torch.no_grad() - def moe_on_cpuinfer(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor: - outs = torch.empty_like(x) + def moe_kexperts(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor: outs = self.experts(x, topk_ids, topk_weight) return outs @@ -773,7 +838,7 @@ class KDeepseekV3MoE(BaseInjectedModule, DeepseekV3MoE): y_ = self.shared_experts(identity).squeeze(0) if isinstance(self.experts, KExpertsBase): - y = self.moe_on_cpuinfer(hidden_states, topk_idx, topk_weight).view(*orig_shape).to(device=hidden_states.device) + y = self.moe_kexperts(hidden_states, topk_idx, topk_weight).view(*orig_shape).to(device=hidden_states.device) elif hidden_states.size(0) > 10: # TODO may bugs here y = ( @@ -793,8 +858,7 @@ class KDeepseekV3MoE(BaseInjectedModule, DeepseekV3MoE): return y @torch.no_grad() - def moe_on_cpuinfer(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor: - outs = torch.empty_like(x) + def moe_kexperts(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor: outs = self.experts(x, topk_ids, topk_weight) return outs @@ -881,7 +945,7 @@ class KMistralSparseMoEBlock(BaseInjectedModule, MixtralSparseMoeBlock): if isinstance(self.experts, KExpertsBase): y = ( - self.moe_on_cpuinfer( + self.moe_kexperts( hidden_states_expert, selected_experts_expert, routing_weights_expert ) .view(*orig_shape) @@ -900,8 +964,7 @@ class KMistralSparseMoEBlock(BaseInjectedModule, MixtralSparseMoeBlock): return y, router_logits @torch.no_grad() - def moe_on_cpuinfer(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor: - outs = torch.empty_like(x) + def moe_kexperts(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor: outs = self.experts(x, topk_ids, topk_weight) return outs diff --git a/ktransformers/operators/flashinfer_wrapper.py b/ktransformers/operators/flashinfer_wrapper.py new file mode 
100644 index 0000000..a702872 --- /dev/null +++ b/ktransformers/operators/flashinfer_wrapper.py @@ -0,0 +1,380 @@ +''' +Description : flashinfer MLA wrapper +Author : Boxin Zhang +Version : 0.2.3 +''' +import torch +import os +from ktransformers.operators.triton_attention import decode_attention_fwd_grouped + +flashinfer_enabled = False + +try: + import flashinfer + flashinfer_enabled = True + print("found flashinfer") + +except ImportError: + print("flashinfer not found, use triton for linux") + +import math + +def attention_ref_torch( + batch_size, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + causal: bool, + sm_scale: float, +) -> torch.Tensor: + qo_len = q.shape[0] // batch_size + kv_len = k.shape[0] // batch_size + num_qo_heads = q.shape[1] + head_dim_qk = q.shape[2] + head_dim_vo = v.shape[2] + logits = ( + torch.einsum( + "bmhd,bnhd->bhmn", + q.view(batch_size, qo_len, num_qo_heads, head_dim_qk).float(), + k.view(batch_size, kv_len, num_qo_heads, head_dim_qk).float(), + ) + * sm_scale + ) + + #print("attn weights", logits) + + if causal: + mask = ( + torch.arange(kv_len - qo_len, kv_len).unsqueeze(1) + >= torch.arange(0, kv_len).unsqueeze(0) + ).to(q.device) + else: + mask = torch.ones(qo_len, kv_len).to(q.device) + + logits = logits.masked_fill(mask.unsqueeze(0).unsqueeze(0) == 0, float("-inf")) + lse_ref = torch.logsumexp(logits, -1).transpose(-1, -2) + p = torch.softmax(logits, dim=-1) + o_ref = ( + torch.einsum( + "bhmn,bnhd->bmhd", + p, + v.view(batch_size, kv_len, num_qo_heads, head_dim_vo).float(), + ) + .contiguous() + .view(batch_size * qo_len, num_qo_heads, head_dim_vo) + .to(q) + ) + + return o_ref, lse_ref * math.log2(math.e) + +class MLAWrapper(): + def __init__(self, + max_batch_size, + max_pages, + use_cuda_graph = True, + device = "cuda", + ): + self.float_workspace_buffer = torch.empty(128*1024*1024, dtype=torch.int8, device=device) + self.max_batch_size = max_batch_size + self.max_pages = max_pages + if use_cuda_graph: + if self.max_batch_size == 1: + self.qo_indptr_buf = torch.arange(0, max_batch_size+1, dtype=torch.int32, device=device) + self.kv_indptr_buf = torch.tensor([0, max_pages], dtype=torch.int32, device=device) + self.kv_indices_buf = torch.arange(0, max_pages, dtype=torch.int32, device=device) + else: + self.qo_indptr_buf = torch.empty(max_batch_size+1, dtype=torch.int32, device=device) + self.kv_indptr_buf = torch.empty(max_batch_size+1, dtype=torch.int32, device=device) + self.kv_indices_buf = torch.empty(max_pages, dtype=torch.int32, device=device) + self.kv_len_arr_buf = torch.empty(max_batch_size, dtype=torch.int32, device=device) + else: + self.qo_indptr_buf = None + self.kv_indptr_buf = None + self.kv_indices_buf = None + self.kv_len_arr_buf = None + self.wrapper = flashinfer.mla.BatchMLAPagedAttentionWrapper( + self.float_workspace_buffer, + use_cuda_graph=False, + qo_indptr=self.qo_indptr_buf, + kv_indptr=self.kv_indptr_buf, + kv_indices=self.kv_indices_buf, + kv_len_arr=self.kv_len_arr_buf, + ) + self.need_plan = True + + def plan(self, + qo_indptr, + kv_indptr, + kv_indices, + kv_len_arr, + num_heads, + head_dim_ckv, + head_dim_kpe, + page_size, + sm_scale, + q_data_type, + kv_data_type, + ): + if qo_indptr is None: + assert self.max_batch_size == 1 + qo_indptr = self.qo_indptr_buf + if kv_indptr is None: + assert self.max_batch_size == 1 + kv_indptr = self.kv_indptr_buf + if kv_indices is None: + assert self.max_batch_size == 1 + kv_indices = self.kv_indices_buf + + self.wrapper.plan( + qo_indptr, + kv_indptr, + kv_indices, + 
kv_len_arr, + num_heads, + head_dim_ckv, + head_dim_kpe, + page_size, + True, # causal + sm_scale, + q_data_type, + kv_data_type, + ) + + def run(self, q_nope, q_pe, ckv, k_pe, return_lse = False): + return self.wrapper.run(q_nope, q_pe, ckv, k_pe, return_lse = return_lse) + +class MLAWrapperSingleton(): + wrappers:dict = {} + + @classmethod + def get_instance(cls, device, *args, **kwargs)->MLAWrapper: + if device not in cls.wrappers: + cls.make_instance(device, *args, **kwargs) + return cls.wrappers[device] + + @classmethod + def make_instance(cls, device, *args, **kwargs): + cls.wrappers[device] = MLAWrapper(*args, **kwargs, device=device) + + @classmethod + def plan_all(cls, qo_indptr, + kv_indptr, + kv_indices, + kv_len_arr, + num_heads, + head_dim_ckv, + head_dim_kpe, + page_size, + sm_scale, + q_data_type, + kv_data_type,): + for device, wrapper in cls.wrappers.items(): + kv_len_arr_cur_device = kv_len_arr.to(device) + wrapper.plan(qo_indptr, + kv_indptr, + kv_indices, + kv_len_arr_cur_device, + num_heads, + head_dim_ckv, + head_dim_kpe, + page_size, + sm_scale, + q_data_type, + kv_data_type,) + wrapper.need_plan = False + + @classmethod + def need_plan_all(cls): + for device, wrapper in cls.wrappers.items(): + wrapper.need_plan = True + + @classmethod + def reset_buffer(cls): + for device, wrapper in cls.wrappers.items(): + wrapper.qo_indptr_buf[1] = 1 # assert max_batch_size=1 here. + + @classmethod + def update_buffer(cls, max_pages): + for device, wrapper in cls.wrappers.items(): + wrapper.kv_indptr_buf[1] = max_pages # assert max_batch_size=1 here. + wrapper.kv_indices_buf = torch.arange(0, max_pages, dtype=torch.int32, device=device) + wrapper.wrapper._kv_indices_buf = wrapper.kv_indices_buf + +def checksame(): + flashinfer_folder = "./flashinfer_output" + flashinfer_folder = "./kv_cache_flashinfer" + triton_folder = "./triton_output" + triton_folder = "./kv_cache_triton" + + max_layer_id = 1 + max_forward_id = 2 + + for forward_id in range(0, 19): + print("forward_id", forward_id) + for layer_id in range(max_layer_id): + print(layer_id) + #file_name = f"layer_{layer_id}_forward_{forward_id}_attn_output.pt" + #file_name = f"layer_{layer_id}_forward_{forward_id}_q_pe.pt" + file_name = f"layer_{layer_id}.pt" + + flashinfer_path = os.path.join(flashinfer_folder, file_name) + triton_path = os.path.join(triton_folder, file_name) + + if not os.path.exists(triton_path): + print(f"{file_name} not exist in {triton_folder}") + continue + if not os.path.exists(flashinfer_path): + print(f"{file_name} not exist in {flashinfer_folder}") + continue + + + flashinfer_tensor = torch.load(flashinfer_path)[1:2, :62]# + triton_tensor = torch.load(triton_path)[1:2, :62]#.squeeze(1)# + try: + torch.testing.assert_close(flashinfer_tensor, triton_tensor, rtol=1e-9, atol=1e-9) + except AssertionError as e: + print(e) + +if __name__ == "__main__": + + #checksame() + #exit(0) + + max_batch_size = 1 + max_pages = 64 + page_size = 64 + num_heads = 128 + + # warm-up + kv_len = 4023 + q_len = 1 + q_nope_buf = torch.randn((q_len, num_heads, 512), dtype=torch.bfloat16, device="cuda") + q_pe_buf = torch.randn((q_len, num_heads, 64), dtype=torch.bfloat16, device="cuda") + kv_buf = torch.randn((max_pages, page_size, 576), dtype=torch.bfloat16, device="cuda") + ckv, k_pe = torch.split(kv_buf, [512, 64], dim=-1) + + + wrapper = MLAWrapperSingleton.get_instance( + "cuda", + max_batch_size, + max_pages, + ) + + kv_len_arr = torch.tensor([kv_len], dtype=torch.int32, device="cuda") + qo_indptr = torch.tensor([0, q_len], 
dtype=torch.int32, device="cuda") + wrapper.plan( + qo_indptr, + None, + None, + kv_len_arr, + 128, + 512, + 64, + page_size, + 192 ** (-0.5), + torch.bfloat16, + torch.bfloat16, + ) + + attn_output = wrapper.run(q_nope_buf, q_pe_buf, ckv, k_pe) + print(attn_output.shape) + + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph): + attn_output = wrapper.run(q_nope_buf, q_pe_buf, ckv, k_pe) + # warm-up finished + + for forward_id in range(0, 1): + print("forward_id", forward_id) + for layer_id in range(1): + print(layer_id) + flashinfer_folder = "./kv_cache_flashinfer" + forward_id = 17 + layer_id = 0 + file_name = f"layer_{layer_id}.pt" + kv_cache_path = os.path.join(flashinfer_folder, file_name) + flashinfer_folder = "./flashinfer_output" + + q_len = 1 + kv_len = 126 + file_name = f"layer_{layer_id}_forward_{forward_id}_q_nope.pt" + q_nope = torch.load(os.path.join(flashinfer_folder, file_name)).view(q_len,128,512).to(device="cuda") + file_name = f"layer_{layer_id}_forward_{forward_id}_q_pe.pt" + q_pe = torch.load(os.path.join(flashinfer_folder, file_name)).view(q_len,128,64).to(device="cuda") + q = torch.cat([q_nope, q_pe], dim=-1) + kv_cache = torch.load(kv_cache_path).to(device="cuda") + pages, page_size, _, head_dim = kv_cache.shape + kv_cache = kv_cache.view(pages, page_size, head_dim) + ckv, k_pe = torch.split(kv_cache, [512, 64], dim=-1) + + kv_len_arr = torch.tensor([kv_len], dtype=torch.int32, device="cuda") + qo_indptr = torch.tensor([0, q_len], dtype=torch.int32, device="cuda") + wrapper.plan( + None, + None, + None, + kv_len_arr, + 128, + 512, + 64, + page_size, + 192 ** (-0.5), + torch.bfloat16, + torch.bfloat16, + ) + + q_nope_buf.copy_(q_nope) + q_pe_buf.copy_(q_pe) + kv_buf[:pages].copy_(kv_cache) + + torch.cuda.synchronize() + graph.replay() + torch.cuda.synchronize() + + # ref_torch + k = ( + torch.cat([ckv, k_pe], dim=-1) + .view(-1, 1, 512 + 64) + .repeat_interleave(num_heads, dim=1) + ) + v = ckv.view(-1, 1, 512).repeat_interleave(num_heads, dim=1) + attn_ref, lse_ref = attention_ref_torch( + max_batch_size, + q, + k[:kv_len], + v[:kv_len], + False, + 192 ** (-0.5) + ) + torch.testing.assert_close(attn_output, attn_ref, rtol=1e-3, atol=1e-3) + + # ref_triton + attn_logits = torch.empty( + ( + max_batch_size, + num_heads, + 4, #num_kv_splits # follow vLLM, fix it TODO + 512 + 1, + ), + dtype=torch.float32, + device = "cuda" + ) + + triton_ref = torch.zeros_like(q_nope) + page_table = torch.arange(max_pages, dtype=torch.int32, device="cuda") + ckv_with_pe = torch.cat([ckv, k_pe], dim=-1).contiguous().view(pages, page_size, 1, 576) + ckv = ckv.view(pages, page_size, 1, 512) + decode_attention_fwd_grouped(q, ckv_with_pe, ckv, triton_ref, + page_table, + kv_len_arr, attn_logits, + 4, #num_kv_splits # follow vLLM, fix it TODO + 192 ** (-0.5), + page_size) + + torch.testing.assert_close(attn_output, triton_ref, rtol=1e-3, atol=1e-3) + + #file_name = f"./flashinfer_output/layer_{layer_id}_forward_{forward_id}_attn_output.pt" + #ktrans_output = torch.load(file_name) + #torch.testing.assert_close(attn_output, ktrans_output.squeeze(1), rtol=1e-3, atol=1e-3) + print("test past") + diff --git a/ktransformers/operators/gate.py b/ktransformers/operators/gate.py index ab7d0b2..d908093 100644 --- a/ktransformers/operators/gate.py +++ b/ktransformers/operators/gate.py @@ -67,7 +67,14 @@ class KMoEGateBase(ABC): for key in keys: key = ".".join(key.split(".")[:-1]) - if key + ".ffn_gate_inp.weight" in self.gguf_loader.tensor_info: + if self.gguf_loader.safetensor_loader is not 
None: + targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"] + weight = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_gate_inp.weight") + e_score_correction_bias = self.gguf_loader.safetensor_loader.load_tensor(key + ".exp_probs_b.bias") + weight_type = weight.dtype + e_score_correction_bias_type = e_score_correction_bias.dtype + res = {"weight": weight, "e_score_correction_bias": e_score_correction_bias, "weight_type": weight_type, "e_score_correction_bias_type": e_score_correction_bias_type} + elif key + ".ffn_gate_inp.weight" in self.gguf_loader.tensor_info: targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"] tensors = self.load_multi(key, targets, device=device) weight = tensors[".ffn_gate_inp.weight"] @@ -93,11 +100,11 @@ class KMoEGate(BaseInjectedModule, KMoEGateBase): gguf_loader: GGUFLoader, config: PretrainedConfig, orig_module: nn.Module = None, - generate_device: str = "cuda", prefill_device: str = "cuda", + generate_device: str = "cuda", **kwargs, ): - BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs) + BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs) KMoEGateBase.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs) self.generate_device = generate_device self.prefill_device = prefill_device @@ -116,8 +123,8 @@ class KMoEGate(BaseInjectedModule, KMoEGateBase): self.orig_module.e_score_correction_bias = nn.Parameter(w["e_score_correction_bias"]) else: raise ValueError("Invalid weight type") - self.orig_module.weight = self.orig_module.weight.to(device) - self.orig_module.e_score_correction_bias = self.orig_module.e_score_correction_bias.to(device) + self.orig_module.weight = nn.Parameter(self.orig_module.weight.to(device)) + self.orig_module.e_score_correction_bias = nn.Parameter(self.orig_module.e_score_correction_bias.to(device)) def unload(self): if self.weight is not None: diff --git a/ktransformers/operators/linear.py b/ktransformers/operators/linear.py index 9e35e8d..103fc1a 100644 --- a/ktransformers/operators/linear.py +++ b/ktransformers/operators/linear.py @@ -21,10 +21,12 @@ from ktransformers.ktransformers_ext.operators.custom_marlin.quantize.utils.marl MarlinWorkspace, marlin_quantize, GPTQ_MARLIN_MIN_THREAD_N, + GPTQ_MARLIN_MIN_THREAD_K, GPTQ_MARLIN_MAX_PARALLEL, ) from ktransformers.operators.base_operator import BaseInjectedModule from transformers.configuration_utils import PretrainedConfig +from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant from abc import ABC, abstractmethod import sys, os sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build")) @@ -54,15 +56,17 @@ class KLinearBase(ABC): self.has_bias = False self.dtype = torch.get_default_dtype() - # if orig_module is not None: - # self.in_features = orig_module.in_features - # self.out_features = orig_module.out_features - # else: - shape = self.gguf_loader.tensor_info[key + ".weight"]["shape"] - if len(shape) == 1: - print("Warning: orig_module is not set, but has in_features or out_features equals to 1, can't get in_features and out_features from GGUF") - self.in_features = self.gguf_loader.tensor_info[key + ".weight"]["shape"][0] - self.out_features = self.gguf_loader.tensor_info[key + ".weight"]["shape"][1] + if orig_module is not None: + self.in_features = orig_module.in_features + self.out_features = orig_module.out_features + else: + shape = 
self.gguf_loader.tensor_info[key + ".weight"]["shape"] + if len(shape) == 1: + print("Warning: orig_module is not set, but has in_features or out_features equals to 1, can't get in_features and out_features from GGUF") + self.in_features = self.gguf_loader.tensor_info[key + ".weight"]["shape"][0] + self.out_features = self.gguf_loader.tensor_info[key + ".weight"]["shape"][1] + + self.loaded = False # for lm_head pre-load, TODO: use new way to do lm_head pre-load when layer wise prefill. @abstractmethod def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -75,7 +79,13 @@ class KLinearBase(ABC): keys = [self.key] for key in keys: - if key + ".weight" in self.gguf_loader.tensor_file_map: + if self.gguf_loader.safetensor_loader is not None: + # using safetensor_loader + tensor = self.gguf_loader.safetensor_loader.load_tensor(key+'.weight') + weight_scale_inv = self.gguf_loader.safetensor_loader.load_tensor(key+'.weight_scale_inv') + return nn.Parameter(tensor), nn.Parameter(weight_scale_inv) + + elif key + ".weight" in self.gguf_loader.tensor_file_map: if key + ".bias" in self.gguf_loader.tensor_file_map: tensors = self.load_multi(key, ["weight", "bias"], device=device) tensor = tensors["weight"] @@ -119,7 +129,7 @@ class KLinearTorch(KLinearBase): super().__init__(key, gguf_loader, config, orig_module, device, **kwargs) self.has_bias = False self.dtype = torch.get_default_dtype() - self.w = None + self.weight = None self.has_bias = False def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -127,37 +137,100 @@ class KLinearTorch(KLinearBase): out_device = x.device # TODO: support CUDA Graph when using cpu, but CPUInfer is recommended. x = x.to(device=self.device, dtype=self.dtype) - x = x @ self.w + x = x @ self.weight if self.has_bias: x = x + self.bias x = x.to(dtype=dtype, device=out_device) return x def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None): + if self.loaded: return if device is None: device = self.device if w is None: w = self.load_weight(device=device) + # else: self.out_features = w.shape[0], self.in_features = w.shape[1] if isinstance(w, nn.Parameter): - self.w = w.to(dtype=self.dtype).T + try: + self.weight = w.to(dtype=self.dtype).view(self.out_features, self.in_features).T + except: + self.weight = w.to(dtype=self.dtype).T self.has_bias = False elif isinstance(w, tuple): - self.w = w[0].to(dtype=self.dtype).T + try: + self.weight = w[0].to(dtype=self.dtype).view(self.out_features, self.in_features).T + except: + self.weight = w[0].to(dtype=self.dtype).T self.bias = w[1].to(dtype=self.dtype) self.has_bias = True else: raise ValueError("Invalid weight type") # self.linear = self.linear.to(device) - self.w = self.w.to(device) + self.weight = self.weight.to(device) if self.has_bias: self.bias = self.bias.to(device) + self.loaded = True def unload(self): - if self.w is not None: - self.w = None + if self.weight is not None: + self.weight = None if self.has_bias: self.bias = None - +class KLinearFP8(KLinearBase): + # this kernel requires special handling for weight + # Please load the weight file downloaded from KVCache.AI + marlin_q_w: torch.Tensor + marlin_s: torch.Tensor + g_idx: torch.Tensor + sort_indices: torch.Tensor + has_bias: bool + weight: torch.Tensor + scale_w: torch.Tensor + bias: torch.Tensor + def __init__( + self, + key: str, + gguf_loader: GGUFLoader, + config: PretrainedConfig, + orig_module: nn.Module = None, + device: str = "cuda", + block_size: int = 128, + **kwargs, + ): + super().__init__(key, gguf_loader, config, 
orig_module, device, **kwargs) + self.has_bias = False + self.dtype = torch.get_default_dtype() + self.block_size = block_size + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x.to(self.device) + orig_dtype = x.dtype + x_quantized, scale_x = act_quant(x, self.block_size) + y = fp8_gemm(x_quantized, scale_x, self.weight, self.weight_scale_inv) + return y.to(dtype=orig_dtype) + + def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None): + if device is None: device = self.device + if w is None: + w = self.load_weight(device=device) + ### TODO fit weight_inv format + if isinstance(w, tuple): + self.weight = w[0].to(device) + self.weight_scale_inv = w[1].to(device) + self.has_bias = False + else: + raise ValueError("Invalid weight type") + self.weight = self.weight.to(device) + if self.has_bias: + self.bias = self.bias.to(device) + + def unload(self): + if self.weight is not None: + self.weight = None + if self.has_bias: + self.bias = None + + class KLinearMarlin(KLinearBase): marlin_q_w: torch.Tensor marlin_s: torch.Tensor @@ -183,19 +256,36 @@ class KLinearMarlin(KLinearBase): self.group_size = group_size self.act_order = act_order self.is_k_full = is_k_full + self.padding = False + self.orin_in_features = self.in_features + self.orin_out_features = self.out_features + if self.in_features%GPTQ_MARLIN_MIN_THREAD_K!=0 or self.out_features%GPTQ_MARLIN_MIN_THREAD_K!=0: + #print(f"warning!, in_features={in_features} or out_features={out_features} is undivisible by GPTQ_MARLIN_MIN_THREAD_K={GPTQ_MARLIN_MIN_THREAD_K} and GPTQ_MARLIN_MIN_THREAD_N={GPTQ_MARLIN_MIN_THREAD_N}, padding") + self.padding = True + self.in_features = (self.in_features+GPTQ_MARLIN_MIN_THREAD_K-1)//GPTQ_MARLIN_MIN_THREAD_K*GPTQ_MARLIN_MIN_THREAD_K + self.out_features = (self.out_features+GPTQ_MARLIN_MIN_THREAD_N-1)//GPTQ_MARLIN_MIN_THREAD_N*GPTQ_MARLIN_MIN_THREAD_N + #print(f"After padding: in_features={in_features}, out_features={out_features}") + + self.k = self.in_features + self.n = self.out_features def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None): + if self.loaded: return if device is None: device = self.device assert device.lower() != "cpu", "Marlin quantized linear only supports GPU device" - if w is None: w = self.load_weight(device=device) + + #if self.in_features * self.out_features: + if w is None: + w = self.load_weight(device=device) if isinstance(w, nn.Parameter): # pad weight - weight = w.view(self.out_features, self.in_features).T + weight = w.view(self.orin_out_features, self.orin_in_features).T self.has_bias = False elif isinstance(w, tuple): w = list(w) - weight = w[0].view(self.out_features, self.in_features).T + weight = w[0].view(self.orin_out_features, self.orin_in_features).T + self.bias = w[1].view(self.orin_out_features) self.bias = w[1] self.has_bias = True else: @@ -203,19 +293,27 @@ class KLinearMarlin(KLinearBase): weight = weight.to(device) if self.has_bias: self.bias = self.bias.to(device) + + if self.padding: + padded_weight = torch.zeros(self.in_features, self.out_features, device=self.device) + padded_weight[:self.orin_in_features, :self.orin_out_features] = weight + weight = padded_weight + # Pack Marlin linear - w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, _ = marlin_quantize( + marlin_q_w, marlin_s, g_idx, sort_indices, _ = marlin_quantize( weight, self.num_bits, self.group_size, self.act_order ) self.workspace = MarlinWorkspace( self.out_features, GPTQ_MARLIN_MIN_THREAD_N, 
GPTQ_MARLIN_MAX_PARALLEL,self.device ) + self.weight = marlin_q_w # modeling_xxx.py may use linear.weight self.marlin_q_w = marlin_q_w self.marlin_s = marlin_s self.g_idx = g_idx self.sort_indices = sort_indices self.k = weight.shape[0] self.n = weight.shape[1] + self.loaded = True def forward(self, x: torch.Tensor) -> torch.Tensor: # Only support input x as BF16 and FP16 @@ -223,6 +321,11 @@ class KLinearMarlin(KLinearBase): orig_shape = list(x.shape) orig_dtype = x.dtype x = x.reshape(-1, orig_shape[-1]) + x = x.reshape(-1, x.shape[-1]) + if self.padding: + padding_input=torch.empty(x.shape[0], self.in_features, device=x.device, dtype=x.dtype) + padding_input[:,:self.orin_in_features] = x + x = padding_input marlin_s = self.marlin_s.to(x.dtype) x = KTransformersOps.gptq_marlin_gemm( x, @@ -237,9 +340,13 @@ class KLinearMarlin(KLinearBase): x.shape[-1], self.is_k_full, ) + if self.padding: + x = x[:,:self.orin_out_features] + orig_shape[-1] = self.orin_out_features + else: + orig_shape[-1] = self.out_features if self.has_bias: x = x + self.bias - orig_shape[-1] = self.n return x.reshape(orig_shape).to(orig_dtype) def unload(self): @@ -357,7 +464,8 @@ class KLinearCPUInfer(KLinearBase): LINEAR_MAP = { "KLinearMarlin": KLinearMarlin, "KLinearTorch": KLinearTorch, - "KLinearCPUInfer": KLinearCPUInfer + "KLinearCPUInfer": KLinearCPUInfer, + "KLinearFP8": KLinearFP8, } class KTransformersLinear(BaseInjectedModule, KLinearBase): @@ -374,29 +482,18 @@ class KTransformersLinear(BaseInjectedModule, KLinearBase): prefill_op: str| None = "KLinearTorch", **kwargs, ): - BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs) + BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs) KLinearBase.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs) # build all the linear operators if prefill_op is not None: assert prefill_op in LINEAR_MAP, f"linear_type {prefill_op} not supported" - if prefill_op == "KLinearMarlin" and (orig_module.in_features%GPTQ_MARLIN_MIN_THREAD_N!=0 or orig_module.out_features%GPTQ_MARLIN_MIN_THREAD_N!=0): - print(f"This linear module's in_features or out_features is not divisible by GPTQ_MARLIN_MIN_THREAD_N({GPTQ_MARLIN_MIN_THREAD_N}), using KLinearTorch instead.") - print(f"module info: key:{key} orig_module:{orig_module}") - self.prefill_linear = KLinearTorch(key, gguf_loader, config, orig_module, prefill_device, **kwargs) - else: - self.prefill_linear = LINEAR_MAP[prefill_op](key, gguf_loader, config, orig_module, prefill_device, **kwargs) + self.prefill_linear = LINEAR_MAP[prefill_op](key, gguf_loader, config, orig_module, prefill_device, **kwargs) else: self.prefill_linear = None if generate_op is not None: assert generate_op in LINEAR_MAP, f"linear_type {generate_op} not supported" - if generate_op == "KLinearMarlin" and (orig_module.in_features%GPTQ_MARLIN_MIN_THREAD_N!=0 or orig_module.out_features%GPTQ_MARLIN_MIN_THREAD_N!=0): - print(f"This linear module's in_features or out_features is not divisible by GPTQ_MARLIN_MIN_THREAD_N({GPTQ_MARLIN_MIN_THREAD_N}), using KLinearTorch instead.") - print(f"module info: key:{key} orig_module:{orig_module}") - self.generate_op = "KLinearTorch" - self.generate_linear = KLinearTorch(key, gguf_loader, config, orig_module, generate_device, **kwargs) - else: - self.generate_linear = LINEAR_MAP[generate_op](key, gguf_loader, config, orig_module, generate_device, **kwargs) + self.generate_linear = 
LINEAR_MAP[generate_op](key, gguf_loader, config, orig_module, generate_device, **kwargs) else: self.generate_linear = None self.mode = InferenceState.UNLOAD @@ -404,10 +501,11 @@ class KTransformersLinear(BaseInjectedModule, KLinearBase): def forward(self, x): if self.mode == InferenceState.PREFILL: assert self.prefill_linear is not None, "cpu linear is not initialized" - return self.prefill_linear.forward(x) + y = self.prefill_linear.forward(x) else: assert self.generate_linear is not None, "gpu linear is not initialized" - return self.generate_linear.forward(x) + y = self.generate_linear.forward(x) + return y def load(self, w: dict | nn.Parameter | tuple | None = None, mode: InferenceState = InferenceState.GENERATE): if not mode: @@ -416,11 +514,13 @@ class KTransformersLinear(BaseInjectedModule, KLinearBase): if mode == InferenceState.PREFILL: self.generate_linear.unload() self.prefill_linear.load(w=w) - self.device = self.prefill_linear.device + self.device = self.prefill_linear.device + self.weight = self.prefill_linear.weight # modeling_xxx.py may use linear.weight elif mode == InferenceState.GENERATE: self.prefill_linear.unload() self.generate_linear.load(w=w) self.device = self.generate_linear.device + self.weight = self.generate_linear.weight # modeling_xxx.py may use linear.weight elif mode == InferenceState.UNLOAD: self.prefill_linear.unload() self.generate_linear.unload() diff --git a/ktransformers/operators/models.py b/ktransformers/operators/models.py index 5d2e911..57d4bea 100644 --- a/ktransformers/operators/models.py +++ b/ktransformers/operators/models.py @@ -56,7 +56,7 @@ from ktransformers.models.modeling_deepseek import ( from transformers.models.qwen2_moe.configuration_qwen2_moe import Qwen2MoeConfig from ktransformers.models.configuration_llama import LlamaConfig from ktransformers.operators.base_operator import BaseInjectedModule -from ktransformers.util.utils import InferenceState +from ktransformers.util.utils import InferenceState, get_compute_capability from ktransformers.util.custom_gguf import GGUFLoader from transformers.configuration_utils import PretrainedConfig from ktransformers.models.modeling_llama import ( @@ -649,9 +649,14 @@ class KDeepseekV2Model(BaseInjectedModule): if per_layer_prefill_flag: causal_mask = None else: - causal_mask = self._update_causal_mask( - attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions - ) + if os.name == 'nt' or get_compute_capability()<8: + print("for Windows or GPU before ampere, use forward_windows") + # only use mask in forward windows or can't flash attn + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions + ) + else: + causal_mask = None # embed positions hidden_states = inputs_embeds diff --git a/ktransformers/operators/triton_attention.py b/ktransformers/operators/triton_attention.py new file mode 100644 index 0000000..4437520 --- /dev/null +++ b/ktransformers/operators/triton_attention.py @@ -0,0 +1,385 @@ +# Adapted from +# https://github.com/sgl-project/sglang/blob/9f635ea50de920aa507f486daafba26a5b837574/python/sglang/srt/layers/attention/triton_ops/decode_attention.py +# which was originally adapted from +# https://github.com/ModelTC/lightllm/blob/96353e868a840db4d103138caf15ed9dbea8c186/lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding_stage1.py +# https://github.com/ModelTC/lightllm/blob/96353e868a840db4d103138caf15ed9dbea8c186/lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding_stage2.py 
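# [Editor's note, illustrative only] Shape conventions for the grouped decode
# kernels below, as exercised by the reference check in flashinfer_wrapper.py:
#   q           [batch, num_heads, 576]            512 "nope" + 64 rope dims
#   k_buffer    [pages, page_size, 1, 576]         paged compressed KV + rope
#   v_buffer    [pages, page_size, 1, 512]         compressed KV only
#   o           [batch, num_heads, 512]
#   attn_logits [batch, num_heads, num_kv_splits, 512 + 1]
#               per-split partial output plus a running logsumexp slot
# Typical call (mirroring flashinfer_wrapper.py):
#   decode_attention_fwd_grouped(q, ckv_with_pe, ckv, out, page_table,
#                                kv_len_arr, attn_logits, num_kv_splits,
#                                sm_scale, page_size)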
+ +import triton +import triton.language as tl + +@triton.jit +def tanh(x): + # Tanh is just a scaled sigmoid + return 2 * tl.sigmoid(2 * x) - 1 + +@triton.jit +def _fwd_grouped_kernel_stage1( + Q, + K_Buffer, + V_Buffer, + sm_scale, + Req_to_tokens, + B_Seqlen, + Att_Out, + stride_req_to_tokens_b, + stride_qbs, + stride_qh, + stride_buf_kbs, + stride_buf_kh, + stride_buf_vbs, + stride_buf_vh, + stride_mid_ob, + stride_mid_oh, + stride_mid_os, + kv_group_num: tl.constexpr, + q_head_num: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_DPE: tl.constexpr, + BLOCK_DV: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_H: tl.constexpr, + NUM_KV_SPLITS: tl.constexpr, + PAGE_SIZE: tl.constexpr, + logit_cap: tl.constexpr, + Lk: tl.constexpr, + Lv: tl.constexpr, +): + cur_batch = tl.program_id(0) + cur_head_id = tl.program_id(1) + cur_kv_head = cur_head_id // tl.cdiv(kv_group_num, BLOCK_H) + split_kv_id = tl.program_id(2) + + if kv_group_num > BLOCK_H: + VALID_BLOCK_H: tl.constexpr = BLOCK_H + else: + VALID_BLOCK_H: tl.constexpr = kv_group_num + cur_head = cur_head_id * VALID_BLOCK_H + tl.arange(0, BLOCK_H) + mask_h = cur_head < (cur_head_id + 1) * VALID_BLOCK_H + mask_h = mask_h & (cur_head < q_head_num) + + offs_d = tl.arange(0, BLOCK_DMODEL) + offs_dv = tl.arange(0, BLOCK_DV) + mask_d = offs_d < Lk + mask_dv = offs_dv < Lv + cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) + cur_batch_req_idx = cur_batch + + offs_q = cur_batch * stride_qbs + cur_head[:, None] * stride_qh + offs_d[ + None, :] + q = tl.load(Q + offs_q, + mask=(mask_h[:, None]) & (mask_d[None, :]), + other=0.0) + + if BLOCK_DPE > 0: + offs_dpe = BLOCK_DMODEL + tl.arange(0, BLOCK_DPE) + mask_dpe = offs_dpe < Lk + off_qpe = (cur_batch * stride_qbs + cur_head[:, None] * stride_qh + + offs_dpe[None, :]) + qpe = tl.load(Q + off_qpe, + mask=(mask_h[:, None]) & (mask_dpe[None, :]), + other=0.0) + + kv_len_per_split = tl.cdiv(cur_batch_seq_len, NUM_KV_SPLITS) + split_kv_start = kv_len_per_split * split_kv_id + split_kv_end = tl.minimum(split_kv_start + kv_len_per_split, + cur_batch_seq_len) + + e_max = tl.zeros([BLOCK_H], dtype=tl.float32) - float("inf") + e_sum = tl.zeros([BLOCK_H], dtype=tl.float32) + acc = tl.zeros([BLOCK_H, BLOCK_DV], dtype=tl.float32) + + if split_kv_end > split_kv_start: + for start_n in range(split_kv_start, split_kv_end, BLOCK_N): + offs_n = start_n + tl.arange(0, BLOCK_N) + kv_page_number = tl.load( + Req_to_tokens + stride_req_to_tokens_b * cur_batch_req_idx + + offs_n // PAGE_SIZE, + mask=offs_n < split_kv_end, + other=0, + ) + kv_loc = kv_page_number * PAGE_SIZE + offs_n % PAGE_SIZE + offs_buf_k = (kv_loc[None, :] * stride_buf_kbs + + cur_kv_head * stride_buf_kh + offs_d[:, None]) + k = tl.load( + K_Buffer + offs_buf_k, + mask=(offs_n[None, :] < split_kv_end) & (mask_d[:, None]), + other=0.0, + ) + qk = tl.dot(q, k.to(q.dtype)) + + if BLOCK_DPE > 0: + offs_buf_kpe = (kv_loc[None, :] * stride_buf_kbs + + cur_kv_head * stride_buf_kh + + offs_dpe[:, None]) + kpe = tl.load( + K_Buffer + offs_buf_kpe, + mask=(offs_n[None, :] < split_kv_end) & + (mask_dpe[:, None]), + other=0.0, + ) + qk += tl.dot(qpe, kpe.to(qpe.dtype)) + qk *= sm_scale + + if logit_cap > 0: + qk = logit_cap * tanh(qk / logit_cap) + + qk = tl.where(mask_h[:, None] & (offs_n[None, :] < split_kv_end), + qk, float("-inf")) + + offs_buf_v = (kv_loc[:, None] * stride_buf_vbs + + cur_kv_head * stride_buf_vh + offs_dv[None, :]) + v = tl.load( + V_Buffer + offs_buf_v, + mask=(offs_n[:, None] < split_kv_end) & (mask_dv[None, :]), + other=0.0, + ) + + 
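            # [Editor's annotation] Online-softmax update for this KV block:
            # e_max / e_sum hold the running row max and denominator; when a new
            # block raises the max, the previous accumulator and sum are rescaled
            # by exp(e_max - n_e_max), so acc / e_sum equals softmax(qk) @ v over
            # the whole split without ever materializing the attention matrix.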
n_e_max = tl.maximum(tl.max(qk, 1), e_max) + re_scale = tl.exp(e_max - n_e_max) + p = tl.exp(qk - n_e_max[:, None]) + acc *= re_scale[:, None] + acc += tl.dot(p.to(v.dtype), v) + + e_sum = e_sum * re_scale + tl.sum(p, 1) + e_max = n_e_max + + offs_mid_o = (cur_batch * stride_mid_ob + + cur_head[:, None] * stride_mid_oh + + split_kv_id * stride_mid_os + offs_dv[None, :]) + + tl.store( + Att_Out + offs_mid_o, + acc / e_sum[:, None], + mask=(mask_h[:, None]) & (mask_dv[None, :]), + ) + + offs_mid_o_1 = (cur_batch * stride_mid_ob + cur_head * stride_mid_oh + + split_kv_id * stride_mid_os + Lv) + + tl.store( + Att_Out + offs_mid_o_1, + e_max + tl.log(e_sum), + mask=mask_h, + ) + +def _decode_grouped_att_m_fwd( + q, + k_buffer, + v_buffer, + att_out, + Req_to_tokens, + B_Seqlen, + num_kv_splits, + sm_scale, + page_size, + logit_cap, +): + BLOCK = 32 + Lk = k_buffer.shape[-1] + Lv = v_buffer.shape[-1] + + # [TODO] work around shmem limit on MI3xx + + # TODO: support hip + #if is_hip_ and Lk >= 576: + # BLOCK = 16 + + if Lk == 576: + BLOCK_DMODEL = 512 + BLOCK_DPE = 64 + elif Lk == 288: + BLOCK_DMODEL = 256 + BLOCK_DPE = 32 + else: + BLOCK_DMODEL = triton.next_power_of_2(Lk) + BLOCK_DPE = 0 + BLOCK_DV = triton.next_power_of_2(Lv) + + batch, head_num = q.shape[0], q.shape[1] + kv_group_num = q.shape[1] // k_buffer.shape[-2] + + BLOCK_H = 16 + NUM_KV_SPLITS = num_kv_splits + grid = ( + batch, + triton.cdiv(head_num, min(BLOCK_H, kv_group_num)), + NUM_KV_SPLITS, + ) + + extra_kargs = {} + # TODO: support hip + """ + if is_hip_: + # https://rocm.docs.amd.com/en/docs-6.2.0/how-to/llm-fine-tuning-optimization/optimizing-triton-kernel.html + # https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py + extra_kargs = { + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } + """ + + _fwd_grouped_kernel_stage1[grid]( + q, + k_buffer, + v_buffer, + sm_scale, + Req_to_tokens, + B_Seqlen, + att_out, + Req_to_tokens.stride(0), + q.stride(0), + q.stride(1), + k_buffer.stride(-3), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + k_buffer.stride(-2), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + v_buffer.stride(-3), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + v_buffer.stride(-2), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + att_out.stride(0), + att_out.stride(1), + att_out.stride(2), + kv_group_num=kv_group_num, + q_head_num=head_num, + BLOCK_DMODEL=BLOCK_DMODEL, + BLOCK_DPE=BLOCK_DPE, + BLOCK_DV=BLOCK_DV, + BLOCK_N=BLOCK, + BLOCK_H=BLOCK_H, + NUM_KV_SPLITS=NUM_KV_SPLITS, + PAGE_SIZE=page_size, + logit_cap=logit_cap, + num_warps=4, + num_stages=2, + Lk=Lk, + Lv=Lv, + **extra_kargs, + ) + +@triton.jit +def _fwd_kernel_stage2( + Mid_O, + o, + B_Seqlen, + stride_mid_ob, + stride_mid_oh, + stride_mid_os, + stride_obs, + stride_oh, + NUM_KV_SPLITS: tl.constexpr, + BLOCK_DV: tl.constexpr, + Lv: tl.constexpr, +): + cur_batch = tl.program_id(0) + cur_head = tl.program_id(1) + + cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) + + offs_d = tl.arange(0, BLOCK_DV) + mask_d = offs_d < Lv + + e_sum = 0.0 + e_max = -float("inf") + acc = tl.zeros([BLOCK_DV], dtype=tl.float32) + + offs_v = cur_batch * stride_mid_ob + cur_head * stride_mid_oh + offs_d + offs_logic = cur_batch * stride_mid_ob + cur_head * stride_mid_oh + Lv + + for split_kv_id in range(0, NUM_KV_SPLITS): + kv_len_per_split = tl.cdiv(cur_batch_seq_len, NUM_KV_SPLITS) + split_kv_start = kv_len_per_split * split_kv_id + split_kv_end = tl.minimum(split_kv_start + kv_len_per_split, + cur_batch_seq_len) + + if 
split_kv_end > split_kv_start: + tv = tl.load(Mid_O + offs_v + split_kv_id * stride_mid_os, + mask=mask_d, + other=0.0) + tlogic = tl.load(Mid_O + offs_logic + split_kv_id * stride_mid_os) + n_e_max = tl.maximum(tlogic, e_max) + + old_scale = tl.exp(e_max - n_e_max) + acc *= old_scale + exp_logic = tl.exp(tlogic - n_e_max) + acc += exp_logic * tv + + e_sum = e_sum * old_scale + exp_logic + e_max = n_e_max + + tl.store( + o + cur_batch * stride_obs + cur_head * stride_oh + offs_d, + acc / e_sum, + mask=mask_d, + ) + +def _decode_softmax_reducev_fwd( + logits, + q, + o, + v_buffer, + b_seq_len, + num_kv_splits, +): + batch, head_num = q.shape[0], q.shape[1] + Lv = v_buffer.shape[-1] + BLOCK_DV = triton.next_power_of_2(Lv) + + NUM_KV_SPLITS = num_kv_splits + + extra_kargs = {} + # TODO: support hip + """ + if is_hip_: + # https://rocm.docs.amd.com/en/docs-6.2.0/how-to/llm-fine-tuning-optimization/optimizing-triton-kernel.html + # https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py + extra_kargs = { + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } + """ + + grid = (batch, head_num) + _fwd_kernel_stage2[grid]( + logits, + o, + b_seq_len, + logits.stride(0), + logits.stride(1), + logits.stride(2), + o.stride(0), + o.stride(1), + NUM_KV_SPLITS=NUM_KV_SPLITS, + BLOCK_DV=BLOCK_DV, + Lv=Lv, + num_warps=4, + num_stages=2, + **extra_kargs, + ) + +def decode_attention_fwd_grouped( + q, + k_buffer, + v_buffer, + o, + req_to_token, + b_seq_len, + attn_logits, + num_kv_splits, + sm_scale, + page_size, + logit_cap=0.0, +): + _decode_grouped_att_m_fwd( + q, + k_buffer, + v_buffer, + attn_logits, + req_to_token, + b_seq_len, + num_kv_splits, + sm_scale, + page_size, + logit_cap, + ) + + _decode_softmax_reducev_fwd(attn_logits, q, o, v_buffer, b_seq_len, + num_kv_splits) diff --git a/ktransformers/optimize/optimize.py b/ktransformers/optimize/optimize.py index 32eab01..331e6cf 100644 --- a/ktransformers/optimize/optimize.py +++ b/ktransformers/optimize/optimize.py @@ -126,6 +126,8 @@ def optimize_and_load_gguf(module: nn.Module, rule_file: str, gguf_path: str, mo gguf_loader=GGUFLoader(gguf_path) with torch.device("meta"): inject(module, optimize_config, model_config, gguf_loader) + # pre load lm_head because its big inter result + load_weights(module.lm_head, gguf_loader, "lm_head.") load_weights(module, gguf_loader) module.gguf_loader = gguf_loader del_meta(module) diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu-4.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu-4.yaml index a87a30c..66a420a 100644 --- a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu-4.yaml +++ b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu-4.yaml @@ -219,8 +219,20 @@ kwargs: generate_device: "cuda:2" prefill_device: "cuda:2" + - match: - name: "(^model\\.layers\\.([5][0-9]|[4][5-9])\\.)|(^model.norm)|(^lm_head)" + name: "^lm_head" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda:3" + prefill_device: "cuda:3" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" + +- match: + name: "(^model\\.layers\\.([5][0-9]|[4][5-9])\\.)|(^model.norm)" replace: class: "default" kwargs: diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu.yaml index 269257e..f409376 100644 --- 
a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu.yaml +++ b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu.yaml @@ -118,7 +118,18 @@ prefill_device: "cuda:0" - match: - name: "(^model\\.layers\\.([345][0-9])\\.)|(model.norm)|(lm_head)" + name: "^lm_head" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" + +- match: + name: "(^model\\.layers\\.([345][0-9])\\.)|(model.norm)" replace: class: "default" kwargs: diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml index b115aba..7f3e44e 100644 --- a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml +++ b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml @@ -15,6 +15,18 @@ prefill_device: "cuda" generate_op: "KLinearMarlin" prefill_op: "KLinearTorch" + +- match: + name: "^lm_head" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda" + prefill_device: "cuda" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" + - match: name: "^model\\.layers\\..*\\.mlp$" class: ktransformers.models.modeling_deepseek.DeepseekV2MoE diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat-multi-gpu.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat-multi-gpu.yaml index 99d01c0..158892d 100644 --- a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat-multi-gpu.yaml +++ b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat-multi-gpu.yaml @@ -118,7 +118,18 @@ prefill_device: "cuda:0" - match: - name: "(^model\\.layers\\.([12][0-9])\\.)|(model.norm)|(lm_head)" + name: "^lm_head" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" + +- match: + name: "(^model\\.layers\\.([12][0-9])\\.)|(model.norm)" replace: class: "default" kwargs: diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat.yaml index b115aba..7f3e44e 100644 --- a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat.yaml +++ b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat.yaml @@ -15,6 +15,18 @@ prefill_device: "cuda" generate_op: "KLinearMarlin" prefill_op: "KLinearTorch" + +- match: + name: "^lm_head" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda" + prefill_device: "cuda" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" + - match: name: "^model\\.layers\\..*\\.mlp$" class: ktransformers.models.modeling_deepseek.DeepseekV2MoE diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-fp8-linear-ggml-experts.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-fp8-linear-ggml-experts.yaml new file mode 100644 index 0000000..25f021e --- /dev/null +++ b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-fp8-linear-ggml-experts.yaml @@ -0,0 +1,63 @@ +- match: + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 + kwargs: + generate_device: "cuda" + prefill_device: "cuda" +- match: + name: 
"^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression + class: torch.nn.Linear # only match modules matching name and class simultaneously + replace: + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types + kwargs: + generate_device: "cuda" + prefill_device: "cuda" + generate_op: "KLinearFP8" + prefill_op: "KLinearTorch" +- match: + name: "^model\\.layers\\..*\\.mlp$" + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE + replace: + class: ktransformers.operators.experts.KDeepseekV3MoE # mlp module with custom forward function + kwargs: + generate_device: "cuda" + prefill_device: "cuda" +- match: + class: ktransformers.models.modeling_deepseek_v3.MoEGate + replace: + class: ktransformers.operators.gate.KMoEGate + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" +- match: + name: "^model\\.layers\\..*\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism + kwargs: + prefill_device: "cuda" + prefill_op: "KExpertsTorch" + generate_device: "cpu" + generate_op: "KExpertsCPU" + out_device: "cuda" + recursive: False # don't recursively inject submodules of this module +- match: + name: "^model\\.layers\\..*\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation + kwargs: + generate_device: "cuda" + prefill_device: "cuda" +- match: + name: "^model$" + replace: + class: "ktransformers.operators.models.KDeepseekV2Model" + kwargs: + per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill +- match: + name: "^model.embed_tokens" + replace: + class: "default" + kwargs: + generate_device: "cpu" + prefill_device: "cpu" \ No newline at end of file diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml new file mode 100644 index 0000000..ea75b30 --- /dev/null +++ b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml @@ -0,0 +1,388 @@ +- match: + name: "^model.embed_tokens" + replace: + class: "default" + kwargs: + generate_device: "cpu" + prefill_device: "cpu" + +# === Rotary Embedding Replacement === + +# GPU 0: layers 0–14 +- match: + name: "^model\\.layers\\.([0-9]|1[0-4])\\." + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" + +# GPU 1: layers 15–29 +- match: + name: "^model\\.layers\\.(1[5-9]|2[0-9])\\." + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + +# GPU 2: layers 30–44 +- match: + name: "^model\\.layers\\.(3[0-9]|4[0-4])\\." + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 + kwargs: + generate_device: "cuda:2" + prefill_device: "cuda:2" + +# GPU 3: layers 45–60 +- match: + name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\." 
+ class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 + kwargs: + generate_device: "cuda:3" + prefill_device: "cuda:3" + +# === Linear Layers Replacement (excluding self_attn.kv_b_proj) === + +# GPU 0: layers 0–14 +- match: + name: "^model\\.layers\\.([0-9]|1[0-4])\\.(?!self_attn\\.kv_b_proj).*$" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" + +# GPU 1: layers 15–29 +- match: + name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.(?!self_attn\\.kv_b_proj).*$" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" + +# GPU 2: layers 30–44 +- match: + name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.(?!self_attn\\.kv_b_proj).*$" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda:2" + prefill_device: "cuda:2" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" + +# GPU 3: layers 45–60 +- match: + name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.(?!self_attn\\.kv_b_proj).*$" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda:3" + prefill_device: "cuda:3" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" + +# === MLP (MoE) Replacement === + +# GPU 0: layers 0–14 +- match: + name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp$" + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE + replace: + class: ktransformers.operators.experts.KDeepseekV3MoE + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" + +# GPU 1: layers 15–29 +- match: + name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp$" + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE + replace: + class: ktransformers.operators.experts.KDeepseekV3MoE + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + +# GPU 2: layers 30–44 +- match: + name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp$" + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE + replace: + class: ktransformers.operators.experts.KDeepseekV3MoE + kwargs: + generate_device: "cuda:2" + prefill_device: "cuda:2" + +# GPU 3: layers 45–60 +- match: + name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp$" + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE + replace: + class: ktransformers.operators.experts.KDeepseekV3MoE + kwargs: + generate_device: "cuda:3" + prefill_device: "cuda:3" + +# === MLP Gate Replacement === + +# GPU 0: layers 0–14 +- match: + name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp\\.gate$" + class: ktransformers.models.modeling_deepseek_v3.MoEGate + replace: + class: ktransformers.operators.gate.KMoEGate + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" + +# GPU 1: layers 15–29 +- match: + name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp\\.gate$" + class: ktransformers.models.modeling_deepseek_v3.MoEGate + replace: + class: ktransformers.operators.gate.KMoEGate + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + +# GPU 2: layers 30–44 +- match: + name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp\\.gate$" + class: ktransformers.models.modeling_deepseek_v3.MoEGate + replace: + class: 
ktransformers.operators.gate.KMoEGate + kwargs: + generate_device: "cuda:2" + prefill_device: "cuda:2" + +# GPU 3: layers 45–60 +- match: + name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp\\.gate$" + class: ktransformers.models.modeling_deepseek_v3.MoEGate + replace: + class: ktransformers.operators.gate.KMoEGate + kwargs: + generate_device: "cuda:3" + prefill_device: "cuda:3" + +# === MLP Experts Replacement === +# replace with marlin expert. Open and modify layer-num as needed. +# Each layer of malin experts takes about 6GB of GPU memory. +# !!!Do remember 'close' cuda graph if you are using marlin expert.!!! +# !!!KExpertsTorch is untested, we don't have enough VRAM.!!! + +# GPU 0: layers 3–4 +# - match: +# name: "^model\\.layers\\.([3-4])\\.mlp\\.experts$" +# replace: +# class: ktransformers.operators.experts.KTransformersExperts +# kwargs: +# generate_device: "cuda:0" +# generate_op: "KExpertsMarlin" +# recursive: False + +# # GPU 1: layers 15–17 +# - match: +# name: "^model\\.layers\\.(1[5-7])\\.mlp\\.experts$" +# replace: +# class: ktransformers.operators.experts.KTransformersExperts +# kwargs: +# generate_device: "cuda:1" +# generate_op: "KExpertsMarlin" +# recursive: False + +# # GPU 2: layers 30–32 +# - match: +# name: "^model\\.layers\\.(3[0-2])\\.mlp\\.experts$" +# replace: +# class: ktransformers.operators.experts.KTransformersExperts +# kwargs: +# generate_device: "cuda:2" +# generate_op: "KExpertsMarlin" +# recursive: False + +# # GPU 3: layers 45–46 +# - match: +# name: "^model\\.layers\\.(4[5-6])\\.mlp\\.experts$" +# replace: +# class: ktransformers.operators.experts.KTransformersExperts +# kwargs: +# generate_device: "cuda:3" +# generate_op: "KExpertsMarlin" +# recursive: False + + +# === MLP Experts Replacement === + +# GPU 0: layers 0–14 +- match: + name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts + kwargs: + prefill_device: "cuda:0" + prefill_op: "KExpertsTorch" + generate_device: "cpu" + generate_op: "KExpertsCPU" + out_device: "cuda:0" + recursive: False + +# GPU 1: layers 15–29 +- match: + name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts + kwargs: + prefill_device: "cuda:1" + prefill_op: "KExpertsTorch" + generate_device: "cpu" + generate_op: "KExpertsCPU" + out_device: "cuda:1" + recursive: False + +# GPU 2: layers 30–44 +- match: + name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts + kwargs: + prefill_device: "cuda:2" + prefill_op: "KExpertsTorch" + generate_device: "cpu" + generate_op: "KExpertsCPU" + out_device: "cuda:2" + recursive: False + +# GPU 3: layers 45–60 +- match: + name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts + kwargs: + prefill_device: "cuda:3" + prefill_op: "KExpertsTorch" + generate_device: "cpu" + generate_op: "KExpertsCPU" + out_device: "cuda:3" + recursive: False + +# === Self-Attention Replacement === + +# GPU 0: layers 0–14 +- match: + name: "^model\\.layers\\.([0-9]|1[0-4])\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" + absorb_for_prefill: False + +# GPU 1: layers 15–29 +- match: + name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.self_attn$" + replace: + class: 
ktransformers.operators.attention.KDeepseekV2Attention + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + absorb_for_prefill: False + +# GPU 2: layers 30–44 +- match: + name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention + kwargs: + generate_device: "cuda:2" + prefill_device: "cuda:2" + absorb_for_prefill: False + +# GPU 3: layers 45–60 +- match: + name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention + kwargs: + generate_device: "cuda:3" + prefill_device: "cuda:3" + absorb_for_prefill: False + +# === Overall Model Replacement with Transfer Map === + +- match: + name: "^model$" + replace: + class: "ktransformers.operators.models.KDeepseekV2Model" + kwargs: + per_layer_prefill_intput_threshold: 0 # 0 means close layer‐wise prefill + transfer_map: + 15: "cuda:1" # Layers 15+ on GPU 1 + 30: "cuda:2" # Layers 30+ on GPU 2 + 45: "cuda:3" # Layers 45+ on GPU 3 + +# === Default Catch-All for Other Modules === + +# GPU 0: layers 0–14 +- match: + name: "^model\\.layers\\.([0-9]|1[0-4])\\." + replace: + class: "default" + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" + +# GPU 1: layers 15–29 +- match: + name: "^model\\.layers\\.(1[5-9]|2[0-9])\\." + replace: + class: "default" + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + +# GPU 2: layers 30–44 +- match: + name: "^model\\.layers\\.(3[0-9]|4[0-4])\\." + replace: + class: "default" + kwargs: + generate_device: "cuda:2" + prefill_device: "cuda:2" + +- match: + name: "^lm_head" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda:3" + prefill_device: "cuda:3" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" + +# For final modules (model.norm), ensure they are on GPU 3 (as in your original config) +- match: + name: "(^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.)|(^model\\.norm)" + replace: + class: "default" + kwargs: + generate_device: "cuda:3" + prefill_device: "cuda:3" diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-8.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-8.yaml new file mode 100644 index 0000000..b00d2b4 --- /dev/null +++ b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-8.yaml @@ -0,0 +1,734 @@ +- match: + name: "^model.embed_tokens" + replace: + class: "default" + kwargs: + generate_device: "cpu" + prefill_device: "cpu" + +# === Rotary Embedding Replacement === + +# GPU 0: layers 0–7 +- match: + name: "^model\\.layers\\.([0-7])\\." + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" + +# GPU 1: layers 8–15 +- match: + name: "^model\\.layers\\.(8|9|1[0-5])\\." + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + +# GPU 2: layers 16–23 +- match: + name: "^model\\.layers\\.(1[6-9]|2[0-3])\\." 
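  # [Editor's note] This file splits the 61 decoder layers (0-60) into blocks of
  # roughly eight per GPU; the same layer-range alternation is repeated across
  # each per-GPU section (rotary embedding, linear, MoE mlp, gate, ...), so edit
  # them together.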
+ class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 + kwargs: + generate_device: "cuda:2" + prefill_device: "cuda:2" + +# GPU 3: layers 24–31 +- match: + name: "^model\\.layers\\.(2[4-9]|3[0-1])\\." + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 + kwargs: + generate_device: "cuda:3" + prefill_device: "cuda:3" + +# GPU 4: layers 32–39 +- match: + name: "^model\\.layers\\.([3][2-9])\\." + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 + kwargs: + generate_device: "cuda:4" + prefill_device: "cuda:4" + +# GPU 5: layers 40–47 +- match: + name: "^model\\.layers\\.(4[0-7])\\." + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 + kwargs: + generate_device: "cuda:5" + prefill_device: "cuda:5" + +# GPU 6: layers 48–55 +- match: + name: "^model\\.layers\\.(4[8-9]|5[0-5])\\." + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 + kwargs: + generate_device: "cuda:6" + prefill_device: "cuda:6" + +# GPU 7: layers 56–60 +- match: + name: "^model\\.layers\\.(5[6-9]|60)\\." + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 + kwargs: + generate_device: "cuda:7" + prefill_device: "cuda:7" + + +# === Linear Layers Replacement (excluding self_attn.kv_b_proj) === + +# GPU 0: layers 0–7 +- match: + name: "^model\\.layers\\.([0-7])\\.(?!self_attn\\.kv_b_proj).*$" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" + +# GPU 1: layers 8–15 +- match: + name: "^model\\.layers\\.(8|9|1[0-5])\\.(?!self_attn\\.kv_b_proj).*$" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" + +# GPU 2: layers 16–23 +- match: + name: "^model\\.layers\\.(1[6-9]|2[0-3])\\.(?!self_attn\\.kv_b_proj).*$" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda:2" + prefill_device: "cuda:2" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" + +# GPU 3: layers 24–31 +- match: + name: "^model\\.layers\\.(2[4-9]|3[0-1])\\.(?!self_attn\\.kv_b_proj).*$" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda:3" + prefill_device: "cuda:3" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" + +# GPU 4: layers 32–39 +- match: + name: "^model\\.layers\\.(3[2-9])\\.(?!self_attn\\.kv_b_proj).*$" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda:4" + prefill_device: "cuda:4" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" + +# GPU 5: layers 40–47 +- match: + name: "^model\\.layers\\.(4[0-7])\\.(?!self_attn\\.kv_b_proj).*$" + class: torch.nn.Linear + replace: + class: 
ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda:5" + prefill_device: "cuda:5" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" + +# GPU 6: layers 48–55 +- match: + name: "^model\\.layers\\.(4[8-9]|5[0-5])\\.(?!self_attn\\.kv_b_proj).*$" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda:6" + prefill_device: "cuda:6" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" + +# GPU 7: layers 56–63 +- match: + name: "^model\\.layers\\.(5[6-9]|60)\\.(?!self_attn\\.kv_b_proj).*$" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda:7" + prefill_device: "cuda:7" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" + + + +# === MLP (MoE) Replacement === + +# GPU 0: layers 0–7 +- match: + name: "^model\\.layers\\.([0-7])\\.mlp$" + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE + replace: + class: ktransformers.operators.experts.KDeepseekV3MoE + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" + +# GPU 1: layers 8–15 +- match: + name: "^model\\.layers\\.(8|9|1[0-5])\\.mlp$" + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE + replace: + class: ktransformers.operators.experts.KDeepseekV3MoE + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + +# GPU 2: layers 16–23 +- match: + name: "^model\\.layers\\.(1[6-9]|2[0-3])\\.mlp$" + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE + replace: + class: ktransformers.operators.experts.KDeepseekV3MoE + kwargs: + generate_device: "cuda:2" + prefill_device: "cuda:2" + +# GPU 3: layers 24–31 +- match: + name: "^model\\.layers\\.(2[4-9]|3[0-1])\\.mlp$" + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE + replace: + class: ktransformers.operators.experts.KDeepseekV3MoE + kwargs: + generate_device: "cuda:3" + prefill_device: "cuda:3" + +# GPU 4: layers 32–39 +- match: + name: "^model\\.layers\\.(3[2-9])\\.mlp$" + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE + replace: + class: ktransformers.operators.experts.KDeepseekV3MoE + kwargs: + generate_device: "cuda:4" + prefill_device: "cuda:4" + +# GPU 5: layers 40–47 +- match: + name: "^model\\.layers\\.(4[0-7])\\.mlp$" + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE + replace: + class: ktransformers.operators.experts.KDeepseekV3MoE + kwargs: + generate_device: "cuda:5" + prefill_device: "cuda:5" + +# GPU 6: layers 48–55 +- match: + name: "^model\\.layers\\.(4[8-9]|5[0-5])\\.mlp$" + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE + replace: + class: ktransformers.operators.experts.KDeepseekV3MoE + kwargs: + generate_device: "cuda:6" + prefill_device: "cuda:6" + +# GPU 7: layers 56–60 +- match: + name: "^model\\.layers\\.(5[6-9]|60)\\.mlp$" + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE + replace: + class: ktransformers.operators.experts.KDeepseekV3MoE + kwargs: + generate_device: "cuda:7" + prefill_device: "cuda:7" + +# === MLP Gate Replacement === + +# GPU 0: layers 0–7 +- match: + name: "^model\\.layers\\.([0-7])\\.mlp\\.gate$" + class: ktransformers.models.modeling_deepseek_v3.MoEGate + replace: + class: ktransformers.operators.gate.KMoEGate + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" + +# GPU 1: layers 8–15 +- match: + name: "^model\\.layers\\.(8|9|1[0-5])\\.mlp\\.gate$" + class: ktransformers.models.modeling_deepseek_v3.MoEGate + 
  replace:
+    class: ktransformers.operators.gate.KMoEGate
+    kwargs:
+      generate_device: "cuda:1"
+      prefill_device: "cuda:1"
+
+# GPU 2: layers 16–23
+- match:
+    name: "^model\\.layers\\.(1[6-9]|2[0-3])\\.mlp\\.gate$"
+    class: ktransformers.models.modeling_deepseek_v3.MoEGate
+  replace:
+    class: ktransformers.operators.gate.KMoEGate
+    kwargs:
+      generate_device: "cuda:2"
+      prefill_device: "cuda:2"
+
+# GPU 3: layers 24–31
+- match:
+    name: "^model\\.layers\\.(2[4-9]|3[0-1])\\.mlp\\.gate$"
+    class: ktransformers.models.modeling_deepseek_v3.MoEGate
+  replace:
+    class: ktransformers.operators.gate.KMoEGate
+    kwargs:
+      generate_device: "cuda:3"
+      prefill_device: "cuda:3"
+
+# GPU 4: layers 32–39
+- match:
+    name: "^model\\.layers\\.(3[2-9])\\.mlp\\.gate$"
+    class: ktransformers.models.modeling_deepseek_v3.MoEGate
+  replace:
+    class: ktransformers.operators.gate.KMoEGate
+    kwargs:
+      generate_device: "cuda:4"
+      prefill_device: "cuda:4"
+
+# GPU 5: layers 40–47
+- match:
+    name: "^model\\.layers\\.(4[0-7])\\.mlp\\.gate$"
+    class: ktransformers.models.modeling_deepseek_v3.MoEGate
+  replace:
+    class: ktransformers.operators.gate.KMoEGate
+    kwargs:
+      generate_device: "cuda:5"
+      prefill_device: "cuda:5"
+
+# GPU 6: layers 48–55
+- match:
+    name: "^model\\.layers\\.(4[8-9]|5[0-5])\\.mlp\\.gate$"
+    class: ktransformers.models.modeling_deepseek_v3.MoEGate
+  replace:
+    class: ktransformers.operators.gate.KMoEGate
+    kwargs:
+      generate_device: "cuda:6"
+      prefill_device: "cuda:6"
+
+# GPU 7: layers 56–60
+- match:
+    name: "^model\\.layers\\.(5[6-9]|60)\\.mlp\\.gate$"
+    class: ktransformers.models.modeling_deepseek_v3.MoEGate
+  replace:
+    class: ktransformers.operators.gate.KMoEGate
+    kwargs:
+      generate_device: "cuda:7"
+      prefill_device: "cuda:7"
+
+
+# === MLP Experts Replacement ===
+# Replace experts with Marlin experts. Uncomment and adjust the layer ranges as needed.
+# Each layer of Marlin experts takes about 6GB of GPU memory.
+# !!!Do remember to disable CUDA graph if you are using Marlin experts.!!!
+# !!!Loading Marlin experts will take significant time.!!!
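For reference, the 6GB-per-layer figure quoted above turns the Marlin-vs-CPU placement decision into simple arithmetic. The sketch below is illustrative only and not part of this PR; the 14 GB reserve for non-expert weights and KV cache is an assumed placeholder. The commented-out rules that follow show where such per-GPU Marlin overrides would go.

```python
# Rough VRAM budgeting for KExpertsMarlin placement -- illustrative only.
# Assumes ~6 GB of VRAM per layer of Marlin experts (the figure quoted in the
# comment above) plus a hypothetical fixed reserve for everything else.
def marlin_expert_layer_budget(total_vram_gb: float,
                               reserved_gb: float = 14.0,   # assumed placeholder
                               gb_per_layer: float = 6.0) -> int:
    """How many layers' experts could be promoted from CPU to Marlin."""
    return max(0, int((total_vram_gb - reserved_gb) // gb_per_layer))

print(marlin_expert_layer_budget(24))   # 24 GB card -> 1 layer
print(marlin_expert_layer_budget(80))   # 80 GB card -> 11 layers
```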
+
+# GPU 0: layers 0–7
+# - match:
+#     name: "^model\\.layers\\.([0-7])\\.mlp\\.experts$" # inject experts in layers 0~7 as marlin experts
+#   replace:
+#     class: ktransformers.operators.experts.KTransformersExperts
+#     kwargs:
+#       generate_device: "cuda:0"
+#       generate_op: "KExpertsMarlin"
+#   recursive: False
+
+# # GPU 1: layers 8–15
+# - match:
+#     name: "^model\\.layers\\.(8|9|1[0-5])\\.mlp\\.experts$" # inject experts in layers 8~15 as marlin experts
+#   replace:
+#     class: ktransformers.operators.experts.KTransformersExperts
+#     kwargs:
+#       generate_device: "cuda:1"
+#       generate_op: "KExpertsMarlin"
+#   recursive: False
+
+# # GPU 2: layers 16–23
+# - match:
+#     name: "^model\\.layers\\.(1[6-9]|2[0-3])\\.mlp\\.experts$" # inject experts in layers 16~23 as marlin experts
+#   replace:
+#     class: ktransformers.operators.experts.KTransformersExperts
+#     kwargs:
+#       generate_device: "cuda:2"
+#       generate_op: "KExpertsMarlin"
+#   recursive: False
+
+# # GPU 3: layers 24–31
+# - match:
+#     name: "^model\\.layers\\.(2[4-9]|3[0-1])\\.mlp\\.experts$" # inject experts in layers 24~31 as marlin experts
+#   replace:
+#     class: ktransformers.operators.experts.KTransformersExperts
+#     kwargs:
+#       generate_device: "cuda:3"
+#       generate_op: "KExpertsMarlin"
+#   recursive: False
+
+# # GPU 4: layers 32–39
+# - match:
+#     name: "^model\\.layers\\.(3[2-9])\\.mlp\\.experts$" # inject experts in layers 32~39 as marlin experts
+#   replace:
+#     class: ktransformers.operators.experts.KTransformersExperts
+#     kwargs:
+#       generate_device: "cuda:4"
+#       generate_op: "KExpertsMarlin"
+#   recursive: False
+
+# # GPU 5: layers 40–47
+# - match:
+#     name: "^model\\.layers\\.(4[0-7])\\.mlp\\.experts$" # inject experts in layers 40~47 as marlin experts
+#   replace:
+#     class: ktransformers.operators.experts.KTransformersExperts
+#     kwargs:
+#       generate_device: "cuda:5"
+#       generate_op: "KExpertsMarlin"
+#   recursive: False
+
+# # GPU 6: layers 48–55
+# - match:
+#     name: "^model\\.layers\\.(4[8-9]|5[0-5])\\.mlp\\.experts$" # inject experts in layers 48~55 as marlin experts
+#   replace:
+#     class: ktransformers.operators.experts.KTransformersExperts
+#     kwargs:
+#       generate_device: "cuda:6"
+#       generate_op: "KExpertsMarlin"
+#   recursive: False
+
+# # GPU 7: layers 56–60
+# - match:
+#     name: "^model\\.layers\\.(5[6-9]|60)\\.mlp\\.experts$" # inject experts in layers 56~60 as marlin experts
+#   replace:
+#     class: ktransformers.operators.experts.KTransformersExperts
+#     kwargs:
+#       generate_device: "cuda:7"
+#       generate_op: "KExpertsMarlin"
+#   recursive: False
+
+
+# === MLP Experts Replacement ===
+
+# GPU 0: layers 0–7
+- match:
+    name: "^model\\.layers\\.([0-7])\\.mlp\\.experts$"
+  replace:
+    class: ktransformers.operators.experts.KTransformersExperts
+    kwargs:
+      prefill_device: "cuda:0"
+      prefill_op: "KExpertsTorch"
+      generate_device: "cpu"
+      generate_op: "KExpertsCPU"
+      out_device: "cuda:0"
+  recursive: False
+
+# GPU 1: layers 8–15
+- match:
+    name: "^model\\.layers\\.(8|9|1[0-5])\\.mlp\\.experts$"
+  replace:
+    class: ktransformers.operators.experts.KTransformersExperts
+    kwargs:
+      prefill_device: "cuda:1"
+      prefill_op: "KExpertsTorch"
+      generate_device: "cpu"
+      generate_op: "KExpertsCPU"
+      out_device: "cuda:1"
+  recursive: False
+
+# GPU 2: layers 16–23
+- match:
+    name: "^model\\.layers\\.(1[6-9]|2[0-3])\\.mlp\\.experts$"
+  replace:
+    class: ktransformers.operators.experts.KTransformersExperts
+    kwargs:
+      prefill_device: "cuda:2"
+      prefill_op: "KExpertsTorch"
+      generate_device: "cpu"
+      generate_op: "KExpertsCPU"
+      out_device: "cuda:2"
+  recursive: False
+
+# GPU 3: layers 24–31
+-
match: + name: "^model\\.layers\\.(2[4-9]|3[0-1])\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts + kwargs: + prefill_device: "cuda:3" + prefill_op: "KExpertsTorch" + generate_device: "cpu" + generate_op: "KExpertsCPU" + out_device: "cuda:3" + recursive: False + +# GPU 4: layers 32–39 +- match: + name: "^model\\.layers\\.(3[2-9])\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts + kwargs: + prefill_device: "cuda:4" + prefill_op: "KExpertsTorch" + generate_device: "cpu" + generate_op: "KExpertsCPU" + out_device: "cuda:4" + recursive: False + +# GPU 5: layers 40–47 +- match: + name: "^model\\.layers\\.(4[0-7])\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts + kwargs: + prefill_device: "cuda:5" + prefill_op: "KExpertsTorch" + generate_device: "cpu" + generate_op: "KExpertsCPU" + out_device: "cuda:5" + recursive: False + +# GPU 6: layers 48–55 +- match: + name: "^model\\.layers\\.(4[8-9]|5[0-5])\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts + kwargs: + prefill_device: "cuda:6" + prefill_op: "KExpertsTorch" + generate_device: "cpu" + generate_op: "KExpertsCPU" + out_device: "cuda:6" + recursive: False + +# GPU 7: layers 56–60 +- match: + name: "^model\\.layers\\.(5[6-9]|60)\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts + kwargs: + prefill_device: "cuda:7" + prefill_op: "KExpertsTorch" + generate_device: "cpu" + generate_op: "KExpertsCPU" + out_device: "cuda:7" + recursive: False + + +# === Self-Attention Replacement === + +# GPU 0: layers 0–7 +- match: + name: "^model\\.layers\\.([0-7])\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" + +# GPU 1: layers 8–15 +- match: + name: "^model\\.layers\\.(8|9|1[0-5])\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + +# GPU 2: layers 16–23 +- match: + name: "^model\\.layers\\.(1[6-9]|2[0-3])\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention + kwargs: + generate_device: "cuda:2" + prefill_device: "cuda:2" + +# GPU 3: layers 24–31 +- match: + name: "^model\\.layers\\.(2[4-9]|3[0-1])\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention + kwargs: + generate_device: "cuda:3" + prefill_device: "cuda:3" + +# GPU 4: layers 32–39 +- match: + name: "^model\\.layers\\.(3[2-9])\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention + kwargs: + generate_device: "cuda:4" + prefill_device: "cuda:4" + +# GPU 5: layers 40–47 +- match: + name: "^model\\.layers\\.(4[0-7])\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention + kwargs: + generate_device: "cuda:5" + prefill_device: "cuda:5" + +# GPU 6: layers 48–55 +- match: + name: "^model\\.layers\\.(4[8-9]|5[0-5])\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention + kwargs: + generate_device: "cuda:6" + prefill_device: "cuda:6" + +# GPU 7: layers 56–60 +- match: + name: "^model\\.layers\\.(5[6-9]|60)\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention + kwargs: + generate_device: "cuda:7" + prefill_device: "cuda:7" + +# === Overall Model Replacement with Transfer Map === + +- match: + 
name: "^model$" + replace: + class: "ktransformers.operators.models.KDeepseekV2Model" + kwargs: + per_layer_prefill_intput_threshold: 0 # 0 means close layer‐wise prefill + transfer_map: + 8: "cuda:1" + 16: "cuda:2" + 24: "cuda:3" + 32: "cuda:4" + 40: "cuda:5" + 48: "cuda:6" + 56: "cuda:7" + +# === Default Catch-All for Other Modules === + +# GPU 0: layers 0–7 +- match: + name: "^model\\.layers\\.([0-7])\\." + replace: + class: "default" + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" + +# GPU 1: layers 8–15 +- match: + name: "^model\\.layers\\.(8|9|1[0-5])\\." + replace: + class: "default" + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + +# GPU 2: layers 16–23 +- match: + name: "^model\\.layers\\.(1[6-9]|2[0-3])\\." + replace: + class: "default" + kwargs: + generate_device: "cuda:2" + prefill_device: "cuda:2" + +# GPU 3: layers 24–31 +- match: + name: "^model\\.layers\\.(2[4-9]|3[0-1])\\." + replace: + class: "default" + kwargs: + generate_device: "cuda:3" + prefill_device: "cuda:3" + +# GPU 4: layers 32–39 +- match: + name: "^model\\.layers\\.(3[2-9])\\." + replace: + class: "default" + kwargs: + generate_device: "cuda:4" + prefill_device: "cuda:4" + +# GPU 5: layers 40–47 +- match: + name: "^model\\.layers\\.(4[0-7])\\." + replace: + class: "default" + kwargs: + generate_device: "cuda:5" + prefill_device: "cuda:5" + +# GPU 6: layers 48–55 +- match: + name: "^model\\.layers\\.(4[8-9]|5[0-5])\\." + replace: + class: "default" + kwargs: + generate_device: "cuda:6" + prefill_device: "cuda:6" + +# GPU 7: layers 56–63 +- match: + name: "^model\\.layers\\.(5[6-9]|60)\\." + replace: + class: "default" + kwargs: + generate_device: "cuda:7" + prefill_device: "cuda:7" + +- match: + name: "^lm_head" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda:7" + prefill_device: "cuda:7" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" + +# For final modules (model.norm), ensure they are on GPU 7 (as in your original config) +- match: + name: "(^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.)|(^model\\.norm)" + replace: + class: "default" + kwargs: + generate_device: "cuda:7" + prefill_device: "cuda:7" \ No newline at end of file diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-fp8-linear-ggml-experts.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-fp8-linear-ggml-experts.yaml new file mode 100644 index 0000000..fa8c03d --- /dev/null +++ b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-fp8-linear-ggml-experts.yaml @@ -0,0 +1,157 @@ +- match: + name: "^model.embed_tokens" + replace: + class: "default" + kwargs: + generate_device: "cpu" + prefill_device: "cpu" + +- match: + name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\." + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" +- match: + name: "^model\\.layers\\.([3456][0-9])\\." 
+ class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + +- match: + name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.(?!self_attn\\.kv_b_proj).*$" # regular expression + class: torch.nn.Linear # only match modules matching name and class simultaneously + replace: + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" + generate_op: "KLinearFP8" + prefill_op: "KLinearTorch" + +- match: + name: "^model\\.layers\\.([3456][0-9])\\.(?!self_attn\\.kv_b_proj).*$" # regular expression + class: torch.nn.Linear # only match modules matching name and class simultaneously + replace: + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + generate_op: "KLinearFP8" + prefill_op: "KLinearTorch" + +- match: + name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp$" + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE + replace: + class: ktransformers.operators.experts.KDeepseekV3MoE # mlp module with custom forward function + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" +- match: + name: "^model\\.layers\\.([3456][0-9])\\.mlp$" + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE + replace: + class: ktransformers.operators.experts.KDeepseekV3MoE # mlp module with custom forward function + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + +- match: + name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.gate$" + class: ktransformers.models.modeling_deepseek_v3.MoEGate + replace: + class: ktransformers.operators.gate.KMoEGate + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" +- match: + name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.gate$" + class: ktransformers.models.modeling_deepseek_v3.MoEGate + replace: + class: ktransformers.operators.gate.KMoEGate # mlp module with custom forward function + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + +- match: + name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism + kwargs: + prefill_device: "cuda:0" + prefill_op: "KExpertsTorch" + generate_device: "cpu" + generate_op: "KExpertsCPU" + out_device: "cuda:0" + recursive: False # don't recursively inject submodules of this module + +- match: + name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism + kwargs: + prefill_device: "cuda:1" + prefill_op: "KExpertsTorch" + generate_device: "cpu" + generate_op: "KExpertsCPU" + out_device: "cuda:1" + recursive: False # don't recursively inject submodules of this module + +- match: + name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" + absorb_for_prefill: False # change this to True to enable long context(prefill may slower). 
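The layer ranges in these rule files are encoded as hand-written regex alternations, for example `(0|[1-9]|[12][0-9])` for layers 0–29 and `([3456][0-9])` for layers 30–60 (the latter also matches 61–69, which do not exist in this model, so it is harmless in practice). A small helper like the one below, which is not part of the repository, can generate an explicit equivalent when adapting the split to a different GPU count.

```python
# Hypothetical helper -- build a layer-range pattern for the "name" field of
# an optimize rule. Shown for illustration only, not part of ktransformers.
def layer_range_pattern(lo: int, hi: int) -> str:
    alternation = "|".join(str(i) for i in range(lo, hi + 1))
    # Note: inside the double-quoted YAML strings above, each backslash is doubled.
    return rf"^model\.layers\.({alternation})\."

print(layer_range_pattern(30, 60))
# -> ^model\.layers\.(30|31|...|60)\.   (abbreviated)
```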
+ +- match: + name: "^model\\.layers\\.([3456][0-9])\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + absorb_for_prefill: False # change this to True to enable long context(prefill may slower). + +- match: + name: "^model$" + replace: + class: "ktransformers.operators.models.KDeepseekV2Model" + kwargs: + per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill + transfer_map: + 30: "cuda:1" + +- match: + name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\." + replace: + class: "default" + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" + +- match: + name: "^lm_head" + class: torch.nn.Linear + replace: + class: "default" + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + + +- match: + name: "(^model\\.layers\\.([3456][0-9])\\.)|(model.norm)" + replace: + class: "default" + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-marlin.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-marlin.yaml index 92571b5..e04c6ce 100644 --- a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-marlin.yaml +++ b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-marlin.yaml @@ -153,9 +153,20 @@ prefill_device: "cuda:0" - match: - name: "(^model\\.layers\\.([3456][0-9])\\.)|(model.norm)|(lm_head)" + name: "^lm_head" + class: torch.nn.Linear replace: - class: "default" + class: ktransformers.operators.linear.KTransformersLinear kwargs: generate_device: "cuda:0" prefill_device: "cuda:0" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" + +- match: + name: "(^model\\.layers\\.([3456][0-9])\\.)|(model.norm)" + replace: + class: "default" + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml index 06ab4db..50e282d 100644 --- a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml +++ b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml @@ -135,7 +135,18 @@ prefill_device: "cuda:0" - match: - name: "(^model\\.layers\\.([3456][0-9])\\.)|(model.norm)|(lm_head)" + name: "^lm_head" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" + +- match: + name: "(^model\\.layers\\.([3456][0-9])\\.)|(model.norm)" replace: class: "default" kwargs: diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml index 7a44c5d..d28e016 100644 --- a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml +++ b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml @@ -5,6 +5,18 @@ kwargs: generate_device: "cuda" prefill_device: "cuda" + +- match: + name: "^lm_head$" # regular expression + class: torch.nn.Linear # only match modules matching name and class simultaneously + replace: + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types + kwargs: + generate_device: "cuda" + prefill_device: "cuda" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" + - match: name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # 
regular expression class: torch.nn.Linear # only match modules matching name and class simultaneously @@ -48,6 +60,7 @@ kwargs: generate_device: "cuda" prefill_device: "cuda" + absorb_for_prefill: False # change this to True to enable long context(prefill may slower). - match: name: "^model$" replace: diff --git a/ktransformers/optimize/optimize_rules/Mixtral.yaml b/ktransformers/optimize/optimize_rules/Mixtral.yaml index 7d48812..80a346a 100644 --- a/ktransformers/optimize/optimize_rules/Mixtral.yaml +++ b/ktransformers/optimize/optimize_rules/Mixtral.yaml @@ -15,6 +15,16 @@ prefill_device: "cuda" generate_op: "KLinearMarlin" prefill_op: "KLinearTorch" +- match: + name: "^lm_head" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda" + prefill_device: "cuda" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" - match: name: "^model\\.layers\\..*\\.block_sparse_moe$" class: ktransformers.models.modeling_mixtral.MixtralSparseMoeBlock diff --git a/ktransformers/optimize/optimize_rules/Moonlight-16B-A3B.yaml b/ktransformers/optimize/optimize_rules/Moonlight-16B-A3B.yaml new file mode 100644 index 0000000..6cea246 --- /dev/null +++ b/ktransformers/optimize/optimize_rules/Moonlight-16B-A3B.yaml @@ -0,0 +1,86 @@ +- match: + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding + replace: + class: ktransformers.operators.RoPE.RotaryEmbeddingV3 + kwargs: + generate_device: "cuda" + prefill_device: "cuda" + +- match: + name: "^lm_head$" # regular expression + class: torch.nn.Linear # only match modules matching name and class simultaneously + replace: + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types + kwargs: + generate_device: "cuda" + prefill_device: "cuda" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" + +- match: + name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression + class: torch.nn.Linear # only match modules matching name and class simultaneously + replace: + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types + kwargs: + generate_device: "cuda" + prefill_device: "cuda" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" +- match: + name: "^model\\.layers\\..*\\.mlp$" + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE + replace: + class: ktransformers.operators.experts.KDeepseekV3MoE # mlp module with custom forward function + kwargs: + generate_device: "cuda" + prefill_device: "cuda" +- match: + class: ktransformers.models.modeling_deepseek_v3.MoEGate + replace: + class: ktransformers.operators.gate.KMoEGate + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" +- match: + name: "^model\\.layers\\..*\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism + kwargs: + prefill_device: "cuda" + prefill_op: "KExpertsTorch" + generate_device: "cpu" + generate_op: "KExpertsCPU" + out_device: "cuda" + recursive: False # don't recursively inject submodules of this module +# if want to use more VRAM, use experts Marlin and disable CUDA Graph(disable CUDA Graph may cause low performance) +#- match: +# name: "^model\\.layers\\..*\\.mlp\\.experts$" +# replace: +# class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism +# kwargs: +# prefill_device: "cuda" +# prefill_op: "KExpertsTorch" +# 
generate_device: "cuda" +# generate_op: "KExpertsMarlin" +# recursive: False # don't recursively inject submodules of this module +- match: + name: "^model\\.layers\\..*\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation + kwargs: + generate_device: "cuda" + prefill_device: "cuda" +- match: + name: "^model$" + replace: + class: "ktransformers.operators.models.KDeepseekV2Model" + kwargs: + per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill +- match: + name: "^model.embed_tokens" + replace: + class: "default" + kwargs: + generate_device: "cpu" + prefill_device: "cpu" \ No newline at end of file diff --git a/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct-multi-gpu.yaml b/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct-multi-gpu.yaml index da4fb4a..da01c82 100644 --- a/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct-multi-gpu.yaml +++ b/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct-multi-gpu.yaml @@ -77,9 +77,19 @@ kwargs: generate_device: "cpu" prefill_device: "cpu" +- match: + name: "^lm_head" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" - match: - name: "(^model.norm)|(^lm_head)" + name: "(^model.norm)" replace: class: "default" kwargs: diff --git a/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml b/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml index 0cc2edf..38e9e73 100644 --- a/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml +++ b/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml @@ -15,6 +15,16 @@ prefill_device: "cuda" generate_op: "KLinearMarlin" prefill_op: "KLinearTorch" +- match: + name: "^lm_head" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda" + prefill_device: "cuda" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" - match: name: "^model\\.layers\\..*\\.mlp$" class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock diff --git a/ktransformers/server/api/ollama/completions.py b/ktransformers/server/api/ollama/completions.py index e3a1a51..3c37c54 100644 --- a/ktransformers/server/api/ollama/completions.py +++ b/ktransformers/server/api/ollama/completions.py @@ -12,8 +12,10 @@ from ktransformers.server.config.config import Config from ktransformers.server.utils.create_interface import get_interface from ktransformers.server.schemas.assistants.streaming import check_link_response from ktransformers.server.backend.base import BackendInterfaceBase -router = APIRouter(prefix='/api') +from ktransformers.server.schemas.endpoints.chat import RawUsage + +router = APIRouter(prefix='/api') # https://github.com/ollama/ollama/blob/main/docs/api.md#generate-a-completion class OllamaGenerateCompletionRequest(BaseModel): @@ -40,61 +42,129 @@ class OllamaGenerateCompletionRequest(BaseModel): keep_alive: Optional[str] = Field( "5m", description="Controls how long the model will stay loaded into memory following the request.") - class OllamaGenerationStreamResponse(BaseModel): model: str created_at: str response: str done: bool = Field(...) 
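The ollama-compatible `/api/generate` route below streams one `OllamaGenerationStreamResponse` JSON object per line. A minimal client sketch follows; the host, port, and model name are assumptions, so substitute whatever your ktransformers server is actually configured with.

```python
# Minimal streaming client for the ollama-compatible endpoint added below.
import json
import requests

BASE_URL = "http://localhost:10002"  # assumed; depends on your server config

payload = {"model": "DeepSeek-V3", "prompt": "Hello!", "stream": True}  # model name is a placeholder
with requests.post(f"{BASE_URL}/api/generate", json=payload, stream=True) as r:
    r.raise_for_status()
    for line in r.iter_lines():
        if not line:
            continue
        chunk = json.loads(line)  # one OllamaGenerationStreamResponse per line
        print(chunk["response"], end="", flush=True)
        if chunk["done"]:
            break
```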
- class OllamaGenerationResponse(BaseModel): pass - @router.post("/generate", tags=['ollama']) async def generate(request: Request, input: OllamaGenerateCompletionRequest): id = str(uuid4()) - interface: BackendInterfaceBase = get_interface() print(f'COMPLETION INPUT:----\n{input.prompt}\n----') - config = Config() if input.stream: async def inner(): - async for token in interface.inference(input.prompt,id): - d = OllamaGenerationStreamResponse(model=config.model_name,created_at=str(datetime.now()),response=token,done=False) - yield d.model_dump_json()+'\n' - # d = {'model':config.model_name,'created_at':"", 'response':token,'done':False} - # yield f"{json.dumps(d)}\n" - # d = {'model':config.model_name,'created_at':"", 'response':'','done':True} - # yield f"{json.dumps(d)}\n" - d = OllamaGenerationStreamResponse(model=config.model_name,created_at=str(datetime.now()),response='',done=True) - yield d.model_dump_json()+'\n' - return check_link_response(request,inner()) + async for res in interface.inference(input.prompt, id): + if isinstance(res, RawUsage): + raw_usage = res + else: + token, finish_reason = res + d = OllamaGenerationStreamResponse( + model=config.model_name, + created_at=str(datetime.now()), + response=token, + done=False + ) + yield d.model_dump_json() + '\n' + d = OllamaGenerationStreamResponse( + model=config.model_name, + created_at=str(datetime.now()), + response='', + done=True + ) + yield d.model_dump_json() + '\n' + return check_link_response(request, inner()) else: raise NotImplementedError # https://github.com/ollama/ollama/blob/main/docs/api.md#generate-a-chat-completion - +class OllamaChatCompletionMessage(BaseModel): + role: str + content: str class OllamaChatCompletionRequest(BaseModel): - pass - + model: str = Field(..., description="The model name, which is required.") + messages: List[OllamaChatCompletionMessage] = Field( + ..., description="A list of messages to generate a response for.") + stream: bool = Field(True, description="If true, the response will be streamed.") class OllamaChatCompletionStreamResponse(BaseModel): - pass + model: str + created_at: str + message: dict + done: bool = Field(...) 
+    total_duration: Optional[int] = Field(None, description="Total time spent in nanoseconds")
+    load_duration: Optional[int] = Field(None, description="Time spent loading model in nanoseconds")
+    prompt_eval_count: Optional[int] = Field(None, description="Number of tokens in prompt")
+    prompt_eval_duration: Optional[int] = Field(None, description="Time spent evaluating prompt in nanoseconds")
+    eval_count: Optional[int] = Field(None, description="Number of tokens generated")
+    eval_duration: Optional[int] = Field(None, description="Time spent generating response in nanoseconds")
+
class OllamaChatCompletionResponse(BaseModel):
    pass
-
@router.post("/chat", tags=['ollama'])
async def chat(request: Request, input: OllamaChatCompletionRequest):
-    raise NotImplementedError
+    id = str(uuid4())
+    interface: BackendInterfaceBase = get_interface()
+    config = Config()
+    # convert the messages into a single prompt string
+    prompt = ""
+    for msg in input.messages:
+        prompt += f"{msg.role}: {msg.content}\n"
+    prompt += "assistant:"
+
+    if input.stream:
+        async def inner():
+            start_time = time()  # record the start time (seconds)
+            eval_count = 0  # count of generated tokens
+            tokens = []
+
+            async for res in interface.inference(prompt, id):
+                if isinstance(res, RawUsage):
+                    raw_usage = res
+                else:
+                    token, finish_reason = res
+                    d = OllamaChatCompletionStreamResponse(
+                        model=config.model_name,
+                        created_at=str(datetime.now()),
+                        message={"role": "assistant", "content": token},
+                        done=False
+                    )
+                    yield d.model_dump_json() + '\n'
+            # compute performance statistics
+            end_time = time()
+            total_duration = int((end_time - start_time) * 1_000_000_000)  # convert to nanoseconds
+            prompt_eval_count = len(prompt.split())  # rough estimate of the prompt token count
+            eval_duration = total_duration  # assume all time was spent on generation (simplification)
+            prompt_eval_duration = 0  # assume no separate prompt evaluation time
+            load_duration = 0  # assume the load time is unknown
+
+            d = OllamaChatCompletionStreamResponse(
+                model=config.model_name,
+                created_at=str(datetime.now()),
+                message={},
+                done=True,
+                total_duration=total_duration,
+                load_duration=load_duration,
+                prompt_eval_count=prompt_eval_count,
+                prompt_eval_duration=prompt_eval_duration,
+                eval_count=eval_count,
+                eval_duration=eval_duration
+            )
+            yield d.model_dump_json() + '\n'
+        return check_link_response(request, inner())
+    else:
+        raise NotImplementedError("Non-streaming chat is not implemented.")

# https://github.com/ollama/ollama/blob/main/docs/api.md#list-local-models
class OllamaModel(BaseModel):
@@ -103,9 +173,8 @@ class OllamaModel(BaseModel):
    size: int

    # TODO: fill the rest correctly
-
# mock ollama
-@router.get("/tags",tags=['ollama'])
+@router.get("/tags", tags=['ollama'])
async def tags():
    config = Config()
    # TODO: fill this correctly, although it does not effect Tabby
@@ -138,25 +207,21 @@ class OllamaShowResponse(BaseModel):
    class Config:
        protected_namespaces = ()

-
-
@router.post("/show", tags=['ollama'])
async def show(request: Request, input: OllamaShowRequest):
    config = Config()
    # TODO: Add more info in config to return, although it does not effect Tabby
    return OllamaShowResponse(
-        modelfile = "# Modelfile generated by ...",
-        parameters = " ",
-        template = " ",
-        details = OllamaShowDetial(
-            parent_model = " ",
-            format = "gguf",
-            family = " ",
-            families = [
-                " "
-            ],
-            parameter_size = " ",
-            quantization_level = " "
+        modelfile="# Modelfile generated by ...",
+        parameters=" ",
+        template=" ",
+        details=OllamaShowDetial(
+            parent_model=" ",
+            format="gguf",
+            family=" ",
+            families=[" "],
+            parameter_size=" ",
+            quantization_level=" "
        ),
-        model_info = OllamaModelInfo()
+        model_info=OllamaModelInfo()
    )
\ No newline at end of file
diff --git
a/ktransformers/server/api/openai/endpoints/chat.py b/ktransformers/server/api/openai/endpoints/chat.py index 4da3bc9..c9f7bfc 100644 --- a/ktransformers/server/api/openai/endpoints/chat.py +++ b/ktransformers/server/api/openai/endpoints/chat.py @@ -5,18 +5,21 @@ from fastapi import APIRouter from fastapi.requests import Request from ktransformers.server.utils.create_interface import get_interface from ktransformers.server.schemas.assistants.streaming import chat_stream_response -from ktransformers.server.schemas.endpoints.chat import ChatCompletionCreate,ChatCompletionChunk,ChatCompletionObject +from ktransformers.server.schemas.endpoints.chat import ChatCompletionCreate +from ktransformers.server.schemas.endpoints.chat import RawUsage from ktransformers.server.backend.base import BackendInterfaceBase +from ktransformers.server.config.config import Config + +from ktransformers.server.schemas.endpoints.chat import ChatCompletionChunk +from openai.types.chat import ChatCompletion +from openai.types.completion_usage import CompletionUsage + router = APIRouter() -models = [ - {"id": "0", "name": "ktranformers-model"}, -] - @router.get('/models', tags=['openai']) async def list_models(): - return models + return {"data": [{"id": Config().model_name, "name": Config().model_name}], "object": "list"} @router.post('/chat/completions', tags=['openai']) @@ -28,15 +31,80 @@ async def chat_completion(request:Request,create:ChatCompletionCreate): input_message = [json.loads(m.model_dump_json()) for m in create.messages] + if Config().api_key != '': + assert request.headers.get('Authorization', '').split()[-1] == Config().api_key + if create.stream: + from openai.types.chat.chat_completion_chunk import Choice, ChoiceDelta + async def inner(): - chunk = ChatCompletionChunk(id=id,object='chat.completion.chunk',created=int(time())) - async for token in interface.inference(input_message,id): - chunk.set_token(token) - yield chunk - return chat_stream_response(request,inner()) + chunk = ChatCompletionChunk( + id = id, + choices = [], + object = 'chat.completion.chunk', + created = int(time()), + model = Config().model_name, + ) + + async for res in interface.inference(input_message,id, create.temperature, create.top_p): + if isinstance(res, RawUsage): + # at the end of inference, interface.inference() will return the usage of inference + raw_usage = res + chunk.choices = [] + chunk.usage = CompletionUsage( + prompt_tokens = raw_usage.prefill_count, + completion_tokens = raw_usage.decode_count, + total_tokens = raw_usage.prefill_count + raw_usage.decode_count + ) + + yield chunk + + else: + token, finish_reason = res + choice = Choice( + index = 0, + delta = ChoiceDelta(content=token, role=None, tool_calls=None), + finish_reason = finish_reason, + logprobs = None, + ) + chunk.choices = [choice] + yield chunk + + return chat_stream_response(request, inner()) else: - comp = ChatCompletionObject(id=id,object='chat.completion.chunk',created=int(time())) - async for token in interface.inference(input_message,id): - comp.append_token(token) - return comp + from openai.types.chat.chat_completion import Choice + from openai.types.chat.chat_completion_message import ChatCompletionMessage + + content = "" + finish_reason = None + async for res in interface.inference(input_message,id,create.temperature,create.top_p): + if isinstance(res, RawUsage): + raw_usage = res + usage = CompletionUsage( + prompt_tokens = raw_usage.prefill_count, + completion_tokens = raw_usage.decode_count, + total_tokens = raw_usage.prefill_count 
+ raw_usage.decode_count + ) + else: + token, finish_reason = res + content = content + token + finish_reason = finish_reason + + choice = Choice( + index = 0, + finish_reason = finish_reason, + message = ChatCompletionMessage( + content=content, + role="assistant" + )) + + chat_completion = ChatCompletion( + id = id, + choices = [choice], + created = int(time()), + model = Config().model_name, + object = 'chat.completion', + usage = usage + ) + + return chat_completion diff --git a/ktransformers/server/api/openai/legacy/completions.py b/ktransformers/server/api/openai/legacy/completions.py index be85a29..7ce2d2a 100644 --- a/ktransformers/server/api/openai/legacy/completions.py +++ b/ktransformers/server/api/openai/legacy/completions.py @@ -6,6 +6,7 @@ from fastapi.requests import Request from ktransformers.server.utils.create_interface import get_interface from ktransformers.server.schemas.assistants.streaming import stream_response from ktransformers.server.schemas.legacy.completions import CompletionCreate,CompletionObject +from ktransformers.server.schemas.endpoints.chat import RawUsage router = APIRouter() @@ -17,17 +18,24 @@ async def create_completion(request:Request,create:CompletionCreate): print(f'COMPLETION INPUT:----\n{create.prompt}\n----') - if create.stream: async def inner(): - async for token in interface.inference(create.prompt,id): - d = {'choices':[{'delta':{'content':token}}]} - yield f"data:{json.dumps(d)}\n\n" + async for res in interface.inference(create.prompt,id,create.temperature,create.top_p): + if isinstance(res, RawUsage): + raw_usage = res + else: + token, finish_reason = res + d = {'choices':[{'delta':{'content':token}}]} + yield f"data:{json.dumps(d)}\n\n" d = {'choices':[{'delta':{'content':''},'finish_reason':''}]} yield f"data:{json.dumps(d)}\n\n" return stream_response(request,inner()) else: comp = CompletionObject(id=id,object='text_completion',created=int(time())) - async for token in interface.inference(create.prompt,id): - comp.append_token(token) + async for res in interface.inference(create.prompt,id,create.temperature,create.top_p): + if isinstance(res, RawUsage): + raw_usage = res + else: + token, finish_reason = res + comp.append_token(token) return comp diff --git a/ktransformers/server/args.py b/ktransformers/server/args.py index 44fe7d2..1f9af76 100644 --- a/ktransformers/server/args.py +++ b/ktransformers/server/args.py @@ -10,6 +10,7 @@ class ArgumentParser: parser = argparse.ArgumentParser(prog="kvcache.ai", description="Ktransformers") parser.add_argument("--host", type=str, default=self.cfg.server_ip) parser.add_argument("--port", type=int, default=self.cfg.server_port) + parser.add_argument("--api_key", type=str, default=self.cfg.api_key) parser.add_argument("--ssl_keyfile", type=str) parser.add_argument("--ssl_certfile", type=str) parser.add_argument("--web", type=bool, default=self.cfg.mount_web) @@ -23,13 +24,13 @@ class ArgumentParser: parser.add_argument("--optimize_config_path", default=self.cfg.optimize_config_path, type=str, required=False) parser.add_argument("--cpu_infer", type=int, default=self.cfg.cpu_infer) parser.add_argument("--type", type=str, default=self.cfg.backend_type) + parser.add_argument("--chunk_prefill_size", type=int, default=8192) # model configs # parser.add_argument("--model_cache_lens", type=int, default=self.cfg.cache_lens) # int? 
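The new `--api_key` option pairs with the bearer-token check added to the OpenAI-compatible chat endpoint above. A hedged usage sketch with the `openai` client package follows; the base URL, the `/v1` mount point, and the model name are assumptions, so adjust them to your deployment.

```python
# Sketch of a client exercising the new --api_key check -- base URL and model
# name are assumed placeholders, not values defined by this PR.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:10002/v1",  # assumed host/port/prefix
    api_key="my-secret-key",               # must match the server's --api_key
)

resp = client.chat.completions.create(
    model="DeepSeek-V3",                   # placeholder model name
    messages=[{"role": "user", "content": "Hi"}],
    stream=False,
)
print(resp.choices[0].message.content)
```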
parser.add_argument("--paged", type=bool, default=self.cfg.paged) parser.add_argument("--total_context", type=int, default=self.cfg.total_context) parser.add_argument("--max_batch_size", type=int, default=self.cfg.max_batch_size) - parser.add_argument("--max_chunk_size", type=int, default=self.cfg.max_chunk_size) parser.add_argument("--max_new_tokens", type=int, default=self.cfg.max_new_tokens) parser.add_argument("--json_mode", type=bool, default=self.cfg.json_mode) parser.add_argument("--healing", type=bool, default=self.cfg.healing) @@ -90,7 +91,8 @@ class ArgumentParser: # user config parser.add_argument("--user_secret_key", type=str, default=self.cfg.user_secret_key) parser.add_argument("--user_algorithm", type=str, default=self.cfg.user_algorithm) - parser.add_argument("--force_think", type=bool, default=self.cfg.user_force_think) + parser.add_argument("--force_think", action=argparse.BooleanOptionalAction, type=bool, default=self.cfg.user_force_think) + parser.add_argument("--use_cuda_graph", action=argparse.BooleanOptionalAction, type=bool, default=self.cfg.use_cuda_graph) # web config parser.add_argument("--web_cross_domain", type=bool, default=self.cfg.web_cross_domain) diff --git a/ktransformers/server/backend/args.py b/ktransformers/server/backend/args.py index 0b473e7..0f025d4 100644 --- a/ktransformers/server/backend/args.py +++ b/ktransformers/server/backend/args.py @@ -23,7 +23,7 @@ class ConfigArgs(BaseModel): max_batch_size: int = Field( None, description="Max number of batches to run at once, assuming the sequences will fit within total_context" ) - max_chunk_size: int = Field( + chunk_prefill_size: int = Field( None, description=( "Max chunk size. Determines the size of prefill operations. Can be reduced to reduce pauses whenever a new" diff --git a/ktransformers/server/backend/base.py b/ktransformers/server/backend/base.py index 4cbcdfa..aa011bf 100644 --- a/ktransformers/server/backend/base.py +++ b/ktransformers/server/backend/base.py @@ -15,6 +15,7 @@ from ktransformers.server.schemas.assistants.assistants import AssistantObject from ktransformers.server.schemas.assistants.messages import MessageCreate, MessageObject, Role from ktransformers.server.schemas.assistants.runs import RunObject from ktransformers.server.schemas.assistants.threads import ThreadObject +from ktransformers.server.schemas.endpoints.chat import RawUsage from ktransformers.server.schemas.base import ObjectID, Order from ktransformers.server.utils.multi_timer import Profiler @@ -142,12 +143,16 @@ class ThreadContext: yield reply_message.stream_response_with_event(MessageObject.Status.in_progress) yield self.run.stream_response_with_event(RunObject.Status.in_progress) - async for token in self.interface.inference(local_messages,self.thread.id): - if self.run.status == RunObject.Status.cancelling: - logger.warn(f'Run {self.run.id} cancelling') - break - yield reply_message.append_message_delta(token) - response_str_count+=1 + async for res in self.interface.inference(local_messages,self.thread.id): + if isinstance(res, RawUsage): + raw_usage = res + else: + token, finish_reason = res + if self.run.status == RunObject.Status.cancelling: + logger.warn(f'Run {self.run.id} cancelling') + break + yield reply_message.append_message_delta(token) + response_str_count+=1 if self.run.status == RunObject.Status.cancelling: yield self.run.stream_response_with_event(RunObject.Status.cancelled) diff --git a/ktransformers/server/backend/interfaces/ktransformers.py 
b/ktransformers/server/backend/interfaces/ktransformers.py index d228b64..1752a3c 100644 --- a/ktransformers/server/backend/interfaces/ktransformers.py +++ b/ktransformers/server/backend/interfaces/ktransformers.py @@ -1,4 +1,5 @@ import torch +import asyncio from transformers import AutoTokenizer, AutoConfig, GenerationConfig from ktransformers.server.backend.interfaces.transformers import ( TransformersInterface, @@ -13,7 +14,11 @@ from ktransformers.models.custom_cache import StaticCache from ktransformers.util.cuda_graph_runner import CUDAGraphRunner from ktransformers.local_chat import custom_models, default_optimize_rules from ktransformers.util.utils import get_device +from typing import Optional +from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled, MLAWrapperSingleton +from ktransformers.server.schemas.endpoints.chat import RawUsage +warm_uped = False class KTransformersThreadContext(TransformersThreadContext): pass @@ -22,19 +27,29 @@ class KTransformersThreadContext(TransformersThreadContext): class KTransformersInterface(TransformersInterface): def __init__(self, args: ConfigArgs = default_args): self.args = args - torch.set_default_dtype(torch.bfloat16) torch.set_grad_enabled(False) self.tokenizer = AutoTokenizer.from_pretrained(args.model_dir, device=args.device, trust_remote_code=args.trust_remote_code) config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=args.trust_remote_code) + try: + generation_config = GenerationConfig.from_pretrained(args.model_dir) + except: + generation_config = GenerationConfig( + max_length=args.max_new_tokens, + temperature=args.temperature, + top_p=args.top_p, + do_sample=True + ) + + torch.set_default_dtype(config.torch_dtype) if config.architectures[0] == "Qwen2MoeForCausalLM": config._attn_implementation = "flash_attention_2" with torch.device("meta"): self.model = custom_models[config.architectures[0]](config) if default_args.optimize_config_path is None: - optimize_rule_path = default_optimize_rules[config.architectures[0]] + optimize_config_path = default_optimize_rules[config.architectures[0]] else: - optimize_rule_path = args.optimize_config_path + optimize_config_path = args.optimize_config_path # print(optimize_config) @@ -44,8 +59,8 @@ class KTransformersInterface(TransformersInterface): "please input the path of your gguf file(gguf file in the dir containing input gguf file must all" " belong to current model):" ) - optimize_and_load_gguf(self.model, optimize_rule_path, gguf_path, config) - + optimize_and_load_gguf(self.model, optimize_config_path, gguf_path, config) + self.model.generation_config = generation_config self.device_map = self.model.gguf_loader.tensor_device_map # logger.info(f"{args.model_name} loaded from {args.model_dir} to {self.device_map}") self.cache = StaticCache( @@ -56,25 +71,21 @@ class KTransformersInterface(TransformersInterface): dtype=self.model.dtype, ) # logger.info(f"StaticCache (length={args.cache_lens}), batch size:{args.batch_size}") - try: - self.model.generation_config = GenerationConfig.from_pretrained(args.model_dir) - except: - gen_config = GenerationConfig( - max_length=128, - temperature=0.7, - top_p=0.9, - do_sample=True - ) - self.model.generation_config = gen_config + if self.model.generation_config.pad_token_id is None: self.model.generation_config.pad_token_id = self.model.generation_config.eos_token_id self.streamer = TextStreamer(self.tokenizer) + self._infer_lock = asyncio.Lock() + def decode_one_tokens(self): + global warm_uped + device_map = 
self.model.gguf_loader.tensor_device_map torch_device = get_device("blk.0.self_attn", device_map) torch_device = "cuda:0" if torch_device == "cuda" else torch_device - if self.args.use_cuda_graph: + torch.cuda.set_device(torch_device) + if warm_uped and self.args.use_cuda_graph: if not hasattr(self, "cuda_graph_runner"): self.cuda_graph_runner = CUDAGraphRunner() self.cuda_graph_runner.capture( @@ -96,14 +107,15 @@ class KTransformersInterface(TransformersInterface): torch.cuda.synchronize() logits = logits[0, -1, :] return self.logits_to_token(logits) - + + if self.args.use_cuda_graph: + warm_uped = True + if self.use_static_cache: - mask = torch.ones((1, self.seq_length)).to(torch_device) logits = self.model( self.current_ids.to(torch_device), cache_position=self.active_cache_position, past_key_values=self.cache, - attention_mask=mask, return_dict=False, use_cache=True, )[0] @@ -116,59 +128,116 @@ class KTransformersInterface(TransformersInterface): @torch.no_grad - def prefill(self, input_ids: torch.Tensor, is_new: bool): + def prefill(self, input_ids: torch.Tensor, is_new: bool, temperature: Optional[float], top_p: Optional[float]): input_ids_length = input_ids.shape[-1] - self.profiler.set_counter("prefill", input_ids_length) + if(input_ids_length >= self.args.cache_lens): + logger.warning(f"input_ids_length {input_ids_length} > cache_lens {self.args.cache_lens}") + self.seq_length = input_ids_length + return logger.debug(f"input_ids: {input_ids.shape}") - device = self.device_map.get("blk.0.self_attn", {}).get("generate_device", "cuda:0") + device = "cuda:0" if device == "cuda" else device if is_new: - self.cache.reset() self.ever_generated_ids.clear() - former_seq_length = 0 - self.seq_length = input_ids_length - self.generated_ids = torch.zeros( - self.args.batch_size, - self.seq_length + self.args.max_new_tokens + 1, - dtype=torch.int, - device=self.args.device, - ) - else: - logger.debug(f"generate_ids: {self.generated_ids.shape}") - former_seq_length = self.seq_length - self.seq_length += input_ids_length - expected_length = self.seq_length + self.args.max_new_tokens + 1 - delta_length = expected_length - self.generated_ids.shape[-1] - if delta_length > 0: - new_generate_ids = torch.zeros( - self.args.batch_size, delta_length, dtype=torch.int, device=self.args.device + same_prefix = 0 + flat_input_ids = input_ids.flatten() + + if getattr(self, 'generated_ids', None) is None: + self.generated_ids = torch.zeros( + self.args.batch_size, + input_ids.shape[-1] + self.args.max_new_tokens + 1, + dtype=torch.int, + device=self.args.device, ) - self.generated_ids = torch.cat([self.generated_ids, new_generate_ids], dim=-1) + self.seq_length = 1 + + flat_prev_ids = self.generated_ids.flatten() + for i in range(min(self.seq_length, flat_input_ids.shape[0]) - 1): + if flat_input_ids[i] == flat_prev_ids[i]: + same_prefix += 1 + else: + break + + logger.debug(f"same prefix len: {same_prefix}") + self.cache.remove_suffix(same_prefix) + self.seq_length = same_prefix + self.generated_ids = self.generated_ids[..., :same_prefix] + input_ids = input_ids[..., same_prefix:] + input_ids_length = input_ids.shape[-1] + + self.ever_generated_ids.clear() + self.profiler.set_counter("prefill", input_ids_length) + logger.debug(f"input_ids: {input_ids.shape}") + logger.debug(f"generate_ids: {self.generated_ids.shape}") + + former_seq_length = self.seq_length + self.seq_length += input_ids_length + expected_length = min(self.seq_length + self.args.max_new_tokens + 1, self.args.cache_lens) + delta_length = 
expected_length - self.generated_ids.shape[-1] + if delta_length > 0: + new_generate_ids = torch.zeros( + self.args.batch_size, delta_length, dtype=torch.int, device=self.args.device + ) + self.generated_ids = torch.cat([self.generated_ids, new_generate_ids], dim=-1) + else: + logger.warning(f"seq_length bigger than cache_lens, killed") + exit(0) + logger.debug(f"cache position: {former_seq_length} to {self.seq_length}") cache_position = torch.arange(former_seq_length, self.seq_length, device=device) self.generated_ids[:, cache_position] = input_ids.to(self.args.device).to(torch.int) - mask = torch.ones((1, self.seq_length)).to(device) if not (type(self) is TransformersInterface): input_ids = input_ids.to("cpu") - inputs_embeds = self.model.model.embed_tokens(input_ids).to(device) - if self.use_static_cache: - logits = self.model( - inputs_embeds=inputs_embeds, - cache_position=cache_position, - past_key_values=self.cache, - return_dict=False, - use_cache=True, - attention_mask=mask, - )[0] - else: - logits = self.model(inputs_embeds=inputs_embeds, return_dict=False)[0] + + def chunk_prefill(input_ids, cache_position): + inputs_embeds = self.model.model.embed_tokens(input_ids).to(device) + torch.cuda.set_device(device) + if flashinfer_enabled: + MLAWrapperSingleton.need_plan_all() + if self.use_static_cache: + logits = self.model( + inputs_embeds=inputs_embeds, + cache_position=cache_position, + past_key_values=self.cache, + return_dict=False, + use_cache=True, + )[0] + else: + logits = self.model(inputs_embeds=inputs_embeds, return_dict=False)[0] + return logits + + chunk_start = 0 + while chunk_start < input_ids_length: + chunk_end = min(chunk_start + self.args.chunk_prefill_size, input_ids_length) + if self.cache != None: + self.cache.cur_idx=cache_position[chunk_start:chunk_end] + logits = chunk_prefill(input_ids[:, chunk_start:chunk_end], cache_position[chunk_start:chunk_end]) + chunk_start += self.args.chunk_prefill_size + + if flashinfer_enabled: + MLAWrapperSingleton.reset_buffer() + self.prepare_logits_wrapper(input_ids, device, temperature, top_p) next_token = self.logits_to_token(logits[0, -1, :]) yield self.append_new_tokens(next_token) @property def active_cache_position(self): device = self.device_map.get("blk.0.self_attn", {}).get("generate_device", "cuda:0") - return torch.tensor([self.seq_length - 1], device=device) \ No newline at end of file + return torch.tensor([self.seq_length - 1], device=device) + + async def inference(self, local_messages, thread_id: str, temperature: Optional[float] = None, top_p: Optional[float] = None): + async with self._infer_lock: + async for v in super().inference(local_messages, thread_id, temperature, top_p): + yield v + + # return this inference raw usage + yield RawUsage( + tokenize_time = self.profiler.get_timer_sec('tokenize'), + prefill_time = self.profiler.get_timer_sec('prefill'), + decode_time = self.profiler.get_timer_sec('decode'), + prefill_count = self.profiler.get_counter('prefill'), + decode_count = self.profiler.get_counter('decode'), + ) \ No newline at end of file diff --git a/ktransformers/server/backend/interfaces/transformers.py b/ktransformers/server/backend/interfaces/transformers.py index f18581a..c7ac80f 100644 --- a/ktransformers/server/backend/interfaces/transformers.py +++ b/ktransformers/server/backend/interfaces/transformers.py @@ -13,12 +13,13 @@ from transformers import ( from ktransformers.server.config.config import Config from ktransformers.server.schemas.base import ObjectID from 
ktransformers.server.utils.multi_timer import Profiler +from torch.nn.attention import SDPBackend import torch import sys, os from ..base import ThreadContext, BackendInterfaceBase from ktransformers.server.config.log import logger from ..args import ConfigArgs, default_args - +from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled, MLAWrapperSingleton # This TextStreamer is a modified version from https://github.com/huggingface/transformers/blob/main/src/transformers/generation/streamers.py class TextStreamer: @@ -170,7 +171,7 @@ class TransformersInterface(BackendInterfaceBase): for m in messages[1:]: if m["role"] == "user" and new_messages[-1]["role"] == "user": logger.warning("merge two adjacent user messages") - new_messages[-1]["content"] += m["content"] + new_messages[-1]["content"] += '\n' + m["content"] else: new_messages.append(m) # if (self.last_request_id is not None) and self.last_request_id == thread_id: @@ -179,7 +180,11 @@ class TransformersInterface(BackendInterfaceBase): # input_ids = self.tokenizer.apply_chat_template( # new_messages, return_tensors="pt", add_generation_prompt=True # ).to(self.args.device) - input_ids = self.tokenizer.apply_chat_template(new_messages,return_tensors='pt',add_generation_prompt=True).to(self.args.device) + input_str: str = self.tokenizer.apply_chat_template(new_messages,tokenize=False,add_generation_prompt=True) + # drop token in chat template + if input_str.endswith('\n'): + input_str = input_str[:-len('\n')] + input_ids = self.tokenizer.encode(input_str, return_tensors="pt").to(self.args.device) if (self.last_request_id is not None) and self.last_request_id == thread_id: x = self.generated_ids[:,:self.seq_length] y = input_ids[:,:self.seq_length] @@ -198,14 +203,31 @@ class TransformersInterface(BackendInterfaceBase): self.seq_length += 1 return self.streamer.put(new_tokens) - def logits_to_token(self, logits: torch.Tensor): - logits = logits / self.args.temperature if self.args.temperature!=0 else logits + def prepare_logits_wrapper(self, inputs, device, temperature: Optional[float] = None, top_p: Optional[float] = None): + if temperature is None or temperature == 0: + temperature = self.model.generation_config.temperature + if top_p is None: + top_p = self.model.generation_config.top_p + generation_config, model_kwargs = self.model._prepare_generation_config( + None, max_length=self.args.max_new_tokens, + do_sample=True, + top_k=self.args.top_k, + top_p=top_p, + temperature=temperature, + repetition_penalty=self.args.repetition_penalty # change this to modify generate config + ) + self.inputs = inputs + try: # transformers==4.43 + self.logits_warper = ( + self.model._get_logits_warper(generation_config, device=device) + ) + except: + self.logits_warper = ( + self.model._get_logits_warper(generation_config) + ) - for token_idx in self.ever_generated_ids: - if logits[token_idx] < 0: - logits[token_idx] *= self.args.repetition_penalty - else: - logits[token_idx] /= self.args.repetition_penalty + def logits_to_token(self, logits: torch.Tensor): + logits = self.logits_warper(self.inputs.view(1, -1), logits.view(1, -1)) probs = torch.nn.functional.softmax(logits, dim=-1) @@ -221,12 +243,10 @@ class TransformersInterface(BackendInterfaceBase): def decode_one_tokens(self): if self.use_static_cache: - mask = torch.ones((1, self.seq_length)).to(self.args.device) logits = self.model( self.current_ids, cache_position=self.active_cache_position, past_key_values=self.cache, - attention_mask=mask, return_dict=False, use_cache=True, 
)[0] @@ -237,38 +257,57 @@ class TransformersInterface(BackendInterfaceBase): return self.logits_to_token(logits) @torch.no_grad - def prefill(self, input_ids: torch.Tensor, is_new: bool): + def prefill(self, input_ids: torch.Tensor, is_new: bool, temperature: Optional[float] = None, top_p: Optional[float] = None): input_ids_length = input_ids.shape[-1] - self.profiler.set_counter("prefill", input_ids_length) logger.debug(f"input_ids: {input_ids.shape}") if is_new: - self.cache.reset() self.ever_generated_ids.clear() - former_seq_length = 0 - self.seq_length = input_ids_length - self.generated_ids = torch.zeros( - self.args.batch_size, - self.seq_length + self.args.max_new_tokens + 1, - dtype=torch.int, - device=self.args.device, - ) - else: - logger.debug(f"generate_ids: {self.generated_ids.shape}") - former_seq_length = self.seq_length - self.seq_length += input_ids_length - expected_length = self.seq_length + self.args.max_new_tokens + 1 - delta_length = expected_length - self.generated_ids.shape[-1] - if delta_length > 0: - new_generate_ids = torch.zeros( - self.args.batch_size, delta_length, dtype=torch.int, device=self.args.device + same_prefix = 0 + flat_input_ids = input_ids.flatten() + + if getattr(self, 'generated_ids', None) is None: + self.generated_ids = torch.zeros( + self.args.batch_size, + input_ids.shape[-1] + self.args.max_new_tokens + 1, + dtype=torch.int, + device=self.args.device, ) - self.generated_ids = torch.cat([self.generated_ids, new_generate_ids], dim=-1) + self.seq_length = 1 + + flat_prev_ids = self.generated_ids.flatten() + for i in range(min(self.seq_length, flat_input_ids.shape[0]) - 1): + if flat_input_ids[i] == flat_prev_ids[i]: + same_prefix += 1 + else: + break + + logger.debug(f"same prefix len: {same_prefix}") + self.cache.remove_suffix(same_prefix) + self.seq_length = same_prefix + self.generated_ids = self.generated_ids[..., :same_prefix] + input_ids = input_ids[..., same_prefix:] + input_ids_length = input_ids.shape[-1] + + self.ever_generated_ids.clear() + self.profiler.set_counter("prefill", input_ids_length) + logger.debug(f"input_ids: {input_ids.shape}") + + logger.debug(f"generate_ids: {self.generated_ids.shape}") + former_seq_length = self.seq_length + self.seq_length += input_ids_length + expected_length = self.seq_length + self.args.max_new_tokens + 1 + delta_length = expected_length - self.generated_ids.shape[-1] + if delta_length > 0: + new_generate_ids = torch.zeros( + self.args.batch_size, delta_length, dtype=torch.int, device=self.args.device + ) + self.generated_ids = torch.cat([self.generated_ids, new_generate_ids], dim=-1) + logger.debug(f"cache position: {former_seq_length} to {self.seq_length}") cache_position = torch.arange(former_seq_length, self.seq_length, device=self.args.device) self.generated_ids[:, cache_position] = input_ids.to(self.args.device).to(torch.int) - mask = torch.ones((1, self.seq_length)).to(self.args.device) device = input_ids.device if not (type(self) is TransformersInterface): input_ids = input_ids.to("cpu") @@ -280,26 +319,46 @@ class TransformersInterface(BackendInterfaceBase): past_key_values=self.cache, return_dict=False, use_cache=True, - attention_mask=mask, )[0] else: logits = self.model(inputs_embeds=inputs_embeds, return_dict=False)[0] + self.prepare_logits_wrapper(input_ids, device, temperature, top_p) next_token = self.logits_to_token(logits[0, -1, :]) yield self.append_new_tokens(next_token) @torch.no_grad def generate(self): + self.max_new_tokens = min(self.args.max_new_tokens, 
self.args.cache_lens - self.seq_length) - 1 + logger.info(f"args.max_new_tokens: {self.args.max_new_tokens}, cache_lens: {self.args.cache_lens}, seq_length: {self.seq_length}") + if(self.max_new_tokens <= 0): + logger.warning("max_new_tokens is less than 0") + yield self.streamer.end(), "length" + return + logger.info(f"max_new_tokens: {self.max_new_tokens}") self.profiler.set_counter("decode", 0) - for _ in range(1, self.args.max_new_tokens): - with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_mem_efficient=False, enable_math=True): + + for i in range(1, self.max_new_tokens): + with torch.nn.attention.sdpa_kernel(backends=[SDPBackend.FLASH_ATTENTION, SDPBackend.MATH, SDPBackend.EFFICIENT_ATTENTION]): + if flashinfer_enabled: + MLAWrapperSingleton.plan_all(None,None,None,self.active_cache_position.to(torch.int32)+1, + num_heads=self.model.config.num_attention_heads, head_dim_ckv=self.model.config.kv_lora_rank, + head_dim_kpe=self.model.config.qk_rope_head_dim, page_size=self.cache.page_size, + sm_scale=self.model.model.layers[0].self_attn.softmax_scale, q_data_type=torch.bfloat16, kv_data_type=torch.bfloat16) next_token = self.decode_one_tokens() self.profiler.inc("decode") - if next_token == self.tokenizer.eos_token_id: + if next_token == self.tokenizer.eos_token_id or "<|im_end|>" == self.tokenizer.decode(next_token): + yield self.streamer.end(), None + yield "", "stop" assert self.args.batch_size == 1 break - yield self.append_new_tokens(next_token) - yield self.streamer.end() + yield self.append_new_tokens(next_token), None + + else: # for-else: reached max_new_tokens without hitting a stop token + yield self.streamer.end(), None + yield "", "length" + + def check_is_new(self, thread_id: str): if not self.use_static_cache: @@ -314,7 +373,8 @@ class TransformersInterface(BackendInterfaceBase): self.last_request_id = thread_id return True - async def inference(self, local_messages, thread_id: str): + async def inference(self, local_messages, thread_id: str, temperature: Optional[float] = None, top_p: Optional[float] = None): + self.streamer.reset() self.profiler.create_and_start_timer("tokenize") if isinstance(local_messages, List): input_ids = self.format_and_tokenize_input_ids(thread_id, local_messages) @@ -324,8 +384,9 @@ class TransformersInterface(BackendInterfaceBase): #input_ids = torch.tensor([[6366]], device=input_ids.device) else: raise ValueError("local_messages should be List or str") + if Config().user_force_think: - token_thinks = torch.tensor([self.tokenizer.encode("<think>\\n",add_special_tokens=False)],device=input_ids.device) + token_thinks = torch.tensor([self.tokenizer.encode("<think>\n",add_special_tokens=False)],device=input_ids.device) input_ids = torch.cat( [input_ids, token_thinks], dim=1 ) @@ -333,21 +394,24 @@ class TransformersInterface(BackendInterfaceBase): self.profiler.pause_timer("tokenize") self.profiler.create_and_start_timer("prefill") + if Config().user_force_think: - t = "<think>\n" - print(t,end="",flush=True) - yield t - for t in self.prefill(input_ids, self.check_is_new(thread_id)): + think = '<think>\n' + print(think, end="",flush=True) + yield think, None + + for t in self.prefill(input_ids, self.check_is_new(thread_id), temperature, top_p): + # output the <think> token after prefill is done if t is not None: print(t, end="",flush=True) - yield t + yield t, None self.profiler.pause_timer("prefill") self.profiler.create_and_start_timer("decode") - for t in self.generate(): + for t, finish_reason in self.generate(): if t is not None: print(t, end="",flush=True) - yield t + yield t,
finish_reason print("") self.profiler.pause_timer("decode") self.report_last_time_performance() diff --git a/ktransformers/server/config/config.py b/ktransformers/server/config/config.py index 7dc9921..332e398 100644 --- a/ktransformers/server/config/config.py +++ b/ktransformers/server/config/config.py @@ -69,6 +69,7 @@ class Config(metaclass=Singleton): self.server: dict = cfg.get("server", {}) self.server_ip = self.server.get("ip", "0.0.0.0") self.server_port = self.server.get("port", 9016) + self.api_key = self.server.get("api_key", "") # db configs self.db_configs: dict = cfg.get("db", {}) @@ -104,7 +105,8 @@ class Config(metaclass=Singleton): self.total_context = self.model.get("total_context", 2**18) self.max_batch_size = self.model.get("max_batch_size", 20 if self.paged else 1) - self.max_chunk_size = self.model.get("max_chunk_size", 2048) + self.chunk_prefill_size = self.model.get("chunk_prefill_size", 8192) + self.max_new_tokens = self.model.get("max_new_tokens", 2000) self.json_mode = self.model.get("json_mode", False) self.healing = self.model.get("healing", False) diff --git a/ktransformers/server/main.py b/ktransformers/server/main.py index 5e01a48..f536f9c 100644 --- a/ktransformers/server/main.py +++ b/ktransformers/server/main.py @@ -105,6 +105,7 @@ def custom_openapi(app): def main(): cfg = Config() + arg_parser = ArgumentParser(cfg) # 初始化消息 diff --git a/ktransformers/server/requirements.txt b/ktransformers/server/requirements.txt index d324cf2..9a4c9c5 100644 --- a/ktransformers/server/requirements.txt +++ b/ktransformers/server/requirements.txt @@ -5,6 +5,7 @@ langchain >= 0.2.0 blessed >= 1.20.0 accelerate >= 0.31.0 sentencepiece >= 0.1.97 +openai setuptools build ninja diff --git a/ktransformers/server/schemas/assistants/streaming.py b/ktransformers/server/schemas/assistants/streaming.py index c5d8a04..0c3b1a7 100644 --- a/ktransformers/server/schemas/assistants/streaming.py +++ b/ktransformers/server/schemas/assistants/streaming.py @@ -73,7 +73,7 @@ class RunStepDelta(Object): class Done(): def to_stream_reply(self): - return f"event: done\ndata: [DONE]\n\n" + return f"data: [DONE]\n\n" async def check_client_link(request: Request, async_events: AsyncIterable): diff --git a/ktransformers/server/schemas/endpoints/chat.py b/ktransformers/server/schemas/endpoints/chat.py index 5c4dc4e..eb0081a 100644 --- a/ktransformers/server/schemas/endpoints/chat.py +++ b/ktransformers/server/schemas/endpoints/chat.py @@ -1,10 +1,15 @@ from typing import List, Optional +from typing_extensions import Literal from enum import Enum from pydantic import BaseModel from ktransformers.server.schemas.base import Object +from openai.types.completion_usage import CompletionUsage +from openai.types.chat.chat_completion_chunk import Choice + + class Role(Enum): system = 'system' user = 'user' @@ -25,54 +30,31 @@ class ChatCompletionCreate(BaseModel): messages: List[Message] model : str stream : bool = False - + temperature: Optional[float] = None + top_p: Optional[float] = None + def get_tokenizer_messages(self): return [m.to_tokenizer_message() for m in self.messages] -class FinishReason(Enum): - stop = 'stop' - length = 'length' -class Choice(BaseModel): - index: int - message: Message - logprobs: Optional[str] = None - finish_reason: FinishReason = None +class ChatCompletionChunk(BaseModel): + id: str + choices: List[Choice] + created: int + model: str + object: Literal["chat.completion.chunk"] + service_tier: Optional[Literal["scale", "default"]] = None + system_fingerprint: Optional[str] = 
None + usage: Optional[CompletionUsage] = None -class DeltaChoice(BaseModel): - index: int - delta: Message - logprobs: Optional[str] = None - finish_reason: FinishReason = None - - -class Usage(BaseModel): - completion_tokens:int - prompt_tokens:int - total_tokens:int - - -class ChatCompletionBase(Object): - created:int - model:str = 'not implmented' - system_fingerprint:str = 'not implmented' - usage: Optional[Usage] = None - -class ChatCompletionObject(ChatCompletionBase): - choices:List[Choice] = [] - - def append_token(self,token:str): - if len(self.choices) == 0: - self.choices.append(Choice(index=0,message=Message(content='',role=Role.assistant))) - self.choices[0].message.content += token - -class ChatCompletionChunk(ChatCompletionBase): - choices:List[DeltaChoice] = [] - - def set_token(self,token:str): - self.choices = [ - DeltaChoice(index=0,delta=Message(content=token,role=Role.assistant)) - ] def to_stream_reply(self): - return f"data:{self.model_dump_json()}\n\n" + return f"data: {self.model_dump_json()}\n\n" + + +class RawUsage(BaseModel): + tokenize_time: float + prefill_time: float + decode_time: float + prefill_count: int + decode_count: int diff --git a/ktransformers/server/schemas/legacy/completions.py b/ktransformers/server/schemas/legacy/completions.py index 874e556..ea936ea 100644 --- a/ktransformers/server/schemas/legacy/completions.py +++ b/ktransformers/server/schemas/legacy/completions.py @@ -9,6 +9,8 @@ class CompletionCreate(BaseModel): model: str prompt: str | List[str] stream: bool = False + temperature: Optional[float] = None + top_p: Optional[float] = None def get_tokenizer_messages(self): if isinstance(self.prompt,List): diff --git a/ktransformers/tests/.gitignore b/ktransformers/tests/.gitignore new file mode 100644 index 0000000..68bcbc9 --- /dev/null +++ b/ktransformers/tests/.gitignore @@ -0,0 +1 @@ +results/ \ No newline at end of file diff --git a/ktransformers/tests/AIME_2024/eval_api.py b/ktransformers/tests/AIME_2024/eval_api.py new file mode 100644 index 0000000..c508cc4 --- /dev/null +++ b/ktransformers/tests/AIME_2024/eval_api.py @@ -0,0 +1,137 @@ +# adapt from https://github.com/abacaj/code-eval?tab=readme-ov-file +import argparse +import json +import os +import time +import requests +import tqdm + +from evaluation import filter_answer +from prompts import instruct_prompt +import pandas as pd +from datasets import load_dataset +os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com' + + +def generate_text(api_url,question , model_name, stream=False, auth_token=None): + headers = { + 'accept': 'application/json', + 'Content-Type': 'application/json', + # 添加 API Key + 'Authorization' : 'Bearer ' + auth_token if auth_token else '' + } + question = instruct_prompt(question) + data = { + "messages": [{"content": question, "role": "user"}], + "model": model_name, + "stream": stream, + "temperature": 0.6, + "max_tokens": 10240, + } + print(f"content: {question}") + response = requests.post(api_url, headers=headers, json=data,verify=False) + if response.status_code == 200: + result = response.json() + results = result.get('choices', [{}])[0].get('message', {}).get('content', '') + return filter_answer(results) + else: + print(f"API Request failed with status code {response.status_code}") + return None +def load_data(file_path): + """ + Load data from a Parquet file into a list. + Each record in the Parquet file should represent an individual record. 
+ """ + # Load the dataset via Hugging Face datasets + # dataset = load_dataset('parquet', data_files=file_path) + data = [] + ds = load_dataset(file_path) + df = pd.DataFrame(ds['train']) + for _, row in df.iterrows(): + data.append(row.to_dict()) + return data + +def get_score(pred, answer): + """ + Score the prediction against the reference answer. + :param pred: The predicted string. + :param answer: The reference answer string. + :return: 1 if they match (as strings or as numbers), 0 otherwise. + """ + if pred == answer: + return 1 + # if we need to compare a string with a number, convert the string to a number + try: + pred = float(pred) + answer = float(answer) + except: + pass + if pred == answer: + return 1 + return 0 + +def run_eval_api( + api_url: str, + model_name: str, + out_path: str, + format_tabs: bool = False, + auth_token: str = None, + problem_file: str = None, + append: bool = False, + skip: int = 0 +): + + data = load_data(problem_file) + pbar = tqdm.tqdm(total=len(data) * 1) + pbar.update(skip) + for i in range(len(data)): + i = i+skip + data_item = data[i] + question = data_item['Problem'] + # Start the timer for this evaluation + start_time = time.time() + try: + completion = generate_text(api_url, question, model_name, auth_token=auth_token) + if completion is None: + raise Exception(f"Failed to get prediction for {question}") + answer = data_item['Answer'] + score = get_score(completion, answer) + elapsed_time = time.time() - start_time + result = { + "index": i, + "question_id": data_item["ID"], + "answer": answer, + "prediction": completion, + "score": score, + "time": elapsed_time + } + with open(out_path, "a" if append else "w") as f: + f.write(json.dumps(result) + "\n") + + except Exception as e: + print(f"Failed to get prediction for {question}") + print(e) + continue + + pbar.update(1) + + +def main(output_path, api_url, model_name, auth_token, format_tabs,problem_file, append,skip): + os.makedirs(os.path.dirname(output_path), exist_ok=True) + run_eval_api(api_url, model_name, output_path, format_tabs, auth_token, problem_file,append,skip) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="API Generate Tester") + parser.add_argument("--api_url", type=str, default="https://api.siliconflow.cn/v1/chat/completions", help="API URL") + parser.add_argument("--model_name", type=str, default="Pro/deepseek-ai/DeepSeek-R1", help="Model Name") + parser.add_argument("--out_path", type=str, default="results/api/eval_aime.jsonl", help="Output Path") + parser.add_argument("--auth_token", type=str, default=None, help="Auth Token") + parser.add_argument("--format_tabs", action="store_true", help="Format Tabs") + parser.add_argument("--problem_file", type=str, default="Maxwell-Jia/AIME_2024", help="Evalset File") + parser.add_argument("--no_append", action="store_false", help="Append to existing file") + parser.add_argument("--skip", type=int, default=0, help="Skip some tasks") + args = parser.parse_args() + # api_url = "https://api.siliconflow.cn/v1/chat/completions" + main(args.out_path, args.api_url, args.model_name, args.auth_token, args.format_tabs, args.problem_file, args.no_append, args.skip) \ No newline at end of file diff --git a/ktransformers/tests/AIME_2024/evaluation.py b/ktransformers/tests/AIME_2024/evaluation.py new file mode 100644 index 0000000..d43a25a --- /dev/null +++ b/ktransformers/tests/AIME_2024/evaluation.py @@ -0,0 +1,10 @@ +# reference: https://github.com/declare-lab/instruct-eval/blob/main/human_eval/main.py#L35 +def 
filter_answer(completion: str) -> str: + # the answer is the last part of the completion, it's an int64 number + # get the last line + completion = completion.strip().split("\n")[-1] + # handle the $\\boxed{...}$ format + if "$\\boxed{" in completion: + return completion.split("}")[0].split("{")[-1] + return completion.split()[-1] + diff --git a/ktransformers/tests/AIME_2024/prompts.py b/ktransformers/tests/AIME_2024/prompts.py new file mode 100644 index 0000000..ef75841 --- /dev/null +++ b/ktransformers/tests/AIME_2024/prompts.py @@ -0,0 +1,2 @@ +def instruct_prompt(prompt: str) -> str: + return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nSolve the following math problem without any tests or explanation, only one answer surrounded by '$\\boxed{{}}$'\n{prompt}\n\n### Response:""" diff --git a/ktransformers/tests/humaneval/eval_api.py b/ktransformers/tests/humaneval/eval_api.py new file mode 100644 index 0000000..f312653 --- /dev/null +++ b/ktransformers/tests/humaneval/eval_api.py @@ -0,0 +1,99 @@ +# adapt from https://github.com/abacaj/code-eval?tab=readme-ov-file +import argparse +import os +import requests +from human_eval.data import write_jsonl, read_problems +import tqdm + +from evaluation import filter_code, fix_indents +from prompts import instruct_prompt + +def generate_text(api_url,question , model_name, stream=False, auth_token=None): + headers = { + 'accept': 'application/json', + 'Content-Type': 'application/json', + # Add the API key + 'Authorization' : 'Bearer ' + auth_token if auth_token else '' + } + question = instruct_prompt(question) + data = { + "messages": [{"content": question, "role": "user"}], + "model": model_name, + "stream": stream, + "temperature": 0.6 + } + print(f"content: {question}") + response = requests.post(api_url, headers=headers, json=data,verify=False) + if response.status_code == 200: + result = response.json() + results = result.get('choices', [{}])[0].get('message', {}).get('content', '') + return [filter_code(fix_indents(results))] + else: + print(f"API Request failed with status code {response.status_code}") + return None + +def run_eval_api( + api_url: str, + model_name: str, + out_path: str, + format_tabs: bool = False, + auth_token: str = None, + problem_file: str = None, + append: bool = False, + skip: int = 0 +): + if(problem_file is None): + problems = read_problems() + else: + problems = read_problems(problem_file) + samples = [] + pbar = tqdm.tqdm(total=len(problems) * 1) + pbar.update(skip) + try: + for task_id in problems: + # skip some tasks + if skip > 0: + skip -= 1 + continue + + if format_tabs: + prompt = problems[task_id]["prompt"].replace(" ", "\t") + else: + prompt = problems[task_id]["prompt"] + completion = generate_text(api_url, prompt, model_name, auth_token=auth_token) + # samples.append({"task_id": task_id, "completion": completion}) + for sample in completion: + result = dict( + task_id=task_id, + completion=sample, + ) + samples += [result] + if append: + write_jsonl(out_path, [result],append=append) + pbar.update(1) + if not append: + write_jsonl(out_path, samples,append=append) + except Exception as e: + if not append: + write_jsonl(out_path, samples,append=append) + print(f"Error: {e}") + +def main(output_path, api_url, model_name, auth_token, format_tabs,problem_file, append,skip): + os.makedirs(os.path.dirname(output_path), exist_ok=True) + run_eval_api(api_url, model_name, output_path, format_tabs, auth_token, problem_file,append,skip)
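+ +# Example invocation (a sketch, not part of the original test script; it assumes an OpenAI-compatible ktransformers server is already reachable at the default --api_url defined below): +#   python ktransformers/tests/humaneval/eval_api.py --api_url http://localhost:10002/v1/chat/completions --model_name Pro/deepseek-ai/DeepSeek-V3 --out_path results/api/eval_b.jsonl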
+ + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="API Generate Tester") + #parser.add_argument("--api_url", type=str, default="https://api.siliconflow.cn/v1/chat/completions", help="API URL") + parser.add_argument("--api_url", type=str, default="http://localhost:10002/v1/chat/completions", help="API URL") + parser.add_argument("--model_name", type=str, default="Pro/deepseek-ai/DeepSeek-V3", help="Model Name") + parser.add_argument("--out_path", type=str, default="results/api/eval_b.jsonl", help="Output Path") + parser.add_argument("--auth_token", type=str, default=None, help="Auth Token") + parser.add_argument("--format_tabs", action="store_true", help="Format Tabs") + parser.add_argument("--problem_file", type=str, default=None, help="Evalset File") + parser.add_argument("--no_append", action="store_false", help="Append to existing file") + parser.add_argument("--skip", type=int, default=0, help="Skip first n problems") + args = parser.parse_args() + # api_url = "https://api.siliconflow.cn/v1/chat/completions" + main(args.out_path, args.api_url, args.model_name, args.auth_token, args.format_tabs, args.problem_file, args.no_append,args.skip) \ No newline at end of file diff --git a/ktransformers/tests/humaneval/evaluation.py b/ktransformers/tests/humaneval/evaluation.py new file mode 100644 index 0000000..e4860c6 --- /dev/null +++ b/ktransformers/tests/humaneval/evaluation.py @@ -0,0 +1,15 @@ +# reference: https://github.com/declare-lab/instruct-eval/blob/main/human_eval/main.py#L35 +def filter_code(completion: str) -> str: + # The program tends to overwrite, we only take the first function + completion = completion.lstrip("\n") + # we also remove ```python\n and ``` + completion = completion.replace("```python\n", "").replace("```", "") + if 'if __name__ == "__main__":' in completion: + completion = completion.split('if __name__ == "__main__":')[0] + if "# Example usage" in completion: + completion = completion.split("# Example usage")[0] + return completion + + +def fix_indents(text: str) -> str: + return text.replace("\t", " ") diff --git a/ktransformers/tests/humaneval/prompts.py b/ktransformers/tests/humaneval/prompts.py new file mode 100644 index 0000000..694bac8 --- /dev/null +++ b/ktransformers/tests/humaneval/prompts.py @@ -0,0 +1,14 @@ +def instruct_prompt(prompt: str) -> str: + return f"""Below is an instruction that describes a task. 
Write a response that appropriately completes the request.\n\n### Instruction:\nComplete the following Python code without any tests or explanation\n{prompt}\n\n### Response:""" + + +def standard_prompt(prompt: str) -> str: + return f"""Complete the following Python code without any tests or explanation\n{prompt}""" + + +def write_prompt(prompt: str) -> str: + return f"""Write a python program to complete the following code:\n{prompt}""" + + +def replit_glaive_prompt(prompt: str) -> str: + return f"""Below is an instruction that describes a task, paired with an input that provides further context.\n Write a response that appropriately completes the request.\n\n ### Instruction:\nWrite a program to perform the given task.\n\n Input:\n{prompt}\n\n### Response:""" diff --git a/ktransformers/tests/mmlu_pro_test.py b/ktransformers/tests/mmlu_pro_test.py new file mode 100644 index 0000000..c0db4a5 --- /dev/null +++ b/ktransformers/tests/mmlu_pro_test.py @@ -0,0 +1,195 @@ +import argparse +import random +import time +import json +import requests +import pandas as pd +from datasets import load_dataset + +import os +os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com' +os.environ['https_proxy'] = '' +os.environ['http_proxy'] = '' +hint = 'There is a single choice question. Answer the question by replying A, B, C, D, E, F, G, H, I, J. No other answers are accepted. Just the letter.' + + +class DataEvaluator: + def __init__(self): + # self.template_prompt = template_prompt + self.data = [] + + def load_data(self, file_path): + """ + Load the MMLU-Pro test split into a list. + Each element of the list is one question record (a dict). + """ + # Load the dataset via Hugging Face datasets + # dataset = load_dataset('parquet', data_files=file_path) + ds = load_dataset("TIGER-Lab/MMLU-Pro") + df = pd.DataFrame(ds['test']) + # print(ds) + # # ds_1 = ds['train'] + # ds_2 = ds['validation'] + # ds_3 = ds['test'] + # # Convert the dataset splits to Pandas DataFrames + # df_test = pd.DataFrame(ds['test']) + # df_val = pd.DataFrame(ds['validation']) + + # for _, row in df.iterrows(): + # self.data.append(row.to_dict()) + # df = pd.read_parquet(file_path) + + for _, row in df.iterrows(): + self.data.append(row.to_dict()) + + def get_prompt(self, record): + """ + Combine fields from a record with the template prompt to create a full prompt. + :param record: Dictionary containing fields to populate the template. + :return: A formatted prompt string. + """ + # Build the lettered option list (A, B, C, ...) + options_str = "\n".join([f"{chr(65+i)}. {opt}" for i, opt in enumerate(record['options'])]) + prompt = hint + "\nQuestion: " + record['question'] + "\n" + options_str + "\nAnswer: '" + return prompt + + def post_processing(self, text): + """ + Perform post-processing on the prediction string. + :param text: The raw prediction string. + :return: Processed prediction string. + """ + text = text.lstrip('\n').split('\n')[-1] + return text[-1:] + + def score(self, pred, answers): + """ + Check whether the prediction matches the reference answer. + :param pred: The predicted string. + :param answers: The reference answer(s). + :return: 1 if the prediction matches, 0 otherwise.
+ """ + for answer in answers: + if pred == answer: + return 1 + + return 0 + +# Function to generate text using API +def generate_text(api_url, question, model_name, stream=False): + headers = { + 'accept': 'application/json', + 'Content-Type': 'application/json', + # 添加 API Key + 'Authorization' : 'Bearer ' + } + data = { + "messages": [{"content": question, "role": "user"}], + "model": model_name, + "stream": stream, + # "temperature": 0.0 + } + + print("POST data:", data) + response = requests.post(api_url, headers=headers, json=data) + + if response.status_code == 200: + result = response.json() + return result.get('choices', [{}])[0].get('message', {}).get('content', '').strip() + else: + print(f"API Request failed with status code {response.status_code}") + return None + +# Main function to handle multiple evaluations +def main(concurrent_requests, data_evaluator: DataEvaluator, result_file, log_file, api_url, model_name): + start_total_time = time.time() + + total_score = 0 + + results = [] + # 设置随机数种子 + random.seed(42) + random.shuffle(data_evaluator.data) + for i in range(min(concurrent_requests, len(data_evaluator.data))): + # Randomly select a data item from data for each request + data_item = data_evaluator.data[i] + question = data_evaluator.get_prompt(data_item) + # print(question) + + # Start the timer for this evaluation + start_time = time.time() + try: + # Generate prediction using the API + prediction = generate_text(api_url, question, model_name) + + if prediction is None: + raise Exception(f"Failed to get prediction for {question}") + + answer = data_item['answer'] + # Compute score + score = data_evaluator.score(data_evaluator.post_processing(prediction), answer) + + # Calculate the time taken + elapsed_time = time.time() - start_time + + # Collect the result data + result_data = { + "question_id": data_item['question_id'], + "answer": answer, + "prediction": data_evaluator.post_processing(prediction), + "score": score, + "time": elapsed_time + } + + # Write results to result.json with each field on a new line + with open(result_file, 'a', encoding='utf-8') as f: + json.dump(result_data, f, ensure_ascii=False, indent=4) + f.write("\n") # Ensure each JSON object is on a new line + + results.append(result_data) + + # Aggregate scores + total_score += score + + except Exception as e: + print(f"Error processing request {i}: {e}") + + # Calculate total time and throughput + total_time = time.time() - start_total_time + throughput = concurrent_requests / total_time + + # Log the total time, throughput, and average ROUGE scores + with open(log_file, 'a', encoding='utf-8') as log_f: + log_f.write(f"Total Time: {total_time:.2f} seconds\n") + log_f.write(f"Throughput: {throughput:.2f} requests per second\n") + log_f.write(f"Average Scores: {total_score / concurrent_requests}\n") + log_f.write('-' * 40 + '\n') + + print(f"Results saved to {result_file}") + print(f"Log saved to {log_file}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="API Generate Tester") + parser.add_argument("--concurrent", type=int, default=1000, help="Number of concurrent evaluations") + parser.add_argument("--file", type=str, default="TIGER-Lab/MMLU-Pro", help="Path to the mmlu.jsonl file") + parser.add_argument("--result", type=str, default="./mmlu_result_pro.json", help="Path to save the result JSON file") + parser.add_argument("--log", type=str, default="./mmlu_result_pro.log", help="Path to save the log file") + parser.add_argument("--model", type=str, 
default="Pro/deepseek-ai/DeepSeek-V3", help="Model name or path") + parser.add_argument("--api_url", type=str, default="http://localhost:15488/v1/chat/completions", help="API URL") + # parser.add_argument("--api_url", type=str, default="https://api.siliconflow.cn/v1/chat/completions", help="API URL") + + args = parser.parse_args() + + # Load the data from the provided file + # template_prompt = hint + "\nQuestion: {question}\nA. {options}\nB. {option_b}\nC. {option_c}\nD. {option_d}\nAnswer: '" + # template_prompt_pro = hint + "\nQuestion: {question}\nA. {options[0]}\nB. {options[1]}\nC. {options[2]}\nD. {options[3]}\nE. {options[4]}\nF. {options[5]}\nG. \ + # {options[6]}\nH. {options[7]}\nI. {options[8]}\nJ. {options[9]}\nAnswer: '" + + + # Load the data from the provided file + data_evaluator = DataEvaluator() + data_evaluator.load_data(args.file) + + # Run the main function with the specified number of concurrent evaluations + main(args.concurrent, data_evaluator, args.result, args.log, args.api_url, args.model) \ No newline at end of file diff --git a/ktransformers/tests/mmlu_test.py b/ktransformers/tests/mmlu_test.py new file mode 100644 index 0000000..452cbbf --- /dev/null +++ b/ktransformers/tests/mmlu_test.py @@ -0,0 +1,195 @@ +import argparse +import random +import time +import json +import requests +import pandas as pd +from datasets import load_dataset + +import os +os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com' +os.environ['https_proxy'] = '' +os.environ['http_proxy'] = '' +hint = 'There is a single choice question. Answer the question by replying A, B, C, D. No other answers are accepted. Just the letter.' + + +class DataEvaluator: + def __init__(self): + # self.template_prompt = template_prompt + self.data = [] + + def load_data(self, file_path): + """ + Load the MMLU test split into a list. + Each element of the list is one question record (a dict). + """ + # Load the dataset via Hugging Face datasets + # dataset = load_dataset('parquet', data_files=file_path) + ds = load_dataset(file_path,"all") + df = pd.DataFrame(ds['test']) + # print(ds) + # # ds_1 = ds['train'] + # ds_2 = ds['validation'] + # ds_3 = ds['test'] + # # Convert the dataset splits to Pandas DataFrames + # df_test = pd.DataFrame(ds['test']) + # df_val = pd.DataFrame(ds['validation']) + + # for _, row in df.iterrows(): + # self.data.append(row.to_dict()) + # df = pd.read_parquet(file_path) + + for _, row in df.iterrows(): + self.data.append(row.to_dict()) + + def get_prompt(self, record): + """ + Combine fields from a record with the template prompt to create a full prompt. + :param record: Dictionary containing fields to populate the template. + :return: A formatted prompt string. + """ + # Build the lettered option list (A, B, C, D) + options_str = "\n".join([f"{chr(65 + i)}. {opt}" for i, opt in enumerate(record['choices'])]) + prompt = hint + "\nQuestion: " + record['question'] + "\n" + options_str + "\nAnswer: '" + return prompt + + def post_processing(self, text): + """ + Perform post-processing on the prediction string. + :param text: The raw prediction string. + :return: Processed prediction string. + """ + text = text.lstrip('\n').split('\n')[-1] + return text[-1:] + + def score(self, pred, answers): + """ + Check whether the prediction matches the reference answer. + :param pred: The predicted string. + :param answers: The reference answer(s). + :return: 1 if the prediction matches, 0 otherwise.
+ """ + for answer in answers: + if pred == answer: + return 1 + + return 0 + +# Function to generate text using API +def generate_text(api_url, question, model_name, stream=False): + headers = { + 'accept': 'application/json', + 'Content-Type': 'application/json', + # 添加 API Key + 'Authorization' : 'Bearer ' + } + data = { + "messages": [{"content": question, "role": "user"}], + "model": model_name, + "stream": stream, + # "temperature": 0.0 + } + + print("POST data:", data) + response = requests.post(api_url, headers=headers, json=data) + + if response.status_code == 200: + result = response.json() + return result.get('choices', [{}])[0].get('message', {}).get('content', '').strip() + else: + print(f"API Request failed with status code {response.status_code}") + return None + +# Main function to handle multiple evaluations +def main(concurrent_requests, data_evaluator: DataEvaluator, result_file, log_file, api_url, model_name): + start_total_time = time.time() + + total_score = 0 + + results = [] + # 设置随机数种子 + random.seed(42) + random.shuffle(data_evaluator.data) + for i in range(min(concurrent_requests, len(data_evaluator.data))): + # Randomly select a data item from data for each request + data_item = data_evaluator.data[i] + question = data_evaluator.get_prompt(data_item) + # print(question) + + # Start the timer for this evaluation + start_time = time.time() + try: + # Generate prediction using the API + prediction = generate_text(api_url, question, model_name) + + if prediction is None: + raise Exception(f"Failed to get prediction for {question}") + + answer = chr(data_item['answer'] + 65) + # Compute score + score = data_evaluator.score(data_evaluator.post_processing(prediction), answer) + + # Calculate the time taken + elapsed_time = time.time() - start_time + + # Collect the result data + result_data = { + "question_id": i, + "answer": answer, + "prediction": data_evaluator.post_processing(prediction), + "score": score, + "time": elapsed_time + } + + # Write results to result.json with each field on a new line + with open(result_file, 'a', encoding='utf-8') as f: + json.dump(result_data, f, ensure_ascii=False, indent=4) + f.write("\n") # Ensure each JSON object is on a new line + + results.append(result_data) + + # Aggregate scores + total_score += score + + except Exception as e: + print(f"Error processing request {i}: {e}") + + # Calculate total time and throughput + total_time = time.time() - start_total_time + throughput = concurrent_requests / total_time + + # Log the total time, throughput, and average ROUGE scores + with open(log_file, 'a', encoding='utf-8') as log_f: + log_f.write(f"Total Time: {total_time:.2f} seconds\n") + log_f.write(f"Throughput: {throughput:.2f} requests per second\n") + log_f.write(f"Average Scores: {total_score / concurrent_requests}\n") + log_f.write('-' * 40 + '\n') + + print(f"Results saved to {result_file}") + print(f"Log saved to {log_file}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="API Generate Tester") + parser.add_argument("--concurrent", type=int, default=1000, help="Number of concurrent evaluations") + parser.add_argument("--file", type=str, default="cais/mmlu", help="Path to the mmlu.jsonl file") + parser.add_argument("--result", type=str, default="./mmlu_result_silicon.json", help="Path to save the result JSON file") + parser.add_argument("--log", type=str, default="./mmlu_result_silicon.log", help="Path to save the log file") + parser.add_argument("--model", type=str, 
default="Pro/deepseek-ai/DeepSeek-V3", help="Model name or path") + parser.add_argument("--api_url", type=str, default="http://localhost:10003/v1/chat/completions", help="API URL") + # parser.add_argument("--api_url", type=str, default="https://api.siliconflow.cn/v1/chat/completions", help="API URL") + + args = parser.parse_args() + + # Load the data from the provided file + # template_prompt = hint + "\nQuestion: {question}\nA. {options}\nB. {option_b}\nC. {option_c}\nD. {option_d}\nAnswer: '" + # template_prompt_pro = hint + "\nQuestion: {question}\nA. {options[0]}\nB. {options[1]}\nC. {options[2]}\nD. {options[3]}\nE. {options[4]}\nF. {options[5]}\nG. \ + # {options[6]}\nH. {options[7]}\nI. {options[8]}\nJ. {options[9]}\nAnswer: '" + + + # Load the data from the provided file + data_evaluator = DataEvaluator() + data_evaluator.load_data(args.file) + + # Run the main function with the specified number of concurrent evaluations + main(args.concurrent, data_evaluator, args.result, args.log, args.api_url, args.model) \ No newline at end of file diff --git a/ktransformers/tests/triton_fp8gemm_test.py b/ktransformers/tests/triton_fp8gemm_test.py new file mode 100644 index 0000000..58888d6 --- /dev/null +++ b/ktransformers/tests/triton_fp8gemm_test.py @@ -0,0 +1,116 @@ +import torch +import torch.nn.functional as F +from typing import Optional +import pytest +from typing import Tuple, Optional, Literal +import time +# use dir path +import os +import sys +sys.path.insert(0, "/home/azure/ktransformers") +print(sys.path) +from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant +from safetensors import safe_open + +world_size = 1 +rank = 0 +block_size = 128 +gemm_impl: Literal["bf16", "fp8"] = "bf16" +# Assuming `fp8_gemm`, `act_quant`, `weight_dequant` and other relevant functions are already defined + +def test_fp8_gemm_vs_torch_matmul(): + # Test case 1: Create random matrices of size (M, K) and (K, N) + M, K, N = 64, 128, 256 # Matrix dimensions + x = torch.randn(M, K, dtype=torch.bfloat16, device='cuda') + weight = torch.randn(N, K, dtype=torch.bfloat16, device='cuda') + + # Apply act_quant to both matrices + x_quantized, scale_x = act_quant(x, block_size) + weight_quantized, scale_w = act_quant(weight, block_size) + + # mk continous + x_quantized = x_quantized.contiguous() + weight_quantized = weight_quantized.contiguous() + scale_x = scale_x.contiguous() + scale_w = scale_w.contiguous() + + # Perform fp8_gemm using the quantized tensors + result_fp8_gemm = fp8_gemm(x_quantized, scale_x, weight_quantized, scale_w) + + # Perform torch.matmul using the original floating point tensors + result_torch_matmul = torch.matmul(x, weight.T) + print(f'result_torch_matmul: {result_torch_matmul.shape}') + print(f'result_fp8_gemm: {result_fp8_gemm.shape}') + + print(f"result_fp8_gemm:\n {result_fp8_gemm}") + print(f"result_torch_matmul:\n {result_torch_matmul}") + +def test_fp8_gemm_vs_torch_matmul_load(): + file_path = "/mnt/data/model/DeepSeek-V3/model-00001-of-000163.safetensors" + with safe_open(file_path, framework="pt", device=0) as f: + weight = f.get_tensor("model.layers.0.mlp.down_proj.weight") + scale = f.get_tensor("model.layers.0.mlp.down_proj.weight_scale_inv") + + # weight_dequant + weight_dequantized = weight_dequant(weight, scale) + print(f"weight_dequantized: {weight_dequantized.shape}") + N, K = weight_dequantized.shape + M = 64 + x = torch.randn(2 ,M, K, dtype=torch.bfloat16, device='cuda') + x_quantized, scale_x = act_quant(x, block_size) + + # 
Test case 1: quantized x matmal with undequantized weight + result_fp8_gemm = fp8_gemm(x_quantized, scale_x, weight, scale) + print(f"result_fp8_gemm:\n {result_fp8_gemm}") + print(f"dtype {result_fp8_gemm.dtype}") + + # Perform torch.matmul using the original floating point tensors + result_torch_matmul = torch.matmul(x, weight_dequantized.to(torch.bfloat16).T) + print(f"result_torch_matmul:\n {result_torch_matmul}") + +def test_fp8_gemm_tplops(): + file_path = "/mnt/data/model/DeepSeek-V3/model-00001-of-000163.safetensors" + with safe_open(file_path, framework="pt", device=0) as f: + weight = f.get_tensor("model.layers.0.mlp.down_proj.weight") + scale = f.get_tensor("model.layers.0.mlp.down_proj.weight_scale_inv") + + # weight_dequant + weight_dequantized = weight_dequant(weight, scale) + print(f"weight_dequantized: {weight_dequantized.shape}") + N, K = weight_dequantized.shape + M = 6400 + x = torch.randn(2 ,M, K, dtype=torch.bfloat16, device='cuda') + # x_quantized, scale_x = act_quant(x, block_size) + + # Calculate time for 1000 fp8_gemm + i = 10 + flops_per_gemm = 2 * M * N * K + total_flops = i * flops_per_gemm + + x_quantized, scale_x = act_quant(x, block_size) + result_fp8_gemm = fp8_gemm(x_quantized, scale_x, weight, scale) + x_quantized, scale_x = act_quant(x, block_size) + result_fp8_gemm = fp8_gemm(x_quantized, scale_x, weight, scale) + + + t0 = time.time() + torch.cuda.synchronize() + for i in range(i): + x_quantized, scale_x = act_quant(x, block_size) + result_fp8_gemm = fp8_gemm(x_quantized, scale_x, weight, scale) + torch.cuda.synchronize() + t1 = time.time() + + total_time = t1 - t0 + tflops = total_flops / total_time / 1e12 + print(f"total_time: {total_time}") + print(f"tflops: {tflops}") + + + + +if __name__ == "__main__": + test_fp8_gemm_vs_torch_matmul() + test_fp8_gemm_vs_torch_matmul_load() + test_fp8_gemm_tplops() + \ No newline at end of file diff --git a/ktransformers/util/custom_gguf.py b/ktransformers/util/custom_gguf.py index a90c0ed..84ada15 100644 --- a/ktransformers/util/custom_gguf.py +++ b/ktransformers/util/custom_gguf.py @@ -25,6 +25,9 @@ import os from enum import IntEnum import torch import KTransformersOps +from .custom_loader import SafeTensorLoader +import ctypes +import math class GGMLQuantizationType(IntEnum): F32 = 0 @@ -109,6 +112,7 @@ GGML_TYPES = { "Q5_K": 13, "Q6_K": 14, "IQ4_XS": 23, + "BF16": 30, } GGML_NAMES = {ggml_type: name for name, ggml_type in GGML_TYPES.items()} @@ -116,6 +120,7 @@ GGML_NAMES = {ggml_type: name for name, ggml_type in GGML_TYPES.items()} GGML_BLOCK_SIZES = { "F32": 4, "F16": 2, + "BF16": 2, "Q4_0": 2 + 16, "Q5_0": 2 + 4 + 16, "Q8_0": 2 + 32, @@ -125,11 +130,13 @@ GGML_BLOCK_SIZES = { "Q5_K": 2 + 2 + 12 + 256 // 8 + 256 // 2, "Q6_K": 256 // 2 + 256 // 4 + 256 // 16 + 2, "IQ4_XS": 2 + 2 + 256 // 2 + 256 // 64, + "FP8": 1, } GGML_ELEMENTS_PER_BLOCK = { "F32": 1, "F16": 1, + "BF16": 1, "Q4_0": 32, "Q5_0": 32, "Q8_0": 32, @@ -139,6 +146,7 @@ GGML_ELEMENTS_PER_BLOCK = { "Q5_K": 256, "Q6_K": 256, "IQ4_XS": 256, + "FP8": 1, } DATA_TYPES = { @@ -155,6 +163,7 @@ DATA_TYPES = { "uint64": 10, "int64": 11, "float64": 12, + "FP8": 13, } class GGUFLoader: @@ -162,10 +171,15 @@ class GGUFLoader: gguf_path: str tensor_file_map: dict # {tensor_name: tensor_file_path} gguf_file_meta: dict + safetensor_loader: SafeTensorLoader def __init__(self, gguf_path: str): # Check dir exist if not os.path.exists(gguf_path): raise FileNotFoundError(f"GGUF dir not found: {gguf_path}") + if os.path.isfile(gguf_path): + gguf_path = 
os.path.dirname(gguf_path) + + self.safetensor_loader = None self.tensor_info = {} self.gguf_path = gguf_path @@ -173,16 +187,26 @@ class GGUFLoader: self.file_data_map = {} self.gguf_file_meta = {} self.tensor_device_map = {} - + + # I know this is ugly, but I don't want to change the original code too much + # TODO: merge gguf load and other loads. + safetensor_loader = SafeTensorLoader(gguf_path) + if safetensor_loader.tensor_file_map: + self.safetensor_loader = safetensor_loader + return # Walk through all the .gguf files in the directory + found_gguf = False for root, dirs, files in os.walk(gguf_path): for file in files: if file.endswith(".gguf"): + found_gguf = True file_name = os.path.join(root, file) with open(file_name, "rb") as f: self.load_gguf(f) if file_name not in self.file_data_map: self.file_data_map[file_name] = np.memmap(file_name, mode = 'r') + if not found_gguf: + raise FileNotFoundError(f"Cannot find any .gguf files in: {gguf_path}") def load_gguf(self, f): f.seek(0) @@ -207,7 +231,7 @@ class GGUFLoader: shape = [read_value(f, DATA_TYPES["uint64"]) for _ in range(shape_len)] ggml_type = read_value(f, DATA_TYPES["uint32"]) bad_offset = read_value(f, DATA_TYPES["uint64"]) - n_elems = int(np.prod(shape)) + n_elems = int(math.prod(shape)) block_size, type_size = GGML_QUANT_SIZES[ggml_type] n_bytes = n_elems * type_size // block_size np_dims = tuple(reversed(shape)) @@ -276,8 +300,49 @@ class GGUFLoader: itemsize = int(np.empty([], dtype = item_type).itemsize) return mmap_data[offset : offset + itemsize * item_count] - def load_gguf_tensor(self, name: str, device:str = "cpu")->torch.Tensor: + def get_undequanted_tensor_and_ggml_type(self, name): t = self.tensor_info[name] + data = self.get_mmap_tensor(name) + ggml_type = t["ggml_type"] + data = torch.from_numpy(data) + return data, ggml_type + + def load_expert_tensor(self, name, data, expert_id, elements_per_expert, device = "cuda", target_dtype = torch.get_default_dtype())->torch.Tensor: + t = self.tensor_info[name] + if device.lower() == "cpu": + print(f"loading expert {expert_id} of {name} with CPU") + shape = t["shape"] + ggml_type = t["ggml_type"] + if ggml_type not in GGML_NAMES: + raise NotImplementedError(f"ggml_type {ggml_type} not implemented") + ggml_name = GGML_NAMES[ggml_type] + + # TODO: experts may fused in quant block, split it + assert elements_per_expert % GGML_ELEMENTS_PER_BLOCK[ggml_name] == 0, "experts may fused in quant block, please use CPU dequant" + + blocks_per_experts = elements_per_expert // GGML_ELEMENTS_PER_BLOCK[ggml_name] + block_size = GGML_BLOCK_SIZES[ggml_name] + offset = expert_id * block_size * blocks_per_experts + data = data[offset: offset + block_size * blocks_per_experts] + + if "cuda" in device.lower(): + values = GGML_DEQUANTIZE_GPU[ggml_name](data, device, target_dtype) + else: + values = GGML_DEQUANTIZE[ggml_name](data) + values = torch.from_numpy(values.copy()) + + if ggml_name == "BF16": + values = values.view(torch.bfloat16) + values = values.view(shape[-2::-1]) + + return values + + def load_gguf_tensor(self, name: str, device:str = "cpu", target_dtype = None)->torch.Tensor: + t = self.tensor_info[name] + if device.lower() == "cpu": + print(f"loading {name} with CPU") + if target_dtype == None: + target_dtype = torch.get_default_dtype() shape = t["shape"] ggml_type = t["ggml_type"] @@ -289,14 +354,38 @@ class GGUFLoader: data = self.get_mmap_tensor(name) - if "cuda" in device.lower(): - values = GGML_DEQUANTIZE_GPU[ggml_name](data, device) - #values = 
GGML_DEQUANTIZE[ggml_name](data) - #print("load_gguf_tensor") - #values = torch.from_numpy(values).to(device = device) + block_size = GGML_BLOCK_SIZES[ggml_name] + elements_per_block = GGML_ELEMENTS_PER_BLOCK[ggml_name] + num_elements = int(np.prod(shape)) + num_blocks = num_elements // elements_per_block + + blocks_per_iter = 16384 + if num_blocks > blocks_per_iter: # dequant large tensor + values = torch.empty((num_blocks, elements_per_block), dtype=target_dtype, device=device) + for i in range( (num_blocks + blocks_per_iter - 1) // blocks_per_iter): + blocks_begin = i * blocks_per_iter + blocks_end = min(blocks_begin + blocks_per_iter, num_blocks) + if "cuda" in device.lower(): + cur_values = GGML_DEQUANTIZE_GPU[ggml_name](data[blocks_begin*block_size : blocks_end*block_size], device, target_dtype) + else: + cur_values = GGML_DEQUANTIZE[ggml_name](data[blocks_begin*block_size : blocks_end*block_size]) + cur_values = torch.from_numpy(cur_values.copy()) + + cur_values = cur_values.view(-1, elements_per_block) + if ggml_name == "BF16": + cur_values = cur_values.view(torch.bfloat16) + values[blocks_begin : blocks_end] = cur_values else: - values = GGML_DEQUANTIZE[ggml_name](data) - values = torch.from_numpy(values) + if "cuda" in device.lower(): + values = GGML_DEQUANTIZE_GPU[ggml_name](data, device) + else: + values = GGML_DEQUANTIZE[ggml_name](data) + values = torch.from_numpy(values) + + if ggml_name == "BF16": + values = values.view(torch.bfloat16) + + values = values.view(shape[::-1]) if "attn_q" in name and self.gguf_file_meta['general.architecture'] in ["llama"]: n_head = self.gguf_file_meta['llama.attention.head_count'] @@ -352,6 +441,9 @@ def read_value(f, data_type): elem_type, count = struct.unpack("> 4) -def dequantize_q2_k_gpu(data, device:str ="cuda"): +def dequantize_q2_k_gpu(data, device:str ="cuda", target_dtype = torch.get_default_dtype()): block_size = GGML_BLOCK_SIZES["Q2_K"] + ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q2_K"] data = np.frombuffer(data, dtype=data.dtype) device = torch.device(device) # TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable, # the best way to fix this is transfer ptr to KTransformersOps instead of Tensor. - data = torch.from_numpy(data) - return KTransformersOps.dequantize_q2_k(data, block_size, device) + c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents) + return KTransformersOps.dequantize_q2_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype) def dequantize_q3_k(data): # C implementation @@ -443,14 +536,15 @@ def dequantize_q3_k(data): (((qs[:, 48:64] >> 6) & 3) - bits[:, 16:, 7]) ], axis=1) -def dequantize_q3_k_gpu(data, device:str ="cuda"): +def dequantize_q3_k_gpu(data, device:str ="cuda", target_dtype = torch.get_default_dtype()): block_size = GGML_BLOCK_SIZES["Q3_K"] + ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q3_K"] data = np.frombuffer(data, dtype=data.dtype) device = torch.device(device) # TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable, # the best way to fix this is transfer ptr to KTransformersOps instead of Tensor. 
- data = torch.from_numpy(data) - return KTransformersOps.dequantize_q3_k(data, block_size, device) + c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents) + return KTransformersOps.dequantize_q3_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype) def dequantize_q4_k(data): # C implementation @@ -474,13 +568,15 @@ def dequantize_q4_k(data): # Dequantize final weights using scales and offsets return factors * qs2 - offsets -def dequantize_q4_k_gpu(data, device:str ="cuda"): +def dequantize_q4_k_gpu(data, device:str ="cuda", target_dtype = torch.get_default_dtype()): + block_size = GGML_BLOCK_SIZES["Q4_K"] + ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q4_K"] data = np.frombuffer(data, dtype=data.dtype) device = torch.device(device) # TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable, # the best way to fix this is transfer ptr to KTransformersOps instead of Tensor. - data = torch.from_numpy(data) - return KTransformersOps.dequantize_q4_k(data, 144, device) + c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents) + return KTransformersOps.dequantize_q4_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype) def dequantize_q5_k(data): # C implementation @@ -538,14 +634,15 @@ def dequantize_q5_k(data): d8 * (qs_hi_4[:, 3] + (bits[:, :, 7] << 4)) - m8, ], axis=1) -def dequantize_q5_k_gpu(data, device:str ="cuda"): +def dequantize_q5_k_gpu(data, device:str ="cuda", target_dtype = torch.get_default_dtype()): block_size = GGML_BLOCK_SIZES["Q5_K"] + ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q5_K"] data = np.frombuffer(data, dtype=data.dtype) device = torch.device(device) # TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable, # the best way to fix this is transfer ptr to KTransformersOps instead of Tensor. 
- data = torch.from_numpy(data) - return KTransformersOps.dequantize_q5_k(data, block_size, device) + c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents) + return KTransformersOps.dequantize_q5_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype) def dequantize_q6_k(data): # C implementation @@ -596,13 +693,14 @@ def dequantize_q6_k(data): ], axis=1) # @torch.jit.script -def dequantize_q6_k_gpu(data: np.ndarray, device:str = "cuda"): +def dequantize_q6_k_gpu(data: np.ndarray, device:str = "cuda", target_dtype = torch.get_default_dtype()): block_size = GGML_BLOCK_SIZES["Q6_K"] + ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q6_K"] device = torch.device(device) num_blocks = len(data) // block_size data = np.frombuffer(data, dtype=data.dtype) - data = torch.from_numpy(data) - return KTransformersOps.dequantize_q6_k(data, block_size, device) + c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents) + return KTransformersOps.dequantize_q6_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype) kvalues_iq4nl = np.array([-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113], dtype=np.int8) @@ -636,13 +734,14 @@ def dequantize_iq4_xs(data): return y.flatten() -def dequantize_iq4_xs_gpu(data: np.ndarray, device:str = "cuda"): +def dequantize_iq4_xs_gpu(data: np.ndarray, device:str = "cuda", target_dtype = torch.get_default_dtype()): block_size = GGML_BLOCK_SIZES["IQ4_XS"] + ele_per_blk = GGML_ELEMENTS_PER_BLOCK["IQ4_XS"] device = torch.device(device) num_blocks = len(data) // block_size data = np.frombuffer(data, dtype=data.dtype) - data = torch.from_numpy(data) - return KTransformersOps.dequantize_iq4_xs(data, block_size, device) + c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents) + return KTransformersOps.dequantize_iq4_xs(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype) def dequantize_q4_0(data): # C implementation @@ -659,7 +758,7 @@ def dequantize_q4_0(data): scales * ((qs >> 4).astype(np.int8) - 8), ], axis=1) -def dequantize_q4_0_gpu(data): +def dequantize_q4_0_gpu(data, device:str = "cuda", target_dtype = torch.get_default_dtype()): raise NotImplementedError() def dequantize_q5_0(data): @@ -683,7 +782,7 @@ def dequantize_q5_0(data): scales * x1, ], axis=1) -def dequantize_q5_0_gpu(data): +def dequantize_q5_0_gpu(data, device:str = "cuda", target_dtype = torch.get_default_dtype()): raise NotImplementedError() def dequantize_q8_0(data): @@ -695,32 +794,41 @@ def dequantize_q8_0(data): qs = np.frombuffer(data, dtype=np.int8).reshape(num_blocks, 2 + 32)[:, 2:] return scales * qs -def dequantize_q8_0_gpu(data, device:str = "cuda"): +def dequantize_q8_0_gpu(data, device:str = "cuda", target_dtype = torch.get_default_dtype()): # C struct definition # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L43 - num_blocks = len(data) // GGML_BLOCK_SIZES["Q8_0"] + + block_size = GGML_BLOCK_SIZES["Q8_0"] + ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q8_0"] device = torch.device(device) data = np.frombuffer(data, dtype=data.dtype) - data = torch.from_numpy(data) - return KTransformersOps.dequantize_q8_0(data, 34, device) + c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents) + return KTransformersOps.dequantize_q8_0(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype) def dequantize_f32(data): 
return np.frombuffer(data, dtype=np.float32) -def dequantize_f32_gpu(data, device): +def dequantize_f32_gpu(data, device, target_dtype = torch.get_default_dtype()): data = np.frombuffer(data, dtype=np.float32) - res = torch.from_numpy(data) - res_gpu = torch.empty_like(res, device=device) + res = torch.from_numpy(data.copy()) + res_gpu = torch.empty_like(res, device=device, dtype=target_dtype) res_gpu.copy_(res) return res_gpu def dequantize_f16(data): return np.frombuffer(data, dtype=np.float16) -def dequantize_f16_gpu(data, device): +def dequantize_f16_gpu(data, device, target_dtype = torch.get_default_dtype()): data = np.frombuffer(data, dtype=np.float16) - res = torch.from_numpy(data) + res = torch.from_numpy(data.copy()) + res_gpu = torch.empty_like(res, device=device, dtype=target_dtype) + res_gpu.copy_(res) + return res_gpu + +def dequantize_bf16_gpu(data, device, target_dtype = torch.get_default_dtype()): + data = np.frombuffer(data, dtype=np.float16) + res = torch.from_numpy(data.copy()) res_gpu = torch.empty_like(res, device=device) res_gpu.copy_(res) return res_gpu @@ -728,6 +836,7 @@ def dequantize_f16_gpu(data, device): GGML_DEQUANTIZE = { "F32": dequantize_f32, "F16": dequantize_f16, + "BF16": dequantize_f16, "Q4_0": dequantize_q4_0, "Q5_0": dequantize_q5_0, "Q8_0": dequantize_q8_0, @@ -742,6 +851,7 @@ GGML_DEQUANTIZE = { GGML_DEQUANTIZE_GPU = { "F32": dequantize_f32_gpu, "F16": dequantize_f16_gpu, + "BF16": dequantize_bf16_gpu, "Q4_0": dequantize_q4_0_gpu, "Q5_0": dequantize_q5_0_gpu, "Q8_0": dequantize_q8_0_gpu, diff --git a/ktransformers/util/custom_loader.py b/ktransformers/util/custom_loader.py new file mode 100644 index 0000000..ecc09a0 --- /dev/null +++ b/ktransformers/util/custom_loader.py @@ -0,0 +1,86 @@ +import struct +import warnings +import numpy as np +import re +import numpy.typing as npt +from typing import Sequence +import os +from enum import IntEnum +import torch +import KTransformersOps +from safetensors import safe_open +from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant +from safetensors.torch import save_file + +class SafeTensorLoader: + tensor_file_map = {} + tensor_type_map = {} + file_handle_map = {} + + def __init__(self, file_path: str): + self.__load_tensor_file_map(file_path) + + def __load_tensor_file_map(self, file_path: str): + # 处理传入路径,确保是文件夹路径 + if not os.path.exists(file_path): + raise FileNotFoundError(f"Path not found: {file_path}") + if os.path.isfile(file_path): + folder_path = os.path.dirname(file_path) + else: + folder_path = file_path + + found_safetensor = False + for root, _, files in os.walk(folder_path): + files = sorted(files) + for file in files: + if file.endswith(".safetensors"): + found_safetensor = True + file_path = os.path.join(root, file) + if file not in self.file_handle_map: + try: + handle = safe_open(file_path, framework="pt") + self.file_handle_map[file] = handle + except Exception as e: + print(f"Error opening Safetensor file {file_path}: {e}") + continue + + f = self.file_handle_map.get(file) + if f is None: + continue + try: + for key in f.keys(): + self.tensor_file_map[key] = file + except Exception as e: + print(f"Error reading Safetensor file {file_path}: {e}") + + # if not found_safetensor: + # raise FileNotFoundError(f"No Safetensor files found in {folder_path}") + + def load_tensor(self, key: str, device: str="cpu"): + if key not in self.tensor_file_map: + raise KeyError(f"Key {key} not found in Safetensor files") + file = self.tensor_file_map[key] + f = 
self.file_handle_map.get(file) + if f is None: + raise FileNotFoundError(f"File {file} not found in Safetensor files") + tensor = f.get_tensor(key) + return tensor.to(device) + + def close_all_handles(self): + for handle in self.file_handle_map.values(): + handle.close() + self.file_handle_map.clear() + + def load_dequantized_tensor(self, key:str, device: str="cpu"): + if key not in self.tensor_file_map: + raise KeyError(f"Key {key} not found in Safetensor files") + file = self.tensor_file_map[key] + f = self.file_handle_map.get(file) + if f is None: + raise FileNotFoundError(f"File {file} not found in Safetensor files") + tensor = f.get_tensor(key).to(device) + if key.endswith(".weight"): + if key[:-7] + ".weight_scale_inv" in self.tensor_file_map: + weight_scale_inv = f.get_tensor(key[:-7] + ".weight_scale_inv").to(device) + tensor = weight_dequant(tensor, weight_scale_inv) + return tensor.to(device) \ No newline at end of file diff --git a/ktransformers/util/utils.py b/ktransformers/util/utils.py index 88c33fd..6f3b049 100644 --- a/ktransformers/util/utils.py +++ b/ktransformers/util/utils.py @@ -17,6 +17,21 @@ from ktransformers.operators import base_operator from ktransformers.models.custom_cache import StaticCache from ktransformers.util.cuda_graph_runner import CUDAGraphRunner from ktransformers.util.textstream import TextStreamer +from ktransformers.operators.flashinfer_wrapper import MLAWrapperSingleton + +warm_uped = False + +def get_compute_capability(device:torch.device = None): + if torch.cuda.is_available(): + if device is None: + num_gpus = torch.cuda.device_count() + min_compute_capability_major = 100 + for gpu_id in range(num_gpus): + gpu_props = torch.cuda.get_device_properties(gpu_id) + min_compute_capability_major = min(min_compute_capability_major, gpu_props.major) + return min_compute_capability_major + else: + return torch.cuda.get_device_properties(device) def set_module(model, submodule_key, module): tokens = submodule_key.split('.') @@ -63,12 +78,22 @@ def load_cur_state_dict(module: nn.Module, gguf_loader: GGUFLoader, prefix: str for name, param in local_state.items(): key = prefix + name translated_key = translate_name_to_gguf(key) - if translated_key in gguf_loader.tensor_file_map: + + # TODO: Merge all loaders. + # I know this is ugly but let's do it for now.
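+            # Illustrative sketch of the two load paths selected below (the folder path and
+            # tensor key here are hypothetical, for illustration only):
+            #   st_loader = SafeTensorLoader("/path/to/DeepSeek-V3")
+            #   w = st_loader.load_dequantized_tensor("model.embed_tokens.weight", device="cpu")
+            # FP8 weights that ship a ".weight_scale_inv" companion tensor are dequantized via
+            # weight_dequant(); plain GGUF checkpoints go through gguf_loader.load_gguf_tensor instead.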
+ if gguf_loader.safetensor_loader is not None: + load_dequantized_tensor = gguf_loader.safetensor_loader.load_dequantized_tensor + tensor_file_map = gguf_loader.safetensor_loader.tensor_file_map + else: + load_dequantized_tensor = gguf_loader.load_gguf_tensor + tensor_file_map = gguf_loader.tensor_file_map + + if translated_key in tensor_file_map: target_dtype = torch.get_default_dtype() device = get_device(translated_key[:translated_key.rfind(".")], gguf_loader.tensor_device_map) print(f"loading {translated_key} to {device}") - # device = "cpu" if "embd" in translated_key else "cuda" - weights = gguf_loader.load_gguf_tensor(translated_key, device = device).to(dtype = target_dtype) + torch.cuda.empty_cache() + weights = load_dequantized_tensor(translated_key, device=device).to(dtype=target_dtype) set_param(module, name, weights) del weights else: @@ -76,7 +101,7 @@ def load_cur_state_dict(module: nn.Module, gguf_loader: GGUFLoader, prefix: str raise Exception(f"can't find {translated_key} in GGUF file!") def load_weights(module:nn.Module, gguf_loader:GGUFLoader, prefix=''): - # print(f"recursively loading weights {prefix},{return_when_injected=}, {only_load_injected=}") + #print(f"recursively loading weights {prefix}") if not isinstance(module, base_operator.BaseInjectedModule): load_cur_state_dict(module, gguf_loader, prefix) for name, child in module._modules.items(): @@ -85,7 +110,8 @@ def load_weights(module:nn.Module, gguf_loader:GGUFLoader, prefix=''): module.load() def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cuda_graph: bool = True, - mode = 'normal', force_think: bool = False): + mode = 'normal', force_think: bool = False, chunk_prefill_size = 16384, use_flashinfer_mla = False, + num_heads = None, head_dim_ckv = None, head_dim_kpe = None, q_head_dim = None): import os os.environ["TOKENIZERS_PARALLELISM"] = "false" torch._dynamo.config.suppress_errors = True @@ -98,7 +124,9 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud tokens = [] - def decode_one_tokens(cuda_graph_runner, cur_token, position_ids, cache_position, past_key_values, use_cuda_graph: bool = True): + def decode_one_tokens(cuda_graph_runner, cur_token, position_ids, cache_position, past_key_values, logits_warper, generation_config, use_cuda_graph: bool = True): + if cuda_graph_runner is None: + use_cuda_graph = False if use_cuda_graph: logits = cuda_graph_runner(cur_token, position_ids, cache_position) else: @@ -124,8 +152,25 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud next_token = torch.argmax(next_token_scores, dim=-1) return next_token + # TODO: use CUDA Graph for chunk prefill, may get small improvement + def chunk_prefill(inputs, cache_position, past_key_values): + if mode == "long_context": + inputs_embeds = model.model.embed_tokens(inputs.to("cpu")) + else: + inputs_embeds = model.model.embed_tokens(inputs.to("cpu")).to(torch_device) + if use_flashinfer_mla: + MLAWrapperSingleton.update_buffer(past_key_values.max_pages) + MLAWrapperSingleton.need_plan_all() + + logits = model( + inputs_embeds = inputs_embeds, cache_position=cache_position, past_key_values=past_key_values, return_dict=False, use_cache=True + )[0][:,-1,:].unsqueeze(0).clone().to(torch_device) + + return logits + torch.cuda.set_device(torch_device) with torch.no_grad(): + stream = TextStreamer(tokenizer) if mode != 'long_context': past_key_values = StaticCache( @@ -133,26 +178,11 @@ def prefill_and_generate(model, tokenizer, inputs, 
max_new_tokens=10000, use_cud ) else: past_key_values = None - cache_position = torch.arange(seq_length, device=torch_device) - generated_ids = torch.zeros( - batch_size, seq_length + max_new_tokens + 1, dtype=torch.int, device=torch_device - ) - generated_ids[:, cache_position] = inputs.to(torch_device).to(torch.int) - if past_key_values != None: - past_key_values.cur_idx=cache_position - start_time = time.time() - - inputs_embeds = model.model.embed_tokens(inputs.to("cpu")).to(torch_device) - if mode == "long_context": - inputs_embeds = model.model.embed_tokens(inputs.to("cpu")) - else: - inputs_embeds = model.model.embed_tokens(inputs.to("cpu")).to(torch_device) - logits = model( - inputs_embeds = inputs_embeds, cache_position=cache_position, past_key_values=past_key_values, return_dict=False, use_cache=True - )[0][:,-1,:].unsqueeze(0).clone().to(torch_device) + generation_config, model_kwargs = model._prepare_generation_config( - None, max_length=max_new_tokens, - do_sample=True, top_k=5, top_p=0.85, temperature=0.1 # change this to modify generate config + None, do_sample=True + # change this to modify generate config + #top_k=5, top_p=0.85, temperature=0.1 ) try: # transformers==4.43 logits_warper = ( @@ -162,41 +192,66 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud logits_warper = ( model._get_logits_warper(generation_config) ) + + cache_position = torch.arange(seq_length, device=torch_device, dtype=torch.int32) + generated_ids = torch.zeros( + batch_size, seq_length + max_new_tokens + 1, dtype=torch.int, device=torch_device + ) + generated_ids[:, cache_position] = inputs.to(torch_device).to(torch.int) + start_time = time.time() + + chunk_start = 0 + while chunk_start < seq_length: + chunk_end = min(chunk_start + chunk_prefill_size, seq_length) + if past_key_values != None: + past_key_values.cur_idx=cache_position[chunk_start:chunk_end] + logits = chunk_prefill(inputs[:, chunk_start:chunk_end], cache_position[chunk_start:chunk_end], past_key_values) + chunk_start += chunk_prefill_size + next_token_scores = logits_warper(inputs, logits[:, -1, :]) if generation_config.do_sample: probs = nn.functional.softmax(next_token_scores, dim=-1) next_token = torch.multinomial(probs, num_samples=1).squeeze(1) else: next_token = torch.argmax(next_token_scores, dim=-1) + first_token_time = time.time() - start_time + + if use_flashinfer_mla: + MLAWrapperSingleton.reset_buffer() prefill_count = seq_length prefill_time = first_token_time if force_think: - print("\n") + print("") print(stream.put(next_token.item()), end="", flush=True) generated_ids[:, seq_length] = next_token tokens.append(int(next_token)) inputs = torch.cat((inputs, next_token.unsqueeze(0)), dim=-1) - cache_position = torch.tensor([seq_length], device=torch_device) + cache_position = torch.tensor([seq_length], device=torch_device, dtype=torch.int32) position_ids = cache_position.unsqueeze(0) seq_length += 1 - if use_cuda_graph: - cuda_graph_runner = CUDAGraphRunner() - cuda_graph_runner.capture(model, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, torch_device, return_dict=False, use_cache=True) - else: - cuda_graph_runner = None + cuda_graph_runner = None start_time = time.time() - for _ in range(1, max_new_tokens): - next_token = decode_one_tokens(cuda_graph_runner, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, use_cuda_graph).to(torch_device) + for i in range(1, max_new_tokens): + if use_flashinfer_mla: + 
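+                # Assumed intent: re-plan the FlashInfer MLA decode wrapper for the updated
+                # sequence length (position_ids + 1), head geometry, page size and softmax scale
+                # before every decode step.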
MLAWrapperSingleton.plan_all(None,None,None,position_ids.squeeze(1)+1, + num_heads, head_dim_ckv, head_dim_kpe, past_key_values.page_size, + model.model.layers[0].self_attn.softmax_scale, torch.bfloat16, torch.bfloat16) + global warm_uped + if use_cuda_graph and ( (warm_uped == True and int(i) == 1) or (warm_uped == False and int(i) == 2) ): + warm_uped = True + cuda_graph_runner = CUDAGraphRunner() + cuda_graph_runner.capture(model, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, torch_device, return_dict=False, use_cache=True) + next_token = decode_one_tokens(cuda_graph_runner, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, logits_warper, generation_config, use_cuda_graph).to(torch_device) inputs = torch.cat((inputs, next_token.unsqueeze(0)), dim=-1) generated_ids[:, cache_position] = next_token.int() tokens.append(int(next_token)) seq_length += 1 - if next_token[0].item() == tokenizer.eos_token_id or tokenizer.decode(next_token) == '<|im_end|>': + if next_token[0].item() == tokenizer.eos_token_id or tokenizer.decode(next_token.tolist()) == '<|im_end|>': print(stream.end(), end="", flush=True) break else: diff --git a/merge_tensors/merge_safetensor_gguf.py b/merge_tensors/merge_safetensor_gguf.py new file mode 100644 index 0000000..69780fe --- /dev/null +++ b/merge_tensors/merge_safetensor_gguf.py @@ -0,0 +1,214 @@ +# This script merges the FP8 safetensors with the GGUF quantized tensors. + +import os +# insert the path of the project +import sys +sys.path.insert(0, "/home/azure/ktransformers") +import argparse +import torch +from ktransformers.util.custom_gguf import GGUFLoader, translate_name_to_gguf +from safetensors import safe_open +from safetensors.torch import save_file +import re +from collections import defaultdict + +def read_safetensor_keys_from_folder(folder_path)->dict: + """ + :param folder_path: folder path + :return: key_to_file_map + """ + # check if the folder path exists + if not os.path.exists(folder_path): + raise FileNotFoundError(f"Safetensor dir not found: {folder_path}") + if os.path.isfile(folder_path): + folder_path = os.path.dirname(folder_path) + + key_to_file_map = {} + + found_safetensor = False + for root, dirs, files in os.walk(folder_path): + # sort files + files = sorted(files) + for file in files: + if file.endswith(".safetensors"): + found_safetensor = True + file_path = os.path.join(root, file) + try: + with safe_open(file_path, framework="pt") as f: + for key in f.keys(): + if "model.layers.61" in key: + # skip MTP layer + continue + # try: + # if int(key.split('.')[2]) > 4: + # continue + # except: + # pass + key_to_file_map[key] = file_path + except Exception as e: + print(f"Error reading Safetensor file {file_path}: {e}") + + if not found_safetensor: + raise FileNotFoundError(f"No Safetensor files found in {folder_path}") + + return key_to_file_map + +tensor_from_gguf = [] # todo: add keys in gguf that should be used in the final tensor + +def translate_name(name:str)->str: + """ + :param name: name of the tensor + :return: translated name + """ + name = translate_name_to_gguf(name) + name = name.replace(".up_proj.", ".ffn_up_exps.") + name = name.replace(".down_proj.", ".ffn_down_exps.") + name = name.replace(".gate_proj.", ".ffn_gate_exps.") + name = name.replace(".ffn_gate_inp.e_score_correction_bias", ".exp_probs_b.bias") + return name + + +def combine_tensor_sources(safetensor_path:str, gguf_path:str): + gguf_loader = GGUFLoader(gguf_path) + gguf_tensor_file_map = 
gguf_loader.tensor_file_map + safetensor_tensor_file_map = read_safetensor_keys_from_folder(safetensor_path) + + # build a map for the key to the tensor + # according to the key, we can get the tensor from the file + + target_tensor_map = {} + for key in safetensor_tensor_file_map.keys(): + # for all experts, we use the gguf tensor + if ".mlp.experts." in key: + if '.weight_scale_inv' in key: + continue + key = '.'.join(key.split('.')[:5]+key.split('.')[-2:]) + translated_key = translate_name(key) + target_tensor_map[key] = gguf_tensor_file_map[translated_key] + continue + + if any(target_key in key for target_key in tensor_from_gguf): + target_tensor_map[key] = gguf_tensor_file_map[translate_name(key)] + else: + target_tensor_map[key] = safetensor_tensor_file_map[key] + + return target_tensor_map, gguf_loader + +def write_combined_tensor(target_tensor_map: dict, output_path: str, gguf_loader: GGUFLoader): + # Ensure output directory exists + os.makedirs(output_path, exist_ok=True) + + # Cache for safetensor file handles and GGUF loaders + safetensors_cache = {} + gguf_cache = {} + + # Group tensors by layer + layer_groups = defaultdict(list) + non_layer_keys = [] + layer_pattern = re.compile(r'\.layers\.(\d+)\.') + + for key in target_tensor_map: + match = layer_pattern.search(key) + if match: + layer_num = int(match.group(1)) + layer_groups[layer_num].append(key) + else: + non_layer_keys.append(key) + + # Calculate total shards + total_shards = len(layer_groups) + (1 if non_layer_keys else 0) - 1 + if total_shards == 0: + raise ValueError("No tensors to save") + + shard_idx = 0 + + # Save non-layer tensors to the first shard if they exist + if non_layer_keys: + tensors = {} + for key in non_layer_keys: + file_path = target_tensor_map[key] + tensor = None + ggml_type = None + if file_path.endswith('.safetensors'): + if file_path not in safetensors_cache: + safetensors_cache[file_path] = safe_open(file_path, framework='pt') + f = safetensors_cache[file_path] + tensor = f.get_tensor(key) + elif file_path.endswith('.gguf'): + gguf_name = translate_name(key) + tensor, ggml_type = gguf_loader.get_undequanted_tensor_and_ggml_type(gguf_name) + else: + raise ValueError(f"Unsupported file format: {file_path}") + tensors[translate_name(key)] = tensor + if ggml_type: + ggml_type = torch.tensor(ggml_type) + ggml_key = translate_name(key)[:-7] + ".ggml_type" if translate_name(key).endswith(".weight") else translate_name(key) + ".ggml_type" + tensors[ggml_key] = ggml_type + + output_file = os.path.join(output_path, f"model-{shard_idx:05}-of-{total_shards:05}.safetensors") + print(f"Saving non-layer tensors to {output_file}") + save_file(tensors, output_file) + print(tensors.keys()) + + shard_idx += 1 + + # Save each layer's tensors to subsequent shards + for layer_num in sorted(layer_groups.keys()): + layer_keys = layer_groups[layer_num] + tensors = {} + for key in layer_keys: + file_path = target_tensor_map[key] + tensor = None + ggml_type = None + if file_path.endswith('.safetensors'): + if file_path not in safetensors_cache: + safetensors_cache[file_path] = safe_open(file_path, framework='pt') + f = safetensors_cache[file_path] + tensor = f.get_tensor(key) + tensor_info = tensor.shape + elif file_path.endswith('.gguf'): + gguf_name = translate_name(key) + tensor, ggml_type = gguf_loader.get_undequanted_tensor_and_ggml_type(gguf_name) + # tensor_info = gguf_loader.tensor_info[gguf_name] + # ggml_type = gguf_loader.tensor_info[gguf_name]['ggml_type'] + else: + raise ValueError(f"Unsupported file 
format: {file_path}") + tensors[translate_name(key)] = tensor + if ggml_type: + ggml_type = torch.tensor(ggml_type) + ggml_key = translate_name(key)[:-7] + ".ggml_type" if translate_name(key).endswith(".weight") else translate_name(key) + ".ggml_type" + tensors[ggml_key] = ggml_type + + output_file = os.path.join(output_path, f"model-{shard_idx:05}-of-{total_shards:05}.safetensors") + print(f"Saving layer {layer_num} to {output_file}") + # print(tensors.keys()) + save_file(tensors, output_file) + shard_idx += 1 + + return + +def main(): + # 创建命令行参数解析器 + parser = argparse.ArgumentParser(description="Read parameters from Safetensor and GGUF files") + parser.add_argument("--safetensor_path", type=str, help="Path to the Safetensor file", default="/mnt/data/model/DeepSeek-V3") + parser.add_argument("--gguf_path", type=str, help="Path to the GGUF file", default="/mnt/data/model/DeepseekV3-q4km-gguf") + parser.add_argument("--output_path", type=str, help="Path to the output file", default="/mnt/data/model/ktrans-safetensors/DeepSeek-V3-q4km-fp8") + + # print all the arguments + print("All the arguments:") + print(parser.parse_args()) + + # 解析命令行参数 + args = parser.parse_args() + + safetensor_path = args.safetensor_path + gguf_path = args.gguf_path + output_path = args.output_path + + target_tensor_map, gguf_loader = combine_tensor_sources(safetensor_path, gguf_path) + write_combined_tensor(target_tensor_map, output_path, gguf_loader) + + return + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/requirements-local_chat.txt b/requirements-local_chat.txt index 0479d36..ad280c0 100644 --- a/requirements-local_chat.txt +++ b/requirements-local_chat.txt @@ -4,4 +4,6 @@ numpy torch>=2.3.0 packaging cpufeature -protobuf \ No newline at end of file +protobuf +tiktoken +blobfile \ No newline at end of file diff --git a/setup.py b/setup.py index 390fec6..5c29b8f 100644 --- a/setup.py +++ b/setup.py @@ -1,16 +1,16 @@ #!/usr/bin/env python # coding=utf-8 ''' -Description : +Description : Author : chenxl Date : 2024-07-27 16:15:27 Version : 1.0.0 -LastEditors : chenxl +LastEditors : chenxl LastEditTime : 2024-08-14 16:36:19 Adapted from: https://github.com/Dao-AILab/flash-attention/blob/v2.6.3/setup.py Copyright (c) 2023, Tri Dao. -Copyright (c) 2024 by KVCache.AI, All Rights Reserved. +Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
''' import os @@ -29,7 +29,12 @@ import torch.version from wheel.bdist_wheel import bdist_wheel as _bdist_wheel from setuptools import setup, Extension from cpufeature.extension import CPUFeature -from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME +from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME +try: + from torch_musa.utils.simple_porting import SimplePorting + from torch_musa.utils.musa_extension import BuildExtension, MUSAExtension, MUSA_HOME +except ImportError: + MUSA_HOME=None class CpuInstructInfo: CPU_INSTRUCT = os.getenv("CPU_INSTRUCT", "NATIVE") @@ -40,7 +45,7 @@ class CpuInstructInfo: CMAKE_FANCY = "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_AVX512=ON -DLLAMA_AVX512_FANCY_SIMD=ON" CMAKE_AVX512 = "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_AVX512=ON" CMAKE_AVX2 = "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON -DLLAMA_AVX2=ON" - + class VersionInfo: THIS_DIR = os.path.dirname(os.path.abspath(__file__)) PACKAGE_NAME = "ktransformers" @@ -49,6 +54,80 @@ class VersionInfo: ) FORCE_BUILD = os.getenv("KTRANSFORMERS_FORCE_BUILD", "FALSE") == "TRUE" + def get_musa_bare_metal_version(self, musa_dir): + raw_output = subprocess.run( + [musa_dir + "/bin/mcc", "-v"], check=True, + stdout=subprocess.PIPE, stderr=subprocess.STDOUT).stdout.decode("utf-8") + output = raw_output.split() + release_idx = output.index("version") + 1 + bare_metal_version = parse(output[release_idx].split(",")[0]) + musa_version = f"{bare_metal_version.major}{bare_metal_version.minor}" + return musa_version + + def get_rocm_bare_metal_version(self, rocm_dir): + """ + Get the ROCm version from the ROCm installation directory. 
+ + Args: + rocm_dir: Path to the ROCm installation directory + + Returns: + A string representation of the ROCm version (e.g., "63" for ROCm 6.3) + """ + try: + # Try using rocm_agent_enumerator to get version info + raw_output = subprocess.check_output( + [rocm_dir + "/bin/rocminfo", "--version"], + universal_newlines=True, + stderr=subprocess.STDOUT) + # Extract version number from output + match = re.search(r'(\d+\.\d+)', raw_output) + if match: + version_str = match.group(1) + version = parse(version_str) + rocm_version = f"{version.major}{version.minor}" + return rocm_version + except (subprocess.CalledProcessError, FileNotFoundError): + # If rocminfo --version fails, try alternative methods + pass + + try: + # Try reading version from release file + with open(os.path.join(rocm_dir, "share/doc/hip/version.txt"), "r") as f: + version_str = f.read().strip() + version = parse(version_str) + rocm_version = f"{version.major}{version.minor}" + return rocm_version + except (FileNotFoundError, IOError): + pass + + # If all else fails, try to extract from directory name + dir_name = os.path.basename(os.path.normpath(rocm_dir)) + match = re.search(r'rocm-(\d+\.\d+)', dir_name) + if match: + version_str = match.group(1) + version = parse(version_str) + rocm_version = f"{version.major}{version.minor}" + return rocm_version + + # Fallback to extracting from hipcc version + try: + raw_output = subprocess.check_output( + [rocm_dir + "/bin/hipcc", "--version"], + universal_newlines=True, + stderr=subprocess.STDOUT) + match = re.search(r'HIP version: (\d+\.\d+)', raw_output) + if match: + version_str = match.group(1) + version = parse(version_str) + rocm_version = f"{version.major}{version.minor}" + return rocm_version + except (subprocess.CalledProcessError, FileNotFoundError): + pass + + # If we still can't determine the version, raise an error + raise ValueError(f"Could not determine ROCm version from directory: {rocm_dir}") + def get_cuda_bare_metal_version(self, cuda_dir): raw_output = subprocess.check_output( [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True) @@ -58,7 +137,7 @@ class VersionInfo: cuda_version = f"{bare_metal_version.major}{bare_metal_version.minor}" return cuda_version - def get_cuda_version_of_torch(self,): + def get_cuda_version_of_torch(self): torch_cuda_version = parse(torch.version.cuda) cuda_version = f"{torch_cuda_version.major}{torch_cuda_version.minor}" return cuda_version @@ -117,7 +196,7 @@ class VersionInfo: torch_version_raw = parse(torch.__version__) torch_version = f"{torch_version_raw.major}{torch_version_raw.minor}" return torch_version - + def get_flash_version(self,): version_file = os.path.join( Path(VersionInfo.THIS_DIR), VersionInfo.PACKAGE_NAME, "__init__.py") @@ -128,12 +207,23 @@ class VersionInfo: return flash_version def get_package_version(self, full_version=False): - flash_version = self.get_flash_version() - package_version = f"{str(flash_version)}+torch{self.get_torch_version()}{self.get_cpu_instruct()}" + flash_version = str(self.get_flash_version()) + torch_version = self.get_torch_version() + cpu_instruct = self.get_cpu_instruct() + backend_version = "" + if CUDA_HOME is not None: + backend_version = f"" + elif MUSA_HOME is not None: + backend_version = f"mu{self.get_musa_bare_metal_version(MUSA_HOME)}" + elif ROCM_HOME is not None: + backend_version = f"rocm{self.get_rocm_bare_metal_version(ROCM_HOME)}" + else: + raise ValueError("Unsupported backend: CUDA_HOME MUSA_HOME ROCM_HOME all not set.") + package_version = 
f"{flash_version}+{backend_version}torch{torch_version}{cpu_instruct}" if full_version: return package_version if not VersionInfo.FORCE_BUILD: - return str(flash_version) + return flash_version return package_version @@ -218,11 +308,23 @@ class CMakeBuild(BuildExtension): f"-DPYTHON_EXECUTABLE={sys.executable}", f"-DCMAKE_BUILD_TYPE={cfg}", # not used on MSVC, but no harm ] + + if CUDA_HOME is not None: + cmake_args += ["-DKTRANSFORMERS_USE_CUDA=ON"] + elif MUSA_HOME is not None: + cmake_args += ["-DKTRANSFORMERS_USE_MUSA=ON"] + elif ROCM_HOME is not None: + cmake_args += ["-DKTRANSFORMERS_USE_ROCM=ON"] + else: + raise ValueError("Unsupported backend: CUDA_HOME and MUSA_HOME are not set.") + # log cmake_args + print("CMake args:", cmake_args) + build_args = [] if "CMAKE_ARGS" in os.environ: cmake_args += [ item for item in os.environ["CMAKE_ARGS"].split(" ") if item] - + if CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.FANCY: cpu_args = CpuInstructInfo.CMAKE_FANCY elif CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.AVX512: @@ -231,7 +333,7 @@ class CMakeBuild(BuildExtension): cpu_args = CpuInstructInfo.CMAKE_AVX2 else: cpu_args = CpuInstructInfo.CMAKE_NATIVE - + cmake_args += [ item for item in cpu_args.split(" ") if item ] @@ -258,7 +360,7 @@ class CMakeBuild(BuildExtension): # CMake allows an arch-in-generator style for backward compatibility contains_arch = any(x in cmake_generator for x in {"ARM", "Win64"}) - if not single_config and not contains_arch: + if not single_config and not contains_arch and cmake_generator: cmake_args += ["-A", PLAT_TO_CMAKE[self.plat_name]] # Multi-config generators have a different way to specify configs @@ -276,8 +378,13 @@ class CMakeBuild(BuildExtension): "-DCMAKE_OSX_ARCHITECTURES={}".format(";".join(archs))] if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ: + cpu_count = os.cpu_count() + if cpu_count is None: + cpu_count = 1 if hasattr(self, "parallel") and self.parallel: - build_args += [f"-j{self.parallel}"] + build_args += [f"--parallel={self.parallel}"] + else: + build_args += [f"--parallel={cpu_count}"] print("CMake args:", cmake_args) build_temp = Path(ext.sourcedir) / "build" if not build_temp.exists(): @@ -288,28 +395,57 @@ class CMakeBuild(BuildExtension): print("Standard output:", result.stdout) print("Standard error:", result.stderr) subprocess.run( - ["cmake", "--build", ".", *build_args], cwd=build_temp, check=True + ["cmake", "--build", ".", "--verbose", *build_args], cwd=build_temp, check=True ) +if CUDA_HOME is not None or ROCM_HOME is not None: + ops_module = CUDAExtension('KTransformersOps', [ + 'ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu', + 'ktransformers/ktransformers_ext/cuda/binding.cpp', + 'ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu' + ], + extra_compile_args={ + 'cxx': ['-O3', '-DKTRANSFORMERS_USE_CUDA'], + 'nvcc': [ + '-O3', + # '--use_fast_math', + '-Xcompiler', '-fPIC', + '-DKTRANSFORMERS_USE_CUDA', + ] + } + ) +elif MUSA_HOME is not None: + SimplePorting(cuda_dir_path="ktransformers/ktransformers_ext/cuda", mapping_rule={ + # Common rules + "at::cuda": "at::musa", + "#include ": "#include \"torch_musa/csrc/aten/musa/MUSAContext.h\"", + "#include ": "#include \"torch_musa/csrc/core/MUSAGuard.h\"", + "nv_bfloat16": "mt_bfloat16", + }).run() + ops_module = MUSAExtension('KTransformersOps', [ + 'ktransformers/ktransformers_ext/cuda_musa/custom_gguf/dequant.mu', + 'ktransformers/ktransformers_ext/cuda_musa/binding.cpp', + # TODO: Add Marlin support for MUSA. 
+ # 'ktransformers/ktransformers_ext/cuda_musa/gptq_marlin/gptq_marlin.mu' + ], + extra_compile_args={ + 'cxx': ['force_mcc'], + 'mcc': [ + '-O3', + '-DKTRANSFORMERS_USE_MUSA', + '-DTHRUST_IGNORE_CUB_VERSION_CHECK', + ] + } + ) +else: + raise ValueError("Unsupported backend: CUDA_HOME, MUSA_HOME and ROCM_HOME are all not set.") setup( + name=VersionInfo.PACKAGE_NAME, version=VersionInfo().get_package_version(), cmdclass={"bdist_wheel":BuildWheelsCommand ,"build_ext": CMakeBuild}, ext_modules=[ CMakeExtension("cpuinfer_ext"), - CUDAExtension('KTransformersOps', [ - 'ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu', - 'ktransformers/ktransformers_ext/cuda/binding.cpp', - 'ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu' - ], - extra_compile_args={ - 'cxx': ['-O3'], - 'nvcc': [ - '-O3', - # '--use_fast_math', - '-Xcompiler', '-fPIC', - ] - } - ) + ops_module, ] ) diff --git a/third_party/llamafile/iqk_mul_mat.inc b/third_party/llamafile/iqk_mul_mat.inc index 5e9d688..35bc7b8 100644 --- a/third_party/llamafile/iqk_mul_mat.inc +++ b/third_party/llamafile/iqk_mul_mat.inc @@ -69,6 +69,10 @@ #endif +constexpr ggml_type GGML_TYPE_Q8_0_X4 = static_cast<ggml_type>(98); +constexpr ggml_type GGML_TYPE_Q8_1_X4 = static_cast<ggml_type>(99); + + namespace { typedef struct { @@ -106,13 +110,36 @@ struct DataInfo { } }; +/* +moonll: +change the parameters of set_mul_mat +and add func16 +*/ + typedef void (*mul_mat_t)(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x); struct MulMat { std::array funcs = {}; + mul_mat_t func16 = nullptr; //inline void mul_mat_NxM(int n, const void * vx, size_t bx, DataInfo& info, int nrc_x, int nrc_y) { IQK_NOINLINE void mul_mat_NxM(int n, const void * vx, size_t bx, DataInfo& info, int nrc_x, int nrc_y) { constexpr int k_x_step = 64; // This works best on my Ryzen-7950X and M2 Max CPUs (but differences to other tile size are small) + + if (func16 && nrc_y >= 16) { + int n_step = (nrc_y - info.cur_y)/16; + for (int ix = 0; ix < nrc_x; ix += k_x_step) { + auto this_info = info; + this_info.s += ix; + int this_nrc_x = ix + k_x_step <= nrc_x ? 
k_x_step : nrc_x - ix; + for (int iy = 0; iy < n_step; ++iy) { + func16(n, (const void *)((const char *)vx + ix*bx), bx, this_info, this_nrc_x); + this_info.cur_y += 16; + } + } + info.cur_y += 16 * n_step; + if (info.cur_y == nrc_y) return; + } + int n_step = (nrc_y - info.cur_y)/funcs.size(); if (n_step > 0) { for (int ix = 0; ix < nrc_x; ix += k_x_step) { @@ -131,7 +158,7 @@ struct MulMat { funcs[n_left-1](n, vx, bx, info, nrc_x); } } - static IQK_NOINLINE bool set_mul_mat(int typeA, int ne00, MulMat& mm, int& row_size_q8, int Ny); + static IQK_NOINLINE bool set_mul_mat(int typeA, int typeB,int ne00, MulMat& mm, int Ny); private: template static IQK_NOINLINE void set_functions(MulMat& m); }; @@ -147,6 +174,787 @@ inline void make_q4_scales(const uint8_t * scales8, uint32_t * aux32) { aux32[0] = a0 & 0x3f3f3f3f; } +/* +moonll +decoding tables +*/ +#ifdef __AVX2__ +static const uint64_t iq1s_grid_us[2048] = { + 0x0000000000000000, 0x0000000000000002, 0x0000000000000101, 0x0000000000000200, + 0x0000000000000202, 0x0000000000010001, 0x0000000000010101, 0x0000000000020000, + 0x0000000000020002, 0x0000000000020200, 0x0000000000020202, 0x0000000001000101, + 0x0000000001010001, 0x0000000001010100, 0x0000000001010102, 0x0000000001020101, + 0x0000000002000000, 0x0000000002000002, 0x0000000002000200, 0x0000000002000202, + 0x0000000002010101, 0x0000000002020000, 0x0000000002020002, 0x0000000002020200, + 0x0000000002020202, 0x0000000100000100, 0x0000000100000101, 0x0000000100010001, + 0x0000000100010100, 0x0000000100010102, 0x0000000100010201, 0x0000000100010202, + 0x0000000100020101, 0x0000000101000001, 0x0000000101000102, 0x0000000101000201, + 0x0000000101010002, 0x0000000101010101, 0x0000000101010202, 0x0000000101020001, + 0x0000000101020100, 0x0000000101020102, 0x0000000101020200, 0x0000000102000101, + 0x0000000102010001, 0x0000000102010100, 0x0000000102010102, 0x0000000102020101, + 0x0000000200000000, 0x0000000200000002, 0x0000000200000200, 0x0000000200000202, + 0x0000000200010101, 0x0000000200020000, 0x0000000200020002, 0x0000000200020200, + 0x0000000200020202, 0x0000000201000101, 0x0000000201010001, 0x0000000201010201, + 0x0000000201020100, 0x0000000201020201, 0x0000000202000000, 0x0000000202000002, + 0x0000000202000200, 0x0000000202000202, 0x0000000202010001, 0x0000000202010101, + 0x0000000202010201, 0x0000000202020000, 0x0000000202020002, 0x0000000202020200, + 0x0000000202020202, 0x0000010000010001, 0x0000010000010100, 0x0000010000010102, + 0x0000010000020101, 0x0000010001000001, 0x0000010001000201, 0x0000010001010101, + 0x0000010001010202, 0x0000010001020100, 0x0000010001020101, 0x0000010002010001, + 0x0000010002010201, 0x0000010002020101, 0x0000010100000001, 0x0000010100000100, + 0x0000010100000101, 0x0000010100000102, 0x0000010100010101, 0x0000010100010200, + 0x0000010100010202, 0x0000010100020201, 0x0000010101000000, 0x0000010101000101, + 0x0000010101000202, 0x0000010101010000, 0x0000010101010001, 0x0000010101010100, + 0x0000010101010101, 0x0000010101010102, 0x0000010101010201, 0x0000010101020000, + 0x0000010101020002, 0x0000010101020101, 0x0000010101020200, 0x0000010101020202, + 0x0000010102000001, 0x0000010102010001, 0x0000010102010101, 0x0000010102010200, + 0x0000010102010202, 0x0000010102020001, 0x0000010102020100, 0x0000010102020101, + 0x0000010102020102, 0x0000010102020201, 0x0000010200010100, 0x0000010200010201, + 0x0000010201000001, 0x0000010201000100, 0x0000010201010000, 0x0000010201010002, + 0x0000010201010101, 0x0000010201010200, 0x0000010201020000, 0x0000010201020001, + 
0x0000010201020102, 0x0000010201020201, 0x0000010202000101, 0x0000010202010001, + 0x0000010202010100, 0x0000010202010201, 0x0000020000000000, 0x0000020000000002, + 0x0000020000000200, 0x0000020000000202, 0x0000020000010101, 0x0000020000020000, + 0x0000020000020002, 0x0000020000020200, 0x0000020000020202, 0x0000020001000101, + 0x0000020001010001, 0x0000020001010102, 0x0000020001020101, 0x0000020002000000, + 0x0000020002000002, 0x0000020002000200, 0x0000020002000202, 0x0000020002010101, + 0x0000020002020000, 0x0000020002020002, 0x0000020002020200, 0x0000020002020202, + 0x0000020100000101, 0x0000020100010001, 0x0000020100010100, 0x0000020100010201, + 0x0000020100020100, 0x0000020100020101, 0x0000020101000001, 0x0000020101010000, + 0x0000020101010001, 0x0000020101010101, 0x0000020101020001, 0x0000020101020100, + 0x0000020101020201, 0x0000020102010001, 0x0000020102010100, 0x0000020102010102, + 0x0000020102010201, 0x0000020102020101, 0x0000020200000000, 0x0000020200000002, + 0x0000020200000200, 0x0000020200000202, 0x0000020200010101, 0x0000020200020000, + 0x0000020200020002, 0x0000020200020200, 0x0000020200020202, 0x0000020201000101, + 0x0000020201010001, 0x0000020201010201, 0x0000020201020001, 0x0000020201020101, + 0x0000020202000000, 0x0000020202000002, 0x0000020202000101, 0x0000020202000200, + 0x0000020202000202, 0x0000020202010101, 0x0000020202020000, 0x0000020202020002, + 0x0000020202020200, 0x0000020202020202, 0x0001000000010000, 0x0001000000010001, + 0x0001000000010100, 0x0001000000010201, 0x0001000000020100, 0x0001000000020101, + 0x0001000001000001, 0x0001000001000100, 0x0001000001010000, 0x0001000001010101, + 0x0001000001010200, 0x0001000001020001, 0x0001000001020100, 0x0001000001020101, + 0x0001000001020201, 0x0001000002010001, 0x0001000002010100, 0x0001000002010102, + 0x0001000002020001, 0x0001000002020101, 0x0001000100000001, 0x0001000100000100, + 0x0001000100000102, 0x0001000100000201, 0x0001000100010000, 0x0001000100010002, + 0x0001000100010101, 0x0001000100010200, 0x0001000100020001, 0x0001000100020100, + 0x0001000100020201, 0x0001000101000101, 0x0001000101000202, 0x0001000101010000, + 0x0001000101010001, 0x0001000101010002, 0x0001000101010100, 0x0001000101010101, + 0x0001000101010102, 0x0001000101010201, 0x0001000101020000, 0x0001000101020101, + 0x0001000102000100, 0x0001000102010002, 0x0001000102010101, 0x0001000102020001, + 0x0001000102020100, 0x0001000200010001, 0x0001000200010100, 0x0001000200010102, + 0x0001000200020101, 0x0001000201000000, 0x0001000201000102, 0x0001000201000201, + 0x0001000201010002, 0x0001000201010101, 0x0001000201010200, 0x0001000201010202, + 0x0001000201020100, 0x0001000201020102, 0x0001000202000101, 0x0001000202010001, + 0x0001000202010100, 0x0001000202010102, 0x0001000202020101, 0x0001010000000001, + 0x0001010000000102, 0x0001010000000201, 0x0001010000010100, 0x0001010000010101, + 0x0001010000010200, 0x0001010000010201, 0x0001010000020001, 0x0001010000020102, + 0x0001010001000001, 0x0001010001000101, 0x0001010001000102, 0x0001010001000200, + 0x0001010001000202, 0x0001010001010001, 0x0001010001010100, 0x0001010001010101, + 0x0001010001010102, 0x0001010001010201, 0x0001010001020002, 0x0001010001020101, + 0x0001010001020200, 0x0001010002000100, 0x0001010002000201, 0x0001010002010000, + 0x0001010002010100, 0x0001010002010101, 0x0001010002010200, 0x0001010002010201, + 0x0001010002010202, 0x0001010002020001, 0x0001010002020100, 0x0001010002020101, + 0x0001010002020201, 0x0001010100000002, 0x0001010100000101, 0x0001010100000202, + 0x0001010100010001, 
0x0001010100010100, 0x0001010100010101, 0x0001010100010102, + 0x0001010100010201, 0x0001010100020000, 0x0001010100020002, 0x0001010100020101, + 0x0001010100020200, 0x0001010100020202, 0x0001010101000001, 0x0001010101000100, + 0x0001010101000101, 0x0001010101000102, 0x0001010101010001, 0x0001010101010002, + 0x0001010101010100, 0x0001010101010101, 0x0001010101010102, 0x0001010101010201, + 0x0001010101010202, 0x0001010101020001, 0x0001010101020100, 0x0001010101020101, + 0x0001010101020102, 0x0001010101020201, 0x0001010102000000, 0x0001010102000002, + 0x0001010102000100, 0x0001010102000101, 0x0001010102000200, 0x0001010102000202, + 0x0001010102010000, 0x0001010102010001, 0x0001010102010100, 0x0001010102010101, + 0x0001010102010102, 0x0001010102010201, 0x0001010102010202, 0x0001010102020000, + 0x0001010102020002, 0x0001010102020101, 0x0001010200000001, 0x0001010200000100, + 0x0001010200000101, 0x0001010200000102, 0x0001010200010101, 0x0001010200010102, + 0x0001010200010200, 0x0001010200010202, 0x0001010200020001, 0x0001010200020102, + 0x0001010201000000, 0x0001010201000002, 0x0001010201000100, 0x0001010201000101, + 0x0001010201000200, 0x0001010201000202, 0x0001010201010001, 0x0001010201010101, + 0x0001010201010102, 0x0001010201010200, 0x0001010201010201, 0x0001010201020001, + 0x0001010201020100, 0x0001010201020101, 0x0001010201020200, 0x0001010201020201, + 0x0001010201020202, 0x0001010202000102, 0x0001010202000202, 0x0001010202010002, + 0x0001010202010101, 0x0001010202020100, 0x0001010202020201, 0x0001020000010001, + 0x0001020000010102, 0x0001020000020101, 0x0001020001000001, 0x0001020001000100, + 0x0001020001000102, 0x0001020001000201, 0x0001020001010000, 0x0001020001010101, + 0x0001020001010200, 0x0001020001010202, 0x0001020001020000, 0x0001020001020001, + 0x0001020001020100, 0x0001020001020102, 0x0001020001020201, 0x0001020002000101, + 0x0001020002010001, 0x0001020002010100, 0x0001020002020101, 0x0001020100010000, + 0x0001020100010002, 0x0001020100010101, 0x0001020100010202, 0x0001020100020001, + 0x0001020100020101, 0x0001020101000002, 0x0001020101000100, 0x0001020101000101, + 0x0001020101000200, 0x0001020101010001, 0x0001020101010100, 0x0001020101010101, + 0x0001020101010102, 0x0001020101010201, 0x0001020101010202, 0x0001020101020000, + 0x0001020101020101, 0x0001020101020202, 0x0001020102000201, 0x0001020102010001, + 0x0001020102010002, 0x0001020102010101, 0x0001020102010200, 0x0001020102020001, + 0x0001020102020102, 0x0001020102020201, 0x0001020200000201, 0x0001020200010102, + 0x0001020200020100, 0x0001020200020102, 0x0001020201000100, 0x0001020201000102, + 0x0001020201000201, 0x0001020201010000, 0x0001020201010002, 0x0001020201010101, + 0x0001020201010200, 0x0001020201020001, 0x0001020201020102, 0x0001020201020201, + 0x0001020202000101, 0x0001020202010001, 0x0001020202010102, 0x0001020202010202, + 0x0002000000000000, 0x0002000000000002, 0x0002000000000200, 0x0002000000000202, + 0x0002000000010101, 0x0002000000020000, 0x0002000000020002, 0x0002000000020101, + 0x0002000000020200, 0x0002000000020202, 0x0002000001000101, 0x0002000001010001, + 0x0002000001010201, 0x0002000001020001, 0x0002000001020101, 0x0002000002000000, + 0x0002000002000002, 0x0002000002000200, 0x0002000002000202, 0x0002000002010101, + 0x0002000002020000, 0x0002000002020002, 0x0002000002020101, 0x0002000002020200, + 0x0002000002020202, 0x0002000100000101, 0x0002000100010001, 0x0002000100010100, + 0x0002000100010201, 0x0002000100020101, 0x0002000101000002, 0x0002000101000100, + 0x0002000101000201, 0x0002000101010101, 
0x0002000101010200, 0x0002000101010202, + 0x0002000101020001, 0x0002000101020100, 0x0002000101020101, 0x0002000101020102, + 0x0002000102000101, 0x0002000102010000, 0x0002000102010102, 0x0002000102010201, + 0x0002000102020101, 0x0002000200000001, 0x0002000200000200, 0x0002000200000202, + 0x0002000200010001, 0x0002000200010101, 0x0002000200020000, 0x0002000200020002, + 0x0002000200020200, 0x0002000200020202, 0x0002000201000101, 0x0002000201010001, + 0x0002000201010102, 0x0002000201010201, 0x0002000201020101, 0x0002000202000001, + 0x0002000202000200, 0x0002000202000202, 0x0002000202010001, 0x0002000202010101, + 0x0002000202020000, 0x0002000202020002, 0x0002000202020200, 0x0002000202020202, + 0x0002010000000101, 0x0002010000010100, 0x0002010000010102, 0x0002010000010201, + 0x0002010000020101, 0x0002010001000100, 0x0002010001000101, 0x0002010001000102, + 0x0002010001000201, 0x0002010001010002, 0x0002010001010101, 0x0002010001010200, + 0x0002010001010202, 0x0002010001020102, 0x0002010002000101, 0x0002010002010001, + 0x0002010002010100, 0x0002010002010201, 0x0002010002020001, 0x0002010002020101, + 0x0002010100000201, 0x0002010100010101, 0x0002010100020001, 0x0002010100020201, + 0x0002010101000000, 0x0002010101000101, 0x0002010101000200, 0x0002010101010001, + 0x0002010101010100, 0x0002010101010101, 0x0002010101010201, 0x0002010101020002, + 0x0002010101020101, 0x0002010101020200, 0x0002010102000201, 0x0002010102010000, + 0x0002010102010100, 0x0002010102010101, 0x0002010102010200, 0x0002010102010202, + 0x0002010102020001, 0x0002010102020100, 0x0002010102020102, 0x0002010102020201, + 0x0002010200000101, 0x0002010200010000, 0x0002010200010002, 0x0002010200010201, + 0x0002010200020101, 0x0002010201000001, 0x0002010201000201, 0x0002010201010101, + 0x0002010201020000, 0x0002010201020001, 0x0002010201020201, 0x0002010202000100, + 0x0002010202000102, 0x0002010202010000, 0x0002010202010202, 0x0002020000000000, + 0x0002020000000002, 0x0002020000000200, 0x0002020000000202, 0x0002020000010101, + 0x0002020000020000, 0x0002020000020002, 0x0002020000020200, 0x0002020000020202, + 0x0002020001000101, 0x0002020001010001, 0x0002020001010100, 0x0002020001020101, + 0x0002020002000000, 0x0002020002000002, 0x0002020002000200, 0x0002020002000202, + 0x0002020002020000, 0x0002020002020002, 0x0002020002020200, 0x0002020002020202, + 0x0002020100000201, 0x0002020100010001, 0x0002020100010100, 0x0002020100010201, + 0x0002020100020101, 0x0002020101000102, 0x0002020101000201, 0x0002020101010002, + 0x0002020101010101, 0x0002020101020001, 0x0002020101020100, 0x0002020101020102, + 0x0002020101020201, 0x0002020102000101, 0x0002020102010000, 0x0002020102010102, + 0x0002020102010201, 0x0002020102020100, 0x0002020102020101, 0x0002020200000000, + 0x0002020200000002, 0x0002020200000200, 0x0002020200000202, 0x0002020200020000, + 0x0002020200020002, 0x0002020200020200, 0x0002020200020202, 0x0002020201000101, + 0x0002020201010001, 0x0002020201010102, 0x0002020201010201, 0x0002020201020101, + 0x0002020202000000, 0x0002020202000002, 0x0002020202000200, 0x0002020202000202, + 0x0002020202010101, 0x0002020202020000, 0x0002020202020002, 0x0002020202020200, + 0x0002020202020202, 0x0100000000000101, 0x0100000000010001, 0x0100000000010102, + 0x0100000000020101, 0x0100000001000201, 0x0100000001010002, 0x0100000001010101, + 0x0100000001010200, 0x0100000001010202, 0x0100000001020001, 0x0100000001020100, + 0x0100000001020102, 0x0100000002010100, 0x0100000002010201, 0x0100000002020001, + 0x0100000002020102, 0x0100000100000000, 0x0100000100000001, 
0x0100000100000100, + 0x0100000100000102, 0x0100000100000201, 0x0100000100010002, 0x0100000100010101, + 0x0100000100010102, 0x0100000100010200, 0x0100000100010202, 0x0100000100020001, + 0x0100000100020102, 0x0100000100020201, 0x0100000101000101, 0x0100000101000200, + 0x0100000101000202, 0x0100000101010001, 0x0100000101010100, 0x0100000101010101, + 0x0100000101010102, 0x0100000101010201, 0x0100000101010202, 0x0100000101020101, + 0x0100000101020200, 0x0100000101020202, 0x0100000102000001, 0x0100000102000100, + 0x0100000102000102, 0x0100000102010000, 0x0100000102010002, 0x0100000102010101, + 0x0100000102020000, 0x0100000102020001, 0x0100000102020002, 0x0100000200000101, + 0x0100000200010001, 0x0100000200010100, 0x0100000200010102, 0x0100000200020101, + 0x0100000201000001, 0x0100000201010002, 0x0100000201010101, 0x0100000201010202, + 0x0100000201020100, 0x0100000201020201, 0x0100000202000201, 0x0100000202010100, + 0x0100000202020101, 0x0100010000000001, 0x0100010000010101, 0x0100010000010201, + 0x0100010000020201, 0x0100010001000101, 0x0100010001000200, 0x0100010001000202, + 0x0100010001010001, 0x0100010001010100, 0x0100010001010101, 0x0100010001010102, + 0x0100010001020001, 0x0100010001020002, 0x0100010001020101, 0x0100010001020200, + 0x0100010001020202, 0x0100010002000001, 0x0100010002000102, 0x0100010002000201, + 0x0100010002010000, 0x0100010002010002, 0x0100010002010101, 0x0100010002020000, + 0x0100010002020001, 0x0100010002020201, 0x0100010100000001, 0x0100010100000002, + 0x0100010100000101, 0x0100010100000202, 0x0100010100010001, 0x0100010100010100, + 0x0100010100010101, 0x0100010100010102, 0x0100010100010201, 0x0100010100020000, + 0x0100010100020101, 0x0100010100020202, 0x0100010101000001, 0x0100010101000100, + 0x0100010101000101, 0x0100010101000102, 0x0100010101000201, 0x0100010101010000, + 0x0100010101010001, 0x0100010101010100, 0x0100010101010101, 0x0100010101010102, + 0x0100010101010200, 0x0100010101010201, 0x0100010101020001, 0x0100010101020100, + 0x0100010101020101, 0x0100010101020102, 0x0100010101020201, 0x0100010102000002, + 0x0100010102000100, 0x0100010102000101, 0x0100010102000200, 0x0100010102010001, + 0x0100010102010100, 0x0100010102010101, 0x0100010102010102, 0x0100010102010201, + 0x0100010102010202, 0x0100010102020101, 0x0100010102020200, 0x0100010102020202, + 0x0100010200000001, 0x0100010200000101, 0x0100010200000201, 0x0100010200010100, + 0x0100010200010101, 0x0100010200010200, 0x0100010200010202, 0x0100010200020001, + 0x0100010200020100, 0x0100010200020201, 0x0100010201000000, 0x0100010201000002, + 0x0100010201000101, 0x0100010201000200, 0x0100010201010000, 0x0100010201010001, + 0x0100010201010002, 0x0100010201010101, 0x0100010201010102, 0x0100010201010201, + 0x0100010201020002, 0x0100010201020101, 0x0100010201020200, 0x0100010202000001, + 0x0100010202000101, 0x0100010202000202, 0x0100010202010100, 0x0100010202010101, + 0x0100010202020001, 0x0100010202020100, 0x0100010202020102, 0x0100020000000101, + 0x0100020000010001, 0x0100020000010101, 0x0100020000010202, 0x0100020000020101, + 0x0100020001000002, 0x0100020001000201, 0x0100020001010000, 0x0100020001010101, + 0x0100020001010200, 0x0100020001020001, 0x0100020001020100, 0x0100020001020102, + 0x0100020001020201, 0x0100020002000101, 0x0100020002010001, 0x0100020002010100, + 0x0100020002010102, 0x0100020002010201, 0x0100020002020101, 0x0100020100000001, + 0x0100020100000101, 0x0100020100000102, 0x0100020100000202, 0x0100020100010000, + 0x0100020100010100, 0x0100020100010101, 0x0100020100010200, 0x0100020100020001, + 
0x0100020100020100, 0x0100020100020102, 0x0100020101000000, 0x0100020101000101, + 0x0100020101000202, 0x0100020101010001, 0x0100020101010002, 0x0100020101010100, + 0x0100020101010101, 0x0100020101010102, 0x0100020101010201, 0x0100020101020000, + 0x0100020101020002, 0x0100020101020101, 0x0100020101020102, 0x0100020101020202, + 0x0100020102000102, 0x0100020102000201, 0x0100020102010002, 0x0100020102010101, + 0x0100020102010102, 0x0100020102010200, 0x0100020102020001, 0x0100020102020100, + 0x0100020102020102, 0x0100020102020201, 0x0100020200010102, 0x0100020201000100, + 0x0100020201000102, 0x0100020201000201, 0x0100020201010101, 0x0100020201010200, + 0x0100020201010202, 0x0100020201020100, 0x0100020201020201, 0x0100020202010100, + 0x0100020202020101, 0x0101000000000001, 0x0101000000000100, 0x0101000000000101, + 0x0101000000000102, 0x0101000000000201, 0x0101000000010002, 0x0101000000010101, + 0x0101000000010202, 0x0101000000020001, 0x0101000000020100, 0x0101000000020201, + 0x0101000001000000, 0x0101000001000101, 0x0101000001000200, 0x0101000001010001, + 0x0101000001010100, 0x0101000001010101, 0x0101000001010102, 0x0101000001010201, + 0x0101000001020101, 0x0101000001020200, 0x0101000002000102, 0x0101000002000201, + 0x0101000002010101, 0x0101000002010200, 0x0101000002020000, 0x0101000002020001, + 0x0101000002020102, 0x0101000002020201, 0x0101000100000101, 0x0101000100000200, + 0x0101000100000201, 0x0101000100000202, 0x0101000100010001, 0x0101000100010100, + 0x0101000100010101, 0x0101000100010102, 0x0101000100010200, 0x0101000100010201, + 0x0101000100020000, 0x0101000100020101, 0x0101000100020102, 0x0101000100020200, + 0x0101000100020202, 0x0101000101000001, 0x0101000101000100, 0x0101000101000101, + 0x0101000101000102, 0x0101000101000201, 0x0101000101010000, 0x0101000101010001, + 0x0101000101010002, 0x0101000101010100, 0x0101000101010101, 0x0101000101010102, + 0x0101000101010200, 0x0101000101010201, 0x0101000101010202, 0x0101000101020001, + 0x0101000101020100, 0x0101000101020101, 0x0101000101020102, 0x0101000101020201, + 0x0101000102000002, 0x0101000102000101, 0x0101000102010001, 0x0101000102010100, + 0x0101000102010101, 0x0101000102010102, 0x0101000102010201, 0x0101000102020000, + 0x0101000102020101, 0x0101000102020202, 0x0101000200000001, 0x0101000200000102, + 0x0101000200010002, 0x0101000200010101, 0x0101000200010202, 0x0101000200020001, + 0x0101000200020100, 0x0101000201000002, 0x0101000201000101, 0x0101000201000202, + 0x0101000201010001, 0x0101000201010100, 0x0101000201010101, 0x0101000201010102, + 0x0101000201010201, 0x0101000201020002, 0x0101000201020101, 0x0101000202000101, + 0x0101000202010000, 0x0101000202010002, 0x0101000202010101, 0x0101000202010201, + 0x0101000202010202, 0x0101000202020100, 0x0101010000000100, 0x0101010000000101, + 0x0101010000010001, 0x0101010000010100, 0x0101010000010101, 0x0101010000010102, + 0x0101010000010200, 0x0101010000010201, 0x0101010000020001, 0x0101010000020101, + 0x0101010000020200, 0x0101010000020202, 0x0101010001000001, 0x0101010001000100, + 0x0101010001000101, 0x0101010001000102, 0x0101010001000201, 0x0101010001000202, + 0x0101010001010000, 0x0101010001010001, 0x0101010001010100, 0x0101010001010101, + 0x0101010001010102, 0x0101010001010200, 0x0101010001010201, 0x0101010001010202, + 0x0101010001020001, 0x0101010001020002, 0x0101010001020100, 0x0101010001020101, + 0x0101010001020102, 0x0101010001020201, 0x0101010002000000, 0x0101010002000200, + 0x0101010002000202, 0x0101010002010001, 0x0101010002010100, 0x0101010002010101, + 0x0101010002010102, 
0x0101010002010201, 0x0101010002020001, 0x0101010002020100, + 0x0101010002020101, 0x0101010002020202, 0x0101010100000001, 0x0101010100000002, + 0x0101010100000100, 0x0101010100000101, 0x0101010100000102, 0x0101010100000201, + 0x0101010100010000, 0x0101010100010001, 0x0101010100010002, 0x0101010100010100, + 0x0101010100010101, 0x0101010100010102, 0x0101010100010201, 0x0101010100010202, + 0x0101010100020001, 0x0101010100020100, 0x0101010100020101, 0x0101010100020102, + 0x0101010100020201, 0x0101010101000000, 0x0101010101000001, 0x0101010101000002, + 0x0101010101000100, 0x0101010101000101, 0x0101010101000102, 0x0101010101000200, + 0x0101010101000201, 0x0101010101010000, 0x0101010101010001, 0x0101010101010002, + 0x0101010101010100, 0x0101010101010101, 0x0101010101010102, 0x0101010101010200, + 0x0101010101010201, 0x0101010101010202, 0x0101010101020000, 0x0101010101020001, + 0x0101010101020100, 0x0101010101020101, 0x0101010101020102, 0x0101010101020200, + 0x0101010101020201, 0x0101010101020202, 0x0101010102000001, 0x0101010102000100, + 0x0101010102000101, 0x0101010102000201, 0x0101010102000202, 0x0101010102010000, + 0x0101010102010001, 0x0101010102010100, 0x0101010102010101, 0x0101010102010102, + 0x0101010102010200, 0x0101010102010201, 0x0101010102020001, 0x0101010102020100, + 0x0101010102020101, 0x0101010102020102, 0x0101010102020201, 0x0101010200000000, + 0x0101010200000001, 0x0101010200000002, 0x0101010200000100, 0x0101010200000102, + 0x0101010200000200, 0x0101010200000201, 0x0101010200010001, 0x0101010200010100, + 0x0101010200010101, 0x0101010200010200, 0x0101010200010201, 0x0101010200020000, + 0x0101010200020001, 0x0101010200020002, 0x0101010200020100, 0x0101010200020101, + 0x0101010200020102, 0x0101010200020200, 0x0101010200020201, 0x0101010201000001, + 0x0101010201000101, 0x0101010201000102, 0x0101010201000200, 0x0101010201000201, + 0x0101010201000202, 0x0101010201010000, 0x0101010201010001, 0x0101010201010002, + 0x0101010201010100, 0x0101010201010101, 0x0101010201010102, 0x0101010201010200, + 0x0101010201010201, 0x0101010201010202, 0x0101010201020001, 0x0101010201020100, + 0x0101010201020101, 0x0101010201020201, 0x0101010202000002, 0x0101010202000101, + 0x0101010202000102, 0x0101010202000200, 0x0101010202000201, 0x0101010202000202, + 0x0101010202010001, 0x0101010202010101, 0x0101010202010202, 0x0101010202020002, + 0x0101010202020101, 0x0101010202020102, 0x0101010202020200, 0x0101010202020201, + 0x0101020000000100, 0x0101020000000101, 0x0101020000000102, 0x0101020000000201, + 0x0101020000010000, 0x0101020000010101, 0x0101020000010200, 0x0101020000020001, + 0x0101020000020202, 0x0101020001000101, 0x0101020001000200, 0x0101020001000202, + 0x0101020001010001, 0x0101020001010100, 0x0101020001010101, 0x0101020001010102, + 0x0101020001010200, 0x0101020001010201, 0x0101020001020000, 0x0101020001020002, + 0x0101020001020100, 0x0101020001020101, 0x0101020002000002, 0x0101020002000201, + 0x0101020002010000, 0x0101020002010002, 0x0101020002010101, 0x0101020002010200, + 0x0101020002020001, 0x0101020002020201, 0x0101020100000001, 0x0101020100000002, + 0x0101020100000101, 0x0101020100000202, 0x0101020100010001, 0x0101020100010100, + 0x0101020100010101, 0x0101020100010102, 0x0101020100010201, 0x0101020100020101, + 0x0101020101000001, 0x0101020101000100, 0x0101020101000101, 0x0101020101000102, + 0x0101020101000201, 0x0101020101010000, 0x0101020101010001, 0x0101020101010002, + 0x0101020101010100, 0x0101020101010101, 0x0101020101010102, 0x0101020101010200, + 0x0101020101010201, 0x0101020101010202, 
0x0101020101020001, 0x0101020101020100, + 0x0101020101020101, 0x0101020101020102, 0x0101020101020201, 0x0101020102000001, + 0x0101020102000101, 0x0101020102000201, 0x0101020102010001, 0x0101020102010100, + 0x0101020102010101, 0x0101020102010102, 0x0101020102010200, 0x0101020102010201, + 0x0101020102020101, 0x0101020200000100, 0x0101020200000200, 0x0101020200010101, + 0x0101020200010202, 0x0101020200020000, 0x0101020200020101, 0x0101020200020102, + 0x0101020200020201, 0x0101020201000101, 0x0101020201000200, 0x0101020201000201, + 0x0101020201010001, 0x0101020201010101, 0x0101020201010102, 0x0101020201010200, + 0x0101020201010201, 0x0101020201020002, 0x0101020201020101, 0x0101020201020200, + 0x0101020201020202, 0x0101020202000001, 0x0101020202000202, 0x0101020202010002, + 0x0101020202010101, 0x0101020202010102, 0x0101020202010200, 0x0101020202010202, + 0x0101020202020001, 0x0102000000000101, 0x0102000000010100, 0x0102000000010102, + 0x0102000000010201, 0x0102000000020101, 0x0102000001000100, 0x0102000001010000, + 0x0102000001010101, 0x0102000001010102, 0x0102000001010200, 0x0102000001010202, + 0x0102000001020001, 0x0102000001020100, 0x0102000001020102, 0x0102000001020201, + 0x0102000002000001, 0x0102000002010102, 0x0102000002020101, 0x0102000100000001, + 0x0102000100000100, 0x0102000100000102, 0x0102000100000201, 0x0102000100010002, + 0x0102000100010101, 0x0102000100020001, 0x0102000100020002, 0x0102000100020102, + 0x0102000100020201, 0x0102000101000101, 0x0102000101000201, 0x0102000101010001, + 0x0102000101010101, 0x0102000101010102, 0x0102000101010201, 0x0102000101020101, + 0x0102000101020102, 0x0102000101020202, 0x0102000102000100, 0x0102000102000202, + 0x0102000102010002, 0x0102000102010101, 0x0102000102020001, 0x0102000102020102, + 0x0102000102020201, 0x0102000200010001, 0x0102000200010102, 0x0102000200010201, + 0x0102000201000000, 0x0102000201000001, 0x0102000201000102, 0x0102000201010101, + 0x0102000201010102, 0x0102000201010200, 0x0102000201020000, 0x0102000202000101, + 0x0102000202010001, 0x0102000202010102, 0x0102000202020101, 0x0102010000010001, + 0x0102010000010002, 0x0102010000010101, 0x0102010000010102, 0x0102010000010202, + 0x0102010000020001, 0x0102010000020102, 0x0102010000020201, 0x0102010001000000, + 0x0102010001000002, 0x0102010001000101, 0x0102010001000200, 0x0102010001000202, + 0x0102010001010001, 0x0102010001010100, 0x0102010001010101, 0x0102010001010102, + 0x0102010001010201, 0x0102010001010202, 0x0102010001020000, 0x0102010001020002, + 0x0102010001020101, 0x0102010002000100, 0x0102010002000101, 0x0102010002000201, + 0x0102010002010000, 0x0102010002010002, 0x0102010002010100, 0x0102010002010101, + 0x0102010002010102, 0x0102010002010200, 0x0102010002010202, 0x0102010002020001, + 0x0102010002020100, 0x0102010002020201, 0x0102010100000101, 0x0102010100000200, + 0x0102010100000202, 0x0102010100010001, 0x0102010100010101, 0x0102010100010102, + 0x0102010100010201, 0x0102010101000100, 0x0102010101000101, 0x0102010101000102, + 0x0102010101000201, 0x0102010101010000, 0x0102010101010001, 0x0102010101010100, + 0x0102010101010101, 0x0102010101010102, 0x0102010101010201, 0x0102010101020001, + 0x0102010101020100, 0x0102010101020101, 0x0102010101020102, 0x0102010101020201, + 0x0102010102000102, 0x0102010102000201, 0x0102010102000202, 0x0102010102010001, + 0x0102010102010101, 0x0102010102010102, 0x0102010102010201, 0x0102010102010202, + 0x0102010102020002, 0x0102010102020101, 0x0102010102020102, 0x0102010102020200, + 0x0102010200000002, 0x0102010200000201, 0x0102010200010101, 
0x0102010200020000, + 0x0102010200020102, 0x0102010200020200, 0x0102010200020201, 0x0102010201000000, + 0x0102010201000101, 0x0102010201000200, 0x0102010201000202, 0x0102010201010001, + 0x0102010201010100, 0x0102010201010101, 0x0102010201010102, 0x0102010201010200, + 0x0102010201010202, 0x0102010201020000, 0x0102010201020101, 0x0102010201020200, + 0x0102010202000000, 0x0102010202000002, 0x0102010202000101, 0x0102010202000202, + 0x0102010202010100, 0x0102010202010102, 0x0102010202010200, 0x0102010202010201, + 0x0102010202020000, 0x0102010202020100, 0x0102010202020102, 0x0102010202020202, + 0x0102020000010102, 0x0102020000010201, 0x0102020000020101, 0x0102020001000001, + 0x0102020001010002, 0x0102020001010101, 0x0102020001010202, 0x0102020001020001, + 0x0102020001020201, 0x0102020002000101, 0x0102020002010001, 0x0102020002010200, + 0x0102020002020102, 0x0102020100000001, 0x0102020100000100, 0x0102020100010000, + 0x0102020100010101, 0x0102020100020001, 0x0102020100020100, 0x0102020100020102, + 0x0102020100020201, 0x0102020101000000, 0x0102020101000001, 0x0102020101000101, + 0x0102020101000102, 0x0102020101000200, 0x0102020101010001, 0x0102020101010100, + 0x0102020101010101, 0x0102020101010102, 0x0102020101010201, 0x0102020101020000, + 0x0102020101020101, 0x0102020101020202, 0x0102020102000002, 0x0102020102000100, + 0x0102020102000202, 0x0102020102010101, 0x0102020102020001, 0x0102020102020100, + 0x0102020102020101, 0x0102020102020201, 0x0102020200010001, 0x0102020200010102, + 0x0102020200010200, 0x0102020201000001, 0x0102020201000100, 0x0102020201000201, + 0x0102020201010000, 0x0102020201010101, 0x0102020201010200, 0x0102020201010202, + 0x0102020201020100, 0x0102020201020101, 0x0102020201020201, 0x0102020202000102, + 0x0102020202010100, 0x0102020202010200, 0x0102020202010202, 0x0102020202020102, + 0x0200000000000000, 0x0200000000000002, 0x0200000000000200, 0x0200000000000202, + 0x0200000000020000, 0x0200000000020002, 0x0200000000020200, 0x0200000000020202, + 0x0200000001000101, 0x0200000001010000, 0x0200000001010001, 0x0200000001010100, + 0x0200000001010102, 0x0200000001010201, 0x0200000001020101, 0x0200000002000000, + 0x0200000002000002, 0x0200000002000200, 0x0200000002000202, 0x0200000002010101, + 0x0200000002020000, 0x0200000002020002, 0x0200000002020200, 0x0200000002020202, + 0x0200000100000101, 0x0200000100010001, 0x0200000100010100, 0x0200000100010102, + 0x0200000100010201, 0x0200000100020101, 0x0200000101000001, 0x0200000101000100, + 0x0200000101000201, 0x0200000101010000, 0x0200000101010002, 0x0200000101010101, + 0x0200000101010102, 0x0200000101010200, 0x0200000101010201, 0x0200000101020100, + 0x0200000101020102, 0x0200000101020201, 0x0200000102000101, 0x0200000102000201, + 0x0200000102010100, 0x0200000102010102, 0x0200000102010201, 0x0200000102020101, + 0x0200000200000000, 0x0200000200000002, 0x0200000200000200, 0x0200000200000202, + 0x0200000200010101, 0x0200000200020000, 0x0200000200020002, 0x0200000200020200, + 0x0200000200020202, 0x0200000201010001, 0x0200000201010100, 0x0200000201010201, + 0x0200000201020101, 0x0200000202000000, 0x0200000202000002, 0x0200000202000200, + 0x0200000202000202, 0x0200000202010101, 0x0200000202020000, 0x0200000202020002, + 0x0200000202020200, 0x0200000202020202, 0x0200010000010100, 0x0200010000010201, + 0x0200010001000001, 0x0200010001000100, 0x0200010001010001, 0x0200010001010101, + 0x0200010001010202, 0x0200010001020001, 0x0200010001020100, 0x0200010001020201, + 0x0200010002010100, 0x0200010002010201, 0x0200010100000001, 0x0200010100000201, + 
0x0200010100010002, 0x0200010100010101, 0x0200010100010202, 0x0200010100020102, + 0x0200010100020201, 0x0200010101000000, 0x0200010101000001, 0x0200010101000101, + 0x0200010101000200, 0x0200010101010001, 0x0200010101010100, 0x0200010101010101, + 0x0200010101010102, 0x0200010101010201, 0x0200010101010202, 0x0200010101020101, + 0x0200010101020102, 0x0200010101020200, 0x0200010101020202, 0x0200010102000001, + 0x0200010102000100, 0x0200010102000102, 0x0200010102000201, 0x0200010102010000, + 0x0200010102010002, 0x0200010102010101, 0x0200010102010200, 0x0200010102020102, + 0x0200010200010001, 0x0200010200010102, 0x0200010200010201, 0x0200010200020101, + 0x0200010201000001, 0x0200010201000100, 0x0200010201000201, 0x0200010201000202, + 0x0200010201010000, 0x0200010201010101, 0x0200010201010201, 0x0200010201010202, + 0x0200010201020001, 0x0200010201020102, 0x0200010201020202, 0x0200010202000101, + 0x0200010202010001, 0x0200010202010202, 0x0200010202020100, 0x0200020000000000, + 0x0200020000000002, 0x0200020000000200, 0x0200020000000202, 0x0200020000010101, + 0x0200020000020000, 0x0200020000020002, 0x0200020000020200, 0x0200020000020202, + 0x0200020001000001, 0x0200020001000101, 0x0200020001010001, 0x0200020001010100, + 0x0200020001010201, 0x0200020001020101, 0x0200020001020201, 0x0200020002000000, + 0x0200020002000002, 0x0200020002000200, 0x0200020002000202, 0x0200020002010101, + 0x0200020002020000, 0x0200020002020002, 0x0200020002020200, 0x0200020002020202, + 0x0200020100000101, 0x0200020100000102, 0x0200020100010001, 0x0200020100010100, + 0x0200020100010102, 0x0200020100020101, 0x0200020101000001, 0x0200020101000100, + 0x0200020101000102, 0x0200020101000201, 0x0200020101010000, 0x0200020101010002, + 0x0200020101010101, 0x0200020101010202, 0x0200020101020001, 0x0200020101020100, + 0x0200020102000101, 0x0200020102010102, 0x0200020102010201, 0x0200020102020101, + 0x0200020200000000, 0x0200020200000002, 0x0200020200000200, 0x0200020200000202, + 0x0200020200010101, 0x0200020200020000, 0x0200020200020002, 0x0200020200020200, + 0x0200020200020202, 0x0200020201000101, 0x0200020201010001, 0x0200020201010100, + 0x0200020201010102, 0x0200020202000000, 0x0200020202000002, 0x0200020202000200, + 0x0200020202000202, 0x0200020202010101, 0x0200020202020000, 0x0200020202020002, + 0x0200020202020200, 0x0200020202020202, 0x0201000000000101, 0x0201000000010001, + 0x0201000000010102, 0x0201000000010200, 0x0201000000010201, 0x0201000000020101, + 0x0201000001000001, 0x0201000001000102, 0x0201000001000201, 0x0201000001010101, + 0x0201000001010200, 0x0201000001010202, 0x0201000001020201, 0x0201000001020202, + 0x0201000002000101, 0x0201000002010001, 0x0201000002010100, 0x0201000002010102, + 0x0201000002010201, 0x0201000002020101, 0x0201000100000001, 0x0201000100000100, + 0x0201000100000102, 0x0201000100000201, 0x0201000100010000, 0x0201000100010101, + 0x0201000100010200, 0x0201000100010202, 0x0201000100020001, 0x0201000100020100, + 0x0201000100020102, 0x0201000100020201, 0x0201000101000000, 0x0201000101000101, + 0x0201000101010000, 0x0201000101010001, 0x0201000101010100, 0x0201000101010101, + 0x0201000101010102, 0x0201000101010201, 0x0201000101020002, 0x0201000101020101, + 0x0201000102000100, 0x0201000102000102, 0x0201000102010002, 0x0201000102010101, + 0x0201000102010200, 0x0201000102020001, 0x0201000102020100, 0x0201000102020102, + 0x0201000102020201, 0x0201000200000101, 0x0201000200010001, 0x0201000200010100, + 0x0201000200010201, 0x0201000200020101, 0x0201000201000100, 0x0201000201000102, + 0x0201000201000201, 
0x0201000201010000, 0x0201000201010002, 0x0201000201010101, + 0x0201000201010200, 0x0201000201020102, 0x0201000201020201, 0x0201000202000101, + 0x0201000202010100, 0x0201000202010102, 0x0201000202020201, 0x0201010000000001, + 0x0201010000000100, 0x0201010000000102, 0x0201010000010000, 0x0201010000010101, + 0x0201010000010200, 0x0201010000020102, 0x0201010001000000, 0x0201010001000202, + 0x0201010001010001, 0x0201010001010100, 0x0201010001010101, 0x0201010001010102, + 0x0201010001010200, 0x0201010001010201, 0x0201010001020000, 0x0201010001020001, + 0x0201010001020002, 0x0201010001020101, 0x0201010002000100, 0x0201010002000102, + 0x0201010002010002, 0x0201010002010100, 0x0201010002010101, 0x0201010002010200, + 0x0201010002020001, 0x0201010002020201, 0x0201010100000000, 0x0201010100000101, + 0x0201010100000200, 0x0201010100000202, 0x0201010100010000, 0x0201010100010001, + 0x0201010100010100, 0x0201010100010101, 0x0201010100010102, 0x0201010100010201, + 0x0201010100020001, 0x0201010100020101, 0x0201010100020201, 0x0201010100020202, + 0x0201010101000001, 0x0201010101000100, 0x0201010101000101, 0x0201010101000102, + 0x0201010101000201, 0x0201010101010000, 0x0201010101010001, 0x0201010101010002, + 0x0201010101010100, 0x0201010101010101, 0x0201010101010102, 0x0201010101010200, + 0x0201010101010201, 0x0201010101010202, 0x0201010101020001, 0x0201010101020100, + 0x0201010101020101, 0x0201010101020102, 0x0201010101020201, 0x0201010102000001, + 0x0201010102000101, 0x0201010102000200, 0x0201010102010001, 0x0201010102010002, + 0x0201010102010100, 0x0201010102010101, 0x0201010102010102, 0x0201010102010201, + 0x0201010102010202, 0x0201010102020000, 0x0201010102020002, 0x0201010102020101, + 0x0201010102020200, 0x0201010102020202, 0x0201010200000001, 0x0201010200000100, + 0x0201010200010000, 0x0201010200010101, 0x0201010200010201, 0x0201010200020000, + 0x0201010200020102, 0x0201010200020201, 0x0201010201000101, 0x0201010201000200, + 0x0201010201000201, 0x0201010201010001, 0x0201010201010002, 0x0201010201010101, + 0x0201010201010102, 0x0201010201010201, 0x0201010201020101, 0x0201010201020200, + 0x0201010202000002, 0x0201010202000100, 0x0201010202000201, 0x0201010202000202, + 0x0201010202010002, 0x0201010202010100, 0x0201010202010101, 0x0201010202020100, + 0x0201010202020102, 0x0201010202020201, 0x0201020000000101, 0x0201020000010102, + 0x0201020000010201, 0x0201020000020101, 0x0201020001000001, 0x0201020001000102, + 0x0201020001010000, 0x0201020001010002, 0x0201020001010101, 0x0201020001010102, + 0x0201020001010202, 0x0201020001020100, 0x0201020001020101, 0x0201020002000101, + 0x0201020002010001, 0x0201020002010102, 0x0201020002010201, 0x0201020002020101, + 0x0201020100000100, 0x0201020100000102, 0x0201020100000201, 0x0201020100010000, + 0x0201020100010002, 0x0201020100010101, 0x0201020100010200, 0x0201020100010202, + 0x0201020100020000, 0x0201020100020001, 0x0201020100020100, 0x0201020100020102, + 0x0201020101000000, 0x0201020101000002, 0x0201020101000101, 0x0201020101000200, + 0x0201020101000202, 0x0201020101010001, 0x0201020101010100, 0x0201020101010101, + 0x0201020101010102, 0x0201020101010201, 0x0201020101020002, 0x0201020101020101, + 0x0201020101020102, 0x0201020101020202, 0x0201020102000001, 0x0201020102000100, + 0x0201020102010000, 0x0201020102010002, 0x0201020102010101, 0x0201020102010202, + 0x0201020102020001, 0x0201020102020102, 0x0201020200000101, 0x0201020200010101, + 0x0201020200020101, 0x0201020201000100, 0x0201020201000102, 0x0201020201000201, + 0x0201020201010000, 0x0201020201010101, 
0x0201020201010200, 0x0201020201020001, + 0x0201020202000101, 0x0201020202010001, 0x0201020202010100, 0x0201020202010101, + 0x0201020202010102, 0x0202000000000000, 0x0202000000000002, 0x0202000000000200, + 0x0202000000000202, 0x0202000000010101, 0x0202000000020000, 0x0202000000020002, + 0x0202000000020200, 0x0202000000020202, 0x0202000001000101, 0x0202000001010001, + 0x0202000001010100, 0x0202000001010102, 0x0202000001010201, 0x0202000002000000, + 0x0202000002000002, 0x0202000002000200, 0x0202000002000202, 0x0202000002010101, + 0x0202000002020000, 0x0202000002020002, 0x0202000002020200, 0x0202000002020202, + 0x0202000100000101, 0x0202000100000201, 0x0202000100010001, 0x0202000100010100, + 0x0202000100010102, 0x0202000100010201, 0x0202000100010202, 0x0202000101000102, + 0x0202000101000201, 0x0202000101010001, 0x0202000101010101, 0x0202000101010200, + 0x0202000101010202, 0x0202000101020001, 0x0202000101020100, 0x0202000102000101, + 0x0202000102010000, 0x0202000102010002, 0x0202000102010102, 0x0202000102010201, + 0x0202000200000002, 0x0202000200000200, 0x0202000200000202, 0x0202000200010000, + 0x0202000200010201, 0x0202000200020002, 0x0202000200020200, 0x0202000200020202, + 0x0202000201000101, 0x0202000201010001, 0x0202000201010102, 0x0202000201010201, + 0x0202000201020101, 0x0202000202000000, 0x0202000202000002, 0x0202000202000200, + 0x0202000202000202, 0x0202000202010101, 0x0202000202020000, 0x0202000202020002, + 0x0202000202020200, 0x0202000202020202, 0x0202010000010201, 0x0202010000020101, + 0x0202010001000001, 0x0202010001000100, 0x0202010001010000, 0x0202010001010100, + 0x0202010001010101, 0x0202010001010200, 0x0202010001010202, 0x0202010001020001, + 0x0202010001020101, 0x0202010001020102, 0x0202010001020200, 0x0202010001020201, + 0x0202010002000101, 0x0202010100000102, 0x0202010100000201, 0x0202010100010000, + 0x0202010100010002, 0x0202010100010101, 0x0202010100010200, 0x0202010100020102, + 0x0202010100020201, 0x0202010101000002, 0x0202010101000101, 0x0202010101010001, + 0x0202010101010100, 0x0202010101010101, 0x0202010101010102, 0x0202010101010201, + 0x0202010101020101, 0x0202010101020202, 0x0202010102000001, 0x0202010102000100, + 0x0202010102000101, 0x0202010102000102, 0x0202010102000201, 0x0202010102010002, + 0x0202010102010101, 0x0202010102010200, 0x0202010200000101, 0x0202010200010001, + 0x0202010200010102, 0x0202010200010202, 0x0202010200020001, 0x0202010200020101, + 0x0202010201000100, 0x0202010201000102, 0x0202010201000202, 0x0202010201010002, + 0x0202010201010101, 0x0202010201010102, 0x0202010201010200, 0x0202010201020000, + 0x0202010201020002, 0x0202010202000102, 0x0202010202010000, 0x0202010202010101, + 0x0202010202010102, 0x0202010202010201, 0x0202010202020001, 0x0202010202020100, + 0x0202010202020102, 0x0202020000000000, 0x0202020000000002, 0x0202020000000200, + 0x0202020000000202, 0x0202020000020000, 0x0202020000020002, 0x0202020000020200, + 0x0202020000020202, 0x0202020001010001, 0x0202020001010100, 0x0202020001010102, + 0x0202020001010201, 0x0202020002000000, 0x0202020002000002, 0x0202020002000200, + 0x0202020002000202, 0x0202020002010101, 0x0202020002020000, 0x0202020002020002, + 0x0202020002020200, 0x0202020002020202, 0x0202020100000101, 0x0202020100010100, + 0x0202020100010201, 0x0202020100020001, 0x0202020100020101, 0x0202020101000001, + 0x0202020101010000, 0x0202020101010101, 0x0202020101010202, 0x0202020101020001, + 0x0202020101020102, 0x0202020101020201, 0x0202020102010000, 0x0202020102010102, + 0x0202020200000000, 0x0202020200000002, 0x0202020200000200, 
0x0202020200000202, + 0x0202020200020000, 0x0202020200020002, 0x0202020200020200, 0x0202020200020202, + 0x0202020201010001, 0x0202020201010100, 0x0202020201010102, 0x0202020202000000, + 0x0202020202000002, 0x0202020202000200, 0x0202020202000202, 0x0202020202010101, + 0x0202020202020000, 0x0202020202020002, 0x0202020202020200, 0x0202020202020202, +}; +#else +static const uint32_t iq1s_grid_us[2048] = { + 0x00000000, 0x00000002, 0x00000101, 0x00000200, 0x00000202, 0x00010001, 0x00010101, 0x00020000, + 0x00020002, 0x00020200, 0x00020202, 0x01000101, 0x01010001, 0x01010100, 0x01010102, 0x01020101, + 0x02000000, 0x02000002, 0x02000200, 0x02000202, 0x02010101, 0x02020000, 0x02020002, 0x02020200, + 0x02020202, 0x00000110, 0x00000111, 0x00010011, 0x00010110, 0x00010112, 0x00010211, 0x00010212, + 0x00020111, 0x01000011, 0x01000112, 0x01000211, 0x01010012, 0x01010111, 0x01010212, 0x01020011, + 0x01020110, 0x01020112, 0x01020210, 0x02000111, 0x02010011, 0x02010110, 0x02010112, 0x02020111, + 0x00000020, 0x00000022, 0x00000220, 0x00000222, 0x00010121, 0x00020020, 0x00020022, 0x00020220, + 0x00020222, 0x01000121, 0x01010021, 0x01010221, 0x01020120, 0x01020221, 0x02000020, 0x02000022, + 0x02000220, 0x02000222, 0x02010021, 0x02010121, 0x02010221, 0x02020020, 0x02020022, 0x02020220, + 0x02020222, 0x00011001, 0x00011100, 0x00011102, 0x00021101, 0x01001001, 0x01001201, 0x01011101, + 0x01011202, 0x01021100, 0x01021101, 0x02011001, 0x02011201, 0x02021101, 0x00001011, 0x00001110, + 0x00001111, 0x00001112, 0x00011111, 0x00011210, 0x00011212, 0x00021211, 0x01001010, 0x01001111, + 0x01001212, 0x01011010, 0x01011011, 0x01011110, 0x01011111, 0x01011112, 0x01011211, 0x01021010, + 0x01021012, 0x01021111, 0x01021210, 0x01021212, 0x02001011, 0x02011011, 0x02011111, 0x02011210, + 0x02011212, 0x02021011, 0x02021110, 0x02021111, 0x02021112, 0x02021211, 0x00011120, 0x00011221, + 0x01001021, 0x01001120, 0x01011020, 0x01011022, 0x01011121, 0x01011220, 0x01021020, 0x01021021, + 0x01021122, 0x01021221, 0x02001121, 0x02011021, 0x02011120, 0x02011221, 0x00002000, 0x00002002, + 0x00002200, 0x00002202, 0x00012101, 0x00022000, 0x00022002, 0x00022200, 0x00022202, 0x01002101, + 0x01012001, 0x01012102, 0x01022101, 0x02002000, 0x02002002, 0x02002200, 0x02002202, 0x02012101, + 0x02022000, 0x02022002, 0x02022200, 0x02022202, 0x00002111, 0x00012011, 0x00012110, 0x00012211, + 0x00022110, 0x00022111, 0x01002011, 0x01012010, 0x01012011, 0x01012111, 0x01022011, 0x01022110, + 0x01022211, 0x02012011, 0x02012110, 0x02012112, 0x02012211, 0x02022111, 0x00002020, 0x00002022, + 0x00002220, 0x00002222, 0x00012121, 0x00022020, 0x00022022, 0x00022220, 0x00022222, 0x01002121, + 0x01012021, 0x01012221, 0x01022021, 0x01022121, 0x02002020, 0x02002022, 0x02002121, 0x02002220, + 0x02002222, 0x02012121, 0x02022020, 0x02022022, 0x02022220, 0x02022222, 0x00110000, 0x00110001, + 0x00110100, 0x00110201, 0x00120100, 0x00120101, 0x01100001, 0x01100100, 0x01110000, 0x01110101, + 0x01110200, 0x01120001, 0x01120100, 0x01120101, 0x01120201, 0x02110001, 0x02110100, 0x02110102, + 0x02120001, 0x02120101, 0x00100011, 0x00100110, 0x00100112, 0x00100211, 0x00110010, 0x00110012, + 0x00110111, 0x00110210, 0x00120011, 0x00120110, 0x00120211, 0x01100111, 0x01100212, 0x01110010, + 0x01110011, 0x01110012, 0x01110110, 0x01110111, 0x01110112, 0x01110211, 0x01120010, 0x01120111, + 0x02100110, 0x02110012, 0x02110111, 0x02120011, 0x02120110, 0x00110021, 0x00110120, 0x00110122, + 0x00120121, 0x01100020, 0x01100122, 0x01100221, 0x01110022, 0x01110121, 0x01110220, 0x01110222, + 0x01120120, 
0x01120122, 0x02100121, 0x02110021, 0x02110120, 0x02110122, 0x02120121, 0x00101001, + 0x00101102, 0x00101201, 0x00111100, 0x00111101, 0x00111200, 0x00111201, 0x00121001, 0x00121102, + 0x01101001, 0x01101101, 0x01101102, 0x01101200, 0x01101202, 0x01111001, 0x01111100, 0x01111101, + 0x01111102, 0x01111201, 0x01121002, 0x01121101, 0x01121200, 0x02101100, 0x02101201, 0x02111000, + 0x02111100, 0x02111101, 0x02111200, 0x02111201, 0x02111202, 0x02121001, 0x02121100, 0x02121101, + 0x02121201, 0x00101012, 0x00101111, 0x00101212, 0x00111011, 0x00111110, 0x00111111, 0x00111112, + 0x00111211, 0x00121010, 0x00121012, 0x00121111, 0x00121210, 0x00121212, 0x01101011, 0x01101110, + 0x01101111, 0x01101112, 0x01111011, 0x01111012, 0x01111110, 0x01111111, 0x01111112, 0x01111211, + 0x01111212, 0x01121011, 0x01121110, 0x01121111, 0x01121112, 0x01121211, 0x02101010, 0x02101012, + 0x02101110, 0x02101111, 0x02101210, 0x02101212, 0x02111010, 0x02111011, 0x02111110, 0x02111111, + 0x02111112, 0x02111211, 0x02111212, 0x02121010, 0x02121012, 0x02121111, 0x00101021, 0x00101120, + 0x00101121, 0x00101122, 0x00111121, 0x00111122, 0x00111220, 0x00111222, 0x00121021, 0x00121122, + 0x01101020, 0x01101022, 0x01101120, 0x01101121, 0x01101220, 0x01101222, 0x01111021, 0x01111121, + 0x01111122, 0x01111220, 0x01111221, 0x01121021, 0x01121120, 0x01121121, 0x01121220, 0x01121221, + 0x01121222, 0x02101122, 0x02101222, 0x02111022, 0x02111121, 0x02121120, 0x02121221, 0x00112001, + 0x00112102, 0x00122101, 0x01102001, 0x01102100, 0x01102102, 0x01102201, 0x01112000, 0x01112101, + 0x01112200, 0x01112202, 0x01122000, 0x01122001, 0x01122100, 0x01122102, 0x01122201, 0x02102101, + 0x02112001, 0x02112100, 0x02122101, 0x00112010, 0x00112012, 0x00112111, 0x00112212, 0x00122011, + 0x00122111, 0x01102012, 0x01102110, 0x01102111, 0x01102210, 0x01112011, 0x01112110, 0x01112111, + 0x01112112, 0x01112211, 0x01112212, 0x01122010, 0x01122111, 0x01122212, 0x02102211, 0x02112011, + 0x02112012, 0x02112111, 0x02112210, 0x02122011, 0x02122112, 0x02122211, 0x00102221, 0x00112122, + 0x00122120, 0x00122122, 0x01102120, 0x01102122, 0x01102221, 0x01112020, 0x01112022, 0x01112121, + 0x01112220, 0x01122021, 0x01122122, 0x01122221, 0x02102121, 0x02112021, 0x02112122, 0x02112222, + 0x00200000, 0x00200002, 0x00200200, 0x00200202, 0x00210101, 0x00220000, 0x00220002, 0x00220101, + 0x00220200, 0x00220202, 0x01200101, 0x01210001, 0x01210201, 0x01220001, 0x01220101, 0x02200000, + 0x02200002, 0x02200200, 0x02200202, 0x02210101, 0x02220000, 0x02220002, 0x02220101, 0x02220200, + 0x02220202, 0x00200111, 0x00210011, 0x00210110, 0x00210211, 0x00220111, 0x01200012, 0x01200110, + 0x01200211, 0x01210111, 0x01210210, 0x01210212, 0x01220011, 0x01220110, 0x01220111, 0x01220112, + 0x02200111, 0x02210010, 0x02210112, 0x02210211, 0x02220111, 0x00200021, 0x00200220, 0x00200222, + 0x00210021, 0x00210121, 0x00220020, 0x00220022, 0x00220220, 0x00220222, 0x01200121, 0x01210021, + 0x01210122, 0x01210221, 0x01220121, 0x02200021, 0x02200220, 0x02200222, 0x02210021, 0x02210121, + 0x02220020, 0x02220022, 0x02220220, 0x02220222, 0x00201101, 0x00211100, 0x00211102, 0x00211201, + 0x00221101, 0x01201100, 0x01201101, 0x01201102, 0x01201201, 0x01211002, 0x01211101, 0x01211200, + 0x01211202, 0x01221102, 0x02201101, 0x02211001, 0x02211100, 0x02211201, 0x02221001, 0x02221101, + 0x00201211, 0x00211111, 0x00221011, 0x00221211, 0x01201010, 0x01201111, 0x01201210, 0x01211011, + 0x01211110, 0x01211111, 0x01211211, 0x01221012, 0x01221111, 0x01221210, 0x02201211, 0x02211010, + 0x02211110, 0x02211111, 0x02211210, 
0x02211212, 0x02221011, 0x02221110, 0x02221112, 0x02221211, + 0x00201121, 0x00211020, 0x00211022, 0x00211221, 0x00221121, 0x01201021, 0x01201221, 0x01211121, + 0x01221020, 0x01221021, 0x01221221, 0x02201120, 0x02201122, 0x02211020, 0x02211222, 0x00202000, + 0x00202002, 0x00202200, 0x00202202, 0x00212101, 0x00222000, 0x00222002, 0x00222200, 0x00222202, + 0x01202101, 0x01212001, 0x01212100, 0x01222101, 0x02202000, 0x02202002, 0x02202200, 0x02202202, + 0x02222000, 0x02222002, 0x02222200, 0x02222202, 0x00202211, 0x00212011, 0x00212110, 0x00212211, + 0x00222111, 0x01202112, 0x01202211, 0x01212012, 0x01212111, 0x01222011, 0x01222110, 0x01222112, + 0x01222211, 0x02202111, 0x02212010, 0x02212112, 0x02212211, 0x02222110, 0x02222111, 0x00202020, + 0x00202022, 0x00202220, 0x00202222, 0x00222020, 0x00222022, 0x00222220, 0x00222222, 0x01202121, + 0x01212021, 0x01212122, 0x01212221, 0x01222121, 0x02202020, 0x02202022, 0x02202220, 0x02202222, + 0x02212121, 0x02222020, 0x02222022, 0x02222220, 0x02222222, 0x10000101, 0x10010001, 0x10010102, + 0x10020101, 0x11000201, 0x11010002, 0x11010101, 0x11010200, 0x11010202, 0x11020001, 0x11020100, + 0x11020102, 0x12010100, 0x12010201, 0x12020001, 0x12020102, 0x10000010, 0x10000011, 0x10000110, + 0x10000112, 0x10000211, 0x10010012, 0x10010111, 0x10010112, 0x10010210, 0x10010212, 0x10020011, + 0x10020112, 0x10020211, 0x11000111, 0x11000210, 0x11000212, 0x11010011, 0x11010110, 0x11010111, + 0x11010112, 0x11010211, 0x11010212, 0x11020111, 0x11020210, 0x11020212, 0x12000011, 0x12000110, + 0x12000112, 0x12010010, 0x12010012, 0x12010111, 0x12020010, 0x12020011, 0x12020012, 0x10000121, + 0x10010021, 0x10010120, 0x10010122, 0x10020121, 0x11000021, 0x11010022, 0x11010121, 0x11010222, + 0x11020120, 0x11020221, 0x12000221, 0x12010120, 0x12020121, 0x10001001, 0x10011101, 0x10011201, + 0x10021201, 0x11001101, 0x11001200, 0x11001202, 0x11011001, 0x11011100, 0x11011101, 0x11011102, + 0x11021001, 0x11021002, 0x11021101, 0x11021200, 0x11021202, 0x12001001, 0x12001102, 0x12001201, + 0x12011000, 0x12011002, 0x12011101, 0x12021000, 0x12021001, 0x12021201, 0x10001011, 0x10001012, + 0x10001111, 0x10001212, 0x10011011, 0x10011110, 0x10011111, 0x10011112, 0x10011211, 0x10021010, + 0x10021111, 0x10021212, 0x11001011, 0x11001110, 0x11001111, 0x11001112, 0x11001211, 0x11011010, + 0x11011011, 0x11011110, 0x11011111, 0x11011112, 0x11011210, 0x11011211, 0x11021011, 0x11021110, + 0x11021111, 0x11021112, 0x11021211, 0x12001012, 0x12001110, 0x12001111, 0x12001210, 0x12011011, + 0x12011110, 0x12011111, 0x12011112, 0x12011211, 0x12011212, 0x12021111, 0x12021210, 0x12021212, + 0x10001021, 0x10001121, 0x10001221, 0x10011120, 0x10011121, 0x10011220, 0x10011222, 0x10021021, + 0x10021120, 0x10021221, 0x11001020, 0x11001022, 0x11001121, 0x11001220, 0x11011020, 0x11011021, + 0x11011022, 0x11011121, 0x11011122, 0x11011221, 0x11021022, 0x11021121, 0x11021220, 0x12001021, + 0x12001121, 0x12001222, 0x12011120, 0x12011121, 0x12021021, 0x12021120, 0x12021122, 0x10002101, + 0x10012001, 0x10012101, 0x10012202, 0x10022101, 0x11002002, 0x11002201, 0x11012000, 0x11012101, + 0x11012200, 0x11022001, 0x11022100, 0x11022102, 0x11022201, 0x12002101, 0x12012001, 0x12012100, + 0x12012102, 0x12012201, 0x12022101, 0x10002011, 0x10002111, 0x10002112, 0x10002212, 0x10012010, + 0x10012110, 0x10012111, 0x10012210, 0x10022011, 0x10022110, 0x10022112, 0x11002010, 0x11002111, + 0x11002212, 0x11012011, 0x11012012, 0x11012110, 0x11012111, 0x11012112, 0x11012211, 0x11022010, + 0x11022012, 0x11022111, 0x11022112, 0x11022212, 0x12002112, 
0x12002211, 0x12012012, 0x12012111, + 0x12012112, 0x12012210, 0x12022011, 0x12022110, 0x12022112, 0x12022211, 0x10012122, 0x11002120, + 0x11002122, 0x11002221, 0x11012121, 0x11012220, 0x11012222, 0x11022120, 0x11022221, 0x12012120, + 0x12022121, 0x10100001, 0x10100100, 0x10100101, 0x10100102, 0x10100201, 0x10110002, 0x10110101, + 0x10110202, 0x10120001, 0x10120100, 0x10120201, 0x11100000, 0x11100101, 0x11100200, 0x11110001, + 0x11110100, 0x11110101, 0x11110102, 0x11110201, 0x11120101, 0x11120200, 0x12100102, 0x12100201, + 0x12110101, 0x12110200, 0x12120000, 0x12120001, 0x12120102, 0x12120201, 0x10100111, 0x10100210, + 0x10100211, 0x10100212, 0x10110011, 0x10110110, 0x10110111, 0x10110112, 0x10110210, 0x10110211, + 0x10120010, 0x10120111, 0x10120112, 0x10120210, 0x10120212, 0x11100011, 0x11100110, 0x11100111, + 0x11100112, 0x11100211, 0x11110010, 0x11110011, 0x11110012, 0x11110110, 0x11110111, 0x11110112, + 0x11110210, 0x11110211, 0x11110212, 0x11120011, 0x11120110, 0x11120111, 0x11120112, 0x11120211, + 0x12100012, 0x12100111, 0x12110011, 0x12110110, 0x12110111, 0x12110112, 0x12110211, 0x12120010, + 0x12120111, 0x12120212, 0x10100021, 0x10100122, 0x10110022, 0x10110121, 0x10110222, 0x10120021, + 0x10120120, 0x11100022, 0x11100121, 0x11100222, 0x11110021, 0x11110120, 0x11110121, 0x11110122, + 0x11110221, 0x11120022, 0x11120121, 0x12100121, 0x12110020, 0x12110022, 0x12110121, 0x12110221, + 0x12110222, 0x12120120, 0x10101100, 0x10101101, 0x10111001, 0x10111100, 0x10111101, 0x10111102, + 0x10111200, 0x10111201, 0x10121001, 0x10121101, 0x10121200, 0x10121202, 0x11101001, 0x11101100, + 0x11101101, 0x11101102, 0x11101201, 0x11101202, 0x11111000, 0x11111001, 0x11111100, 0x11111101, + 0x11111102, 0x11111200, 0x11111201, 0x11111202, 0x11121001, 0x11121002, 0x11121100, 0x11121101, + 0x11121102, 0x11121201, 0x12101000, 0x12101200, 0x12101202, 0x12111001, 0x12111100, 0x12111101, + 0x12111102, 0x12111201, 0x12121001, 0x12121100, 0x12121101, 0x12121202, 0x10101011, 0x10101012, + 0x10101110, 0x10101111, 0x10101112, 0x10101211, 0x10111010, 0x10111011, 0x10111012, 0x10111110, + 0x10111111, 0x10111112, 0x10111211, 0x10111212, 0x10121011, 0x10121110, 0x10121111, 0x10121112, + 0x10121211, 0x11101010, 0x11101011, 0x11101012, 0x11101110, 0x11101111, 0x11101112, 0x11101210, + 0x11101211, 0x11111010, 0x11111011, 0x11111012, 0x11111110, 0x11111111, 0x11111112, 0x11111210, + 0x11111211, 0x11111212, 0x11121010, 0x11121011, 0x11121110, 0x11121111, 0x11121112, 0x11121210, + 0x11121211, 0x11121212, 0x12101011, 0x12101110, 0x12101111, 0x12101211, 0x12101212, 0x12111010, + 0x12111011, 0x12111110, 0x12111111, 0x12111112, 0x12111210, 0x12111211, 0x12121011, 0x12121110, + 0x12121111, 0x12121112, 0x12121211, 0x10101020, 0x10101021, 0x10101022, 0x10101120, 0x10101122, + 0x10101220, 0x10101221, 0x10111021, 0x10111120, 0x10111121, 0x10111220, 0x10111221, 0x10121020, + 0x10121021, 0x10121022, 0x10121120, 0x10121121, 0x10121122, 0x10121220, 0x10121221, 0x11101021, + 0x11101121, 0x11101122, 0x11101220, 0x11101221, 0x11101222, 0x11111020, 0x11111021, 0x11111022, + 0x11111120, 0x11111121, 0x11111122, 0x11111220, 0x11111221, 0x11111222, 0x11121021, 0x11121120, + 0x11121121, 0x11121221, 0x12101022, 0x12101121, 0x12101122, 0x12101220, 0x12101221, 0x12101222, + 0x12111021, 0x12111121, 0x12111222, 0x12121022, 0x12121121, 0x12121122, 0x12121220, 0x12121221, + 0x10102100, 0x10102101, 0x10102102, 0x10102201, 0x10112000, 0x10112101, 0x10112200, 0x10122001, + 0x10122202, 0x11102101, 0x11102200, 0x11102202, 0x11112001, 0x11112100, 0x11112101, 
0x11112102, + 0x11112200, 0x11112201, 0x11122000, 0x11122002, 0x11122100, 0x11122101, 0x12102002, 0x12102201, + 0x12112000, 0x12112002, 0x12112101, 0x12112200, 0x12122001, 0x12122201, 0x10102011, 0x10102012, + 0x10102111, 0x10102212, 0x10112011, 0x10112110, 0x10112111, 0x10112112, 0x10112211, 0x10122111, + 0x11102011, 0x11102110, 0x11102111, 0x11102112, 0x11102211, 0x11112010, 0x11112011, 0x11112012, + 0x11112110, 0x11112111, 0x11112112, 0x11112210, 0x11112211, 0x11112212, 0x11122011, 0x11122110, + 0x11122111, 0x11122112, 0x11122211, 0x12102011, 0x12102111, 0x12102211, 0x12112011, 0x12112110, + 0x12112111, 0x12112112, 0x12112210, 0x12112211, 0x12122111, 0x10102120, 0x10102220, 0x10112121, + 0x10112222, 0x10122020, 0x10122121, 0x10122122, 0x10122221, 0x11102121, 0x11102220, 0x11102221, + 0x11112021, 0x11112121, 0x11112122, 0x11112220, 0x11112221, 0x11122022, 0x11122121, 0x11122220, + 0x11122222, 0x12102021, 0x12102222, 0x12112022, 0x12112121, 0x12112122, 0x12112220, 0x12112222, + 0x12122021, 0x10200101, 0x10210100, 0x10210102, 0x10210201, 0x10220101, 0x11200100, 0x11210000, + 0x11210101, 0x11210102, 0x11210200, 0x11210202, 0x11220001, 0x11220100, 0x11220102, 0x11220201, + 0x12200001, 0x12210102, 0x12220101, 0x10200011, 0x10200110, 0x10200112, 0x10200211, 0x10210012, + 0x10210111, 0x10220011, 0x10220012, 0x10220112, 0x10220211, 0x11200111, 0x11200211, 0x11210011, + 0x11210111, 0x11210112, 0x11210211, 0x11220111, 0x11220112, 0x11220212, 0x12200110, 0x12200212, + 0x12210012, 0x12210111, 0x12220011, 0x12220112, 0x12220211, 0x10210021, 0x10210122, 0x10210221, + 0x11200020, 0x11200021, 0x11200122, 0x11210121, 0x11210122, 0x11210220, 0x11220020, 0x12200121, + 0x12210021, 0x12210122, 0x12220121, 0x10211001, 0x10211002, 0x10211101, 0x10211102, 0x10211202, + 0x10221001, 0x10221102, 0x10221201, 0x11201000, 0x11201002, 0x11201101, 0x11201200, 0x11201202, + 0x11211001, 0x11211100, 0x11211101, 0x11211102, 0x11211201, 0x11211202, 0x11221000, 0x11221002, + 0x11221101, 0x12201100, 0x12201101, 0x12201201, 0x12211000, 0x12211002, 0x12211100, 0x12211101, + 0x12211102, 0x12211200, 0x12211202, 0x12221001, 0x12221100, 0x12221201, 0x10201111, 0x10201210, + 0x10201212, 0x10211011, 0x10211111, 0x10211112, 0x10211211, 0x11201110, 0x11201111, 0x11201112, + 0x11201211, 0x11211010, 0x11211011, 0x11211110, 0x11211111, 0x11211112, 0x11211211, 0x11221011, + 0x11221110, 0x11221111, 0x11221112, 0x11221211, 0x12201112, 0x12201211, 0x12201212, 0x12211011, + 0x12211111, 0x12211112, 0x12211211, 0x12211212, 0x12221012, 0x12221111, 0x12221112, 0x12221210, + 0x10201022, 0x10201221, 0x10211121, 0x10221020, 0x10221122, 0x10221220, 0x10221221, 0x11201020, + 0x11201121, 0x11201220, 0x11201222, 0x11211021, 0x11211120, 0x11211121, 0x11211122, 0x11211220, + 0x11211222, 0x11221020, 0x11221121, 0x11221220, 0x12201020, 0x12201022, 0x12201121, 0x12201222, + 0x12211120, 0x12211122, 0x12211220, 0x12211221, 0x12221020, 0x12221120, 0x12221122, 0x12221222, + 0x10212102, 0x10212201, 0x10222101, 0x11202001, 0x11212002, 0x11212101, 0x11212202, 0x11222001, + 0x11222201, 0x12202101, 0x12212001, 0x12212200, 0x12222102, 0x10202011, 0x10202110, 0x10212010, + 0x10212111, 0x10222011, 0x10222110, 0x10222112, 0x10222211, 0x11202010, 0x11202011, 0x11202111, + 0x11202112, 0x11202210, 0x11212011, 0x11212110, 0x11212111, 0x11212112, 0x11212211, 0x11222010, + 0x11222111, 0x11222212, 0x12202012, 0x12202110, 0x12202212, 0x12212111, 0x12222011, 0x12222110, + 0x12222111, 0x12222211, 0x10212021, 0x10212122, 0x10212220, 0x11202021, 0x11202120, 0x11202221, + 0x11212020, 
0x11212121, 0x11212220, 0x11212222, 0x11222120, 0x11222121, 0x11222221, 0x12202122, + 0x12212120, 0x12212220, 0x12212222, 0x12222122, 0x20000000, 0x20000002, 0x20000200, 0x20000202, + 0x20020000, 0x20020002, 0x20020200, 0x20020202, 0x21000101, 0x21010000, 0x21010001, 0x21010100, + 0x21010102, 0x21010201, 0x21020101, 0x22000000, 0x22000002, 0x22000200, 0x22000202, 0x22010101, + 0x22020000, 0x22020002, 0x22020200, 0x22020202, 0x20000111, 0x20010011, 0x20010110, 0x20010112, + 0x20010211, 0x20020111, 0x21000011, 0x21000110, 0x21000211, 0x21010010, 0x21010012, 0x21010111, + 0x21010112, 0x21010210, 0x21010211, 0x21020110, 0x21020112, 0x21020211, 0x22000111, 0x22000211, + 0x22010110, 0x22010112, 0x22010211, 0x22020111, 0x20000020, 0x20000022, 0x20000220, 0x20000222, + 0x20010121, 0x20020020, 0x20020022, 0x20020220, 0x20020222, 0x21010021, 0x21010120, 0x21010221, + 0x21020121, 0x22000020, 0x22000022, 0x22000220, 0x22000222, 0x22010121, 0x22020020, 0x22020022, + 0x22020220, 0x22020222, 0x20011100, 0x20011201, 0x21001001, 0x21001100, 0x21011001, 0x21011101, + 0x21011202, 0x21021001, 0x21021100, 0x21021201, 0x22011100, 0x22011201, 0x20001011, 0x20001211, + 0x20011012, 0x20011111, 0x20011212, 0x20021112, 0x20021211, 0x21001010, 0x21001011, 0x21001111, + 0x21001210, 0x21011011, 0x21011110, 0x21011111, 0x21011112, 0x21011211, 0x21011212, 0x21021111, + 0x21021112, 0x21021210, 0x21021212, 0x22001011, 0x22001110, 0x22001112, 0x22001211, 0x22011010, + 0x22011012, 0x22011111, 0x22011210, 0x22021112, 0x20011021, 0x20011122, 0x20011221, 0x20021121, + 0x21001021, 0x21001120, 0x21001221, 0x21001222, 0x21011020, 0x21011121, 0x21011221, 0x21011222, + 0x21021021, 0x21021122, 0x21021222, 0x22001121, 0x22011021, 0x22011222, 0x22021120, 0x20002000, + 0x20002002, 0x20002200, 0x20002202, 0x20012101, 0x20022000, 0x20022002, 0x20022200, 0x20022202, + 0x21002001, 0x21002101, 0x21012001, 0x21012100, 0x21012201, 0x21022101, 0x21022201, 0x22002000, + 0x22002002, 0x22002200, 0x22002202, 0x22012101, 0x22022000, 0x22022002, 0x22022200, 0x22022202, + 0x20002111, 0x20002112, 0x20012011, 0x20012110, 0x20012112, 0x20022111, 0x21002011, 0x21002110, + 0x21002112, 0x21002211, 0x21012010, 0x21012012, 0x21012111, 0x21012212, 0x21022011, 0x21022110, + 0x22002111, 0x22012112, 0x22012211, 0x22022111, 0x20002020, 0x20002022, 0x20002220, 0x20002222, + 0x20012121, 0x20022020, 0x20022022, 0x20022220, 0x20022222, 0x21002121, 0x21012021, 0x21012120, + 0x21012122, 0x22002020, 0x22002022, 0x22002220, 0x22002222, 0x22012121, 0x22022020, 0x22022022, + 0x22022220, 0x22022222, 0x20100101, 0x20110001, 0x20110102, 0x20110200, 0x20110201, 0x20120101, + 0x21100001, 0x21100102, 0x21100201, 0x21110101, 0x21110200, 0x21110202, 0x21120201, 0x21120202, + 0x22100101, 0x22110001, 0x22110100, 0x22110102, 0x22110201, 0x22120101, 0x20100011, 0x20100110, + 0x20100112, 0x20100211, 0x20110010, 0x20110111, 0x20110210, 0x20110212, 0x20120011, 0x20120110, + 0x20120112, 0x20120211, 0x21100010, 0x21100111, 0x21110010, 0x21110011, 0x21110110, 0x21110111, + 0x21110112, 0x21110211, 0x21120012, 0x21120111, 0x22100110, 0x22100112, 0x22110012, 0x22110111, + 0x22110210, 0x22120011, 0x22120110, 0x22120112, 0x22120211, 0x20100121, 0x20110021, 0x20110120, + 0x20110221, 0x20120121, 0x21100120, 0x21100122, 0x21100221, 0x21110020, 0x21110022, 0x21110121, + 0x21110220, 0x21120122, 0x21120221, 0x22100121, 0x22110120, 0x22110122, 0x22120221, 0x20101001, + 0x20101100, 0x20101102, 0x20111000, 0x20111101, 0x20111200, 0x20121102, 0x21101000, 0x21101202, + 0x21111001, 0x21111100, 0x21111101, 
0x21111102, 0x21111200, 0x21111201, 0x21121000, 0x21121001, + 0x21121002, 0x21121101, 0x22101100, 0x22101102, 0x22111002, 0x22111100, 0x22111101, 0x22111200, + 0x22121001, 0x22121201, 0x20101010, 0x20101111, 0x20101210, 0x20101212, 0x20111010, 0x20111011, + 0x20111110, 0x20111111, 0x20111112, 0x20111211, 0x20121011, 0x20121111, 0x20121211, 0x20121212, + 0x21101011, 0x21101110, 0x21101111, 0x21101112, 0x21101211, 0x21111010, 0x21111011, 0x21111012, + 0x21111110, 0x21111111, 0x21111112, 0x21111210, 0x21111211, 0x21111212, 0x21121011, 0x21121110, + 0x21121111, 0x21121112, 0x21121211, 0x22101011, 0x22101111, 0x22101210, 0x22111011, 0x22111012, + 0x22111110, 0x22111111, 0x22111112, 0x22111211, 0x22111212, 0x22121010, 0x22121012, 0x22121111, + 0x22121210, 0x22121212, 0x20101021, 0x20101120, 0x20111020, 0x20111121, 0x20111221, 0x20121020, + 0x20121122, 0x20121221, 0x21101121, 0x21101220, 0x21101221, 0x21111021, 0x21111022, 0x21111121, + 0x21111122, 0x21111221, 0x21121121, 0x21121220, 0x22101022, 0x22101120, 0x22101221, 0x22101222, + 0x22111022, 0x22111120, 0x22111121, 0x22121120, 0x22121122, 0x22121221, 0x20102101, 0x20112102, + 0x20112201, 0x20122101, 0x21102001, 0x21102102, 0x21112000, 0x21112002, 0x21112101, 0x21112102, + 0x21112202, 0x21122100, 0x21122101, 0x22102101, 0x22112001, 0x22112102, 0x22112201, 0x22122101, + 0x20102110, 0x20102112, 0x20102211, 0x20112010, 0x20112012, 0x20112111, 0x20112210, 0x20112212, + 0x20122010, 0x20122011, 0x20122110, 0x20122112, 0x21102010, 0x21102012, 0x21102111, 0x21102210, + 0x21102212, 0x21112011, 0x21112110, 0x21112111, 0x21112112, 0x21112211, 0x21122012, 0x21122111, + 0x21122112, 0x21122212, 0x22102011, 0x22102110, 0x22112010, 0x22112012, 0x22112111, 0x22112212, + 0x22122011, 0x22122112, 0x20102121, 0x20112121, 0x20122121, 0x21102120, 0x21102122, 0x21102221, + 0x21112020, 0x21112121, 0x21112220, 0x21122021, 0x22102121, 0x22112021, 0x22112120, 0x22112121, + 0x22112122, 0x20200000, 0x20200002, 0x20200200, 0x20200202, 0x20210101, 0x20220000, 0x20220002, + 0x20220200, 0x20220202, 0x21200101, 0x21210001, 0x21210100, 0x21210102, 0x21210201, 0x22200000, + 0x22200002, 0x22200200, 0x22200202, 0x22210101, 0x22220000, 0x22220002, 0x22220200, 0x22220202, + 0x20200111, 0x20200211, 0x20210011, 0x20210110, 0x20210112, 0x20210211, 0x20210212, 0x21200112, + 0x21200211, 0x21210011, 0x21210111, 0x21210210, 0x21210212, 0x21220011, 0x21220110, 0x22200111, + 0x22210010, 0x22210012, 0x22210112, 0x22210211, 0x20200022, 0x20200220, 0x20200222, 0x20210020, + 0x20210221, 0x20220022, 0x20220220, 0x20220222, 0x21200121, 0x21210021, 0x21210122, 0x21210221, + 0x21220121, 0x22200020, 0x22200022, 0x22200220, 0x22200222, 0x22210121, 0x22220020, 0x22220022, + 0x22220220, 0x22220222, 0x20211201, 0x20221101, 0x21201001, 0x21201100, 0x21211000, 0x21211100, + 0x21211101, 0x21211200, 0x21211202, 0x21221001, 0x21221101, 0x21221102, 0x21221200, 0x21221201, + 0x22201101, 0x20201112, 0x20201211, 0x20211010, 0x20211012, 0x20211111, 0x20211210, 0x20221112, + 0x20221211, 0x21201012, 0x21201111, 0x21211011, 0x21211110, 0x21211111, 0x21211112, 0x21211211, + 0x21221111, 0x21221212, 0x22201011, 0x22201110, 0x22201111, 0x22201112, 0x22201211, 0x22211012, + 0x22211111, 0x22211210, 0x20201121, 0x20211021, 0x20211122, 0x20211222, 0x20221021, 0x20221121, + 0x21201120, 0x21201122, 0x21201222, 0x21211022, 0x21211121, 0x21211122, 0x21211220, 0x21221020, + 0x21221022, 0x22201122, 0x22211020, 0x22211121, 0x22211122, 0x22211221, 0x22221021, 0x22221120, + 0x22221122, 0x20202000, 0x20202002, 0x20202200, 0x20202202, 
0x20222000, 0x20222002, 0x20222200, + 0x20222202, 0x21212001, 0x21212100, 0x21212102, 0x21212201, 0x22202000, 0x22202002, 0x22202200, + 0x22202202, 0x22212101, 0x22222000, 0x22222002, 0x22222200, 0x22222202, 0x20202111, 0x20212110, + 0x20212211, 0x20222011, 0x20222111, 0x21202011, 0x21212010, 0x21212111, 0x21212212, 0x21222011, + 0x21222112, 0x21222211, 0x22212010, 0x22212112, 0x20202020, 0x20202022, 0x20202220, 0x20202222, + 0x20222020, 0x20222022, 0x20222220, 0x20222222, 0x21212021, 0x21212120, 0x21212122, 0x22202020, + 0x22202022, 0x22202220, 0x22202222, 0x22212121, 0x22222020, 0x22222022, 0x22222220, 0x22222222, +}; +#endif + +#ifndef HAVE_FANCY_SIMD const uint64_t keven_signs[128] = { 0x0101010101010101, 0xff010101010101ff, 0xff0101010101ff01, 0x010101010101ffff, 0xff01010101ff0101, 0x0101010101ff01ff, 0x0101010101ffff01, 0xff01010101ffffff, @@ -181,31 +989,41 @@ const uint64_t keven_signs[128] = { 0x01ffffffff010101, 0xffffffffff0101ff, 0xffffffffff01ff01, 0x01ffffffff01ffff, 0xffffffffffff0101, 0x01ffffffffff01ff, 0x01ffffffffffff01, 0xffffffffffffffff, }; +#endif } -bool iqk_mul_mat(long Nx, long Ny, long ne00, int typeA, const void * A, const void * B, - float * C, long stride_C, int ith, int nth) { +/* moonll change mulmat +add typeB and strideB +}*/ - MulMat mm; - int row_size_q8; - if (!MulMat::set_mul_mat(typeA, ne00, mm, row_size_q8, Ny)) { - return false; - } +bool iqk_mul_mat(long Nx, long Ny, long ne00, + int typeA, const void * A, long strideA, + int typeB, const void * B, long strideB, + float * C, long stride_C, int ith, int nth) { - auto row_size_qx = ggml_row_size((ggml_type)typeA, ne00); + MulMat mm; + + if (!MulMat::set_mul_mat(typeA, typeB, ne00, mm, Ny)) { + return false; + } - auto nrc_x = (Nx + nth - 1)/nth; - auto first_x = ith*nrc_x; - if (first_x + nrc_x > Nx) nrc_x = Nx - first_x; + size_t row_size_qx = strideA*ggml_type_size(ggml_type(typeA)); + size_t row_size_qy = strideB*ggml_type_size(ggml_type(typeB)); + + + auto nrc_x = (Nx + nth - 1)/nth; + auto first_x = ith*nrc_x; + if (first_x + nrc_x > Nx) nrc_x = Nx - first_x; - DataInfo info{C + first_x, (const char *)B, (size_t)stride_C, (size_t)row_size_q8, 0, 1, nullptr, 0}; + DataInfo info{C + first_x, (const char *)B, (size_t)stride_C, row_size_qy, 0, 1, nullptr, 0}; - mm.mul_mat_NxM(ne00, (const char *)A + row_size_qx*first_x, row_size_qx, info, nrc_x, Ny); + mm.mul_mat_NxM(ne00, (const char *)A + row_size_qx*first_x, row_size_qx, info, nrc_x, Ny); - return true; + return true; } + bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11, int typeA, const void * A, const void * B, float * C, long nb1, long nb2, const void * vrow_mapping, int ith, int nth) { const mmid_row_mapping * row_mapping = (const mmid_row_mapping *)vrow_mapping; @@ -213,9 +1031,11 @@ bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11, int typeA, const voi MulMat mm; int row_size_q8; + /* moonll + if (!MulMat::set_mul_mat(typeA, ne00, mm, row_size_q8, Ny)) { return false; - } + }*/ int row_size_qx = ggml_row_size((ggml_type)typeA, ne00); int nrc_x = (Nx + nth - 1)/nth; int first_x = ith*nrc_x; @@ -233,6 +1053,7 @@ bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11, int typeA, const voi #if defined(__AVX512F__) && defined(__AVX512VNNI__) && defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__) #define HAVE_FANCY_SIMD #endif +//#define HAVE_FANCY_SIMD namespace { @@ -257,10 +1078,9 @@ template struct Q8 { } #ifdef HAVE_FANCY_SIMD - inline __m512i load_quants(int iy, int i, int j) const { return 
_mm512_loadu_si512((const __m512i*)y[iy][i].qs + j); } -#else - inline __m256i load_quants(int iy, int i, int j) const { return _mm256_loadu_si256((const __m256i*)y[iy][i].qs + j); } + inline __m512i load_quants64(int iy, int i, int j) const { return _mm512_loadu_si512((const __m512i*)y[iy][i].qs + j); } #endif + inline __m256i load_quants(int iy, int i, int j) const { return _mm256_loadu_si256((const __m256i*)y[iy][i].qs + j); } inline __m256i load_bsums(int iy, int i) const { return _mm256_loadu_si256((const __m256i*)y[iy][i].bsums); } inline float scale(int iy, int i) const { return y[iy][i].d; } @@ -353,6 +1173,23 @@ struct ScaleIQ4XS { const __m128i m32 = _mm_set1_epi16(-32); }; +struct Scales8KBase { + template + inline void accum_mins(const __m128i& mins128, const Q8& q8, int i, float c, __m256 * accd) const { + const __m256i mins = MM256_SET_M128I(_mm_shuffle_epi8(mins128, shuffles[1]), _mm_shuffle_epi8(mins128, shuffles[0])); + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + const __m256i q8s = q8.load_bsums(iy, i); + const __m256i prod = _mm256_madd_epi16(mins, q8s); + accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(c*q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accd[iy]); + } + } + inline __m256i shuffle(__m128i mins) const { + return MM256_SET_M128I(_mm_shuffle_epi8(mins, shuffles[1]), _mm_shuffle_epi8(mins, shuffles[0])); + } + const __m128i shuffles[2] = {_mm_set_epi32(0x07060706, 0x05040504, 0x03020302, 0x01000100), + _mm_set_epi32(0x0f0e0f0e, 0x0d0c0d0c, 0x0b0a0b0a, 0x09080908)}; +}; + template struct BaseDequantizer { BaseDequantizer(const void * vx, size_t bx) : vx(vx), bx(bx) {} @@ -367,6 +1204,16 @@ struct BaseDequantizer { float d; }; +__m128i inline load_iq4nl_values_128() { + static const uint8_t kvalues_iq4nl[16] = {1, 24, 45, 63, 79, 93, 106, 118, 129, 141, 153, 166, 181, 197, 217, 241}; + return _mm_loadu_si128((const __m128i *)kvalues_iq4nl); +} + +__m256i inline load_iq4nl_values_256() { + auto val128 = load_iq4nl_values_128(); + return MM256_SET_M128I(val128, val128); +} + #ifdef HAVE_FANCY_SIMD //====================================== Zen4 ================================================== @@ -434,8 +1281,17 @@ struct DequantizerQ4K final : public BaseDequantizer { Scales8K s8k; }; +/* +moonll DequantizerIQ4XS +*/ + +__m512i inline load_iq4nl_values_512() { + auto val256 = load_iq4nl_values_256(); + return _mm512_inserti32x8(_mm512_castsi256_si512(val256), val256, 1); +} + struct DequantizerIQ4XS final : public BaseDequantizer { - DequantizerIQ4XS(const void * vx, size_t bx) : BaseDequantizer(vx, bx), values(load_values()) {} + DequantizerIQ4XS(const void * vx, size_t bx) : BaseDequantizer(vx, bx), values(load_iq4nl_values_512()) {} template inline void new_block(int i, const Q8& q8, __m256 * accd, __m512i * scales) { d = GGML_FP16_TO_FP32(x[i].d); @@ -444,14 +1300,10 @@ struct DequantizerIQ4XS final : public BaseDequantizer { s8k.accum_mins(scales128, q8, i, -128.f*d, accd); auto scales256 = MM256_SET_M128I(scales128, scales128); auto all_scales = _mm512_inserti32x8(_mm512_castsi256_si512(scales256), scales256, 1); - scales[0] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[0]); - scales[1] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[1]); - } - static __m512i load_values() { - static const uint8_t kvalues_iq4nl[16] = {1, 24, 45, 63, 79, 93, 106, 118, 129, 141, 153, 166, 181, 197, 217, 241}; - auto val128 = _mm_loadu_si128((const __m128i *)kvalues_iq4nl); - auto val256 = MM256_SET_M128I(val128, val128); - return _mm512_inserti32x8(_mm512_castsi256_si512(val256), 
val256, 1); + scales[0] = _mm512_shuffle_epi8(all_scales, shuffles[0]); + scales[1] = _mm512_shuffle_epi8(all_scales, shuffles[1]); + scales[2] = _mm512_shuffle_epi8(all_scales, shuffles[2]); + scales[3] = _mm512_shuffle_epi8(all_scales, shuffles[3]); } inline void prepare(const uint8_t * q4) { bits.prepare64(q4); @@ -467,11 +1319,17 @@ struct DequantizerIQ4XS final : public BaseDequantizer { } Q4Bits bits; - Scales8K s8k; + Scales8KBase s8k; ScaleIQ4XS siq4; const __m512i values; const __m512i permute1 = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0); const __m512i permute2 = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4); + const __m512i shuffles[4] = { + _mm512_inserti32x8(_mm512_set1_epi16(0x0100), _mm256_set1_epi16(0x0302), 1), + _mm512_inserti32x8(_mm512_set1_epi16(0x0504), _mm256_set1_epi16(0x0706), 1), + _mm512_inserti32x8(_mm512_set1_epi16(0x0908), _mm256_set1_epi16(0x0b0a), 1), + _mm512_inserti32x8(_mm512_set1_epi16(0x0d0c), _mm256_set1_epi16(0x0f0e), 1), + }; }; struct HighBit5 { @@ -646,6 +1504,149 @@ static void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInf } } +template +inline void compute_block(int iy, int i, float d, const Q8& q8, const __m512i * values, const __m512i * scales, __m512 * accd) { + const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[0], q8.load_quants64(iy, i, 0)); + const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[1], q8.load_quants64(iy, i, 1)); + const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[2], q8.load_quants64(iy, i, 2)); + const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[3], q8.load_quants64(iy, i, 3)); + auto sumi = _mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2)); + sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4)); + accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]); +} + +template +static void mul_mat_qX_K_q8_K_AVX512(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n % QK_K == 0); + const int nb = n / QK_K; + + Q8 q8(info); + + Dequantizer deq(vx, bx); + + __m256 accm[nrc_y]; + __m512 accd[nrc_y]; + __m512i scales[2]; + + for (int ix = 0; ix < nrc_x; ++ix) { + + for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm512_setzero_ps(); + for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm256_setzero_ps(); + + deq.new_row(ix); + + for (int i = 0; i < nb; ++i) { + + deq.new_block(i, q8, accm, scales); + + for (int iy = 0; iy < nrc_y; ++iy) { + const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[0], q8.load_quants64(iy, i, 0)); + const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[1], q8.load_quants64(iy, i, 1)); + const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[2], q8.load_quants64(iy, i, 2)); + const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[3], q8.load_quants64(iy, i, 3)); + auto sumi = _mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2)); + sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4)); + accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]); + } + + } + + for (int iy = 0; iy < nrc_y; ++iy) { + auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd[iy]), _mm512_extractf32x8_ps(accd[iy], 1)); + info.store(ix, iy, hsum_float_8(_mm256_add_ps(accm[iy], sum256))); + } + + } +} + +template +static void 
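/* The iqX_k variant that follows differs from mul_mat_qX_K_q8_K_AVX512 above mainly in its
   scale handling: it carries four scale vectors per block (scales[4], matching the four
   shuffles produced by DequantizerIQ4XS::new_block) and accumulates with _mm512_maddubs_epi16
   followed by chained _mm512_dpwssd_epi32 instead of the dpbusd/packs pattern used above. */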
mul_mat_iqX_k_q8_K_AVX512(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n % QK_K == 0); + const int nb = n / QK_K; + + Q8 q8(info); + + Dequantizer deq(vx, bx); + + __m256 accm[nrc_y]; + __m512 accd[nrc_y]; + __m512i scales[4]; + + for (int ix = 0; ix < nrc_x; ++ix) { + + for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm512_setzero_ps(); + for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm256_setzero_ps(); + + deq.new_row(ix); + + for (int i = 0; i < nb; ++i) { + + deq.new_block(i, q8, accm, scales); + + for (int iy = 0; iy < nrc_y; ++iy) { + const __m512i p1 = _mm512_maddubs_epi16(deq.bits.values[0], q8.load_quants64(iy, i, 0)); + const __m512i p2 = _mm512_maddubs_epi16(deq.bits.values[1], q8.load_quants64(iy, i, 1)); + const __m512i p3 = _mm512_maddubs_epi16(deq.bits.values[2], q8.load_quants64(iy, i, 2)); + const __m512i p4 = _mm512_maddubs_epi16(deq.bits.values[3], q8.load_quants64(iy, i, 3)); + auto sumi = _mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_setzero_si512(), + p1, scales[0]), p2, scales[1]), p3, scales[2]), p4, scales[3]); + accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]); + } + + } + + for (int iy = 0; iy < nrc_y; ++iy) { + auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd[iy]), _mm512_extractf32x8_ps(accd[iy], 1)); + info.store(ix, iy, hsum_float_8(_mm256_add_ps(accm[iy], sum256))); + } + + } +} + +template +static void mul_mat_qX_K_q8_K_AVX512_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n % QK_K == 0); + const int nb = n / QK_K; + + constexpr int k_nx = 2; + + Q8<1> q8(info); + + Dequantizer deq1(vx, bx); + Dequantizer deq2(vx, bx); + + Dequantizer * deq[k_nx]; + deq[0] = &deq1; + deq[1] = &deq2; + + __m512i scales[2*k_nx]; + + for (int ix = 0; ix < nrc_x; ++ix) { + + auto accd = _mm512_setzero_ps(); + auto accm = _mm256_setzero_ps(); + + for (int kx = 0; kx < k_nx; ++kx) deq[kx]->new_row(ix); + + for (int i = 0; i < nb/k_nx; ++i) { + + for (int kx = 0; kx < k_nx; ++kx) deq[kx]->new_block(k_nx*i+kx, q8, &accm, scales+2*kx); + + for (int kx = 0; kx < k_nx; ++kx) { + compute_block(0, k_nx*i+kx, deq[kx]->d, q8, deq[kx]->bits.values, scales+2*kx, &accd); + } + + } + if (2*(nb/2) < nb) { + int i0 = 2*(nb/2); + deq[0]->new_block(i0, q8, &accm, scales); + compute_block(0, i0, deq[0]->d, q8, deq[0]->bits.values, scales, &accd); + } + + auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd), _mm512_extractf32x8_ps(accd, 1)); + info.store(ix, 0, hsum_float_8(_mm256_add_ps(accm, sum256))); + } +} #else // ===================================== Vanilla AVX2 ===================================== @@ -724,17 +1725,8 @@ struct HighBit3 { __m256i hbits; }; -inline __m256i get_scale_shuffle_8(int i) { - return _mm256_set1_epi16((2*i) | ((2*i+1) << 8)); -} - -inline void set_scales_8(const __m256i& all_scales, int j, __m256i * scales) { - scales[0] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+0)); - scales[1] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+1)); - scales[2] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+2)); - scales[3] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+3)); -} +/* template inline void multiply_add(const Bits& bits, const __m256i * scales, int j, int i, const Q8& q8, __m256i * sumi) { if (j == 0) { @@ -755,7 +1747,7 @@ inline void multiply_add(const Bits& bits, const __m256i * scales, int j, int i, sumi[iy] = _mm256_add_epi32(sumi[iy], 
_mm256_add_epi32(p2, p4)); } } -} +}*/ struct DequantizerQ4K final : public BaseDequantizer { DequantizerQ4K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} @@ -889,22 +1881,14 @@ struct DequantizerQ6K final : public BaseDequantizer { const __m256i mh = _mm256_set1_epi8(0x30); }; -inline __m256i get_scale_shuffle_16(int i) { - static const uint8_t k_shuffle[128] = { - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, - 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, - 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, - 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, - }; - return _mm256_loadu_si256((const __m256i*)k_shuffle + i); -} +inline __m256i get_scale_shuffle_8(int i); + +inline void set_scales_8(const __m256i& all_scales, int j, __m256i* scales); + +inline __m256i get_scale_shuffle_16(int i); + +inline void set_scales_16(const __m256i& all_scales, __m256i* scales); -inline void set_scales_16(const __m256i& all_scales, __m256i * scales) { - scales[0] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(0)); - scales[1] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(1)); - scales[2] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(2)); - scales[3] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(3)); -} template static void mul_mat_qY_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { @@ -1000,6 +1984,8 @@ static void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInf } #endif // Zen4 or vanilla AVX2 + + // // ============================== Legacy quants // @@ -1075,6 +2061,28 @@ struct ScaleHelperQ_0 { template inline float prepare1(const Q * y) const { return GGML_FP16_TO_FP32(y->d); } template inline float prepare1(float d, const Q * y) const { return d*prepare1(y); } }; +template +struct ScaleHelperQ_0_1 { + ggml_half scales8[4]; + template + inline __m256 prepare4(const Q * y) { + for (int j = 0; j < 4; ++j) scales8[j] = y[j].d; + auto s4 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)scales8)); + return _mm256_set_m128(_mm_mul_ps(s4, min), s4); + } + template + inline __m256 prepare4(__m256 other_scales, const Q * y) { + return _mm_mul256_ps(other_scales, prepare4(y)); + } + template inline std::pair prepare1(const Q * y) const { + float d = GGML_FP16_TO_FP32(y->d); + return std::make_pair(d, -d*float(min_value)); + } + std::pair inline prepare1(const std::pair& dm, const block_q8_1 * y) const { + return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->s)); + } + const __m128 min = _mm_set1_ps(float(-min_value)); +}; struct ScaleHelperQ_1 { uint32_t scales8[4]; @@ -1235,6 +2243,12 @@ struct Q8_0_Dequantizer { } }; +struct Q8_0_1_Dequantizer { + inline __m256i dequant(const block_q8_0 * x) const { + return _mm256_add_epi8(_mm256_set1_epi8(127), _mm256_loadu_si256((const __m256i *)x->qs)); + } +}; + struct Q4_0_Dequantizer { Dequantizer4bit b4; const __m256i m8 = _mm256_set1_epi8(-8); @@ -1320,6 +2334,11 @@ struct Q8_0_Unpacker final : public Q_Unpacker, Q8_0_1_Dequantizer> { + Q8_0_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} +// using Sum4T = Sum4TypeQ81; + inline static int block_size() { return QK8_0; } +}; struct Q4_0_Unpacker final : public Q_Unpacker { Q4_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} inline static int block_size() { return 
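/* Sketch of the idea behind the Q8_0_1 helpers above, as far as the code shows: the
   dequantizer shifts the signed q8_0 quants by +127 so they can sit in the unsigned operand of
   maddubs/dpbusd-style dot products, and ScaleHelperQ_0_1 returns the pair (d, -d*min_value)
   so that the spurious min_value contribution can be subtracted back out using the q8_1 block
   sums (y->s). The concrete min_value is whatever the template argument supplies, presumably
   127 here. */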
QK4_0; }
@@ -1353,8 +2372,488 @@ void mul_mat_q8_0_q8_0_T(int n, const void * vx, size_t bx, const DataInfo& info
    }
}
+
+
+
+/*
+moonll
+add helper structs for DequantizerIQ2XXS:
+SimpleBits
+EvenSignHelper
+*/
+struct SimpleBits {
+    __m256i values[4];
+};
+
+// fix for #829: detect support for the AVX512VPOPCNTDQ extension
+#if defined(HAVE_FANCY_SIMD) && defined(__AVX512VPOPCNTDQ__)
+#define HAVE_AVX512_POPCNT 1
+#else
+#define HAVE_AVX512_POPCNT 0
+#endif
+
+struct EvenSignHelper {
+    #if defined HAVE_FANCY_SIMD
+    // #pragma message("Using AVX512VPOPCNTDQ in even sign helper")
+    union sbits_t {
+        __m128i vec;
+        __mmask32 mask[4];
+    };
+    IQK_ALWAYS_INLINE void sign_2_values(__m256i aux, __m256i * values) const {
+        aux = _mm256_and_si256(_mm256_srlv_epi32(aux, shifts), mask);
+
+        // fix for #829: stay compatible with CPUs such as Intel Cascade Lake; if the
+        // AVX512VPOPCNTDQ extension is not available, fall back to the alternative below
+        #if HAVE_AVX512_POPCNT
+        auto pcnt = _mm256_popcnt_epi32(aux);
+
+        #else
+        // fallback implementation using a standard bit count
+        __m256i pcnt;
+        int* pcnt_ptr = reinterpret_cast<int*>(&pcnt);
+        int* aux_ptr = reinterpret_cast<int*>(&aux); // take the address of aux directly to avoid an unnecessary copy
+
+        #pragma unroll 8 // hint the compiler to unroll the loop and improve SIMD throughput
+        for (int i = 0; i < 8; i++) {
+            pcnt_ptr[i] = __builtin_popcount(aux_ptr[i]); // use the compiler's built-in popcount
+        }
+        #endif
+
+        sbits_t sbits;
+        sbits.vec = _mm256_cvtepi32_epi8(_mm256_or_si256(aux, _mm256_slli_epi32(_mm256_and_si256(pcnt, mone), 7)));
+        values[0] = _mm256_mask_sub_epi8(values[0], sbits.mask[0], _mm256_setzero_si256(), values[0]);
+        values[1] = _mm256_mask_sub_epi8(values[1], sbits.mask[1], _mm256_setzero_si256(), values[1]);
+        //auto sign_bits = _mm256_cvtepi32_epi8(_mm256_or_si256(aux, _mm256_slli_epi32(_mm256_and_si256(pcnt, mone), 7)));
+        //const __mmask32 * m32 = (const __mmask32 *)&sign_bits;
+        //values[0] = _mm256_mask_sub_epi8(values[0], m32[0], _mm256_setzero_si256(), values[0]);
+        //values[1] = _mm256_mask_sub_epi8(values[1], m32[1], _mm256_setzero_si256(), values[1]);
+    }
+    const __m256i shifts = _mm256_set_epi32(21, 14, 7, 0, 21, 14, 7, 0);
+    const __m256i mask = _mm256_set1_epi32(127);
+    const __m256i mone = _mm256_set1_epi32(1);
+    #else
+    inline void sign_value(uint32_t aux32, __m256i& value) const {
+        auto signs = _mm256_set_epi64x(keven_signs[(aux32 >> 21) & 127], keven_signs[(aux32 >> 14) & 127],
+                                       keven_signs[(aux32 >> 7) & 127], keven_signs[(aux32 >> 0) & 127]);
+        value = _mm256_sign_epi8(value, signs);
+    }
+    #endif
+};
+
+/*
+moonll: add multiply_add for mul_mat_qX_K_q8_K_IQ_1
+add functions:
+get_scale_shuffle_8
+get_scale_shuffle_16
+set_scales_16
+*/
+
+inline __m256i get_scale_shuffle_8(int i) {
+    return _mm256_set1_epi16((2*i) | ((2*i+1) << 8));
+}
+
+inline void set_scales_8(const __m256i& all_scales, int j, __m256i * scales) {
+    scales[0] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+0));
+    scales[1] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+1));
+    scales[2] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+2));
+    scales[3] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+3));
+}
+
+
+inline __m256i get_scale_shuffle_16(int i) {
+    static const uint8_t k_shuffle[128] = {
+         0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+         4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,  6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
+         8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
+        12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,
+    };
+    return _mm256_loadu_si256((const __m256i*)k_shuffle + i);
+}
+
+inline
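/* The scale-broadcast helpers: get_scale_shuffle_8 builds a byte shuffle that repeats the i-th
   16-bit scale across each 128-bit half, and the k_shuffle table above plays the same role for
   the 16-scale layout, selecting which 16-bit scale each byte position picks up; set_scales_8
   and set_scales_16 then expand a vector of per-block scales into the four __m256i scale
   vectors that multiply_add below pairs with the sub-block products. */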
void set_scales_16(const __m256i& all_scales, __m256i * scales) { + scales[0] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(0)); + scales[1] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(1)); + scales[2] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(2)); + scales[3] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(3)); +} + + +template +inline void multiply_add(const Bits& bits, const __m256i * scales, int j, int i, const Q8& q8, __m256i * sumi) { + if (j == 0) { +#ifdef HAVE_FANCY_SIMD + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + sumi[iy] = _mm256_dpwssd_epi32(_mm256_setzero_si256(), scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 0))); + sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 1))); + sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 2))); + sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 3))); + } +#else + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 0))); + const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 1))); + const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 2))); + const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 3))); + sumi[iy] = _mm256_add_epi32(_mm256_add_epi32(p1, p3), _mm256_add_epi32(p2, p4)); + } +#endif + } else { +#ifdef HAVE_FANCY_SIMD + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 4))); + sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 5))); + sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 6))); + sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 7))); + } +#else + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 4))); + const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 5))); + const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 6))); + const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 7))); + sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p1, p3)); + sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p2, p4)); + } +#endif + } +} + +/* +moonll ad multiply_add_1 for mul_mat_qX_K_q8_K_IQ_1 +add func +set_scales_8_iq +set_scales_16_iq + +add MUL_MAT +mul_mat_qX_K_q8_K_IQ_1 +mul_mat_qX_K_q8_K_IQ_N +mul_mat_qX_K_q8_K_IQ +*/ + +template +inline void multiply_add_1(int j, const Bits& bits, const __m256i * scales, const __m256i * q8, __m256i * sumi) { + if (j == 0) { +#ifdef HAVE_FANCY_SIMD + auto p1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[0], q8[0]); + auto p2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[1], q8[1]); + auto p3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[2], q8[2]); + auto p4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), 
bits.values[3], q8[3]); + sumi[0] = _mm256_dpwssd_epi32(_mm256_setzero_si256(), scales[0], _mm256_packs_epi32(p1, p2)); + sumi[1] = _mm256_dpwssd_epi32(_mm256_setzero_si256(), scales[1], _mm256_packs_epi32(p3, p4)); +#else + const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8[0])); + const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8[1])); + const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8[2])); + const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8[3])); + sumi[0] = _mm256_add_epi32(p1, p3); + sumi[1] = _mm256_add_epi32(p2, p4); +#endif + } else { +#ifdef HAVE_FANCY_SIMD + auto p1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[0], q8[0]); + auto p2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[1], q8[1]); + auto p3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[2], q8[2]); + auto p4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[3], q8[3]); + sumi[0] = _mm256_dpwssd_epi32(sumi[0], scales[0], _mm256_packs_epi32(p1, p2)); + sumi[1] = _mm256_dpwssd_epi32(sumi[1], scales[1], _mm256_packs_epi32(p3, p4)); +#else + const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8[0])); + const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8[1])); + const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8[2])); + const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8[3])); + sumi[0] = _mm256_add_epi32(sumi[0], _mm256_add_epi32(p1, p3)); + sumi[1] = _mm256_add_epi32(sumi[1], _mm256_add_epi32(p2, p4)); +#endif + } +} + + +inline void set_scales_8_iq(int j, const __m256i& all_scales, __m256i * scales) { + //#ifdef HAVE_FANCY_SIMD + auto shuffle = j == 0 ? 
_mm256_set_epi64x(0x0302030203020302, 0x0100010001000100, 0x0302030203020302, 0x0100010001000100) + : _mm256_set_epi64x(0x0b0a0b0a0b0a0b0a, 0x0908090809080908, 0x0b0a0b0a0b0a0b0a, 0x0908090809080908); + scales[0] = _mm256_shuffle_epi8(all_scales, shuffle); + scales[1] = _mm256_shuffle_epi8(all_scales, _mm256_add_epi8(shuffle, _mm256_set1_epi8(4))); + //#else + // set_scales_8(all_scales, j, scales); + //#endif + } + +inline void set_scales_16_iq(const __m256i& all_scales, __m256i * scales) { + #ifdef HAVE_FANCY_SIMD + auto shuffle = _mm256_set_epi64x(0x0706070607060706, 0x0302030203020302, 0x0504050405040504, 0x0100010001000100); + scales[0] = _mm256_shuffle_epi8(all_scales, shuffle); + scales[1] = _mm256_shuffle_epi8(all_scales, _mm256_add_epi8(shuffle, _mm256_set1_epi8(8))); + #else + set_scales_16(all_scales, scales); + #endif + } + +template +static void mul_mat_qX_K_q8_K_IQ_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + const int nb = n / QK_K; + Q8<1> q8(info); + Dequantizer deq(vx, bx); + __m256i scales[2]; + __m256i q8_quants[4]; + for (int ix = 0; ix < nrc_x; ++ix) { + + __m256 accd = _mm256_setzero_ps(); + deq.new_row(ix); + + for (int i = 0; i < nb; ++i) { + + __m256i sumi[2], all_scales[Dequantizer::num_blocks/8]; + deq.new_block(i, all_scales); + + for (int j = 0; j < QK_K/128; ++j) { + deq.prepare(i, j, q8, q8_quants); + if constexpr (Dequantizer::num_blocks == 8) { + set_scales_8_iq(j, all_scales[0], scales); + } else { + set_scales_16_iq(all_scales[j], scales); + } + multiply_add_1(j, deq.bits, scales, q8_quants, sumi); + } + accd = _mm256_fmadd_ps(_mm256_set1_ps(deq.d*q8.scale(0, i)), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi[0], sumi[1])), accd); + } + + info.store(ix, 0, hsum_float_8(accd)); + } + } + + +template +static void mul_mat_qX_K_q8_K_IQ_N(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + const int nb = n / QK_K; + Q8 q8(info); + Dequantizer deq(vx, bx); + __m256i scales[4]; + __m256 accd[nrc_y]; + + for (int ix = 0; ix < nrc_x; ++ix) { + + for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm256_setzero_ps(); + + deq.new_row(ix); + + for (int i = 0; i < nb; ++i) { + + __m256i sumi[nrc_y], all_scales[Dequantizer::num_blocks/8]; + //for (int iy = 0; iy < nrc_y; ++iy) sumi[iy] = _mm256_setzero_si256(); + __m256i mins; + float dmin = deq.new_block(i, all_scales, mins); + for (int iy = 0; iy < nrc_y; ++iy) { + auto bsums = q8.load_bsums(iy, i); + auto prod = _mm256_madd_epi16(mins, bsums); + accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(dmin*q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accd[iy]); + } + + for (int j = 0; j < QK_K/128; ++j) { + deq.prepare(i, j); + if constexpr (Dequantizer::num_blocks == 8) { + set_scales_8(all_scales[0], j, scales); + } else { + set_scales_16(all_scales[j], scales); + } + //multiply_add_iq(deq.bits, scales, j, i, q8, sumi); + multiply_add(deq.bits, scales, j, i, q8, sumi); + } + for (int iy = 0; iy < nrc_y; ++iy) { + const __m256 vd = _mm256_set1_ps(deq.d*q8.scale(iy, i)); + accd[iy] = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi[iy]), accd[iy]); + } + } + + for (int iy = 0; iy < nrc_y; ++iy) { + info.store(ix, iy, hsum_float_8(accd[iy])); + } + } +} + +template +static void mul_mat_qX_K_q8_K_IQ(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n % QK_K == 0); +#ifdef HAVE_FANCY_SIMD + if constexpr (nrc_y == 1) { + mul_mat_qX_K_q8_K_IQ_1(n, vx, bx, info, nrc_x); + } else { + mul_mat_qX_K_q8_K_IQ_N(n, vx, bx, info, nrc_x); + } +#else + mul_mat_qX_K_q8_K_IQ_N(n, vx, 
bx, info, nrc_x); +#endif +} + +/* +moonll iq1s +core func for iq1s mul_mat_iq1_s_q8_K + +*/ + +template +static void mul_mat_iq1_s_q8_K(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + GGML_ASSERT(n%QK_K == 0); + Q8 q8(info); + __m256i qx[8]; + __m256i scales[4]; + __m256 acc[nrc_y] = {}; + auto delta_mask = _mm_set1_epi16(-32768); // to avoid stupid overflow warnings when using 0x8000 + __m256i shuffle0 = _mm256_set_epi64x(0x0302030203020302, 0x0100010001000100, 0x0302030203020302, 0x0100010001000100); + for (int ix = 0; ix < nrc_x; ++ix) { + auto iq1s = (const block_iq1_s *)((const char *)vx + ix*bx); + for (int ibl = 0; ibl < n/QK_K; ++ibl) { + float d = GGML_FP16_TO_FP32(iq1s[ibl].d); + auto qhb = _mm_loadu_si128((const __m128i *)iq1s[ibl].qh); + auto scales128 = _mm_and_si128(_mm_srli_epi16(qhb, 12), _mm_set1_epi16(7)); + scales128 = _mm_add_epi16(_mm_slli_epi16(scales128, 1), _mm_set1_epi16(1)); +#ifdef HAVE_FANCY_SIMD + auto mask = _mm_cmpeq_epi16_mask(_mm_and_si128(qhb, delta_mask), delta_mask); + auto deltas128 = _mm_mask_blend_epi16(mask, _mm_set1_epi16(-7), _mm_set1_epi16(-9)); +#else + auto mask = _mm_cmpeq_epi16(_mm_and_si128(qhb, delta_mask), delta_mask); + auto deltas128 = _mm_or_si128(_mm_and_si128(mask, _mm_set1_epi16(-9)), _mm_andnot_si128(mask, _mm_set1_epi16(-7))); +#endif + deltas128 = _mm_mullo_epi16(scales128, deltas128); + scales128 = _mm_slli_epi16(scales128, 3); + auto deltas_l = _mm_unpacklo_epi16(deltas128, deltas128); + auto deltas_h = _mm_unpackhi_epi16(deltas128, deltas128); + auto deltas = MM256_SET_M128I(deltas_h, deltas_l); // blocks 0,0, 1,1, 2,2, ..., 7,7 + auto all_scales = MM256_SET_M128I(scales128, scales128); + auto shuffle = shuffle0; + for (int ib64 = 0; ib64 < QK_K/64; ++ib64) { + scales[ib64] = _mm256_shuffle_epi8(all_scales, shuffle); + shuffle = _mm256_add_epi8(shuffle, _mm256_set1_epi8(4)); + } + const uint8_t * qs = iq1s[ibl].qs; + const uint16_t * qh = iq1s[ibl].qh; + for (int ib = 0; ib < QK_K/32; ib += 2) { + qx[ib+0] = _mm256_set_epi64x(iq1s_grid_us[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid_us[qs[2] | ((qh[ib+0] << 2) & 0x700)], + iq1s_grid_us[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid_us[qs[0] | ((qh[ib+0] << 8) & 0x700)]); + qx[ib+1] = _mm256_set_epi64x(iq1s_grid_us[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid_us[qs[6] | ((qh[ib+1] << 2) & 0x700)], + iq1s_grid_us[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid_us[qs[4] | ((qh[ib+1] << 8) & 0x700)]); + qs += 8; + } + for (int iy = 0; iy < nrc_y; ++iy) { + auto bsums = q8.load_bsums(iy, ibl); + auto sumi = _mm256_setzero_si256(); + for (int ib64 = 0; ib64 < QK_K/64; ++ib64) { + auto qy1 = q8.load_quants(iy, ibl, 2*ib64+0); + auto qy2 = q8.load_quants(iy, ibl, 2*ib64+1); +#ifdef HAVE_FANCY_SIMD + auto dot1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2*ib64+0], qy1); + auto dot2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2*ib64+1], qy2); + sumi = _mm256_dpwssd_epi32(sumi, scales[ib64], _mm256_packs_epi32(dot1, dot2)); +#else + auto dot1 = _mm256_maddubs_epi16(qx[2*ib64+0], qy1); + auto dot2 = _mm256_maddubs_epi16(qx[2*ib64+1], qy2); + auto dot = _mm256_add_epi16(_mm256_unpacklo_epi64(dot1, dot2), _mm256_unpackhi_epi64(dot1, dot2)); + sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(scales[ib64], dot)); +#endif + } +#ifdef HAVE_FANCY_SIMD + sumi = _mm256_dpwssd_epi32(sumi, bsums, deltas); +#else + sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(bsums, deltas)); +#endif + acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d*q8.scale(iy, ibl)), 
_mm256_cvtepi32_ps(sumi), acc[iy]); + } + } + for (int iy = 0; iy < nrc_y; ++iy) { + info.store(ix, iy, 0.125f*hsum_float_8(acc[iy])); + acc[iy] = _mm256_setzero_ps(); + } + } +} + +/* +moonll iq1s +DequantizerIQ2XXS +DequantizerIQ2XXS is important Dequantizer for DequantizerIQ1_S +*/ + +struct DequantizerIQ2XXS final : public BaseDequantizer { + DequantizerIQ2XXS(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} + + constexpr static int num_blocks = 8; + + union Data { + __m256i vec; + uint32_t val[8]; + }; + + inline __m128i load_scales(int i) { + d = 0.125f * GGML_FP16_TO_FP32(x[i].d); + const uint16_t * a16 = (const uint16_t *)x[i].qs; + auto scales = _mm_srli_epi16(_mm_set_epi16(a16[31], a16[27], a16[23], a16[19], a16[15], a16[11], a16[7], a16[3]), 12); + return _mm_or_si128(_mm_slli_epi16(scales, 1), _mm_set1_epi16(1)); + } + + inline void new_block(int i, __m256i * scales) { + auto sc16 = load_scales(i); + scales[0] = MM256_SET_M128I(sc16, sc16); + } + inline float new_block(int i, __m256i * scales, __m256i& mins) { + auto sc16 = load_scales(i); + mins = scb.shuffle(sc16); + scales[0] = MM256_SET_M128I(sc16, sc16); + return -d*minv; + } + + inline static void make4(const uint32_t * aux32, __m256i * values) { + const uint8_t * aux8 = (const uint8_t *)aux32; + values[0] = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[ 1]], iq2xxs_grid[aux8[ 0]]); + values[1] = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[ 9]], iq2xxs_grid[aux8[ 8]]); + values[2] = _mm256_set_epi64x(iq2xxs_grid[aux8[19]], iq2xxs_grid[aux8[18]], iq2xxs_grid[aux8[17]], iq2xxs_grid[aux8[16]]); + values[3] = _mm256_set_epi64x(iq2xxs_grid[aux8[27]], iq2xxs_grid[aux8[26]], iq2xxs_grid[aux8[25]], iq2xxs_grid[aux8[24]]); + } + + IQK_ALWAYS_INLINE void sign_values(const uint32_t * aux32, __m256i * values) const { +#ifdef HAVE_FANCY_SIMD + esh.sign_2_values(MM256_SET_M128I(_mm_set1_epi32(aux32[3]), _mm_set1_epi32(aux32[1])), values+0); + esh.sign_2_values(MM256_SET_M128I(_mm_set1_epi32(aux32[7]), _mm_set1_epi32(aux32[5])), values+2); +#else + esh.sign_value(aux32[1], values[0]); + esh.sign_value(aux32[3], values[1]); + esh.sign_value(aux32[5], values[2]); + esh.sign_value(aux32[7], values[3]); +#endif + } + inline void make4_signed(const uint32_t * aux32, const __m256i& min_value, __m256i * values) const { + make4(aux32, values); + sign_values(aux32, values); + for (int k = 0; k < 4; ++k) values[k] = _mm256_add_epi8(values[k], min_value); + } + inline void make4(const uint32_t * aux32, __m256i * values, __m256i * q8) const { + make4(aux32, values); + sign_values(aux32, q8); + } + inline void prepare(int i, int j) { + Data data; data.vec = _mm256_loadu_si256((const __m256i *)x[i].qs + j); + make4_signed(data.val, min_value, bits.values); + } + inline void prepare(int i, int j, const Q8<1>& q8, __m256i * q8_quants) { + for (int k = 0; k < 4; ++k) q8_quants[k] = q8.load_quants(0, i, 4*j+k); + Data data; data.vec = _mm256_loadu_si256((const __m256i *)x[i].qs + j); + make4(data.val, bits.values, q8_quants); + } + + constexpr static int minv = 43; + SimpleBits bits; + Scales8KBase scb; + EvenSignHelper esh; + const __m256i min_value = _mm256_set1_epi8(minv); + const __m256i shuffle = _mm256_set_epi32(7, 5, 3, 1, 7, 5, 3, 1); +}; + +/* +moonll +add Q8_0_Unpacker && DequantizerIQ2XXS support +add func mul_mat_qX_K_q8_K_IQ +*/ + template void MulMat::set_functions(MulMat& m) { - if constexpr (std::is_same_v || std::is_same_v) { + if constexpr (std::is_same_v || 
std::is_same_v || + std::is_same_v) { m.funcs[0] = mul_mat_qX_0_q8_0_T; m.funcs[1] = mul_mat_qX_0_q8_0_T; m.funcs[2] = mul_mat_qX_0_q8_0_T; @@ -1364,7 +2863,7 @@ template void MulMat::set_functions(MulMat& m) { m.funcs[6] = mul_mat_qX_0_q8_0_T; m.funcs[7] = mul_mat_qX_0_q8_0_T; } - else if constexpr (std::is_same_v || std::is_same_v) { + else if constexpr (std::is_same_v || std::is_same_v|| std::is_same_v) { m.funcs[0] = mul_mat_qX_1_q8_1_T; m.funcs[1] = mul_mat_qX_1_q8_1_T; m.funcs[2] = mul_mat_qX_1_q8_1_T; @@ -1374,16 +2873,37 @@ template void MulMat::set_functions(MulMat& m) { m.funcs[6] = mul_mat_qX_1_q8_1_T; m.funcs[7] = mul_mat_qX_1_q8_1_T; } - else { + else if constexpr (std::is_same_v) { + m.funcs[0] = mul_mat_qX_K_q8_K_IQ; + m.funcs[1] = mul_mat_qX_K_q8_K_IQ; + m.funcs[2] = mul_mat_qX_K_q8_K_IQ; + m.funcs[3] = mul_mat_qX_K_q8_K_IQ; + m.funcs[4] = mul_mat_qX_K_q8_K_IQ; + m.funcs[5] = mul_mat_qX_K_q8_K_IQ; + m.funcs[6] = mul_mat_qX_K_q8_K_IQ; + m.funcs[7] = mul_mat_qX_K_q8_K_IQ; + } + else { #ifdef HAVE_FANCY_SIMD - m.funcs[0] = mul_mat_qX_K_q8_K_T; - m.funcs[1] = mul_mat_qX_K_q8_K_T; - m.funcs[2] = mul_mat_qX_K_q8_K_T; - m.funcs[3] = mul_mat_qX_K_q8_K_T; - m.funcs[4] = mul_mat_qX_K_q8_K_T; - m.funcs[5] = mul_mat_qX_K_q8_K_T; - m.funcs[6] = mul_mat_qX_K_q8_K_T; - m.funcs[7] = mul_mat_qX_K_q8_K_T; + if constexpr (std::is_same_v) { + m.funcs[0] = mul_mat_iqX_k_q8_K_AVX512; + m.funcs[1] = mul_mat_iqX_k_q8_K_AVX512; + m.funcs[2] = mul_mat_iqX_k_q8_K_AVX512; + m.funcs[3] = mul_mat_iqX_k_q8_K_AVX512; + m.funcs[4] = mul_mat_iqX_k_q8_K_AVX512; + m.funcs[5] = mul_mat_iqX_k_q8_K_AVX512; + m.funcs[6] = mul_mat_iqX_k_q8_K_AVX512; + m.funcs[7] = mul_mat_iqX_k_q8_K_AVX512; + } else { + m.funcs[0] = mul_mat_qX_K_q8_K_AVX512_1; + m.funcs[1] = mul_mat_qX_K_q8_K_AVX512; + m.funcs[2] = mul_mat_qX_K_q8_K_AVX512; + m.funcs[3] = mul_mat_qX_K_q8_K_AVX512; + m.funcs[4] = mul_mat_qX_K_q8_K_AVX512; + m.funcs[5] = mul_mat_qX_K_q8_K_AVX512; + m.funcs[6] = mul_mat_qX_K_q8_K_AVX512; + m.funcs[7] = mul_mat_qX_K_q8_K_AVX512; + } #else if constexpr (std::is_same_v || std::is_same_v || @@ -1410,11 +2930,260 @@ template void MulMat::set_functions(MulMat& m) { } } -bool MulMat::set_mul_mat(int typeA, int ne00, MulMat& mm, int& row_size_q8, int) { +struct QFBase { + #ifdef __AVX512F__ + constexpr static int k_step = 16; + using Data = __m512; + using Acc = __m512; + static inline Data load(const ggml_half * x) { return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)x)); } + static inline Data load(const float * x) { return _mm512_loadu_ps(x); } + static inline Data load(const ggml_bf16_t * x) { + return _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i*)x)), 16)); + } + static inline Acc acc(Acc prev, const Data& y, const Data& x) { + return _mm512_fmadd_ps(y, x, prev); + } + static inline Acc acc_first(const Data& y, const Data& x) { + return _mm512_mul_ps(y, x); + } + static inline Acc add(Acc x, Acc y) { return _mm512_add_ps(x, y); } + static inline float hsum(Acc acc) { + return _mm512_reduce_add_ps(acc); + } + template + static inline Data load4Floats(const Float * x) { + return _mm512_insertf32x4(_mm512_setzero_ps(), load128(x), 0); + } + static inline Acc acc_r4(Acc acc, const Data * xv, const Data& yv) { + acc = _mm512_fmadd_ps(xv[0], _mm512_shuffle_ps(yv, yv, 0x00), acc); + acc = _mm512_fmadd_ps(xv[1], _mm512_shuffle_ps(yv, yv, 0x55), acc); + acc = _mm512_fmadd_ps(xv[2], _mm512_shuffle_ps(yv, yv, 0xaa), acc); + acc = _mm512_fmadd_ps(xv[3], _mm512_shuffle_ps(yv, 
yv, 0xff), acc); + return acc; + } + static inline Acc acc_r4_first(const Data * xv, const Data& yv) { + auto acc = _mm512_mul_ps(xv[0], _mm512_shuffle_ps(yv, yv, 0x00)); + acc = _mm512_fmadd_ps(xv[1], _mm512_shuffle_ps(yv, yv, 0x55), acc); + acc = _mm512_fmadd_ps(xv[2], _mm512_shuffle_ps(yv, yv, 0xaa), acc); + acc = _mm512_fmadd_ps(xv[3], _mm512_shuffle_ps(yv, yv, 0xff), acc); + return acc; + } + static inline __m128 hsum_r4(Acc acc) { + auto sum1 = _mm_add_ps(_mm512_extractf32x4_ps(acc, 0), _mm512_extractf32x4_ps(acc, 1)); + auto sum2 = _mm_add_ps(_mm512_extractf32x4_ps(acc, 2), _mm512_extractf32x4_ps(acc, 3)); + return _mm_add_ps(sum1, sum2); + } + #else + constexpr static int k_step = 8; + using Data = __m256; + using Acc = __m256; + static inline Data load(const ggml_half * x) { return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)x)); } + static inline Data load(const float * x) { return _mm256_loadu_ps(x); } + static inline Data load(const ggml_bf16_t * x) { + return _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i*)x)), 16)); + } + static inline Acc acc(Acc prev, const Data& y, const Data& x) { + return _mm256_fmadd_ps(y, x, prev); + } + static inline Acc add(Acc x, Acc y) { return _mm256_add_ps(x, y); } + static inline Acc acc_r4(Acc acc, const Data * xv, const Data& yv) { + acc = _mm256_fmadd_ps(xv[0], _mm256_shuffle_ps(yv, yv, 0x00), acc); + acc = _mm256_fmadd_ps(xv[1], _mm256_shuffle_ps(yv, yv, 0x55), acc); + acc = _mm256_fmadd_ps(xv[2], _mm256_shuffle_ps(yv, yv, 0xaa), acc); + acc = _mm256_fmadd_ps(xv[3], _mm256_shuffle_ps(yv, yv, 0xff), acc); + return acc; + } + static inline Acc acc_r4_first(const Data * xv, const Data& yv) { + auto acc = _mm256_mul_ps(xv[0], _mm256_shuffle_ps(yv, yv, 0x00)); + acc = _mm256_fmadd_ps(xv[1], _mm256_shuffle_ps(yv, yv, 0x55), acc); + acc = _mm256_fmadd_ps(xv[2], _mm256_shuffle_ps(yv, yv, 0xaa), acc); + acc = _mm256_fmadd_ps(xv[3], _mm256_shuffle_ps(yv, yv, 0xff), acc); + return acc; + } + static inline Acc acc_first(const Data& y, const Data& x) { + return _mm256_mul_ps(y, x); + } + static inline float hsum(Acc acc) { + return hsum_float_8(acc); + } + static inline __m128 hsum_r4(Acc acc) { + return _mm_add_ps(_mm256_castps256_ps128(acc), _mm256_extractf128_ps(acc, 1)); + } + template + static inline Data load4Floats(const Float * x) { + return _mm256_insertf128_ps(_mm256_setzero_ps(), load128(x), 0); + } + #endif + static inline __m128 load128(const ggml_half * x) { return _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)x)); } + static inline __m128 load128(const float * x) { return _mm_loadu_ps(x); } + static inline __m128 load128(const ggml_bf16_t * x) { + return _mm_castsi128_ps(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i*)x)), 16)); + } + }; + template struct QFT final : public QFBase { + constexpr static int nrc = nrc_in; + QFT(const DataInfo& info) { + for (int iy = 0; iy < nrc; ++iy) y[iy] = (const Float *)info.src1_row(iy); + } + QFT(const char * cx, size_t bx) { + for (int iy = 0; iy < nrc; ++iy) y[iy] = (const Float *)(cx + iy*bx); + } + IQK_ALWAYS_INLINE Data load1(int iy, int i) const { return load(y[iy] + k_step*i); } + IQK_ALWAYS_INLINE Data load_tail(int iy, int i) const { return load4Floats(y[iy] + 4*i); } + IQK_ALWAYS_INLINE void load_r4(int ix, int i, Data * xv) const { + xv[0] = load1(ix+0, i); + xv[1] = load1(ix+1, i); + xv[2] = load1(ix+2, i); + xv[3] = load1(ix+3, i); + #ifdef __AVX512F__ + auto t0 = _mm512_unpacklo_ps(xv[0], xv[1]); + auto t1 = 
_mm512_unpacklo_ps(xv[2], xv[3]); + auto t2 = _mm512_unpackhi_ps(xv[0], xv[1]); + auto t3 = _mm512_unpackhi_ps(xv[2], xv[3]); + xv[0] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(t0), _mm512_castps_pd(t1))); + xv[1] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(t0), _mm512_castps_pd(t1))); + xv[2] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(t2), _mm512_castps_pd(t3))); + xv[3] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(t2), _mm512_castps_pd(t3))); + #else + auto t0 = _mm256_unpacklo_ps(xv[0], xv[1]); + auto t1 = _mm256_unpacklo_ps(xv[2], xv[3]); + auto t2 = _mm256_unpackhi_ps(xv[0], xv[1]); + auto t3 = _mm256_unpackhi_ps(xv[2], xv[3]); + xv[0] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(t0), _mm256_castps_pd(t1))); + xv[1] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(t0), _mm256_castps_pd(t1))); + xv[2] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(t2), _mm256_castps_pd(t3))); + xv[3] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(t2), _mm256_castps_pd(t3))); + #endif + } + const Float * y[nrc]; + }; + - if (ne00 % ggml_blck_size(GGML_TYPE_Q8_K) == 0) - row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00); +template +IQK_NOINLINE void mul_mat_Qx_Qy_MxN(int n, const char * cx, size_t bx, int ix0, const DataInfo& info) { + int nb = n/QFBase::k_step; + int nb4 = n/4; + Qy y(info); + Qx x(cx + ix0*bx, bx); + QFBase::Data xv[Qx::nrc]; + QFBase::Acc acc[Qx::nrc*Qy::nrc]; + auto yv = y.load1(0, 0); + for (int ix = 0; ix < Qx::nrc; ++ix) { + xv[ix] = x.load1(ix, 0); + acc[ix] = QFBase::acc_first(yv, xv[ix]); + } + for (int iy = 1; iy < Qy::nrc; ++iy) { + yv = y.load1(iy, 0); + for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc_first(yv, xv[ix]); + } + for (int i = 1; i < nb; ++i) { + yv = y.load1(0, i); + for (int ix = 0; ix < Qx::nrc; ++ix) { + xv[ix] = x.load1(ix, i); + acc[ix] = QFBase::acc(acc[ix], yv, xv[ix]); + } + for (int iy = 1; iy < Qy::nrc; ++iy) { + yv = y.load1(iy, i); + for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc(acc[Qx::nrc*iy + ix], yv, xv[ix]); + } + } + for (int i = (QFBase::k_step/4)*nb; i < nb4; ++i) { + yv = y.load_tail(0, i); + for (int ix = 0; ix < Qx::nrc; ++ix) { + xv[ix] = x.load_tail(ix, i); + acc[ix] = QFBase::acc(acc[ix], yv, xv[ix]); + } + for (int iy = 1; iy < Qy::nrc; ++iy) { + yv = y.load_tail(iy, i); + for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc(acc[Qx::nrc*iy + ix], yv, xv[ix]); + } + } + for (int iy = 0; iy < Qy::nrc; ++iy) for (int ix = 0; ix < Qx::nrc; ++ix) info.store(ix0+ix, iy, QFBase::hsum(acc[Qx::nrc*iy+ix])); +} +// This will handle any of f16 x f32, f32 x f16, f16 x f16, f32 x f32, with computations done +// in f32 (i.e., f16 is first converted to f32). It is easy to extend to computations done in +// f16, but I don't have a CPU capable of f16 vector arithmetic, so not doing it for now. 
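+// Tiling sketch (illustrative only, not a spec of the kernel): on the AVX512 path below, +// QFBase::k_step == 16 floats and k_nx == 5 x-rows per tile. For example, with nrc_x == 12 +// the main loop runs two full tiles (rows 0..4 and 5..9), keeping k_nx * nrc_y accumulators +// live in registers, and the remainder nx == 2 (rows 10..11) is dispatched through the switch on nx. +// Inside a tile, mul_mat_Qx_Qy_MxN consumes n in chunks of k_step floats and finishes any leftover +// four floats at a time via load_tail(). 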
+template +void mul_mat_fX_fY_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + const char * cx = (const char *)vx; + // TBD if we want this + //if constexpr (nrc_y == 1) { + // constexpr int k_nx = 2; + // for (int ix = 0; ix < nrc_x/k_nx; ++ix) { + // mul_mat_Qx_Qy_Mx1, QFT>(n, cx, bx, ix*k_nx, info); + // } + // if (int lastx = k_nx*(nrc_x/k_nx); lastx < nrc_x) { + // int nx = nrc_x - lastx; + // switch (nx) { + // case 1: mul_mat_Qx_Qy_Mx1, QFT>(n, cx, bx, lastx, info); break; + // case 2: mul_mat_Qx_Qy_Mx1, QFT>(n, cx, bx, lastx, info); break; + // case 3: mul_mat_Qx_Qy_Mx1, QFT>(n, cx, bx, lastx, info); break; + // } + // //mul_mat_Qx_Qy_Mx1, QFT>(n, cx, bx, lastx, info); + // } + // return; + //} +#ifdef __AVX512F__ + constexpr int k_nx = 5; +#else + constexpr int k_nx = nrc_y == 1 ? 4 : 2; +#endif + for (int ix = 0; ix < nrc_x/k_nx; ++ix) { + mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, ix*k_nx, info); + } + int last_x = k_nx*(nrc_x/k_nx); + if (last_x == nrc_x) return; + int nx = nrc_x - last_x; +#ifdef __AVX512F__ + switch (nx) { + case 1: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; + case 2: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; + case 3: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; + case 4: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; + } +#else + if constexpr (nrc_y == 1) { + switch (nx) { + case 1: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; + case 2: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; + case 3: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; + } + } else { + switch (nx) { + case 1: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; + } + } +#endif +} + +template +void set_mul_mat_f(MulMat& mm) { + for (auto& f : mm.funcs) f = nullptr; + mm.funcs[0] = mul_mat_fX_fY_T<1, FloatX, FloatY>; + mm.funcs[1] = mul_mat_fX_fY_T<2, FloatX, FloatY>; + mm.funcs[2] = mul_mat_fX_fY_T<3, FloatX, FloatY>; + mm.funcs[3] = mul_mat_fX_fY_T<4, FloatX, FloatY>; + mm.funcs[4] = mul_mat_fX_fY_T<5, FloatX, FloatY>; +#ifndef __AVX512F__ + mm.funcs[5] = mul_mat_fX_fY_T<6, FloatX, FloatY>; +#endif +} + + + +/* +moonll +add typeb TO compare return not expected type of weight matrix +add IQ2XSS +add IQ1_S +add GGML_TYPE_IQ4_XS +*/ + +bool MulMat::set_mul_mat(int typeA, int typeB, int ne00, MulMat& mm, int Ny) { + (void)Ny; + + auto expected_typeB = GGML_TYPE_Q8_K; switch (typeA) { case GGML_TYPE_Q2_K: assert (ne00 % QK_K == 0); @@ -1440,37 +3209,75 @@ bool MulMat::set_mul_mat(int typeA, int ne00, MulMat& mm, int& row_size_q8, int) assert (ne00 % QK_K == 0); MulMat::set_functions(mm); break; + case GGML_TYPE_IQ2_XXS: + assert (ne00 % QK_K == 0); + MulMat::set_functions(mm); + break; case GGML_TYPE_Q4_0: assert (ne00 % QK4_0 == 0); MulMat::set_functions(mm); - row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00); + expected_typeB = GGML_TYPE_Q8_0; break; case GGML_TYPE_Q4_1: assert (ne00 % QK4_1 == 0); MulMat::set_functions(mm); - row_size_q8 = ggml_row_size(GGML_TYPE_Q8_1, ne00); + expected_typeB = GGML_TYPE_Q8_1_X4; break; case GGML_TYPE_Q5_0: assert (ne00 % QK5_0 == 0); MulMat::set_functions(mm); - row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00); + expected_typeB = GGML_TYPE_Q8_0; break; case GGML_TYPE_Q5_1: assert (ne00 % QK5_1 == 0); MulMat::set_functions(mm); - row_size_q8 = ggml_row_size(GGML_TYPE_Q8_1, ne00); + expected_typeB = GGML_TYPE_Q8_1_X4; + break; + case GGML_TYPE_Q8_0: + assert (ne00 % QK8_0 == 0); +#ifdef HAVE_FANCY_SIMD + MulMat::set_functions(mm); + expected_typeB = 
GGML_TYPE_Q8_1_X4; +#else + MulMat::set_functions(mm); + expected_typeB = GGML_TYPE_Q8_0_X4; +#endif + break; + case GGML_TYPE_IQ1_S: + mm.funcs[0] = mul_mat_iq1_s_q8_K<1>; + mm.funcs[1] = mul_mat_iq1_s_q8_K<2>; + mm.funcs[2] = mul_mat_iq1_s_q8_K<3>; + mm.funcs[3] = mul_mat_iq1_s_q8_K<4>; + mm.funcs[4] = mul_mat_iq1_s_q8_K<5>; + mm.funcs[5] = mul_mat_iq1_s_q8_K<6>; + mm.funcs[6] = mul_mat_iq1_s_q8_K<7>; + mm.funcs[7] = mul_mat_iq1_s_q8_K<8>; + #ifdef HAVE_FANCY_SIMD + mm.func16 = mul_mat_iq1_s_q8_K<16>; + #endif + // row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00); + expected_typeB = GGML_TYPE_Q8_K; break; default: + { + printf("case:%d\n", typeA); return false; + } + } - return true; + + + return ggml_type(typeB) == expected_typeB; + } } // namespace - +/* +iq1_s is not supported on arm +*/ #else // __aarch64__ namespace { diff --git a/third_party/llamafile/sgemm.h b/third_party/llamafile/sgemm.h index 09c47d8..abad1ba 100644 --- a/third_party/llamafile/sgemm.h +++ b/third_party/llamafile/sgemm.h @@ -12,10 +12,15 @@ extern "C" { struct ggml_tensor; struct ggml_compute_params; +/* moonll: +extend the old signatures with more params (strides, typeB, ...) +*/ + + +bool iqk_mul_mat(long, long, long, int, const void*, long, int, const void*, long, float*, long, int, int); +bool iqk_mul_mat_zen4(long, long, long, int, const void*, long, int, const void*, long, float*, long, int, int); +bool iqk_mul_mat_arm82(long, long, long, int, const void*, long, int, const void*, long, float*, long, int, int); -bool iqk_mul_mat(long, long, long, int, const void*, const void*, float*, long, int, int); -bool iqk_mul_mat_zen4(long, long, long, int, const void*, const void*, float*, long, int, int); -bool iqk_mul_mat_arm82(long, long, long, int, const void*, const void*, float*, long, int, int); bool iqk_mul_mat_moe(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int); bool iqk_mul_mat_moe_zen4(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int); diff --git a/third_party/llamafile/tinyblas_cpu_sgemm.inc b/third_party/llamafile/tinyblas_cpu_sgemm.inc index 634dc3e..9ed8f35 100644 --- a/third_party/llamafile/tinyblas_cpu_sgemm.inc +++ b/third_party/llamafile/tinyblas_cpu_sgemm.inc @@ -323,20 +323,17 @@ bool llamafile_sgemm(long m, long n, long k, const void* A, long lda, const void #if QK_K == 256 #if defined(__x86_64__) || defined(_M_X64) #if defined(__AVX2__) && (defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)))) - // if (X86_CHECK(AVX2) && X86_CHECK(FMA)) { - if (Btype == GGML_TYPE_Q8_K && Ctype == GGML_TYPE_F32) { - if (iqk_mul_mat(m, n, k * QK_K, Atype, A, B, (float*)C, ldc, ith, nth)) { + /* + moonll: + accept more Btype values here + */ + + if (Ctype == GGML_TYPE_F32) { + if (iqk_mul_mat(m, n, k * ggml_blck_size(ggml_type(Atype)), Atype, A, lda, Btype, B, ldb, (float*)C, ldc, ith, nth)) { return true; } } - if ((Btype == GGML_TYPE_Q8_0 || Btype == GGML_TYPE_Q8_1) && Ctype == GGML_TYPE_F32) { - // assert(QK8_0 == QK8_1 == QK4_0 == QK4_1 == QK5_0 == QK5_1 == 32); - assert((QK8_0 == 32) && (QK8_1 == 32) && (QK4_0 == 32) && (QK4_1 == 32) && (QK5_0 == 32) && (QK5_1 == 32)); - if (iqk_mul_mat(m, n, k * QK8_0, Atype, A, B, (float*)C, ldc, ith, nth)) { - return true; - } - } - // } + #endif #elif defined __aarch64__ && defined __ARM_FEATURE_DOTPROD && !defined _MSC_VER if (Btype == GGML_TYPE_Q8_K && Ctype == GGML_TYPE_F32) {