Merge pull request #57 from UnicornChan/develop-0.1.3

[feature] release 0.1.3
This commit is contained in:
UnicornChan 2024-08-29 01:57:34 +08:00 committed by GitHub
commit 233bbb8c55
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
58 changed files with 11709 additions and 374 deletions

View file

@ -29,11 +29,6 @@ jobs:
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
@ -52,12 +47,6 @@ jobs:
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
@ -76,12 +65,6 @@ jobs:
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
@ -98,10 +81,6 @@ jobs:
- { os: windows-2022, pyver: '3.12', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.12', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: windows-2022, pyver: '3.12', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.12', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: windows-2022, pyver: '3.12', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: windows-2022, pyver: '3.12', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: windows-2022, pyver: '3.12', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: windows-2022, pyver: '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: windows-2022, pyver: '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.12', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
@ -114,10 +93,6 @@ jobs:
- { os: windows-2022, pyver: '3.11', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.11', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: windows-2022, pyver: '3.11', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.11', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: windows-2022, pyver: '3.11', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: windows-2022, pyver: '3.11', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: windows-2022, pyver: '3.11', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: windows-2022, pyver: '3.11', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: windows-2022, pyver: '3.11', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.11', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
@ -130,10 +105,6 @@ jobs:
- { os: windows-2022, pyver: '3.10', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.10', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: windows-2022, pyver: '3.10', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.10', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: windows-2022, pyver: '3.10', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: windows-2022, pyver: '3.10', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: windows-2022, pyver: '3.10', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: windows-2022, pyver: '3.10', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: windows-2022, pyver: '3.10', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.10', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
@ -219,6 +190,11 @@ jobs:
$env:CUDA_PATH = "$env:CUDA_PATH/Library"
$env:CUDA_HOME = $env:CUDA_PATH
$env:PATH = "$env:CUDA_PATH/bin;" + $env:PATH
$directory = "$env:CUDA_PATH/lib/x64/"
if (-not (Test-Path -Path $directory)) {
New-Item -ItemType Directory -Path $directory
Write-Output "Directory '$directory' created."
}
cp $env:CUDA_PATH/lib/*.lib $env:CUDA_PATH/lib/x64/
$env:INCLUDE =$env:CUDA_PATH + "/include/targets/x64;" + $env:INCLUDE

3
.gitignore vendored
View file

@ -17,4 +17,5 @@ compile_commands.json
*dist/
ktransformers/server/local_store/
ktransformers/server_test1.db
*.patch
*.patch
local_chat_djw.py

View file

@ -1,18 +1,17 @@
<div align="center">
<!-- <h1>KTransformers</h1> -->
<p align="center">
<picture>
<picture>
<img alt="KTransformers" src="https://github.com/user-attachments/assets/d5a2492f-a415-4456-af99-4ab102f13f8b" width=50%>
</picture>
</picture>
</p>
</p>
<h3>A Flexible Framework for Experiencing Cutting-edge LLM Inference Optimizations</h3>
<strong><a href="#show-cases">🔥 Show Cases</a> | <a href="#quick-start">🚀 Quick Start</a> | <a href="#tutorial">📃 Tutorial</a> | <a href="https://github.com/kvcache-ai/ktransformers/discussions">💬 Discussion </a> </strong>
</div>
<h2 id="intro">🎉 Introduction</h2>
KTransformers, pronounced as Quick Transformers, is designed to enhance your 🤗 <a href="https://github.com/huggingface/transformers">Transformers</a> experience with advanced kernel optimizations and placement/parallelism strategies.
<br/><br/>
@ -22,17 +21,43 @@ interface, RESTful APIs compliant with OpenAI and Ollama, and even a simplified
<br/><br/>
Our vision for KTransformers is to serve as a flexible platform for experimenting with innovative LLM inference optimizations. Please let us know if you need any other features.
<h2 id="Updates"> Updates</h2>
<h2 id="Updates">🔥 Updates</h2>
* **Aug 28, 2024**: Support 1M context under the InternLM2.5-7B-Chat-1M model, utilizing 24GB of VRAM and 150GB of DRAM.
* **Aug 28, 2024**: Decrease DeepseekV2's required DRAM from 20G to 10G.
* **Aug 15, 2024**: Update detailed [TUTORIAL](doc/en/injection_tutorial.md) for injection and multi-GPU.
* **Aug 14, 2024**: Support llamfile as linear backend,
* **Aug 14, 2024**: Support llamafile as linear backend.
* **Aug 12, 2024**: Support multiple GPU; Support new model: mixtral 8\*7B and 8\*22B; Support q2k, q3k, q5k dequant on gpu.
* **Aug 9, 2024**: Support windows native.
<h2 id="show-cases">🔥 Show Cases</h2>
<h3>GPT-4-level Local VSCode Copilot on a Desktop with only 24GB VRAM</h3>
<h3>1M Context Local Inference on a Desktop with Only 24GB VRAM</h3>
<p align="center">
https://github.com/user-attachments/assets/a865e5e4-bca3-401e-94b8-af3c080e6c12
* **1M Context InternLM 2.5 7B**: Operates at full bf16 precision, utilizing 24GB VRAM and 150GB DRAM, which is feasible on a local desktop setup. It achieves a 92.88% success rate on the 1M "Needle In a Haystack" test and 100% on the 128K NIAH test.
<p align="center">
<picture>
<img alt="Single Needle Retrieval 128K" src="./doc/assets/needle_128K.png" width=100%>
</picture>
</p>
<p align="center">
<picture>
<img alt="Single Needle Retrieval 1000K" src="./doc/assets/needle_1M.png" width=100%>
</picture>
</p>
* **Enhanced Speed**: Reaches 16.91 tokens/s for generation with a 1M context using sparse attention, powered by llamafile kernels. This method is over 10 times faster than llama.cpp's full-attention approach.
* **Flexible Sparse Attention Framework**: Offers a flexible block sparse attention framework for CPU offloaded decoding. Compatible with SnapKV, Quest, and InfLLm. Further information is available [here](./doc/en/long_context_tutorial.md).
<div>
<h3>GPT-4-level Local VSCode Copilot on a Desktop with only 24GB VRAM</h3>
</div>
https://github.com/user-attachments/assets/0b9fa2da-66f0-48eb-b4b9-f0e1f06f8927
</p>
@ -54,7 +79,6 @@ https://github.com/user-attachments/assets/4c6a8a38-05aa-497d-8eb1-3a5b3918429c
</p>
<strong>More advanced features are coming soon, so stay tuned!</strong>
<h2 id="quick-start">🚀 Quick Start</h2>
@ -89,17 +113,21 @@ Some preparation:
```
- Linux-x86_64 with gcc, g++ and cmake
```sh
sudo apt-get update
sudo apt-get install gcc g++ cmake ninja-build
```
- We recommend using [Conda](https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh) to create a virtual environment with Python=3.11 to run our program.
```sh
conda create --name ktransformers python=3.11
conda activate ktransformers # you may need to run conda init and reopen shell first
```
- Make sure that PyTorch, packaging, and ninja are installed
```
pip install torch packaging ninja
```
@ -107,37 +135,44 @@ Some preparation:
<h3>Installation</h3>
1. Use a Docker image, see [documentation for Docker](./doc/en/docker.md)
2. You can install using Pypi (for linux):
2. You can install using Pypi (for linux):
```
pip install ktransformers --no-build-isolation
```
For Windows, we provide a pre-compiled whl package: [ktransformers-0.1.1+cu125torch24avx2-cp311-cp311-win_amd64.whl](https://github.com/kvcache-ai/ktransformers/releases/download/v0.1.1/ktransformers-0.1.1+cu125torch24avx2-cp311-cp311-win_amd64.whl), which requires CUDA 12.5, torch 2.4, and Python 3.11; more pre-compiled packages are being produced.
3. Or you can download source code and compile:
- init source code
```sh
git clone https://github.com/kvcache-ai/ktransformers.git
cd ktransformers
git submodule init
git submodule update
```
- [Optional] If you want to run with the website, please [compile the website](./doc/en/api/server/website.md) before executing ```bash install.sh```
- Compile and install (for Linux)
```
bash install.sh
```
- Compile and install (for Windows)
```
install.bat
```
```
<h3>Local Chat</h3>
We provide a simple command-line local chat Python script that you can run for testing.
> Note that this is a very simple test tool that only supports single-turn chat without any memory of previous input. If you want to try the full ability of the model, you may go to [RESTful API and Web UI](#id_666). We use the DeepSeek-V2-Lite-Chat-GGUF model as an example here, but we also support other models; you can replace it with any other model that you want to test.
<h4>Run Example</h4>
@ -162,23 +197,30 @@ python -m ktransformers.local_chat --model_path deepseek-ai/DeepSeek-V2-Lite-Cha
# python ktransformers.local_chat --model_path ./DeepSeek-V2-Lite --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
```
It features the following arguments:
- `--model_path` (required): Name of the model (such as "deepseek-ai/DeepSeek-V2-Lite-Chat", which will automatically download configs from [Hugging Face](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite)). If you already have local files, you may use that path directly to initialize the model.
> Note: <strong>.safetensors</strong> files are not required in the directory. We only need config files to build the model and tokenizer.
- `--gguf_path` (required): Path of a directory containing GGUF files, which can be downloaded from [Hugging Face](https://huggingface.co/mzwing/DeepSeek-V2-Lite-Chat-GGUF/tree/main) (we only support q4_k_m and q8_0 for now; more formats are coming soon).
- `--optimize_rule_path` (required except for Qwen2Moe and DeepSeek-V2): Path of the YAML file containing optimize rules. There are two rule files pre-written in the [ktransformers/optimize/optimize_rules](ktransformers/optimize/optimize_rules) directory for optimizing DeepSeek-V2 and Qwen2-57B-A14B, two SOTA MoE models.
- `--max_new_tokens`: Int (default=1000). Maximum number of new tokens to generate.
- `--cpu_infer`: Int (default=10). The number of CPUs used for inference. Should ideally be set to the (total number of cores - 2).
<h3 id="supported-model"> Supported Model</h3>
| Model Name | Model Size | VRAM | Minimum DRAM | Recommended DRAM |
| ---- | ---- | ---- | ---- | ---- |
| DeepSeek-V2-q4_k_m | 133G | 24G | 136G | 192G |
| Qwen2-57B-A14B-Instruct-q4_k_m | 33G | 8G | 34G | 64G |
| DeepSeek-V2-Lite-q4_k_m | 9.7G | 3G | 13G | 16G |
| Model Name | Model Size | VRAM | Minimum DRAM | Recommended DRAM |
| ------------------------------ | ---------- | ----- | --------------- | ----------------- |
| DeepSeek-V2-q4_k_m | 133G | 10G | 136G | 192G |
| Qwen2-57B-A14B-Instruct-q4_k_m | 33G | 8G | 34G | 64G |
| DeepSeek-V2-Lite-q4_k_m | 9.7G | 3G | 13G | 16G |
| Mixtral-8x7B-q4_k_m | 25G | 1.6G | 51G | 64G |
| Mixtral-8x22B-q4_k_m | 80G | 4G | 86.1G | 96G |
| InternLM2.5-7B-Chat-1M | 15.5G | 15.5G | 8G(32K context) | 150G (1M context) |
More will come soon. Please let us know which models you are most interested in.
@ -188,7 +230,6 @@ Be aware that you need to be subject to their corresponding model licenses when
<details>
<summary>Click To Show how to run other examples</summary>
* Qwen2-57B
```sh
@ -208,6 +249,7 @@ python -m ktransformers.local_chat --model_name Qwen/Qwen2-57B-A14B-Instruct --g
```
* DeepseekV2
```sh
mkdir DeepSeek-V2-Chat-0628-GGUF && cd DeepSeek-V2-Chat-0628-GGUF
# Download weights
@ -221,8 +263,11 @@ cd ..
python -m ktransformers.local_chat --model_name deepseek-ai/DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF
# If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try
# GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-V2-Chat-0628
# python -m ktransformers.local_chat --model_path ./DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF
```
| model name | weights download link |
@ -245,11 +290,15 @@ Start without website:
```sh
ktransformers --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path /path/to/DeepSeek-V2-Lite-Chat-GGUF --port 10002
```
Start with website:
```sh
ktransformers --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path /path/to/DeepSeek-V2-Lite-Chat-GGUF --port 10002 --web True
```
Or, if you want to start the server with transformers, the model_path should include safetensors files:
```bash
ktransformers --type transformers --model_path /mnt/data/model/Qwen2-0.5B-Instruct --port 10002 --web True
```
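Because the server's RESTful API aims to be OpenAI-compatible, you can also exercise it programmatically. The snippet below is a minimal sketch using the `openai` Python client; the `/v1` base path, the dummy API key, and the model name are assumptions for illustration, so adjust them to what your deployment actually exposes (see the server documentation linked below).

```python
# Minimal sketch of calling the OpenAI-compatible endpoint; paths and names are assumptions.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:10002/v1",  # assumed base path of the local server
    api_key="not-needed",                  # placeholder; a local server may not check keys
)

response = client.chat.completions.create(
    model="DeepSeek-V2-Lite-Chat",         # placeholder model name
    messages=[{"role": "user", "content": "Hello! Who are you?"}],
)
print(response.choices[0].message.content)
```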
@ -264,10 +313,9 @@ Access website with url [http://localhost:10002/web/index.html#/chat](http://loc
More information about the RESTful API server can be found [here](doc/en/api/server/server.md). You can also find an example of integrating with Tabby [here](doc/en/api/server/tabby.md).
<h2 id="tutorial">📃 Brief Injection Tutorial</h2>
At the heart of KTransformers is a user-friendly, template-based injection framework.
This allows researchers to easily replace original torch modules with optimized variants. It also simplifies the process of combining multiple optimizations, allowing the exploration of their synergistic effects.
This allows researchers to easily replace original torch modules with optimized variants. It also simplifies the process of combining multiple optimizations, allowing the exploration of their synergistic effects.
</br>
<p align="center">

BIN
doc/assets/needle_128K.png Normal file


BIN
doc/assets/needle_1M.png Normal file


View file

@ -0,0 +1,316 @@
# KVCache Long Context
## TL;DR
Training larger models and supporting longer text sequences are currently the two most widely agreed-upon directions toward achieving AGI. After lowering the barrier for local inference with trillion-parameter MoE models, the second showcase scenario for KTransformers is reducing the inference barrier for ultra-long context sequences. Recently, both ChatGLM and InternLM have released open-source models supporting 1M tokens of context. This article will use InternLM2.5-7B-Chat-1M as an example to introduce a method that leverages the sparsity of attention to accelerate long-text inference on heterogeneous CPU/GPU systems.
After optimization, KTransformers has achieved native-precision inference for 128K and even 1M tokens of context on a single 24GB GPU with CPU/DRAM support. In the 128K context scenario, the generation speed is 7.1 times faster than llama.cpp, while also achieving 100% accuracy on relatively simple test sets like "needle in a haystack" and "passkey". On the more challenging dataset kvretrieval, through flexible framework configurations, we can achieve a **6.22x speedup** during inference while obtaining even higher scores than running the original model directly (**21.2 -> 24.4**). In the 1M context scenario on a single 24GB GPU, KTransformers can similarly achieve a 16 tokens/s inference speed, nearly 10 times faster than llama.cpp under the same conditions, with the "needle in a haystack" evaluation score even surpassing the original model (**89.31 -> 92.88**).
Project url: https://github.com/kvcache-ai/ktransformers
## Mathematical Principles: The Computational Overhead of Long-Text Inference and the Sparsity in Attention Caused by Softmax
As the demand for longer context windows increases, not only have commercial large models like Kimi and Claude/Gemini started supporting increasingly longer context windows, but open-source models have also begun to catch up. Notably, both ChatGLM 4 and InternLM 2.5 have released versions that are under 10 billion parameters but support up to 1 million tokens of context. However, despite the relatively small size of these models, the enormous KVCache required for such ultra-long contexts still prevents local users from practically running these models. As shown in the figure below, while the InternLM2.5-7B-Chat-1M model weights only require 15.49GB of GPU memory, an additional 145.49GB is needed to store the entire 1M-token KVCache, which is clearly beyond the memory capacity of local users. Even when using the KVCache Offload feature of llama.cpp to offload the KVCache to CPU/DRAM, barely making the model runnable, performance remains unacceptable due to the need to fully scan the entire KVCache each time a single token is generated.
| <img title="" src="../assets/internlm_memory.png" alt="internlm_memory" width="882"> | <img src="../assets/SparQ_attention.png" title="" alt="sparQ" width="691"> |
| ------------------------------------------------------------------------------------ | -------------------------------------------------------------------------- |
Fortunately, many studies have noticed that attention distributions during the inference phase tend to be **sparse**. For example, the right figure shows SparQ's experimental statistics on LLaMa 7B, where less than 1% of tokens in a 3k context have relatively high attention scores. Similar conclusions appear in many other papers, such as H2O, Quest, InfLLM, and SnapKV, and we have further validated them through long-text experiments with InternLM2.5-7B-1M. Although the proportion is not as extreme as 1%, the softmax operation in attention inherently concentrates weight on a small set of tokens, so in principle, if we can identify in advance which tokens have high attention scores, scanning less than 5% of the tokens should suffice to essentially reproduce the original result.
Thus, the problem narrows down to how to quickly identify these tokens with high attention scores without scanning them all. In the following sections, we will first briefly survey several key related papers, then summarize and propose a general framework we designed and implemented within KTransformers—a highly efficient sparse attention operator for CPUs.
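To make this concrete, the toy example below (an illustration we added, not KTransformers code) builds a query that is strongly aligned with a handful of keys, then compares full attention with an oracle that keeps only the top 5% of positions by score; the sparse output closely matches the full one.

```python
import torch

torch.manual_seed(0)
seq_len, head_dim = 8192, 128
k = torch.randn(seq_len, head_dim)
v = torch.randn(seq_len, head_dim)
# Emulate the peaked score distributions seen in practice: the query is
# strongly aligned with a few "important" keys.
q = 3.0 * (k[100] + k[2048] + k[7000]) + torch.randn(head_dim)

scores = (k @ q) / head_dim**0.5
full_out = torch.softmax(scores, dim=-1) @ v

# Oracle sparse attention: keep only the top 5% of positions by score.
keep = int(0.05 * seq_len)
idx = scores.topk(keep).indices
sparse_out = torch.softmax(scores[idx], dim=-1) @ v[idx]

# Cosine similarity is very close to 1: scanning ~5% of the cache suffices here.
print(torch.nn.functional.cosine_similarity(full_out, sparse_out, dim=0).item())
```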
## Related Papers and Conclusions
### Prune or Retrieval
Based on the aforementioned points, we studied papers from recent years related to sparse selection in KVCache. The earliest of these is the paper H2O, which suggested that the attention distribution during inference is sparse and that only 5% of the KVCache is needed during inference. Following this, a series of works built on H2O's approach by designing more complex methods for selecting tokens that perform better in different scenarios. These methods are quite reasonable for single-word inference. However, as we previously explored in the Mooncake project, **we believe that the future trend is to precompute reusable KVCache as much as possible, and then use it to answer different questions.** This "compute once, use many" approach aims to reduce computational costs. Therefore, with this goal in mind, we prefer not to delete any tokens from the KVCache, or at least not remove a significant portion of them, to ensure that different questions can focus on different parts of the context in the future.
![InfLLM Framework](../assets/InfLLM_framework.png)
We further investigated related research, among which InfLLM proposed a very promising framework. Not only does it recognize that attention is sparse, but it also suggests that overly long contexts can cause attention to be dispersed into irrelevant noise, thereby reducing the model's ability to focus on key information. To address this issue, InfLLM introduces an external memory module (Memory Units) to store the context's KVCache. In each computation step, the most relevant semantic information is retrieved from this external memory module to participate in the calculation, thus enhancing the model's ability to handle long-context inference.
Specifically, InfLLM organizes the external memory module using semantic blocks composed of neighboring tokens and employs a sliding window mechanism during computation. In each step, it selects only the semantic blocks at the head of the context (Initial Tokens), the blocks near the current token (Local Tokens), and a few blocks with the highest semantic similarity to the current token to participate in the attention calculation. As shown in Equation 1, to efficiently retrieve the blocks with the highest similarity, InfLLM selects, within each block, the few representative tokens whose scores $r_m$ are the highest, and then uses Equation 2 to calculate the semantic similarity between the current token and each semantic block.
![InfLLM Equation](../assets/InfLLM_equation.jpg)
Compared to the previously mentioned H2O, the differences in InfLLM are as follows:
1. The KVCache is not discarded but stored in memory and dynamically loaded onto the GPU during inference.
2. KVCache is managed at the granularity of blocks rather than tokens, with each block selecting a few tokens as its representative index tokens.
InfLLM's proposed method aligns with our "compute once, use many" approach of reusing KVCache. The external memory units in this method can be offloaded to CPU/DRAM or even SSD storage, allowing different parts to be selected for computation based on the specific question. This significantly improves the efficiency of attention computation.
### Other Improvements
Similarly, after InfLLM, Quest also manages tokens at the granularity of blocks. Quest analyzed the recall rate of key tokens in H2O and full attention, finding that the Top-10 attention score token recall rate for the H2O algorithm is around 50%, which indicates that too much key information was lost. To improve the recall rate of key tokens, Quest chooses two "representative tokens" from each block for retrieval. In the prefill stage, each KVCache block records the maximum and minimum values for each channel, as shown in the figure below under "Reduced Keys," which contains the element-wise min key and element-wise max key.
During the attention computation stage, the dot product is computed between the current query vector and the max key and min key of each KVCache block, respectively. Then, for each channel, the maximum value between the two resulting product vectors is selected and summed to serve as the upper bound of the relevance score for that KVCache block, as shown in stage 1 of the diagram. Based on the relevance scores, the top-k KVCache blocks are selected to participate in the attention computation, as illustrated in stage 2 of the diagram.
![Quest Framework](../assets/Quest_framework.png)
Compared to InfLLM, Quest does not take heterogeneous architectures into account. Instead, it assumes that all KVCache can still fit into memory, simply leveraging sparse attention to accelerate the inference process. Ultimately, Quest achieves a 7.03x speedup in attention computation and a 2.23x improvement in end-to-end inference latency.
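As a small sketch of this scoring rule (our paraphrase under assumed shapes, not Quest's reference implementation): each block stores element-wise min and max keys at prefill time, and at decode time the per-channel maximum of `q * min_key` and `q * max_key` is summed into an upper bound on the block's attention score.

```python
import torch

num_blocks, block_size, head_dim, top_k = 64, 128, 128, 8
keys = torch.randn(num_blocks, block_size, head_dim)
query = torch.randn(head_dim)

# Prefill: each KVCache block keeps its "reduced keys".
min_key = keys.min(dim=1).values  # (num_blocks, head_dim)
max_key = keys.max(dim=1).values  # (num_blocks, head_dim)

# Stage 1: per-channel upper bound of q·k over any key in the block,
# summed across channels to bound the block's relevance.
upper_bound = torch.maximum(query * min_key, query * max_key).sum(dim=-1)

# Stage 2: only the top-k blocks take part in the exact attention computation.
selected = upper_bound.topk(top_k).indices
print(selected)
```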
Going further, SnapKV proposes retaining two parts of the tokens during the prefill stage, as shown in the diagram below with the orange and green segments. The difference from InfLLM lies only in the method of selecting the middle tokens. SnapKV selects tokens at the token level rather than the block level, with the score calculation being similar to H2O, i.e., $$softmax(\frac{qk^T}{\sqrt{d_k}})$$. However, when summing across columns, only the rows within the final green window are selected for computation, corresponding to the Local Tokens section in InfLLM. Additionally, SnapKV introduces a pooling operation on top of attention, which the paper explains as ensuring that the recalled tokens retain more complete semantic information.
This approach in SnapKV involves a one-time selection during the inference phase, after which only the selected tokens are used for attention computation, while the rest of the KVCache is discarded.
![SnapKV Framework](../assets/SnapKV_framework.png)
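The selection step can be summarized with the short sketch below (assumed shapes and pooling size; this is our condensation, not the paper's reference code): scores from the final observation window are summed per prefix position, smoothed by pooling, and the highest-voted tokens are retained.

```python
import torch
import torch.nn.functional as F

seq_len, window, head_dim, keep = 8192, 64, 128, 1024
q_window = torch.randn(window, head_dim)          # queries of the last `window` tokens
k_prefix = torch.randn(seq_len - window, head_dim)

# softmax(q k^T / sqrt(d_k)), restricted to the observation-window rows.
scores = torch.softmax(q_window @ k_prefix.T / head_dim**0.5, dim=-1)
vote = scores.sum(dim=0)                          # one accumulated score per prefix token

# Pooling keeps small neighborhoods together so recalled tokens stay semantically intact.
vote = F.max_pool1d(vote[None, None], kernel_size=7, stride=1, padding=3)[0, 0]
selected_tokens = vote.topk(keep).indices         # tokens kept for the decoding phase
print(selected_tokens.shape)
```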
Other related papers include PyramidKV, which observed that attention scores exhibit a pyramid-shaped distribution across attention layers. In lower attention layers, attention is widely distributed, while in higher layers, the attention scores for a few key tokens become increasingly prominent. Therefore, PyramidKV allocates more KVCache storage space to lower layers and less space to higher layers.
MagicPiG, based on Locality-Sensitive Hashing (LSH), proposes a dynamic KVCache management strategy. First, it uses SnapKV to select a portion of important tokens to be stored in the GPU, while the KVCache of other tokens is placed in memory. By leveraging the high efficiency of LSH in high-dimensional space searches and the multithreading capabilities of CPUs, MagicPiG retrieves KVCache from memory that is similar to the current query and loads it into memory for inference. Compared to the earlier methods like InfLLM, Quest, and SnapKV, MagicPiG does not need to scan all representative tokens and select the top-k KVCache. Instead, it utilizes the mathematical properties of LSH, which not only simulates attention scores but also allows for identifying important KVCache with low overhead and high speed.
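The LSH trick MagicPiG relies on can be illustrated with a SimHash toy (random hyperplanes; our illustration, not MagicPiG's implementation): keys whose sign patterns agree with the query's on most hyperplanes tend to have high cosine similarity with it, so likely-important cache entries can be found without scoring every key.

```python
import torch

torch.manual_seed(0)
num_keys, head_dim, num_bits = 10000, 128, 64
keys = torch.randn(num_keys, head_dim)
query = torch.randn(head_dim)

# SimHash: the signature of a vector is its sign pattern w.r.t. random hyperplanes.
planes = torch.randn(num_bits, head_dim)
key_sigs = keys @ planes.T > 0        # (num_keys, num_bits) boolean signatures
query_sig = planes @ query > 0        # (num_bits,)

# Fraction of matching bits approximates angular similarity; no full q·k scan needed.
agreement = (key_sigs == query_sig).float().mean(dim=-1)
candidates = agreement.topk(100).indices
print(candidates.shape)
```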
The above are just descriptions of some key points. For more detailed explanations, you can refer to the existing articles on Zhihu in Chinese:
- https://zhuanlan.zhihu.com/p/701580870
- https://zhuanlan.zhihu.com/p/714288577
## KTransformers CPU Sparse Attn Framework
### Framework Prototype
Based on the introduction of the above papers, we have distilled the following key points:
- The distribution of attention weights is sparse, and useless KVCache may introduce noise, which could actually reduce performance during the inference stage.
- For the KVCache eviction strategy during the inference stage, the common approach is to retain the tokens from the beginning and the end of the prompt, while designing algorithms to select the tokens from the middle portion. One of the main factors affecting the model's performance is the ability to accurately identify the key tokens.
- Managing the middle portion of tokens in blocks can improve memory swapping and attention computation efficiency, and smaller blocks do not seem to perform worse than token-level granularity.
- The tokens that each attention layer focuses on during inference differ, and even the allocated KVCache capacity for different layers should vary.
Based on these insights and inspirations, we developed a general framework for implementing sparse CPU attention operators during the inference phase. In the prefill stage, we use chunked prefill, loading only one layer of KVCache into GPU memory at a time for computation. Once completed, the KVCache is stored on CPU/DRAM. In the subsequent decode stage, instead of swapping KVCache in and out, the sparse attention operator runs directly on the CPU. **This significantly reduces the minimum GPU memory requirements, making local 128K or even 1M token contexts possible.**
Specifically during the generation phase, we implemented the entire framework as shown in the diagram below.
![KTransformers long context v1](../assets/KTransformers_long_context_v1.png)
We organized the KVCache in units of blocks. Specifically:
- **KVCache Partitioning:** A complete input prompt is divided into three configurable parts: Initial, Context, and Local. During the computation process, the Initial/Local parts will be fully attended to, while the Context part will be sparsely retrieved. This approach is based on findings from many papers (such as streamingLLM and Minference) which mention the existence of "attention sinks," where higher attention weights are often found at the beginning and the end of the sequence.
- **Context Block Partitioning:** For the middle Context, we follow the InfLLM approach by dividing it into blocks based on a configurable fixed number of tokens. Each block can select 1 to k tokens as its representative tokens. During the actual inference phase, the Context blocks that require attention are selected based on these representative tokens.
- Specifically, we have implemented the following methods for selecting representative tokens, based on the approaches outlined in various papers.
- Max: The maximum values of multiple tokens within a block, across each channel, are concatenated to form the representative token for the current block.
- Mean: The average values of multiple tokens within a block, across each channel, are concatenated to form the representative token for the current block.
- Quest: A combination of the previous two methods: the maximum and minimum values of multiple tokens within a block, across each channel, are taken as the representative tokens for the block. Under this method, the number of representative tokens is fixed at 2.
- Dynamic: By calculating the cumulative attention score for each token using a specific method, each block selects the top-k tokens with the highest scores as the representative tokens for the block. This is similar to InfLLM but with some simplifications.
- Fix: Select tokens at fixed intervals within the block.
- Once the representative tokens for each block are determined, use Equation 2 from InfLLM to calculate the similarity between the input X and the k representative tokens of each block B, and only select the top $r_k$ blocks for attention computation, where $l_P$ denotes the length of the historical tokens (a short sketch of this selection follows below).
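A minimal sketch of these options (shapes and the plain dot-product similarity are our simplifications; the real operator is a CPU kernel, and the Dynamic variant additionally needs attention statistics from prefill, so it is omitted here):

```python
import torch

num_blocks, block_size, head_dim, r_k = 256, 128, 128, 16
block_keys = torch.randn(num_blocks, block_size, head_dim)
query = torch.randn(head_dim)

def representative_tokens(block_keys, kind):
    # Reduce each block's keys to its representative token(s).
    if kind == "max":
        return block_keys.max(dim=1).values.unsqueeze(1)       # (B, 1, D)
    if kind == "mean":
        return block_keys.mean(dim=1, keepdim=True)            # (B, 1, D)
    if kind == "quest":                                        # always 2 representatives
        return torch.stack(
            (block_keys.min(dim=1).values, block_keys.max(dim=1).values), dim=1
        )                                                      # (B, 2, D)
    if kind == "fix":
        return block_keys[:, ::32, :]                          # every 32nd token
    raise ValueError(kind)

reps = representative_tokens(block_keys, "mean")
# Score each block by its best representative against the current query,
# then keep only the top r_k blocks for the attention computation.
block_scores = (reps @ query).max(dim=-1).values
selected_blocks = block_scores.topk(r_k).indices
print(selected_blocks.shape)
```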
Since InfLLM requires calculating a representative score for each token during the prefill stage and then selecting a representative token for each block based on these scores, this operation involves invasive modifications to the prefill implementation, making it difficult to integrate with other methods. Furthermore, in actual testing, we found that in most scenarios, similar or even better results can be achieved through a combination of other methods. Therefore, we ultimately decided not to integrate this method into the framework.
## Further Optimizations
After implementing the above framework, we conducted a series of evaluations based on LongBench and InfiniteBench.
At the beginning of the experiment, we designed the architecture so that the most relevant KVCache blocks would be reselected for every inference token. On the one hand, this strategy incurred significant overhead during the retrieval process. On the other hand, we found that in some scenarios, **frequently changing the selection of retrieved blocks did not lead to better results**. For example, in the kvretrieval dataset, we observed that the model's responses were often correct in the first half but incorrect in the second half. Since the answers to kvretrieval questions consist of long, meaningless strings, this indicates that the correct KVCache blocks were selected while inferring the earlier tokens but incorrect blocks were chosen during the later stages of inference.
To address this issue, we further integrated the method proposed in SnapKV. Before starting the inference, we preselect relevant KVCache blocks by analyzing the attention scores of the context tokens, based on the question. During the subsequent inference stages, the selection of KVCache blocks is restricted to this preselected range. This approach allowed us to select the block containing the correct answer 100% of the time in the kvretrieval dataset.
However, it should be noted that this method strictly relies on the structure of the Benchmark Prompt and **does not necessarily guarantee optimal performance in other scenarios, such as complex document understanding and generation tasks.** Therefore, we have integrated it into our framework as an optional module. The final framework and configurable parameters are as follows:
![KTransformers long context v2](../assets/KTransformers_long_context_v2.png)
Configuration
- **threads_num:** Number of CPU Threads
- **block_size:** KVCache Block Size
- **local_windows_len:** Prompt End Window Size
- **preselect_block_count:** Number of Preselected Blocks
- **second_block_count:** Number of Blocks Selected After Preselection
- **preselect_block:** Whether to Enable Preselection
- **token_step:** Interval Between Token Selections for KVCache
- **layer_step:** Interval Between Layer Selections for KVCache
- **dense_layer_num:** Number of Initial Layers Without KVCache Selection, Importing All KVCache
- **head_select_mode:** SEPARATE (in the GQA scenario, each kv_head selects blocks separately) / SHARED (all kv_heads select blocks together)
- **representative_type:** Method of Selecting Representative Tokens
- **representative_num:** Number of Representative Tokens
By modifying configuration options, various KVCache eviction or compression methods can be easily reproduced within our framework. For example:
- Setting `block_size` to 1 and `preselect_block` to True results in a version of SnapKV without the pooling operation.
- Setting `representative_type` to Quest, `preselect_block` to False, and `head_select_mode` to SEPARATE replicates the Quest method.
Below is the pseudocode for the framework:
```python
def preselect_blocks(local_q, kvcache):
    # SnapKV-style preselection: score every context token with the queries
    # from the observation window at the end of the prompt (local_q).
    key_states = kvcache.keycache
    attn_scores = torch.matmul(
        local_q, key_states.transpose(2, 3)
    ) / math.sqrt(head_dim)
    attn_scores += attn_mask
    attn_scores = nn.functional.softmax(
        attn_scores, dim=-1, dtype=torch.float32
    ).to(local_q.dtype)
    vote = attn_scores[..., initial_size:-local_size, :].sum(dim=-2)
    pool_vote = pool1d(vote, kernel_size=kernel_size, padding=kernel_size // 2, stride=1)
    indices = pool_vote.topk(max_capacity_prompt - local_size, dim=-1).indices
    kv_cache_block_indices = find_representative_tokens_block(indices)
    kvcache_after_preselection = kvcache[kv_cache_block_indices]
    ...
    return kvcache_after_preselection


def get_representative_tokens(kvcache, representative_type):
    # Calculate the representative token(s) for each block based on representative_type.
    return ...


def decode_attention(query, kvcache):
    token_step = 4   # reselect blocks once every token_step tokens
    layer_step = 4   # reselect blocks once every layer_step layers
    for token_idx in range(max_new_tokens):
        for layer_idx in range(config.num_hidden_layers):
            if token_idx % token_step != 0 or layer_idx % layer_step != 0:
                # This layer does not reselect in this round: reuse the blocks
                # chosen in an earlier round.
                kvcache_after_retrieval = history_kvcache_after_retrieval[layer_idx // layer_step]
            else:
                # Otherwise, reselect KVCache blocks with the current layer's query
                # and save the selection for later rounds/layers to reuse.
                kvcache_after_retrieval = retrieval_kvcache(query, kvcache)
                history_kvcache_after_retrieval[layer_idx // layer_step] = kvcache_after_retrieval
            # Compute attention over the selected blocks only.
            output = attn(query, kvcache_after_retrieval)
        yield output


# Model prefill; if preselection is required, the observation-window queries
# (local_q) also need to be saved.
local_q, kvcache = model.prefill(input_ids)
if config.preselect_block:
    # Preselection round
    kvcache = preselect_blocks(local_q, kvcache)
# Find the representative token(s) for each block.
block_representative_tokens = get_representative_tokens(
    kvcache,
    config.representative_type,
)
# Model generation: every decode step attends only to the selected blocks.
for output in decode_attention(query, kvcache):
    ...
```
## Experiment
At the beginning of testing, we will use the following basic configuration, which will be further optimized through the extended framework.
```yaml
max_seq_len: 256000 # KVCache length
block_size: 128 # KVCache block size
local_windows_len: 4096 # The KVCache of length local_windows_len is stored on the GPU.
second_block_count: 96 # After preselection, each time select the number of KVCache blocks. If >= preselect_block_count, use the preselected blocks.
threads_num: 64 # CPU thread num
representative_type: DYNAMIC # KVCache block representative token selection method.
kv_type: FP16
dense_layer_num: 0 # The first few layers do not need to fill or select KVCache
representative_num: 1 # The number of representative tokens within a KVCache block.
preselect_block: False # Whether to preselect.
head_select_mode: SHARED # All kv_heads jointly select.
preselect_block_count: 0 # Number of preselected blocks.
layer_step: 1 # Select every few layers.
token_step: 1 # Select every few tokens.
```
Under our framework, the comparison between the original model and the accelerated KTransformers on datasets such as 128K Needle-in-a-Haystack, passkey, and kvretrieval is as follows. The passkey dataset inserts a small segment of numbers at varying depths within redundant text; kvretrieval asks the model to find a matching item among randomly generated key-value pairs. All tests were conducted under the OpenCompass framework:
![needle_128K.png](../assets/needle_128K.png)
|                                                              | Single needle retrieval zh 128k | passkey | kvretrieval |
| ------------------------------------------------------------ | ------------------------------- | ------- | ----------- |
| Original model                                                | 99.89                           | 100     | 21.0        |
| KTransformers (reselect KVCache blocks for each generation)   | 100                             | 100     | 15.40       |
We can see that both the original model and the accelerated KTransformers achieve perfect scores on the relatively simpler datasets, such as Single Needle Retrieval and passkey. At the same time, the generation speed has significantly improved, increasing from 4.86 tokens/s with llama.cpp to 27.49 tokens/s with KTransformers, achieving up to a 5.65x speedup. Although the current configuration shows a noticeable drop in performance on the more challenging kvretrieval dataset, in the next section, we will address this by implementing a more optimized selection strategy to compensate for or even surpass the original model's accuracy.
Additionally, we tested the performance of the KTransformers-based configuration framework in reproducing the results of Quest. However, since InternLM2.5-7B-Chat-1M uses GQA (Grouped Query Attention) while the Quest paper primarily focuses on optimizing MHA (Multi-Head Attention) models, the actual testing results were not particularly favorable. The official team also mentioned that further support for GQA models is needed, so we will not discuss this in detail for now.
### Further improve performance
By modifying certain configurations within our flexible framework on the basis of reproduction, **we can actually achieve better results than those reported in the previous paper,** as shown in the figure below:
![](../assets/Framework_effect.png)
As mentioned earlier, the goal of the kvretrieval dataset is to find a matching key-value pair within a long sequence of semantically meaningless pairs. If blocks are reselected from scratch for every generated token, the likelihood of deviation grows as the text grows, leading to the selection of KVCache blocks that differ from earlier selections. To address this, we introduced a preselection mechanism that uses SnapKV-style scoring to choose representative tokens and preselect a portion of the KVCache blocks; during the subsequent inference process, selection is limited to these blocks. After one round of preselection, the score increased from 15.4 to 24.2, **surpassing the original model + full attention's performance of 21 points.** Further research indicates that the sparsity of the KVCache in the first few layers of LLMs is not as pronounced, so we set the first two layers to fully reuse the KVCache, ultimately achieving a score of **24.4**.
Similarly, when testing the needle-in-a-haystack task on the 1M dataset, we not only reproduced the original model's reported score but also further improved accuracy (**from 89.31 to 92.88**) by using the KTransformers CPU Sparse Attn Framework to selectively compute only certain KVCache blocks. Additionally, the inference speed **reached nearly 10 times that of llama.cpp**.
![needle 1M.png](../assets/needle_1M.png)
### More comparisons
As shown in the two figures below, using the Single Needle Retrieval dataset as an example, we set llama.cpp to store the KVCache on CPU/DRAM while performing all computations on the GPU. On a 4090D server, we compared the KTransformers CPU Sparse Attn Framework with llama.cpp. While maintaining **100% answer accuracy**, we achieved a 20.6 to 94.1 times prefill speed increase and a **1.2 to 7.1 times inference speed boost**.
| ![long context prefill.png](../assets/long_context_prefill.png) | ![long context generate.png](../assets/long_context_generate.png) |
| --------------------------------------------------------------- | ----------------------------------------------------------------- |
The main reason for the significant gap in prefill speed is that after enabling KVCache offload, llama.cpp performs the attention (attn) computation on the CPU. In long-text scenarios, attention not only requires heavy computation but also takes up the majority of the computation time. In contrast, KTransformers leverages a flexible template injection framework to implement GPU Chunk Prefill layer by layer. Moving forward, we plan to further integrate high-performance sparse prefill methods such as MInference to boost speed even further.
Additionally, as a key focus of this article, the right-hand graph shows that as the prompt length increases, the inference speed of KTransformers remains stable, hovering near a horizontal line. In contrast, llama.cpp slows down as the prompt length increases. By selecting only the most important 16K KVCache blocks to participate in the inference computation, KTransformers maintains a consistent inference speed comparable to llama.cpp when processing a 16K prompt, without any performance degradation (at least on these test datasets).
## How to Use
Currently, long context is only supported by our **local_chat.py** interface, and the integration with the server interface is under development.
To make setup easier, we have uploaded the model config, GGUF weights, and tokenizer to a single repo: https://huggingface.co/nilv234/internlm2_5_to_llama_1m/tree/main
By setting the model_path and gguf_path in the local_chat function to **/path/to/repo** and setting the mode to **"long_context"**, you can use the InternLM2.5-7B-Chat-1M model with 1M-context functionality on 24GB of VRAM.
After running local_chat.py for the first time, a config.yaml file will be automatically created under **~/.ktransformers**. The relevant configurations for long context are as follows:
```yaml
chunk_size: 4096 # prefill chunk size
max_seq_len: 100000 # KVCache length
block_size: 128 # KVCache block size
local_windows_len: 4096 # The KVCache of length local_windows_len is stored on the GPU.
second_select_num: 96 # After preselection, each time select the number of KVCache blocks. If >= preselect_block_count, use the preselected blocks.
threads_num: 64 # CPU thread num
anchor_type: DYNAMIC # KVCache block representative token selection method.
kv_type: FP16
dense_layer_num: 0 # The first few layers do not need to fill or select KVCache
anchor_num: 1 # The number of representative tokens within a KVCache block.
preselect_block: False # Whether to preselect.
head_select_mode: SHARED # All kv_heads jointly select.
preselect_block_count: 96 # Number of preselected blocks.
layer_step: 1 # Select every few layers.
token_step: 1 # Select every few tokens.
```
The memory required for different context lengths is shown in the table below:
| | 4K | 32K | 64K | 128K | 512K | 1M |
| -------------- | --- | ---- | ---- | ---- | ---- | ------ |
| DRAM Size (GB) | 0.5 | 4.29 | 8.58 | 17.1 | 68.7 | 145.49 |
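For intuition, these figures can be roughly reproduced from the raw K/V storage alone; the sketch below assumes 32 layers, 8 KV heads (GQA), head_dim 128, and FP16, and ignores any framework bookkeeping, so treat it as an approximate lower bound.

```python
# Rough estimate of raw KVCache storage (assumed geometry; approximate only).
layers, kv_heads, head_dim, dtype_bytes = 32, 8, 128, 2  # FP16

def kvcache_gb(context_len: int) -> float:
    per_token = 2 * layers * kv_heads * head_dim * dtype_bytes  # K and V per token
    return context_len * per_token / 1e9

for ctx in (4 * 1024, 32 * 1024, 64 * 1024, 128 * 1024, 512 * 1024, 1024 * 1024):
    print(f"{ctx // 1024}K tokens -> ~{kvcache_gb(ctx):.2f} GB")
```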
Please choose an appropriate max_seq_len based on your DRAM size.
For example:
```sh
python local_chat.py --model_path="/data/model/internlm2_5_to_llama_1m" --gguf_path="/data/model/internlm2_5_to_llama_1m" --max_new_tokens=500 --cpu_infer=10 --use_cuda_graph=True --mode="long_context" --prompt_file="/path/to/file"
```

View file

@ -1 +1,11 @@
__version__ = "0.1.2"
#!/usr/bin/env python
# coding=utf-8
'''
Description :
Author : kkk1nak0
Date : 2024-08-15 07:34:46
Version : 1.0.0
LastEditors : chenxl
LastEditTime : 2024-08-28 15:19:03
'''
__version__ = "0.1.3"

View file

@ -34,4 +34,20 @@ web:
open_cross_domain: True
ext:
cpu_infer: 10
cpu_infer: 10
long_context:
chunk_size: 4096
max_seq_len: 32000
block_size: 128
local_windows_len: 4096
second_select_num: 32
anchor_type: DYNAMIC
kv_type: FP16
dense_layer_num: 2
anchor_num: 1
preselect_block: True
head_select_mode: SHARED
preselect_block_count: 32
layer_step: 1
token_step: 100

View file

@ -1,6 +1,7 @@
cmake_minimum_required(VERSION 3.16)
project(cpuinfer_ext VERSION 0.1.0)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -ffast-math")
set(CMAKE_BUILD_TYPE "Release")
@ -215,7 +216,8 @@ aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} SOURCE_DIR1)
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/cpu_backend SOURCE_DIR2)
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/operators/llamafile SOURCE_DIR3)
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/llamafile SOURCE_DIR4)
set(ALL_SOURCES ${SOURCE_DIR1} ${SOURCE_DIR2} ${SOURCE_DIR3} ${SOURCE_DIR4})
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/operators/kvcache SOURCE_DIR5)
set(ALL_SOURCES ${SOURCE_DIR1} ${SOURCE_DIR2} ${SOURCE_DIR3} ${SOURCE_DIR4} ${SOURCE_DIR5})
message(STATUS "ALL_SOURCES: ${ALL_SOURCES}")
pybind11_add_module(${PROJECT_NAME} MODULE ${ALL_SOURCES})
@ -223,5 +225,8 @@ target_link_libraries(${PROJECT_NAME} PRIVATE llama)
if(WIN32)
target_link_libraries(${PROJECT_NAME} PRIVATE "$ENV{CUDA_PATH}/lib/x64/cudart.lib")#CUDA::cudart
elseif(UNIX)
if(NOT DEFINED ENV{CUDA_HOME} OR "$ENV{CUDA_HOME}" STREQUAL "")
set(ENV{CUDA_HOME} "/usr/local/cuda")
endif()
target_link_libraries(${PROJECT_NAME} PRIVATE "$ENV{CUDA_HOME}/lib64/libcudart.so")
endif()
endif()

View file

@ -0,0 +1,178 @@
#!/usr/bin/env python
# coding=utf-8
"""
Description :
Author : Jianwei Dong
Date : 2024-08-28 10:32:05
Version : 1.0.0
LastEditors : Jianwei Dong
LastEditTime : 2024-08-28 10:32:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + "/../build")
import cpuinfer_ext
import torch
layer_num = 10
kv_head_num = 8
q_head_num = 32
head_dim = 128
block_len = 128
anchor_num = 1
anchor_type = cpuinfer_ext.kvcache.AnchorType.DYNAMIC
kv_type = cpuinfer_ext.kvcache.ggml_type.FP16
retrieval_type = cpuinfer_ext.kvcache.RetrievalType.LAYER
layer_step: int = 1
token_step: int = 1
layer_offset: int = 0
max_thread_num: int = 64
max_batch_size: int = 1
max_block_num: int = 1024
CPUInfer = cpuinfer_ext.CPUInfer(max_thread_num)
warm_up_iter = 1000
test_iter = 10000
def bench_linear(cache_seqlen: int):
with torch.inference_mode(mode=True):
cache_seqlens = torch.tensor([cache_seqlen], dtype=torch.int32, device="cpu")
seqlens_zero = torch.zeros((1,), dtype=torch.int32, device="cpu")
config = cpuinfer_ext.kvcache.KVCacheConfig(
layer_num,
kv_head_num,
q_head_num,
head_dim,
block_len,
anchor_num,
anchor_type,
kv_type,
retrieval_type,
layer_step,
token_step,
layer_offset,
max_block_num,
max_batch_size,
max_thread_num,
)
local_kvcache = cpuinfer_ext.kvcache.KVCache(config)
block_table = (
torch.arange(max_block_num, dtype=torch.int32, device="cpu")
.contiguous()
.view(1, -1)
)
for layer_idx in range(layer_num):
k_cache = torch.randn(
(1, cache_seqlen, kv_head_num, head_dim),
dtype=torch.float16,
device="cpu",
).contiguous()
v_cache = torch.randn(
(1, cache_seqlen, kv_head_num, head_dim),
dtype=torch.float16,
device="cpu",
).contiguous()
CPUInfer.submit(
local_kvcache.update_kvcache_fp16(
k_cache.data_ptr(),
v_cache.data_ptr(),
layer_idx,
block_table.data_ptr(),
1,
max_block_num,
seqlens_zero.data_ptr(),
cache_seqlen,
)
)
CPUInfer.sync()
input = torch.randn(
(1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
output = torch.empty(
(1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
# attn_lse: (bsz, q_len, q_head_num)
attn_lse = torch.empty(
(1, 1, q_head_num), dtype=torch.float32, device="cpu"
).contiguous()
input = input / 100
# warm up
for i in range(warm_up_iter):
CPUInfer.submit(
local_kvcache.attn(
input.data_ptr(),
output.data_ptr(),
attn_lse.data_ptr(),
i % layer_num,
0,
1,
1,
max_block_num,
block_table.data_ptr(),
cache_seqlens.data_ptr(),
-1,
-1,
-1,
)
)
CPUInfer.sync()
# test
start = time.perf_counter()
for i in range(test_iter):
CPUInfer.submit(
local_kvcache.attn(
input.data_ptr(),
output.data_ptr(),
attn_lse.data_ptr(),
i % layer_num,
0,
1,
1,
max_block_num,
block_table.data_ptr(),
cache_seqlens.data_ptr(),
-1,
-1,
-1,
)
)
CPUInfer.sync()
end = time.perf_counter()
total_time = end - start
print("cache sequence length: ", cache_seqlen)
print("Time(s): ", total_time)
print("Iteration: ", test_iter)
print("Time(us) per iteration: ", total_time / test_iter * 1000000)
print(
"Bandwidth: ",
cache_seqlen
* kv_head_num
* head_dim
* 2
* 2
* test_iter
/ total_time
/ 1000
/ 1000
/ 1000,
"GB/s",
)
print("")
bench_linear(1024)
bench_linear(4096)
bench_linear(16384)
bench_linear(32768)
bench_linear(65536)

View file

@ -0,0 +1,94 @@
#!/usr/bin/env python
# coding=utf-8
"""
Description :
Author : Jianwei Dong
Date : 2024-08-28 10:32:05
Version : 1.0.0
LastEditors : Jianwei Dong
LastEditTime : 2024-08-28 10:32:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + "/../build")
import cpuinfer_ext
import torch
layer_num = 10
kv_head_num = 8
q_head_num = 32
head_dim = 128
block_len = 128
anchor_num = 1
warm_up_iter = 1000
test_iter = 10000
def bench_linear(cache_seqlen: int, device):
with torch.inference_mode(mode=True):
kvcaches = []
for layer_idx in range(layer_num):
k_cache = torch.randn(
(1, 32, cache_seqlen, head_dim),
dtype=torch.float16,
device=device,
).contiguous()
v_cache = torch.randn(
(1, 32, cache_seqlen, head_dim),
dtype=torch.float16,
device=device,
).contiguous()
kvcaches.append((k_cache, v_cache))
input = torch.randn(
(1, q_head_num, 1, head_dim), dtype=torch.float16, device=device
).contiguous()
input = input / 100
# warm up
for i in range(warm_up_iter):
k_cache = kvcaches[i % layer_num][0]
v_cache = kvcaches[i % layer_num][1]
torch.nn.functional.scaled_dot_product_attention(input, k_cache, v_cache)
# test
start = time.perf_counter()
for i in range(test_iter):
k_cache = kvcaches[i % layer_num][0]
v_cache = kvcaches[i % layer_num][1]
torch.nn.functional.scaled_dot_product_attention(input, k_cache, v_cache)
end = time.perf_counter()
total_time = end - start
print("cache sequence length: ", cache_seqlen)
print("Time(s): ", total_time)
print("Iteration: ", test_iter)
print("Time(us) per iteration: ", total_time / test_iter * 1000000)
print(
"Bandwidth: ",
cache_seqlen
* q_head_num
* head_dim
* 2
* 2
* test_iter
/ total_time
/ 1000
/ 1000
/ 1000,
"GB/s",
)
print("")
bench_linear(1024, "cpu")
bench_linear(4096, "cpu")
bench_linear(1024, "cuda")
bench_linear(4096, "cuda")
bench_linear(16384, "cuda")
bench_linear(32768, "cuda")
bench_linear(65536, "cuda")

View file

@ -3,93 +3,125 @@
* @Author : chenht2022
* @Date : 2024-07-22 02:03:05
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:33:34
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "backend.h"
Backend::Backend(int thread_num) {
thread_num_ = thread_num;
thread_state_.resize(thread_num);
for (int i = 0; i < thread_num; i++) {
thread_local int Backend::thread_local_id = -1;
Backend::Backend(int max_thread_num) {
max_thread_num_ = max_thread_num;
thread_state_.resize(max_thread_num_);
for (int i = 0; i < max_thread_num_; i++) {
thread_state_[i].curr = std::make_unique<std::atomic<int>>();
thread_state_[i].status = std::make_unique<std::atomic<ThreadStatus>>(ThreadStatus::WAITING);
thread_state_[i].status =
std::make_unique<std::atomic<ThreadStatus>>(ThreadStatus::WAITING);
}
workers_.resize(thread_num);
for (int i = 1; i < thread_num; i++) {
workers_.resize(max_thread_num_);
for (int i = 1; i < max_thread_num_; i++) {
workers_[i] = std::thread(&Backend::worker_thread, this, i);
}
}
Backend::~Backend() {
for (int i = 0; i < thread_num_; i++) {
thread_state_[i].status->store(ThreadStatus::EXIT, std::memory_order_release);
for (int i = 0; i < max_thread_num_; i++) {
thread_state_[i].status->store(ThreadStatus::EXIT,
std::memory_order_release);
}
for (int i = 1; i < thread_num_; i++) {
for (int i = 1; i < max_thread_num_; i++) {
if (workers_[i].joinable()) {
workers_[i].join();
}
}
}
int Backend::get_thread_num() {
return thread_num_;
}
int Backend::get_thread_num() { return max_thread_num_; }
void Backend::do_work_stealing_job(int task_num, std::function<void(int)> func) {
func_ = func;
void Backend::do_work_stealing_job(int task_num,
std::function<void(int)> init_func,
std::function<void(int)> compute_func,
std::function<void(int)> finalize_func) {
init_func_ = init_func;
compute_func_ = compute_func;
finalize_func_ = finalize_func;
thread_num_ = std::min(max_thread_num_, task_num);
int base = task_num / thread_num_;
int remain = task_num % thread_num_;
thread_state_[0].end = base + (0 < remain);
// Set thread_local_id for the main thread
thread_local_id = 0;
for (int i = 1; i < thread_num_; i++) {
thread_state_[i].curr->store(thread_state_[i - 1].end, std::memory_order_relaxed);
thread_state_[i].curr->store(thread_state_[i - 1].end,
std::memory_order_relaxed);
thread_state_[i].end = thread_state_[i - 1].end + base + (i < remain);
thread_state_[i].status->store(ThreadStatus::WORKING, std::memory_order_release);
thread_state_[i].status->store(ThreadStatus::WORKING,
std::memory_order_release);
}
thread_state_[0].curr->store(0, std::memory_order_relaxed);
thread_state_[0].status->store(ThreadStatus::WORKING, std::memory_order_release);
thread_state_[0].status->store(ThreadStatus::WORKING,
std::memory_order_release);
process_tasks(0);
for (int i = 1; i < thread_num_; i++) {
while (thread_state_[i].status->load(std::memory_order_acquire) == ThreadStatus::WORKING) {
while (thread_state_[i].status->load(std::memory_order_acquire) ==
ThreadStatus::WORKING) {
}
}
}
void Backend::process_tasks(int thread_id) {
if (init_func_ != nullptr) {
init_func_(thread_id);
}
while (true) {
int task_id = thread_state_[thread_id].curr->fetch_add(1, std::memory_order_acq_rel);
int task_id = thread_state_[thread_id].curr->fetch_add(
1, std::memory_order_acq_rel);
if (task_id >= thread_state_[thread_id].end) {
break;
}
func_(task_id);
compute_func_(task_id);
}
for (int t_offset = 1; t_offset < thread_num_; t_offset++) {
int t_i = (thread_id + t_offset) % thread_num_;
if (thread_state_[t_i].status->load(std::memory_order_acquire) != ThreadStatus::WORKING) {
if (thread_state_[t_i].status->load(std::memory_order_acquire) !=
ThreadStatus::WORKING) {
continue;
}
while (true) {
int task_id = thread_state_[t_i].curr->fetch_add(1, std::memory_order_acq_rel);
int task_id = thread_state_[t_i].curr->fetch_add(
1, std::memory_order_acq_rel);
if (task_id >= thread_state_[t_i].end) {
break;
}
func_(task_id);
compute_func_(task_id);
}
}
thread_state_[thread_id].status->store(ThreadStatus::WAITING, std::memory_order_release);
if (finalize_func_ != nullptr) {
finalize_func_(thread_id);
}
thread_state_[thread_id].status->store(ThreadStatus::WAITING,
std::memory_order_release);
}
void Backend::worker_thread(int thread_id) {
auto start = std::chrono::steady_clock::now();
thread_local_id = thread_id; // Record this worker's id in the thread-local variable
while (true) {
ThreadStatus status = thread_state_[thread_id].status->load(std::memory_order_acquire);
ThreadStatus status =
thread_state_[thread_id].status->load(std::memory_order_acquire);
if (status == ThreadStatus::WORKING) {
process_tasks(thread_id);
start = std::chrono::steady_clock::now();
} else if (status == ThreadStatus::WAITING) {
auto now = std::chrono::steady_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(now - start).count();
auto duration =
std::chrono::duration_cast<std::chrono::milliseconds>(now -
start)
.count();
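// Back off after ~50 ms of idling so a waiting worker does not spin at 100% CPU.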
if (duration > 50) {
std::this_thread::sleep_for(std::chrono::milliseconds(1));
}

View file

@ -3,7 +3,7 @@
* @Author : chenht2022
* @Date : 2024-07-22 02:03:05
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:33:38
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
@ -31,20 +31,25 @@ struct ThreadState {
};
class Backend {
public:
public:
Backend(int);
~Backend();
int get_thread_num();
void do_work_stealing_job(int, std::function<void(int)>);
void do_work_stealing_job(int, std::function<void(int)>,
std::function<void(int)>,
std::function<void(int)>);
static thread_local int thread_local_id;
private:
private:
int thread_num_;
std::vector<ThreadState> thread_state_; // [thread_num]
std::function<void(int)> func_;
int max_thread_num_;
std::vector<ThreadState> thread_state_; // [thread_num]
std::function<void(int)> init_func_;
std::function<void(int)> compute_func_;
std::function<void(int)> finalize_func_;
std::vector<std::thread> workers_;
void process_tasks(int);
void worker_thread(int);
};
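// Usage sketch (illustrative, not part of this diff): the three callbacks passed to
// do_work_stealing_job split a job into per-thread setup, per-task compute, and
// per-thread teardown; thread_local_id identifies the executing thread even when it
// steals tasks from another thread's range. do_task below is a hypothetical helper.
//
//   Backend backend(8);
//   std::vector<double> partial(backend.get_thread_num(), 0.0);
//   backend.do_work_stealing_job(
//       1024,
//       [&](int thread_id) { partial[thread_id] = 0.0; },                       // init
//       [&](int task_id) { partial[Backend::thread_local_id] += do_task(task_id); },
//       [&](int thread_id) { /* e.g. fold partial[thread_id] into a total */ }); // finalize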
#endif

View file

@ -54,4 +54,4 @@ void TaskQueue::processTasks() {
}
mutex.unlock();
}
}
}

View file

@ -4,7 +4,7 @@
* @Date : 2024-07-16 10:43:18
* @Version : 1.0.0
* @LastEditors : chenxl
* @LastEditTime : 2024-08-12 12:28:25
* @LastEditTime : 2024-08-08 04:23:51
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#ifndef CPUINFER_TASKQUEUE_H

View file

@ -0,0 +1,142 @@
#!/usr/bin/env python
# coding=utf-8
"""
Description :
Author : Jianwei Dong
Date : 2024-08-28 10:32:05
Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-08-28 10:32:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + "/../build")
import cpuinfer_ext
from flash_attn import flash_attn_with_kvcache
import torch
layer_num = 10
kv_head_num = 8
q_head_num = 32
head_dim = 128
block_len = 128
anchor_num = 1
cache_seqlen = 8192
cache_seqlens = torch.tensor([cache_seqlen], dtype=torch.int32, device="cpu")
seqlens_zero = torch.zeros((1,), dtype=torch.int32, device="cpu")
anchor_type = cpuinfer_ext.kvcache.AnchorType.DYNAMIC
kv_type = cpuinfer_ext.kvcache.ggml_type.FP16
retrieval_type = cpuinfer_ext.kvcache.RetrievalType.LAYER
layer_step: int = 1
token_step: int = 1
layer_offset: int = 0
max_thread_num: int = 2
max_batch_size: int = 1
max_block_num: int = 512
CPUInfer = cpuinfer_ext.CPUInfer(max_thread_num)
validation_iter = 100
with torch.inference_mode(mode=True):
config = cpuinfer_ext.kvcache.KVCacheConfig(
layer_num,
kv_head_num,
q_head_num,
head_dim,
block_len,
anchor_num,
anchor_type,
kv_type,
retrieval_type,
layer_step,
token_step,
layer_offset,
max_block_num,
max_batch_size,
max_thread_num,
)
local_kvcache = cpuinfer_ext.kvcache.KVCache(config)
kvcaches = []
block_table = (
torch.arange(max_block_num, dtype=torch.int32, device="cpu")
.contiguous()
.view(1, -1)
)
for layer_idx in range(layer_num):
k_cache = torch.randn(
(1, cache_seqlen, kv_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
v_cache = torch.randn(
(1, cache_seqlen, kv_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
CPUInfer.submit(
local_kvcache.update_kvcache_fp16(
k_cache.data_ptr(),
v_cache.data_ptr(),
layer_idx,
block_table.data_ptr(),
1,
max_block_num,
seqlens_zero.data_ptr(),
cache_seqlen,
)
)
CPUInfer.sync()
kvcaches.append((k_cache.to("cuda"), v_cache.to("cuda")))
# validation
for i in range(validation_iter):
k_cache = kvcaches[i % layer_num][0]
v_cache = kvcaches[i % layer_num][1]
input = torch.randn(
(1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
output = torch.empty(
(1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
# attn_lse: (bsz, q_len, q_head_num)
attn_lse = torch.empty(
(1, 1, q_head_num), dtype=torch.float32, device="cpu"
).contiguous()
input = input / 100
CPUInfer.submit(
local_kvcache.attn(
input.data_ptr(),
output.data_ptr(),
attn_lse.data_ptr(),
i % layer_num,
0,
1,
1,
max_block_num,
block_table.data_ptr(),
cache_seqlens.data_ptr(),
-1,
-1,
-1,
)
)
CPUInfer.sync()
# print("cpuinfer output", output)
t_output = flash_attn_with_kvcache(
q=input.to("cuda"),
k_cache=k_cache,
v_cache=v_cache,
cache_seqlens=cache_seqlens.to("cuda"),
)
# print("torch output", t_output)
diff = torch.mean(torch.abs(output.to("cuda") - t_output)) / torch.mean(
torch.abs(t_output)
)
print("diff = ", diff)
assert diff < 0.001

View file

@ -1,19 +1,17 @@
/**
* @Description :
* @Author : chenht2022
* @Author : chenht2022, Jianwei Dong
* @Date : 2024-07-22 02:03:22
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-08-07 10:39:37
* @LastEditors : Jianwei Dong
* @LastEditTime : 2024-08-26 22:47:06
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
// Python bindings
#include <cstdint>
#include <iostream>
#include <memory>
#include "cpu_backend/cpuinfer.h"
#include "device_launch_parameters.h"
#include "llamafile/flags.h"
#include "operators/kvcache/kvcache.h"
#include "operators/llamafile/linear.h"
#include "operators/llamafile/mlp.h"
#include "operators/llamafile/moe.h"
@ -21,119 +19,541 @@
#include "pybind11/operators.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
#include <cstdint>
#include <iostream>
#include <memory>
namespace py = pybind11;
using namespace pybind11::literals;
class LinearBindings {
public:
class WarmUpBindinds {
public:
// Binding functions for the KVCache class
class KVCacheBindings {
public:
class AttnBindings {
public:
struct Args {
CPUInfer* cpuinfer;
Linear* linear;
CPUInfer *cpuinfer;
KVCache *kv_cache;
const ggml_fp16_t *q_in;
ggml_fp16_t *output;
float *attn_lse;
int layer_idx;
int generate_token_idx;
int q_len;
int batch_size;
int max_block_num;
int *block_table;
int *cache_seqlens;
int pick_block_num;
int init_block_num;
int local_block_num;
};
static void inner(void* args) {
Args* args_ = (Args*)args;
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(
&KVCache::attn, args_->kv_cache, args_->q_in, args_->output,
args_->attn_lse, args_->layer_idx, args_->generate_token_idx,
args_->q_len, args_->batch_size, args_->max_block_num,
args_->block_table, args_->cache_seqlens, args_->pick_block_num,
args_->init_block_num, args_->local_block_num);
}
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(KVCache &kv_cache, intptr_t q_in, intptr_t output,
intptr_t attn_lse, int layer_idx,
int generate_token_idx, int q_len, int batch_size,
int max_block_num, intptr_t block_table,
intptr_t cache_seqlens, int pick_block_num,
int init_block_num, int local_block_num) {
Args *args = new Args{nullptr,
&kv_cache,
(const ggml_fp16_t *)q_in,
(ggml_fp16_t *)output,
(float *)attn_lse,
layer_idx,
generate_token_idx,
q_len,
batch_size,
max_block_num,
(int *)block_table,
(int *)cache_seqlens,
pick_block_num,
init_block_num,
local_block_num};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
class GetAllKVCacheOneLayerBindings {
public:
struct Args {
CPUInfer *cpuinfer;
KVCache *kv_cache;
int layer_id;
ggml_fp16_t *k_in;
ggml_fp16_t *v_in;
};
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(&KVCache::get_all_kvcache_one_layer,
args_->kv_cache, args_->layer_id,
args_->k_in, args_->v_in);
}
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(KVCache &kv_cache, intptr_t k_in, intptr_t v_in,
int layer_id) {
Args *args = new Args{nullptr, &kv_cache, layer_id,
(ggml_fp16_t *)k_in, (ggml_fp16_t *)v_in};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
class GetAndUpdateKVCacheFp16Bindings {
public:
struct Args {
CPUInfer *cpuinfer;
KVCache *kv_cache;
ggml_fp16_t *k_in;
ggml_fp16_t *v_in;
int layer_id;
int *block_table;
int batch_size;
int max_block_num;
int *cache_seqlens;
int q_len;
};
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(&KVCache::get_and_update_kvcache_fp16,
args_->kv_cache, args_->k_in, args_->v_in,
args_->layer_id, args_->block_table,
args_->batch_size, args_->max_block_num,
args_->cache_seqlens, args_->q_len);
}
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(KVCache &kv_cache, intptr_t k_in, intptr_t v_in,
int layer_id, intptr_t block_table, int batch_size,
int max_block_num, intptr_t cache_seqlens,
int q_len) {
Args *args = new Args{nullptr,
&kv_cache,
(ggml_fp16_t *)k_in,
(ggml_fp16_t *)v_in,
layer_id,
(int *)block_table,
batch_size,
max_block_num,
(int *)cache_seqlens,
q_len};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
class GetKVCacheFp16Bindings {
public:
struct Args {
CPUInfer *cpuinfer;
KVCache *kv_cache;
ggml_fp16_t *k_in;
ggml_fp16_t *v_in;
int layer_id;
int *block_table;
int batch_size;
int max_block_num;
int *cache_seqlens;
};
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(
&KVCache::get_kvcache_fp16, args_->kv_cache, args_->k_in,
args_->v_in, args_->layer_id, args_->block_table,
args_->batch_size, args_->max_block_num, args_->cache_seqlens);
}
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(KVCache &kv_cache, intptr_t k_in, intptr_t v_in,
int layer_id, intptr_t block_table, int batch_size,
int max_block_num, intptr_t cache_seqlens) {
Args *args = new Args{nullptr,
&kv_cache,
(ggml_fp16_t *)k_in,
(ggml_fp16_t *)v_in,
layer_id,
(int *)block_table,
batch_size,
max_block_num,
(int *)cache_seqlens};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
class UpdateKVCacheFp16Bindings {
public:
struct Args {
CPUInfer *cpuinfer;
KVCache *kv_cache;
ggml_fp16_t *k_in;
ggml_fp16_t *v_in;
int layer_id;
int *block_table;
int batch_size;
int max_block_num;
int *cache_seqlens;
int q_len;
};
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(&KVCache::update_kvcache_fp16,
args_->kv_cache, args_->k_in, args_->v_in,
args_->layer_id, args_->block_table,
args_->batch_size, args_->max_block_num,
args_->cache_seqlens, args_->q_len);
}
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(KVCache &kv_cache, intptr_t k_in, intptr_t v_in,
int layer_id, intptr_t block_table, int batch_size,
int max_block_num, intptr_t cache_seqlens,
int q_len) {
Args *args = new Args{nullptr,
&kv_cache,
(ggml_fp16_t *)k_in,
(ggml_fp16_t *)v_in,
layer_id,
(int *)block_table,
batch_size,
max_block_num,
(int *)cache_seqlens,
q_len};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
class UpdateImportanceBindings {
public:
struct Args {
CPUInfer *cpuinfer;
KVCache *kv_cache;
const ggml_fp16_t *importance;
int layer_id;
int *block_table;
int batch_size;
int max_block_num;
int *offset;
int width;
};
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(
&KVCache::update_importance, args_->kv_cache, args_->importance,
args_->layer_id, args_->block_table, args_->batch_size,
args_->max_block_num, args_->offset, args_->width);
}
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(KVCache &kv_cache, intptr_t importance, int layer_id,
intptr_t block_table, int batch_size,
int max_block_num, intptr_t offset, int width) {
Args *args = new Args{nullptr,
&kv_cache,
(const ggml_fp16_t *)importance,
layer_id,
(int *)block_table,
batch_size,
max_block_num,
(int *)offset,
width};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
class AttnWithKVCacheBindings {
public:
struct Args {
CPUInfer *cpuinfer;
KVCache *kv_cache;
const ggml_fp16_t *q_in;
const ggml_fp16_t *k_in;
const ggml_fp16_t *v_in;
ggml_fp16_t *output;
float *attn_lse;
int layer_idx;
int generate_token_idx;
int q_len;
int batch_size;
int max_block_num;
int *block_table;
int *cache_seqlens;
int topk;
int local;
};
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(
&KVCache::attn_with_kvcache, args_->kv_cache, args_->q_in,
args_->k_in, args_->v_in, args_->output, args_->attn_lse,
args_->layer_idx, args_->generate_token_idx, args_->q_len,
args_->batch_size, args_->max_block_num, args_->block_table,
args_->cache_seqlens, args_->topk, args_->local);
}
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(KVCache &kv_cache, intptr_t q_in, intptr_t k_in,
intptr_t v_in, intptr_t output, intptr_t attn_lse,
int layer_idx, int generate_token_idx, int q_len,
int batch_size, int max_block_num,
intptr_t block_table, intptr_t cache_seqlens,
int topk, int local) {
Args *args = new Args{nullptr,
&kv_cache,
(const ggml_fp16_t *)q_in,
(const ggml_fp16_t *)k_in,
(const ggml_fp16_t *)v_in,
(ggml_fp16_t *)output,
(float *)attn_lse,
layer_idx,
generate_token_idx,
q_len,
batch_size,
max_block_num,
(int *)block_table,
(int *)cache_seqlens,
topk,
local};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
class ClearImportanceAllLayersBindings {
public:
struct Args {
CPUInfer *cpuinfer;
KVCache *kv_cache;
int *block_table;
int *cache_seqlens;
int batch_size;
int max_block_num;
};
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(&KVCache::clear_importance_all_layers,
args_->kv_cache, args_->block_table,
args_->cache_seqlens, args_->batch_size,
args_->max_block_num);
}
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(KVCache &kv_cache, intptr_t block_table,
intptr_t cache_seqlens, int batch_size,
int max_block_num) {
Args *args = new Args{nullptr,
&kv_cache,
(int *)block_table,
(int *)cache_seqlens,
batch_size,
max_block_num};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
class CalcAnchorAllLayersBindinds {
public:
struct Args {
CPUInfer *cpuinfer;
KVCache *kv_cache;
int *block_table;
int *cache_seqlens;
int batch_size;
int max_block_num;
};
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(&KVCache::calc_anchor_all_layers,
args_->kv_cache, args_->block_table,
args_->cache_seqlens, args_->batch_size,
args_->max_block_num);
}
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(KVCache &kv_cache, intptr_t block_table,
intptr_t cache_seqlens, int batch_size,
int max_block_num) {
Args *args = new Args{nullptr,
&kv_cache,
(int *)block_table,
(int *)cache_seqlens,
batch_size,
max_block_num};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
class LoadKVCacheBindings {
public:
struct Args {
CPUInfer *cpuinfer;
KVCache *kv_cache;
std::string tensor_file_path;
};
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(&KVCache::load_kvcache, args_->kv_cache,
args_->tensor_file_path);
}
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(KVCache &kv_cache, std::string tensor_file_path) {
Args *args =
new Args{nullptr, &kv_cache, (std::string)tensor_file_path};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
class DumpKVCacheBindings {
public:
struct Args {
CPUInfer *cpuinfer;
KVCache *kv_cache;
int *block_table;
int cache_total_len;
std::string tensor_file_path;
};
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(&KVCache::dump_kvcache, args_->kv_cache,
args_->block_table, args_->cache_total_len,
args_->tensor_file_path);
}
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(KVCache &kv_cache, intptr_t block_table,
int cache_total_len, std::string tensor_file_path) {
Args *args =
new Args{nullptr, &kv_cache, (int *)block_table,
cache_total_len, (std::string)tensor_file_path};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
};
class LinearBindings {
public:
class WarmUpBindinds {
public:
struct Args {
CPUInfer *cpuinfer;
Linear *linear;
};
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(&Linear::warm_up, args_->linear);
}
static std::pair<intptr_t, intptr_t> cpuinfer_interface(Linear& linear) {
Args* args = new Args{nullptr, &linear};
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(Linear &linear) {
Args *args = new Args{nullptr, &linear};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
class ForwardBindings {
public:
public:
struct Args {
CPUInfer* cpuinfer;
Linear* linear;
CPUInfer *cpuinfer;
Linear *linear;
int qlen;
const void* input;
void* output;
const void *input;
void *output;
};
static void inner(void* args) {
Args* args_ = (Args*)args;
args_->cpuinfer->enqueue(&Linear::forward, args_->linear, args_->qlen, args_->input, args_->output);
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(&Linear::forward, args_->linear,
args_->qlen, args_->input, args_->output);
}
static std::pair<intptr_t, intptr_t> cpuinfer_interface(Linear& linear, int qlen, intptr_t input, intptr_t output) {
Args* args = new Args{nullptr, &linear, qlen, (const void*)input, (void*)output};
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(Linear &linear, int qlen, intptr_t input,
intptr_t output) {
Args *args = new Args{nullptr, &linear, qlen, (const void *)input,
(void *)output};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
};
class MLPBindings {
public:
public:
class WarmUpBindinds {
public:
public:
struct Args {
CPUInfer* cpuinfer;
MLP* mlp;
CPUInfer *cpuinfer;
MLP *mlp;
};
static void inner(void* args) {
Args* args_ = (Args*)args;
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(&MLP::warm_up, args_->mlp);
}
static std::pair<intptr_t, intptr_t> cpuinfer_interface(MLP& mlp) {
Args* args = new Args{nullptr, &mlp};
static std::pair<intptr_t, intptr_t> cpuinfer_interface(MLP &mlp) {
Args *args = new Args{nullptr, &mlp};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
class ForwardBindings {
public:
public:
struct Args {
CPUInfer* cpuinfer;
MLP* mlp;
CPUInfer *cpuinfer;
MLP *mlp;
int qlen;
const void* input;
void* output;
const void *input;
void *output;
};
static void inner(void* args) {
Args* args_ = (Args*)args;
args_->cpuinfer->enqueue(&MLP::forward, args_->mlp, args_->qlen, args_->input, args_->output);
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(&MLP::forward, args_->mlp, args_->qlen,
args_->input, args_->output);
}
static std::pair<intptr_t, intptr_t> cpuinfer_interface(MLP& mlp, int qlen, intptr_t input, intptr_t output) {
Args* args = new Args{nullptr, &mlp, qlen, (const void*)input, (void*)output};
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(MLP &mlp, int qlen, intptr_t input,
intptr_t output) {
Args *args = new Args{nullptr, &mlp, qlen, (const void *)input,
(void *)output};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
};
class MOEBindings {
public:
public:
class WarmUpBindinds {
public:
public:
struct Args {
CPUInfer* cpuinfer;
MOE* moe;
CPUInfer *cpuinfer;
MOE *moe;
};
static void inner(void* args) {
Args* args_ = (Args*)args;
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(&MOE::warm_up, args_->moe);
}
static std::pair<intptr_t, intptr_t> cpuinfer_interface(MOE& moe) {
Args* args = new Args{nullptr, &moe};
static std::pair<intptr_t, intptr_t> cpuinfer_interface(MOE &moe) {
Args *args = new Args{nullptr, &moe};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
class ForwardBindings {
public:
public:
struct Args {
CPUInfer* cpuinfer;
MOE* moe;
CPUInfer *cpuinfer;
MOE *moe;
int qlen;
int k;
const uint64_t* expert_ids;
const float* weights;
const void* input;
void* output;
const uint64_t *expert_ids;
const float *weights;
const void *input;
void *output;
};
static void inner(void* args) {
Args* args_ = (Args*)args;
args_->cpuinfer->enqueue(&MOE::forward, args_->moe, args_->qlen, args_->k, args_->expert_ids, args_->weights, args_->input, args_->output);
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(
&MOE::forward, args_->moe, args_->qlen, args_->k,
args_->expert_ids, args_->weights, args_->input, args_->output);
}
static std::pair<intptr_t, intptr_t> cpuinfer_interface(MOE& moe, int qlen, int k, intptr_t expert_ids, intptr_t weights, intptr_t input, intptr_t output) {
Args* args = new Args{nullptr, &moe, qlen, k, (const uint64_t*)expert_ids, (const float*)weights, (const void*)input, (void*)output};
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(MOE &moe, int qlen, int k, intptr_t expert_ids,
intptr_t weights, intptr_t input, intptr_t output) {
Args *args = new Args{nullptr,
&moe,
qlen,
k,
(const uint64_t *)expert_ids,
(const float *)weights,
(const void *)input,
(void *)output};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
@ -149,8 +569,12 @@ PYBIND11_MODULE(cpuinfer_ext, m) {
auto linear_module = m.def_submodule("linear");
py::class_<LinearConfig>(linear_module, "LinearConfig")
.def(py::init([](int hidden_size, int intermediate_size, int stride, int group_max_len, intptr_t proj, int proj_type, int hidden_type) {
return LinearConfig(hidden_size, intermediate_size, stride, group_max_len, (void*)proj, (ggml_type)proj_type, (ggml_type)hidden_type);
.def(py::init([](int hidden_size, int intermediate_size, int stride,
int group_max_len, intptr_t proj, int proj_type,
int hidden_type) {
return LinearConfig(hidden_size, intermediate_size, stride,
group_max_len, (void *)proj,
(ggml_type)proj_type, (ggml_type)hidden_type);
}));
py::class_<Linear>(linear_module, "Linear")
.def(py::init<LinearConfig>())
@ -159,8 +583,15 @@ PYBIND11_MODULE(cpuinfer_ext, m) {
auto mlp_module = m.def_submodule("mlp");
py::class_<MLPConfig>(mlp_module, "MLPConfig")
.def(py::init([](int hidden_size, int intermediate_size, int stride, int group_max_len, intptr_t gate_proj, intptr_t up_proj, intptr_t down_proj, int gate_type, int up_type, int down_type, int hidden_type) {
return MLPConfig(hidden_size, intermediate_size, stride, group_max_len, (void*)gate_proj, (void*)up_proj, (void*)down_proj, (ggml_type)gate_type, (ggml_type)up_type, (ggml_type)down_type, (ggml_type)hidden_type);
.def(py::init([](int hidden_size, int intermediate_size, int stride,
int group_max_len, intptr_t gate_proj,
intptr_t up_proj, intptr_t down_proj, int gate_type,
int up_type, int down_type, int hidden_type) {
return MLPConfig(hidden_size, intermediate_size, stride,
group_max_len, (void *)gate_proj, (void *)up_proj,
(void *)down_proj, (ggml_type)gate_type,
(ggml_type)up_type, (ggml_type)down_type,
(ggml_type)hidden_type);
}));
py::class_<MLP>(mlp_module, "MLP")
.def(py::init<MLPConfig>())
@ -169,11 +600,84 @@ PYBIND11_MODULE(cpuinfer_ext, m) {
auto moe_module = m.def_submodule("moe");
py::class_<MOEConfig>(moe_module, "MOEConfig")
.def(py::init([](int expert_num, int routed_expert_num, int hidden_size, int intermediate_size, int stride, int group_min_len, int group_max_len, intptr_t gate_proj, intptr_t up_proj, intptr_t down_proj, int gate_type, int up_type, int down_type, int hidden_type) {
return MOEConfig(expert_num, routed_expert_num, hidden_size, intermediate_size, stride, group_min_len, group_max_len, (void*)gate_proj, (void*)up_proj, (void*)down_proj, (ggml_type)gate_type, (ggml_type)up_type, (ggml_type)down_type, (ggml_type)hidden_type);
.def(py::init([](int expert_num, int routed_expert_num, int hidden_size,
int intermediate_size, int stride, int group_min_len,
int group_max_len, intptr_t gate_proj,
intptr_t up_proj, intptr_t down_proj, int gate_type,
int up_type, int down_type, int hidden_type) {
return MOEConfig(expert_num, routed_expert_num, hidden_size,
intermediate_size, stride, group_min_len,
group_max_len, (void *)gate_proj, (void *)up_proj,
(void *)down_proj, (ggml_type)gate_type,
(ggml_type)up_type, (ggml_type)down_type,
(ggml_type)hidden_type);
}));
py::class_<MOE>(moe_module, "MOE")
.def(py::init<MOEConfig>())
.def("warm_up", &MOEBindings::WarmUpBindinds::cpuinfer_interface)
.def("forward", &MOEBindings::ForwardBindings::cpuinfer_interface);
auto kvcache_module = m.def_submodule("kvcache");
py::enum_<AnchorType>(kvcache_module, "AnchorType")
.value("FIXED", AnchorType::FIXED_ANCHOR)
.value("DYNAMIC", AnchorType::DYNAMIC)
.value("QUEST", AnchorType::QUEST)
.value("BLOCK_MAX", AnchorType::BLOCK_MAX)
.value("BLOCK_MEAN", AnchorType::BLOCK_MEAN);
py::enum_<ggml_type>(kvcache_module, "ggml_type")
.value("FP16", ggml_type::GGML_TYPE_F16)
.value("FP32", ggml_type::GGML_TYPE_F32)
.value("Q4_0", ggml_type::GGML_TYPE_Q4_0)
.value("Q8_0", ggml_type::GGML_TYPE_Q8_0);
py::enum_<RetrievalType>(kvcache_module, "RetrievalType")
.value("LAYER", RetrievalType::LAYER)
.value("KVHEAD", RetrievalType::KVHEAD)
.value("QHEAD", RetrievalType::QHEAD);
py::class_<KVCacheConfig>(kvcache_module, "KVCacheConfig")
.def(py::init<int, int, int, int, int, int, AnchorType, ggml_type,
RetrievalType, int, int, int, int, int, int>())
.def_readwrite("layer_num", &KVCacheConfig::layer_num)
.def_readwrite("kv_head_num", &KVCacheConfig::kv_head_num)
.def_readwrite("q_head_num", &KVCacheConfig::q_head_num)
.def_readwrite("head_dim", &KVCacheConfig::head_dim)
.def_readwrite("block_len", &KVCacheConfig::block_len)
.def_readwrite("anchor_num", &KVCacheConfig::anchor_num)
.def_readwrite("anchor_type", &KVCacheConfig::anchor_type)
.def_readwrite("kv_type", &KVCacheConfig::kv_type)
.def_readwrite("retrieval_type", &KVCacheConfig::retrieval_type)
.def_readwrite("layer_step", &KVCacheConfig::layer_step)
.def_readwrite("token_step", &KVCacheConfig::token_step)
.def_readwrite("layer_offset", &KVCacheConfig::layer_offset)
.def_readwrite("max_block_num", &KVCacheConfig::max_block_num)
.def_readwrite("max_batch_size", &KVCacheConfig::max_batch_size)
.def_readwrite("max_thread_num", &KVCacheConfig::max_thread_num);
py::class_<KVCache>(kvcache_module, "KVCache")
.def(py::init<KVCacheConfig>())
.def("get_cache_total_len", &KVCache::get_cache_total_len)
.def("update_cache_total_len",
[](KVCache &kvcache, int cache_total_len) {
kvcache.update_cache_total_len(cache_total_len);
})
.def("attn", &KVCacheBindings::AttnBindings::cpuinfer_interface)
.def(
"get_all_kvcache_one_layer",
&KVCacheBindings::GetAllKVCacheOneLayerBindings::cpuinfer_interface)
.def("get_and_update_kvcache_fp16",
&KVCacheBindings::GetAndUpdateKVCacheFp16Bindings::
cpuinfer_interface)
.def("get_kvcache_fp16",
&KVCacheBindings::GetKVCacheFp16Bindings::cpuinfer_interface)
.def("update_kvcache_fp16",
&KVCacheBindings::UpdateKVCacheFp16Bindings::cpuinfer_interface)
.def("update_importance",
&KVCacheBindings::UpdateImportanceBindings::cpuinfer_interface)
.def("attn_with_kvcache",
&KVCacheBindings::AttnWithKVCacheBindings::cpuinfer_interface)
.def("clear_importance_all_layers",
&KVCacheBindings::ClearImportanceAllLayersBindings::
cpuinfer_interface)
.def("calc_anchor_all_layers",
&KVCacheBindings::CalcAnchorAllLayersBindinds::cpuinfer_interface);
}
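// Note on the binding pattern above (editorial): each cpuinfer_interface call
// heap-allocates an Args struct and returns a (function pointer, Args*) pair as two
// intptr_t values; on the Python side these pairs are what CPUInfer.submit()
// enqueues and CPUInfer.sync() waits on (see the bundled kvcache test scripts).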

View file

@ -0,0 +1,727 @@
/**
* @Description :
* @Author : Jianwei Dong
* @Date : 2024-08-26 22:47:06
* @Version : 1.0.0
* @LastEditors : Jianwei Dong
* @LastEditTime : 2024-08-26 22:47:06
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#ifndef CPUINFER_OPERATOR_KVCACHE_H
#define CPUINFER_OPERATOR_KVCACHE_H
#include <algorithm>
#include <atomic>
#include <cassert>
#include <condition_variable>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <functional>
#include <future>
#include <iostream>
#include <memory>
#include <mutex>
#include <queue>
#include <random>
#include <stdexcept>
#include <thread>
#include <vector>
#include "../../cpu_backend/backend.h"
#include "llama.cpp/ggml-common.h"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"
#define CHUNK_SIZE 32
/**
* @brief Converts a ggml_type enum value to its corresponding string
* representation.
*
* This function provides a human-readable string representation for a given
* ggml_type enum value. The string can be used for logging, debugging, or
* displaying information in a user interface.
*
* @param type The ggml_type enum value to convert.
* @return A string representation of the enum value.
*/
std::string ggml_type_to_string(ggml_type type);
/**
* @enum AnchorType
* @brief Defines the types of anchors used in attention mechanisms.
*
* This enum specifies different types of anchors that can be used in attention
* mechanisms, such as fixed anchors, dynamic anchors, or special anchors like
* QUEST, BLOCK_MEAN, or BLOCK_MAX.
*/
enum AnchorType {
FIXED_ANCHOR, /**< A fixed anchor that does not change. */
DYNAMIC, /**< A dynamic anchor that can change over time. */
QUEST, /**< A special anchor type used for QUEST (Query and Embedding Space
Transformation). */
BLOCK_MEAN, /**< An anchor based on the mean of a block of data. */
BLOCK_MAX /**< An anchor based on the maximum value within a block of data.
*/
};
/**
* @brief Converts an AnchorType enum value to its corresponding string
* representation.
*
* This function provides a human-readable string representation for a given
* AnchorType enum value. The string can be used for logging, debugging, or
* displaying information in a user interface.
*
* @param anchor_type The AnchorType enum value to convert.
* @return A string representation of the enum value.
*/
std::string AnchorTypeToString(AnchorType anchor_type);
/**
* @enum RetrievalType
* @brief Defines the types of retrieval strategies in attention mechanisms.
*
* This enum specifies different retrieval strategies that can be used in
* attention mechanisms, such as layer-level retrieval, key-value head-level
* retrieval, or query head-level retrieval.
*/
enum RetrievalType {
LAYER, /**< Retrieval at the layer level. */
KVHEAD, /**< Retrieval at the key-value head level. */
QHEAD /**< Retrieval at the query head level. */
};
/**
* @brief Converts a RetrievalType enum value to its corresponding string
* representation.
*
* This function provides a human-readable string representation for a given
* RetrievalType enum value. The string can be used for logging, debugging, or
* displaying information in a user interface.
*
* @param retrieval_type The RetrievalType enum value to convert.
* @return A string representation of the enum value.
*/
std::string RetrievalTypeToString(RetrievalType retrieval_type);
/**
* @struct KVCacheConfig
* @brief Configuration structure for Key-Value (KV) Cache.
*
* This structure holds configuration parameters for setting up and managing
* a Key-Value (KV) Cache used in various attention mechanisms. It includes
* parameters such as the number of layers, the number of heads, the dimension
* of each head, block length, anchor information, and memory-related settings.
*/
struct KVCacheConfig {
int layer_num; /**< Number of layers in the model. */
int kv_head_num; /**< Number of heads in the KV Cache. */
int q_head_num; /**< Number of heads in the query. */
int head_dim; /**< Dimension of each head. */
int block_len; /**< Length of each block in the cache. */
int anchor_num; /**< Number of anchors used in attention. */
ggml_type kv_type; /**< Data type of the KV Cache (e.g., fp16, q8_0). */
// Controls the pre-allocated memory size
int max_block_num; /**< Maximum number of blocks that can be allocated. */
int max_batch_size; /**< Maximum batch size that can be processed. */
int max_thread_num; /**< Maximum number of threads that can be used. */
AnchorType
anchor_type; /**< Type of anchors used in the attention mechanism. */
RetrievalType
retrieval_type; /**< Type of retrieval strategy used in the cache. */
int layer_step; /**< Step size between layers. */
int token_step; /**< Step size between tokens. */
int layer_offset; /**< Offset value for layers. */
/**
* @brief Default constructor for KVCacheConfig.
*
* The constructor is defaulted; member variables are not explicitly
* initialized and must be set before the configuration is used.
*/
KVCacheConfig() = default;
/**
* @brief Parameterized constructor for KVCacheConfig.
*
* This constructor initializes the configuration with specific values
* for all member variables.
*
* @param layer_num The number of layers in the model.
* @param kv_head_num The number of heads in the KV Cache.
* @param q_head_num The number of heads in the query.
* @param head_dim The dimension of each head.
* @param block_len The length of each block in the cache.
* @param anchor_num The number of anchors used in attention.
* @param anchor_type The type of anchors used in the attention mechanism.
* @param kv_type The data type of the KV Cache (e.g., fp16, q8_0).
* @param retrieval_type The type of retrieval strategy used in the cache.
* @param layer_step The step size between layers.
* @param token_step The step size between tokens.
* @param layer_offset The offset value for layers.
* @param max_block_num The maximum number of blocks that can be allocated.
* @param max_batch_size The maximum batch size that can be processed.
* @param max_thread_num The maximum number of threads that can be used.
*/
KVCacheConfig(int layer_num, int kv_head_num, int q_head_num, int head_dim,
int block_len, int anchor_num, AnchorType anchor_type,
ggml_type kv_type, RetrievalType retrieval_type,
int layer_step, int token_step, int layer_offset,
int max_block_num, int max_batch_size, int max_thread_num);
};
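// Example (illustrative; values mirror the bundled benchmark scripts):
//   KVCacheConfig cfg(/*layer_num=*/10, /*kv_head_num=*/8, /*q_head_num=*/32,
//                     /*head_dim=*/128, /*block_len=*/128, /*anchor_num=*/1,
//                     AnchorType::DYNAMIC, GGML_TYPE_F16, RetrievalType::LAYER,
//                     /*layer_step=*/1, /*token_step=*/1, /*layer_offset=*/0,
//                     /*max_block_num=*/1024, /*max_batch_size=*/1,
//                     /*max_thread_num=*/64);
//   KVCache cache(cfg);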
/**
* @class KVCache
* @brief Manages the Key-Value (KV) Cache used in attention mechanisms.
*
* The KVCache class provides functionality for managing the Key-Value Cache,
* including resizing the cache, retrieving configuration parameters, and
* updating internal states. This class is typically used in transformer models
* to store and manage past key and value states for efficient attention
* computations.
*/
class KVCache {
public:
/**
* @brief Constructs a KVCache object with the given configuration.
*
* Initializes the KVCache with the specified configuration parameters,
* such as the number of layers, heads, head dimensions, and other
* relevant settings.
*
* @param config The configuration object containing initialization
* parameters.
*/
KVCache(KVCacheConfig config);
/**
* @brief Resizes the number of threads used by the cache.
*
* This function adjusts the number of threads that the cache can utilize.
* It allows dynamic reconfiguration of the parallel processing capabilities
* based on the current workload or system resources.
*
* @param thread_num The new number of threads to use.
*/
void ThreadResize(int thread_num);
/**
* @brief Resizes the batch size managed by the cache.
*
* This function adjusts the batch size that the cache can handle. It
* is useful when the input batch size changes dynamically, allowing
* the cache to be reconfigured accordingly.
*
* @param batch_size The new batch size.
*/
void BatchResize(int batch_size);
/**
* @brief Resizes the number of blocks managed by the cache.
*
* This function adjusts the number of blocks that the cache can manage.
* It allows dynamic reconfiguration of the block structure based on the
* current sequence length or other factors.
*
* @param block_num The new number of blocks.
*/
void BlockResize(int block_num);
/**
* @brief Gets the number of layers in the cache.
*
* @return The number of layers configured in the cache.
*/
int get_layer_num() { return config_.layer_num; }
/**
* @brief Gets the number of KV heads in the cache.
*
* @return The number of KV heads configured in the cache.
*/
int get_kv_head_num() { return config_.kv_head_num; }
/**
* @brief Gets the number of query heads in the cache.
*
* @return The number of query heads configured in the cache.
*/
int get_q_head_num() { return config_.q_head_num; }
/**
* @brief Gets the dimension of each head in the cache.
*
* @return The dimension of each head.
*/
int get_head_dim() { return config_.head_dim; }
/**
* @brief Gets the length of each block in the cache.
*
* @return The length of each block.
*/
int get_block_len() { return config_.block_len; }
/**
* @brief Gets the number of blocks for a specific layer.
*
* @param layer_id The ID of the layer for which to retrieve the block
* number.
* @return The number of blocks in the specified layer.
*/
int get_block_num(int layer_id) { return past_block_num_[layer_id]; }
/**
* @brief Gets the number of anchors in the cache.
*
* @return The number of anchors configured in the cache.
*/
int get_anchor_num() { return config_.anchor_num; }
/**
* @brief Gets the total length of the cache.
*
* @return The total length of the cache.
*/
int get_cache_total_len() { return cache_total_len_; }
/**
* @brief Gets the total number of blocks in the cache.
*
* This function computes and returns the total number of blocks in the
* cache based on the total cache length and the block length configuration.
*
* @return The total number of blocks in the cache.
*/
int get_cache_total_block_num() {
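// Ceiling division: a partially filled trailing block still counts as one block.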
return (cache_total_len_ + config_.block_len - 1) / config_.block_len;
}
/**
* @brief Updates the total length of the cache.
*
* This function sets a new total length for the cache, allowing dynamic
* adjustment of the cache size during runtime.
*
* @param cache_total_len The new total length of the cache.
*/
void update_cache_total_len(int cache_total_len) {
cache_total_len_ = cache_total_len;
}
void attn(const ggml_fp16_t *q_in, ggml_fp16_t *output, float *attn_lse,
int layer_idx, int generate_token_idx, int q_len, int batch_size,
int max_block_num, int *block_table, int *cache_seqlens,
int pick_block_num, int init_block_num, int local_block_num,
Backend *backend);
void update_kvcache_one_block_fp16(const ggml_fp16_t *k_in,
const ggml_fp16_t *v_in, int layer_id,
int block_idx, Backend *backend);
void get_kvcache_one_block_fp16(ggml_fp16_t *k_in, ggml_fp16_t *v_in,
int layer_id, int block_idx,
Backend *backend);
void update_importance_one_block(const ggml_fp16_t *importance,
int layer_id, int block_idx,
Backend *backend);
void get_importance_one_block(ggml_fp16_t *importance, int layer_id,
int block_idx, Backend *backend);
void get_anchor_one_block(ggml_fp16_t *anchor, int layer_id, int block_idx,
Backend *backend);
void update_anchor_one_block(const ggml_fp16_t *anchor, int layer_id,
int block_idx, Backend *backend);
void calc_anchor_all_layers(int *block_table, int *cache_seqlens,
int batch_size, int max_block_num,
Backend *backend);
void load_kvcache(std::string tensor_file_path, Backend *backend);
void dump_kvcache(int *block_table, int cache_total_len,
std::string tensor_file_path, Backend *backend);
void get_and_update_kvcache_fp16(ggml_fp16_t *k_in, ggml_fp16_t *v_in,
int layer_id, int *block_table,
int batch_size, int max_block_num,
int *cache_seqlens, int q_len,
Backend *backend);
void get_kvcache_fp16(ggml_fp16_t *k_in, ggml_fp16_t *v_in, int layer_id,
int *block_table, int batch_size, int max_block_num,
int *cache_seqlens, Backend *backend);
void update_kvcache_fp16(const ggml_fp16_t *k_in, const ggml_fp16_t *v_in,
int layer_id, int *block_table, int batch_size,
int max_block_num, int *cache_seqlens, int q_len,
Backend *backend);
void update_importance(const ggml_fp16_t *importance, int layer_id,
int *block_table, int batch_size, int max_block_num,
int *offset, int width, Backend *backend);
void attn_with_kvcache(const ggml_fp16_t *q_in, const ggml_fp16_t *k_in,
const ggml_fp16_t *v_in, ggml_fp16_t *output,
float *attn_lse, int layer_idx,
int generate_token_idx, int q_len, int batch_size,
int max_block_num, int *block_table,
int *cache_seqlens, int topk, int local,
Backend *backend);
void clear_importance_all_layers(int *block_table, int *cache_seqlens,
int batch_size, int max_block_num,
Backend *backend);
void clear_kvcache_all_layers(int *block_table, int *cache_seqlens,
int batch_size, int max_block_num,
Backend *backend);
void get_sincos(ggml_fp16_t *sin, ggml_fp16_t *cos, int seqlen);
void get_attn_sparsity(const ggml_fp16_t *q_in, float *attn_sparsity,
int layer_idx, int generate_token_idx, int q_len,
int batch_size, int max_block_num, int *block_table,
int *cache_seqlens, int *block_table_origin,
int *cache_seqlens_origin, int max_block_num_origin,
int topk, int local, Backend *backend);
void get_all_kvcache_one_layer(int layer_id, ggml_fp16_t *k_in,
ggml_fp16_t *v_in, Backend *backend);
private:
// Persistent data
KVCacheConfig config_;
int n_gqa_; // q_head_num / kv_head_num
int cache_total_len_; // Number of tokens in cache
std::vector<uint64_t> past_block_num_; // [layer_num]
std::vector<std::vector<std::vector<std::vector<block_q4_0>>>>
k_cache_q4; // [layer_num, kv_head_num, past_block_num, block_len *
// (head_dim / QK_4)]
std::vector<std::vector<std::vector<std::vector<block_q4_0>>>>
v_cache_q4; // [layer_num, kv_head_num, past_block_num, head_dim *
// (block_len / QK_4)]
std::vector<std::vector<std::vector<std::vector<block_q8_0>>>>
k_cache_q8; // [layer_num, kv_head_num, past_block_num, block_len *
// (head_dim / QK_8)]
std::vector<std::vector<std::vector<std::vector<block_q8_0>>>>
v_cache_q8; // [layer_num, kv_head_num, past_block_num, head_dim *
// (block_len / QK_8)]
std::vector<std::vector<std::vector<std::vector<ggml_fp16_t>>>>
k_cache_fp16_; // [layer_num, kv_head_num, past_block_num, block_len *
// head_dim]
std::vector<std::vector<std::vector<std::vector<ggml_fp16_t>>>>
v_cache_fp16_; // [layer_num, kv_head_num, past_block_num, head_dim *
// block_len]
std::vector<std::vector<std::vector<std::vector<ggml_fp16_t>>>>
importance_; // [layer_num, past_block_num, block_len,
// attention_head_num]
std::vector<ggml_fp16_t>
anchor_; // [layer_num * past_block_num * anchor_num *
// attention_head_num * head_dim]
// Runtime data
int64_t layer_id_;
int64_t block_idx_;
int *block_table_;
uint64_t block_num_;
int max_block_num_after_retrieval_;
// Rotary positional embeddings
std::vector<std::vector<ggml_fp16_t>> sin_; // [seq_len, head_dim]
std::vector<std::vector<ggml_fp16_t>> cos_; // [seq_len, head_dim]
// update/get
int seq_len_;
uint16_t *k_scales_; // q4_0
uint8_t *k_in_; // q4_0
uint16_t *v_scales_; // q4_0
uint8_t *v_in_; // q4_0
uint16_t *k_data_; // fp16
uint16_t *v_data_; // fp16
uint16_t *importance_data_; // fp16
uint16_t *anchor_data_; // fp16
// sparsity = (sigma(block lse / lse))
std::vector<std::vector<std::vector<float>>>
block_lse_; // [batch_size, max_block_num, q_head_num]
std::vector<std::vector<float>> attn_sparsity_; // [batch_size, q_head_num]
// attn
std::vector<std::vector<float>>
avg_q; // [batch_size, q_head_num * head_dim]
std::vector<std::vector<ggml_fp16_t>>
avg_q_fp16; // [batch_size, q_head_num * head_dim]
std::vector<
std::priority_queue<std::pair<float, int>,
std::vector<std::pair<float, int>>, std::greater<>>>
top_similar_block_;
std::vector<std::vector<float>> block_similar_;
std::vector<std::vector<std::vector<float>>> block_similar_kv_head_;
std::vector<std::vector<std::vector<float>>> block_similar_q_head_;
std::vector<int> cache_seqlens_; // [batch_size]
std::vector<int> selected_blocks_num_history_; // [layer_num // layer_step]
std::vector<std::vector<std::vector<int>>> selected_blocks_history_;
// [layer_num // layer_step, batch_size, max_block_num]
std::vector<std::vector<std::vector<std::vector<int>>>>
selected_blocks_history_kvhead_; // [layer_num // layer_step,
// batch_size, max_block_num,
// kv_head_num]
std::vector<std::vector<int>>
block_table_before_retrieval_; // [batch_size, max_block_num]
std::vector<std::vector<int>>
block_table_after_retrieval_; // [batch_size, pick_block_num]
std::vector<std::vector<std::vector<int>>>
block_table_before_retrieval_qhead_; // [batch_size, max_block_num,
// q_head_num]
std::vector<std::vector<std::vector<int>>>
block_table_after_retrieval_qhead_; // [batch_size, pick_block_num,
// q_head_num]
std::vector<std::vector<std::vector<int>>>
block_table_before_retrieval_kvhead_; // [batch_size, max_block_num,
// kv_head_num]
std::vector<std::vector<std::vector<int>>>
block_table_after_retrieval_kvhead_; // [batch_size, pick_block_num,
// kv_head_num]
std::vector<std::vector<std::unique_ptr<std::mutex>>>
mutex_; // [batch_size, kv_head_num]
std::vector<std::vector<std::vector<block_q8_0>>>
q_q8_0_; // [batch_size, kv_head_num, n_gqa * head_dim / QK8_0]
std::vector<std::vector<std::vector<float>>>
q_fp32_; // [batch_size, kv_head_num, n_gqa * head_dim]
std::vector<std::vector<std::vector<float>>>
output_fp32_; // [batch_size, kv_head_num, n_gqa * head_dim]
std::vector<std::vector<std::vector<float>>>
attn_lse_; // [batch_size, kv_head_num, n_gqa]
std::vector<std::pair<int, int>> thread_cur_head_idx_; // [thread_num]
std::vector<std::vector<block_q8_0>>
thread_local_output_q8_0_; // [thread_num, n_gqa * head_dim / QK8_0]
std::vector<std::vector<float>>
thread_local_attn_score_; // [thread_num, n_gqa * block_len]
std::vector<std::vector<float>>
thread_local_output_fp32_; // [thread_num, n_gqa * head_dim]
std::vector<std::vector<float>>
thread_local_attn_lse_; // [thread_num, n_gqa]
std::vector<std::vector<float>>
thread_local_cur_output_fp32_; // [thread_num, n_gqa * head_dim]
std::vector<std::vector<float>>
thread_local_cur_attn_lse_; // [thread_num, n_gqa]
std::vector<std::vector<uint8_t>>
thread_local_attn_mask_; // [thread_num, block_len // 8]
std::vector<std::vector<char>>
thread_local_draft_; // [thread_num, 2 * n_gqa * block_len + 6 * n_gqa *
// head_dim + 2 * block_len * head_dim]
// tmp space
std::vector<float> q_fp32; // [n_gqa * head_dim]
void quantize_q_(const uint16_t *q_in_data, int batch_size);
void attn_initialize_layer_(int batch_size, int layer_idx, int *block_table,
int &max_block_num, int *cache_seqlens);
void attn_initialize_kvhead_(int batch_size, int layer_idx,
int *block_table, int &max_block_num,
int *cache_seqlens);
void retrieval_kvcache_layer_(const uint16_t *q_in_data, int init_block_num,
int local_block_num, int pick_block_num,
int q_len, int generate_token_idx,
int batch_size, int layer_idx,
int *cache_seqlens, int &max_block_num,
Backend *backend);
void retrieval_kvcache_kvhead_(const uint16_t *q_in_data,
int init_block_num, int local_block_num,
int pick_block_num, int q_len,
int generate_token_idx, int batch_size,
int layer_idx, int *cache_seqlens,
int &max_block_num, Backend *backend);
void calculate_block_similarity_layer_(
const uint16_t *q_in_data, int batch_size, int layer_idx, int q_len,
int max_block_num, int *cache_seqlens, int init_block_num,
int local_block_num, int pick_block_num, Backend *backend);
void calculate_block_similarity_kvhead_(
const uint16_t *q_in_data, int batch_size, int layer_idx, int q_len,
int max_block_num, int *cache_seqlens, int init_block_num,
int local_block_num, int pick_block_num, Backend *backend);
void select_block_layer_(int batch_size, int layer_idx, int max_block_num,
int init_block_num, int local_block_num,
int pick_block_num);
void select_block_kvhead_(int batch_size, int layer_idx, int max_block_num,
int init_block_num, int local_block_num,
int pick_block_num);
void calculate_sparsity_layer_(const uint16_t *q_in_data,
float *attn_sparsity, int batch_size,
int max_block_num, int *block_table,
int *cache_seqlens, Backend *backend);
void calculate_sparsity_kvhead_(const uint16_t *q_in_data,
float *attn_sparsity, int batch_size,
int max_block_num, int *block_table,
int *cache_seqlens, Backend *backend);
void attention_kvhead_(const uint16_t *q_in_data, ggml_fp16_t *output,
float *attn_lse, int batch_size, Backend *backend);
void attention_layer_(const uint16_t *q_in_data, ggml_fp16_t *output,
float *attn_lse, int batch_size, Backend *backend);
/**
* @brief Computes attention with KV cache for one block.
*
* This function performs attention computation for one block using KV
* cache. The function supports different data types for Q, K, and V caches,
* and provides options for quantization. The function does not perform any
* dynamic memory allocation internally, so all necessary buffers must be
* pre-allocated externally.
*
* @param head_dim The dimension of the head.
* @param bsz The batch size.
* @param q_type The data type of Q (GGML data type). Only supports fp16 and
* q8_0.
* @param q Pointer to the Q tensor [bsz, head_dim]. The quantization is
* always applied along the head_dim dimension. The size must be
* bsz * head_dim/32 * qtype_size. If head_dim % 32 != 0, an error
* will be raised.
* @param past_kv_len The length of the past KV cache.
* @param past_kv_offset The offset in the past KV cache.
* @param is_full_attn Boolean flag indicating whether to use full attention
* (true for full 1 mask).
* @param attn_mask Pointer to the attention mask [bsz, past_kv_len]. If
* is_full_attn = false, a bit matrix is passed to
* represent the mask.
* @param k_type The data type of K cache (GGML data type). Only supports
* fp16, q4_0, and q8_0.
* @param k_quant_type Quantization type for K cache. 0 for per_token, 1 for
* per_channel. Other values will raise an error.
* @param k_cache Pointer to the K cache tensor [seq_len, head_dim]. If
* quant_type == 0, head_dim % 32 must be 0. If quant_type ==
* 1, seq_len % 32 must be 0.
* @param num_k_anchor The number of K anchors. If num_k_anchor == 0, it
* means no anchor is present.
* @param k_cache_anchors Pointer to the K cache anchors [num_k_anchor,
* head_dim]. The k_anchor_type must be fp16.
* @param k_cache_anchor_pos Pointer to the K cache anchor positions. Each
* token is associated with the nearest previous anchor position.
* @param v_type The data type of V cache (GGML data type).
* @param v_quant_type Quantization type for V cache.
* @param v_cache Pointer to the V cache tensor [head_dim, seq_len].
* @param num_v_anchor The number of V anchors.
* @param v_cache_anchors Pointer to the V cache anchors.
* @param v_cache_anchor_pos Pointer to the V cache anchor positions.
* @param attn_score Pre-allocated buffer for attention scores [bsz,
* past_kv_len].
* @param output Output tensor [bsz, head_dim] with the same type as q_type.
* @param lse Pre-allocated buffer [bsz] for the log-sum-exp of the
* attention scores.
* @param draft Pre-allocated temporary buffer. The buffer size should be
* enough to hold (2 * bsz * past_kv_len + 6 * bsz * head_dim + 2 *
* past_kv_len * head_dim + past_kv_len * head_dim / 32) bytes.
* @param rotary_angle Pointer to the rotary angle tensor.
* @param rotary_cos Pointer to the cosine values for rotary embedding.
* @param rotary_sin Pointer to the sine values for rotary embedding.
*/
void attn_with_kvcache_one_block_(
int head_dim, int bsz,
ggml_type q_type, // GGML data type of `Q`, only supports fp16 and q8_0
// [bsz, head_dim]
// Quantization is always on the head_dim dimension (per_token). If
// head_dim % 32 != 0, an error will be raised. The size must be bsz *
// head_dim/32 * qtype_size.
const void *q,
int past_kv_len, int past_kv_offset,
bool is_full_attn, // true indicates an all-ones mask
// If is_full_attn = false, a bit matrix representing the mask is
// passed. [bsz, past_kv_len]
const uint8_t *attn_mask,
ggml_type k_type, // GGML data type of `K Cache`, only supports fp16,
// q4_0, q8_0
int k_quant_type, // 0 for per_token, 1 for per_channel, others raise an
// error
// [seq_len, head_dim]
// If quant_type == 0, head_dim % 32 must be 0.
// If quant_type == 1, seq_len % 32 must be 0.
const void *k_cache,
// k_anchor_type must be fp16
int num_k_anchor, // num_k_anchor == 0 indicates no anchor
// [num_k_anchor, head_dim]
const void *k_cache_anchors,
// Each token is associated with the nearest previous position's anchor,
// with the same distance.
const int *k_cache_anchor_pos,
// v_cache similar to k_cache
ggml_type v_type, int v_quant_type,
// [head_dim, seq_len]
const void *v_cache, int num_v_anchor, const void *v_cache_anchors,
const int *v_cache_anchor_pos,
// Pre-allocated buffer for intermediate calculations [bsz,
// past_kv_len]. No malloc is performed inside this function.
float *attn_score,
// Output: [bsz, head_dim], with the same type as q_type
void *output,
// [bsz]
float *lse,
// Pre-allocated temporary buffer with sufficient size:
// (2 * bsz * past_kv_len + 6 * bsz * head_dim + 2 * past_kv_len *
// head_dim + past_kv_len * head_dim / 32) bytes.
void *draft,
// Apply rotary embedding online
const int *rotary_angle, const void *rotary_cos, const void *rotary_sin
// rotary_cos=None,
// rotary_sin=None,
// cache_seqlens: Optional[Union[(int, torch.Tensor)]] = None,
// cache_batch_idx: Optional[torch.Tensor] = None,
// rotary_interleaved=True,
// // Not supported for now
// window_size=(-1, -1), # -1 means infinite context window
// alibi_slopes=None,
);
};
/**
* @brief Scales a float32 vector by a given scalar value.
*
* This function multiplies each element of the input vector `y` by a scalar
* `v`. It uses platform-specific optimizations if available, such as Apple's
* Accelerate framework or SIMD instructions. If no specific optimization is
* available, the function falls back to a simple scalar multiplication loop.
*
* @param n The number of elements in the vector `y`.
* @param y The input vector to be scaled. The result will be stored in the same
* vector.
* @param v The scalar value by which to scale the vector.
*/
void ggml_vec_scale_f32(const int n, float *y, const float v);
#endif
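
A quick sizing sketch for the buffers that `attn_with_kvcache_one_block_` expects to be pre-allocated. This helper is illustrative only and not part of the header above; it simply restates the documented constraints:

```python
# Illustrative sizing helper for attn_with_kvcache_one_block_ buffers.
# Mirrors the constraints stated in the Doxygen comment; not part of the API.

def attn_one_block_buffer_sizes(head_dim: int, bsz: int, past_kv_len: int):
    FP32 = 4  # attn_score and lse are float32

    # Q is quantized along head_dim, so head_dim must be a multiple of 32.
    assert head_dim % 32 == 0, "head_dim % 32 != 0 raises an error"

    attn_score_bytes = bsz * past_kv_len * FP32   # [bsz, past_kv_len]
    lse_bytes = bsz * FP32                        # [bsz]

    # Scratch ("draft") buffer, as documented:
    #   2*bsz*past_kv_len + 6*bsz*head_dim
    # + 2*past_kv_len*head_dim + past_kv_len*head_dim/32  bytes
    draft_bytes = (2 * bsz * past_kv_len
                   + 6 * bsz * head_dim
                   + 2 * past_kv_len * head_dim
                   + past_kv_len * head_dim // 32)
    return attn_score_bytes, lse_bytes, draft_bytes
```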

File diff suppressed because it is too large

View file

@@ -0,0 +1,123 @@
/**
* @Description :
* @Author : Jianwei Dong
* @Date : 2024-08-26 22:47:06
* @Version : 1.0.0
* @LastEditors : Jianwei Dong
* @LastEditTime : 2024-08-26 22:47:06
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "kvcache.h"
void KVCache::load_kvcache(std::string tensor_file_path, Backend *backend) {
// Timer start
auto start = std::chrono::high_resolution_clock::now();
std::ifstream ifs_tensor(tensor_file_path, std::ios::binary);
if (!ifs_tensor) {
throw std::runtime_error("Failed to open tensor file");
}
ifs_tensor.read(reinterpret_cast<char *>(&cache_total_len_),
sizeof(cache_total_len_));
int past_block_num =
(cache_total_len_ + config_.block_len - 1) / config_.block_len;
printf("cache_total_len: %d, past_block_num: %d\n", cache_total_len_,
past_block_num);
for (int i = 0; i < config_.layer_num; ++i) {
past_block_num_[i] = past_block_num;
}
ifs_tensor.read(reinterpret_cast<char *>(anchor_.data()),
anchor_.size() * sizeof(ggml_fp16_t));
for (int i = 0; i < config_.layer_num; ++i) {
for (int j = 0; j < config_.kv_head_num; ++j) {
for (int k = 0; k < past_block_num_[i]; ++k) {
if (config_.kv_type == GGML_TYPE_F16) {
ifs_tensor.read(
reinterpret_cast<char *>(k_cache_fp16_[i][j][k].data()),
k_cache_fp16_[i][j][k].size() * sizeof(ggml_fp16_t));
ifs_tensor.read(
reinterpret_cast<char *>(v_cache_fp16_[i][j][k].data()),
v_cache_fp16_[i][j][k].size() * sizeof(ggml_fp16_t));
} else if (config_.kv_type == GGML_TYPE_Q4_0) {
ifs_tensor.read(
reinterpret_cast<char *>(k_cache_q4[i][j][k].data()),
k_cache_q4[i][j][k].size() * sizeof(block_q4_0));
ifs_tensor.read(
reinterpret_cast<char *>(v_cache_q4[i][j][k].data()),
v_cache_q4[i][j][k].size() * sizeof(block_q4_0));
}
}
}
for (int k = 0; k < past_block_num_[i]; ++k) {
for (int l = 0; l < config_.block_len; l++) {
ifs_tensor.read(
reinterpret_cast<char *>(importance_[i][k][l].data()),
importance_[i][k][l].size() * sizeof(ggml_fp16_t));
}
}
}
ifs_tensor.close();
// Timer end
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
printf("time of load: %f s\n", diff.count());
}
void KVCache::dump_kvcache(int *block_table, int cache_total_len,
std::string tensor_file_path, Backend *backend) {
// Timer start
auto start = std::chrono::high_resolution_clock::now();
std::ofstream ofs(tensor_file_path, std::ios::binary);
printf("dump_kvcache: %s\n", tensor_file_path.c_str());
if (!ofs.is_open()) {
std::cerr << "Cannot open file " << tensor_file_path << std::endl;
return;
}
ofs.write(reinterpret_cast<const char *>(&cache_total_len),
sizeof(cache_total_len));
int past_block_num =
(cache_total_len + config_.block_len - 1) / config_.block_len;
printf("cache_total_len: %d, past_block_num: %d\n", cache_total_len,
past_block_num);
ofs.write(reinterpret_cast<const char *>(anchor_.data()),
anchor_.size() * sizeof(ggml_fp16_t));
for (int i = 0; i < config_.layer_num; ++i) {
for (int j = 0; j < config_.kv_head_num; ++j) {
for (int k = 0; k < past_block_num; ++k) {
int block_idx = block_table[k];
if (config_.kv_type == GGML_TYPE_F16) {
ofs.write(reinterpret_cast<const char *>(
k_cache_fp16_[i][j][block_idx].data()),
k_cache_fp16_[i][j][block_idx].size() *
sizeof(ggml_fp16_t));
ofs.write(reinterpret_cast<const char *>(
v_cache_fp16_[i][j][block_idx].data()),
v_cache_fp16_[i][j][block_idx].size() *
sizeof(ggml_fp16_t));
} else if (config_.kv_type == GGML_TYPE_Q4_0) {
ofs.write(reinterpret_cast<const char *>(
k_cache_q4[i][j][block_idx].data()),
k_cache_q4[i][j][block_idx].size() *
sizeof(block_q4_0));
ofs.write(reinterpret_cast<const char *>(
v_cache_q4[i][j][block_idx].data()),
v_cache_q4[i][j][block_idx].size() *
sizeof(block_q4_0));
}
}
}
for (int k = 0; k < past_block_num; ++k) {
int block_idx = block_table[k];
for (int l = 0; l < config_.block_len; l++) {
ofs.write(reinterpret_cast<const char *>(
importance_[i][block_idx][l].data()),
importance_[i][block_idx][l].size() *
sizeof(ggml_fp16_t));
}
}
}
ofs.close();
// Timer end
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
printf("time of dump: %f s\n", diff.count());
}
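
The dump format written above is a flat binary stream in native byte order: `cache_total_len`, the anchor table, then K/V blocks ordered layer → kv head → block (fp16 or q4_0 depending on `kv_type`), and finally per-token importance values ordered layer → block → position. A hedged reader sketch for the header follows; the per-block element counts are an assumption about how the cache vectors are sized, not something this file defines:

```python
# Sketch: read the header of a KVCache dump produced by dump_kvcache above.
# Assumes native little-endian layout (x86) and a 4-byte int, as written by
# the C++ code; the rest of the stream follows the order described above.
import struct

def read_dump_header(path: str, block_len: int):
    with open(path, "rb") as f:
        (cache_total_len,) = struct.unpack("<i", f.read(4))
    # Same rounding as the C++ side:
    past_block_num = (cache_total_len + block_len - 1) // block_len
    return cache_total_len, past_block_num
```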

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@@ -3,8 +3,8 @@
* @Author : chenht2022
* @Date : 2024-07-12 10:07:58
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:34:58
* @LastEditors : kkk1nak0
* @LastEditTime : 2024-08-15 07:45:18
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "linear.h"
@@ -24,10 +24,14 @@ Linear::~Linear() {
shared_mem_buffer.dealloc(this);
}
void Linear::warm_up(Backend* backend) {
void Linear::warm_up(Backend *backend) {
std::vector<float> input_fp32(config_.input_size);
std::vector<uint8_t> input(config_.input_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type));
std::vector<uint8_t> output(config_.output_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type));
std::vector<uint8_t> input(config_.input_size *
ggml_type_size(config_.hidden_type) /
ggml_blck_size(config_.hidden_type));
std::vector<uint8_t> output(config_.output_size *
ggml_type_size(config_.hidden_type) /
ggml_blck_size(config_.hidden_type));
for (int i = 0; i < config_.input_size; i++) {
input_fp32[i] = 0;
}
@@ -45,7 +49,7 @@ void Linear::forward_many(int qlen, const void* input, void* output, Backend* ba
proj_input_ptr = proj_input_;
}
int nth = config_.output_size / config_.stride;
backend->do_work_stealing_job(nth, [&](int task_id) {
backend->do_work_stealing_job(nth, nullptr, [&](int task_id) {
int ith = task_id;
void* proj_ptr = (uint8_t*)proj_ + ith * config_.stride * config_.input_size * ggml_type_size(config_.proj_type) / ggml_blck_size(config_.proj_type);
float* proj_output_ptr = proj_output_ + ith * config_.stride;
@@ -57,7 +61,7 @@ void Linear::forward_many(int qlen, const void* input, void* output, Backend* ba
from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
}
}
});
}, nullptr);
if (config_.stride % ggml_blck_size(config_.hidden_type) != 0) {
from_float(proj_output_, output, qlen * config_.output_size, config_.hidden_type);
}
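
Note that every `do_work_stealing_job` call in this and the following diffs gains two extra `nullptr` arguments around the compute lambda, which suggests the backend now accepts optional per-task hooks before and after the compute callback. Below is a minimal Python stand-in of that three-slot calling pattern, assuming the extra slots are init and finalize hooks (the exact C++ signature is not shown in this diff):

```python
# Conceptual stand-in for the new do_work_stealing_job call shape:
#   backend->do_work_stealing_job(nth, nullptr, compute_lambda, nullptr);
# The two extra slots are assumed to be optional per-task init/finalize hooks.
from typing import Callable, Optional

def do_work_stealing_job(n_tasks: int,
                         init: Optional[Callable[[int], None]],
                         compute: Callable[[int], None],
                         finalize: Optional[Callable[[int], None]]) -> None:
    # Serial stand-in; a real backend distributes task ids across worker
    # threads with work stealing.
    for task_id in range(n_tasks):
        if init is not None:
            init(task_id)
        compute(task_id)
        if finalize is not None:
            finalize(task_id)

# Equivalent of passing nullptr for both hooks:
do_work_stealing_job(4, None, lambda task_id: None, None)
```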

View file

@@ -3,8 +3,8 @@
* @Author : chenht2022
* @Date : 2024-07-16 10:43:18
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:35:04
* @LastEditors : kkk1nak0
* @LastEditTime : 2024-08-15 07:44:38
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "mlp.h"
@@ -31,10 +31,14 @@ MLP::~MLP() {
shared_mem_buffer.dealloc(this);
}
void MLP::warm_up(Backend* backend) {
void MLP::warm_up(Backend *backend) {
std::vector<float> input_fp32(config_.hidden_size);
std::vector<uint8_t> input(config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type));
std::vector<uint8_t> output(config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type));
std::vector<uint8_t> input(config_.hidden_size *
ggml_type_size(config_.hidden_type) /
ggml_blck_size(config_.hidden_type));
std::vector<uint8_t> output(config_.hidden_size *
ggml_type_size(config_.hidden_type) /
ggml_blck_size(config_.hidden_type));
for (int i = 0; i < config_.hidden_size; i++) {
input_fp32[i] = 0;
}
@@ -42,9 +46,7 @@ void MLP::warm_up(Backend* backend) {
forward_many(1, input.data(), output.data(), backend);
}
static float act_fn(float x) {
return x / (1.0f + expf(-x));
}
static float act_fn(float x) { return x / (1.0f + expf(-x)); }
void MLP::forward_many(int qlen, const void* input, void* output, Backend* backend) {
const void* gate_input_ptr;
@@ -72,7 +74,7 @@ void MLP::forward_many(int qlen, const void* input, void* output, Backend* backe
}
}
int nth = config_.intermediate_size / config_.stride;
backend->do_work_stealing_job(nth, [&](int task_id) {
backend->do_work_stealing_job(nth, nullptr, [&](int task_id) {
int ith = task_id;
void* gate_proj_ptr = (uint8_t*)gate_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
float* gate_output_ptr = gate_output_ + ith * config_.stride;
@@ -90,12 +92,12 @@ void MLP::forward_many(int qlen, const void* input, void* output, Backend* backe
from_float(intermediate_fp32_ptr, down_input_ptr, config_.stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
}
}
});
}, nullptr);
if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) != 0) {
from_float(intermediate_fp32_, down_input_, qlen * config_.intermediate_size, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
}
nth = config_.hidden_size / config_.stride;
backend->do_work_stealing_job(nth, [&](int task_id) {
backend->do_work_stealing_job(nth, nullptr, [&](int task_id) {
int ith = task_id;
void* down_proj_ptr = (uint8_t*)down_proj_ + ith * config_.stride * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
float* down_output_ptr = down_output_ + ith * config_.stride;
@@ -107,7 +109,7 @@ void MLP::forward_many(int qlen, const void* input, void* output, Backend* backe
from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
}
}
});
}, nullptr);
if (config_.stride % ggml_blck_size(config_.hidden_type) != 0) {
from_float(down_output_, output, qlen * config_.hidden_size, config_.hidden_type);
}

View file

@@ -3,8 +3,8 @@
* @Author : chenht2022
* @Date : 2024-07-22 02:03:22
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:35:07
* @LastEditors : kkk1nak0
* @LastEditTime : 2024-08-15 07:43:41
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "moe.h"
@@ -121,7 +121,7 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
}
}
int nth = config_.intermediate_size / config_.stride;
backend->do_work_stealing_job(nth * k, [&](int task_id) {
backend->do_work_stealing_job(nth * k, nullptr, [&](int task_id) {
int expert_idx = task_id / nth;
uint64_t expert_id = expert_ids[expert_idx];
int ith = task_id % nth;
@@ -139,14 +139,14 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
void* down_input_ptr = s_down_input_[expert_idx] + ith * config_.stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
from_float(intermediate_fp32_ptr, down_input_ptr, config_.stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
}
});
}, nullptr);
if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) != 0) {
for (int i = 0; i < k; i++) {
from_float(s_intermediate_fp32_[i], s_down_input_[i], config_.intermediate_size, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
}
}
nth = config_.hidden_size / config_.stride;
backend->do_work_stealing_job(nth, [&](int task_id) {
backend->do_work_stealing_job(nth, nullptr, [&](int task_id) {
int ith = task_id;
for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
s_output_fp32_[i] = 0;
@@ -165,7 +165,7 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
void* output_ptr = (uint8_t*)output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
}
});
}, nullptr);
if (config_.stride % ggml_blck_size(config_.hidden_type) != 0) {
from_float(s_output_fp32_, output, config_.hidden_size, config_.hidden_type);
}
@@ -191,7 +191,7 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
m_local_down_output_ptr_[i] = m_local_down_output_ + offset * config_.hidden_size;
offset += m_local_num_[i];
}
backend->do_work_stealing_job(qlen, [&](int i) {
backend->do_work_stealing_job(qlen, nullptr, [&](int i) {
const void* gate_input_ptr;
const void* up_input_ptr;
if (config_.hidden_type == ggml_internal_get_type_traits(config_.gate_type).vec_dot_type && config_.hidden_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
@@ -220,10 +220,10 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
memcpy(m_local_gate_input_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type), gate_input_ptr, config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type));
memcpy(m_local_up_input_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type), up_input_ptr, config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type));
}
});
}, nullptr);
int stride = QK_K;
int nth = config_.intermediate_size / stride;
backend->do_work_stealing_job(nth * config_.expert_num, [&](int task_id) {
backend->do_work_stealing_job(nth * config_.expert_num, nullptr, [&](int task_id) {
int expert_idx = task_id / nth;
int ith = task_id % nth;
void* gate_input_ptr = m_local_gate_input_ptr_[expert_idx];
@@ -242,18 +242,18 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
void* down_input_ptr = m_local_down_input_ptr_[expert_idx] + i * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) + ith * stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
from_float(intermediate_fp32_ptr, down_input_ptr, stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
}
});
}, nullptr);
stride = QK_K;
nth = config_.hidden_size / stride;
backend->do_work_stealing_job(nth * config_.expert_num, [&](int task_id) {
backend->do_work_stealing_job(nth * config_.expert_num, nullptr, [&](int task_id) {
int expert_idx = task_id / nth;
int ith = task_id % nth;
void* down_input_ptr = m_local_down_input_ptr_[expert_idx];
void* down_proj_ptr = (uint8_t*)down_proj_ + (expert_idx * config_.hidden_size + ith * stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
float* down_output_ptr = m_local_down_output_ptr_[expert_idx] + ith * stride;
llamafile_sgemm(stride, m_local_num_[expert_idx], config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.hidden_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
});
backend->do_work_stealing_job(qlen, [&](int i) {
}, nullptr);
backend->do_work_stealing_job(qlen, nullptr, [&](int i) {
for (int e = 0; e < config_.hidden_size; e++) {
m_output_fp32_[i][e] = 0;
}
@@ -263,7 +263,7 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
}
}
from_float(m_output_fp32_[i], (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), config_.hidden_size, config_.hidden_type);
});
}, nullptr);
}
void MOE::forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend) {
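
In `MOE::forward_many` above, each routed token is first copied into a contiguous per-expert input buffer (the `m_local_pos_` / `m_local_num_` bookkeeping), so that each expert can run a single batched `llamafile_sgemm` over all tokens assigned to it before the results are scattered back with the routing weights. A small sketch of that grouping step, with illustrative names:

```python
# Sketch of the token-by-expert grouping used before the per-expert matmuls.
# Names are illustrative; the C++ code keeps the same bookkeeping in
# m_local_pos_ (slot per token copy) and m_local_num_ (tokens per expert).
import numpy as np

def group_tokens_by_expert(expert_ids: np.ndarray, expert_num: int):
    """expert_ids: [qlen, k] routing decisions for k experts per token."""
    qlen, k = expert_ids.shape
    local_pos = {}                                  # (i, j) -> slot in expert buffer
    local_num = np.zeros(expert_num, dtype=np.int64)
    for i in range(qlen):
        for j in range(k):
            e = int(expert_ids[i, j])
            local_pos[(i, j)] = int(local_num[e])   # where this token copy lands
            local_num[e] += 1                       # buffer length per expert
    return local_pos, local_num
```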

99 ktransformers/local_chat.py Executable file → Normal file
View file

@@ -1,20 +1,14 @@
# Copyright 2024 Shaoyuan Chen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Description :
Author : Boxin Zhang, Azure-Tang
Version : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os
import platform
import sys
project_dir = os.path.dirname(os.path.dirname(__file__))
sys.path.insert(0, project_dir)
import torch
@@ -31,6 +25,7 @@ import fire
from ktransformers.optimize.optimize import optimize_and_load_gguf
from ktransformers.models.modeling_deepseek import DeepseekV2ForCausalLM
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeForCausalLM
from ktransformers.models.modeling_llama import LlamaForCausalLM
from ktransformers.models.modeling_mixtral import MixtralForCausalLM
from ktransformers.util.utils import prefill_and_generate
from ktransformers.server.config.config import Config
@@ -38,38 +33,56 @@ from ktransformers.server.config.config import Config
custom_models = {
"DeepseekV2ForCausalLM": DeepseekV2ForCausalLM,
"Qwen2MoeForCausalLM": Qwen2MoeForCausalLM,
"LlamaForCausalLM": LlamaForCausalLM,
"MixtralForCausalLM": MixtralForCausalLM,
}
ktransformer_rules_dir = os.path.dirname(os.path.abspath(__file__)) + "/optimize/optimize_rules/"
default_optimize_rules ={
ktransformer_rules_dir = (
os.path.dirname(os.path.abspath(__file__)) + "/optimize/optimize_rules/"
)
default_optimize_rules = {
"DeepseekV2ForCausalLM": ktransformer_rules_dir + "DeepSeek-V2-Chat.yaml",
"Qwen2MoeForCausalLM": ktransformer_rules_dir + "Qwen2-57B-A14B-Instruct.yaml",
"LlamaForCausalLM": ktransformer_rules_dir + "Internlm2_5-7b-Chat-1m.yaml",
"MixtralForCausalLM": ktransformer_rules_dir + "Mixtral.yaml",
}
def local_chat(
model_path: str,
model_path: str | None = None,
optimize_rule_path: str = None,
gguf_path: str = None,
gguf_path: str | None = None,
max_new_tokens: int = 1000,
cpu_infer: int = Config().cpu_infer,
use_cuda_graph: bool = True,
prompt_file : str | None = None,
mode: str = "normal",
):
torch.set_grad_enabled(False)
Config().cpu_infer = cpu_infer
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
torch.set_default_dtype(config.torch_dtype)
if mode == 'long_context':
torch.set_default_dtype(torch.float16)
else:
torch.set_default_dtype(config.torch_dtype)
with torch.device("meta"):
if config.architectures[0] in custom_models:
print("using custom modeling_xxx.py.")
if "Qwen2Moe" in config.architectures[0]: # Qwen2Moe must use flash_attention_2 to avoid overflow.
if (
"Qwen2Moe" in config.architectures[0]
): # Qwen2Moe must use flash_attention_2 to avoid overflow.
config._attn_implementation = "flash_attention_2"
if "Mixtral" in config.architectures[0]:
if "Llama" in config.architectures[0]:
config._attn_implementation = "eager"
if "Mixtral" in config.architectures[0]:
config._attn_implementation = "flash_attention_2"
model = custom_models[config.architectures[0]](config)
else:
model = AutoModelForCausalLM.from_config(
@@ -95,26 +108,50 @@ def local_chat(
if model.generation_config.pad_token_id is None:
model.generation_config.pad_token_id = model.generation_config.eos_token_id
model.eval()
logging.basicConfig(level=logging.INFO)
system = platform.system()
if (system == u'Windows'):
os.system('cls')
if system == "Windows":
os.system("cls")
else:
os.system('clear')
os.system("clear")
while True:
content = input("Chat: ")
if content == "":
content = "Please write a piece of quicksort code in C++."
if content.startswith('"""'): # prefix """
# multi lines input
content = content[3:] + "\n"
while True:
line = input("")
if line.endswith('"""'):
# end multi lines input
line = line[:-3] # suffix """
if line:
content += line + "\n"
break
else:
content += line + "\n"
if content == "":
if prompt_file != None:
content = open(prompt_file, "r").read()
else:
content = "Please write a piece of quicksort code in C++."
elif os.path.isfile(content):
content = open(content, "r").read()
messages = [{"role": "user", "content": content}]
input_tensor = tokenizer.apply_chat_template(
messages, add_generation_prompt=True, return_tensors="pt"
)
torch.set_default_dtype(torch.bfloat16) # TODO: Remove this, replace dtype using config
generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph)
assert Config().long_context_config['max_seq_len'] > input_tensor.shape[1] + max_new_tokens, \
"please change max_seq_len in ~/.ktransformers/config.yaml"
torch.set_default_dtype(
torch.bfloat16
) # TODO: Remove this, replace dtype using config
generated = prefill_and_generate(
model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode
)
if __name__ == "__main__":
fire.Fire(local_chat)
fire.Fire(local_chat)
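
A usage sketch for the rewritten entry point; the paths are placeholders, and because the function is exposed through `fire.Fire`, the same keyword arguments double as command-line flags:

```python
# Illustrative invocation only; model/GGUF paths are placeholders.
# CLI equivalent (flags map 1:1 onto the keyword arguments via fire):
#   python ktransformers/local_chat.py --model_path <hf_dir> --gguf_path <gguf_dir>
from ktransformers.local_chat import local_chat

local_chat(
    model_path="/path/to/DeepSeek-V2-Chat",        # HF config/tokenizer dir
    gguf_path="/path/to/DeepSeek-V2-Chat-GGUF",    # quantized weights
    max_new_tokens=1000,
    use_cuda_graph=True,
    mode="normal",   # "long_context" switches the default dtype to float16
)
```

At the `Chat:` prompt, wrapping input in triple quotes (`"""` ... `"""`) enables multi-line entry, and an empty line falls back to `prompt_file` (if given) or the built-in quicksort prompt.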

View file

@@ -0,0 +1,203 @@
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""LLaMA model configuration"""
from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_rope_utils import rope_config_validation
class LlamaConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the LLaMA-7B.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 32000):
Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`LlamaModel`]
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 11008):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Transformer decoder.
num_key_value_heads (`int`, *optional*):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by meanpooling all the original heads within that group. For more details checkout [this
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
`num_attention_heads`.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with. Llama 1 supports up to 2048 tokens,
Llama 2 up to 4096, CodeLlama up to 16384.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*):
Padding token id.
bos_token_id (`int`, *optional*, defaults to 1):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 2):
End of stream token id.
pretraining_tp (`int`, *optional*, defaults to 1):
Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to
understand more about it. This value is necessary to ensure exact reproducibility of the pretraining
results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232).
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
accordingly.
Expected contents:
`rope_type` (`str`):
The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
'llama3'], with 'default' being the original RoPE implementation.
`factor` (`float`, *optional*):
Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
most scaling types, a `factor` of x will enable the model to handle sequences of length x *
original maximum pre-trained length.
`original_max_position_embeddings` (`int`, *optional*):
Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
pretraining.
`attention_factor` (`float`, *optional*):
Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
computation. If unspecified, it defaults to value recommended by the implementation, using the
`factor` field to infer the suggested value.
`beta_fast` (`float`, *optional*):
Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
ramp function. If unspecified, it defaults to 32.
`beta_slow` (`float`, *optional*):
Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
ramp function. If unspecified, it defaults to 1.
`short_factor` (`List[float]`, *optional*):
Only used with 'longrope'. The scaling factor to be applied to short contexts (<
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
size divided by the number of attention heads divided by 2
`long_factor` (`List[float]`, *optional*):
Only used with 'longrope'. The scaling factor to be applied to long contexts (<
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
size divided by the number of attention heads divided by 2
`low_freq_factor` (`float`, *optional*):
Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
`high_freq_factor` (`float`, *optional*):
Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
mlp_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
```python
>>> from transformers import LlamaModel, LlamaConfig
>>> # Initializing a LLaMA llama-7b style configuration
>>> configuration = LlamaConfig()
>>> # Initializing a model from the llama-7b style configuration
>>> model = LlamaModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "llama"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=32000,
hidden_size=4096,
intermediate_size=11008,
num_hidden_layers=32,
num_attention_heads=32,
num_key_value_heads=None,
hidden_act="silu",
max_position_embeddings=2048,
initializer_range=0.02,
rms_norm_eps=1e-6,
use_cache=True,
pad_token_id=None,
bos_token_id=1,
eos_token_id=2,
pretraining_tp=1,
tie_word_embeddings=False,
rope_theta=10000.0,
rope_scaling=None,
attention_bias=False,
attention_dropout=0.0,
mlp_bias=False,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.pretraining_tp = pretraining_tp
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.mlp_bias = mlp_bias
# Validate the correctness of rotary position embeddings parameters
# BC: if there is a 'type' field, move it to 'rope_type'.
if self.rope_scaling is not None and "type" in self.rope_scaling:
self.rope_scaling["rope_type"] = self.rope_scaling["type"]
rope_config_validation(self)
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
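
A small illustration of the `rope_scaling` backward-compatibility shim at the end of `__init__`, which copies a legacy `type` key into `rope_type` before validation (values chosen only for the example):

```python
# Assumes ktransformers is importable; the rope_scaling values are arbitrary.
from ktransformers.models.configuration_llama import LlamaConfig

cfg = LlamaConfig(rope_scaling={"type": "dynamic", "factor": 2.0})
assert cfg.rope_scaling["rope_type"] == "dynamic"   # legacy "type" was copied over
```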

File diff suppressed because it is too large

View file

@@ -1,67 +1,128 @@
'''
"""
Description :
Author : Boxin Zhang
Version : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
"""
from torch import nn
from ktransformers.models.modeling_deepseek import DeepseekV2YarnRotaryEmbedding, DeepseekV2RotaryEmbedding
from transformers import ROPE_INIT_FUNCTIONS
from ktransformers.models.modeling_llama import (
LlamaRotaryEmbedding,
LlamaLinearScalingRotaryEmbedding,
LlamaDynamicNTKScalingRotaryEmbedding,
)
from ktransformers.models.modeling_deepseek import (
DeepseekV2YarnRotaryEmbedding,
DeepseekV2RotaryEmbedding,
)
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.custom_gguf import GGUFLoader
from ktransformers.util.utils import InferenceState
from transformers.configuration_utils import PretrainedConfig
# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->Qwen2Moe
class RotaryEmbedding(BaseInjectedModule, DeepseekV2RotaryEmbedding):
def __init__(self,
key: str,
gguf_loader : GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
# device: str = "cuda",
generate_device: str = "cuda",
prefill_device: str = "cuda",
**kwargs):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
self.orig_module.__init__(orig_module.dim,
orig_module.max_position_embeddings,
orig_module.base)
def __init__(
self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
# device: str = "cuda",
generate_device: str = "cuda",
prefill_device: str = "cuda",
**kwargs,
):
BaseInjectedModule.__init__(
self, key, gguf_loader, config, orig_module, generate_device, **kwargs
)
self.orig_module.__init__(
orig_module.dim, orig_module.max_position_embeddings, orig_module.base
)
self.generate_device = generate_device
self.prefill_device = prefill_device
def load(self):
self.orig_module.__init__(self.orig_module.dim,
self.orig_module.__init__(
self.orig_module.dim,
self.orig_module.max_position_embeddings,
self.orig_module.base,
self.device)
class YarnRotaryEmbedding(BaseInjectedModule, DeepseekV2YarnRotaryEmbedding):
def __init__(self,
key: str,
gguf_loader : GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
# device: str = "cuda",
generate_device: str = "cuda",
prefill_device: str = "cuda",
**kwargs):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
self.orig_module.__init__(orig_module.dim,
self.device,
)
class RotaryEmbeddingV2(BaseInjectedModule, LlamaRotaryEmbedding):
def __init__(
self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
generate_device: str = "cuda",
prefill_device: str = "cuda",
**kwargs,
):
BaseInjectedModule.__init__(
self, key, gguf_loader, config, orig_module, generate_device, **kwargs
)
self.orig_module.__init__(
orig_module.dim,
orig_module.max_position_embeddings,
orig_module.base,
None, #device
None,
orig_module.scaling_factor,
orig_module.rope_type,
orig_module.config,
)
self.generate_device = generate_device
self.prefill_device = prefill_device
def load(self):
self.orig_module.__init__(
self.orig_module.dim,
self.orig_module.max_position_embeddings,
self.orig_module.base,
self.device,
self.orig_module.scaling_factor,
self.orig_module.rope_type,
self.orig_module.config,
)
class YarnRotaryEmbedding(BaseInjectedModule, DeepseekV2YarnRotaryEmbedding):
def __init__(
self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
# device: str = "cuda",
generate_device: str = "cuda",
prefill_device: str = "cuda",
**kwargs,
):
BaseInjectedModule.__init__(
self, key, gguf_loader, config, orig_module, generate_device, **kwargs
)
self.orig_module.__init__(
orig_module.dim,
orig_module.max_position_embeddings,
orig_module.base,
None, # device
orig_module.scaling_factor,
orig_module.original_max_position_embeddings,
orig_module.beta_fast,
orig_module.beta_slow,
orig_module.mscale,
orig_module.mscale_all_dim)
orig_module.mscale_all_dim,
)
self.generate_device = generate_device
self.prefill_device = prefill_device
def load(self):
self.orig_module.__init__(self.orig_module.dim,
self.orig_module.__init__(
self.orig_module.dim,
self.orig_module.max_position_embeddings,
self.orig_module.base,
self.generate_device,
@@ -70,5 +131,42 @@ class YarnRotaryEmbedding(BaseInjectedModule, DeepseekV2YarnRotaryEmbedding):
self.orig_module.beta_fast,
self.orig_module.beta_slow,
self.orig_module.mscale,
self.orig_module.mscale_all_dim)
self.orig_module.mscale_all_dim,
)
class DynamicNTKScalingRotaryEmbedding(
BaseInjectedModule, LlamaDynamicNTKScalingRotaryEmbedding
):
def __init__(
self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
**kwargs,
):
BaseInjectedModule.__init__(
self, key, gguf_loader, config, orig_module, device, **kwargs
)
self.orig_module.__init__(
orig_module.dim,
orig_module.max_position_embeddings,
orig_module.base,
None, # device
orig_module.scaling_factor,
orig_module.rope_type,
orig_module.config,
)
def load(self):
self.orig_module.__init__(
self.orig_module.dim,
self.orig_module.max_position_embeddings,
self.orig_module.base,
self.orig_module.device,
self.orig_module.scaling_factor,
self.orig_module.rope_type,
self.orig_module.config,
)

View file

@@ -7,16 +7,22 @@ Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
import torch
from torch import nn
import warnings
import torch.nn.functional as F
from ktransformers.operators.models import KLlamaModel
from ktransformers.models.configuration_deepseek import DeepseekV2Config
from ktransformers.models.configuration_llama import LlamaConfig
from ktransformers.models.modeling_llama import LlamaRotaryEmbedding
from ktransformers.models.modeling_deepseek import DeepseekV2Attention, apply_rotary_pos_emb
from typing import Optional, Tuple
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.custom_gguf import GGUFLoader
import logging
from transformers.configuration_utils import PretrainedConfig
from transformers.cache_utils import Cache
logger = logging.getLogger("attention")
class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
attn_mask: Optional[torch.Tensor] = None
def __init__(self,
key: str,
@@ -24,10 +30,12 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
chunck_size: int = 1000,
**kwargs):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
self.orig_module.__init__(orig_module.config,
orig_module.layer_idx)
self.chunck_size = chunck_size # TODO, generate chunck_size automatically.
def get_absorbed(self) -> Tuple[torch.Tensor, torch.Tensor]:
if not (hasattr(self, 'q_absorb') and hasattr(self, 'out_absorb')):
@@ -157,9 +165,8 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
"Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
)
bsz, q_len, _ = hidden_states.size()
chunck_size = 256 # TODO, generate chunck_size automatically.
if q_len <= chunck_size:
if q_len <= self.chunck_size:
return self.forward_chunck(
hidden_states,
attention_mask,
@@ -176,24 +183,170 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
cur_idx = 0
while cur_idx < q_len:
if attention_mask is not None:
chunk_mask = attention_mask[:, :, cur_idx:min(cur_idx + chunck_size, q_len), ...]
chunk_mask = attention_mask[:, :, cur_idx:min(cur_idx + self.chunck_size, q_len), ...]
else:
chunk_mask = None
# generate chunk_mask automatically.
self.attn_mask = \
torch.zeros(1, 1, self.chunck_size, past_key_value.max_cache_len, device=hidden_states.device) \
if self.attn_mask is None \
else self.attn_mask
self.attn_mask[:, :, :, cur_idx:min(cur_idx+self.chunck_size, past_key_value.max_cache_len)] = \
-1e+38 * torch.triu(torch.ones(self.chunck_size, self.chunck_size, device=hidden_states.device), diagonal=1)\
[:,:min(self.chunck_size, min(past_key_value.max_cache_len-cur_idx, self.chunck_size))]
self.attn_mask[:, :, :, cur_idx+self.chunck_size:] = -1e+38
self.attn_mask[:, :, :, :cur_idx] = 0
chunck_mask = torch.narrow(self.attn_mask, 2, 0, min(self.chunck_size, q_len-cur_idx))
cur_output, _, _ = self.forward_chunck(
hidden_states[:, cur_idx:min(cur_idx + chunck_size, q_len), ...],
chunk_mask,
position_ids[:, cur_idx:min(cur_idx + chunck_size, q_len)],
hidden_states[:, cur_idx:min(cur_idx + self.chunck_size, q_len), ...],
chunck_mask,
position_ids[:, cur_idx:min(cur_idx + self.chunck_size, q_len)],
past_key_value,
output_attentions,
use_cache,
cache_position[cur_idx:min(cur_idx + chunck_size, q_len)],
cache_position[cur_idx:min(cur_idx + self.chunck_size, q_len)],
**kwargs
)
cur_idx += chunck_size
cur_idx += self.chunck_size
if attn_output is None:
attn_output = cur_output
else:
attn_output = torch.cat((attn_output, cur_output), dim=-2)
return attn_output, None, past_key_value
def rotate_half(x):
"""Rotates half the hidden dims of the input."""
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=-1)
class KLlamaAttention(BaseInjectedModule):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self,
key: str,
gguf_loader : GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
**kwargs):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
self.orig_module.__init__(orig_module.config,
orig_module.layer_idx)
def apply_rotary_pos_emb(self, q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
"""Applies Rotary Position Embedding to the query and key tensors.
Args:
q (`torch.Tensor`): The query tensor.
k (`torch.Tensor`): The key tensor.
cos (`torch.Tensor`): The cosine part of the rotary embedding.
sin (`torch.Tensor`): The sine part of the rotary embedding.
position_ids (`torch.Tensor`, *optional*):
Deprecated and unused.
unsqueeze_dim (`int`, *optional*, defaults to 1):
The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
`tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
"""
cos = cos.unsqueeze(unsqueeze_dim)
sin = sin.unsqueeze(unsqueeze_dim)
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
return q_embed, k_embed
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
bsz, q_len, _ = hidden_states.size()
if self.config.pretraining_tp > 1:
key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
query_slices = self.q_proj.weight.split(
(self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
)
key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
query_states = torch.cat(query_states, dim=-1)
key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
key_states = torch.cat(key_states, dim=-1)
value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
value_states = torch.cat(value_states, dim=-1)
else:
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
if position_embeddings is None:
logger.warning(
"The attention layers in this model are transitioning from computing the RoPE embeddings internally "
"through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
"`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be "
"removed and `position_embeddings` will be mandatory."
)
cos, sin = self.rotary_emb(value_states, position_ids)
else:
cos, sin = position_embeddings
query_states, key_states = self.apply_rotary_pos_emb(query_states, key_states, cos, sin)
if q_len == 1:
position_ids = position_ids[0][-1].unsqueeze(0).unsqueeze(0)
query_states = query_states[:, :, -1:]
key_states = key_states[:, :, -1:]
attn_output = KLlamaModel.dynamic_sdpa.apply(
self.layer_idx,
bsz,
position_ids[0][0],
query_states.transpose(1, 2).to(torch.float16),
key_states.transpose(1, 2).to(torch.float16),
value_states.transpose(1, 2).to(torch.float16),
mode="prefill" if q_len > 1 else "generate",
)
if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
f" {attn_output.size()}"
)
attn_output = attn_output.transpose(1, 2).contiguous()
attn_output = attn_output.reshape(bsz, q_len, -1)
if self.config.pretraining_tp > 1:
attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
else:
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value
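
The chunked prefill path in `KDeepseekV2Attention.forward` rebuilds `attn_mask` for every chunk so that keys before the chunk stay fully visible, keys inside the chunk are causally masked, and keys after it are blocked with `-1e38`. A standalone sketch of that mask construction (the helper name and signature are illustrative):

```python
# Illustrative re-statement of the per-chunk mask logic above.
import torch

def chunk_causal_mask(chunk_size: int, cur_idx: int, max_cache_len: int,
                      device: str = "cpu") -> torch.Tensor:
    mask = torch.zeros(1, 1, chunk_size, max_cache_len, device=device)
    inner = min(chunk_size, max_cache_len - cur_idx)
    # Causal (upper-triangular) block for the keys belonging to this chunk.
    tri = -1e38 * torch.triu(
        torch.ones(chunk_size, chunk_size, device=device), diagonal=1
    )[:, :inner]
    mask[:, :, :, cur_idx:cur_idx + inner] = tri
    mask[:, :, :, cur_idx + chunk_size:] = -1e38   # later chunks not yet visible
    # Columns [0, cur_idx) stay 0: earlier context is fully attendable.
    return mask
```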

View file

@@ -1,18 +1,746 @@
#!/usr/bin/env python
# coding=utf-8
"""
Description : This script defines the `CPUInferKVCache` and `CPUInfer` classes for performing inference
with a Key-Value Cache on the CPU. The `CPUInferKVCache` class is responsible for configuring
and managing key-value caches, updating and retrieving cache data, and handling attention
operations. It supports different cache types (e.g., Q4_0, FP16) and retrieval strategies
(e.g., shared, separate). The `CPUInfer` class handles task submission and synchronization
on the CPU, with optional CUDA stream integration for tasks involving GPU acceleration.
These classes facilitate efficient caching and memory management for deep learning models
that leverage key-value attention mechanisms, particularly on CPU-based systems.
Author : djw
Date : 2024-08-26 23:25:24
Version : 1.0.0
LastEditors : djw
LastEditTime : 2024-08-26 23:25:24
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import sys, os
from typing import Any
import torch
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build"))
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build", "Release"))
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build", "Debug"))
import cpuinfer_ext
from ktransformers.server.config.config import Config
class CPUInferKVCache:
def __init__(
self,
layer_num: int = 32,
kv_head_num: int = 8,
q_head_num: int = 32,
head_dim: int = 128,
block_len: int = 256,
anchor_num: int = 4,
anchor_type: str = "FIXED",
kv_type: str = "Q4_0",
retrieval_type: str = "SHARED",
layer_step: int = 1,
token_step: int = 1,
layer_offset: int = 0,
max_thread_num: int = 32,
max_batch_size: int = 4,
max_block_num: int = 512,
):
if anchor_type == "FIXED":
anchor_type = cpuinfer_ext.kvcache.AnchorType.FIXED
elif anchor_type == "QUEST":
anchor_type = cpuinfer_ext.kvcache.AnchorType.QUEST
elif anchor_type == "DYNAMIC":
anchor_type = cpuinfer_ext.kvcache.AnchorType.DYNAMIC
elif anchor_type == "BLOCK_MEAN":
anchor_type = cpuinfer_ext.kvcache.AnchorType.BLOCK_MEAN
elif anchor_type == "BLOCK_MAX":
anchor_type = cpuinfer_ext.kvcache.AnchorType.BLOCK_MAX
else:
raise ValueError(f"Unknown anchor type: {anchor_type}")
if kv_type == "FP16":
kv_type = cpuinfer_ext.kvcache.ggml_type.FP16
elif kv_type == "FP32":
assert False, "FP32 is not supported yet."
kv_type = cpuinfer_ext.kvcache.ggml_type.FP32
elif kv_type == "Q4_0":
kv_type = cpuinfer_ext.kvcache.ggml_type.Q4_0
elif kv_type == "Q8_0":
kv_type = cpuinfer_ext.kvcache.ggml_type.Q8_0
else:
raise ValueError(f"Unknown kv type: {kv_type}")
if retrieval_type == "SHARED":
retrieval_type = cpuinfer_ext.kvcache.RetrievalType.LAYER
elif retrieval_type == "INDIVIDUAL":
retrieval_type = cpuinfer_ext.kvcache.RetrievalType.QHEAD
elif retrieval_type == "SEPARATE":
retrieval_type = cpuinfer_ext.kvcache.RetrievalType.KVHEAD
self.config = cpuinfer_ext.kvcache.KVCacheConfig(
layer_num,
kv_head_num,
q_head_num,
head_dim,
block_len,
anchor_num,
anchor_type,
kv_type,
retrieval_type,
layer_step,
token_step,
layer_offset,
max_block_num,
max_batch_size,
max_thread_num,
)
self.kvcache = cpuinfer_ext.kvcache.KVCache(self.config)
def load_kvcache(self, tensor_file_path: str):
if not os.path.exists(tensor_file_path):
raise FileNotFoundError(f"The file {tensor_file_path} does not exist.")
return self.kvcache.load_kvcache(tensor_file_path,)
def dump_kvcache(
self, block_table: torch.Tensor, cache_total_len: int, tensor_file_path: str
):
assert (
block_table.dim() == 1
and block_table.dtype == torch.int
and block_table.is_contiguous()
and block_table.device == torch.device("cpu")
), "block_table dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
block_table.dim(),
block_table.size(),
block_table.dtype,
block_table.is_contiguous(),
block_table.device,
)
assert (
cache_total_len > 0
and cache_total_len <= self.config.block_len * block_table.size(0)
), "cache_total_len: {}".format(cache_total_len)
if not os.path.exists(os.path.dirname(tensor_file_path)):
os.makedirs(os.path.dirname(tensor_file_path))
return self.kvcache.dump_kvcache(
block_table.data_ptr(),
cache_total_len,
tensor_file_path,
)
def update_cache_total_len(self, cache_total_len: int):
assert cache_total_len > 0, "cache_total_len: {}".format(cache_total_len)
self.kvcache.update_cache_total_len(cache_total_len)
# q_in: (bsz, q_len, q_head_num, head_dim)
# output: (bsz, q_len, q_head_num, head_dim)
# attn_lse: (bsz, q_len, q_head_num)
# block_table: (bsz, max_block_num)
def attn(
self,
q_in: torch.Tensor,
output: torch.Tensor,
attn_lse: torch.Tensor,
layer_idx: int,
generate_token_idx: int,
block_table: torch.Tensor | None = None,
cache_seqlens: torch.Tensor | None = None,
pick_block_num: int | None = None,
init_block_num: int | None = None,
local_block_num: int | None = None,
):
assert (
q_in.dim() == 4
and q_in.size(2) == self.config.q_head_num
and q_in.size(3) == self.config.head_dim
and q_in.dtype == torch.float16
and q_in.is_contiguous()
and q_in.device == torch.device("cpu")
), "q_in dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
q_in.dim(), q_in.size(), q_in.dtype, q_in.is_contiguous(), q_in.device
)
batch_size = q_in.size(0)
q_len = q_in.size(1)
assert (block_table is None) or (
block_table.dim() == 2
and block_table.size(0) == batch_size
and block_table.dtype == torch.int
and block_table.is_contiguous()
and block_table.device == torch.device("cpu")
), "block_table dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
block_table.dim(),
block_table.size(),
block_table.dtype,
block_table.is_contiguous(),
block_table.device,
)
max_block_num = block_table.size(1) if block_table is not None else 0
assert (
output.dim() == 4
and output.size(0) == batch_size
and output.size(2) == self.config.q_head_num
and output.size(1) == q_len
and output.size(3) == self.config.head_dim
and output.dtype == torch.float16
and output.is_contiguous()
and output.device == torch.device("cpu")
), "output dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
output.dim(),
output.size(),
output.dtype,
output.is_contiguous(),
output.device,
)
assert (
attn_lse.dim() == 3
and attn_lse.size(0) == batch_size
and attn_lse.size(1) == q_len
and attn_lse.size(2) == self.config.q_head_num
and attn_lse.dtype == torch.float32
and attn_lse.is_contiguous()
and attn_lse.device == torch.device("cpu")
), "attn_lse dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
attn_lse.dim(),
attn_lse.size(),
attn_lse.dtype,
attn_lse.is_contiguous(),
attn_lse.device,
)
assert (
layer_idx >= 0 and layer_idx < self.config.layer_num
), "layer_idx: {}".format(layer_idx)
assert (cache_seqlens is None) or (
cache_seqlens.dim() == 1
and cache_seqlens.size(0) == batch_size
and cache_seqlens.dtype == torch.int
and cache_seqlens.is_contiguous()
and cache_seqlens.device == torch.device("cpu")
), "cache_seqlens dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
cache_seqlens.dim(),
cache_seqlens.size(),
cache_seqlens.dtype,
cache_seqlens.is_contiguous(),
cache_seqlens.device,
)
return self.kvcache.attn(
q_in.data_ptr(),
output.data_ptr(),
attn_lse.data_ptr(),
layer_idx,
generate_token_idx,
q_len,
batch_size,
max_block_num,
block_table.data_ptr() if block_table is not None else 0,
cache_seqlens.data_ptr() if cache_seqlens is not None else 0,
pick_block_num,
init_block_num,
local_block_num,
)
# k_in: (block_len, kv_head_num, head_dim)
# v_in: (block_len, kv_head_num, head_dim)
def update_kvcache_one_block_fp16(
self, k_in: torch.Tensor, v_in: torch.Tensor, layer_id: int, block_idx: int
):
assert (
k_in.dim() == 3
and k_in.size(1) == self.config.block_len
and k_in.size(0) == self.config.kv_head_num
and k_in.size(2) == self.config.head_dim
and k_in.dtype == torch.float16
and k_in.is_contiguous()
and k_in.device == torch.device("cpu")
), "k_in dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
k_in.dim(), k_in.size(), k_in.dtype, k_in.is_contiguous(), k_in.device
)
assert (
v_in.dim() == 3
and v_in.size(1) == self.config.block_len
and v_in.size(0) == self.config.kv_head_num
and v_in.size(2) == self.config.head_dim
and v_in.dtype == torch.float16
and v_in.is_contiguous()
and v_in.device == torch.device("cpu")
), "v_in dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
v_in.dim(), v_in.size(), v_in.dtype, v_in.is_contiguous(), v_in.device
)
assert (
layer_id >= 0 and layer_id < self.config.layer_num
), "layer_id: {}".format(layer_id)
assert block_idx >= 0, "block_idx: {}".format(block_idx)
return self.kvcache.update_one_block_fp16(
k_in.data_ptr(),
v_in.data_ptr(),
layer_id,
block_idx,
)
def get_kvcache_one_block_fp16(
self, k_in: torch.Tensor, v_in: torch.Tensor, layer_id: int, block_idx: int
):
assert (
k_in.dim() == 3
and k_in.size(1) == self.config.block_len
and k_in.size(0) == self.config.kv_head_num
and k_in.size(2) == self.config.head_dim
and k_in.dtype == torch.float16
and k_in.is_contiguous()
and k_in.device == torch.device("cpu")
), "k_in dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
k_in.dim(), k_in.size(), k_in.dtype, k_in.is_contiguous(), k_in.device
)
assert (
v_in.dim() == 3
and v_in.size(1) == self.config.block_len
and v_in.size(0) == self.config.kv_head_num
and v_in.size(2) == self.config.head_dim
and v_in.dtype == torch.float16
and v_in.is_contiguous()
and v_in.device == torch.device("cpu")
), "v_in dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
v_in.dim(), v_in.size(), v_in.dtype, v_in.is_contiguous(), v_in.device
)
assert (
layer_id >= 0 and layer_id < self.config.layer_num
), "layer_id: {}".format(layer_id)
assert block_idx >= 0, "block_idx: {}".format(block_idx)
return self.kvcache.get_one_block_fp16(
k_in.data_ptr(),
v_in.data_ptr(),
layer_id,
block_idx,
)
def update_importance_one_block(
self, importance: torch.Tensor, layer_id: int, block_idx: int
):
assert (
importance.dim() == 1
and importance.size(0) == self.config.block_len
and importance.dtype == torch.float16
and importance.is_contiguous()
and importance.device == torch.device("cpu")
), "importance dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
importance.dim(),
importance.size(),
importance.dtype,
importance.is_contiguous(),
importance.device,
)
assert (
layer_id >= 0 and layer_id < self.config.layer_num
), "layer_id: {}".format(layer_id)
assert block_idx >= 0, "block_idx: {}".format(block_idx)
return self.kvcache.update_importance_one_block(
importance.data_ptr(),
layer_id,
block_idx,
)
def get_importance_one_block(
self, importance: torch.Tensor, layer_id: int, block_idx: int
):
assert (
importance.dim() == 1
and importance.size(0) == self.config.block_len
and importance.dtype == torch.float16
and importance.is_contiguous()
and importance.device == torch.device("cpu")
), "importance dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
importance.dim(),
importance.size(),
importance.dtype,
importance.is_contiguous(),
importance.device,
)
assert (
layer_id >= 0 and layer_id < self.config.layer_num
), "layer_id: {}".format(layer_id)
assert block_idx >= 0, "block_idx: {}".format(block_idx)
return self.kvcache.get_importance_one_block(
importance.data_ptr(),
layer_id,
block_idx,
)
def get_anchor_one_block(self, anchor: torch.Tensor, layer_id: int, block_idx: int):
assert (
anchor.dim() == 3
and anchor.size(0) == self.config.kv_head_num
and anchor.size(1) == self.config.anchor_num
and anchor.size(2) == self.config.head_dim
and anchor.dtype == torch.float16
and anchor.is_contiguous()
and anchor.device == torch.device("cpu")
), "anchor dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
anchor.dim(),
anchor.size(),
anchor.dtype,
anchor.is_contiguous(),
anchor.device,
)
assert (
layer_id >= 0 and layer_id < self.config.layer_num
), "layer_id: {}".format(layer_id)
assert block_idx >= 0, "block_idx: {}".format(block_idx)
return self.kvcache.get_anchor_one_block(
anchor.data_ptr(),
layer_id,
block_idx,
)
def update_anchor_one_block(
self, anchor: torch.Tensor, layer_id: int, block_idx: int
):
assert (
anchor.dim() == 3
and anchor.size(0) == self.config.kv_head_num
and anchor.size(1) == self.config.anchor_num
and anchor.size(2) == self.config.head_dim
and anchor.dtype == torch.float16
and anchor.is_contiguous()
and anchor.device == torch.device("cpu")
), "anchor dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
anchor.dim(),
anchor.size(),
anchor.dtype,
anchor.is_contiguous(),
anchor.device,
)
assert (
layer_id >= 0 and layer_id < self.config.layer_num
), "layer_id: {}".format(layer_id)
assert block_idx >= 0, "block_idx: {}".format(block_idx)
return self.kvcache.update_anchor_one_block(
anchor.data_ptr(),
layer_id,
block_idx,
)
def calc_anchor_all_layers(
self,
block_table: torch.Tensor,
cache_seqlens: torch.Tensor,
):
assert (
block_table.dim() == 2
and block_table.size(0) == cache_seqlens.size(0)
and block_table.dtype == torch.int
and block_table.is_contiguous()
and block_table.device == torch.device("cpu")
), "block_table dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
block_table.dim(),
block_table.size(),
block_table.dtype,
block_table.is_contiguous(),
block_table.device,
)
assert (
cache_seqlens.dim() == 1
and cache_seqlens.dtype == torch.int
and cache_seqlens.is_contiguous()
and cache_seqlens.device == torch.device("cpu")
), "cache_seqlens dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
cache_seqlens.dim(),
cache_seqlens.size(),
cache_seqlens.dtype,
cache_seqlens.is_contiguous(),
cache_seqlens.device,
)
batch_size = block_table.size(0)
max_block_num = block_table.size(1)
return self.kvcache.calc_anchor_all_layers(
block_table.data_ptr(),
cache_seqlens.data_ptr(),
batch_size,
max_block_num,
)
def clear_importance_all_layers(
self,
block_table: torch.Tensor,
cache_seqlens: torch.Tensor,
):
assert (
block_table.dim() == 2
and block_table.size(0) == cache_seqlens.size(0)
and block_table.dtype == torch.int
and block_table.is_contiguous()
and block_table.device == torch.device("cpu")
), "block_table dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
block_table.dim(),
block_table.size(),
block_table.dtype,
block_table.is_contiguous(),
block_table.device,
)
assert (
cache_seqlens.dim() == 1
and cache_seqlens.dtype == torch.int
and cache_seqlens.is_contiguous()
and cache_seqlens.device == torch.device("cpu")
), "cache_seqlens dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
cache_seqlens.dim(),
cache_seqlens.size(),
cache_seqlens.dtype,
cache_seqlens.is_contiguous(),
cache_seqlens.device,
)
batch_size = block_table.size(0)
max_block_num = block_table.size(1)
return self.kvcache.clear_importance_all_layers(
block_table.data_ptr(),
cache_seqlens.data_ptr(),
batch_size,
max_block_num,
)
def get_cache_total_len(self):
return self.kvcache.get_cache_total_len()
def update_kvcache_q4(
self,
k_in: torch.Tensor,
k_scales: torch.Tensor,
v_in: torch.Tensor,
v_scales: torch.Tensor,
layer_id: int,
seq_offset: int | None = None,
seq_len: int | None = None,
block_table: torch.Tensor | None = None,
):
raise NotImplementedError
def update_kvcache_fp16(
self,
k_in: torch.Tensor,
v_in: torch.Tensor,
layer_idx,
block_table: torch.Tensor,
max_block_num,
past_len: torch.Tensor,
q_len,
):
batch_size = block_table.size(0)
        return self.kvcache.update_kvcache_fp16(
k_in.data_ptr(),
v_in.data_ptr(),
layer_idx,
block_table.data_ptr(),
batch_size,
max_block_num,
past_len.data_ptr(),
q_len
)
def get_kvcache_q4(
self,
k_in: torch.Tensor,
k_scales: torch.Tensor,
v_in: torch.Tensor,
v_scales: torch.Tensor,
layer_id: int,
seq_offset: int | None = None,
seq_len: int | None = None,
block_table: torch.Tensor | None = None,
):
raise NotImplementedError
    def get_kvcache_fp16(
        self,
        k_in: torch.Tensor,
        v_in: torch.Tensor,
        layer_idx: int,
        block_table: torch.Tensor,
        max_block_num,
        past_len: torch.Tensor,
    ):
batch_size = block_table.size(0)
return self.kvcache.get_kvcache_fp16(
k_in.data_ptr(),
v_in.data_ptr(),
layer_idx,
block_table.data_ptr(),
batch_size,
max_block_num,
past_len.data_ptr(),
)
def get_and_update_kvcache_fp16(
self,
k_cache_cpu: torch.Tensor,
v_cache_cpu: torch.Tensor,
layer_idx,
block_table: torch.Tensor,
max_block_num,
past_len: torch.Tensor,
q_len,
):
batch_size = block_table.size(0)
return self.kvcache.get_and_update_kvcache_fp16(
k_cache_cpu.data_ptr(),
v_cache_cpu.data_ptr(),
layer_idx,
block_table.data_ptr(),
batch_size,
max_block_num,
past_len.data_ptr(),
q_len,
)
def update_importance(
self,
importance_cache: torch.Tensor,
layer_idx,
block_table: torch.Tensor,
max_block_num,
offset: torch.Tensor,
width,
):
batch_size = block_table.size(0)
return self.kvcache.update_importance(
importance_cache.data_ptr(),
layer_idx,
block_table.data_ptr(),
batch_size,
max_block_num,
offset.data_ptr(),
width,
)
# attn_sparsity: ((bsz, q_len, q_head_num), dtype = torch.float32)
def get_attn_sparsity(
self,
q_in: torch.Tensor,
attn_sparsity: torch.Tensor,
layer_idx: int,
block_table: torch.Tensor,
cache_seqlens: torch.Tensor,
block_table_origin: torch.Tensor,
cache_seqlens_origin: torch.Tensor,
generate_token_idx: int = 0,
topk: int | None = None,
local: int | None = None,
):
batch_size = block_table.size(0)
max_block_num = block_table.size(1)
max_block_num_origin = block_table_origin.size(1)
q_len = q_in.size(1)
if topk is None or local is None or topk + local >= max_block_num:
topk = -1
local = -1
return self.kvcache.get_attn_sparsity(
q_in.data_ptr(),
attn_sparsity.data_ptr(),
layer_idx,
generate_token_idx,
q_len,
batch_size,
max_block_num,
block_table.data_ptr(),
cache_seqlens.data_ptr(),
block_table_origin.data_ptr(),
cache_seqlens_origin.data_ptr(),
max_block_num_origin,
topk,
local,
)
def attn_with_kvcache(
self,
q_in: torch.Tensor,
k_in: torch.Tensor,
v_in: torch.Tensor,
output: torch.Tensor,
attn_lse: torch.Tensor,
layer_idx: int,
block_table: torch.Tensor,
cache_seqlens: torch.Tensor,
generate_token_idx: int = 0,
topk: int | None = None,
local: int | None = None,
):
batch_size = block_table.size(0)
max_block_num = block_table.size(1)
q_len = q_in.size(1)
if topk is None or local is None or topk + local >= max_block_num:
topk = -1
local = -1
return self.kvcache.attn_with_kvcache(
q_in.data_ptr(),
k_in.data_ptr(),
v_in.data_ptr(),
output.data_ptr(),
attn_lse.data_ptr(),
layer_idx,
generate_token_idx,
q_len,
batch_size,
max_block_num,
block_table.data_ptr(),
cache_seqlens.data_ptr(),
topk,
local,
)
def get_all_kvcache_one_layer(
self, k_in: torch.Tensor, v_in: torch.Tensor, layer_id: int
):
return self.kvcache.get_all_kvcache_one_layer(
k_in.data_ptr(),
v_in.data_ptr(),
layer_id,
)
def get_importance(
self,
importance: torch.Tensor,
block_table: torch.Tensor,
):
raise NotImplementedError
def get_anchor(
self,
anchor: torch.Tensor,
block_table: torch.Tensor,
):
raise NotImplementedError
class CPUInfer:
cpu_infer = None
def __init__(self, cpu_infer:int = Config().cpu_infer):
if CPUInfer.cpu_infer is None:
CPUInfer.cpu_infer = cpuinfer_ext.CPUInfer(cpu_infer)
cpuinfer = None
def __init__(self, thread_num):
CPUInfer.cpuinfer = cpuinfer_ext.CPUInfer(thread_num)
def submit(self, task):
CPUInfer.cpuinfer.submit(task)
def submit_with_cuda_stream(self, current_cuda_stream, task):
CPUInfer.cpuinfer.submit_with_cuda_stream(current_cuda_stream, task)
def sync(self):
CPUInfer.cpuinfer.sync()
def sync_with_cuda_stream(self, current_cuda_stream):
CPUInfer.cpuinfer.sync_with_cuda_stream(current_cuda_stream)
def __getattribute__(self, __name: str) -> Any:
return CPUInfer.cpu_infer.__getattribute__(__name)
def __setattr__(self, __name: str, __value: Any) -> None:
return CPUInfer.cpu_infer.__setattr__(__name, __value)
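The KV-cache wrapper methods above and this CPUInfer class are consumed through the same submit/sync task pattern. A minimal sketch, assuming the compiled cpuinfer_ext backend is importable; the thread count, model dimensions and tensor shapes below are placeholders.

import torch

cpu_infer = CPUInfer(48)  # worker thread count is an assumption
kv_wrapper = CPUInferKVCache(32, 8, 32, 128, 128)  # (layer_num, kv_head_num, q_head_num, head_dim, block_len); assumed dims

block_table = torch.arange(16, dtype=torch.int32).view(1, -1)
cache_seqlens = torch.tensor([1024], dtype=torch.int32)

# Blocking path: build a task, hand it to the CPU workers, wait for completion.
cpu_infer.submit(kv_wrapper.calc_anchor_all_layers(block_table, cache_seqlens))
cpu_infer.sync()

# Overlapped with GPU work, as dynamic_attention.py does below.
stream = torch.cuda.current_stream("cuda").cuda_stream
cpu_infer.submit_with_cuda_stream(
    stream, kv_wrapper.clear_importance_all_layers(block_table, cache_seqlens)
)
cpu_infer.sync_with_cuda_stream(stream)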

View file

@ -0,0 +1,775 @@
#!/usr/bin/env python
# coding=utf-8
"""
Description :
Author : Jianwei Dong
Date : 2024-08-26 23:25:24
Version : 1.0.0
LastEditors : Jianwei Dong
LastEditTime : 2024-08-26 23:25:24
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import torch
from transformers import AutoConfig
import sys, os
import logging
logger = logging.getLogger("dynamic_attention")
sys.path.append(os.path.dirname(__file__) + "/../ktransformers_ext/cpu_backend")
from ktransformers.operators.cpuinfer import CPUInfer, CPUInferKVCache
from flash_attn import flash_attn_func, flash_attn_with_kvcache
import math
import json
class DynamicScaledDotProductAttention:
remaining_length: int
def __init__(
self,
max_seq_len: int,
block_size: int,
config: AutoConfig,
device: torch.device,
local_windows_len: int,
topk: int,
threads_num: int,
anchor_type: str = "DYNAMIC",
kv_type: str = "FP16",
dense_layer_num: int = 0,
anchor_num: int = 1,
block_selection_mode: str = "SHARED",
layer_step: int = 1,
token_step: int = 1,
preselect_block: bool = False,
preselect_block_count: int = 96,
prefill_chunk_size: int = 20480,
use_attn_sparsity: bool = False,
):
# assert anchor_num == 1
# assert anchor_type == "DYNAMIC"
self.remaining_length = 0
valid_anchor_types = ["DYNAMIC", "FIXED", "BLOCK_MEAN", "BLOCK_MAX", "QUEST"]
assert anchor_type in valid_anchor_types
if anchor_type == "QUEST":
assert anchor_num == 2
elif anchor_type != "FIXED" and anchor_type != "DYNAMIC":
assert anchor_num == 1
valid_kv_types = ["FP16", "FP32", "Q4_0", "Q8_0"]
assert kv_type in valid_kv_types
if kv_type != "FP16" and kv_type != "FP32":
assert block_size % 32 == 0
valid_block_selection_modes = ["SHARED", "SEPARATE"] # individual
assert block_selection_mode in valid_block_selection_modes
self.max_seq_len = max_seq_len
self.block_num = max_seq_len // block_size
self.block_size = block_size
self.anchor_type = anchor_type
self.kv_type = kv_type
self.anchor_num = anchor_num
self.threads_num = threads_num
self.layer_step = layer_step
self.token_step = token_step
self.preselect_block = preselect_block
self.preselect_block_count = preselect_block_count
self.block_selection_mode = block_selection_mode
self.use_attn_sparsity = use_attn_sparsity
# model config
self.kv_head_num = config.num_key_value_heads
self.q_head_num = config.num_attention_heads
self.head_dim = config.hidden_size // config.num_attention_heads
self.layer_num = config.num_hidden_layers
self.device = device
self.local_windows_len = local_windows_len
self.local_block_num = self.local_windows_len // self.block_size + 1
self.prefill_chunk_size = prefill_chunk_size
self.topk = topk
self.dense_layer_num = dense_layer_num
# self.dense_layer_num = 32
self.cache_key_states = torch.zeros(
(self.block_num, block_size, self.kv_head_num, self.head_dim),
device=device,
dtype=torch.float16,
)
self.cache_value_states = torch.zeros(
(self.block_num, block_size, self.kv_head_num, self.head_dim),
device=device,
dtype=torch.float16,
)
# [max_num_block, block_size, head_num]
self.cache_importance = torch.zeros(
(self.block_num, block_size, self.q_head_num),
device=device,
dtype=torch.float16,
)
# key_states: [bsz, q_len, kv_head_num, head_dim]
# value_states: [bsz, q_len, kv_head_num, head_dim]
# query_states: [bsz, q_len, q_head_num, head_dim]
self.q_in_cpu = torch.zeros(
(1, 1, self.q_head_num, self.head_dim),
device="cpu",
dtype=torch.float16,
pin_memory=True,
)
self.k_in_cpu = torch.zeros(
(1, 1, self.kv_head_num, self.head_dim),
device="cpu",
dtype=torch.float16,
pin_memory=True,
)
self.v_in_cpu = torch.zeros(
(1, 1, self.kv_head_num, self.head_dim),
device="cpu",
dtype=torch.float16,
pin_memory=True,
)
self.cache_seqlens_cpu = torch.empty(
(1,), device="cpu", dtype=torch.int32, pin_memory=True
)
self.cache_seqlens_cuda = torch.empty((1,), device=device, dtype=torch.int32)
self.prefix_block_table = torch.arange(
self.block_num, device="cpu", dtype=torch.int32, pin_memory=True
).view(1, -1)
self.block_table_cpu = torch.arange(
self.block_num, device="cpu", dtype=torch.int32, pin_memory=True
).view(1, -1)
# assert (
# self.local_windows_len // self.block_size + 1 + self.preselect_block_count
# <= self.block_num
# )
self.output_cpu = torch.empty(
(1, 1, self.q_head_num, self.head_dim),
device="cpu",
dtype=torch.float16,
pin_memory=True,
)
self.lse_cpu = torch.empty(
(1, 1, self.q_head_num), device="cpu", dtype=torch.float32, pin_memory=True
)
self.output_cuda = torch.empty(
(1, 1, self.q_head_num, self.head_dim), device=device, dtype=torch.float16
)
self.attn_sparsity = torch.zeros(
(1, 1, self.q_head_num), device="cpu", dtype=torch.float32, pin_memory=True
)
        if preselect_block:
self.preselect_block_table = torch.zeros(
self.layer_num,
self.preselect_block_count,
device=device,
dtype=torch.int32,
)
self.preselect_block_num = 0 # block_num before preselect
self.evict_tokens = 0
self.cpu_infer = CPUInfer(threads_num)
self.local_thread = CPUInferKVCache(
self.layer_num,
self.kv_head_num,
self.q_head_num,
self.head_dim,
self.block_size,
anchor_num=self.anchor_num,
anchor_type=anchor_type,
kv_type=self.kv_type,
retrieval_type=self.block_selection_mode,
layer_step=self.layer_step,
token_step=self.token_step,
layer_offset=self.dense_layer_num % self.layer_step,
max_batch_size=1,
max_block_num=self.block_num,
max_thread_num=self.threads_num,
)
print(
f"local_windows_len: {local_windows_len}, topk: {topk}, dense_layer_num: {dense_layer_num}, kv_type: {self.kv_type}, anchor_type: {self.anchor_type}, preselect_block: {self.preselect_block}, preselect_block_count: {self.preselect_block_count}, token_step: {self.token_step}, layer_step: {self.layer_step}"
)
self.shape_mask = (
self.q_head_num,
self.block_size,
self.block_size,
)
        # Build a per-block causal band mask: row i keeps key columns 0..i, so
        # tril_mask is the intra-block lower triangle and triu_mask its complement
        # (a toy illustration follows the constructor below).
        mask = torch.zeros(
            self.shape_mask, dtype=torch.uint8, device=device
        ).contiguous()
        elm_idx = torch.arange(self.block_size, device=device)
        for i in range(mask.size(-2)):
            # with a square mask this reduces to idx = i - elm_idx
            idx = i + mask.size(-1) - mask.size(-2) - elm_idx
            idx = idx[idx >= 0]
            mask[..., i, idx] = 1
        self.tril_mask = mask
        self.triu_mask = mask ^ 1
self.generate_token_idx = 0
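As noted in the comments above, a toy version of the band-mask construction for block_size = 4 and a single head; the values are chosen purely for illustration.

import torch

block_size = 4  # toy value
mask = torch.zeros(1, block_size, block_size, dtype=torch.uint8)
elm_idx = torch.arange(block_size)
for i in range(block_size):
    idx = i - elm_idx  # the square-mask case of i + size(-1) - size(-2) - elm_idx
    idx = idx[idx >= 0]
    mask[..., i, idx] = 1
# mask[0] (tril_mask) is the intra-block causal lower triangle:
# [[1, 0, 0, 0],
#  [1, 1, 0, 0],
#  [1, 1, 1, 0],
#  [1, 1, 1, 1]]
# mask ^ 1 (triu_mask) is its strict upper-triangular complement.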
def get_attn_score_one_block(
self,
batch_idx: int,
max_block_num: int,
query: torch.Tensor,
key: torch.Tensor,
offset: int,
width: int,
mask_mode: str | None = None,
use_softmax: bool = True,
):
importance = self.cache_importance.view(-1, self.q_head_num)
importance = importance.narrow(0, batch_idx * max_block_num + offset, width)
n_gqa_ = self.q_head_num // self.kv_head_num
for head_idx in range(self.q_head_num):
key_item = key[..., head_idx // n_gqa_, :].view(key.size(0), -1)
            qk = torch.einsum(
                "qd,kd->qk", query[:, head_idx, :], key_item
            )  # (len_q, len_k) for this query head
if mask_mode == "tril":
mask = self.tril_mask
mask = mask[0, -qk.size(-2) :, -qk.size(-1) :]
qk = qk * mask
elif mask_mode == "triu":
mask = self.triu_mask
mask = mask[0, -qk.size(-2) :, -qk.size(-1) :]
qk = qk * mask
if use_softmax:
qk = torch.nn.functional.softmax(
qk / math.sqrt(self.head_dim), dim=-1, dtype=torch.float32
).to(torch.float16)
qk = torch.sum(qk, dim=-2)
importance[...,head_idx] += qk
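Shape-wise, the loop above accumulates, per query head, how much attention mass each cached key position receives (GQA maps query head head_idx to KV head head_idx // n_gqa_). A toy restatement with made-up dimensions, float32 for portability, and the tril/triu masking omitted:

import math
import torch

q_head_num, kv_head_num, head_dim = 4, 2, 8  # toy dims
len_q, len_k = 16, 32
n_gqa = q_head_num // kv_head_num

query = torch.randn(len_q, q_head_num, head_dim)
key = torch.randn(len_k, kv_head_num, head_dim)
importance = torch.zeros(len_k, q_head_num)

for head_idx in range(q_head_num):
    key_item = key[..., head_idx // n_gqa, :].view(key.size(0), -1)   # (len_k, head_dim)
    qk = torch.einsum("qd,kd->qk", query[:, head_idx, :], key_item)   # (len_q, len_k)
    qk = torch.nn.functional.softmax(qk / math.sqrt(head_dim), dim=-1)
    importance[:, head_idx] += qk.sum(dim=-2)  # attention mass landing on each key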
def get_preselect_block_table_and_attn_score(
self,
layer_idx: int,
batch_size: int,
offset: torch.Tensor,
width: int,
query: torch.Tensor,
key: torch.Tensor,
union_with_last_layer: bool = True,
):
max_seqs_len = offset.max().item() + width
max_block_num = (max_seqs_len + self.block_size - 1) // self.block_size
for batch_idx in range(batch_size):
query_cur = query[batch_idx][-128:]
self.get_attn_score_one_block(
batch_idx,
max_block_num,
query_cur,
key[batch_idx][: offset[batch_idx].item() + width],
0,
offset[batch_idx].item() + width,
mask_mode=None,
)
if self.preselect_block:
self.prefill_block_num = max(
0, max_block_num - self.local_windows_len // self.block_size
)
self.evict_tokens = (
max(self.prefill_block_num - self.preselect_block_count, 0)
* self.block_size
)
if self.prefill_block_num != 0:
importance_cache = self.cache_importance.narrow(
0, 0, self.prefill_block_num * batch_size
).view(
batch_size, self.prefill_block_num, self.block_size, self.q_head_num
)
importance_r = importance_cache[:, 1:, : self.block_size // 4]
pad_r = torch.zeros_like(importance_r[:, :1])
importance_r = torch.cat((importance_r, pad_r), dim=1)
importance_l = importance_cache[:, :-1, -self.block_size // 4 :]
pad_l = torch.zeros_like(importance_l[:, :1])
importance_l = torch.cat((pad_l, importance_l), dim=1)
importance = torch.cat(
(importance_l, importance_cache, importance_r), dim=2
)
importance = importance.mean(dim=-1)
importance = importance.mean(dim=-1)
# importance: (batch_size, max_block_num)
topk = min(self.preselect_block_count, self.prefill_block_num)
values, indices = torch.topk(
importance,
k=topk,
dim=1,
)
self.preselect_block_table[
layer_idx : layer_idx + 1,
:topk,
].copy_(indices)
                # NOTE: hard-coded for a 32-layer model; runs the union step on the last layer
                if union_with_last_layer and layer_idx == 31:
for tmp_layer_idx in range(self.layer_num - 1):
for i in range(1, min(topk, 6)):
x = self.preselect_block_table[-1, i]
if x not in self.preselect_block_table[tmp_layer_idx]:
self.preselect_block_table[tmp_layer_idx, topk - i] = x
if self.anchor_type == "DYNAMIC":
importance_cache = self.cache_importance.narrow(
0, 0, max_block_num * batch_size
).view(batch_size, max_block_num * self.block_size, self.q_head_num)
importance_cache_cpu = torch.empty_like(
importance_cache, device="cpu", pin_memory=True
)
importance_cache_cpu.copy_(importance_cache)
block_table_cpu = self.prefix_block_table[:, :max_block_num].to("cpu")
offset_cpu = offset.contiguous().to("cpu")
self.cpu_infer.submit(
self.local_thread.update_importance(
importance_cache_cpu,
layer_idx,
block_table_cpu,
max_block_num,
offset_cpu,
width,
)
)
self.cpu_infer.sync()
importance_cache = self.cache_importance.narrow(
0, 0, max_block_num * batch_size
).view(batch_size, max_block_num * self.block_size, self.q_head_num)
importance_cache.zero_()
# key: [bsz, past_len, head_num, head_dim] float16
# query: [bsz, q_len, q_head_num, head_dim] float16
def get_attn_score(
self,
layer_idx: int,
batch_size: int,
offset: torch.Tensor,
width: int,
query: torch.Tensor,
key: torch.Tensor,
):
max_seqs_len = offset.max().item() + width
max_block_num = (max_seqs_len + self.block_size - 1) // self.block_size
for batch_idx in range(batch_size):
for idx in range(width // self.block_size):
offset_cur = idx * self.block_size
query_cur = query[batch_idx, offset_cur : offset_cur + self.block_size]
self.get_attn_score_one_block(
batch_idx,
max_block_num,
query_cur,
key[
batch_idx,
offset[batch_idx]
+ offset_cur : offset[batch_idx]
+ offset_cur
+ self.block_size,
],
offset[batch_idx].item() + offset_cur,
self.block_size,
mask_mode="tril",
use_softmax=False,
)
offset_key = (
offset[batch_idx].item()
+ idx * self.block_size
- self.local_windows_len
)
if offset_key >= 0:
self.get_attn_score_one_block(
batch_idx,
max_block_num,
query_cur,
key[batch_idx, offset_key : offset_key + self.block_size],
offset_key,
self.block_size,
mask_mode="triu",
use_softmax=False,
)
offset_key = max(0, offset_key + self.block_size)
width_key = (
offset[batch_idx].item() + idx * self.block_size - offset_key
)
if width_key > 0:
self.get_attn_score_one_block(
batch_idx,
max_block_num,
query_cur,
key[batch_idx, offset_key : offset_key + width_key],
offset_key,
width_key,
mask_mode=None,
use_softmax=False,
)
importance_cache = self.cache_importance.narrow(
0, 0, max_block_num * batch_size
).view(batch_size, max_block_num * self.block_size, self.q_head_num)
importance_cache_cpu = torch.empty_like(
importance_cache, device="cpu", pin_memory=True
)
importance_cache_cpu.copy_(importance_cache)
block_table_cpu = self.prefix_block_table[:, :max_block_num].to("cpu")
offset_cpu = offset.contiguous().to("cpu")
self.cpu_infer.submit(
self.local_thread.update_importance(
importance_cache_cpu,
layer_idx,
block_table_cpu,
max_block_num,
offset_cpu,
width,
)
)
self.cpu_infer.sync()
importance_cache.zero_()
# key: [bsz, q_len, head_num, head_dim] float16
# value: [bsz, q_len, head_num, head_dim] float16
def swap_in_and_swap_out(self, layer_idx, past_len, q_len, key, value):
batch_size = 1
max_seqs_len = past_len.max().item() + q_len
max_block_num = (max_seqs_len + self.block_size - 1) // self.block_size
k_cache = self.cache_key_states.narrow(0, 0, max_block_num * batch_size).view(
batch_size, max_block_num * self.block_size, self.kv_head_num, self.head_dim
)
v_cache = self.cache_value_states.narrow(0, 0, max_block_num * batch_size).view(
batch_size, max_block_num * self.block_size, self.kv_head_num, self.head_dim
)
for batch_idx in range(batch_size):
offset = past_len[batch_idx]
width = q_len
k_cache[batch_idx][offset : offset + width].copy_(
key[batch_idx].view(-1, self.kv_head_num, self.head_dim)
)
v_cache[batch_idx][offset : offset + width].copy_(
value[batch_idx].view(-1, self.kv_head_num, self.head_dim)
)
k_cache_cpu = torch.empty_like(k_cache, device="cpu", pin_memory=True)
v_cache_cpu = torch.empty_like(v_cache, device="cpu", pin_memory=True)
k_cache_cpu.copy_(k_cache)
v_cache_cpu.copy_(v_cache)
cur_block_num = (
q_len + past_len[0].item() + self.block_size - 1
) // self.block_size
block_table_cpu = self.prefix_block_table[:, :cur_block_num].to("cpu")
past_len_cpu = past_len.contiguous().to("cpu")
self.cpu_infer.submit(
self.local_thread.get_and_update_kvcache_fp16(
k_cache_cpu,
v_cache_cpu,
layer_idx,
block_table_cpu,
max_block_num,
past_len_cpu,
q_len,
)
)
self.cpu_infer.sync()
k_cache.copy_(k_cache_cpu)
v_cache.copy_(v_cache_cpu)
return k_cache, v_cache
def calc_anchor(self, cache_seqlens: int):
cur_block_num = (cache_seqlens + self.block_size - 1) // self.block_size
block_table_cpu = self.prefix_block_table[:, :cur_block_num].to("cpu")
cache_seqlens_cpu = torch.tensor(
[cache_seqlens], device="cpu", dtype=torch.int32
)
self.cpu_infer.submit(
self.local_thread.calc_anchor_all_layers(
block_table_cpu,
cache_seqlens_cpu,
)
)
self.cpu_infer.sync()
def clear_importance(self, cache_seqlens: int):
print(f"clear importance: {cache_seqlens}")
cur_block_num = (cache_seqlens + self.block_size - 1) // self.block_size
block_table_cpu = self.prefix_block_table[:, :cur_block_num].to("cpu")
cache_seqlens_cpu = torch.tensor(
[cache_seqlens], device="cpu", dtype=torch.int32
)
self.cpu_infer.submit(
self.local_thread.clear_importance_all_layers(
block_table_cpu,
cache_seqlens_cpu,
)
)
self.cpu_infer.sync()
def clear_kvcache(self, cache_seqlens: int):
cur_block_num = (cache_seqlens + self.block_size - 1) // self.block_size
block_table_cpu = self.prefix_block_table[:, :cur_block_num].to("cpu")
cache_seqlens_cpu = torch.tensor(
[cache_seqlens], device="cpu", dtype=torch.int32
)
self.cpu_infer.submit(
self.local_thread.clear_kvcache_all_layers(
block_table_cpu,
cache_seqlens_cpu,
)
)
self.cpu_infer.sync()
def get_attn_sparsity(
self,
q_in: torch.Tensor,
layer_idx: int,
block_table: torch.Tensor,
cache_seqlens: torch.Tensor,
block_table_origin: torch.Tensor,
cache_seqlens_origin: torch.Tensor,
generate_token_idx: int = 0,
topk: int | None = None,
local: int | None = None,
output_path: str = "./attn_sparsity.json",
):
self.attn_sparsity.zero_()
        self.cpu_infer.submit(
self.local_thread.get_attn_sparsity(
q_in,
self.attn_sparsity,
layer_idx,
block_table,
cache_seqlens,
block_table_origin,
cache_seqlens_origin,
generate_token_idx,
topk,
local,
)
)
self.cpu_infer.sync()
with open(output_path, "a") as file:
for head_idx in range(self.q_head_num):
sparsity = self.attn_sparsity[0][0][head_idx].item()
json_obj = {
"token_idx": generate_token_idx,
"layer_idx": layer_idx,
"head_idx": head_idx,
"sparsity": sparsity,
}
json.dump(json_obj, file)
file.write("\n")
def apply(
self,
layer_idx: int,
bsz: int,
past_len: int,
query_states: torch.Tensor,
key_states: torch.Tensor,
value_states: torch.Tensor,
mode: str = "prefill",
generate_token_idx: int = -1,
):
# key_states: [bsz, q_len, kv_head_num, head_dim]
# value_states: [bsz, q_len, kv_head_num, head_dim]
# query_states: [bsz, q_len, q_head_num, head_dim]
assert query_states.dtype == torch.float16
assert key_states.dtype == torch.float16
assert value_states.dtype == torch.float16
assert key_states.size(2) == self.kv_head_num
assert value_states.size(2) == self.kv_head_num
assert query_states.size(2) == self.q_head_num
q_len = query_states.size(1)
batch_size = query_states.size(0)
self.cache_seqlens_cuda.fill_(past_len)
last_chunk = False
if self.remaining_length <= self.prefill_chunk_size and q_len != 1:
last_chunk = True
device = query_states.device
if layer_idx == 0:
if q_len == 1:
self.generate_token_idx += 1
elif last_chunk:
self.generate_token_idx = -1
if mode == "prefill":
key, value = self.swap_in_and_swap_out(
layer_idx,
self.cache_seqlens_cuda,
q_len,
key_states,
value_states,
)
if last_chunk and (self.anchor_type == "DYNAMIC" or self.preselect_block):
self.get_preselect_block_table_and_attn_score(
layer_idx,
bsz,
self.cache_seqlens_cuda,
q_len,
query_states,
key,
)
output = flash_attn_with_kvcache(
q=query_states,
k_cache=key,
v_cache=value,
cache_seqlens=self.cache_seqlens_cuda + q_len,
causal=True,
)
return output.transpose(1, 2)
elif mode == "generate":
assert self.generate_token_idx >= 0
self.q_in_cpu.copy_(query_states, non_blocking=True)
self.k_in_cpu.copy_(key_states, non_blocking=True)
self.v_in_cpu.copy_(value_states, non_blocking=True)
self.cache_seqlens_cpu.copy_(self.cache_seqlens_cuda, non_blocking=True)
# print(layer_idx)
if layer_idx < self.dense_layer_num:
self.block_table_cpu.copy_(self.prefix_block_table, non_blocking=True)
self.cpu_infer.submit_with_cuda_stream(
torch.cuda.current_stream("cuda").cuda_stream,
self.local_thread.attn_with_kvcache(
q_in=self.q_in_cpu,
k_in=self.k_in_cpu,
v_in=self.v_in_cpu,
output=self.output_cpu,
attn_lse=self.lse_cpu,
layer_idx=layer_idx,
block_table=self.block_table_cpu,
cache_seqlens=self.cache_seqlens_cpu,
),
)
else:
if self.preselect_block:
self.cache_seqlens_cpu.copy_(
self.cache_seqlens_cuda - self.evict_tokens, non_blocking=True
)
if self.preselect_block_count < self.prefill_block_num:
self.block_table_cpu[:, : self.preselect_block_count].copy_(
self.preselect_block_table[layer_idx : layer_idx + 1],
non_blocking=True,
)
self.block_table_cpu[
:,
self.preselect_block_count : self.preselect_block_count
+ self.local_block_num,
].copy_(
self.prefix_block_table[
:,
self.prefill_block_num : self.prefill_block_num
+ self.local_block_num,
],
non_blocking=True,
)
# print("submit_with_cuda_stream")
self.cpu_infer.submit_with_cuda_stream(
torch.cuda.current_stream("cuda").cuda_stream,
self.local_thread.attn_with_kvcache(
q_in=self.q_in_cpu,
k_in=self.k_in_cpu,
v_in=self.v_in_cpu,
output=self.output_cpu,
attn_lse=self.lse_cpu,
layer_idx=layer_idx,
generate_token_idx=self.generate_token_idx,
block_table=self.block_table_cpu,
cache_seqlens=self.cache_seqlens_cpu,
topk=(
self.topk
if self.topk <= self.preselect_block_count
else None
),
local=self.local_windows_len // self.block_size,
),
)
# print("submit_with_cuda_stream enqueue\n")
else:
self.block_table_cpu.copy_(
self.prefix_block_table, non_blocking=True
)
self.cpu_infer.submit_with_cuda_stream(
torch.cuda.current_stream("cuda").cuda_stream,
self.local_thread.attn_with_kvcache(
q_in=self.q_in_cpu,
k_in=self.k_in_cpu,
v_in=self.v_in_cpu,
output=self.output_cpu,
attn_lse=self.lse_cpu,
layer_idx=layer_idx,
generate_token_idx=self.generate_token_idx,
block_table=self.block_table_cpu,
cache_seqlens=self.cache_seqlens_cpu,
topk=self.topk,
local=self.local_windows_len // self.block_size,
),
)
self.cpu_infer.sync_with_cuda_stream(
torch.cuda.current_stream("cuda").cuda_stream
)
# print("submit_with_cuda_stream finished\n")
self.output_cuda.copy_(self.output_cpu, non_blocking=True)
return self.output_cuda.transpose(1, 2)
def save(self, path: str, length: int):
cur_block_num = (length + self.block_size - 1) // self.block_size
block_table_cpu = self.prefix_block_table[0, :cur_block_num].to("cpu")
cache_seqlens_cpu = torch.tensor([length], device="cpu", dtype=torch.int32)
self.cpu_infer.submit(
self.local_thread.dump_kvcache(
block_table_cpu,
cache_seqlens_cpu,
path,
)
)
self.cpu_infer.sync()
def load(self, path: str, length: int):
self.cpu_infer.submit(
self.local_thread.load_kvcache(
path,
)
)
self.cpu_infer.sync()
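Putting the pieces together, a hypothetical driver for DynamicScaledDotProductAttention. The checkpoint name, dimensions and hyper-parameters are placeholders, and a CUDA device plus the compiled cpuinfer_ext and flash_attn backends are assumed to be available.

import torch
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("meta-llama/Meta-Llama-3-8B")  # assumed checkpoint
head_dim = cfg.hidden_size // cfg.num_attention_heads

sdpa = DynamicScaledDotProductAttention(
    max_seq_len=131072, block_size=128, config=cfg, device=torch.device("cuda"),
    local_windows_len=4096, topk=16, threads_num=48,
    dense_layer_num=2, preselect_block=True, preselect_block_count=96,
    prefill_chunk_size=8192,
)

# Prefill one 8192-token chunk; shapes are [bsz, q_len, heads, head_dim], fp16, on CUDA.
q = torch.randn(1, 8192, cfg.num_attention_heads, head_dim, device="cuda", dtype=torch.float16)
k = torch.randn(1, 8192, cfg.num_key_value_heads, head_dim, device="cuda", dtype=torch.float16)
v = torch.randn_like(k)
out = sdpa.apply(layer_idx=0, bsz=1, past_len=0,
                 query_states=q, key_states=k, value_states=v, mode="prefill")

# Single-token decode against the CPU-resident KV cache.
q1, k1, v1 = (t[:, :1].contiguous() for t in (q, k, v))
out = sdpa.apply(layer_idx=0, bsz=1, past_len=8192,
                 query_states=q1, key_states=k1, value_states=v1, mode="generate")

In the model integration that follows, this is driven once per decoder layer for each prefill chunk and each generated token.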

View file

@ -6,7 +6,7 @@ Author : Azure-Tang, Boxin Zhang, chenht2022
Date : 2024-07-25 11:25:24
Version : 0.1.0
LastEditors : Azure
LastEditTime : 2024-08-15 02:36:29
LastEditTime : 2024-08-27 03:50:23
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
@ -436,7 +436,7 @@ class KExpertsTorch(KExpertsBase):
final_hidden_states.index_add_(0, top_x, current_hidden_states)
return final_hidden_states.to(org_dtype, device=org_device)
return final_hidden_states.to(dtype=org_dtype, device=org_device)
EXPERTS_MAP = {
"KExpertsCPU": KExpertsCPU,

View file

@ -1,14 +1,14 @@
#!/usr/bin/env python
# coding=utf-8
'''
"""
Description :
Author : Azure-Tang
Date : 2024-07-25 11:25:24
Version : 1.0.0
LastEditors : Azure
LastEditTime : 2024-08-14 14:53:05
LastEditTime : 2024-08-27 07:29:04
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
"""
import inspect
import math
@ -19,7 +19,10 @@ import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ktransformers.operators.dynamic_attention import DynamicScaledDotProductAttention
from ktransformers.server.config.config import Config
import os
import yaml
from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache, StaticCache
from transformers.modeling_attn_mask_utils import (
@ -40,19 +43,35 @@ from transformers.utils import (
logging,
replace_return_docstrings,
)
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock, Qwen2MoeMLP, Qwen2MoeDecoderLayer
from ktransformers.models.modeling_deepseek import BaseModelOutputWithPast, DeepseekV2DecoderLayer, DeepseekV2MoE
from ktransformers.models.modeling_qwen2_moe import (
Qwen2MoeSparseMoeBlock,
Qwen2MoeMLP,
Qwen2MoeDecoderLayer,
)
from ktransformers.models.modeling_deepseek import (
BaseModelOutputWithPast,
DeepseekV2DecoderLayer,
DeepseekV2MoE,
)
from transformers.models.qwen2_moe.configuration_qwen2_moe import Qwen2MoeConfig
from ktransformers.models.configuration_llama import LlamaConfig
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.utils import InferenceState
from ktransformers.util.custom_gguf import GGUFLoader
from transformers.configuration_utils import PretrainedConfig
from ktransformers.models.modeling_llama import (
LlamaDecoderLayer,
LlamaRMSNorm,
LlamaRotaryEmbedding,
)
if is_flash_attn_2_available():
from flash_attn import flash_attn_func, flash_attn_varlen_func
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
_flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
_flash_supports_window_size = "window_size" in list(
inspect.signature(flash_attn_func).parameters
)
logger = logging.get_logger(__name__)
@ -151,6 +170,7 @@ QWEN2MOE_INPUTS_DOCSTRING = r"""
the complete sequence length.
"""
@add_start_docstrings(
"The bare Qwen2MoE Model outputting raw hidden-states without any specific head on top.",
QWEN2MOE_START_DOCSTRING,
@ -162,18 +182,21 @@ class KQwen2MoeModel(BaseInjectedModule):
Args:
config: Qwen2MoeConfig
"""
def __init__(
self,
key: str,
gguf_loader : GGUFLoader,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
per_layer_prefill_intput_threshold: int = 30000, # if None, no per-layer prefill
per_layer_prefill_intput_threshold: int = 30000, # if None, no per-layer prefill
transfer_map: dict = None,
**kwargs,
):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
BaseInjectedModule.__init__(
self, key, gguf_loader, config, orig_module, device, **kwargs
)
self.per_layer_prefill_intput_threshold = per_layer_prefill_intput_threshold
self.transfer_map = transfer_map
self.stream_device_map = dict()
@ -192,29 +215,47 @@ class KQwen2MoeModel(BaseInjectedModule):
output_router_logits: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
per_layer_prefill_intput_threshold: int | None = None, # if None or 0, close per-layer prefill
per_layer_prefill_intput_threshold: (
int | None
) = None, # if None or 0, close per-layer prefill
) -> Union[Tuple, MoeModelOutputWithPast]:
# print(f'Total length of input_ids: {input_ids.size(1)}, {input_ids.size()}')
if per_layer_prefill_intput_threshold is None: per_layer_prefill_intput_threshold = self.per_layer_prefill_intput_threshold
if per_layer_prefill_intput_threshold is None:
per_layer_prefill_intput_threshold = self.per_layer_prefill_intput_threshold
per_layer_prefill_flag = False
seq_lenth = inputs_embeds.size(1) if inputs_embeds is not None else input_ids.size(1)
if per_layer_prefill_intput_threshold and per_layer_prefill_intput_threshold < seq_lenth:
seq_lenth = (
inputs_embeds.size(1) if inputs_embeds is not None else input_ids.size(1)
)
if (
per_layer_prefill_intput_threshold
and per_layer_prefill_intput_threshold < seq_lenth
):
per_layer_prefill_flag = True
for layer in self.layers:
self.load_layer_to(layer, InferenceState.UNLOAD)
else:
pass
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_attentions = (
output_attentions
if output_attentions is not None
else self.config.output_attentions
)
output_router_logits = (
output_router_logits if output_router_logits is not None else self.config.output_router_logits
output_router_logits
if output_router_logits is not None
else self.config.output_router_logits
)
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
output_hidden_states
if output_hidden_states is not None
else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
return_dict = (
return_dict if return_dict is not None else self.config.use_return_dict
)
if (input_ids is None) ^ (inputs_embeds is not None):
raise ValueError(
@ -243,15 +284,23 @@ class KQwen2MoeModel(BaseInjectedModule):
inputs_embeds = inputs_embeds.to("cuda")
if cache_position is None:
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
past_seen_tokens = (
past_key_values.get_seq_length() if past_key_values is not None else 0
)
cache_position = torch.arange(
past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
past_seen_tokens,
past_seen_tokens + inputs_embeds.shape[1],
device=inputs_embeds.device,
)
if position_ids is None:
position_ids = cache_position.unsqueeze(0)
causal_mask = self._update_causal_mask(
attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
attention_mask,
inputs_embeds,
cache_position,
past_key_values,
output_attentions,
)
hidden_states = inputs_embeds
@ -263,7 +312,7 @@ class KQwen2MoeModel(BaseInjectedModule):
next_decoder_cache = None
for i, decoder_layer in enumerate(self.layers):
if self.transfer_map is not None and i in self.transfer_map:
if self.transfer_map is not None and i in self.transfer_map:
prev_stream = torch.cuda.current_stream()
cur_device = self.transfer_map[i]
if cur_device not in self.stream_device_map:
@ -271,11 +320,25 @@ class KQwen2MoeModel(BaseInjectedModule):
torch.cuda.set_device(cur_device)
self.stream_device_map[cur_device].wait_stream(prev_stream)
torch.cuda.set_stream(self.stream_device_map[cur_device])
hidden_states = hidden_states.to(self.transfer_map[i], non_blocking = True)
causal_mask = causal_mask.to(self.transfer_map[i], non_blocking = True) if causal_mask is not None else None
position_ids = position_ids.to(self.transfer_map[i], non_blocking = True) if position_ids is not None else None
cache_position = cache_position.to(self.transfer_map[i], non_blocking = True) if cache_position is not None else None
hidden_states = hidden_states.to(
self.transfer_map[i], non_blocking=True
)
causal_mask = (
causal_mask.to(self.transfer_map[i], non_blocking=True)
if causal_mask is not None
else None
)
position_ids = (
position_ids.to(self.transfer_map[i], non_blocking=True)
if position_ids is not None
else None
)
cache_position = (
cache_position.to(self.transfer_map[i], non_blocking=True)
if cache_position is not None
else None
)
if output_hidden_states:
all_hidden_states += (hidden_states,)
@ -323,7 +386,6 @@ class KQwen2MoeModel(BaseInjectedModule):
hidden_states = self.norm(hidden_states)
if per_layer_prefill_flag:
per_layer_prefill_flag = False
for layer in self.layers:
@ -333,12 +395,22 @@ class KQwen2MoeModel(BaseInjectedModule):
next_cache = None
if use_cache:
next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
next_cache = (
next_decoder_cache.to_legacy_cache()
if use_legacy_cache
else next_decoder_cache
)
if not return_dict:
return tuple(
v
for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits]
for v in [
hidden_states,
next_cache,
all_hidden_states,
all_self_attns,
all_router_logits,
]
if v is not None
)
return MoeModelOutputWithPast(
@ -349,11 +421,13 @@ class KQwen2MoeModel(BaseInjectedModule):
router_logits=all_router_logits,
)
def load_layer_to(self, layer:Qwen2MoeDecoderLayer, target: InferenceState):
assert isinstance(layer, Qwen2MoeDecoderLayer), "module should be nn.ModuleList of decoder layers"
def load_layer_to(self, layer: Qwen2MoeDecoderLayer, target: InferenceState):
assert isinstance(
layer, Qwen2MoeDecoderLayer
), "module should be nn.ModuleList of decoder layers"
# TODO Support restore to original device, not only cuda
device = "cpu" if target == InferenceState.UNLOAD else "cuda"
device = "cpu" if target == InferenceState.UNLOAD else "cuda"
# attn
layer.self_attn.q_proj.set_inference_mode(target)
@ -458,18 +532,21 @@ class KDeepseekV2Model(BaseInjectedModule):
Args:
config: DeepseekV2Config
"""
def __init__(
self,
key: str,
gguf_loader : GGUFLoader,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
per_layer_prefill_intput_threshold: int = 30000, # if None, no per-layer prefill
per_layer_prefill_intput_threshold: int = 30000, # if None, no per-layer prefill
transfer_map: dict = None,
**kwargs,
):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
BaseInjectedModule.__init__(
self, key, gguf_loader, config, orig_module, device, **kwargs
)
self.per_layer_prefill_intput_threshold = per_layer_prefill_intput_threshold
self.transfer_map = transfer_map
self.stream_device_map = dict()
@ -487,15 +564,23 @@ class KDeepseekV2Model(BaseInjectedModule):
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
per_layer_prefill_intput_threshold: int | None = None, # if None, no per-layer prefill
per_layer_prefill_intput_threshold: (
int | None
) = None, # if None, no per-layer prefill
) -> Union[Tuple, BaseModelOutputWithPast]:
if per_layer_prefill_intput_threshold is None: per_layer_prefill_intput_threshold = self.per_layer_prefill_intput_threshold
if per_layer_prefill_intput_threshold is None:
per_layer_prefill_intput_threshold = self.per_layer_prefill_intput_threshold
per_layer_prefill_flag = False
seq_lenth = inputs_embeds.size(1) if inputs_embeds is not None else input_ids.size(1)
if per_layer_prefill_intput_threshold and per_layer_prefill_intput_threshold < seq_lenth:
seq_lenth = (
inputs_embeds.size(1) if inputs_embeds is not None else input_ids.size(1)
)
if (
per_layer_prefill_intput_threshold
and per_layer_prefill_intput_threshold < seq_lenth
):
per_layer_prefill_flag = True
for layer in self.layers:
self.load_layer_to(layer, InferenceState.UNLOAD)
self.load_layer_to(layer, InferenceState.UNLOAD)
torch.cuda.empty_cache()
else:
pass
@ -542,9 +627,13 @@ class KDeepseekV2Model(BaseInjectedModule):
past_key_values_length = past_key_values.get_usable_length(seq_length)
if cache_position is None:
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
past_seen_tokens = (
past_key_values.get_seq_length() if past_key_values is not None else 0
)
cache_position = torch.arange(
past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
past_seen_tokens,
past_seen_tokens + inputs_embeds.shape[1],
device=inputs_embeds.device,
)
if position_ids is None:
@ -556,15 +645,17 @@ class KDeepseekV2Model(BaseInjectedModule):
inputs_embeds = self.embed_tokens(input_ids)
input_ids = input_ids.to(org_device)
causal_mask = self._update_causal_mask(
attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
)
if per_layer_prefill_flag:
causal_mask = None
else:
causal_mask = self._update_causal_mask(
attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
)
# embed positions
hidden_states = inputs_embeds
if per_layer_prefill_flag:
print(f'Total length of input_ids: {hidden_states.size(1)}')
print(f"Total length of input_ids: {hidden_states.size(1)}")
# decoder layers
all_hidden_states = () if output_hidden_states else None
@ -576,7 +667,7 @@ class KDeepseekV2Model(BaseInjectedModule):
t_f = 0
for i, decoder_layer in enumerate(self.layers):
if self.transfer_map is not None and i in self.transfer_map:
if self.transfer_map is not None and i in self.transfer_map:
prev_stream = torch.cuda.current_stream()
cur_device = self.transfer_map[i]
if cur_device not in self.stream_device_map:
@ -584,10 +675,24 @@ class KDeepseekV2Model(BaseInjectedModule):
torch.cuda.set_device(cur_device)
self.stream_device_map[cur_device].wait_stream(prev_stream)
torch.cuda.set_stream(self.stream_device_map[cur_device])
hidden_states = hidden_states.to(self.transfer_map[i], non_blocking = True)
causal_mask = causal_mask.to(self.transfer_map[i], non_blocking = True) if causal_mask is not None else None
position_ids = position_ids.to(self.transfer_map[i], non_blocking = True) if position_ids is not None else None
cache_position = cache_position.to(self.transfer_map[i], non_blocking = True) if cache_position is not None else None
hidden_states = hidden_states.to(
self.transfer_map[i], non_blocking=True
)
causal_mask = (
causal_mask.to(self.transfer_map[i], non_blocking=True)
if causal_mask is not None
else None
)
position_ids = (
position_ids.to(self.transfer_map[i], non_blocking=True)
if position_ids is not None
else None
)
cache_position = (
cache_position.to(self.transfer_map[i], non_blocking=True)
if cache_position is not None
else None
)
if output_hidden_states:
all_hidden_states += (hidden_states,)
@ -622,12 +727,12 @@ class KDeepseekV2Model(BaseInjectedModule):
t5 = time.time()
if per_layer_prefill_flag:
# print(f"to cpu")
self.load_layer_to(decoder_layer, InferenceState.UNLOAD)
self.load_layer_to(decoder_layer, InferenceState.UNLOAD)
torch.cuda.empty_cache()
t6 = time.time()
t_gpu += t4-t3
t_cpu += t6-t5
t_f += t5-t4
t_gpu += t4 - t3
t_cpu += t6 - t5
t_f += t5 - t4
hidden_states = layer_outputs[0]
@ -648,7 +753,9 @@ class KDeepseekV2Model(BaseInjectedModule):
torch.cuda.empty_cache()
t7 = time.time()
print(f"total time: {t7-t3}, \n layer num{len(self.layers)}, gpu time: {t_gpu}, cpu time: {t_cpu}, forward time: {t_f}, restore time: {t7-t6}")
print(
f"total time: {t7-t3}, \n layer num{len(self.layers)}, gpu time: {t_gpu}, cpu time: {t_cpu}, forward time: {t_f}, restore time: {t7-t6}"
)
# add hidden states from the last decoder layer
if output_hidden_states:
@ -674,16 +781,18 @@ class KDeepseekV2Model(BaseInjectedModule):
attentions=all_self_attns,
)
def load_layer_to(self, layer: DeepseekV2DecoderLayer, target: InferenceState):
assert isinstance(layer, DeepseekV2DecoderLayer), "module should be nn.ModuleList of decoder layers"
def load_layer_to(self, layer: DeepseekV2DecoderLayer, target: InferenceState):
assert isinstance(
layer, DeepseekV2DecoderLayer
), "module should be nn.ModuleList of decoder layers"
# TODO Support restore to original device, not only cuda
device = "cpu" if target == InferenceState.UNLOAD else "cuda"
device = "cpu" if target == InferenceState.UNLOAD else "cuda"
# TODO Support DFS to auto use {to, set_inference_mode} according to the module type
# attn
layer.self_attn.to(device) #
layer.self_attn.to(device) #
# mlp
if isinstance(layer.mlp, DeepseekV2MoE):
@ -702,3 +811,526 @@ class KDeepseekV2Model(BaseInjectedModule):
# layer norm
layer.input_layernorm.to(device)
layer.post_attention_layernorm.to(device)
LLAMA_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`LlamaConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
LLAMA_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
`past_key_values`).
If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
information on the default strategy.
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.n_positions - 1]`.
[What are position IDs?](../glossary#position-ids)
past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
Two formats are allowed:
- a [`~cache_utils.Cache`] instance;
- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
cache format.
The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
legacy cache format will be returned.
If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
of shape `(batch_size, sequence_length)`.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
the complete sequence length.
"""
@add_start_docstrings(
"The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
LLAMA_START_DOCSTRING,
)
class LlamaPreTrainedModel(PreTrainedModel):
config_class = LlamaConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["LlamaDecoderLayer"]
_skip_keys_device_placement = ["past_key_values"]
_supports_flash_attn_2 = True
_supports_sdpa = True
_supports_cache_class = True
_supports_quantized_cache = True
_supports_static_cache = True
def _init_weights(self, module):
std = self.config.initializer_range
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
class KLlamaModel(BaseInjectedModule):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
Args:
config: LlamaConfig
"""
dynamic_sdpa = None
def __init__(
self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
per_layer_prefill_intput_threshold: int = 30000, # if None, no per-layer prefill
transfer_map: dict = None,
**kwargs,
):
BaseInjectedModule.__init__(
self, key, gguf_loader, config, orig_module, device, **kwargs
)
self.per_layer_prefill_intput_threshold = per_layer_prefill_intput_threshold
self.transfer_map = transfer_map
self.stream_device_map = dict()
        user_path: str = os.path.expanduser("~")
        localstore_path: str = os.path.join(user_path, ".ktransformers")
        config_path: str = os.path.join(localstore_path, Config.CONFIG_FILE_NAME)
        with open(config_path, "r") as file:
config_yaml = yaml.safe_load(file.read())
self.long_context_config = config_yaml.get("long_context")
self.ext_config = config_yaml.get("ext")
KLlamaModel.dynamic_sdpa = DynamicScaledDotProductAttention(
max_seq_len=self.long_context_config["max_seq_len"],
block_size=self.long_context_config["block_size"],
config=config,
device=torch.device("cuda"),
local_windows_len=self.long_context_config["local_windows_len"],
topk=self.long_context_config["second_select_num"],
threads_num=self.ext_config["cpu_infer"],
anchor_type=self.long_context_config["anchor_type"],
kv_type=self.long_context_config["kv_type"],
dense_layer_num=self.long_context_config["dense_layer_num"],
anchor_num=self.long_context_config["anchor_num"],
preselect_block=self.long_context_config["preselect_block"],
block_selection_mode=self.long_context_config["head_select_mode"],
preselect_block_count=self.long_context_config["preselect_block_count"],
layer_step=self.long_context_config["layer_step"],
token_step=self.long_context_config["token_step"],
prefill_chunk_size=self.long_context_config["chunk_size"],
use_attn_sparsity=False,
)
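For reference, a sketch of the structure that yaml.safe_load is expected to return from the ~/.ktransformers config read above. Only the keys consumed by this constructor are shown, and every value is a placeholder, not a recommended setting.

config_yaml = {
    "long_context": {
        "max_seq_len": 131072,
        "block_size": 128,
        "local_windows_len": 4096,
        "second_select_num": 16,       # becomes `topk`
        "anchor_type": "DYNAMIC",
        "kv_type": "FP16",
        "dense_layer_num": 2,
        "anchor_num": 1,
        "preselect_block": True,
        "head_select_mode": "SHARED",  # becomes `block_selection_mode`
        "preselect_block_count": 96,
        "layer_step": 1,
        "token_step": 1,
        "chunk_size": 8192,            # becomes `prefill_chunk_size`
    },
    "ext": {"cpu_infer": 48},
}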
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
@add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
) -> Union[Tuple, BaseModelOutputWithPast]:
output_attentions = (
output_attentions
if output_attentions is not None
else self.config.output_attentions
)
output_hidden_states = (
output_hidden_states
if output_hidden_states is not None
else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = (
return_dict if return_dict is not None else self.config.use_return_dict
)
if (input_ids is None) ^ (inputs_embeds is not None):
raise ValueError(
"You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
)
if self.gradient_checkpointing and self.training and use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
)
use_cache = False
return_legacy_cache = False
if (
use_cache and not isinstance(past_key_values, Cache) and not self.training
): # kept for BC (non `Cache` `past_key_values` inputs)
return_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
"We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
)
if inputs_embeds is None:
# token embeddings are kept on CPU; prefill chunks are moved to the GPU on demand
inputs_embeds = self.embed_tokens(input_ids.to("cpu"))
if cache_position is None:
past_seen_tokens = (
past_key_values.get_seq_length() if past_key_values is not None else 0
)
cache_position = torch.arange(
past_seen_tokens,
past_seen_tokens + inputs_embeds.shape[1],
device="cuda",
)
if position_ids is None:
position_ids = cache_position.unsqueeze(0)
causal_mask = None
chunk_size = self.long_context_config["chunk_size"]
cur_idx = 0
q_len = cache_position.size(0)
# generate
if q_len == 1:
x = inputs_embeds[:, -1:, :]
position_ids = position_ids[:, -1:]
return self.forward_chunk(
x,
causal_mask,
position_ids,
past_key_values,
output_attentions,
use_cache,
cache_position,
output_hidden_states,
return_dict,
)
elif q_len <= chunk_size:
inputs_embeds = inputs_embeds.to('cuda')
output = self.forward_chunk(
inputs_embeds,
causal_mask,
position_ids,
past_key_values,
output_attentions,
use_cache,
cache_position,
output_hidden_states,
return_dict,
)
KLlamaModel.dynamic_sdpa.calc_anchor(cache_position[-1] + 1)
KLlamaModel.dynamic_sdpa.clear_importance(cache_position[-1] + 1)
return output
assert not output_attentions, "output_attentions is not supported when using chunked attention"
attn_output = None
# prefill: walk the prompt in fixed-size chunks to bound peak GPU memory
KLlamaModel.dynamic_sdpa.remaining_length = q_len
while cur_idx < q_len:
print(f'current prefill length: {cur_idx}/{q_len}')
chunk_mask = None
chunk_end = min(cur_idx + chunk_size, q_len)
if inputs_embeds.device.type == 'cpu':
tmp_inputs_embeds = inputs_embeds[:, cur_idx : chunk_end].to("cuda")
else:
tmp_inputs_embeds = inputs_embeds[:, cur_idx : chunk_end]
output_with_past = self.forward_chunk(
tmp_inputs_embeds,
chunk_mask,
position_ids[:, cur_idx : chunk_end],
past_key_values,
output_attentions,
use_cache,
cache_position[cur_idx : chunk_end],
)
cur_output = output_with_past.last_hidden_state
KLlamaModel.dynamic_sdpa.remaining_length -= chunk_end - cur_idx
cur_idx += chunk_size
# only the hidden states of the last chunk are needed; earlier chunks
# exist solely to populate the sparse KV cache
attn_output = cur_output
KLlamaModel.dynamic_sdpa.calc_anchor(cache_position[-1] + 1)
KLlamaModel.dynamic_sdpa.clear_importance(cache_position[-1] + 1)
return BaseModelOutputWithPast(last_hidden_state=attn_output)
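# Minimal standalone sketch of the chunk slicing used in the prefill loop above
# (illustrative only; `q_len` and `chunk_size` below are made-up numbers):
#
#   def chunk_ranges(q_len: int, chunk_size: int):
#       cur = 0
#       while cur < q_len:
#           end = min(cur + chunk_size, q_len)
#           yield cur, end            # slice [cur:end] of embeddings / positions / cache_position
#           cur += chunk_size
#
#   assert list(chunk_ranges(10, 4)) == [(0, 4), (4, 8), (8, 10)]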
def forward_chunk(
self,
inputs_embeds,
causal_mask,
position_ids,
past_key_values,
output_attentions,
use_cache,
cache_position,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
output_hidden_states = (
output_hidden_states
if output_hidden_states is not None
else self.config.output_hidden_states
)
return_legacy_cache = False
if use_cache and not isinstance(
past_key_values, Cache
): # kept for BC (non `Cache` `past_key_values` inputs)
return_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
return_dict = (
return_dict if return_dict is not None else self.config.use_return_dict
)
hidden_states = inputs_embeds
# create position embeddings to be shared across the decoder layers
position_embeddings = self.rotary_emb(hidden_states, position_ids)
# decoder layers
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
next_decoder_cache = None
for decoder_layer in self.layers:
if output_hidden_states:
all_hidden_states += (hidden_states,)
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
decoder_layer.__call__,
hidden_states,
causal_mask,
position_ids,
past_key_values,
output_attentions,
use_cache,
cache_position,
position_embeddings,
)
else:
layer_outputs = decoder_layer(
hidden_states,
attention_mask=causal_mask,
position_ids=position_ids,
past_key_value=past_key_values,
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
position_embeddings=position_embeddings,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache = layer_outputs[2 if output_attentions else 1]
if output_attentions:
all_self_attns += (layer_outputs[1],)
hidden_states = self.norm(hidden_states)
# add hidden states from the last decoder layer
if output_hidden_states:
all_hidden_states += (hidden_states,)
next_cache = next_decoder_cache if use_cache else None
if return_legacy_cache:
next_cache = next_cache.to_legacy_cache()
if not return_dict:
return tuple(
v
for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
if v is not None
)
return BaseModelOutputWithPast(
last_hidden_state=hidden_states,
past_key_values=next_cache,
hidden_states=all_hidden_states,
attentions=all_self_attns,
)
def _update_causal_mask(
self,
attention_mask: torch.Tensor,
input_tensor: torch.Tensor,
cache_position: torch.Tensor,
past_key_values: Cache,
output_attentions: bool,
):
# TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
# KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
# (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
# `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
if self.config._attn_implementation == "flash_attention_2":
if attention_mask is not None and 0.0 in attention_mask:
return attention_mask
return None
# For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
# order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
# to infer the attention mask.
past_seen_tokens = (
past_key_values.get_seq_length() if past_key_values is not None else 0
)
using_static_cache = isinstance(past_key_values, StaticCache)
# When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
if (
self.config._attn_implementation == "sdpa"
and not using_static_cache
and not output_attentions
):
if AttentionMaskConverter._ignore_causal_mask_sdpa(
attention_mask,
inputs_embeds=input_tensor,
past_key_values_length=past_seen_tokens,
is_training=self.training,
):
return None
dtype, device = input_tensor.dtype, input_tensor.device
min_dtype = torch.finfo(dtype).min
sequence_length = input_tensor.shape[1]
if using_static_cache:
target_length = past_key_values.get_max_length()
else:
target_length = (
attention_mask.shape[-1]
if isinstance(attention_mask, torch.Tensor)
else past_seen_tokens + sequence_length + 1
)
if attention_mask is not None and attention_mask.dim() == 4:
# in this case we assume that the mask comes already in inverted form and requires no inversion or slicing
if attention_mask.max() != 0:
raise ValueError(
"Custom 4D attention mask should be passed in inverted form with max==0`"
)
causal_mask = attention_mask
else:
causal_mask = torch.full(
(sequence_length, target_length),
fill_value=min_dtype,
dtype=dtype,
device=device,
)
if sequence_length != 1:
causal_mask = torch.triu(causal_mask, diagonal=1)
causal_mask *= torch.arange(
target_length, device=device
) > cache_position.reshape(-1, 1)
causal_mask = causal_mask[None, None, :, :].expand(
input_tensor.shape[0], 1, -1, -1
)
if attention_mask is not None:
causal_mask = (
causal_mask.clone()
) # copy to contiguous memory for in-place edit
mask_length = attention_mask.shape[-1]
padding_mask = (
causal_mask[:, :, :, :mask_length]
+ attention_mask[:, None, None, :]
)
padding_mask = padding_mask == 0
causal_mask[:, :, :, :mask_length] = causal_mask[
:, :, :, :mask_length
].masked_fill(padding_mask, min_dtype)
if (
self.config._attn_implementation == "sdpa"
and attention_mask is not None
and attention_mask.device.type == "cuda"
and not output_attentions
):
# Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
# using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
# Details: https://github.com/pytorch/pytorch/issues/110213
causal_mask = AttentionMaskConverter._unmask_unattended(
causal_mask, min_dtype
)
return causal_mask
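# Worked toy example of the mask construction above, with hypothetical sizes
# (sequence_length=3, target_length=5, cache_position=[2, 3, 4]); it is not tied
# to any model config:
#
#   import torch
#   min_dtype = torch.finfo(torch.float32).min
#   cache_position = torch.tensor([2, 3, 4])
#   mask = torch.full((3, 5), min_dtype)
#   mask = torch.triu(mask, diagonal=1)                       # mask strictly-future columns
#   mask *= torch.arange(5) > cache_position.reshape(-1, 1)   # keep already-cached columns open
#   # row i can now attend to every column j <= cache_position[i];
#   # columns to its right remain at min_dtype (fully masked).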

View file

@ -225,4 +225,4 @@
class: "default"
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"
prefill_device: "cuda:3"

View file

@ -123,4 +123,4 @@
class: "default"
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
prefill_device: "cuda:1"

View file

@ -6,7 +6,7 @@
generate_device: "cuda"
prefill_device: "cuda"
- match:
name: "^model\\.layers\\.(?!.*self_attn).*$" # regular expression
name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
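# Hedged aside: the updated pattern above uses a negative lookahead, so every
# layer sub-module except self_attn.kv_b_proj is now replaced (the old pattern
# skipped all self_attn sub-modules). A minimal standalone check with plain `re`
# and hypothetical module names, not the ktransformers rule loader:
#
#   import re
#   pattern = re.compile(r"^model\.layers\.(?!.*self_attn\.kv_b_proj).*$")
#   assert pattern.match("model.layers.0.mlp.gate_proj")
#   assert pattern.match("model.layers.0.self_attn.q_proj")
#   assert not pattern.match("model.layers.0.self_attn.kv_b_proj")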
@ -41,6 +41,12 @@
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- match:
name: "^model$"
replace:
class: "ktransformers.operators.models.KDeepseekV2Model"
kwargs:
per_layer_prefill_intput_threshold: 2000 # 0 disables layer-wise prefill
- match:
name: "^model.embed_tokens"
replace:

View file

@ -123,4 +123,4 @@
class: "default"
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
prefill_device: "cuda:1"

View file

@ -0,0 +1,28 @@
- match:
class: ktransformers.models.modeling_llama.LlamaRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.RotaryEmbeddingV2
- match:
name: "^model.embed_tokens"
replace:
class: "default"
kwargs:
generate_device: "cpu"
prefill_device: "cpu"
- match:
class: ktransformers.models.modeling_llama.LlamaModel
replace:
class: ktransformers.operators.models.KLlamaModel
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
- match:
name: "^model\\.layers\\..*\\.self_attn$"
replace:
class: ktransformers.operators.attention.KLlamaAttention
kwargs:
generate_device: "cuda"
prefill_device: "cuda"

View file

@ -109,4 +109,4 @@
class: "default"
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
prefill_device: "cuda:1"

View file

@ -1,3 +1,10 @@
- match:
name: "^model\\.layers\\..*\\."
replace:
class: "default"
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- match:
class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
replace:
@ -54,4 +61,4 @@
class: "default"
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
prefill_device: "cuda"

View file

@ -5,10 +5,11 @@ Description :
Author : unicornchan
Date : 2024-06-11 16:35:42
Version : 1.0.0
LastEditors : chenxl
LastEditTime : 2024-07-27 01:55:42
LastEditors : WuHao
LastEditTime : 2024-08-12 06:31:14
'''
import os
import shutil
import yaml
from ktransformers.server.config.singleton import Singleton
@ -30,10 +31,18 @@ class Config(metaclass=Singleton):
os.path.dirname(os.path.dirname(__file__)))
config_yaml: str = os.path.join(
base_path, "configs", Config.CONFIG_FILE_NAME)
user_path: str = os.path.expanduser('~')
localstore_path: str = os.path.join(user_path,'.ktransformers')
config_path: str = os.path.join(localstore_path,Config.CONFIG_FILE_NAME)
if not os.path.exists(config_yaml):
print(f"Can't find config file, {config_yaml}")
exit(-1)
with open(config_yaml, 'r', encoding="utf-8") as fp:
if not os.path.exists(localstore_path):
os.mkdir(localstore_path)
if not os.path.exists(config_path):
shutil.copyfile(config_yaml,config_path)
with open(config_path, 'r', encoding="utf-8") as fp:
config = yaml.safe_load(fp)
return config
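# Hedged sketch of the first-run bootstrap added above: the packaged default
# config is copied into ~/.ktransformers once, and subsequent runs read (and can
# edit) the user copy. Paths mirror the code above; the helper name is illustrative.
#
#   import os, shutil, yaml
#   def load_user_config(default_yaml: str, file_name: str = "config.yaml") -> dict:
#       localstore = os.path.join(os.path.expanduser("~"), ".ktransformers")
#       user_yaml = os.path.join(localstore, file_name)
#       os.makedirs(localstore, exist_ok=True)
#       if not os.path.exists(user_yaml):
#           shutil.copyfile(default_yaml, user_yaml)   # seed the user copy once
#       with open(user_yaml, "r", encoding="utf-8") as fp:
#           return yaml.safe_load(fp)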
@ -51,6 +60,8 @@ class Config(metaclass=Singleton):
cfg = Config.load()
self.base_path = os.path.dirname(
os.path.dirname(os.path.dirname(__file__)))
self.user_path: str = os.path.expanduser('~')
self.localstore_path: str = os.path.join(self.user_path,'.ktransformers')
# log configs
self.log_dir = os.path.join(self.base_path, Config.to_path(cfg["log"]["dir"]))
self.log_file = cfg["log"]["file"]
@ -83,11 +94,20 @@ class Config(metaclass=Singleton):
self.model_name: str = self.model.get("name", "")
self.model_device: str = self.model.get("device", "cuda:0")
self.gguf_path: str = self.model.get("gguf_path", "")
self.model_cache_lens = self.model.get("cache_lens")
# web config
self.web: dict = cfg.get("web", {})
self.web_cross_domain: bool = self.web.get("open_cross_domain", True)
self.mount_web: bool = self.web.get("mount", False)
self.ext: dict = cfg.get("ext", {})
self.cpu_infer = self.ext.get("cpu_infer", 10)
#file config
self.local_store_configs: dict = cfg.get("local_store",{})
self.file_upload_dir: str = os.path.join(self.localstore_path,self.local_store_configs.get("file_upload_dir",""))
self.assistant_store_dir: str = os.path.join(self.localstore_path,self.local_store_configs.get("assistant_store_dir",""))
#long context config
self.long_context_config: dict = cfg.get("long_context",{})

View file

@ -46,7 +46,8 @@ class CUDAGraphRunner:
capture_stream.wait_stream(torch.cuda.current_stream())
torch.cuda.set_device(main_device)
torch.cuda.set_stream(capture_stream)
past_key_values.change_seq_length(-1)
if past_key_values is not None:
past_key_values.change_seq_length(-1)
torch.cuda.synchronize(self.main_device)
#self.graph.debug_dump("cuda_graph_hooked.dot")

View file

@ -6,7 +6,7 @@ Author : Azure-Tang, Boxin Zhang, chenht2022
Date : 2024-07-26 08:48:54
Version : 1.0.0
LastEditors : kkk1nak0
LastEditTime : 2024-08-12 07:21:55
LastEditTime : 2024-08-14 08:20:45
Adapted from https://github.com/99991/pygguf/blob/main/gguf.py
Copyright (c) 2023-2024 The ggml authors
Copyright (c) 2024 Thomas Germer
@ -294,7 +294,6 @@ class GGUFLoader:
else:
values = GGML_DEQUANTIZE[ggml_name](data)
values = torch.from_numpy(values)
values = values.view(shape[::-1])
if "attn_q" in name and self.gguf_file_meta['general.architecture'] in ["llama"]:
n_head = self.gguf_file_meta['llama.attention.head_count']

View file

@ -84,7 +84,8 @@ def load_weights(module:nn.Module, gguf_loader:GGUFLoader, prefix=''):
else:
module.load()
def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cuda_graph: bool = True):
def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cuda_graph: bool = True,
mode = 'normal'):
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
torch._dynamo.config.suppress_errors = True
@ -110,7 +111,8 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
cache_position=cache_position,
past_key_values=past_key_values,
return_dict=False, use_cache=True)[0]
past_key_values.change_seq_length(1)
if past_key_values is not None:
past_key_values.change_seq_length(1)
for device in all_cuda_device:
torch.cuda.synchronize(device)
#print(logits)
@ -125,18 +127,26 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
torch.cuda.set_device(torch_device)
with torch.no_grad():
stream = TextStreamer(tokenizer)
past_key_values = StaticCache(
config = model.config, max_batch_size = 1, max_cache_len = seq_length + max_new_tokens, device = device_map, dtype = model.dtype
)
if mode != 'long_context':
past_key_values = StaticCache(
config = model.config, max_batch_size = 1, max_cache_len = seq_length + max_new_tokens, device = device_map, dtype = model.dtype
)
else:
past_key_values = None
cache_position = torch.arange(seq_length, device=torch_device)
generated_ids = torch.zeros(
batch_size, seq_length + max_new_tokens + 1, dtype=torch.int, device=torch_device
)
generated_ids[:, cache_position] = inputs.to(torch_device).to(torch.int)
past_key_values.cur_idx=cache_position
if past_key_values is not None:
past_key_values.cur_idx = cache_position
start_time = time.time()
inputs_embeds = model.model.embed_tokens(inputs.to("cpu")).to(torch_device)
if mode == "long_context":
inputs_embeds = model.model.embed_tokens(inputs.to("cpu"))
else:
inputs_embeds = model.model.embed_tokens(inputs.to("cpu")).to(torch_device)
logits = model(
inputs_embeds = inputs_embeds, cache_position=cache_position, past_key_values=past_key_values, return_dict=False, use_cache=True
)[0][:,-1,:].unsqueeze(0).clone().to(torch_device)
@ -184,7 +194,7 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
tokens.append(next_token.int())
seq_length += 1
if next_token[0].item() == tokenizer.eos_token_id:
if next_token[0].item() == tokenizer.eos_token_id or tokenizer.decode(next_token) == '<|im_end|>':
print(stream.end(), end="", flush=True)
break
else:

View file

@ -27,7 +27,8 @@ dependencies = [
"wheel",
"colorlog",
"build",
"fire"
"fire",
"protobuf"
]
requires-python = ">=3.10"

View file

@ -3,4 +3,5 @@ transformers
numpy
torch>=2.3.0
packaging
cpufeature
cpufeature
protobuf