Merge pull request #57 from UnicornChan/develop-0.1.3

[feature] release 0.1.3
This commit is contained in:
UnicornChan 2024-08-29 01:57:34 +08:00 committed by GitHub
commit 233bbb8c55
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
58 changed files with 11709 additions and 374 deletions

View file

@ -29,11 +29,6 @@ jobs:
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
@ -52,12 +47,6 @@ jobs:
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
@ -76,12 +65,6 @@ jobs:
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
@ -98,10 +81,6 @@ jobs:
- { os: windows-2022, pyver: '3.12', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.12', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: windows-2022, pyver: '3.12', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.12', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: windows-2022, pyver: '3.12', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: windows-2022, pyver: '3.12', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: windows-2022, pyver: '3.12', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: windows-2022, pyver: '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: windows-2022, pyver: '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.12', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
@ -114,10 +93,6 @@ jobs:
- { os: windows-2022, pyver: '3.11', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.11', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: windows-2022, pyver: '3.11', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.11', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: windows-2022, pyver: '3.11', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: windows-2022, pyver: '3.11', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: windows-2022, pyver: '3.11', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: windows-2022, pyver: '3.11', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: windows-2022, pyver: '3.11', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.11', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
@ -130,10 +105,6 @@ jobs:
- { os: windows-2022, pyver: '3.10', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.10', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: windows-2022, pyver: '3.10', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.10', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: windows-2022, pyver: '3.10', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: windows-2022, pyver: '3.10', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: windows-2022, pyver: '3.10', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: windows-2022, pyver: '3.10', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: windows-2022, pyver: '3.10', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.10', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
@ -219,6 +190,11 @@ jobs:
$env:CUDA_PATH = "$env:CUDA_PATH/Library"
$env:CUDA_HOME = $env:CUDA_PATH
$env:PATH = "$env:CUDA_PATH/bin;" + $env:PATH
$directory = "$env:CUDA_PATH/lib/x64/"
if (-not (Test-Path -Path $directory)) {
New-Item -ItemType Directory -Path $directory
Write-Output "Directory '$directory' created."
}
cp $env:CUDA_PATH/lib/*.lib $env:CUDA_PATH/lib/x64/
$env:INCLUDE =$env:CUDA_PATH + "/include/targets/x64;" + $env:INCLUDE

3
.gitignore vendored
View file

@ -17,4 +17,5 @@ compile_commands.json
*dist/
ktransformers/server/local_store/
ktransformers/server_test1.db
*.patch
*.patch
local_chat_djw.py

View file

@ -1,18 +1,17 @@
<div align="center">
<!-- <h1>KTransformers</h1> -->
<p align="center">
<picture>
<picture>
<img alt="KTransformers" src="https://github.com/user-attachments/assets/d5a2492f-a415-4456-af99-4ab102f13f8b" width=50%>
</picture>
</picture>
</p>
</p>
<h3>A Flexible Framework for Experiencing Cutting-edge LLM Inference Optimizations</h3>
<strong><a href="#show-cases">🔥 Show Cases</a> | <a href="#quick-start">🚀 Quick Start</a> | <a href="#tutorial">📃 Tutorial</a> | <a href="https://github.com/kvcache-ai/ktransformers/discussions">💬 Discussion </a> </strong>
</div>
<h2 id="intro">🎉 Introduction</h2>
KTransformers, pronounced as Quick Transformers, is designed to enhance your 🤗 <a href="https://github.com/huggingface/transformers">Transformers</a> experience with advanced kernel optimizations and placement/parallelism strategies.
<br/><br/>
@ -22,17 +21,43 @@ interface, RESTful APIs compliant with OpenAI and Ollama, and even a simplified
<br/><br/>
Our vision for KTransformers is to serve as a flexible platform for experimenting with innovative LLM inference optimizations. Please let us know if you need any other features.
<h2 id="Updates"> Updates</h2>
<h2 id="Updates">🔥 Updates</h2>
* **Aug 28, 2024**: Support 1M context under the InternLM2.5-7B-Chat-1M model, utilizing 24GB of VRAM and 150GB of DRAM.
* **Aug 28, 2024**: Decrease DeepseekV2's required DRAM from 20G to 10G.
* **Aug 15, 2024**: Update detailed [TUTORIAL](doc/en/injection_tutorial.md) for injection and multi-GPU.
* **Aug 14, 2024**: Support llamfile as linear backend,
* **Aug 14, 2024**: Support llamafile as linear backend.
* **Aug 12, 2024**: Support multiple GPU; Support new model: mixtral 8\*7B and 8\*22B; Support q2k, q3k, q5k dequant on gpu.
* **Aug 9, 2024**: Support windows native.
<h2 id="show-cases">🔥 Show Cases</h2>
<h3>GPT-4-level Local VSCode Copilot on a Desktop with only 24GB VRAM</h3>
<h3>1M Context Local Inference on a Desktop with Only 24GB VRAM</h3>
<p align="center">
https://github.com/user-attachments/assets/a865e5e4-bca3-401e-94b8-af3c080e6c12
* **1M Context InternLM 2.5 7B**: Operates at full bf16 precision, utilizing 24GB VRAM and 150GB DRAM, which is feasible on a local desktop setup. It achieves a 92.88% success rate on the 1M "Needle In a Haystack" test and 100% on the 128K NIAH test.
<p align="center">
<picture>
<img alt="Single Needle Retrieval 128K" src="./doc/assets/needle_128K.png" width=100%>
</picture>
</p>
<p align="center">
<picture>
<img alt="Single Needle Retrieval 1000K" src="./doc/assets/needle_1M.png" width=100%>
</picture>
</p>
* **Enhanced Speed**: Reaches 16.91 tokens/s for generation with a 1M context using sparse attention, powered by llamafile kernels. This method is over 10 times faster than llama.cpp's full-attention approach.
* **Flexible Sparse Attention Framework**: Offers a flexible block sparse attention framework for CPU offloaded decoding. Compatible with SnapKV, Quest, and InfLLm. Further information is available [here](./doc/en/long_context_tutorial.md).
<div>
<h3>GPT-4-level Local VSCode Copilot on a Desktop with only 24GB VRAM</h3>
</div>
https://github.com/user-attachments/assets/0b9fa2da-66f0-48eb-b4b9-f0e1f06f8927
</p>
@ -54,7 +79,6 @@ https://github.com/user-attachments/assets/4c6a8a38-05aa-497d-8eb1-3a5b3918429c
</p>
<strong>More advanced features are coming soon, so stay tuned!</strong>
<h2 id="quick-start">🚀 Quick Start</h2>
@ -89,17 +113,21 @@ Some preparation:
```
- Linux-x86_64 with gcc, g++ and cmake
```sh
sudo apt-get update
sudo apt-get install gcc g++ cmake ninja-build
```
- We recommend using [Conda](https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh) to create a virtual environment with Python=3.11 to run our program.
```sh
conda create --name ktransformers python=3.11
conda activate ktransformers # you may need to run conda init and reopen shell first
```
- Make sure that PyTorch, packaging, and ninja are installed
```
pip install torch packaging ninja
```
@ -107,37 +135,44 @@ Some preparation:
<h3>Installation</h3>
1. Use a Docker image, see [documentation for Docker](./doc/en/docker.md)
2. You can install using Pypi (for linux):
2. You can install using Pypi (for linux):
```
pip install ktransformers --no-build-isolation
```
For Windows, we provide a pre-compiled whl package: [ktransformers-0.1.1+cu125torch24avx2-cp311-cp311-win_amd64.whl](https://github.com/kvcache-ai/ktransformers/releases/download/v0.1.1/ktransformers-0.1.1+cu125torch24avx2-cp311-cp311-win_amd64.whl), which requires CUDA 12.5, torch 2.4, and Python 3.11; more pre-compiled packages are being produced.
3. Or you can download source code and compile:
- init source code
```sh
git clone https://github.com/kvcache-ai/ktransformers.git
cd ktransformers
git submodule init
git submodule update
```
- [Optional] If you want to run with the website, please [compile the website](./doc/en/api/server/website.md) before executing ```bash install.sh```
- Compile and install (for Linux)
```
bash install.sh
```
- Compile and install (for Windows)
```
install.bat
```
```
<h3>Local Chat</h3>
We provide a simple command-line local chat Python script that you can run for testing.
> Note that this is a very simple test tool that only supports single-turn chat without any memory of previous input. If you want to try the full ability of the model, you may go to [RESTful API and Web UI](#id_666). We use the DeepSeek-V2-Lite-Chat-GGUF model as an example here, but we also support other models; you can replace it with any other model that you want to test.
<h4>Run Example</h4>
@ -162,23 +197,30 @@ python -m ktransformers.local_chat --model_path deepseek-ai/DeepSeek-V2-Lite-Cha
# python ktransformers.local_chat --model_path ./DeepSeek-V2-Lite --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
```
It features the following arguments:
- `--model_path` (required): Name of the model (such as "deepseek-ai/DeepSeek-V2-Lite-Chat", which will automatically download configs from [Hugging Face](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite)). If you already have local files, you may use that path directly to initialize the model.
> Note: <strong>.safetensors</strong> files are not required in the directory. We only need config files to build the model and tokenizer.
- `--gguf_path` (required): Path of a directory containing GGUF files, which can be downloaded from [Hugging Face](https://huggingface.co/mzwing/DeepSeek-V2-Lite-Chat-GGUF/tree/main) (we only support q4_k_m and q8_0 for now; more formats are coming soon).
- `--optimize_rule_path` (required except for Qwen2Moe and DeepSeek-V2): Path of the YAML file containing optimize rules. There are two rule files pre-written in the [ktransformers/optimize/optimize_rules](ktransformers/optimize/optimize_rules) directory for optimizing DeepSeek-V2 and Qwen2-57B-A14B, two SOTA MoE models.
- `--max_new_tokens`: Int (default=1000). Maximum number of new tokens to generate.
- `--cpu_infer`: Int (default=10). The number of CPUs used for inference. Should ideally be set to the (total number of cores - 2).
<h3 id="supported-model"> Supported Model</h3>
| Model Name | Model Size | VRAM | Minimum DRAM | Recommended DRAM |
| ---- | ---- | ---- | ---- | ---- |
| DeepSeek-V2-q4_k_m | 133G | 24G | 136G | 192G |
| Qwen2-57B-A14B-Instruct-q4_k_m | 33G | 8G | 34G | 64G |
| DeepSeek-V2-Lite-q4_k_m | 9.7G | 3G | 13G | 16G |
| Model Name | Model Size | VRAM | Minimum DRAM | Recommended DRAM |
| ------------------------------ | ---------- | ----- | --------------- | ----------------- |
| DeepSeek-V2-q4_k_m | 133G | 10G | 136G | 192G |
| Qwen2-57B-A14B-Instruct-q4_k_m | 33G | 8G | 34G | 64G |
| DeepSeek-V2-Lite-q4_k_m | 9.7G | 3G | 13G | 16G |
| Mixtral-8x7B-q4_k_m | 25G | 1.6G | 51G | 64G |
| Mixtral-8x22B-q4_k_m | 80G | 4G | 86.1G | 96G |
| InternLM2.5-7B-Chat-1M | 15.5G | 15.5G | 8G(32K context) | 150G (1M context) |
More will come soon. Please let us know which models you are most interested in.
@ -188,7 +230,6 @@ Be aware that you need to be subject to their corresponding model licenses when
<details>
<summary>Click To Show how to run other examples</summary>
* Qwen2-57B
```sh
@ -208,6 +249,7 @@ python -m ktransformers.local_chat --model_name Qwen/Qwen2-57B-A14B-Instruct --g
```
* DeepseekV2
```sh
mkdir DeepSeek-V2-Chat-0628-GGUF && cd DeepSeek-V2-Chat-0628-GGUF
# Download weights
@ -221,8 +263,11 @@ cd ..
python -m ktransformers.local_chat --model_name deepseek-ai/DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF
# If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try
# GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-V2-Chat-0628
# python -m ktransformers.local_chat --model_path ./DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF
```
| model name | weights download link |
@ -245,11 +290,15 @@ Start without website:
```sh
ktransformers --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path /path/to/DeepSeek-V2-Lite-Chat-GGUF --port 10002
```
Start with website:
```sh
ktransformers --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path /path/to/DeepSeek-V2-Lite-Chat-GGUF --port 10002 --web True
```
Or, if you want to start the server with transformers, the model_path should include safetensors files:
```bash
ktransformers --type transformers --model_path /mnt/data/model/Qwen2-0.5B-Instruct --port 10002 --web True
```
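Because the server's RESTful API aims to be OpenAI-compatible, you can also exercise it programmatically. The snippet below is a minimal sketch using the `openai` Python client; the `/v1` base path, the dummy API key, and the model name are assumptions for illustration, so adjust them to what your deployment actually exposes (see the server documentation linked below).

```python
# Minimal sketch of calling the OpenAI-compatible endpoint; paths and names are assumptions.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:10002/v1",  # assumed base path of the local server
    api_key="not-needed",                  # placeholder; a local server may not check keys
)

response = client.chat.completions.create(
    model="DeepSeek-V2-Lite-Chat",         # placeholder model name
    messages=[{"role": "user", "content": "Hello! Who are you?"}],
)
print(response.choices[0].message.content)
```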
@ -264,10 +313,9 @@ Access website with url [http://localhost:10002/web/index.html#/chat](http://loc
More information about the RESTful API server can be found [here](doc/en/api/server/server.md). You can also find an example of integrating with Tabby [here](doc/en/api/server/tabby.md).
<h2 id="tutorial">📃 Brief Injection Tutorial</h2>
At the heart of KTransformers is a user-friendly, template-based injection framework.
This allows researchers to easily replace original torch modules with optimized variants. It also simplifies the process of combining multiple optimizations, allowing the exploration of their synergistic effects.
This allows researchers to easily replace original torch modules with optimized variants. It also simplifies the process of combining multiple optimizations, allowing the exploration of their synergistic effects.
</br>
<p align="center">

BIN
doc/assets/needle_128K.png Normal file


BIN
doc/assets/needle_1M.png Normal file


View file

@ -0,0 +1,316 @@
# KVCache Long Context
## TL;DR
Training larger models and supporting longer text sequences are currently the two most widely agreed-upon directions toward achieving AGI. After lowering the barrier for local inference with trillion-parameter MoE models, the second showcase scenario for KTransformers is reducing the inference barrier for ultra-long context sequences. Recently, both ChatGLM and InternLM have released open-source models supporting 1M tokens of context. This article will use InternLM2.5-7B-Chat-1M as an example to introduce a method that leverages the sparsity of attention to accelerate long-text inference on heterogeneous CPU/GPU systems.
After optimization, KTransformers has achieved native-precision inference for 128K and even 1M tokens of context on a single 24GB GPU with CPU/DRAM support. In the 128K context scenario, the generation speed is 7.1 times faster than llama.cpp, while also achieving 100% accuracy on relatively simple test sets like "needle in a haystack" and "passkey". On the more challenging dataset kvretrieval, through flexible framework configurations, we can achieve a **6.22x speedup** during inference while obtaining even higher scores than running the original model directly (**21.2 -> 24.4**). In the 1M context scenario on a single 24GB GPU, KTransformers can similarly achieve a 16 tokens/s inference speed, nearly 10 times faster than llama.cpp under the same conditions, with the "needle in a haystack" evaluation score even surpassing the original model (**89.31 -> 92.88**).
Project url: https://github.com/kvcache-ai/ktransformers
## Mathematical Principles: The Computational Overhead of Long-Text Inference and the Sparsity in Attention Caused by Softmax
As the demand for longer context windows increases, not only have commercial large models like Kimi and Claude/Gemini started supporting increasingly longer context windows, but open-source models have also begun to catch up. Notably, both ChatGLM 4 and InternLM 2.5 have released versions that are under 10 billion parameters but support up to 1 million tokens of context. However, despite the relatively small size of these models, the enormous KVCache required for such ultra-long contexts still prevents local users from practically running these models. As shown in the figure below, while the InternLM2.5-7B-Chat-1M model weights only require 15.49GB of GPU memory, an additional 145.49GB is needed to store the entire 1M-token KVCache, which is clearly beyond the memory capacity of local users. Even when using the KVCache Offload feature of llama.cpp to offload the KVCache to CPU/DRAM, barely making the model runnable, performance remains unacceptable due to the need to fully scan the entire KVCache each time a single token is generated.
| <img title="" src="../assets/internlm_memory.png" alt="internlm_memory" width="882"> | <img src="../assets/SparQ_attention.png" title="" alt="sparQ" width="691"> |
| ------------------------------------------------------------------------------------ | -------------------------------------------------------------------------- |
Fortunately, many studies have noticed that attention distributions during the inference phase tend to be **sparse**. For example, the right figure shows SparQ's experimental statistics on LLaMa 7B, where less than 1% of tokens in a 3k context have relatively high attention scores. Similar conclusions appear in many other papers, such as H2O, Quest, InfLLM, and SnapKV, and we have further validated them through long-text experiments with InternLM2.5-7B-1M. Although the proportion is not as extreme as 1%, the softmax operation in attention inherently concentrates weight on a small set of tokens, so in principle, if we can identify in advance which tokens have high attention scores, scanning less than 5% of the tokens should suffice to essentially reproduce the original result.
Thus, the problem narrows down to how to quickly identify these tokens with high attention scores without scanning them all. In the following sections, we will first briefly survey several key related papers, then summarize and propose a general framework we designed and implemented within KTransformers—a highly efficient sparse attention operator for CPUs.
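To make this concrete, the toy example below (an illustration we added, not KTransformers code) builds a query that is strongly aligned with a handful of keys, then compares full attention with an oracle that keeps only the top 5% of positions by score; the sparse output closely matches the full one.

```python
import torch

torch.manual_seed(0)
seq_len, head_dim = 8192, 128
k = torch.randn(seq_len, head_dim)
v = torch.randn(seq_len, head_dim)
# Emulate the peaked score distributions seen in practice: the query is
# strongly aligned with a few "important" keys.
q = 3.0 * (k[100] + k[2048] + k[7000]) + torch.randn(head_dim)

scores = (k @ q) / head_dim**0.5
full_out = torch.softmax(scores, dim=-1) @ v

# Oracle sparse attention: keep only the top 5% of positions by score.
keep = int(0.05 * seq_len)
idx = scores.topk(keep).indices
sparse_out = torch.softmax(scores[idx], dim=-1) @ v[idx]

# Cosine similarity is very close to 1: scanning ~5% of the cache suffices here.
print(torch.nn.functional.cosine_similarity(full_out, sparse_out, dim=0).item())
```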
## Related Papers and Conclusions
### Prune or Retrieval
Based on the aforementioned points, we studied papers from recent years related to sparse selection in KVCache. The earliest of these is the paper H2O, which suggested that the attention distribution during inference is sparse and that only 5% of the KVCache is needed during inference. Following this, a series of works built on H2O's approach by designing more complex methods for selecting tokens that perform better in different scenarios. These methods are quite reasonable for single-word inference. However, as we previously explored in the Mooncake project, **we believe that the future trend is to precompute reusable KVCache as much as possible, and then use it to answer different questions.** This "compute once, use many" approach aims to reduce computational costs. Therefore, with this goal in mind, we prefer not to delete any tokens from the KVCache, or at least not remove a significant portion of them, to ensure that different questions can focus on different parts of the context in the future.
![InfLLM Framework](../assets/InfLLM_framework.png)
We further investigated related research, among which InfLLM proposed a very promising framework. Not only does it recognize that attention is sparse, but it also suggests that overly long contexts can cause attention to be dispersed into irrelevant noise, thereby reducing the model's ability to focus on key information. To address this issue, InfLLM introduces an external memory module (Memory Units) to store the context's KVCache. In each computation step, the most relevant semantic information is retrieved from this external memory module to participate in the calculation, thus enhancing the model's ability to handle long-context inference.
Specifically, InfLLM organizes the external memory module using semantic blocks composed of neighboring tokens and employs a sliding window mechanism during computation. In each step, it selects only the semantic blocks at the head of the context (Initial Tokens), the blocks near the current token (Local Tokens), and a few blocks with the highest semantic similarity to the current token to participate in the attention calculation. As shown in Equation 1, to efficiently retrieve the blocks with the highest similarity, InfLLM selects, within each block, the few representative tokens whose scores $r_m$ are the highest, and then uses Equation 2 to calculate the semantic similarity between the current token and each semantic block.
![InfLLM Equation](../assets/InfLLM_equation.jpg)
Compared to the previously mentioned H2O, the differences in InfLLM are as follows:
1. The KVCache is not discarded but stored in memory and dynamically loaded onto the GPU during inference.
2. KVCache is managed at the granularity of blocks rather than tokens, with each block selecting a few tokens as its representative index tokens.
InfLLM's proposed method aligns with our "compute once, use many" approach of reusing KVCache. The external memory units in this method can be offloaded to CPU/DRAM or even SSD storage, allowing different parts to be selected for computation based on the specific question. This significantly improves the efficiency of attention computation.
### Other Improvements
Similarly, after InfLLM, Quest also manages tokens at the granularity of blocks. Quest analyzed the recall rate of key tokens in H2O and full attention, finding that the Top-10 attention score token recall rate for the H2O algorithm is around 50%, which indicates that too much key information was lost. To improve the recall rate of key tokens, Quest chooses two "representative tokens" from each block for retrieval. In the prefill stage, each KVCache block records the maximum and minimum values for each channel, as shown in the figure below under "Reduced Keys," which contains the element-wise min key and element-wise max key.
During the attention computation stage, the dot product is computed between the current query vector and the max key and min key of each KVCache block, respectively. Then, for each channel, the maximum value between the two resulting product vectors is selected and summed to serve as the upper bound of the relevance score for that KVCache block, as shown in stage 1 of the diagram. Based on the relevance scores, the top-k KVCache blocks are selected to participate in the attention computation, as illustrated in stage 2 of the diagram.
![Quest Framework](../assets/Quest_framework.png)
Compared to InfLLM, Quest does not take heterogeneous architectures into account. Instead, it assumes that all KVCache can still fit into memory, simply leveraging sparse attention to accelerate the inference process. Ultimately, Quest achieves a 7.03x speedup in attention computation and a 2.23x improvement in end-to-end inference latency.
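As a small sketch of this scoring rule (our paraphrase under assumed shapes, not Quest's reference implementation): each block stores element-wise min and max keys at prefill time, and at decode time the per-channel maximum of `q * min_key` and `q * max_key` is summed into an upper bound on the block's attention score.

```python
import torch

num_blocks, block_size, head_dim, top_k = 64, 128, 128, 8
keys = torch.randn(num_blocks, block_size, head_dim)
query = torch.randn(head_dim)

# Prefill: each KVCache block keeps its "reduced keys".
min_key = keys.min(dim=1).values  # (num_blocks, head_dim)
max_key = keys.max(dim=1).values  # (num_blocks, head_dim)

# Stage 1: per-channel upper bound of q·k over any key in the block,
# summed across channels to bound the block's relevance.
upper_bound = torch.maximum(query * min_key, query * max_key).sum(dim=-1)

# Stage 2: only the top-k blocks take part in the exact attention computation.
selected = upper_bound.topk(top_k).indices
print(selected)
```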
Going further, SnapKV proposes retaining two parts of the tokens during the prefill stage, as shown in the diagram below with the orange and green segments. The difference from InfLLM lies only in the method of selecting the middle tokens. SnapKV selects tokens at the token level rather than the block level, with the score calculation being similar to H2O, i.e., $$softmax(\frac{qk^T}{\sqrt{d_k}})$$. However, when summing across columns, only the rows within the final green window are selected for computation, corresponding to the Local Tokens section in InfLLM. Additionally, SnapKV introduces a pooling operation on top of attention, which the paper explains as ensuring that the recalled tokens retain more complete semantic information.
This approach in SnapKV involves a one-time selection during the inference phase, after which only the selected tokens are used for attention computation, while the rest of the KVCache is discarded.
![SnapKV Framework](../assets/SnapKV_framework.png)
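The selection step can be summarized with the short sketch below (assumed shapes and pooling size; this is our condensation, not the paper's reference code): scores from the final observation window are summed per prefix position, smoothed by pooling, and the highest-voted tokens are retained.

```python
import torch
import torch.nn.functional as F

seq_len, window, head_dim, keep = 8192, 64, 128, 1024
q_window = torch.randn(window, head_dim)          # queries of the last `window` tokens
k_prefix = torch.randn(seq_len - window, head_dim)

# softmax(q k^T / sqrt(d_k)), restricted to the observation-window rows.
scores = torch.softmax(q_window @ k_prefix.T / head_dim**0.5, dim=-1)
vote = scores.sum(dim=0)                          # one accumulated score per prefix token

# Pooling keeps small neighborhoods together so recalled tokens stay semantically intact.
vote = F.max_pool1d(vote[None, None], kernel_size=7, stride=1, padding=3)[0, 0]
selected_tokens = vote.topk(keep).indices         # tokens kept for the decoding phase
print(selected_tokens.shape)
```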
Other related papers include PyramidKV, which observed that attention scores exhibit a pyramid-shaped distribution across attention layers. In lower attention layers, attention is widely distributed, while in higher layers, the attention scores for a few key tokens become increasingly prominent. Therefore, PyramidKV allocates more KVCache storage space to lower layers and less space to higher layers.
MagicPiG, based on Locality-Sensitive Hashing (LSH), proposes a dynamic KVCache management strategy. First, it uses SnapKV to select a portion of important tokens to be stored in the GPU, while the KVCache of other tokens is placed in memory. By leveraging the high efficiency of LSH in high-dimensional space searches and the multithreading capabilities of CPUs, MagicPiG retrieves KVCache from memory that is similar to the current query and loads it into memory for inference. Compared to the earlier methods like InfLLM, Quest, and SnapKV, MagicPiG does not need to scan all representative tokens and select the top-k KVCache. Instead, it utilizes the mathematical properties of LSH, which not only simulates attention scores but also allows for identifying important KVCache with low overhead and high speed.
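The LSH trick MagicPiG relies on can be illustrated with a SimHash toy (random hyperplanes; our illustration, not MagicPiG's implementation): keys whose sign patterns agree with the query's on most hyperplanes tend to have high cosine similarity with it, so likely-important cache entries can be found without scoring every key.

```python
import torch

torch.manual_seed(0)
num_keys, head_dim, num_bits = 10000, 128, 64
keys = torch.randn(num_keys, head_dim)
query = torch.randn(head_dim)

# SimHash: the signature of a vector is its sign pattern w.r.t. random hyperplanes.
planes = torch.randn(num_bits, head_dim)
key_sigs = keys @ planes.T > 0        # (num_keys, num_bits) boolean signatures
query_sig = planes @ query > 0        # (num_bits,)

# Fraction of matching bits approximates angular similarity; no full q·k scan needed.
agreement = (key_sigs == query_sig).float().mean(dim=-1)
candidates = agreement.topk(100).indices
print(candidates.shape)
```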
The above are just descriptions of some key points. For more detailed explanations, you can refer to the existing articles on Zhihu in Chinese:
- https://zhuanlan.zhihu.com/p/701580870
- https://zhuanlan.zhihu.com/p/714288577
## KTransformers CPU Sparse Attn Framework
### Framework Prototype
Based on the introduction of the above papers, we have distilled the following key points:
- The distribution of attention weights is sparse, and useless KVCache may introduce noise, which could actually reduce performance during the inference stage.
- For the KVCache eviction strategy during the inference stage, the common approach is to retain the tokens from the beginning and the end of the prompt, while designing algorithms to select the tokens from the middle portion. One of the main factors affecting the model's performance is the ability to accurately identify the key tokens.
- Managing the middle portion of tokens in blocks can improve memory swapping and attention computation efficiency, and smaller blocks do not seem to perform worse than token-level granularity.
- The tokens that each attention layer focuses on during inference differ, and even the allocated KVCache capacity for different layers should vary.
Based on these insights and inspirations, we developed a general framework for implementing sparse CPU attention operators during the inference phase. In the prefill stage, we use chunked prefill, loading only one layer of KVCache into GPU memory at a time for computation. Once completed, the KVCache is stored on CPU/DRAM. In the subsequent decode stage, instead of swapping KVCache in and out, the sparse attention operator runs directly on the CPU. **This significantly reduces the minimum GPU memory requirements, making local 128K or even 1M token contexts possible.**
Specifically during the generation phase, we implemented the entire framework as shown in the diagram below.
![KTransformers long context v1](../assets/KTransformers_long_context_v1.png)
We organized the KVCache in units of blocks. Specifically:
- **KVCache Partitioning:** A complete input prompt is divided into three configurable parts: Initial, Context, and Local. During the computation process, the Initial/Local parts will be fully attended to, while the Context part will be sparsely retrieved. This approach is based on findings from many papers (such as streamingLLM and Minference) which mention the existence of "attention sinks," where higher attention weights are often found at the beginning and the end of the sequence.
- **Context Block Partitioning:** For the middle Context, we follow the InfLLM approach by dividing it into blocks based on a configurable fixed number of tokens. Each block can select 1 to k tokens as its representative tokens. During the actual inference phase, the Context blocks that require attention are selected based on these representative tokens.
- Specifically, we have implemented the following methods for selecting representative tokens, based on the approaches outlined in various papers.
- Max: The maximum values of multiple tokens within a block, across each channel, are concatenated to form the representative token for the current block.
- Mean: The average values of multiple tokens within a block, across each channel, are concatenated to form the representative token for the current block.
- Quest: A combination of the previous two methods: the maximum and minimum values of multiple tokens within a block, across each channel, are taken as the representative tokens for the block. Under this method, the number of representative tokens is fixed at 2.
- Dynamic: By calculating the cumulative attention score for each token using a specific method, each block selects the top-k tokens with the highest scores as the representative tokens for the block. This is similar to InfLLM but with some simplifications.
- Fix: Select tokens at fixed intervals within the block.
- Once the representative tokens for each block are determined, use Equation 2 from InfLLM to calculate the similarity between the input X and the k representative tokens of each block B, and only select the top $r_k$ blocks for attention computation, where $l_P$ denotes the length of the historical tokens (a short sketch of this selection follows below).
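A minimal sketch of these options (shapes and the plain dot-product similarity are our simplifications; the real operator is a CPU kernel, and the Dynamic variant additionally needs attention statistics from prefill, so it is omitted here):

```python
import torch

num_blocks, block_size, head_dim, r_k = 256, 128, 128, 16
block_keys = torch.randn(num_blocks, block_size, head_dim)
query = torch.randn(head_dim)

def representative_tokens(block_keys, kind):
    # Reduce each block's keys to its representative token(s).
    if kind == "max":
        return block_keys.max(dim=1).values.unsqueeze(1)       # (B, 1, D)
    if kind == "mean":
        return block_keys.mean(dim=1, keepdim=True)            # (B, 1, D)
    if kind == "quest":                                        # always 2 representatives
        return torch.stack(
            (block_keys.min(dim=1).values, block_keys.max(dim=1).values), dim=1
        )                                                      # (B, 2, D)
    if kind == "fix":
        return block_keys[:, ::32, :]                          # every 32nd token
    raise ValueError(kind)

reps = representative_tokens(block_keys, "mean")
# Score each block by its best representative against the current query,
# then keep only the top r_k blocks for the attention computation.
block_scores = (reps @ query).max(dim=-1).values
selected_blocks = block_scores.topk(r_k).indices
print(selected_blocks.shape)
```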
Since InfLLM requires calculating a representative score for each token during the prefill stage and then selecting a representative token for each block based on these scores, this operation involves invasive modifications to the prefill implementation, making it difficult to integrate with other methods. Furthermore, in actual testing, we found that in most scenarios, similar or even better results can be achieved through a combination of other methods. Therefore, we ultimately decided not to integrate this method into the framework.
## Further Optimizations
After implementing the above framework, we conducted a series of evaluations based on LongBench and InfiniteBench.
At the beginning of the experiment, we designed the architecture so that the most relevant KVCache blocks would be reselected for every inference token. On the one hand, this strategy incurred significant overhead during the retrieval process. On the other hand, we found that in some scenarios, **frequently changing the selection of retrieved blocks did not lead to better results**. For example, in the kvretrieval dataset, we observed that the model's responses were often correct in the first half but incorrect in the second half. Since the answers to kvretrieval questions consist of long, meaningless strings, this indicates that the correct KVCache blocks were selected while inferring the earlier tokens but incorrect blocks were chosen during the later stages of inference.
To address this issue, we further integrated the method proposed in SnapKV. Before starting the inference, we preselect relevant KVCache blocks by analyzing the attention scores of the context tokens, based on the question. During the subsequent inference stages, the selection of KVCache blocks is restricted to this preselected range. This approach allowed us to select the block containing the correct answer 100% of the time in the kvretrieval dataset.
However, it should be noted that this method strictly relies on the structure of the Benchmark Prompt and **does not necessarily guarantee optimal performance in other scenarios, such as complex document understanding and generation tasks.** Therefore, we have integrated it into our framework as an optional module. The final framework and configurable parameters are as follows:
![KTransformers long context v2](../assets/KTransformers_long_context_v2.png)
Configuration
- **threads_num:** Number of CPU Threads
- **block_size:** KVCache Block Size
- **local_windows_len:** Prompt End Window Size
- **preselect_block_count:** Number of Preselected Blocks
- **second_block_count:** Number of Blocks Selected After Preselection
- **preselect_block:** Whether to Enable Preselection
- **token_step:** Interval Between Token Selections for KVCache
- **layer_step:** Interval Between Layer Selections for KVCache
- **dense_layer_num:** Number of Initial Layers Without KVCache Selection, Importing All KVCache
- **head_select_mode:** SEPARATE (in the GQA scenario, each kv_head selects blocks separately) / SHARED (all kv_heads select blocks together)
- **representative_type:** Method of Selecting Representative Tokens
- **representative_num:** Number of Representative Tokens
By modifying configuration options, various KVCache eviction or compression methods can be easily reproduced within our framework. For example:
- Setting `block_size` to 1 and `preselect_block` to True results in a version of SnapKV without the pooling operation.
- Setting `representative_type` to Quest, `preselect_block` to False, and `head_select_mode` to SEPARATE replicates the Quest method.
Below is the pseudocode for the framework:
```python
def preselect_blocks(local_q, kvcache):
    # SnapKV-style preselection: score every context token with the queries
    # from the observation window at the end of the prompt (local_q).
    key_states = kvcache.keycache
    attn_scores = torch.matmul(
        local_q, key_states.transpose(2, 3)
    ) / math.sqrt(head_dim)
    attn_scores += attn_mask
    attn_scores = nn.functional.softmax(
        attn_scores, dim=-1, dtype=torch.float32
    ).to(local_q.dtype)
    vote = attn_scores[..., initial_size:-local_size, :].sum(dim=-2)
    pool_vote = pool1d(vote, kernel_size=kernel_size, padding=kernel_size // 2, stride=1)
    indices = pool_vote.topk(max_capacity_prompt - local_size, dim=-1).indices
    kv_cache_block_indices = find_representative_tokens_block(indices)
    kvcache_after_preselection = kvcache[kv_cache_block_indices]
    ...
    return kvcache_after_preselection


def get_representative_tokens(kvcache, representative_type):
    # Calculate the representative token(s) for each block based on representative_type.
    return ...


def decode_attention(query, kvcache):
    token_step = 4   # reselect blocks once every token_step tokens
    layer_step = 4   # reselect blocks once every layer_step layers
    for token_idx in range(max_new_tokens):
        for layer_idx in range(config.num_hidden_layers):
            if token_idx % token_step != 0 or layer_idx % layer_step != 0:
                # This layer does not reselect in this round: reuse the blocks
                # chosen in an earlier round.
                kvcache_after_retrieval = history_kvcache_after_retrieval[layer_idx // layer_step]
            else:
                # Otherwise, reselect KVCache blocks with the current layer's query
                # and save the selection for later rounds/layers to reuse.
                kvcache_after_retrieval = retrieval_kvcache(query, kvcache)
                history_kvcache_after_retrieval[layer_idx // layer_step] = kvcache_after_retrieval
            # Compute attention over the selected blocks only.
            output = attn(query, kvcache_after_retrieval)
        yield output


# Model prefill; if preselection is required, the observation-window queries
# (local_q) also need to be saved.
local_q, kvcache = model.prefill(input_ids)
if config.preselect_block:
    # Preselection round
    kvcache = preselect_blocks(local_q, kvcache)
# Find the representative token(s) for each block.
block_representative_tokens = get_representative_tokens(
    kvcache,
    config.representative_type,
)
# Model generation: every decode step attends only to the selected blocks.
for output in decode_attention(query, kvcache):
    ...
```
## Experiment
At the beginning of testing, we will use the following basic configuration, which will be further optimized through the extended framework.
```yaml
max_seq_len: 256000 # KVCache length
block_size: 128 # KVCache block size
local_windows_len: 4096 # The KVCache of length local_windows_len is stored on the GPU.
second_block_count: 96 # After preselection, each time select the number of KVCache blocks. If >= preselect_block_count, use the preselected blocks.
threads_num: 64 # CPU thread num
representative_type: DYNAMIC # KVCache block representative token selection method.
kv_type: FP16
dense_layer_num: 0 # The first few layers do not need to fill or select KVCache
representative_num: 1 # The number of representative tokens within a KVCache block.
preselect_block: False # Whether to preselect.
head_select_mode: SHARED # All kv_heads jointly select.
preselect_block_count: 0 # Number of preselected blocks.
layer_step: 1 # Select every few layers.
token_step: 1 # Select every few tokens.
```
Under our framework, the comparison between the original model and the accelerated KTransformers on datasets such as 128K Needle-in-a-Haystack, passkey, and kvretrieval is as follows. The passkey dataset inserts a small segment of numbers at varying depths within redundant text; kvretrieval asks the model to find a matching item among randomly generated key-value pairs. All tests were conducted under the OpenCompass framework:
![needle_128K.png](../assets/needle_128K.png)
|                                                              | Single needle retrieval zh 128k | passkey | kvretrieval |
| ------------------------------------------------------------ | ------------------------------- | ------- | ----------- |
| Original model                                                | 99.89                           | 100     | 21.0        |
| KTransformers (reselect KVCache blocks for each generation)   | 100                             | 100     | 15.40       |
We can see that both the original model and the accelerated KTransformers achieve perfect scores on the relatively simpler datasets, such as Single Needle Retrieval and passkey. At the same time, the generation speed has significantly improved, increasing from 4.86 tokens/s with llama.cpp to 27.49 tokens/s with KTransformers, achieving up to a 5.65x speedup. Although the current configuration shows a noticeable drop in performance on the more challenging kvretrieval dataset, in the next section, we will address this by implementing a more optimized selection strategy to compensate for or even surpass the original model's accuracy.
Additionally, we tested the performance of the KTransformers-based configuration framework in reproducing the results of Quest. However, since InternLM2.5-7B-Chat-1M uses GQA (Grouped Query Attention) while the Quest paper primarily focuses on optimizing MHA (Multi-Head Attention) models, the actual testing results were not particularly favorable. The official team also mentioned that further support for GQA models is needed, so we will not discuss this in detail for now.
### Further improve performance
By modifying certain configurations within our flexible framework on the basis of reproduction, **we can actually achieve better results than those reported in the previous paper,** as shown in the figure below:
![](../assets/Framework_effect.png)
As mentioned earlier, the goal of the kvretrieval dataset is to find a matching key-value pair within a long sequence of semantically meaningless pairs. If blocks are reselected from scratch for every generated token, the likelihood of deviation grows as the text grows, leading to the selection of KVCache blocks that differ from earlier selections. To address this, we introduced a preselection mechanism that uses SnapKV-style scoring to choose representative tokens and preselect a portion of the KVCache blocks; during the subsequent inference process, selection is limited to these blocks. After one round of preselection, the score increased from 15.4 to 24.2, **surpassing the original model + full attention's performance of 21 points.** Further research indicates that the sparsity of the KVCache in the first few layers of LLMs is not as pronounced, so we set the first two layers to fully reuse the KVCache, ultimately achieving a score of **24.4**.
Similarly, when testing the needle-in-a-haystack task on the 1M dataset, we not only reproduced the original model's reported score but also further improved accuracy (**from 89.31 to 92.88**) by using the KTransformers CPU Sparse Attn Framework to selectively compute only certain KVCache blocks. Additionally, the inference speed **reached nearly 10 times that of llama.cpp**.
![needle 1M.png](../assets/needle_1M.png)
### More comparisons
As shown in the two figures below, using the Single Needle Retrieval dataset as an example, we set llama.cpp to store the KVCache on CPU/DRAM while performing all computations on the GPU. On a 4090D server, we compared the KTransformers CPU Sparse Attn Framework with llama.cpp. While maintaining **100% answer accuracy**, we achieved a 20.6 to 94.1 times prefill speed increase and a **1.2 to 7.1 times inference speed boost**.
| ![long context prefill.png](../assets/long_context_prefill.png) | ![long context generate.png](../assets/long_context_generate.png) |
| --------------------------------------------------------------- | ----------------------------------------------------------------- |
The main reason for the significant gap in prefill speed is that after enabling KVCache offload, llama.cpp performs the attention (attn) computation on the CPU. In long-text scenarios, attention not only requires heavy computation but also takes up the majority of the computation time. In contrast, KTransformers leverages a flexible template injection framework to implement GPU Chunk Prefill layer by layer. Moving forward, we plan to further integrate high-performance sparse prefill methods such as MInference to boost speed even further.
Additionally, as a key focus of this article, the right-hand graph shows that as the prompt length increases, the inference speed of KTransformers remains stable, hovering near a horizontal line. In contrast, llama.cpp slows down as the prompt length increases. By selecting only the most important 16K KVCache blocks to participate in the inference computation, KTransformers maintains a consistent inference speed comparable to llama.cpp when processing a 16K prompt, without any performance degradation (at least on these test datasets).
## How to Use
Currently, long context is only supported by our **local_chat.py** interface, and the integration with the server interface is under development.
To make setup easier, we have uploaded the model config, GGUF weights, and tokenizer to a single repo: https://huggingface.co/nilv234/internlm2_5_to_llama_1m/tree/main
By setting the model_path and gguf_path in the local_chat function to **/path/to/repo** and setting the mode to **"long_context"**, you can use the InternLM2.5-7B-Chat-1M model with 1M-context functionality on 24GB of VRAM.
After running local_chat.py for the first time, a config.yaml file will be automatically created under **~/.ktransformers**. The relevant configurations for long context are as follows:
```yaml
chunk_size: 4096 # prefill chunk size
max_seq_len: 100000 # KVCache length
block_size: 128 # KVCache block size
local_windows_len: 4096 # The KVCache of length local_windows_len is stored on the GPU.
second_select_num: 96 # After preselection, each time select the number of KVCache blocks. If >= preselect_block_count, use the preselected blocks.
threads_num: 64 # CPU thread num
anchor_type: DYNAMIC # KVCache block representative token selection method.
kv_type: FP16
dense_layer_num: 0 # The first few layers do not need to fill or select KVCache
anchor_num: 1 # The number of representative tokens within a KVCache block.
preselect_block: False # Whether to preselect.
head_select_mode: SHARED # All kv_heads jointly select.
preselect_block_count: 96 # Number of preselected blocks.
layer_step: 1 # Select every few layers.
token_step: 1 # Select every few tokens.
```
The memory required for different context lengths is shown in the table below:
| | 4K | 32K | 64K | 128K | 512K | 1M |
| -------------- | --- | ---- | ---- | ---- | ---- | ------ |
| DRAM Size (GB) | 0.5 | 4.29 | 8.58 | 17.1 | 68.7 | 145.49 |
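For intuition, these figures can be roughly reproduced from the raw K/V storage alone; the sketch below assumes 32 layers, 8 KV heads (GQA), head_dim 128, and FP16, and ignores any framework bookkeeping, so treat it as an approximate lower bound.

```python
# Rough estimate of raw KVCache storage (assumed geometry; approximate only).
layers, kv_heads, head_dim, dtype_bytes = 32, 8, 128, 2  # FP16

def kvcache_gb(context_len: int) -> float:
    per_token = 2 * layers * kv_heads * head_dim * dtype_bytes  # K and V per token
    return context_len * per_token / 1e9

for ctx in (4 * 1024, 32 * 1024, 64 * 1024, 128 * 1024, 512 * 1024, 1024 * 1024):
    print(f"{ctx // 1024}K tokens -> ~{kvcache_gb(ctx):.2f} GB")
```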
Please choose an appropriate max_seq_len based on your DRAM size.
For example:
```sh
python local_chat.py --model_path="/data/model/internlm2_5_to_llama_1m" --gguf_path="/data/model/internlm2_5_to_llama_1m" --max_new_tokens=500 --cpu_infer=10 --use_cuda_graph=True --mode="long_context" --prompt_file="/path/to/file"
```

View file

@ -1 +1,11 @@
__version__ = "0.1.2"
#!/usr/bin/env python
# coding=utf-8
'''
Description :
Author : kkk1nak0
Date : 2024-08-15 07:34:46
Version : 1.0.0
LastEditors : chenxl
LastEditTime : 2024-08-28 15:19:03
'''
__version__ = "0.1.3"

View file

@ -34,4 +34,20 @@ web:
open_cross_domain: True
ext:
cpu_infer: 10
cpu_infer: 10
long_context:
chunk_size: 4096
max_seq_len: 32000
block_size: 128
local_windows_len: 4096
second_select_num: 32
anchor_type: DYNAMIC
kv_type: FP16
dense_layer_num: 2
anchor_num: 1
preselect_block: True
head_select_mode: SHARED
preselect_block_count: 32
layer_step: 1
token_step: 100

View file

@ -1,6 +1,7 @@
cmake_minimum_required(VERSION 3.16)
project(cpuinfer_ext VERSION 0.1.0)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -ffast-math")
set(CMAKE_BUILD_TYPE "Release")
@ -215,7 +216,8 @@ aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} SOURCE_DIR1)
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/cpu_backend SOURCE_DIR2)
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/operators/llamafile SOURCE_DIR3)
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/llamafile SOURCE_DIR4)
set(ALL_SOURCES ${SOURCE_DIR1} ${SOURCE_DIR2} ${SOURCE_DIR3} ${SOURCE_DIR4})
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/operators/kvcache SOURCE_DIR5)
set(ALL_SOURCES ${SOURCE_DIR1} ${SOURCE_DIR2} ${SOURCE_DIR3} ${SOURCE_DIR4} ${SOURCE_DIR5})
message(STATUS "ALL_SOURCES: ${ALL_SOURCES}")
pybind11_add_module(${PROJECT_NAME} MODULE ${ALL_SOURCES})
@ -223,5 +225,8 @@ target_link_libraries(${PROJECT_NAME} PRIVATE llama)
if(WIN32)
target_link_libraries(${PROJECT_NAME} PRIVATE "$ENV{CUDA_PATH}/lib/x64/cudart.lib")#CUDA::cudart
elseif(UNIX)
if(NOT DEFINED ENV{CUDA_HOME} OR "$ENV{CUDA_HOME}" STREQUAL "")
set(ENV{CUDA_HOME} "/usr/local/cuda")
endif()
target_link_libraries(${PROJECT_NAME} PRIVATE "$ENV{CUDA_HOME}/lib64/libcudart.so")
endif()
endif()

View file

@ -0,0 +1,178 @@
#!/usr/bin/env python
# coding=utf-8
"""
Description :
Author : Jianwei Dong
Date : 2024-08-28 10:32:05
Version : 1.0.0
LastEditors : Jianwei Dong
LastEditTime : 2024-08-28 10:32:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + "/../build")
import cpuinfer_ext
import torch
layer_num = 10
kv_head_num = 8
q_head_num = 32
head_dim = 128
block_len = 128
anchor_num = 1
anchor_type = cpuinfer_ext.kvcache.AnchorType.DYNAMIC
kv_type = cpuinfer_ext.kvcache.ggml_type.FP16
retrieval_type = cpuinfer_ext.kvcache.RetrievalType.LAYER
layer_step: int = 1
token_step: int = 1
layer_offset: int = 0
max_thread_num: int = 64
max_batch_size: int = 1
max_block_num: int = 1024
CPUInfer = cpuinfer_ext.CPUInfer(max_thread_num)
warm_up_iter = 1000
test_iter = 10000
def bench_linear(cache_seqlen: int):
with torch.inference_mode(mode=True):
cache_seqlens = torch.tensor([cache_seqlen], dtype=torch.int32, device="cpu")
seqlens_zero = torch.zeros((1,), dtype=torch.int32, device="cpu")
config = cpuinfer_ext.kvcache.KVCacheConfig(
layer_num,
kv_head_num,
q_head_num,
head_dim,
block_len,
anchor_num,
anchor_type,
kv_type,
retrieval_type,
layer_step,
token_step,
layer_offset,
max_block_num,
max_batch_size,
max_thread_num,
)
local_kvcache = cpuinfer_ext.kvcache.KVCache(config)
block_table = (
torch.arange(max_block_num, dtype=torch.int32, device="cpu")
.contiguous()
.view(1, -1)
)
for layer_idx in range(layer_num):
k_cache = torch.randn(
(1, cache_seqlen, kv_head_num, head_dim),
dtype=torch.float16,
device="cpu",
).contiguous()
v_cache = torch.randn(
(1, cache_seqlen, kv_head_num, head_dim),
dtype=torch.float16,
device="cpu",
).contiguous()
CPUInfer.submit(
local_kvcache.update_kvcache_fp16(
k_cache.data_ptr(),
v_cache.data_ptr(),
layer_idx,
block_table.data_ptr(),
1,
max_block_num,
seqlens_zero.data_ptr(),
cache_seqlen,
)
)
CPUInfer.sync()
input = torch.randn(
(1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
output = torch.empty(
(1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
# attn_lse: (bsz, q_len, q_head_num)
attn_lse = torch.empty(
(1, 1, q_head_num), dtype=torch.float32, device="cpu"
).contiguous()
input = input / 100
# warm up
for i in range(warm_up_iter):
CPUInfer.submit(
local_kvcache.attn(
input.data_ptr(),
output.data_ptr(),
attn_lse.data_ptr(),
i % layer_num,
0,
1,
1,
max_block_num,
block_table.data_ptr(),
cache_seqlens.data_ptr(),
-1,
-1,
-1,
)
)
CPUInfer.sync()
# test
start = time.perf_counter()
for i in range(test_iter):
CPUInfer.submit(
local_kvcache.attn(
input.data_ptr(),
output.data_ptr(),
attn_lse.data_ptr(),
i % layer_num,
0,
1,
1,
max_block_num,
block_table.data_ptr(),
cache_seqlens.data_ptr(),
-1,
-1,
-1,
)
)
CPUInfer.sync()
end = time.perf_counter()
total_time = end - start
print("cache sequence length: ", cache_seqlen)
print("Time(s): ", total_time)
print("Iteration: ", test_iter)
print("Time(us) per iteration: ", total_time / test_iter * 1000000)
print(
"Bandwidth: ",
cache_seqlen
* kv_head_num
* head_dim
* 2
* 2
* test_iter
/ total_time
/ 1000
/ 1000
/ 1000,
"GB/s",
)
print("")
bench_linear(1024)
bench_linear(4096)
bench_linear(16384)
bench_linear(32768)
bench_linear(65536)

View file

@ -0,0 +1,94 @@
#!/usr/bin/env python
# coding=utf-8
"""
Description :
Author : Jianwei Dong
Date : 2024-08-28 10:32:05
Version : 1.0.0
LastEditors : Jianwei Dong
LastEditTime : 2024-08-28 10:32:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + "/../build")
import cpuinfer_ext
import torch
layer_num = 10
kv_head_num = 8
q_head_num = 32
head_dim = 128
block_len = 128
anchor_num = 1
warm_up_iter = 1000
test_iter = 10000
def bench_linear(cache_seqlen: int, device):
with torch.inference_mode(mode=True):
kvcaches = []
for layer_idx in range(layer_num):
k_cache = torch.randn(
(1, 32, cache_seqlen, head_dim),
dtype=torch.float16,
device=device,
).contiguous()
v_cache = torch.randn(
(1, 32, cache_seqlen, head_dim),
dtype=torch.float16,
device=device,
).contiguous()
kvcaches.append((k_cache, v_cache))
input = torch.randn(
(1, q_head_num, 1, head_dim), dtype=torch.float16, device=device
).contiguous()
input = input / 100
# warm up
for i in range(warm_up_iter):
k_cache = kvcaches[i % layer_num][0]
v_cache = kvcaches[i % layer_num][1]
torch.nn.functional.scaled_dot_product_attention(input, k_cache, v_cache)
# test
start = time.perf_counter()
for i in range(test_iter):
k_cache = kvcaches[i % layer_num][0]
v_cache = kvcaches[i % layer_num][1]
torch.nn.functional.scaled_dot_product_attention(input, k_cache, v_cache)
end = time.perf_counter()
total_time = end - start
print("cache sequence length: ", cache_seqlen)
print("Time(s): ", total_time)
print("Iteration: ", test_iter)
print("Time(us) per iteration: ", total_time / test_iter * 1000000)
print(
"Bandwidth: ",
cache_seqlen
* q_head_num
* head_dim
* 2
* 2
* test_iter
/ total_time
/ 1000
/ 1000
/ 1000,
"GB/s",
)
print("")
bench_linear(1024, "cpu")
bench_linear(4096, "cpu")
bench_linear(1024, "cuda")
bench_linear(4096, "cuda")
bench_linear(16384, "cuda")
bench_linear(32768, "cuda")
bench_linear(65536, "cuda")

View file

@ -3,93 +3,125 @@
* @Author : chenht2022
* @Date : 2024-07-22 02:03:05
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:33:34
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "backend.h"
Backend::Backend(int thread_num) {
thread_num_ = thread_num;
thread_state_.resize(thread_num);
for (int i = 0; i < thread_num; i++) {
thread_local int Backend::thread_local_id = -1;
Backend::Backend(int max_thread_num) {
max_thread_num_ = max_thread_num;
thread_state_.resize(max_thread_num_);
for (int i = 0; i < max_thread_num_; i++) {
thread_state_[i].curr = std::make_unique<std::atomic<int>>();
thread_state_[i].status = std::make_unique<std::atomic<ThreadStatus>>(ThreadStatus::WAITING);
thread_state_[i].status =
std::make_unique<std::atomic<ThreadStatus>>(ThreadStatus::WAITING);
}
workers_.resize(thread_num);
for (int i = 1; i < thread_num; i++) {
workers_.resize(max_thread_num_);
for (int i = 1; i < max_thread_num_; i++) {
workers_[i] = std::thread(&Backend::worker_thread, this, i);
}
}
Backend::~Backend() {
for (int i = 0; i < thread_num_; i++) {
thread_state_[i].status->store(ThreadStatus::EXIT, std::memory_order_release);
for (int i = 0; i < max_thread_num_; i++) {
thread_state_[i].status->store(ThreadStatus::EXIT,
std::memory_order_release);
}
for (int i = 1; i < thread_num_; i++) {
for (int i = 1; i < max_thread_num_; i++) {
if (workers_[i].joinable()) {
workers_[i].join();
}
}
}
int Backend::get_thread_num() {
return thread_num_;
}
int Backend::get_thread_num() { return max_thread_num_; }
void Backend::do_work_stealing_job(int task_num, std::function<void(int)> func) {
func_ = func;
void Backend::do_work_stealing_job(int task_num,
std::function<void(int)> init_func,
std::function<void(int)> compute_func,
std::function<void(int)> finalize_func) {
init_func_ = init_func;
compute_func_ = compute_func;
finalize_func_ = finalize_func;
thread_num_ = std::min(max_thread_num_, task_num);
int base = task_num / thread_num_;
int remain = task_num % thread_num_;
thread_state_[0].end = base + (0 < remain);
// Set thread_local_id for the main thread
thread_local_id = 0;
for (int i = 1; i < thread_num_; i++) {
thread_state_[i].curr->store(thread_state_[i - 1].end, std::memory_order_relaxed);
thread_state_[i].curr->store(thread_state_[i - 1].end,
std::memory_order_relaxed);
thread_state_[i].end = thread_state_[i - 1].end + base + (i < remain);
thread_state_[i].status->store(ThreadStatus::WORKING, std::memory_order_release);
thread_state_[i].status->store(ThreadStatus::WORKING,
std::memory_order_release);
}
thread_state_[0].curr->store(0, std::memory_order_relaxed);
thread_state_[0].status->store(ThreadStatus::WORKING, std::memory_order_release);
thread_state_[0].status->store(ThreadStatus::WORKING,
std::memory_order_release);
process_tasks(0);
for (int i = 1; i < thread_num_; i++) {
while (thread_state_[i].status->load(std::memory_order_acquire) == ThreadStatus::WORKING) {
while (thread_state_[i].status->load(std::memory_order_acquire) ==
ThreadStatus::WORKING) {
}
}
}
void Backend::process_tasks(int thread_id) {
if (init_func_ != nullptr) {
init_func_(thread_id);
}
while (true) {
int task_id = thread_state_[thread_id].curr->fetch_add(1, std::memory_order_acq_rel);
int task_id = thread_state_[thread_id].curr->fetch_add(
1, std::memory_order_acq_rel);
if (task_id >= thread_state_[thread_id].end) {
break;
}
func_(task_id);
compute_func_(task_id);
}
for (int t_offset = 1; t_offset < thread_num_; t_offset++) {
int t_i = (thread_id + t_offset) % thread_num_;
if (thread_state_[t_i].status->load(std::memory_order_acquire) != ThreadStatus::WORKING) {
if (thread_state_[t_i].status->load(std::memory_order_acquire) !=
ThreadStatus::WORKING) {
continue;
}
while (true) {
int task_id = thread_state_[t_i].curr->fetch_add(1, std::memory_order_acq_rel);
int task_id = thread_state_[t_i].curr->fetch_add(
1, std::memory_order_acq_rel);
if (task_id >= thread_state_[t_i].end) {
break;
}
func_(task_id);
compute_func_(task_id);
}
}
thread_state_[thread_id].status->store(ThreadStatus::WAITING, std::memory_order_release);
if (finalize_func_ != nullptr) {
finalize_func_(thread_id);
}
thread_state_[thread_id].status->store(ThreadStatus::WAITING,
std::memory_order_release);
}
void Backend::worker_thread(int thread_id) {
auto start = std::chrono::steady_clock::now();
thread_local_id = thread_id; // Record this worker's id in the thread-local variable
while (true) {
ThreadStatus status = thread_state_[thread_id].status->load(std::memory_order_acquire);
ThreadStatus status =
thread_state_[thread_id].status->load(std::memory_order_acquire);
if (status == ThreadStatus::WORKING) {
process_tasks(thread_id);
start = std::chrono::steady_clock::now();
} else if (status == ThreadStatus::WAITING) {
auto now = std::chrono::steady_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(now - start).count();
auto duration =
std::chrono::duration_cast<std::chrono::milliseconds>(now -
start)
.count();
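// Back off after ~50 ms of idling so a waiting worker does not spin at 100% CPU.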
if (duration > 50) {
std::this_thread::sleep_for(std::chrono::milliseconds(1));
}

View file

@ -3,7 +3,7 @@
* @Author : chenht2022
* @Date : 2024-07-22 02:03:05
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:33:38
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
@ -31,20 +31,25 @@ struct ThreadState {
};
class Backend {
public:
public:
Backend(int);
~Backend();
int get_thread_num();
void do_work_stealing_job(int, std::function<void(int)>);
void do_work_stealing_job(int, std::function<void(int)>,
std::function<void(int)>,
std::function<void(int)>);
static thread_local int thread_local_id;
private:
private:
int thread_num_;
std::vector<ThreadState> thread_state_; // [thread_num]
std::function<void(int)> func_;
int max_thread_num_;
std::vector<ThreadState> thread_state_; // [thread_num]
std::function<void(int)> init_func_;
std::function<void(int)> compute_func_;
std::function<void(int)> finalize_func_;
std::vector<std::thread> workers_;
void process_tasks(int);
void worker_thread(int);
};
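// Usage sketch (illustrative, not part of this diff): the three callbacks passed to
// do_work_stealing_job split a job into per-thread setup, per-task compute, and
// per-thread teardown; thread_local_id identifies the executing thread even when it
// steals tasks from another thread's range. do_task below is a hypothetical helper.
//
//   Backend backend(8);
//   std::vector<double> partial(backend.get_thread_num(), 0.0);
//   backend.do_work_stealing_job(
//       1024,
//       [&](int thread_id) { partial[thread_id] = 0.0; },                       // init
//       [&](int task_id) { partial[Backend::thread_local_id] += do_task(task_id); },
//       [&](int thread_id) { /* e.g. fold partial[thread_id] into a total */ }); // finalize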
#endif

View file

@ -54,4 +54,4 @@ void TaskQueue::processTasks() {
}
mutex.unlock();
}
}
}

View file

@ -4,7 +4,7 @@
* @Date : 2024-07-16 10:43:18
* @Version : 1.0.0
* @LastEditors : chenxl
* @LastEditTime : 2024-08-12 12:28:25
* @LastEditTime : 2024-08-08 04:23:51
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#ifndef CPUINFER_TASKQUEUE_H

View file

@ -0,0 +1,142 @@
#!/usr/bin/env python
# coding=utf-8
"""
Description :
Author : Jianwei Dong
Date : 2024-08-28 10:32:05
Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-08-28 10:32:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + "/../build")
import cpuinfer_ext
from flash_attn import flash_attn_with_kvcache
import torch
layer_num = 10
kv_head_num = 8
q_head_num = 32
head_dim = 128
block_len = 128
anchor_num = 1
cache_seqlen = 8192
cache_seqlens = torch.tensor([cache_seqlen], dtype=torch.int32, device="cpu")
seqlens_zero = torch.zeros((1,), dtype=torch.int32, device="cpu")
anchor_type = cpuinfer_ext.kvcache.AnchorType.DYNAMIC
kv_type = cpuinfer_ext.kvcache.ggml_type.FP16
retrieval_type = cpuinfer_ext.kvcache.RetrievalType.LAYER
layer_step: int = 1
token_step: int = 1
layer_offset: int = 0
max_thread_num: int = 2
max_batch_size: int = 1
max_block_num: int = 512
CPUInfer = cpuinfer_ext.CPUInfer(max_thread_num)
validation_iter = 100
with torch.inference_mode(mode=True):
config = cpuinfer_ext.kvcache.KVCacheConfig(
layer_num,
kv_head_num,
q_head_num,
head_dim,
block_len,
anchor_num,
anchor_type,
kv_type,
retrieval_type,
layer_step,
token_step,
layer_offset,
max_block_num,
max_batch_size,
max_thread_num,
)
local_kvcache = cpuinfer_ext.kvcache.KVCache(config)
kvcaches = []
block_table = (
torch.arange(max_block_num, dtype=torch.int32, device="cpu")
.contiguous()
.view(1, -1)
)
for layer_idx in range(layer_num):
k_cache = torch.randn(
(1, cache_seqlen, kv_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
v_cache = torch.randn(
(1, cache_seqlen, kv_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
CPUInfer.submit(
local_kvcache.update_kvcache_fp16(
k_cache.data_ptr(),
v_cache.data_ptr(),
layer_idx,
block_table.data_ptr(),
1,
max_block_num,
seqlens_zero.data_ptr(),
cache_seqlen,
)
)
CPUInfer.sync()
kvcaches.append((k_cache.to("cuda"), v_cache.to("cuda")))
# validation
for i in range(validation_iter):
k_cache = kvcaches[i % layer_num][0]
v_cache = kvcaches[i % layer_num][1]
input = torch.randn(
(1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
output = torch.empty(
(1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
# attn_lse: (bsz, q_len, q_head_num)
attn_lse = torch.empty(
(1, 1, q_head_num), dtype=torch.float32, device="cpu"
).contiguous()
input = input / 100
CPUInfer.submit(
local_kvcache.attn(
input.data_ptr(),
output.data_ptr(),
attn_lse.data_ptr(),
i % layer_num,
0,
1,
1,
max_block_num,
block_table.data_ptr(),
cache_seqlens.data_ptr(),
-1,
-1,
-1,
)
)
CPUInfer.sync()
# print("cpuinfer output", output)
t_output = flash_attn_with_kvcache(
q=input.to("cuda"),
k_cache=k_cache,
v_cache=v_cache,
cache_seqlens=cache_seqlens.to("cuda"),
)
# print("torch output", t_output)
diff = torch.mean(torch.abs(output.to("cuda") - t_output)) / torch.mean(
torch.abs(t_output)
)
print("diff = ", diff)
assert diff < 0.001

View file

@ -1,19 +1,17 @@
/**
* @Description :
* @Author : chenht2022
* @Author : chenht2022, Jianwei Dong
* @Date : 2024-07-22 02:03:22
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-08-07 10:39:37
* @LastEditors : Jianwei Dong
* @LastEditTime : 2024-08-26 22:47:06
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
// Python bindings
#include <cstdint>
#include <iostream>
#include <memory>
#include "cpu_backend/cpuinfer.h"
#include "device_launch_parameters.h"
#include "llamafile/flags.h"
#include "operators/kvcache/kvcache.h"
#include "operators/llamafile/linear.h"
#include "operators/llamafile/mlp.h"
#include "operators/llamafile/moe.h"
@ -21,119 +19,541 @@
#include "pybind11/operators.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
#include <cstdint>
#include <iostream>
#include <memory>
namespace py = pybind11;
using namespace pybind11::literals;
class LinearBindings {
public:
class WarmUpBindinds {
public:
// Binding functions for the KVCache class
class KVCacheBindings {
public:
class AttnBindings {
public:
struct Args {
CPUInfer* cpuinfer;
Linear* linear;
CPUInfer *cpuinfer;
KVCache *kv_cache;
const ggml_fp16_t *q_in;
ggml_fp16_t *output;
float *attn_lse;
int layer_idx;
int generate_token_idx;
int q_len;
int batch_size;
int max_block_num;
int *block_table;
int *cache_seqlens;
int pick_block_num;
int init_block_num;
int local_block_num;
};
static void inner(void* args) {
Args* args_ = (Args*)args;
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(
&KVCache::attn, args_->kv_cache, args_->q_in, args_->output,
args_->attn_lse, args_->layer_idx, args_->generate_token_idx,
args_->q_len, args_->batch_size, args_->max_block_num,
args_->block_table, args_->cache_seqlens, args_->pick_block_num,
args_->init_block_num, args_->local_block_num);
}
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(KVCache &kv_cache, intptr_t q_in, intptr_t output,
intptr_t attn_lse, int layer_idx,
int generate_token_idx, int q_len, int batch_size,
int max_block_num, intptr_t block_table,
intptr_t cache_seqlens, int pick_block_num,
int init_block_num, int local_block_num) {
Args *args = new Args{nullptr,
&kv_cache,
(const ggml_fp16_t *)q_in,
(ggml_fp16_t *)output,
(float *)attn_lse,
layer_idx,
generate_token_idx,
q_len,
batch_size,
max_block_num,
(int *)block_table,
(int *)cache_seqlens,
pick_block_num,
init_block_num,
local_block_num};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
class GetAllKVCacheOneLayerBindings {
public:
struct Args {
CPUInfer *cpuinfer;
KVCache *kv_cache;
int layer_id;
ggml_fp16_t *k_in;
ggml_fp16_t *v_in;
};
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(&KVCache::get_all_kvcache_one_layer,
args_->kv_cache, args_->layer_id,
args_->k_in, args_->v_in);
}
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(KVCache &kv_cache, intptr_t k_in, intptr_t v_in,
int layer_id) {
Args *args = new Args{nullptr, &kv_cache, layer_id,
(ggml_fp16_t *)k_in, (ggml_fp16_t *)v_in};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
class GetAndUpdateKVCacheFp16Bindings {
public:
struct Args {
CPUInfer *cpuinfer;
KVCache *kv_cache;
ggml_fp16_t *k_in;
ggml_fp16_t *v_in;
int layer_id;
int *block_table;
int batch_size;
int max_block_num;
int *cache_seqlens;
int q_len;
};
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(&KVCache::get_and_update_kvcache_fp16,
args_->kv_cache, args_->k_in, args_->v_in,
args_->layer_id, args_->block_table,
args_->batch_size, args_->max_block_num,
args_->cache_seqlens, args_->q_len);
}
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(KVCache &kv_cache, intptr_t k_in, intptr_t v_in,
int layer_id, intptr_t block_table, int batch_size,
int max_block_num, intptr_t cache_seqlens,
int q_len) {
Args *args = new Args{nullptr,
&kv_cache,
(ggml_fp16_t *)k_in,
(ggml_fp16_t *)v_in,
layer_id,
(int *)block_table,
batch_size,
max_block_num,
(int *)cache_seqlens,
q_len};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
class GetKVCacheFp16Bindings {
public:
struct Args {
CPUInfer *cpuinfer;
KVCache *kv_cache;
ggml_fp16_t *k_in;
ggml_fp16_t *v_in;
int layer_id;
int *block_table;
int batch_size;
int max_block_num;
int *cache_seqlens;
};
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(
&KVCache::get_kvcache_fp16, args_->kv_cache, args_->k_in,
args_->v_in, args_->layer_id, args_->block_table,
args_->batch_size, args_->max_block_num, args_->cache_seqlens);
}
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(KVCache &kv_cache, intptr_t k_in, intptr_t v_in,
int layer_id, intptr_t block_table, int batch_size,
int max_block_num, intptr_t cache_seqlens) {
Args *args = new Args{nullptr,
&kv_cache,
(ggml_fp16_t *)k_in,
(ggml_fp16_t *)v_in,
layer_id,
(int *)block_table,
batch_size,
max_block_num,
(int *)cache_seqlens};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
class UpdateKVCacheFp16Bindings {
public:
struct Args {
CPUInfer *cpuinfer;
KVCache *kv_cache;
ggml_fp16_t *k_in;
ggml_fp16_t *v_in;
int layer_id;
int *block_table;
int batch_size;
int max_block_num;
int *cache_seqlens;
int q_len;
};
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(&KVCache::update_kvcache_fp16,
args_->kv_cache, args_->k_in, args_->v_in,
args_->layer_id, args_->block_table,
args_->batch_size, args_->max_block_num,
args_->cache_seqlens, args_->q_len);
}
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(KVCache &kv_cache, intptr_t k_in, intptr_t v_in,
int layer_id, intptr_t block_table, int batch_size,
int max_block_num, intptr_t cache_seqlens,
int q_len) {
Args *args = new Args{nullptr,
&kv_cache,
(ggml_fp16_t *)k_in,
(ggml_fp16_t *)v_in,
layer_id,
(int *)block_table,
batch_size,
max_block_num,
(int *)cache_seqlens,
q_len};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
class UpdateImportanceBindings {
public:
struct Args {
CPUInfer *cpuinfer;
KVCache *kv_cache;
const ggml_fp16_t *importance;
int layer_id;
int *block_table;
int batch_size;
int max_block_num;
int *offset;
int width;
};
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(
&KVCache::update_importance, args_->kv_cache, args_->importance,
args_->layer_id, args_->block_table, args_->batch_size,
args_->max_block_num, args_->offset, args_->width);
}
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(KVCache &kv_cache, intptr_t importance, int layer_id,
intptr_t block_table, int batch_size,
int max_block_num, intptr_t offset, int width) {
Args *args = new Args{nullptr,
&kv_cache,
(const ggml_fp16_t *)importance,
layer_id,
(int *)block_table,
batch_size,
max_block_num,
(int *)offset,
width};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
class AttnWithKVCacheBindings {
public:
struct Args {
CPUInfer *cpuinfer;
KVCache *kv_cache;
const ggml_fp16_t *q_in;
const ggml_fp16_t *k_in;
const ggml_fp16_t *v_in;
ggml_fp16_t *output;
float *attn_lse;
int layer_idx;
int generate_token_idx;
int q_len;
int batch_size;
int max_block_num;
int *block_table;
int *cache_seqlens;
int topk;
int local;
};
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(
&KVCache::attn_with_kvcache, args_->kv_cache, args_->q_in,
args_->k_in, args_->v_in, args_->output, args_->attn_lse,
args_->layer_idx, args_->generate_token_idx, args_->q_len,
args_->batch_size, args_->max_block_num, args_->block_table,
args_->cache_seqlens, args_->topk, args_->local);
}
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(KVCache &kv_cache, intptr_t q_in, intptr_t k_in,
intptr_t v_in, intptr_t output, intptr_t attn_lse,
int layer_idx, int generate_token_idx, int q_len,
int batch_size, int max_block_num,
intptr_t block_table, intptr_t cache_seqlens,
int topk, int local) {
Args *args = new Args{nullptr,
&kv_cache,
(const ggml_fp16_t *)q_in,
(const ggml_fp16_t *)k_in,
(const ggml_fp16_t *)v_in,
(ggml_fp16_t *)output,
(float *)attn_lse,
layer_idx,
generate_token_idx,
q_len,
batch_size,
max_block_num,
(int *)block_table,
(int *)cache_seqlens,
topk,
local};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
class ClearImportanceAllLayersBindings {
public:
struct Args {
CPUInfer *cpuinfer;
KVCache *kv_cache;
int *block_table;
int *cache_seqlens;
int batch_size;
int max_block_num;
};
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(&KVCache::clear_importance_all_layers,
args_->kv_cache, args_->block_table,
args_->cache_seqlens, args_->batch_size,
args_->max_block_num);
}
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(KVCache &kv_cache, intptr_t block_table,
intptr_t cache_seqlens, int batch_size,
int max_block_num) {
Args *args = new Args{nullptr,
&kv_cache,
(int *)block_table,
(int *)cache_seqlens,
batch_size,
max_block_num};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
class CalcAnchorAllLayersBindinds {
public:
struct Args {
CPUInfer *cpuinfer;
KVCache *kv_cache;
int *block_table;
int *cache_seqlens;
int batch_size;
int max_block_num;
};
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(&KVCache::calc_anchor_all_layers,
args_->kv_cache, args_->block_table,
args_->cache_seqlens, args_->batch_size,
args_->max_block_num);
}
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(KVCache &kv_cache, intptr_t block_table,
intptr_t cache_seqlens, int batch_size,
int max_block_num) {
Args *args = new Args{nullptr,
&kv_cache,
(int *)block_table,
(int *)cache_seqlens,
batch_size,
max_block_num};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
class LoadKVCacheBindings {
public:
struct Args {
CPUInfer *cpuinfer;
KVCache *kv_cache;
std::string tensor_file_path;
};
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(&KVCache::load_kvcache, args_->kv_cache,
args_->tensor_file_path);
}
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(KVCache &kv_cache, std::string tensor_file_path) {
Args *args =
new Args{nullptr, &kv_cache, (std::string)tensor_file_path};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
class DumpKVCacheBindings {
public:
struct Args {
CPUInfer *cpuinfer;
KVCache *kv_cache;
int *block_table;
int cache_total_len;
std::string tensor_file_path;
};
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(&KVCache::dump_kvcache, args_->kv_cache,
args_->block_table, args_->cache_total_len,
args_->tensor_file_path);
}
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(KVCache &kv_cache, intptr_t block_table,
int cache_total_len, std::string tensor_file_path) {
Args *args =
new Args{nullptr, &kv_cache, (int *)block_table,
cache_total_len, (std::string)tensor_file_path};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
};
class LinearBindings {
public:
class WarmUpBindinds {
public:
struct Args {
CPUInfer *cpuinfer;
Linear *linear;
};
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(&Linear::warm_up, args_->linear);
}
static std::pair<intptr_t, intptr_t> cpuinfer_interface(Linear& linear) {
Args* args = new Args{nullptr, &linear};
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(Linear &linear) {
Args *args = new Args{nullptr, &linear};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
class ForwardBindings {
public:
public:
struct Args {
CPUInfer* cpuinfer;
Linear* linear;
CPUInfer *cpuinfer;
Linear *linear;
int qlen;
const void* input;
void* output;
const void *input;
void *output;
};
static void inner(void* args) {
Args* args_ = (Args*)args;
args_->cpuinfer->enqueue(&Linear::forward, args_->linear, args_->qlen, args_->input, args_->output);
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(&Linear::forward, args_->linear,
args_->qlen, args_->input, args_->output);
}
static std::pair<intptr_t, intptr_t> cpuinfer_interface(Linear& linear, int qlen, intptr_t input, intptr_t output) {
Args* args = new Args{nullptr, &linear, qlen, (const void*)input, (void*)output};
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(Linear &linear, int qlen, intptr_t input,
intptr_t output) {
Args *args = new Args{nullptr, &linear, qlen, (const void *)input,
(void *)output};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
};
class MLPBindings {
public:
public:
class WarmUpBindinds {
public:
public:
struct Args {
CPUInfer* cpuinfer;
MLP* mlp;
CPUInfer *cpuinfer;
MLP *mlp;
};
static void inner(void* args) {
Args* args_ = (Args*)args;
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(&MLP::warm_up, args_->mlp);
}
static std::pair<intptr_t, intptr_t> cpuinfer_interface(MLP& mlp) {
Args* args = new Args{nullptr, &mlp};
static std::pair<intptr_t, intptr_t> cpuinfer_interface(MLP &mlp) {
Args *args = new Args{nullptr, &mlp};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
class ForwardBindings {
public:
public:
struct Args {
CPUInfer* cpuinfer;
MLP* mlp;
CPUInfer *cpuinfer;
MLP *mlp;
int qlen;
const void* input;
void* output;
const void *input;
void *output;
};
static void inner(void* args) {
Args* args_ = (Args*)args;
args_->cpuinfer->enqueue(&MLP::forward, args_->mlp, args_->qlen, args_->input, args_->output);
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(&MLP::forward, args_->mlp, args_->qlen,
args_->input, args_->output);
}
static std::pair<intptr_t, intptr_t> cpuinfer_interface(MLP& mlp, int qlen, intptr_t input, intptr_t output) {
Args* args = new Args{nullptr, &mlp, qlen, (const void*)input, (void*)output};
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(MLP &mlp, int qlen, intptr_t input,
intptr_t output) {
Args *args = new Args{nullptr, &mlp, qlen, (const void *)input,
(void *)output};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
};
class MOEBindings {
public:
public:
class WarmUpBindinds {
public:
public:
struct Args {
CPUInfer* cpuinfer;
MOE* moe;
CPUInfer *cpuinfer;
MOE *moe;
};
static void inner(void* args) {
Args* args_ = (Args*)args;
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(&MOE::warm_up, args_->moe);
}
static std::pair<intptr_t, intptr_t> cpuinfer_interface(MOE& moe) {
Args* args = new Args{nullptr, &moe};
static std::pair<intptr_t, intptr_t> cpuinfer_interface(MOE &moe) {
Args *args = new Args{nullptr, &moe};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
class ForwardBindings {
public:
public:
struct Args {
CPUInfer* cpuinfer;
MOE* moe;
CPUInfer *cpuinfer;
MOE *moe;
int qlen;
int k;
const uint64_t* expert_ids;
const float* weights;
const void* input;
void* output;
const uint64_t *expert_ids;
const float *weights;
const void *input;
void *output;
};
static void inner(void* args) {
Args* args_ = (Args*)args;
args_->cpuinfer->enqueue(&MOE::forward, args_->moe, args_->qlen, args_->k, args_->expert_ids, args_->weights, args_->input, args_->output);
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(
&MOE::forward, args_->moe, args_->qlen, args_->k,
args_->expert_ids, args_->weights, args_->input, args_->output);
}
static std::pair<intptr_t, intptr_t> cpuinfer_interface(MOE& moe, int qlen, int k, intptr_t expert_ids, intptr_t weights, intptr_t input, intptr_t output) {
Args* args = new Args{nullptr, &moe, qlen, k, (const uint64_t*)expert_ids, (const float*)weights, (const void*)input, (void*)output};
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(MOE &moe, int qlen, int k, intptr_t expert_ids,
intptr_t weights, intptr_t input, intptr_t output) {
Args *args = new Args{nullptr,
&moe,
qlen,
k,
(const uint64_t *)expert_ids,
(const float *)weights,
(const void *)input,
(void *)output};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
@ -149,8 +569,12 @@ PYBIND11_MODULE(cpuinfer_ext, m) {
auto linear_module = m.def_submodule("linear");
py::class_<LinearConfig>(linear_module, "LinearConfig")
.def(py::init([](int hidden_size, int intermediate_size, int stride, int group_max_len, intptr_t proj, int proj_type, int hidden_type) {
return LinearConfig(hidden_size, intermediate_size, stride, group_max_len, (void*)proj, (ggml_type)proj_type, (ggml_type)hidden_type);
.def(py::init([](int hidden_size, int intermediate_size, int stride,
int group_max_len, intptr_t proj, int proj_type,
int hidden_type) {
return LinearConfig(hidden_size, intermediate_size, stride,
group_max_len, (void *)proj,
(ggml_type)proj_type, (ggml_type)hidden_type);
}));
py::class_<Linear>(linear_module, "Linear")
.def(py::init<LinearConfig>())
@ -159,8 +583,15 @@ PYBIND11_MODULE(cpuinfer_ext, m) {
auto mlp_module = m.def_submodule("mlp");
py::class_<MLPConfig>(mlp_module, "MLPConfig")
.def(py::init([](int hidden_size, int intermediate_size, int stride, int group_max_len, intptr_t gate_proj, intptr_t up_proj, intptr_t down_proj, int gate_type, int up_type, int down_type, int hidden_type) {
return MLPConfig(hidden_size, intermediate_size, stride, group_max_len, (void*)gate_proj, (void*)up_proj, (void*)down_proj, (ggml_type)gate_type, (ggml_type)up_type, (ggml_type)down_type, (ggml_type)hidden_type);
.def(py::init([](int hidden_size, int intermediate_size, int stride,
int group_max_len, intptr_t gate_proj,
intptr_t up_proj, intptr_t down_proj, int gate_type,
int up_type, int down_type, int hidden_type) {
return MLPConfig(hidden_size, intermediate_size, stride,
group_max_len, (void *)gate_proj, (void *)up_proj,
(void *)down_proj, (ggml_type)gate_type,
(ggml_type)up_type, (ggml_type)down_type,
(ggml_type)hidden_type);
}));
py::class_<MLP>(mlp_module, "MLP")
.def(py::init<MLPConfig>())
@ -169,11 +600,84 @@ PYBIND11_MODULE(cpuinfer_ext, m) {
auto moe_module = m.def_submodule("moe");
py::class_<MOEConfig>(moe_module, "MOEConfig")
.def(py::init([](int expert_num, int routed_expert_num, int hidden_size, int intermediate_size, int stride, int group_min_len, int group_max_len, intptr_t gate_proj, intptr_t up_proj, intptr_t down_proj, int gate_type, int up_type, int down_type, int hidden_type) {
return MOEConfig(expert_num, routed_expert_num, hidden_size, intermediate_size, stride, group_min_len, group_max_len, (void*)gate_proj, (void*)up_proj, (void*)down_proj, (ggml_type)gate_type, (ggml_type)up_type, (ggml_type)down_type, (ggml_type)hidden_type);
.def(py::init([](int expert_num, int routed_expert_num, int hidden_size,
int intermediate_size, int stride, int group_min_len,
int group_max_len, intptr_t gate_proj,
intptr_t up_proj, intptr_t down_proj, int gate_type,
int up_type, int down_type, int hidden_type) {
return MOEConfig(expert_num, routed_expert_num, hidden_size,
intermediate_size, stride, group_min_len,
group_max_len, (void *)gate_proj, (void *)up_proj,
(void *)down_proj, (ggml_type)gate_type,
(ggml_type)up_type, (ggml_type)down_type,
(ggml_type)hidden_type);
}));
py::class_<MOE>(moe_module, "MOE")
.def(py::init<MOEConfig>())
.def("warm_up", &MOEBindings::WarmUpBindinds::cpuinfer_interface)
.def("forward", &MOEBindings::ForwardBindings::cpuinfer_interface);
auto kvcache_module = m.def_submodule("kvcache");
py::enum_<AnchorType>(kvcache_module, "AnchorType")
.value("FIXED", AnchorType::FIXED_ANCHOR)
.value("DYNAMIC", AnchorType::DYNAMIC)
.value("QUEST", AnchorType::QUEST)
.value("BLOCK_MAX", AnchorType::BLOCK_MAX)
.value("BLOCK_MEAN", AnchorType::BLOCK_MEAN);
py::enum_<ggml_type>(kvcache_module, "ggml_type")
.value("FP16", ggml_type::GGML_TYPE_F16)
.value("FP32", ggml_type::GGML_TYPE_F32)
.value("Q4_0", ggml_type::GGML_TYPE_Q4_0)
.value("Q8_0", ggml_type::GGML_TYPE_Q8_0);
py::enum_<RetrievalType>(kvcache_module, "RetrievalType")
.value("LAYER", RetrievalType::LAYER)
.value("KVHEAD", RetrievalType::KVHEAD)
.value("QHEAD", RetrievalType::QHEAD);
py::class_<KVCacheConfig>(kvcache_module, "KVCacheConfig")
.def(py::init<int, int, int, int, int, int, AnchorType, ggml_type,
RetrievalType, int, int, int, int, int, int>())
.def_readwrite("layer_num", &KVCacheConfig::layer_num)
.def_readwrite("kv_head_num", &KVCacheConfig::kv_head_num)
.def_readwrite("q_head_num", &KVCacheConfig::q_head_num)
.def_readwrite("head_dim", &KVCacheConfig::head_dim)
.def_readwrite("block_len", &KVCacheConfig::block_len)
.def_readwrite("anchor_num", &KVCacheConfig::anchor_num)
.def_readwrite("anchor_type", &KVCacheConfig::anchor_type)
.def_readwrite("kv_type", &KVCacheConfig::kv_type)
.def_readwrite("retrieval_type", &KVCacheConfig::retrieval_type)
.def_readwrite("layer_step", &KVCacheConfig::layer_step)
.def_readwrite("token_step", &KVCacheConfig::token_step)
.def_readwrite("layer_offset", &KVCacheConfig::layer_offset)
.def_readwrite("max_block_num", &KVCacheConfig::max_block_num)
.def_readwrite("max_batch_size", &KVCacheConfig::max_batch_size)
.def_readwrite("max_thread_num", &KVCacheConfig::max_thread_num);
py::class_<KVCache>(kvcache_module, "KVCache")
.def(py::init<KVCacheConfig>())
.def("get_cache_total_len", &KVCache::get_cache_total_len)
.def("update_cache_total_len",
[](KVCache &kvcache, int cache_total_len) {
kvcache.update_cache_total_len(cache_total_len);
})
.def("attn", &KVCacheBindings::AttnBindings::cpuinfer_interface)
.def(
"get_all_kvcache_one_layer",
&KVCacheBindings::GetAllKVCacheOneLayerBindings::cpuinfer_interface)
.def("get_and_update_kvcache_fp16",
&KVCacheBindings::GetAndUpdateKVCacheFp16Bindings::
cpuinfer_interface)
.def("get_kvcache_fp16",
&KVCacheBindings::GetKVCacheFp16Bindings::cpuinfer_interface)
.def("update_kvcache_fp16",
&KVCacheBindings::UpdateKVCacheFp16Bindings::cpuinfer_interface)
.def("update_importance",
&KVCacheBindings::UpdateImportanceBindings::cpuinfer_interface)
.def("attn_with_kvcache",
&KVCacheBindings::AttnWithKVCacheBindings::cpuinfer_interface)
.def("clear_importance_all_layers",
&KVCacheBindings::ClearImportanceAllLayersBindings::
cpuinfer_interface)
.def("calc_anchor_all_layers",
&KVCacheBindings::CalcAnchorAllLayersBindinds::cpuinfer_interface);
}
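// Note on the binding pattern above (editorial): each cpuinfer_interface call
// heap-allocates an Args struct and returns a (function pointer, Args*) pair as two
// intptr_t values; on the Python side these pairs are what CPUInfer.submit()
// enqueues and CPUInfer.sync() waits on (see the bundled kvcache test scripts).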

View file

@ -0,0 +1,727 @@
/**
* @Description :
* @Author : Jianwei Dong
* @Date : 2024-08-26 22:47:06
* @Version : 1.0.0
* @LastEditors : Jianwei Dong
* @LastEditTime : 2024-08-26 22:47:06
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#ifndef CPUINFER_OPERATOR_KVCACHE_H
#define CPUINFER_OPERATOR_KVCACHE_H
#include <algorithm>
#include <atomic>
#include <cassert>
#include <condition_variable>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <functional>
#include <future>
#include <iostream>
#include <memory>
#include <mutex>
#include <queue>
#include <random>
#include <stdexcept>
#include <thread>
#include <vector>
#include "../../cpu_backend/backend.h"
#include "llama.cpp/ggml-common.h"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"
#define CHUNK_SIZE 32
/**
* @brief Converts a ggml_type enum value to its corresponding string
* representation.
*
* This function provides a human-readable string representation for a given
* ggml_type enum value. The string can be used for logging, debugging, or
* displaying information in a user interface.
*
* @param type The ggml_type enum value to convert.
* @return A string representation of the enum value.
*/
std::string ggml_type_to_string(ggml_type type);
/**
* @enum AnchorType
* @brief Defines the types of anchors used in attention mechanisms.
*
* This enum specifies different types of anchors that can be used in attention
* mechanisms, such as fixed anchors, dynamic anchors, or special anchors like
* QUEST, BLOCK_MEAN, or BLOCK_MAX.
*/
enum AnchorType {
FIXED_ANCHOR, /**< A fixed anchor that does not change. */
DYNAMIC, /**< A dynamic anchor that can change over time. */
QUEST, /**< A special anchor type used for QUEST (Query and Embedding Space
Transformation). */
BLOCK_MEAN, /**< An anchor based on the mean of a block of data. */
BLOCK_MAX /**< An anchor based on the maximum value within a block of data.
*/
};
/**
* @brief Converts an AnchorType enum value to its corresponding string
* representation.
*
* This function provides a human-readable string representation for a given
* AnchorType enum value. The string can be used for logging, debugging, or
* displaying information in a user interface.
*
* @param anchor_type The AnchorType enum value to convert.
* @return A string representation of the enum value.
*/
std::string AnchorTypeToString(AnchorType anchor_type);
/**
* @enum RetrievalType
* @brief Defines the types of retrieval strategies in attention mechanisms.
*
* This enum specifies different retrieval strategies that can be used in
* attention mechanisms, such as layer-level retrieval, key-value head-level
* retrieval, or query head-level retrieval.
*/
enum RetrievalType {
LAYER, /**< Retrieval at the layer level. */
KVHEAD, /**< Retrieval at the key-value head level. */
QHEAD /**< Retrieval at the query head level. */
};
/**
* @brief Converts a RetrievalType enum value to its corresponding string
* representation.
*
* This function provides a human-readable string representation for a given
* RetrievalType enum value. The string can be used for logging, debugging, or
* displaying information in a user interface.
*
* @param retrieval_type The RetrievalType enum value to convert.
* @return A string representation of the enum value.
*/
std::string RetrievalTypeToString(RetrievalType retrieval_type);
/**
* @struct KVCacheConfig
* @brief Configuration structure for Key-Value (KV) Cache.
*
* This structure holds configuration parameters for setting up and managing
* a Key-Value (KV) Cache used in various attention mechanisms. It includes
* parameters such as the number of layers, the number of heads, the dimension
* of each head, block length, anchor information, and memory-related settings.
*/
struct KVCacheConfig {
int layer_num; /**< Number of layers in the model. */
int kv_head_num; /**< Number of heads in the KV Cache. */
int q_head_num; /**< Number of heads in the query. */
int head_dim; /**< Dimension of each head. */
int block_len; /**< Length of each block in the cache. */
int anchor_num; /**< Number of anchors used in attention. */
ggml_type kv_type; /**< Data type of the KV Cache (e.g., fp16, q8_0). */
// Controls the pre-allocated memory size
int max_block_num; /**< Maximum number of blocks that can be allocated. */
int max_batch_size; /**< Maximum batch size that can be processed. */
int max_thread_num; /**< Maximum number of threads that can be used. */
AnchorType
anchor_type; /**< Type of anchors used in the attention mechanism. */
RetrievalType
retrieval_type; /**< Type of retrieval strategy used in the cache. */
int layer_step; /**< Step size between layers. */
int token_step; /**< Step size between tokens. */
int layer_offset; /**< Offset value for layers. */
/**
* @brief Default constructor for KVCacheConfig.
*
* The constructor is defaulted; member variables are not explicitly
* initialized and must be set before the configuration is used.
*/
KVCacheConfig() = default;
/**
* @brief Parameterized constructor for KVCacheConfig.
*
* This constructor initializes the configuration with specific values
* for all member variables.
*
* @param layer_num The number of layers in the model.
* @param kv_head_num The number of heads in the KV Cache.
* @param q_head_num The number of heads in the query.
* @param head_dim The dimension of each head.
* @param block_len The length of each block in the cache.
* @param anchor_num The number of anchors used in attention.
* @param anchor_type The type of anchors used in the attention mechanism.
* @param kv_type The data type of the KV Cache (e.g., fp16, q8_0).
* @param retrieval_type The type of retrieval strategy used in the cache.
* @param layer_step The step size between layers.
* @param token_step The step size between tokens.
* @param layer_offset The offset value for layers.
* @param max_block_num The maximum number of blocks that can be allocated.
* @param max_batch_size The maximum batch size that can be processed.
* @param max_thread_num The maximum number of threads that can be used.
*/
KVCacheConfig(int layer_num, int kv_head_num, int q_head_num, int head_dim,
int block_len, int anchor_num, AnchorType anchor_type,
ggml_type kv_type, RetrievalType retrieval_type,
int layer_step, int token_step, int layer_offset,
int max_block_num, int max_batch_size, int max_thread_num);
};
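// Example (illustrative; values mirror the bundled benchmark scripts):
//   KVCacheConfig cfg(/*layer_num=*/10, /*kv_head_num=*/8, /*q_head_num=*/32,
//                     /*head_dim=*/128, /*block_len=*/128, /*anchor_num=*/1,
//                     AnchorType::DYNAMIC, GGML_TYPE_F16, RetrievalType::LAYER,
//                     /*layer_step=*/1, /*token_step=*/1, /*layer_offset=*/0,
//                     /*max_block_num=*/1024, /*max_batch_size=*/1,
//                     /*max_thread_num=*/64);
//   KVCache cache(cfg);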
/**
* @class KVCache
* @brief Manages the Key-Value (KV) Cache used in attention mechanisms.
*
* The KVCache class provides functionality for managing the Key-Value Cache,
* including resizing the cache, retrieving configuration parameters, and
* updating internal states. This class is typically used in transformer models
* to store and manage past key and value states for efficient attention
* computations.
*/
class KVCache {
public:
/**
* @brief Constructs a KVCache object with the given configuration.
*
* Initializes the KVCache with the specified configuration parameters,
* such as the number of layers, heads, head dimensions, and other
* relevant settings.
*
* @param config The configuration object containing initialization
* parameters.
*/
KVCache(KVCacheConfig config);
/**
* @brief Resizes the number of threads used by the cache.
*
* This function adjusts the number of threads that the cache can utilize.
* It allows dynamic reconfiguration of the parallel processing capabilities
* based on the current workload or system resources.
*
* @param thread_num The new number of threads to use.
*/
void ThreadResize(int thread_num);
/**
* @brief Resizes the batch size managed by the cache.
*
* This function adjusts the batch size that the cache can handle. It
* is useful when the input batch size changes dynamically, allowing
* the cache to be reconfigured accordingly.
*
* @param batch_size The new batch size.
*/
void BatchResize(int batch_size);
/**
* @brief Resizes the number of blocks managed by the cache.
*
* This function adjusts the number of blocks that the cache can manage.
* It allows dynamic reconfiguration of the block structure based on the
* current sequence length or other factors.
*
* @param block_num The new number of blocks.
*/
void BlockResize(int block_num);
/**
* @brief Gets the number of layers in the cache.
*
* @return The number of layers configured in the cache.
*/
int get_layer_num() { return config_.layer_num; }
/**
* @brief Gets the number of KV heads in the cache.
*
* @return The number of KV heads configured in the cache.
*/
int get_kv_head_num() { return config_.kv_head_num; }
/**
* @brief Gets the number of query heads in the cache.
*
* @return The number of query heads configured in the cache.
*/
int get_q_head_num() { return config_.q_head_num; }
/**
* @brief Gets the dimension of each head in the cache.
*
* @return The dimension of each head.
*/
int get_head_dim() { return config_.head_dim; }
/**
* @brief Gets the length of each block in the cache.
*
* @return The length of each block.
*/
int get_block_len() { return config_.block_len; }
/**
* @brief Gets the number of blocks for a specific layer.
*
* @param layer_id The ID of the layer for which to retrieve the block
* number.
* @return The number of blocks in the specified layer.
*/
int get_block_num(int layer_id) { return past_block_num_[layer_id]; }
/**
* @brief Gets the number of anchors in the cache.
*
* @return The number of anchors configured in the cache.
*/
int get_anchor_num() { return config_.anchor_num; }
/**
* @brief Gets the total length of the cache.
*
* @return The total length of the cache.
*/
int get_cache_total_len() { return cache_total_len_; }
/**
* @brief Gets the total number of blocks in the cache.
*
* This function computes and returns the total number of blocks in the
* cache based on the total cache length and the block length configuration.
*
* @return The total number of blocks in the cache.
*/
int get_cache_total_block_num() {
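// Ceiling division: a partially filled trailing block still counts as one block.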
return (cache_total_len_ + config_.block_len - 1) / config_.block_len;
}
/**
* @brief Updates the total length of the cache.
*
* This function sets a new total length for the cache, allowing dynamic
* adjustment of the cache size during runtime.
*
* @param cache_total_len The new total length of the cache.
*/
void update_cache_total_len(int cache_total_len) {
cache_total_len_ = cache_total_len;
}
void attn(const ggml_fp16_t *q_in, ggml_fp16_t *output, float *attn_lse,
int layer_idx, int generate_token_idx, int q_len, int batch_size,
int max_block_num, int *block_table, int *cache_seqlens,
int pick_block_num, int init_block_num, int local_block_num,
Backend *backend);
void update_kvcache_one_block_fp16(const ggml_fp16_t *k_in,
const ggml_fp16_t *v_in, int layer_id,
int block_idx, Backend *backend);
void get_kvcache_one_block_fp16(ggml_fp16_t *k_in, ggml_fp16_t *v_in,
int layer_id, int block_idx,
Backend *backend);
void update_importance_one_block(const ggml_fp16_t *importance,
int layer_id, int block_idx,
Backend *backend);
void get_importance_one_block(ggml_fp16_t *importance, int layer_id,
int block_idx, Backend *backend);
void get_anchor_one_block(ggml_fp16_t *anchor, int layer_id, int block_idx,
Backend *backend);
void update_anchor_one_block(const ggml_fp16_t *anchor, int layer_id,
int block_idx, Backend *backend);
void calc_anchor_all_layers(int *block_table, int *cache_seqlens,
int batch_size, int max_block_num,
Backend *backend);
void load_kvcache(std::string tensor_file_path, Backend *backend);
void dump_kvcache(int *block_table, int cache_total_len,
std::string tensor_file_path, Backend *backend);
void get_and_update_kvcache_fp16(ggml_fp16_t *k_in, ggml_fp16_t *v_in,
int layer_id, int *block_table,
int batch_size, int max_block_num,
int *cache_seqlens, int q_len,
Backend *backend);
void get_kvcache_fp16(ggml_fp16_t *k_in, ggml_fp16_t *v_in, int layer_id,
int *block_table, int batch_size, int max_block_num,
int *cache_seqlens, Backend *backend);
void update_kvcache_fp16(const ggml_fp16_t *k_in, const ggml_fp16_t *v_in,
int layer_id, int *block_table, int batch_size,
int max_block_num, int *cache_seqlens, int q_len,
Backend *backend);
void update_importance(const ggml_fp16_t *importance, int layer_id,
int *block_table, int batch_size, int max_block_num,
int *offset, int width, Backend *backend);
void attn_with_kvcache(const ggml_fp16_t *q_in, const ggml_fp16_t *k_in,
const ggml_fp16_t *v_in, ggml_fp16_t *output,
float *attn_lse, int layer_idx,
int generate_token_idx, int q_len, int batch_size,
int max_block_num, int *block_table,
int *cache_seqlens, int topk, int local,
Backend *backend);
void clear_importance_all_layers(int *block_table, int *cache_seqlens,
int batch_size, int max_block_num,
Backend *backend);
void clear_kvcache_all_layers(int *block_table, int *cache_seqlens,
int batch_size, int max_block_num,
Backend *backend);
void get_sincos(ggml_fp16_t *sin, ggml_fp16_t *cos, int seqlen);
void get_attn_sparsity(const ggml_fp16_t *q_in, float *attn_sparsity,
int layer_idx, int generate_token_idx, int q_len,
int batch_size, int max_block_num, int *block_table,
int *cache_seqlens, int *block_table_origin,
int *cache_seqlens_origin, int max_block_num_origin,
int topk, int local, Backend *backend);
void get_all_kvcache_one_layer(int layer_id, ggml_fp16_t *k_in,
ggml_fp16_t *v_in, Backend *backend);
private:
// Persistent data
KVCacheConfig config_;
int n_gqa_; // q_head_num / kv_head_num
int cache_total_len_; // Number of tokens in cache
std::vector<uint64_t> past_block_num_; // [layer_num]
std::vector<std::vector<std::vector<std::vector<block_q4_0>>>>
k_cache_q4; // [layer_num, kv_head_num, past_block_num, block_len *
// (head_dim / QK_4)]
std::vector<std::vector<std::vector<std::vector<block_q4_0>>>>
v_cache_q4; // [layer_num, kv_head_num, past_block_num, head_dim *
// (block_len / QK_4)]
std::vector<std::vector<std::vector<std::vector<block_q8_0>>>>
k_cache_q8; // [layer_num, kv_head_num, past_block_num, block_len *
// (head_dim / QK_8)]
std::vector<std::vector<std::vector<std::vector<block_q8_0>>>>
v_cache_q8; // [layer_num, kv_head_num, past_block_num, head_dim *
// (block_len / QK_8)]
std::vector<std::vector<std::vector<std::vector<ggml_fp16_t>>>>
k_cache_fp16_; // [layer_num, kv_head_num, past_block_num, block_len *
// head_dim]
std::vector<std::vector<std::vector<std::vector<ggml_fp16_t>>>>
v_cache_fp16_; // [layer_num, kv_head_num, past_block_num, head_dim *
// block_len]
std::vector<std::vector<std::vector<std::vector<ggml_fp16_t>>>>
importance_; // [layer_num, past_block_num, block_len,
// attention_head_num]
std::vector<ggml_fp16_t>
anchor_; // [layer_num * past_block_num * anchor_num *
// attention_head_num * head_dim]
// Runtime data
int64_t layer_id_;
int64_t block_idx_;
int *block_table_;
uint64_t block_num_;
int max_block_num_after_retrieval_;
// Rotary positional embeddings
std::vector<std::vector<ggml_fp16_t>> sin_; // [seq_len, head_dim]
std::vector<std::vector<ggml_fp16_t>> cos_; // [seq_len, head_dim]
// update/get
int seq_len_;
uint16_t *k_scales_; // q4_0
uint8_t *k_in_; // q4_0
uint16_t *v_scales_; // q4_0
uint8_t *v_in_; // q4_0
uint16_t *k_data_; // fp16
uint16_t *v_data_; // fp16
uint16_t *importance_data_; // fp16
uint16_t *anchor_data_; // fp16
// sparsity = (sigma(block lse / lse))
std::vector<std::vector<std::vector<float>>>
block_lse_; // [batch_size, max_block_num, q_head_num]
std::vector<std::vector<float>> attn_sparsity_; // [batch_size, q_head_num]
// attn
std::vector<std::vector<float>>
avg_q; // [batch_size, q_head_num * head_dim]
std::vector<std::vector<ggml_fp16_t>>
avg_q_fp16; // [batch_size, q_head_num * head_dim]
std::vector<
std::priority_queue<std::pair<float, int>,
std::vector<std::pair<float, int>>, std::greater<>>>
top_similar_block_;
std::vector<std::vector<float>> block_similar_;
std::vector<std::vector<std::vector<float>>> block_similar_kv_head_;
std::vector<std::vector<std::vector<float>>> block_similar_q_head_;
std::vector<int> cache_seqlens_; // [batch_size]
std::vector<int> selected_blocks_num_history_; // [layer_num // layer_step]
std::vector<std::vector<std::vector<int>>> selected_blocks_history_;
// [layer_num // layer_step, batch_size, max_block_num]
std::vector<std::vector<std::vector<std::vector<int>>>>
selected_blocks_history_kvhead_; // [layer_num // layer_step,
// batch_size, max_block_num,
// kv_head_num]
std::vector<std::vector<int>>
block_table_before_retrieval_; // [batch_size, max_block_num]
std::vector<std::vector<int>>
block_table_after_retrieval_; // [batch_size, pick_block_num]
std::vector<std::vector<std::vector<int>>>
block_table_before_retrieval_qhead_; // [batch_size, max_block_num,
// q_head_num]
std::vector<std::vector<std::vector<int>>>
block_table_after_retrieval_qhead_; // [batch_size, pick_block_num,
// q_head_num]
std::vector<std::vector<std::vector<int>>>
block_table_before_retrieval_kvhead_; // [batch_size, max_block_num,
// kv_head_num]
std::vector<std::vector<std::vector<int>>>
block_table_after_retrieval_kvhead_; // [batch_size, pick_block_num,
// kv_head_num]
std::vector<std::vector<std::unique_ptr<std::mutex>>>
mutex_; // [batch_size, kv_head_num]
std::vector<std::vector<std::vector<block_q8_0>>>
q_q8_0_; // [batch_size, kv_head_num, n_gqa * head_dim / QK8_0]
std::vector<std::vector<std::vector<float>>>
q_fp32_; // [batch_size, kv_head_num, n_gqa * head_dim]
std::vector<std::vector<std::vector<float>>>
output_fp32_; // [batch_size, kv_head_num, n_gqa * head_dim]
std::vector<std::vector<std::vector<float>>>
attn_lse_; // [batch_size, kv_head_num, n_gqa]
std::vector<std::pair<int, int>> thread_cur_head_idx_; // [thread_num]
std::vector<std::vector<block_q8_0>>
thread_local_output_q8_0_; // [thread_num, n_gqa * head_dim / QK8_0]
std::vector<std::vector<float>>
thread_local_attn_score_; // [thread_num, n_gqa * block_len]
std::vector<std::vector<float>>
thread_local_output_fp32_; // [thread_num, n_gqa * head_dim]
std::vector<std::vector<float>>
thread_local_attn_lse_; // [thread_num, n_gqa]
std::vector<std::vector<float>>
thread_local_cur_output_fp32_; // [thread_num, n_gqa * head_dim]
std::vector<std::vector<float>>
thread_local_cur_attn_lse_; // [thread_num, n_gqa]
std::vector<std::vector<uint8_t>>
thread_local_attn_mask_; // [thread_num, block_len // 8]
std::vector<std::vector<char>>
thread_local_draft_; // [thread_num, 2 * n_gqa * block_len + 6 * n_gqa *
// head_dim + 2 * block_len * head_dim]
// tmp space
std::vector<float> q_fp32; // [n_gqa * head_dim]
void quantize_q_(const uint16_t *q_in_data, int batch_size);
void attn_initialize_layer_(int batch_size, int layer_idx, int *block_table,
int &max_block_num, int *cache_seqlens);
void attn_initialize_kvhead_(int batch_size, int layer_idx,
int *block_table, int &max_block_num,
int *cache_seqlens);
void retrieval_kvcache_layer_(const uint16_t *q_in_data, int init_block_num,
int local_block_num, int pick_block_num,
int q_len, int generate_token_idx,
int batch_size, int layer_idx,
int *cache_seqlens, int &max_block_num,
Backend *backend);
void retrieval_kvcache_kvhead_(const uint16_t *q_in_data,
int init_block_num, int local_block_num,
int pick_block_num, int q_len,
int generate_token_idx, int batch_size,
int layer_idx, int *cache_seqlens,
int &max_block_num, Backend *backend);
void calculate_block_similarity_layer_(
const uint16_t *q_in_data, int batch_size, int layer_idx, int q_len,
int max_block_num, int *cache_seqlens, int init_block_num,
int local_block_num, int pick_block_num, Backend *backend);
void calculate_block_similarity_kvhead_(
const uint16_t *q_in_data, int batch_size, int layer_idx, int q_len,
int max_block_num, int *cache_seqlens, int init_block_num,
int local_block_num, int pick_block_num, Backend *backend);
void select_block_layer_(int batch_size, int layer_idx, int max_block_num,
int init_block_num, int local_block_num,
int pick_block_num);
void select_block_kvhead_(int batch_size, int layer_idx, int max_block_num,
int init_block_num, int local_block_num,
int pick_block_num);
void calculate_sparsity_layer_(const uint16_t *q_in_data,
float *attn_sparsity, int batch_size,
int max_block_num, int *block_table,
int *cache_seqlens, Backend *backend);
void calculate_sparsity_kvhead_(const uint16_t *q_in_data,
float *attn_sparsity, int batch_size,
int max_block_num, int *block_table,
int *cache_seqlens, Backend *backend);
void attention_kvhead_(const uint16_t *q_in_data, ggml_fp16_t *output,
float *attn_lse, int batch_size, Backend *backend);
void attention_layer_(const uint16_t *q_in_data, ggml_fp16_t *output,
float *attn_lse, int batch_size, Backend *backend);
/**
* @brief Computes attention with KV cache for one block.
*
* This function performs attention computation for one block using KV
* cache. The function supports different data types for Q, K, and V caches,
* and provides options for quantization. The function does not perform any
* dynamic memory allocation internally, so all necessary buffers must be
* pre-allocated externally.
*
* @param head_dim The dimension of the head.
* @param bsz The batch size.
* @param q_type The data type of Q (GGML data type). Only supports fp16 and
* q8_0.
* @param q Pointer to the Q tensor [bsz, head_dim]. The quantization is
* always applied along the head_dim dimension. The size must be
* bsz * head_dim/32 * qtype_size. If head_dim % 32 != 0, an error
* will be raised.
* @param past_kv_len The length of the past KV cache.
* @param past_kv_offset The offset in the past KV cache.
* @param is_full_attn Boolean flag indicating whether to use full attention
* (true for full 1 mask).
* @param attn_mask Pointer to the attention mask [bsz, past_kv_len]. If
* is_full_attn = false, a bit matrix is passed to
* represent the mask.
* @param k_type The data type of K cache (GGML data type). Only supports
* fp16, q4_0, and q8_0.
* @param k_quant_type Quantization type for K cache. 0 for per_token, 1 for
* per_channel. Other values will raise an error.
* @param k_cache Pointer to the K cache tensor [seq_len, head_dim]. If
* quant_type == 0, head_dim % 32 must be 0. If quant_type ==
* 1, seq_len % 32 must be 0.
* @param num_k_anchor The number of K anchors. If num_k_anchor == 0, it
* means no anchor is present.
* @param k_cache_anchors Pointer to the K cache anchors [num_k_anchor,
* head_dim]. The k_anchor_type must be fp16.
* @param k_cache_anchor_pos Pointer to the K cache anchor positions. Each
* token is associated with the nearest previous anchor position.
* @param v_type The data type of V cache (GGML data type).
* @param v_quant_type Quantization type for V cache.
* @param v_cache Pointer to the V cache tensor [head_dim, seq_len].
* @param num_v_anchor The number of V anchors.
* @param v_cache_anchors Pointer to the V cache anchors.
* @param v_cache_anchor_pos Pointer to the V cache anchor positions.
* @param attn_score Pre-allocated buffer for attention scores [bsz,
* past_kv_len].
* @param output Output tensor [bsz, head_dim] with the same type as q_type.
* @param lse Pre-allocated buffer [bsz] for the log-sum-exp of the
* attention scores.
* @param draft Pre-allocated temporary buffer. The buffer size should be
* enough to hold (2 * bsz * past_kv_len + 6 * bsz * head_dim + 2 *
* past_kv_len * head_dim + past_kv_len * head_dim / 32) bytes.
* @param rotary_angle Pointer to the rotary angle tensor.
* @param rotary_cos Pointer to the cosine values for rotary embedding.
* @param rotary_sin Pointer to the sine values for rotary embedding.
*/
void attn_with_kvcache_one_block_(
int head_dim, int bsz,
ggml_type q_type, // GGML data type of `Q`, only supports fp16 and q8_0
// [bsz, head_dim]
// Quantization is always on the head_dim dimension (per_token). If
// head_dim % 32 != 0, an error will be raised. The size must be bsz *
// head_dim/32 * qtype_size.
const void *q,
int past_kv_len, int past_kv_offset,
bool is_full_attn, // true indicates an all-ones mask
// If is_full_attn = false, a bit matrix representing the mask is
// passed. [bsz, past_kv_len]
const uint8_t *attn_mask,
ggml_type k_type, // GGML data type of `K Cache`, only supports fp16,
// q4_0, q8_0
int k_quant_type, // 0 for per_token, 1 for per_channel, others raise an
// error
// [seq_len, head_dim]
// If quant_type == 0, head_dim % 32 must be 0.
// If quant_type == 1, seq_len % 32 must be 0.
const void *k_cache,
// k_anchor_type must be fp16
int num_k_anchor, // num_k_anchor == 0 indicates no anchor
// [num_k_anchor, head_dim]
const void *k_cache_anchors,
// Each token is associated with the nearest previous position's anchor,
// with the same distance.
const int *k_cache_anchor_pos,
// v_cache similar to k_cache
ggml_type v_type, int v_quant_type,
// [head_dim, seq_len]
const void *v_cache, int num_v_anchor, const void *v_cache_anchors,
const int *v_cache_anchor_pos,
// Pre-allocated buffer for intermediate calculations [bsz,
// past_kv_len]. No malloc is performed inside this function.
float *attn_score,
// Output: [bsz, head_dim], with the same type as q_type
void *output,
// [bsz]
float *lse,
// Pre-allocated temporary buffer with sufficient size:
// (2 * bsz * past_kv_len + 6 * bsz * head_dim + 2 * past_kv_len *
// head_dim + past_kv_len * head_dim / 32) bytes.
void *draft,
// Apply rotary embedding online
const int *rotary_angle, const void *rotary_cos, const void *rotary_sin
// rotary_cos=None,
// rotary_sin=None,
// cache_seqlens: Optional[Union[(int, torch.Tensor)]] = None,
// cache_batch_idx: Optional[torch.Tensor] = None,
// rotary_interleaved=True,
// // Not supported for now
// window_size=(-1, -1), # -1 means infinite context window
// alibi_slopes=None,
);
};
/**
* @brief Scales a float32 vector by a given scalar value.
*
* This function multiplies each element of the input vector `y` by a scalar
* `v`. It uses platform-specific optimizations if available, such as Apple's
* Accelerate framework or SIMD instructions. If no specific optimization is
* available, the function falls back to a simple scalar multiplication loop.
*
* @param n The number of elements in the vector `y`.
* @param y The input vector to be scaled. The result will be stored in the same
* vector.
* @param v The scalar value by which to scale the vector.
*/
void ggml_vec_scale_f32(const int n, float *y, const float v);
#endif
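
A quick sizing sketch for the buffers that `attn_with_kvcache_one_block_` expects to be pre-allocated. This helper is illustrative only and not part of the header above; it simply restates the documented constraints:

```python
# Illustrative sizing helper for attn_with_kvcache_one_block_ buffers.
# Mirrors the constraints stated in the Doxygen comment; not part of the API.

def attn_one_block_buffer_sizes(head_dim: int, bsz: int, past_kv_len: int):
    FP32 = 4  # attn_score and lse are float32

    # Q is quantized along head_dim, so head_dim must be a multiple of 32.
    assert head_dim % 32 == 0, "head_dim % 32 != 0 raises an error"

    attn_score_bytes = bsz * past_kv_len * FP32   # [bsz, past_kv_len]
    lse_bytes = bsz * FP32                        # [bsz]

    # Scratch ("draft") buffer, as documented:
    #   2*bsz*past_kv_len + 6*bsz*head_dim
    # + 2*past_kv_len*head_dim + past_kv_len*head_dim/32  bytes
    draft_bytes = (2 * bsz * past_kv_len
                   + 6 * bsz * head_dim
                   + 2 * past_kv_len * head_dim
                   + past_kv_len * head_dim // 32)
    return attn_score_bytes, lse_bytes, draft_bytes
```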

File diff suppressed because it is too large

View file

@@ -0,0 +1,123 @@
/**
* @Description :
* @Author : Jianwei Dong
* @Date : 2024-08-26 22:47:06
* @Version : 1.0.0
* @LastEditors : Jianwei Dong
* @LastEditTime : 2024-08-26 22:47:06
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "kvcache.h"
void KVCache::load_kvcache(std::string tensor_file_path, Backend *backend) {
// Timer start
auto start = std::chrono::high_resolution_clock::now();
std::ifstream ifs_tensor(tensor_file_path, std::ios::binary);
if (!ifs_tensor) {
throw std::runtime_error("Failed to open tensor file");
}
ifs_tensor.read(reinterpret_cast<char *>(&cache_total_len_),
sizeof(cache_total_len_));
int past_block_num =
(cache_total_len_ + config_.block_len - 1) / config_.block_len;
printf("cache_total_len: %d, past_block_num: %d\n", cache_total_len_,
past_block_num);
for (int i = 0; i < config_.layer_num; ++i) {
past_block_num_[i] = past_block_num;
}
ifs_tensor.read(reinterpret_cast<char *>(anchor_.data()),
anchor_.size() * sizeof(ggml_fp16_t));
for (int i = 0; i < config_.layer_num; ++i) {
for (int j = 0; j < config_.kv_head_num; ++j) {
for (int k = 0; k < past_block_num_[i]; ++k) {
if (config_.kv_type == GGML_TYPE_F16) {
ifs_tensor.read(
reinterpret_cast<char *>(k_cache_fp16_[i][j][k].data()),
k_cache_fp16_[i][j][k].size() * sizeof(ggml_fp16_t));
ifs_tensor.read(
reinterpret_cast<char *>(v_cache_fp16_[i][j][k].data()),
v_cache_fp16_[i][j][k].size() * sizeof(ggml_fp16_t));
} else if (config_.kv_type == GGML_TYPE_Q4_0) {
ifs_tensor.read(
reinterpret_cast<char *>(k_cache_q4[i][j][k].data()),
k_cache_q4[i][j][k].size() * sizeof(block_q4_0));
ifs_tensor.read(
reinterpret_cast<char *>(v_cache_q4[i][j][k].data()),
v_cache_q4[i][j][k].size() * sizeof(block_q4_0));
}
}
}
for (int k = 0; k < past_block_num_[i]; ++k) {
for (int l = 0; l < config_.block_len; l++) {
ifs_tensor.read(
reinterpret_cast<char *>(importance_[i][k][l].data()),
importance_[i][k][l].size() * sizeof(ggml_fp16_t));
}
}
}
ifs_tensor.close();
// Timer end
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
printf("time of load: %f s\n", diff.count());
}
void KVCache::dump_kvcache(int *block_table, int cache_total_len,
std::string tensor_file_path, Backend *backend) {
// Timer start
auto start = std::chrono::high_resolution_clock::now();
std::ofstream ofs(tensor_file_path, std::ios::binary);
printf("dump_kvcache: %s\n", tensor_file_path.c_str());
if (!ofs.is_open()) {
std::cerr << "Cannot open file " << tensor_file_path << std::endl;
return;
}
ofs.write(reinterpret_cast<const char *>(&cache_total_len),
sizeof(cache_total_len));
int past_block_num =
(cache_total_len + config_.block_len - 1) / config_.block_len;
printf("cache_total_len: %d, past_block_num: %d\n", cache_total_len,
past_block_num);
ofs.write(reinterpret_cast<const char *>(anchor_.data()),
anchor_.size() * sizeof(ggml_fp16_t));
for (int i = 0; i < config_.layer_num; ++i) {
for (int j = 0; j < config_.kv_head_num; ++j) {
for (int k = 0; k < past_block_num; ++k) {
int block_idx = block_table[k];
if (config_.kv_type == GGML_TYPE_F16) {
ofs.write(reinterpret_cast<const char *>(
k_cache_fp16_[i][j][block_idx].data()),
k_cache_fp16_[i][j][block_idx].size() *
sizeof(ggml_fp16_t));
ofs.write(reinterpret_cast<const char *>(
v_cache_fp16_[i][j][block_idx].data()),
v_cache_fp16_[i][j][block_idx].size() *
sizeof(ggml_fp16_t));
} else if (config_.kv_type == GGML_TYPE_Q4_0) {
ofs.write(reinterpret_cast<const char *>(
k_cache_q4[i][j][block_idx].data()),
k_cache_q4[i][j][block_idx].size() *
sizeof(block_q4_0));
ofs.write(reinterpret_cast<const char *>(
v_cache_q4[i][j][block_idx].data()),
v_cache_q4[i][j][block_idx].size() *
sizeof(block_q4_0));
}
}
}
for (int k = 0; k < past_block_num; ++k) {
int block_idx = block_table[k];
for (int l = 0; l < config_.block_len; l++) {
ofs.write(reinterpret_cast<const char *>(
importance_[i][block_idx][l].data()),
importance_[i][block_idx][l].size() *
sizeof(ggml_fp16_t));
}
}
}
ofs.close();
// Timer end
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
printf("time of dump: %f s\n", diff.count());
}
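
The dump format written above is a flat binary stream in native byte order: `cache_total_len`, the anchor table, then K/V blocks ordered layer → kv head → block (fp16 or q4_0 depending on `kv_type`), and finally per-token importance values ordered layer → block → position. A hedged reader sketch for the header follows; the per-block element counts are an assumption about how the cache vectors are sized, not something this file defines:

```python
# Sketch: read the header of a KVCache dump produced by dump_kvcache above.
# Assumes native little-endian layout (x86) and a 4-byte int, as written by
# the C++ code; the rest of the stream follows the order described above.
import struct

def read_dump_header(path: str, block_len: int):
    with open(path, "rb") as f:
        (cache_total_len,) = struct.unpack("<i", f.read(4))
    # Same rounding as the C++ side:
    past_block_num = (cache_total_len + block_len - 1) // block_len
    return cache_total_len, past_block_num
```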

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@@ -3,8 +3,8 @@
* @Author : chenht2022
* @Date : 2024-07-12 10:07:58
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:34:58
* @LastEditors : kkk1nak0
* @LastEditTime : 2024-08-15 07:45:18
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "linear.h"
@@ -24,10 +24,14 @@ Linear::~Linear() {
shared_mem_buffer.dealloc(this);
}
void Linear::warm_up(Backend* backend) {
void Linear::warm_up(Backend *backend) {
std::vector<float> input_fp32(config_.input_size);
std::vector<uint8_t> input(config_.input_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type));
std::vector<uint8_t> output(config_.output_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type));
std::vector<uint8_t> input(config_.input_size *
ggml_type_size(config_.hidden_type) /
ggml_blck_size(config_.hidden_type));
std::vector<uint8_t> output(config_.output_size *
ggml_type_size(config_.hidden_type) /
ggml_blck_size(config_.hidden_type));
for (int i = 0; i < config_.input_size; i++) {
input_fp32[i] = 0;
}
@@ -45,7 +49,7 @@ void Linear::forward_many(int qlen, const void* input, void* output, Backend* ba
proj_input_ptr = proj_input_;
}
int nth = config_.output_size / config_.stride;
backend->do_work_stealing_job(nth, [&](int task_id) {
backend->do_work_stealing_job(nth, nullptr, [&](int task_id) {
int ith = task_id;
void* proj_ptr = (uint8_t*)proj_ + ith * config_.stride * config_.input_size * ggml_type_size(config_.proj_type) / ggml_blck_size(config_.proj_type);
float* proj_output_ptr = proj_output_ + ith * config_.stride;
@@ -57,7 +61,7 @@ void Linear::forward_many(int qlen, const void* input, void* output, Backend* ba
from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
}
}
});
}, nullptr);
if (config_.stride % ggml_blck_size(config_.hidden_type) != 0) {
from_float(proj_output_, output, qlen * config_.output_size, config_.hidden_type);
}
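
Note that every `do_work_stealing_job` call in this and the following diffs gains two extra `nullptr` arguments around the compute lambda, which suggests the backend now accepts optional per-task hooks before and after the compute callback. Below is a minimal Python stand-in of that three-slot calling pattern, assuming the extra slots are init and finalize hooks (the exact C++ signature is not shown in this diff):

```python
# Conceptual stand-in for the new do_work_stealing_job call shape:
#   backend->do_work_stealing_job(nth, nullptr, compute_lambda, nullptr);
# The two extra slots are assumed to be optional per-task init/finalize hooks.
from typing import Callable, Optional

def do_work_stealing_job(n_tasks: int,
                         init: Optional[Callable[[int], None]],
                         compute: Callable[[int], None],
                         finalize: Optional[Callable[[int], None]]) -> None:
    # Serial stand-in; a real backend distributes task ids across worker
    # threads with work stealing.
    for task_id in range(n_tasks):
        if init is not None:
            init(task_id)
        compute(task_id)
        if finalize is not None:
            finalize(task_id)

# Equivalent of passing nullptr for both hooks:
do_work_stealing_job(4, None, lambda task_id: None, None)
```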

View file

@@ -3,8 +3,8 @@
* @Author : chenht2022
* @Date : 2024-07-16 10:43:18
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:35:04
* @LastEditors : kkk1nak0
* @LastEditTime : 2024-08-15 07:44:38
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "mlp.h"
@@ -31,10 +31,14 @@ MLP::~MLP() {
shared_mem_buffer.dealloc(this);
}
void MLP::warm_up(Backend* backend) {
void MLP::warm_up(Backend *backend) {
std::vector<float> input_fp32(config_.hidden_size);
std::vector<uint8_t> input(config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type));
std::vector<uint8_t> output(config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type));
std::vector<uint8_t> input(config_.hidden_size *
ggml_type_size(config_.hidden_type) /
ggml_blck_size(config_.hidden_type));
std::vector<uint8_t> output(config_.hidden_size *
ggml_type_size(config_.hidden_type) /
ggml_blck_size(config_.hidden_type));
for (int i = 0; i < config_.hidden_size; i++) {
input_fp32[i] = 0;
}
@@ -42,9 +46,7 @@ void MLP::warm_up(Backend* backend) {
forward_many(1, input.data(), output.data(), backend);
}
static float act_fn(float x) {
return x / (1.0f + expf(-x));
}
static float act_fn(float x) { return x / (1.0f + expf(-x)); }
void MLP::forward_many(int qlen, const void* input, void* output, Backend* backend) {
const void* gate_input_ptr;
@@ -72,7 +74,7 @@ void MLP::forward_many(int qlen, const void* input, void* output, Backend* backe
}
}
int nth = config_.intermediate_size / config_.stride;
backend->do_work_stealing_job(nth, [&](int task_id) {
backend->do_work_stealing_job(nth, nullptr, [&](int task_id) {
int ith = task_id;
void* gate_proj_ptr = (uint8_t*)gate_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
float* gate_output_ptr = gate_output_ + ith * config_.stride;
@@ -90,12 +92,12 @@ void MLP::forward_many(int qlen, const void* input, void* output, Backend* backe
from_float(intermediate_fp32_ptr, down_input_ptr, config_.stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
}
}
});
}, nullptr);
if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) != 0) {
from_float(intermediate_fp32_, down_input_, qlen * config_.intermediate_size, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
}
nth = config_.hidden_size / config_.stride;
backend->do_work_stealing_job(nth, [&](int task_id) {
backend->do_work_stealing_job(nth, nullptr, [&](int task_id) {
int ith = task_id;
void* down_proj_ptr = (uint8_t*)down_proj_ + ith * config_.stride * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
float* down_output_ptr = down_output_ + ith * config_.stride;
@@ -107,7 +109,7 @@ void MLP::forward_many(int qlen, const void* input, void* output, Backend* backe
from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
}
}
});
}, nullptr);
if (config_.stride % ggml_blck_size(config_.hidden_type) != 0) {
from_float(down_output_, output, qlen * config_.hidden_size, config_.hidden_type);
}

View file

@@ -3,8 +3,8 @@
* @Author : chenht2022
* @Date : 2024-07-22 02:03:22
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:35:07
* @LastEditors : kkk1nak0
* @LastEditTime : 2024-08-15 07:43:41
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "moe.h"
@@ -121,7 +121,7 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
}
}
int nth = config_.intermediate_size / config_.stride;
backend->do_work_stealing_job(nth * k, [&](int task_id) {
backend->do_work_stealing_job(nth * k, nullptr, [&](int task_id) {
int expert_idx = task_id / nth;
uint64_t expert_id = expert_ids[expert_idx];
int ith = task_id % nth;
@@ -139,14 +139,14 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
void* down_input_ptr = s_down_input_[expert_idx] + ith * config_.stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
from_float(intermediate_fp32_ptr, down_input_ptr, config_.stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
}
});
}, nullptr);
if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) != 0) {
for (int i = 0; i < k; i++) {
from_float(s_intermediate_fp32_[i], s_down_input_[i], config_.intermediate_size, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
}
}
nth = config_.hidden_size / config_.stride;
backend->do_work_stealing_job(nth, [&](int task_id) {
backend->do_work_stealing_job(nth, nullptr, [&](int task_id) {
int ith = task_id;
for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
s_output_fp32_[i] = 0;
@@ -165,7 +165,7 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
void* output_ptr = (uint8_t*)output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
}
});
}, nullptr);
if (config_.stride % ggml_blck_size(config_.hidden_type) != 0) {
from_float(s_output_fp32_, output, config_.hidden_size, config_.hidden_type);
}
@@ -191,7 +191,7 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
m_local_down_output_ptr_[i] = m_local_down_output_ + offset * config_.hidden_size;
offset += m_local_num_[i];
}
backend->do_work_stealing_job(qlen, [&](int i) {
backend->do_work_stealing_job(qlen, nullptr, [&](int i) {
const void* gate_input_ptr;
const void* up_input_ptr;
if (config_.hidden_type == ggml_internal_get_type_traits(config_.gate_type).vec_dot_type && config_.hidden_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
@@ -220,10 +220,10 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
memcpy(m_local_gate_input_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type), gate_input_ptr, config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type));
memcpy(m_local_up_input_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type), up_input_ptr, config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type));
}
});
}, nullptr);
int stride = QK_K;
int nth = config_.intermediate_size / stride;
backend->do_work_stealing_job(nth * config_.expert_num, [&](int task_id) {
backend->do_work_stealing_job(nth * config_.expert_num, nullptr, [&](int task_id) {
int expert_idx = task_id / nth;
int ith = task_id % nth;
void* gate_input_ptr = m_local_gate_input_ptr_[expert_idx];
@@ -242,18 +242,18 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
void* down_input_ptr = m_local_down_input_ptr_[expert_idx] + i * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) + ith * stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
from_float(intermediate_fp32_ptr, down_input_ptr, stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
}
});
}, nullptr);
stride = QK_K;
nth = config_.hidden_size / stride;
backend->do_work_stealing_job(nth * config_.expert_num, [&](int task_id) {
backend->do_work_stealing_job(nth * config_.expert_num, nullptr, [&](int task_id) {
int expert_idx = task_id / nth;
int ith = task_id % nth;
void* down_input_ptr = m_local_down_input_ptr_[expert_idx];
void* down_proj_ptr = (uint8_t*)down_proj_ + (expert_idx * config_.hidden_size + ith * stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
float* down_output_ptr = m_local_down_output_ptr_[expert_idx] + ith * stride;
llamafile_sgemm(stride, m_local_num_[expert_idx], config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.hidden_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
});
backend->do_work_stealing_job(qlen, [&](int i) {
}, nullptr);
backend->do_work_stealing_job(qlen, nullptr, [&](int i) {
for (int e = 0; e < config_.hidden_size; e++) {
m_output_fp32_[i][e] = 0;
}
@@ -263,7 +263,7 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
}
}
from_float(m_output_fp32_[i], (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), config_.hidden_size, config_.hidden_type);
});
}, nullptr);
}
void MOE::forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend) {
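
In `MOE::forward_many` above, each routed token is first copied into a contiguous per-expert input buffer (the `m_local_pos_` / `m_local_num_` bookkeeping), so that each expert can run a single batched `llamafile_sgemm` over all tokens assigned to it before the results are scattered back with the routing weights. A small sketch of that grouping step, with illustrative names:

```python
# Sketch of the token-by-expert grouping used before the per-expert matmuls.
# Names are illustrative; the C++ code keeps the same bookkeeping in
# m_local_pos_ (slot per token copy) and m_local_num_ (tokens per expert).
import numpy as np

def group_tokens_by_expert(expert_ids: np.ndarray, expert_num: int):
    """expert_ids: [qlen, k] routing decisions for k experts per token."""
    qlen, k = expert_ids.shape
    local_pos = {}                                  # (i, j) -> slot in expert buffer
    local_num = np.zeros(expert_num, dtype=np.int64)
    for i in range(qlen):
        for j in range(k):
            e = int(expert_ids[i, j])
            local_pos[(i, j)] = int(local_num[e])   # where this token copy lands
            local_num[e] += 1                       # buffer length per expert
    return local_pos, local_num
```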

99 ktransformers/local_chat.py Executable file → Normal file
View file

@@ -1,20 +1,14 @@
# Copyright 2024 Shaoyuan Chen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Description :
Author : Boxin Zhang, Azure-Tang
Version : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os
import platform
import sys
project_dir = os.path.dirname(os.path.dirname(__file__))
sys.path.insert(0, project_dir)
import torch
@@ -31,6 +25,7 @@ import fire
from ktransformers.optimize.optimize import optimize_and_load_gguf
from ktransformers.models.modeling_deepseek import DeepseekV2ForCausalLM
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeForCausalLM
from ktransformers.models.modeling_llama import LlamaForCausalLM
from ktransformers.models.modeling_mixtral import MixtralForCausalLM
from ktransformers.util.utils import prefill_and_generate
from ktransformers.server.config.config import Config
@@ -38,38 +33,56 @@ from ktransformers.server.config.config import Config
custom_models = {
"DeepseekV2ForCausalLM": DeepseekV2ForCausalLM,
"Qwen2MoeForCausalLM": Qwen2MoeForCausalLM,
"LlamaForCausalLM": LlamaForCausalLM,
"MixtralForCausalLM": MixtralForCausalLM,
}
ktransformer_rules_dir = os.path.dirname(os.path.abspath(__file__)) + "/optimize/optimize_rules/"
default_optimize_rules ={
ktransformer_rules_dir = (
os.path.dirname(os.path.abspath(__file__)) + "/optimize/optimize_rules/"
)
default_optimize_rules = {
"DeepseekV2ForCausalLM": ktransformer_rules_dir + "DeepSeek-V2-Chat.yaml",
"Qwen2MoeForCausalLM": ktransformer_rules_dir + "Qwen2-57B-A14B-Instruct.yaml",
"LlamaForCausalLM": ktransformer_rules_dir + "Internlm2_5-7b-Chat-1m.yaml",
"MixtralForCausalLM": ktransformer_rules_dir + "Mixtral.yaml",
}
def local_chat(
model_path: str,
model_path: str | None = None,
optimize_rule_path: str = None,
gguf_path: str = None,
gguf_path: str | None = None,
max_new_tokens: int = 1000,
cpu_infer: int = Config().cpu_infer,
use_cuda_graph: bool = True,
prompt_file : str | None = None,
mode: str = "normal",
):
torch.set_grad_enabled(False)
Config().cpu_infer = cpu_infer
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
torch.set_default_dtype(config.torch_dtype)
if mode == 'long_context':
torch.set_default_dtype(torch.float16)
else:
torch.set_default_dtype(config.torch_dtype)
with torch.device("meta"):
if config.architectures[0] in custom_models:
print("using custom modeling_xxx.py.")
if "Qwen2Moe" in config.architectures[0]: # Qwen2Moe must use flash_attention_2 to avoid overflow.
if (
"Qwen2Moe" in config.architectures[0]
): # Qwen2Moe must use flash_attention_2 to avoid overflow.
config._attn_implementation = "flash_attention_2"
if "Mixtral" in config.architectures[0]:
if "Llama" in config.architectures[0]:
config._attn_implementation = "eager"
if "Mixtral" in config.architectures[0]:
config._attn_implementation = "flash_attention_2"
model = custom_models[config.architectures[0]](config)
else:
model = AutoModelForCausalLM.from_config(
@@ -95,26 +108,50 @@ def local_chat(
if model.generation_config.pad_token_id is None:
model.generation_config.pad_token_id = model.generation_config.eos_token_id
model.eval()
logging.basicConfig(level=logging.INFO)
system = platform.system()
if (system == u'Windows'):
os.system('cls')
if system == "Windows":
os.system("cls")
else:
os.system('clear')
os.system("clear")
while True:
content = input("Chat: ")
if content == "":
content = "Please write a piece of quicksort code in C++."
if content.startswith('"""'): # prefix """
# multi lines input
content = content[3:] + "\n"
while True:
line = input("")
if line.endswith('"""'):
# end multi lines input
line = line[:-3] # suffix """
if line:
content += line + "\n"
break
else:
content += line + "\n"
if content == "":
if prompt_file != None:
content = open(prompt_file, "r").read()
else:
content = "Please write a piece of quicksort code in C++."
elif os.path.isfile(content):
content = open(content, "r").read()
messages = [{"role": "user", "content": content}]
input_tensor = tokenizer.apply_chat_template(
messages, add_generation_prompt=True, return_tensors="pt"
)
torch.set_default_dtype(torch.bfloat16) # TODO: Remove this, replace dtype using config
generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph)
assert Config().long_context_config['max_seq_len'] > input_tensor.shape[1] + max_new_tokens, \
"please change max_seq_len in ~/.ktransformers/config.yaml"
torch.set_default_dtype(
torch.bfloat16
) # TODO: Remove this, replace dtype using config
generated = prefill_and_generate(
model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode
)
if __name__ == "__main__":
fire.Fire(local_chat)
fire.Fire(local_chat)
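
A usage sketch for the rewritten entry point; the paths are placeholders, and because the function is exposed through `fire.Fire`, the same keyword arguments double as command-line flags:

```python
# Illustrative invocation only; model/GGUF paths are placeholders.
# CLI equivalent (flags map 1:1 onto the keyword arguments via fire):
#   python ktransformers/local_chat.py --model_path <hf_dir> --gguf_path <gguf_dir>
from ktransformers.local_chat import local_chat

local_chat(
    model_path="/path/to/DeepSeek-V2-Chat",        # HF config/tokenizer dir
    gguf_path="/path/to/DeepSeek-V2-Chat-GGUF",    # quantized weights
    max_new_tokens=1000,
    use_cuda_graph=True,
    mode="normal",   # "long_context" switches the default dtype to float16
)
```

At the `Chat:` prompt, wrapping input in triple quotes (`"""` ... `"""`) enables multi-line entry, and an empty line falls back to `prompt_file` (if given) or the built-in quicksort prompt.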

View file

@@ -0,0 +1,203 @@
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""LLaMA model configuration"""
from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_rope_utils import rope_config_validation
class LlamaConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the LLaMA-7B.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 32000):
Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`LlamaModel`]
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 11008):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Transformer decoder.
num_key_value_heads (`int`, *optional*):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by meanpooling all the original heads within that group. For more details checkout [this
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
`num_attention_heads`.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with. Llama 1 supports up to 2048 tokens,
Llama 2 up to 4096, CodeLlama up to 16384.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*):
Padding token id.
bos_token_id (`int`, *optional*, defaults to 1):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 2):
End of stream token id.
pretraining_tp (`int`, *optional*, defaults to 1):
Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to
understand more about it. This value is necessary to ensure exact reproducibility of the pretraining
results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232).
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
accordingly.
Expected contents:
`rope_type` (`str`):
The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
'llama3'], with 'default' being the original RoPE implementation.
`factor` (`float`, *optional*):
Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
most scaling types, a `factor` of x will enable the model to handle sequences of length x *
original maximum pre-trained length.
`original_max_position_embeddings` (`int`, *optional*):
Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
pretraining.
`attention_factor` (`float`, *optional*):
Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
computation. If unspecified, it defaults to value recommended by the implementation, using the
`factor` field to infer the suggested value.
`beta_fast` (`float`, *optional*):
Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
ramp function. If unspecified, it defaults to 32.
`beta_slow` (`float`, *optional*):
Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
ramp function. If unspecified, it defaults to 1.
`short_factor` (`List[float]`, *optional*):
Only used with 'longrope'. The scaling factor to be applied to short contexts (<
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
size divided by the number of attention heads divided by 2
`long_factor` (`List[float]`, *optional*):
Only used with 'longrope'. The scaling factor to be applied to long contexts (<
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
size divided by the number of attention heads divided by 2
`low_freq_factor` (`float`, *optional*):
Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
`high_freq_factor` (`float`, *optional*):
Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
mlp_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
```python
>>> from transformers import LlamaModel, LlamaConfig
>>> # Initializing a LLaMA llama-7b style configuration
>>> configuration = LlamaConfig()
>>> # Initializing a model from the llama-7b style configuration
>>> model = LlamaModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "llama"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=32000,
hidden_size=4096,
intermediate_size=11008,
num_hidden_layers=32,
num_attention_heads=32,
num_key_value_heads=None,
hidden_act="silu",
max_position_embeddings=2048,
initializer_range=0.02,
rms_norm_eps=1e-6,
use_cache=True,
pad_token_id=None,
bos_token_id=1,
eos_token_id=2,
pretraining_tp=1,
tie_word_embeddings=False,
rope_theta=10000.0,
rope_scaling=None,
attention_bias=False,
attention_dropout=0.0,
mlp_bias=False,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.pretraining_tp = pretraining_tp
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.mlp_bias = mlp_bias
# Validate the correctness of rotary position embeddings parameters
# BC: if there is a 'type' field, move it to 'rope_type'.
if self.rope_scaling is not None and "type" in self.rope_scaling:
self.rope_scaling["rope_type"] = self.rope_scaling["type"]
rope_config_validation(self)
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
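
A small illustration of the `rope_scaling` backward-compatibility shim at the end of `__init__`, which copies a legacy `type` key into `rope_type` before validation (values chosen only for the example):

```python
# Assumes ktransformers is importable; the rope_scaling values are arbitrary.
from ktransformers.models.configuration_llama import LlamaConfig

cfg = LlamaConfig(rope_scaling={"type": "dynamic", "factor": 2.0})
assert cfg.rope_scaling["rope_type"] == "dynamic"   # legacy "type" was copied over
```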

File diff suppressed because it is too large

View file

@@ -1,67 +1,128 @@
'''
"""
Description :
Author : Boxin Zhang
Version : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
"""
from torch import nn
from ktransformers.models.modeling_deepseek import DeepseekV2YarnRotaryEmbedding, DeepseekV2RotaryEmbedding
from transformers import ROPE_INIT_FUNCTIONS
from ktransformers.models.modeling_llama import (
LlamaRotaryEmbedding,
LlamaLinearScalingRotaryEmbedding,
LlamaDynamicNTKScalingRotaryEmbedding,
)
from ktransformers.models.modeling_deepseek import (
DeepseekV2YarnRotaryEmbedding,
DeepseekV2RotaryEmbedding,
)
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.custom_gguf import GGUFLoader
from ktransformers.util.utils import InferenceState
from transformers.configuration_utils import PretrainedConfig
# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->Qwen2Moe
class RotaryEmbedding(BaseInjectedModule, DeepseekV2RotaryEmbedding):
def __init__(self,
key: str,
gguf_loader : GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
# device: str = "cuda",
generate_device: str = "cuda",
prefill_device: str = "cuda",
**kwargs):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
self.orig_module.__init__(orig_module.dim,
orig_module.max_position_embeddings,
orig_module.base)
def __init__(
self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
# device: str = "cuda",
generate_device: str = "cuda",
prefill_device: str = "cuda",
**kwargs,
):
BaseInjectedModule.__init__(
self, key, gguf_loader, config, orig_module, generate_device, **kwargs
)
self.orig_module.__init__(
orig_module.dim, orig_module.max_position_embeddings, orig_module.base
)
self.generate_device = generate_device
self.prefill_device = prefill_device
def load(self):
self.orig_module.__init__(self.orig_module.dim,
self.orig_module.__init__(
self.orig_module.dim,
self.orig_module.max_position_embeddings,
self.orig_module.base,
self.device)
class YarnRotaryEmbedding(BaseInjectedModule, DeepseekV2YarnRotaryEmbedding):
def __init__(self,
key: str,
gguf_loader : GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
# device: str = "cuda",
generate_device: str = "cuda",
prefill_device: str = "cuda",
**kwargs):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
self.orig_module.__init__(orig_module.dim,
self.device,
)
class RotaryEmbeddingV2(BaseInjectedModule, LlamaRotaryEmbedding):
def __init__(
self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
generate_device: str = "cuda",
prefill_device: str = "cuda",
**kwargs,
):
BaseInjectedModule.__init__(
self, key, gguf_loader, config, orig_module, generate_device, **kwargs
)
self.orig_module.__init__(
orig_module.dim,
orig_module.max_position_embeddings,
orig_module.base,
None, #device
None,
orig_module.scaling_factor,
orig_module.rope_type,
orig_module.config,
)
self.generate_device = generate_device
self.prefill_device = prefill_device
def load(self):
self.orig_module.__init__(
self.orig_module.dim,
self.orig_module.max_position_embeddings,
self.orig_module.base,
self.device,
self.orig_module.scaling_factor,
self.orig_module.rope_type,
self.orig_module.config,
)
class YarnRotaryEmbedding(BaseInjectedModule, DeepseekV2YarnRotaryEmbedding):
def __init__(
self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
# device: str = "cuda",
generate_device: str = "cuda",
prefill_device: str = "cuda",
**kwargs,
):
BaseInjectedModule.__init__(
self, key, gguf_loader, config, orig_module, generate_device, **kwargs
)
self.orig_module.__init__(
orig_module.dim,
orig_module.max_position_embeddings,
orig_module.base,
None, # device
orig_module.scaling_factor,
orig_module.original_max_position_embeddings,
orig_module.beta_fast,
orig_module.beta_slow,
orig_module.mscale,
orig_module.mscale_all_dim)
orig_module.mscale_all_dim,
)
self.generate_device = generate_device
self.prefill_device = prefill_device
def load(self):
self.orig_module.__init__(self.orig_module.dim,
self.orig_module.__init__(
self.orig_module.dim,
self.orig_module.max_position_embeddings,
self.orig_module.base,
self.generate_device,
@@ -70,5 +131,42 @@ class YarnRotaryEmbedding(BaseInjectedModule, DeepseekV2YarnRotaryEmbedding):
self.orig_module.beta_fast,
self.orig_module.beta_slow,
self.orig_module.mscale,
self.orig_module.mscale_all_dim)
self.orig_module.mscale_all_dim,
)
class DynamicNTKScalingRotaryEmbedding(
BaseInjectedModule, LlamaDynamicNTKScalingRotaryEmbedding
):
def __init__(
self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
**kwargs,
):
BaseInjectedModule.__init__(
self, key, gguf_loader, config, orig_module, device, **kwargs
)
self.orig_module.__init__(
orig_module.dim,
orig_module.max_position_embeddings,
orig_module.base,
None, # device
orig_module.scaling_factor,
orig_module.rope_type,
orig_module.config,
)
def load(self):
self.orig_module.__init__(
self.orig_module.dim,
self.orig_module.max_position_embeddings,
self.orig_module.base,
self.orig_module.device,
self.orig_module.scaling_factor,
self.orig_module.rope_type,
self.orig_module.config,
)

View file

@@ -7,16 +7,22 @@ Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
import torch
from torch import nn
import warnings
import torch.nn.functional as F
from ktransformers.operators.models import KLlamaModel
from ktransformers.models.configuration_deepseek import DeepseekV2Config
from ktransformers.models.configuration_llama import LlamaConfig
from ktransformers.models.modeling_llama import LlamaRotaryEmbedding
from ktransformers.models.modeling_deepseek import DeepseekV2Attention, apply_rotary_pos_emb
from typing import Optional, Tuple
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.custom_gguf import GGUFLoader
import logging
from transformers.configuration_utils import PretrainedConfig
from transformers.cache_utils import Cache
logger = logging.getLogger("attention")
class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
attn_mask: Optional[torch.Tensor] = None
def __init__(self,
key: str,
@@ -24,10 +30,12 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
chunck_size: int = 1000,
**kwargs):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
self.orig_module.__init__(orig_module.config,
orig_module.layer_idx)
self.chunck_size = chunck_size # TODO, generate chunck_size automatically.
def get_absorbed(self) -> Tuple[torch.Tensor, torch.Tensor]:
if not (hasattr(self, 'q_absorb') and hasattr(self, 'out_absorb')):
@@ -157,9 +165,8 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
"Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
)
bsz, q_len, _ = hidden_states.size()
chunck_size = 256 # TODO, generate chunck_size automatically.
if q_len <= chunck_size:
if q_len <= self.chunck_size:
return self.forward_chunck(
hidden_states,
attention_mask,
@@ -176,24 +183,170 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
cur_idx = 0
while cur_idx < q_len:
if attention_mask is not None:
chunk_mask = attention_mask[:, :, cur_idx:min(cur_idx + chunck_size, q_len), ...]
chunk_mask = attention_mask[:, :, cur_idx:min(cur_idx + self.chunck_size, q_len), ...]
else:
chunk_mask = None
# generate chunk_mask automatically.
self.attn_mask = \
torch.zeros(1, 1, self.chunck_size, past_key_value.max_cache_len, device=hidden_states.device) \
if self.attn_mask is None \
else self.attn_mask
self.attn_mask[:, :, :, cur_idx:min(cur_idx+self.chunck_size, past_key_value.max_cache_len)] = \
-1e+38 * torch.triu(torch.ones(self.chunck_size, self.chunck_size, device=hidden_states.device), diagonal=1)\
[:,:min(self.chunck_size, min(past_key_value.max_cache_len-cur_idx, self.chunck_size))]
self.attn_mask[:, :, :, cur_idx+self.chunck_size:] = -1e+38
self.attn_mask[:, :, :, :cur_idx] = 0
chunck_mask = torch.narrow(self.attn_mask, 2, 0, min(self.chunck_size, q_len-cur_idx))
cur_output, _, _ = self.forward_chunck(
hidden_states[:, cur_idx:min(cur_idx + chunck_size, q_len), ...],
chunk_mask,
position_ids[:, cur_idx:min(cur_idx + chunck_size, q_len)],
hidden_states[:, cur_idx:min(cur_idx + self.chunck_size, q_len), ...],
chunck_mask,
position_ids[:, cur_idx:min(cur_idx + self.chunck_size, q_len)],
past_key_value,
output_attentions,
use_cache,
cache_position[cur_idx:min(cur_idx + chunck_size, q_len)],
cache_position[cur_idx:min(cur_idx + self.chunck_size, q_len)],
**kwargs
)
cur_idx += chunck_size
cur_idx += self.chunck_size
if attn_output is None:
attn_output = cur_output
else:
attn_output = torch.cat((attn_output, cur_output), dim=-2)
return attn_output, None, past_key_value
def rotate_half(x):
"""Rotates half the hidden dims of the input."""
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=-1)
class KLlamaAttention(BaseInjectedModule):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self,
key: str,
gguf_loader : GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
**kwargs):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
self.orig_module.__init__(orig_module.config,
orig_module.layer_idx)
def apply_rotary_pos_emb(self, q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
"""Applies Rotary Position Embedding to the query and key tensors.
Args:
q (`torch.Tensor`): The query tensor.
k (`torch.Tensor`): The key tensor.
cos (`torch.Tensor`): The cosine part of the rotary embedding.
sin (`torch.Tensor`): The sine part of the rotary embedding.
position_ids (`torch.Tensor`, *optional*):
Deprecated and unused.
unsqueeze_dim (`int`, *optional*, defaults to 1):
The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
`tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
"""
cos = cos.unsqueeze(unsqueeze_dim)
sin = sin.unsqueeze(unsqueeze_dim)
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
return q_embed, k_embed
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
bsz, q_len, _ = hidden_states.size()
if self.config.pretraining_tp > 1:
key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
query_slices = self.q_proj.weight.split(
(self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
)
key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
query_states = torch.cat(query_states, dim=-1)
key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
key_states = torch.cat(key_states, dim=-1)
value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
value_states = torch.cat(value_states, dim=-1)
else:
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
if position_embeddings is None:
logger.warning(
"The attention layers in this model are transitioning from computing the RoPE embeddings internally "
"through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
"`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be "
"removed and `position_embeddings` will be mandatory."
)
cos, sin = self.rotary_emb(value_states, position_ids)
else:
cos, sin = position_embeddings
query_states, key_states = self.apply_rotary_pos_emb(query_states, key_states, cos, sin)
if q_len == 1:
position_ids = position_ids[0][-1].unsqueeze(0).unsqueeze(0)
query_states = query_states[:, :, -1:]
key_states = key_states[:, :, -1:]
attn_output = KLlamaModel.dynamic_sdpa.apply(
self.layer_idx,
bsz,
position_ids[0][0],
query_states.transpose(1, 2).to(torch.float16),
key_states.transpose(1, 2).to(torch.float16),
value_states.transpose(1, 2).to(torch.float16),
mode="prefill" if q_len > 1 else "generate",
)
if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
f" {attn_output.size()}"
)
attn_output = attn_output.transpose(1, 2).contiguous()
attn_output = attn_output.reshape(bsz, q_len, -1)
if self.config.pretraining_tp > 1:
attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
else:
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value
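
The chunked prefill path in `KDeepseekV2Attention.forward` rebuilds `attn_mask` for every chunk so that keys before the chunk stay fully visible, keys inside the chunk are causally masked, and keys after it are blocked with `-1e38`. A standalone sketch of that mask construction (the helper name and signature are illustrative):

```python
# Illustrative re-statement of the per-chunk mask logic above.
import torch

def chunk_causal_mask(chunk_size: int, cur_idx: int, max_cache_len: int,
                      device: str = "cpu") -> torch.Tensor:
    mask = torch.zeros(1, 1, chunk_size, max_cache_len, device=device)
    inner = min(chunk_size, max_cache_len - cur_idx)
    # Causal (upper-triangular) block for the keys belonging to this chunk.
    tri = -1e38 * torch.triu(
        torch.ones(chunk_size, chunk_size, device=device), diagonal=1
    )[:, :inner]
    mask[:, :, :, cur_idx:cur_idx + inner] = tri
    mask[:, :, :, cur_idx + chunk_size:] = -1e38   # later chunks not yet visible
    # Columns [0, cur_idx) stay 0: earlier context is fully attendable.
    return mask
```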

View file

@@ -1,18 +1,746 @@
#!/usr/bin/env python
# coding=utf-8
"""
Description : This script defines the `CPUInferKVCache` and `CPUInfer` classes for performing inference
with a Key-Value Cache on the CPU. The `CPUInferKVCache` class is responsible for configuring
and managing key-value caches, updating and retrieving cache data, and handling attention
operations. It supports different cache types (e.g., Q4_0, FP16) and retrieval strategies
(e.g., shared, separate). The `CPUInfer` class handles task submission and synchronization
on the CPU, with optional CUDA stream integration for tasks involving GPU acceleration.
These classes facilitate efficient caching and memory management for deep learning models
that leverage key-value attention mechanisms, particularly on CPU-based systems.
Author : djw
Date : 2024-08-26 23:25:24
Version : 1.0.0
LastEditors : djw
LastEditTime : 2024-08-26 23:25:24
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import sys, os
from typing import Any
import torch
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build"))
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build", "Release"))
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build", "Debug"))
import cpuinfer_ext
from ktransformers.server.config.config import Config
class CPUInferKVCache:
def __init__(
self,
layer_num: int = 32,
kv_head_num: int = 8,
q_head_num: int = 32,
head_dim: int = 128,
block_len: int = 256,
anchor_num: int = 4,
anchor_type: str = "FIXED",
kv_type: str = "Q4_0",
retrieval_type: str = "SHARED",
layer_step: int = 1,
token_step: int = 1,
layer_offset: int = 0,
max_thread_num: int = 32,
max_batch_size: int = 4,
max_block_num: int = 512,
):
if anchor_type == "FIXED":
anchor_type = cpuinfer_ext.kvcache.AnchorType.FIXED
elif anchor_type == "QUEST":
anchor_type = cpuinfer_ext.kvcache.AnchorType.QUEST
elif anchor_type == "DYNAMIC":
anchor_type = cpuinfer_ext.kvcache.AnchorType.DYNAMIC
elif anchor_type == "BLOCK_MEAN":
anchor_type = cpuinfer_ext.kvcache.AnchorType.BLOCK_MEAN
elif anchor_type == "BLOCK_MAX":
anchor_type = cpuinfer_ext.kvcache.AnchorType.BLOCK_MAX
else:
raise ValueError(f"Unknown anchor type: {anchor_type}")
if kv_type == "FP16":
kv_type = cpuinfer_ext.kvcache.ggml_type.FP16
elif kv_type == "FP32":
assert False, "FP32 is not supported yet."
kv_type = cpuinfer_ext.kvcache.ggml_type.FP32
elif kv_type == "Q4_0":
kv_type = cpuinfer_ext.kvcache.ggml_type.Q4_0
elif kv_type == "Q8_0":
kv_type = cpuinfer_ext.kvcache.ggml_type.Q8_0
else:
raise ValueError(f"Unknown kv type: {kv_type}")
if retrieval_type == "SHARED":
retrieval_type = cpuinfer_ext.kvcache.RetrievalType.LAYER
elif retrieval_type == "INDIVIDUAL":
retrieval_type = cpuinfer_ext.kvcache.RetrievalType.QHEAD
elif retrieval_type == "SEPARATE":
retrieval_type = cpuinfer_ext.kvcache.RetrievalType.KVHEAD
self.config = cpuinfer_ext.kvcache.KVCacheConfig(
layer_num,
kv_head_num,
q_head_num,
head_dim,
block_len,
anchor_num,
anchor_type,
kv_type,
retrieval_type,
layer_step,
token_step,
layer_offset,
max_block_num,
max_batch_size,
max_thread_num,
)
self.kvcache = cpuinfer_ext.kvcache.KVCache(self.config)
def load_kvcache(self, tensor_file_path: str):
if not os.path.exists(tensor_file_path):
raise FileNotFoundError(f"The file {tensor_file_path} does not exist.")
return self.kvcache.load_kvcache(tensor_file_path,)
def dump_kvcache(
self, block_table: torch.Tensor, cache_total_len: int, tensor_file_path: str
):
assert (
block_table.dim() == 1
and block_table.dtype == torch.int
and block_table.is_contiguous()
and block_table.device == torch.device("cpu")
), "block_table dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
block_table.dim(),
block_table.size(),
block_table.dtype,
block_table.is_contiguous(),
block_table.device,
)
assert (
cache_total_len > 0
and cache_total_len <= self.config.block_len * block_table.size(0)
), "cache_total_len: {}".format(cache_total_len)
if not os.path.exists(os.path.dirname(tensor_file_path)):
os.makedirs(os.path.dirname(tensor_file_path))
return self.kvcache.dump_kvcache(
block_table.data_ptr(),
cache_total_len,
tensor_file_path,
)
def update_cache_total_len(self, cache_total_len: int):
assert cache_total_len > 0, "cache_total_len: {}".format(cache_total_len)
self.kvcache.update_cache_total_len(cache_total_len)
# q_in: (bsz, q_len, q_head_num, head_dim)
# output: (bsz, q_len, q_head_num, head_dim)
# attn_lse: (bsz, q_len, q_head_num)
# block_table: (bsz, max_block_num)
def attn(
self,
q_in: torch.Tensor,
output: torch.Tensor,
attn_lse: torch.Tensor,
layer_idx: int,
generate_token_idx: int,
block_table: torch.Tensor | None = None,
cache_seqlens: torch.Tensor | None = None,
pick_block_num: int | None = None,
init_block_num: int | None = None,
local_block_num: int | None = None,
):
assert (
q_in.dim() == 4
and q_in.size(2) == self.config.q_head_num
and q_in.size(3) == self.config.head_dim
and q_in.dtype == torch.float16
and q_in.is_contiguous()
and q_in.device == torch.device("cpu")
), "q_in dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
q_in.dim(), q_in.size(), q_in.dtype, q_in.is_contiguous(), q_in.device
)
batch_size = q_in.size(0)
q_len = q_in.size(1)
assert (block_table is None) or (
block_table.dim() == 2
and block_table.size(0) == batch_size
and block_table.dtype == torch.int
and block_table.is_contiguous()
and block_table.device == torch.device("cpu")
), "block_table dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
block_table.dim(),
block_table.size(),
block_table.dtype,
block_table.is_contiguous(),
block_table.device,
)
max_block_num = block_table.size(1) if block_table is not None else 0
assert (
output.dim() == 4
and output.size(0) == batch_size
and output.size(2) == self.config.q_head_num
and output.size(1) == q_len
and output.size(3) == self.config.head_dim
and output.dtype == torch.float16
and output.is_contiguous()
and output.device == torch.device("cpu")
), "output dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
output.dim(),
output.size(),
output.dtype,
output.is_contiguous(),
output.device,
)
assert (
attn_lse.dim() == 3
and attn_lse.size(0) == batch_size
and attn_lse.size(1) == q_len
and attn_lse.size(2) == self.config.q_head_num
and attn_lse.dtype == torch.float32
and attn_lse.is_contiguous()
and attn_lse.device == torch.device("cpu")
), "attn_lse dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
attn_lse.dim(),
attn_lse.size(),
attn_lse.dtype,
attn_lse.is_contiguous(),
attn_lse.device,
)
assert (
layer_idx >= 0 and layer_idx < self.config.layer_num
), "layer_idx: {}".format(layer_idx)
assert (cache_seqlens is None) or (
cache_seqlens.dim() == 1
and cache_seqlens.size(0) == batch_size
and cache_seqlens.dtype == torch.int
and cache_seqlens.is_contiguous()
and cache_seqlens.device == torch.device("cpu")
), "cache_seqlens dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
cache_seqlens.dim(),
cache_seqlens.size(),
cache_seqlens.dtype,
cache_seqlens.is_contiguous(),
cache_seqlens.device,
)
return self.kvcache.attn(
q_in.data_ptr(),
output.data_ptr(),
attn_lse.data_ptr(),
layer_idx,
generate_token_idx,
q_len,
batch_size,
max_block_num,
block_table.data_ptr() if block_table is not None else 0,
cache_seqlens.data_ptr() if cache_seqlens is not None else 0,
pick_block_num,
init_block_num,
local_block_num,
)
# k_in: (block_len, kv_head_num, head_dim)
# v_in: (block_len, kv_head_num, head_dim)
def update_kvcache_one_block_fp16(
self, k_in: torch.Tensor, v_in: torch.Tensor, layer_id: int, block_idx: int
):
assert (
k_in.dim() == 3
and k_in.size(1) == self.config.block_len
and k_in.size(0) == self.config.kv_head_num
and k_in.size(2) == self.config.head_dim
and k_in.dtype == torch.float16
and k_in.is_contiguous()
and k_in.device == torch.device("cpu")
), "k_in dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
k_in.dim(), k_in.size(), k_in.dtype, k_in.is_contiguous(), k_in.device
)
assert (
v_in.dim() == 3
and v_in.size(1) == self.config.block_len
and v_in.size(0) == self.config.kv_head_num
and v_in.size(2) == self.config.head_dim
and v_in.dtype == torch.float16
and v_in.is_contiguous()
and v_in.device == torch.device("cpu")
), "v_in dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
v_in.dim(), v_in.size(), v_in.dtype, v_in.is_contiguous(), v_in.device
)
assert (
layer_id >= 0 and layer_id < self.config.layer_num
), "layer_id: {}".format(layer_id)
assert block_idx >= 0, "block_idx: {}".format(block_idx)
return self.kvcache.update_one_block_fp16(
k_in.data_ptr(),
v_in.data_ptr(),
layer_id,
block_idx,
)
def get_kvcache_one_block_fp16(
self, k_in: torch.Tensor, v_in: torch.Tensor, layer_id: int, block_idx: int
):
assert (
k_in.dim() == 3
and k_in.size(1) == self.config.block_len
and k_in.size(0) == self.config.kv_head_num
and k_in.size(2) == self.config.head_dim
and k_in.dtype == torch.float16
and k_in.is_contiguous()
and k_in.device == torch.device("cpu")
), "k_in dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
k_in.dim(), k_in.size(), k_in.dtype, k_in.is_contiguous(), k_in.device
)
assert (
v_in.dim() == 3
and v_in.size(1) == self.config.block_len
and v_in.size(0) == self.config.kv_head_num
and v_in.size(2) == self.config.head_dim
and v_in.dtype == torch.float16
and v_in.is_contiguous()
and v_in.device == torch.device("cpu")
), "v_in dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
v_in.dim(), v_in.size(), v_in.dtype, v_in.is_contiguous(), v_in.device
)
assert (
layer_id >= 0 and layer_id < self.config.layer_num
), "layer_id: {}".format(layer_id)
assert block_idx >= 0, "block_idx: {}".format(block_idx)
return self.kvcache.get_one_block_fp16(
k_in.data_ptr(),
v_in.data_ptr(),
layer_id,
block_idx,
)
def update_importance_one_block(
self, importance: torch.Tensor, layer_id: int, block_idx: int
):
assert (
importance.dim() == 1
and importance.size(0) == self.config.block_len
and importance.dtype == torch.float16
and importance.is_contiguous()
and importance.device == torch.device("cpu")
), "importance dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
importance.dim(),
importance.size(),
importance.dtype,
importance.is_contiguous(),
importance.device,
)
assert (
layer_id >= 0 and layer_id < self.config.layer_num
), "layer_id: {}".format(layer_id)
assert block_idx >= 0, "block_idx: {}".format(block_idx)
return self.kvcache.update_importance_one_block(
importance.data_ptr(),
layer_id,
block_idx,
)
def get_importance_one_block(
self, importance: torch.Tensor, layer_id: int, block_idx: int
):
assert (
importance.dim() == 1
and importance.size(0) == self.config.block_len
and importance.dtype == torch.float16
and importance.is_contiguous()
and importance.device == torch.device("cpu")
), "importance dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
importance.dim(),
importance.size(),
importance.dtype,
importance.is_contiguous(),
importance.device,
)
assert (
layer_id >= 0 and layer_id < self.config.layer_num
), "layer_id: {}".format(layer_id)
assert block_idx >= 0, "block_idx: {}".format(block_idx)
return self.kvcache.get_importance_one_block(
importance.data_ptr(),
layer_id,
block_idx,
)
def get_anchor_one_block(self, anchor: torch.Tensor, layer_id: int, block_idx: int):
assert (
anchor.dim() == 3
and anchor.size(0) == self.config.kv_head_num
and anchor.size(1) == self.config.anchor_num
and anchor.size(2) == self.config.head_dim
and anchor.dtype == torch.float16
and anchor.is_contiguous()
and anchor.device == torch.device("cpu")
), "anchor dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
anchor.dim(),
anchor.size(),
anchor.dtype,
anchor.is_contiguous(),
anchor.device,
)
assert (
layer_id >= 0 and layer_id < self.config.layer_num
), "layer_id: {}".format(layer_id)
assert block_idx >= 0, "block_idx: {}".format(block_idx)
return self.kvcache.get_anchor_one_block(
anchor.data_ptr(),
layer_id,
block_idx,
)
def update_anchor_one_block(
self, anchor: torch.Tensor, layer_id: int, block_idx: int
):
assert (
anchor.dim() == 3
and anchor.size(0) == self.config.kv_head_num
and anchor.size(1) == self.config.anchor_num
and anchor.size(2) == self.config.head_dim
and anchor.dtype == torch.float16
and anchor.is_contiguous()
and anchor.device == torch.device("cpu")
), "anchor dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
anchor.dim(),
anchor.size(),
anchor.dtype,
anchor.is_contiguous(),
anchor.device,
)
assert (
layer_id >= 0 and layer_id < self.config.layer_num
), "layer_id: {}".format(layer_id)
assert block_idx >= 0, "block_idx: {}".format(block_idx)
return self.kvcache.update_anchor_one_block(
anchor.data_ptr(),
layer_id,
block_idx,
)
def calc_anchor_all_layers(
self,
block_table: torch.Tensor,
cache_seqlens: torch.Tensor,
):
assert (
block_table.dim() == 2
and block_table.size(0) == cache_seqlens.size(0)
and block_table.dtype == torch.int
and block_table.is_contiguous()
and block_table.device == torch.device("cpu")
), "block_table dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
block_table.dim(),
block_table.size(),
block_table.dtype,
block_table.is_contiguous(),
block_table.device,
)
assert (
cache_seqlens.dim() == 1
and cache_seqlens.dtype == torch.int
and cache_seqlens.is_contiguous()
and cache_seqlens.device == torch.device("cpu")
), "cache_seqlens dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
cache_seqlens.dim(),
cache_seqlens.size(),
cache_seqlens.dtype,
cache_seqlens.is_contiguous(),
cache_seqlens.device,
)
batch_size = block_table.size(0)
max_block_num = block_table.size(1)
return self.kvcache.calc_anchor_all_layers(
block_table.data_ptr(),
cache_seqlens.data_ptr(),
batch_size,
max_block_num,
)
def clear_importance_all_layers(
self,
block_table: torch.Tensor,
cache_seqlens: torch.Tensor,
):
assert (
block_table.dim() == 2
and block_table.size(0) == cache_seqlens.size(0)
and block_table.dtype == torch.int
and block_table.is_contiguous()
and block_table.device == torch.device("cpu")
), "block_table dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
block_table.dim(),
block_table.size(),
block_table.dtype,
block_table.is_contiguous(),
block_table.device,
)
assert (
cache_seqlens.dim() == 1
and cache_seqlens.dtype == torch.int
and cache_seqlens.is_contiguous()
and cache_seqlens.device == torch.device("cpu")
), "cache_seqlens dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
cache_seqlens.dim(),
cache_seqlens.size(),
cache_seqlens.dtype,
cache_seqlens.is_contiguous(),
cache_seqlens.device,
)
batch_size = block_table.size(0)
max_block_num = block_table.size(1)
return self.kvcache.clear_importance_all_layers(
block_table.data_ptr(),
cache_seqlens.data_ptr(),
batch_size,
max_block_num,
)
def get_cache_total_len(self):
return self.kvcache.get_cache_total_len()
def update_kvcache_q4(
self,
k_in: torch.Tensor,
k_scales: torch.Tensor,
v_in: torch.Tensor,
v_scales: torch.Tensor,
layer_id: int,
seq_offset: int | None = None,
seq_len: int | None = None,
block_table: torch.Tensor | None = None,
):
raise NotImplementedError
def update_kvcache_fp16(
self,
k_in: torch.Tensor,
v_in: torch.Tensor,
layer_idx,
block_table: torch.Tensor,
max_block_num,
past_len: torch.Tensor,
q_len,
):
batch_size = block_table.size(0)
        return self.kvcache.update_kvcache_fp16(
k_in.data_ptr(),
v_in.data_ptr(),
layer_idx,
block_table.data_ptr(),
batch_size,
max_block_num,
past_len.data_ptr(),
q_len
)
def get_kvcache_q4(
self,
k_in: torch.Tensor,
k_scales: torch.Tensor,
v_in: torch.Tensor,
v_scales: torch.Tensor,
layer_id: int,
seq_offset: int | None = None,
seq_len: int | None = None,
block_table: torch.Tensor | None = None,
):
raise NotImplementedError
    def get_kvcache_fp16(
        self,
        k_in: torch.Tensor,
        v_in: torch.Tensor,
        layer_idx: int,
        block_table: torch.Tensor,
        max_block_num,
        past_len: torch.Tensor,
    ):
batch_size = block_table.size(0)
return self.kvcache.get_kvcache_fp16(
k_in.data_ptr(),
v_in.data_ptr(),
layer_idx,
block_table.data_ptr(),
batch_size,
max_block_num,
past_len.data_ptr(),
)
def get_and_update_kvcache_fp16(
self,
k_cache_cpu: torch.Tensor,
v_cache_cpu: torch.Tensor,
layer_idx,
block_table: torch.Tensor,
max_block_num,
past_len: torch.Tensor,
q_len,
):
batch_size = block_table.size(0)
return self.kvcache.get_and_update_kvcache_fp16(
k_cache_cpu.data_ptr(),
v_cache_cpu.data_ptr(),
layer_idx,
block_table.data_ptr(),
batch_size,
max_block_num,
past_len.data_ptr(),
q_len,
)
def update_importance(
self,
importance_cache: torch.Tensor,
layer_idx,
block_table: torch.Tensor,
max_block_num,
offset: torch.Tensor,
width,
):
batch_size = block_table.size(0)
return self.kvcache.update_importance(
importance_cache.data_ptr(),
layer_idx,
block_table.data_ptr(),
batch_size,
max_block_num,
offset.data_ptr(),
width,
)
# attn_sparsity: ((bsz, q_len, q_head_num), dtype = torch.float32)
def get_attn_sparsity(
self,
q_in: torch.Tensor,
attn_sparsity: torch.Tensor,
layer_idx: int,
block_table: torch.Tensor,
cache_seqlens: torch.Tensor,
block_table_origin: torch.Tensor,
cache_seqlens_origin: torch.Tensor,
generate_token_idx: int = 0,
topk: int | None = None,
local: int | None = None,
):
batch_size = block_table.size(0)
max_block_num = block_table.size(1)
max_block_num_origin = block_table_origin.size(1)
q_len = q_in.size(1)
if topk is None or local is None or topk + local >= max_block_num:
topk = -1
local = -1
return self.kvcache.get_attn_sparsity(
q_in.data_ptr(),
attn_sparsity.data_ptr(),
layer_idx,
generate_token_idx,
q_len,
batch_size,
max_block_num,
block_table.data_ptr(),
cache_seqlens.data_ptr(),
block_table_origin.data_ptr(),
cache_seqlens_origin.data_ptr(),
max_block_num_origin,
topk,
local,
)
def attn_with_kvcache(
self,
q_in: torch.Tensor,
k_in: torch.Tensor,
v_in: torch.Tensor,
output: torch.Tensor,
attn_lse: torch.Tensor,
layer_idx: int,
block_table: torch.Tensor,
cache_seqlens: torch.Tensor,
generate_token_idx: int = 0,
topk: int | None = None,
local: int | None = None,
):
batch_size = block_table.size(0)
max_block_num = block_table.size(1)
q_len = q_in.size(1)
if topk is None or local is None or topk + local >= max_block_num:
topk = -1
local = -1
return self.kvcache.attn_with_kvcache(
q_in.data_ptr(),
k_in.data_ptr(),
v_in.data_ptr(),
output.data_ptr(),
attn_lse.data_ptr(),
layer_idx,
generate_token_idx,
q_len,
batch_size,
max_block_num,
block_table.data_ptr(),
cache_seqlens.data_ptr(),
topk,
local,
)
def get_all_kvcache_one_layer(
self, k_in: torch.Tensor, v_in: torch.Tensor, layer_id: int
):
return self.kvcache.get_all_kvcache_one_layer(
k_in.data_ptr(),
v_in.data_ptr(),
layer_id,
)
def get_importance(
self,
importance: torch.Tensor,
block_table: torch.Tensor,
):
raise NotImplementedError
def get_anchor(
self,
anchor: torch.Tensor,
block_table: torch.Tensor,
):
raise NotImplementedError
class CPUInfer:
cpu_infer = None
def __init__(self, cpu_infer:int = Config().cpu_infer):
if CPUInfer.cpu_infer is None:
CPUInfer.cpu_infer = cpuinfer_ext.CPUInfer(cpu_infer)
cpuinfer = None
def __init__(self, thread_num):
CPUInfer.cpuinfer = cpuinfer_ext.CPUInfer(thread_num)
def submit(self, task):
CPUInfer.cpuinfer.submit(task)
def submit_with_cuda_stream(self, current_cuda_stream, task):
CPUInfer.cpuinfer.submit_with_cuda_stream(current_cuda_stream, task)
def sync(self):
CPUInfer.cpuinfer.sync()
def sync_with_cuda_stream(self, current_cuda_stream):
CPUInfer.cpuinfer.sync_with_cuda_stream(current_cuda_stream)
def __getattribute__(self, __name: str) -> Any:
return CPUInfer.cpu_infer.__getattribute__(__name)
def __setattr__(self, __name: str, __value: Any) -> None:
return CPUInfer.cpu_infer.__setattr__(__name, __value)
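The KV-cache wrapper methods above and this CPUInfer class are consumed through the same submit/sync task pattern. A minimal sketch, assuming the compiled cpuinfer_ext backend is importable; the thread count, model dimensions and tensor shapes below are placeholders.

import torch

cpu_infer = CPUInfer(48)  # worker thread count is an assumption
kv_wrapper = CPUInferKVCache(32, 8, 32, 128, 128)  # (layer_num, kv_head_num, q_head_num, head_dim, block_len); assumed dims

block_table = torch.arange(16, dtype=torch.int32).view(1, -1)
cache_seqlens = torch.tensor([1024], dtype=torch.int32)

# Blocking path: build a task, hand it to the CPU workers, wait for completion.
cpu_infer.submit(kv_wrapper.calc_anchor_all_layers(block_table, cache_seqlens))
cpu_infer.sync()

# Overlapped with GPU work, as dynamic_attention.py does below.
stream = torch.cuda.current_stream("cuda").cuda_stream
cpu_infer.submit_with_cuda_stream(
    stream, kv_wrapper.clear_importance_all_layers(block_table, cache_seqlens)
)
cpu_infer.sync_with_cuda_stream(stream)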

View file

@ -0,0 +1,775 @@
#!/usr/bin/env python
# coding=utf-8
"""
Description :
Author : Jianwei Dong
Date : 2024-08-26 23:25:24
Version : 1.0.0
LastEditors : Jianwei Dong
LastEditTime : 2024-08-26 23:25:24
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import torch
from transformers import AutoConfig
import sys, os
import logging
logger = logging.getLogger("dynamic_attention")
sys.path.append(os.path.dirname(__file__) + "/../ktransformers_ext/cpu_backend")
from ktransformers.operators.cpuinfer import CPUInfer, CPUInferKVCache
from flash_attn import flash_attn_func, flash_attn_with_kvcache
import math
import json
class DynamicScaledDotProductAttention:
remaining_length: int
def __init__(
self,
max_seq_len: int,
block_size: int,
config: AutoConfig,
device: torch.device,
local_windows_len: int,
topk: int,
threads_num: int,
anchor_type: str = "DYNAMIC",
kv_type: str = "FP16",
dense_layer_num: int = 0,
anchor_num: int = 1,
block_selection_mode: str = "SHARED",
layer_step: int = 1,
token_step: int = 1,
preselect_block: bool = False,
preselect_block_count: int = 96,
prefill_chunk_size: int = 20480,
use_attn_sparsity: bool = False,
):
# assert anchor_num == 1
# assert anchor_type == "DYNAMIC"
self.remaining_length = 0
valid_anchor_types = ["DYNAMIC", "FIXED", "BLOCK_MEAN", "BLOCK_MAX", "QUEST"]
assert anchor_type in valid_anchor_types
if anchor_type == "QUEST":
assert anchor_num == 2
elif anchor_type != "FIXED" and anchor_type != "DYNAMIC":
assert anchor_num == 1
valid_kv_types = ["FP16", "FP32", "Q4_0", "Q8_0"]
assert kv_type in valid_kv_types
if kv_type != "FP16" and kv_type != "FP32":
assert block_size % 32 == 0
valid_block_selection_modes = ["SHARED", "SEPARATE"] # individual
assert block_selection_mode in valid_block_selection_modes
self.max_seq_len = max_seq_len
self.block_num = max_seq_len // block_size
self.block_size = block_size
self.anchor_type = anchor_type
self.kv_type = kv_type
self.anchor_num = anchor_num
self.threads_num = threads_num
self.layer_step = layer_step
self.token_step = token_step
self.preselect_block = preselect_block
self.preselect_block_count = preselect_block_count
self.block_selection_mode = block_selection_mode
self.use_attn_sparsity = use_attn_sparsity
# model config
self.kv_head_num = config.num_key_value_heads
self.q_head_num = config.num_attention_heads
self.head_dim = config.hidden_size // config.num_attention_heads
self.layer_num = config.num_hidden_layers
self.device = device
self.local_windows_len = local_windows_len
self.local_block_num = self.local_windows_len // self.block_size + 1
self.prefill_chunk_size = prefill_chunk_size
self.topk = topk
self.dense_layer_num = dense_layer_num
# self.dense_layer_num = 32
self.cache_key_states = torch.zeros(
(self.block_num, block_size, self.kv_head_num, self.head_dim),
device=device,
dtype=torch.float16,
)
self.cache_value_states = torch.zeros(
(self.block_num, block_size, self.kv_head_num, self.head_dim),
device=device,
dtype=torch.float16,
)
# [max_num_block, block_size, head_num]
self.cache_importance = torch.zeros(
(self.block_num, block_size, self.q_head_num),
device=device,
dtype=torch.float16,
)
# key_states: [bsz, q_len, kv_head_num, head_dim]
# value_states: [bsz, q_len, kv_head_num, head_dim]
# query_states: [bsz, q_len, q_head_num, head_dim]
self.q_in_cpu = torch.zeros(
(1, 1, self.q_head_num, self.head_dim),
device="cpu",
dtype=torch.float16,
pin_memory=True,
)
self.k_in_cpu = torch.zeros(
(1, 1, self.kv_head_num, self.head_dim),
device="cpu",
dtype=torch.float16,
pin_memory=True,
)
self.v_in_cpu = torch.zeros(
(1, 1, self.kv_head_num, self.head_dim),
device="cpu",
dtype=torch.float16,
pin_memory=True,
)
self.cache_seqlens_cpu = torch.empty(
(1,), device="cpu", dtype=torch.int32, pin_memory=True
)
self.cache_seqlens_cuda = torch.empty((1,), device=device, dtype=torch.int32)
self.prefix_block_table = torch.arange(
self.block_num, device="cpu", dtype=torch.int32, pin_memory=True
).view(1, -1)
self.block_table_cpu = torch.arange(
self.block_num, device="cpu", dtype=torch.int32, pin_memory=True
).view(1, -1)
# assert (
# self.local_windows_len // self.block_size + 1 + self.preselect_block_count
# <= self.block_num
# )
self.output_cpu = torch.empty(
(1, 1, self.q_head_num, self.head_dim),
device="cpu",
dtype=torch.float16,
pin_memory=True,
)
self.lse_cpu = torch.empty(
(1, 1, self.q_head_num), device="cpu", dtype=torch.float32, pin_memory=True
)
self.output_cuda = torch.empty(
(1, 1, self.q_head_num, self.head_dim), device=device, dtype=torch.float16
)
self.attn_sparsity = torch.zeros(
(1, 1, self.q_head_num), device="cpu", dtype=torch.float32, pin_memory=True
)
        if preselect_block:
self.preselect_block_table = torch.zeros(
self.layer_num,
self.preselect_block_count,
device=device,
dtype=torch.int32,
)
self.preselect_block_num = 0 # block_num before preselect
self.evict_tokens = 0
self.cpu_infer = CPUInfer(threads_num)
self.local_thread = CPUInferKVCache(
self.layer_num,
self.kv_head_num,
self.q_head_num,
self.head_dim,
self.block_size,
anchor_num=self.anchor_num,
anchor_type=anchor_type,
kv_type=self.kv_type,
retrieval_type=self.block_selection_mode,
layer_step=self.layer_step,
token_step=self.token_step,
layer_offset=self.dense_layer_num % self.layer_step,
max_batch_size=1,
max_block_num=self.block_num,
max_thread_num=self.threads_num,
)
print(
f"local_windows_len: {local_windows_len}, topk: {topk}, dense_layer_num: {dense_layer_num}, kv_type: {self.kv_type}, anchor_type: {self.anchor_type}, preselect_block: {self.preselect_block}, preselect_block_count: {self.preselect_block_count}, token_step: {self.token_step}, layer_step: {self.layer_step}"
)
self.shape_mask = (
self.q_head_num,
self.block_size,
self.block_size,
)
        # Build a per-block causal band mask: row i keeps key columns 0..i, so
        # tril_mask is the intra-block lower triangle and triu_mask its complement
        # (a toy illustration follows the constructor below).
        mask = torch.zeros(
            self.shape_mask, dtype=torch.uint8, device=device
        ).contiguous()
        elm_idx = torch.arange(self.block_size, device=device)
        for i in range(mask.size(-2)):
            # with a square mask this reduces to idx = i - elm_idx
            idx = i + mask.size(-1) - mask.size(-2) - elm_idx
            idx = idx[idx >= 0]
            mask[..., i, idx] = 1
        self.tril_mask = mask
        self.triu_mask = mask ^ 1
self.generate_token_idx = 0
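As noted in the comments above, a toy version of the band-mask construction for block_size = 4 and a single head; the values are chosen purely for illustration.

import torch

block_size = 4  # toy value
mask = torch.zeros(1, block_size, block_size, dtype=torch.uint8)
elm_idx = torch.arange(block_size)
for i in range(block_size):
    idx = i - elm_idx  # the square-mask case of i + size(-1) - size(-2) - elm_idx
    idx = idx[idx >= 0]
    mask[..., i, idx] = 1
# mask[0] (tril_mask) is the intra-block causal lower triangle:
# [[1, 0, 0, 0],
#  [1, 1, 0, 0],
#  [1, 1, 1, 0],
#  [1, 1, 1, 1]]
# mask ^ 1 (triu_mask) is its strict upper-triangular complement.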
def get_attn_score_one_block(
self,
batch_idx: int,
max_block_num: int,
query: torch.Tensor,
key: torch.Tensor,
offset: int,
width: int,
mask_mode: str | None = None,
use_softmax: bool = True,
):
importance = self.cache_importance.view(-1, self.q_head_num)
importance = importance.narrow(0, batch_idx * max_block_num + offset, width)
n_gqa_ = self.q_head_num // self.kv_head_num
for head_idx in range(self.q_head_num):
key_item = key[..., head_idx // n_gqa_, :].view(key.size(0), -1)
            qk = torch.einsum(
                "qd,kd->qk", query[:, head_idx, :], key_item
            )  # (len_q, len_k) for this query head
if mask_mode == "tril":
mask = self.tril_mask
mask = mask[0, -qk.size(-2) :, -qk.size(-1) :]
qk = qk * mask
elif mask_mode == "triu":
mask = self.triu_mask
mask = mask[0, -qk.size(-2) :, -qk.size(-1) :]
qk = qk * mask
if use_softmax:
qk = torch.nn.functional.softmax(
qk / math.sqrt(self.head_dim), dim=-1, dtype=torch.float32
).to(torch.float16)
qk = torch.sum(qk, dim=-2)
importance[...,head_idx] += qk
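Shape-wise, the loop above accumulates, per query head, how much attention mass each cached key position receives (GQA maps query head head_idx to KV head head_idx // n_gqa_). A toy restatement with made-up dimensions, float32 for portability, and the tril/triu masking omitted:

import math
import torch

q_head_num, kv_head_num, head_dim = 4, 2, 8  # toy dims
len_q, len_k = 16, 32
n_gqa = q_head_num // kv_head_num

query = torch.randn(len_q, q_head_num, head_dim)
key = torch.randn(len_k, kv_head_num, head_dim)
importance = torch.zeros(len_k, q_head_num)

for head_idx in range(q_head_num):
    key_item = key[..., head_idx // n_gqa, :].view(key.size(0), -1)   # (len_k, head_dim)
    qk = torch.einsum("qd,kd->qk", query[:, head_idx, :], key_item)   # (len_q, len_k)
    qk = torch.nn.functional.softmax(qk / math.sqrt(head_dim), dim=-1)
    importance[:, head_idx] += qk.sum(dim=-2)  # attention mass landing on each key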
def get_preselect_block_table_and_attn_score(
self,
layer_idx: int,
batch_size: int,
offset: torch.Tensor,
width: int,
query: torch.Tensor,
key: torch.Tensor,
union_with_last_layer: bool = True,
):
max_seqs_len = offset.max().item() + width
max_block_num = (max_seqs_len + self.block_size - 1) // self.block_size
for batch_idx in range(batch_size):
query_cur = query[batch_idx][-128:]
self.get_attn_score_one_block(
batch_idx,
max_block_num,
query_cur,
key[batch_idx][: offset[batch_idx].item() + width],
0,
offset[batch_idx].item() + width,
mask_mode=None,
)
if self.preselect_block:
self.prefill_block_num = max(
0, max_block_num - self.local_windows_len // self.block_size
)
self.evict_tokens = (
max(self.prefill_block_num - self.preselect_block_count, 0)
* self.block_size
)
if self.prefill_block_num != 0:
importance_cache = self.cache_importance.narrow(
0, 0, self.prefill_block_num * batch_size
).view(
batch_size, self.prefill_block_num, self.block_size, self.q_head_num
)
importance_r = importance_cache[:, 1:, : self.block_size // 4]
pad_r = torch.zeros_like(importance_r[:, :1])
importance_r = torch.cat((importance_r, pad_r), dim=1)
importance_l = importance_cache[:, :-1, -self.block_size // 4 :]
pad_l = torch.zeros_like(importance_l[:, :1])
importance_l = torch.cat((pad_l, importance_l), dim=1)
importance = torch.cat(
(importance_l, importance_cache, importance_r), dim=2
)
importance = importance.mean(dim=-1)
importance = importance.mean(dim=-1)
# importance: (batch_size, max_block_num)
topk = min(self.preselect_block_count, self.prefill_block_num)
values, indices = torch.topk(
importance,
k=topk,
dim=1,
)
self.preselect_block_table[
layer_idx : layer_idx + 1,
:topk,
].copy_(indices)
                # NOTE: hard-coded for a 32-layer model; runs the union step on the last layer
                if union_with_last_layer and layer_idx == 31:
for tmp_layer_idx in range(self.layer_num - 1):
for i in range(1, min(topk, 6)):
x = self.preselect_block_table[-1, i]
if x not in self.preselect_block_table[tmp_layer_idx]:
self.preselect_block_table[tmp_layer_idx, topk - i] = x
if self.anchor_type == "DYNAMIC":
importance_cache = self.cache_importance.narrow(
0, 0, max_block_num * batch_size
).view(batch_size, max_block_num * self.block_size, self.q_head_num)
importance_cache_cpu = torch.empty_like(
importance_cache, device="cpu", pin_memory=True
)
importance_cache_cpu.copy_(importance_cache)
block_table_cpu = self.prefix_block_table[:, :max_block_num].to("cpu")
offset_cpu = offset.contiguous().to("cpu")
self.cpu_infer.submit(
self.local_thread.update_importance(
importance_cache_cpu,
layer_idx,
block_table_cpu,
max_block_num,
offset_cpu,
width,
)
)
self.cpu_infer.sync()
importance_cache = self.cache_importance.narrow(
0, 0, max_block_num * batch_size
).view(batch_size, max_block_num * self.block_size, self.q_head_num)
importance_cache.zero_()
# key: [bsz, past_len, head_num, head_dim] float16
# query: [bsz, q_len, q_head_num, head_dim] float16
def get_attn_score(
self,
layer_idx: int,
batch_size: int,
offset: torch.Tensor,
width: int,
query: torch.Tensor,
key: torch.Tensor,
):
max_seqs_len = offset.max().item() + width
max_block_num = (max_seqs_len + self.block_size - 1) // self.block_size
for batch_idx in range(batch_size):
for idx in range(width // self.block_size):
offset_cur = idx * self.block_size
query_cur = query[batch_idx, offset_cur : offset_cur + self.block_size]
self.get_attn_score_one_block(
batch_idx,
max_block_num,
query_cur,
key[
batch_idx,
offset[batch_idx]
+ offset_cur : offset[batch_idx]
+ offset_cur
+ self.block_size,
],
offset[batch_idx].item() + offset_cur,
self.block_size,
mask_mode="tril",
use_softmax=False,
)
offset_key = (
offset[batch_idx].item()
+ idx * self.block_size
- self.local_windows_len
)
if offset_key >= 0:
self.get_attn_score_one_block(
batch_idx,
max_block_num,
query_cur,
key[batch_idx, offset_key : offset_key + self.block_size],
offset_key,
self.block_size,
mask_mode="triu",
use_softmax=False,
)
offset_key = max(0, offset_key + self.block_size)
width_key = (
offset[batch_idx].item() + idx * self.block_size - offset_key
)
if width_key > 0:
self.get_attn_score_one_block(
batch_idx,
max_block_num,
query_cur,
key[batch_idx, offset_key : offset_key + width_key],
offset_key,
width_key,
mask_mode=None,
use_softmax=False,
)
importance_cache = self.cache_importance.narrow(
0, 0, max_block_num * batch_size
).view(batch_size, max_block_num * self.block_size, self.q_head_num)
importance_cache_cpu = torch.empty_like(
importance_cache, device="cpu", pin_memory=True
)
importance_cache_cpu.copy_(importance_cache)
block_table_cpu = self.prefix_block_table[:, :max_block_num].to("cpu")
offset_cpu = offset.contiguous().to("cpu")
self.cpu_infer.submit(
self.local_thread.update_importance(
importance_cache_cpu,
layer_idx,
block_table_cpu,
max_block_num,
offset_cpu,
width,
)
)
self.cpu_infer.sync()
importance_cache.zero_()
# key: [bsz, q_len, head_num, head_dim] float16
# value: [bsz, q_len, head_num, head_dim] float16
def swap_in_and_swap_out(self, layer_idx, past_len, q_len, key, value):
batch_size = 1
max_seqs_len = past_len.max().item() + q_len
max_block_num = (max_seqs_len + self.block_size - 1) // self.block_size
k_cache = self.cache_key_states.narrow(0, 0, max_block_num * batch_size).view(
batch_size, max_block_num * self.block_size, self.kv_head_num, self.head_dim
)
v_cache = self.cache_value_states.narrow(0, 0, max_block_num * batch_size).view(
batch_size, max_block_num * self.block_size, self.kv_head_num, self.head_dim
)
for batch_idx in range(batch_size):
offset = past_len[batch_idx]
width = q_len
k_cache[batch_idx][offset : offset + width].copy_(
key[batch_idx].view(-1, self.kv_head_num, self.head_dim)
)
v_cache[batch_idx][offset : offset + width].copy_(
value[batch_idx].view(-1, self.kv_head_num, self.head_dim)
)
k_cache_cpu = torch.empty_like(k_cache, device="cpu", pin_memory=True)
v_cache_cpu = torch.empty_like(v_cache, device="cpu", pin_memory=True)
k_cache_cpu.copy_(k_cache)
v_cache_cpu.copy_(v_cache)
cur_block_num = (
q_len + past_len[0].item() + self.block_size - 1
) // self.block_size
block_table_cpu = self.prefix_block_table[:, :cur_block_num].to("cpu")
past_len_cpu = past_len.contiguous().to("cpu")
self.cpu_infer.submit(
self.local_thread.get_and_update_kvcache_fp16(
k_cache_cpu,
v_cache_cpu,
layer_idx,
block_table_cpu,
max_block_num,
past_len_cpu,
q_len,
)
)
self.cpu_infer.sync()
k_cache.copy_(k_cache_cpu)
v_cache.copy_(v_cache_cpu)
return k_cache, v_cache
def calc_anchor(self, cache_seqlens: int):
cur_block_num = (cache_seqlens + self.block_size - 1) // self.block_size
block_table_cpu = self.prefix_block_table[:, :cur_block_num].to("cpu")
cache_seqlens_cpu = torch.tensor(
[cache_seqlens], device="cpu", dtype=torch.int32
)
self.cpu_infer.submit(
self.local_thread.calc_anchor_all_layers(
block_table_cpu,
cache_seqlens_cpu,
)
)
self.cpu_infer.sync()
def clear_importance(self, cache_seqlens: int):
print(f"clear importance: {cache_seqlens}")
cur_block_num = (cache_seqlens + self.block_size - 1) // self.block_size
block_table_cpu = self.prefix_block_table[:, :cur_block_num].to("cpu")
cache_seqlens_cpu = torch.tensor(
[cache_seqlens], device="cpu", dtype=torch.int32
)
self.cpu_infer.submit(
self.local_thread.clear_importance_all_layers(
block_table_cpu,
cache_seqlens_cpu,
)
)
self.cpu_infer.sync()
def clear_kvcache(self, cache_seqlens: int):
cur_block_num = (cache_seqlens + self.block_size - 1) // self.block_size
block_table_cpu = self.prefix_block_table[:, :cur_block_num].to("cpu")
cache_seqlens_cpu = torch.tensor(
[cache_seqlens], device="cpu", dtype=torch.int32
)
self.cpu_infer.submit(
self.local_thread.clear_kvcache_all_layers(
block_table_cpu,
cache_seqlens_cpu,
)
)
self.cpu_infer.sync()
def get_attn_sparsity(
self,
q_in: torch.Tensor,
layer_idx: int,
block_table: torch.Tensor,
cache_seqlens: torch.Tensor,
block_table_origin: torch.Tensor,
cache_seqlens_origin: torch.Tensor,
generate_token_idx: int = 0,
topk: int | None = None,
local: int | None = None,
output_path: str = "./attn_sparsity.json",
):
self.attn_sparsity.zero_()
        self.cpu_infer.submit(
self.local_thread.get_attn_sparsity(
q_in,
self.attn_sparsity,
layer_idx,
block_table,
cache_seqlens,
block_table_origin,
cache_seqlens_origin,
generate_token_idx,
topk,
local,
)
)
self.cpu_infer.sync()
with open(output_path, "a") as file:
for head_idx in range(self.q_head_num):
sparsity = self.attn_sparsity[0][0][head_idx].item()
json_obj = {
"token_idx": generate_token_idx,
"layer_idx": layer_idx,
"head_idx": head_idx,
"sparsity": sparsity,
}
json.dump(json_obj, file)
file.write("\n")
def apply(
self,
layer_idx: int,
bsz: int,
past_len: int,
query_states: torch.Tensor,
key_states: torch.Tensor,
value_states: torch.Tensor,
mode: str = "prefill",
generate_token_idx: int = -1,
):
# key_states: [bsz, q_len, kv_head_num, head_dim]
# value_states: [bsz, q_len, kv_head_num, head_dim]
# query_states: [bsz, q_len, q_head_num, head_dim]
assert query_states.dtype == torch.float16
assert key_states.dtype == torch.float16
assert value_states.dtype == torch.float16
assert key_states.size(2) == self.kv_head_num
assert value_states.size(2) == self.kv_head_num
assert query_states.size(2) == self.q_head_num
q_len = query_states.size(1)
batch_size = query_states.size(0)
self.cache_seqlens_cuda.fill_(past_len)
last_chunk = False
if self.remaining_length <= self.prefill_chunk_size and q_len != 1:
last_chunk = True
device = query_states.device
if layer_idx == 0:
if q_len == 1:
self.generate_token_idx += 1
elif last_chunk:
self.generate_token_idx = -1
if mode == "prefill":
key, value = self.swap_in_and_swap_out(
layer_idx,
self.cache_seqlens_cuda,
q_len,
key_states,
value_states,
)
if last_chunk and (self.anchor_type == "DYNAMIC" or self.preselect_block):
self.get_preselect_block_table_and_attn_score(
layer_idx,
bsz,
self.cache_seqlens_cuda,
q_len,
query_states,
key,
)
output = flash_attn_with_kvcache(
q=query_states,
k_cache=key,
v_cache=value,
cache_seqlens=self.cache_seqlens_cuda + q_len,
causal=True,
)
return output.transpose(1, 2)
elif mode == "generate":
assert self.generate_token_idx >= 0
self.q_in_cpu.copy_(query_states, non_blocking=True)
self.k_in_cpu.copy_(key_states, non_blocking=True)
self.v_in_cpu.copy_(value_states, non_blocking=True)
self.cache_seqlens_cpu.copy_(self.cache_seqlens_cuda, non_blocking=True)
# print(layer_idx)
if layer_idx < self.dense_layer_num:
self.block_table_cpu.copy_(self.prefix_block_table, non_blocking=True)
self.cpu_infer.submit_with_cuda_stream(
torch.cuda.current_stream("cuda").cuda_stream,
self.local_thread.attn_with_kvcache(
q_in=self.q_in_cpu,
k_in=self.k_in_cpu,
v_in=self.v_in_cpu,
output=self.output_cpu,
attn_lse=self.lse_cpu,
layer_idx=layer_idx,
block_table=self.block_table_cpu,
cache_seqlens=self.cache_seqlens_cpu,
),
)
else:
if self.preselect_block:
self.cache_seqlens_cpu.copy_(
self.cache_seqlens_cuda - self.evict_tokens, non_blocking=True
)
if self.preselect_block_count < self.prefill_block_num:
self.block_table_cpu[:, : self.preselect_block_count].copy_(
self.preselect_block_table[layer_idx : layer_idx + 1],
non_blocking=True,
)
self.block_table_cpu[
:,
self.preselect_block_count : self.preselect_block_count
+ self.local_block_num,
].copy_(
self.prefix_block_table[
:,
self.prefill_block_num : self.prefill_block_num
+ self.local_block_num,
],
non_blocking=True,
)
# print("submit_with_cuda_stream")
self.cpu_infer.submit_with_cuda_stream(
torch.cuda.current_stream("cuda").cuda_stream,
self.local_thread.attn_with_kvcache(
q_in=self.q_in_cpu,
k_in=self.k_in_cpu,
v_in=self.v_in_cpu,
output=self.output_cpu,
attn_lse=self.lse_cpu,
layer_idx=layer_idx,
generate_token_idx=self.generate_token_idx,
block_table=self.block_table_cpu,
cache_seqlens=self.cache_seqlens_cpu,
topk=(
self.topk
if self.topk <= self.preselect_block_count
else None
),
local=self.local_windows_len // self.block_size,
),
)
# print("submit_with_cuda_stream enqueue\n")
else:
self.block_table_cpu.copy_(
self.prefix_block_table, non_blocking=True
)
self.cpu_infer.submit_with_cuda_stream(
torch.cuda.current_stream("cuda").cuda_stream,
self.local_thread.attn_with_kvcache(
q_in=self.q_in_cpu,
k_in=self.k_in_cpu,
v_in=self.v_in_cpu,
output=self.output_cpu,
attn_lse=self.lse_cpu,
layer_idx=layer_idx,
generate_token_idx=self.generate_token_idx,
block_table=self.block_table_cpu,
cache_seqlens=self.cache_seqlens_cpu,
topk=self.topk,
local=self.local_windows_len // self.block_size,
),
)
self.cpu_infer.sync_with_cuda_stream(
torch.cuda.current_stream("cuda").cuda_stream
)
# print("submit_with_cuda_stream finished\n")
self.output_cuda.copy_(self.output_cpu, non_blocking=True)
return self.output_cuda.transpose(1, 2)
def save(self, path: str, length: int):
cur_block_num = (length + self.block_size - 1) // self.block_size
block_table_cpu = self.prefix_block_table[0, :cur_block_num].to("cpu")
cache_seqlens_cpu = torch.tensor([length], device="cpu", dtype=torch.int32)
self.cpu_infer.submit(
self.local_thread.dump_kvcache(
block_table_cpu,
cache_seqlens_cpu,
path,
)
)
self.cpu_infer.sync()
def load(self, path: str, length: int):
self.cpu_infer.submit(
self.local_thread.load_kvcache(
path,
)
)
self.cpu_infer.sync()
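Putting the pieces together, a hypothetical driver for DynamicScaledDotProductAttention. The checkpoint name, dimensions and hyper-parameters are placeholders, and a CUDA device plus the compiled cpuinfer_ext and flash_attn backends are assumed to be available.

import torch
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("meta-llama/Meta-Llama-3-8B")  # assumed checkpoint
head_dim = cfg.hidden_size // cfg.num_attention_heads

sdpa = DynamicScaledDotProductAttention(
    max_seq_len=131072, block_size=128, config=cfg, device=torch.device("cuda"),
    local_windows_len=4096, topk=16, threads_num=48,
    dense_layer_num=2, preselect_block=True, preselect_block_count=96,
    prefill_chunk_size=8192,
)

# Prefill one 8192-token chunk; shapes are [bsz, q_len, heads, head_dim], fp16, on CUDA.
q = torch.randn(1, 8192, cfg.num_attention_heads, head_dim, device="cuda", dtype=torch.float16)
k = torch.randn(1, 8192, cfg.num_key_value_heads, head_dim, device="cuda", dtype=torch.float16)
v = torch.randn_like(k)
out = sdpa.apply(layer_idx=0, bsz=1, past_len=0,
                 query_states=q, key_states=k, value_states=v, mode="prefill")

# Single-token decode against the CPU-resident KV cache.
q1, k1, v1 = (t[:, :1].contiguous() for t in (q, k, v))
out = sdpa.apply(layer_idx=0, bsz=1, past_len=8192,
                 query_states=q1, key_states=k1, value_states=v1, mode="generate")

In the model integration that follows, this is driven once per decoder layer for each prefill chunk and each generated token.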

View file

@ -6,7 +6,7 @@ Author : Azure-Tang, Boxin Zhang, chenht2022
Date : 2024-07-25 11:25:24
Version : 0.1.0
LastEditors : Azure
LastEditTime : 2024-08-15 02:36:29
LastEditTime : 2024-08-27 03:50:23
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
@ -436,7 +436,7 @@ class KExpertsTorch(KExpertsBase):
final_hidden_states.index_add_(0, top_x, current_hidden_states)
return final_hidden_states.to(org_dtype, device=org_device)
return final_hidden_states.to(dtype=org_dtype, device=org_device)
EXPERTS_MAP = {
"KExpertsCPU": KExpertsCPU,

View file

@ -1,14 +1,14 @@
#!/usr/bin/env python
# coding=utf-8
'''
"""
Description :
Author : Azure-Tang
Date : 2024-07-25 11:25:24
Version : 1.0.0
LastEditors : Azure
LastEditTime : 2024-08-14 14:53:05
LastEditTime : 2024-08-27 07:29:04
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
"""
import inspect
import math
@ -19,7 +19,10 @@ import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ktransformers.operators.dynamic_attention import DynamicScaledDotProductAttention
from ktransformers.server.config.config import Config
import os
import yaml
from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache, StaticCache
from transformers.modeling_attn_mask_utils import (
@ -40,19 +43,35 @@ from transformers.utils import (
logging,
replace_return_docstrings,
)
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock, Qwen2MoeMLP, Qwen2MoeDecoderLayer
from ktransformers.models.modeling_deepseek import BaseModelOutputWithPast, DeepseekV2DecoderLayer, DeepseekV2MoE
from ktransformers.models.modeling_qwen2_moe import (
Qwen2MoeSparseMoeBlock,
Qwen2MoeMLP,
Qwen2MoeDecoderLayer,
)
from ktransformers.models.modeling_deepseek import (
BaseModelOutputWithPast,
DeepseekV2DecoderLayer,
DeepseekV2MoE,
)
from transformers.models.qwen2_moe.configuration_qwen2_moe import Qwen2MoeConfig
from ktransformers.models.configuration_llama import LlamaConfig
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.utils import InferenceState
from ktransformers.util.custom_gguf import GGUFLoader
from transformers.configuration_utils import PretrainedConfig
from ktransformers.models.modeling_llama import (
LlamaDecoderLayer,
LlamaRMSNorm,
LlamaRotaryEmbedding,
)
if is_flash_attn_2_available():
from flash_attn import flash_attn_func, flash_attn_varlen_func
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
_flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
_flash_supports_window_size = "window_size" in list(
inspect.signature(flash_attn_func).parameters
)
logger = logging.get_logger(__name__)
@ -151,6 +170,7 @@ QWEN2MOE_INPUTS_DOCSTRING = r"""
the complete sequence length.
"""
@add_start_docstrings(
"The bare Qwen2MoE Model outputting raw hidden-states without any specific head on top.",
QWEN2MOE_START_DOCSTRING,
@ -162,18 +182,21 @@ class KQwen2MoeModel(BaseInjectedModule):
Args:
config: Qwen2MoeConfig
"""
def __init__(
self,
key: str,
gguf_loader : GGUFLoader,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
per_layer_prefill_intput_threshold: int = 30000, # if None, no per-layer prefill
per_layer_prefill_intput_threshold: int = 30000, # if None, no per-layer prefill
transfer_map: dict = None,
**kwargs,
):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
BaseInjectedModule.__init__(
self, key, gguf_loader, config, orig_module, device, **kwargs
)
self.per_layer_prefill_intput_threshold = per_layer_prefill_intput_threshold
self.transfer_map = transfer_map
self.stream_device_map = dict()
@ -192,29 +215,47 @@ class KQwen2MoeModel(BaseInjectedModule):
output_router_logits: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
per_layer_prefill_intput_threshold: int | None = None, # if None or 0, close per-layer prefill
per_layer_prefill_intput_threshold: (
int | None
) = None, # if None or 0, close per-layer prefill
) -> Union[Tuple, MoeModelOutputWithPast]:
# print(f'Total length of input_ids: {input_ids.size(1)}, {input_ids.size()}')
if per_layer_prefill_intput_threshold is None: per_layer_prefill_intput_threshold = self.per_layer_prefill_intput_threshold
if per_layer_prefill_intput_threshold is None:
per_layer_prefill_intput_threshold = self.per_layer_prefill_intput_threshold
per_layer_prefill_flag = False
seq_lenth = inputs_embeds.size(1) if inputs_embeds is not None else input_ids.size(1)
if per_layer_prefill_intput_threshold and per_layer_prefill_intput_threshold < seq_lenth:
seq_lenth = (
inputs_embeds.size(1) if inputs_embeds is not None else input_ids.size(1)
)
if (
per_layer_prefill_intput_threshold
and per_layer_prefill_intput_threshold < seq_lenth
):
per_layer_prefill_flag = True
for layer in self.layers:
self.load_layer_to(layer, InferenceState.UNLOAD)
else:
pass
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_attentions = (
output_attentions
if output_attentions is not None
else self.config.output_attentions
)
output_router_logits = (
output_router_logits if output_router_logits is not None else self.config.output_router_logits
output_router_logits
if output_router_logits is not None
else self.config.output_router_logits
)
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
output_hidden_states
if output_hidden_states is not None
else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
return_dict = (
return_dict if return_dict is not None else self.config.use_return_dict
)
if (input_ids is None) ^ (inputs_embeds is not None):
raise ValueError(
@ -243,15 +284,23 @@ class KQwen2MoeModel(BaseInjectedModule):
inputs_embeds = inputs_embeds.to("cuda")
if cache_position is None:
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
past_seen_tokens = (
past_key_values.get_seq_length() if past_key_values is not None else 0
)
cache_position = torch.arange(
past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
past_seen_tokens,
past_seen_tokens + inputs_embeds.shape[1],
device=inputs_embeds.device,
)
if position_ids is None:
position_ids = cache_position.unsqueeze(0)
causal_mask = self._update_causal_mask(
attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
attention_mask,
inputs_embeds,
cache_position,
past_key_values,
output_attentions,
)
hidden_states = inputs_embeds
@ -263,7 +312,7 @@ class KQwen2MoeModel(BaseInjectedModule):
next_decoder_cache = None
for i, decoder_layer in enumerate(self.layers):
if self.transfer_map is not None and i in self.transfer_map:
if self.transfer_map is not None and i in self.transfer_map:
prev_stream = torch.cuda.current_stream()
cur_device = self.transfer_map[i]
if cur_device not in self.stream_device_map:
@ -271,11 +320,25 @@ class KQwen2MoeModel(BaseInjectedModule):
torch.cuda.set_device(cur_device)
self.stream_device_map[cur_device].wait_stream(prev_stream)
torch.cuda.set_stream(self.stream_device_map[cur_device])
hidden_states = hidden_states.to(self.transfer_map[i], non_blocking = True)
causal_mask = causal_mask.to(self.transfer_map[i], non_blocking = True) if causal_mask is not None else None
position_ids = position_ids.to(self.transfer_map[i], non_blocking = True) if position_ids is not None else None
cache_position = cache_position.to(self.transfer_map[i], non_blocking = True) if cache_position is not None else None
hidden_states = hidden_states.to(
self.transfer_map[i], non_blocking=True
)
causal_mask = (
causal_mask.to(self.transfer_map[i], non_blocking=True)
if causal_mask is not None
else None
)
position_ids = (
position_ids.to(self.transfer_map[i], non_blocking=True)
if position_ids is not None
else None
)
cache_position = (
cache_position.to(self.transfer_map[i], non_blocking=True)
if cache_position is not None
else None
)
if output_hidden_states:
all_hidden_states += (hidden_states,)
@ -323,7 +386,6 @@ class KQwen2MoeModel(BaseInjectedModule):
hidden_states = self.norm(hidden_states)
if per_layer_prefill_flag:
per_layer_prefill_flag = False
for layer in self.layers:
@ -333,12 +395,22 @@ class KQwen2MoeModel(BaseInjectedModule):
next_cache = None
if use_cache:
next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
next_cache = (
next_decoder_cache.to_legacy_cache()
if use_legacy_cache
else next_decoder_cache
)
if not return_dict:
return tuple(
v
for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits]
for v in [
hidden_states,
next_cache,
all_hidden_states,
all_self_attns,
all_router_logits,
]
if v is not None
)
return MoeModelOutputWithPast(
@ -349,11 +421,13 @@ class KQwen2MoeModel(BaseInjectedModule):
router_logits=all_router_logits,
)
def load_layer_to(self, layer:Qwen2MoeDecoderLayer, target: InferenceState):
assert isinstance(layer, Qwen2MoeDecoderLayer), "module should be nn.ModuleList of decoder layers"
def load_layer_to(self, layer: Qwen2MoeDecoderLayer, target: InferenceState):
assert isinstance(
layer, Qwen2MoeDecoderLayer
), "module should be nn.ModuleList of decoder layers"
# TODO Support restore to original device, not only cuda
device = "cpu" if target == InferenceState.UNLOAD else "cuda"
device = "cpu" if target == InferenceState.UNLOAD else "cuda"
# attn
layer.self_attn.q_proj.set_inference_mode(target)
@ -458,18 +532,21 @@ class KDeepseekV2Model(BaseInjectedModule):
Args:
config: DeepseekV2Config
"""
def __init__(
self,
key: str,
gguf_loader : GGUFLoader,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
per_layer_prefill_intput_threshold: int = 30000, # if None, no per-layer prefill
per_layer_prefill_intput_threshold: int = 30000, # if None, no per-layer prefill
transfer_map: dict = None,
**kwargs,
):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
BaseInjectedModule.__init__(
self, key, gguf_loader, config, orig_module, device, **kwargs
)
self.per_layer_prefill_intput_threshold = per_layer_prefill_intput_threshold
self.transfer_map = transfer_map
self.stream_device_map = dict()
@ -487,15 +564,23 @@ class KDeepseekV2Model(BaseInjectedModule):
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
per_layer_prefill_intput_threshold: int | None = None, # if None, no per-layer prefill
per_layer_prefill_intput_threshold: (
int | None
) = None, # if None, no per-layer prefill
) -> Union[Tuple, BaseModelOutputWithPast]:
if per_layer_prefill_intput_threshold is None: per_layer_prefill_intput_threshold = self.per_layer_prefill_intput_threshold
if per_layer_prefill_intput_threshold is None:
per_layer_prefill_intput_threshold = self.per_layer_prefill_intput_threshold
per_layer_prefill_flag = False
seq_lenth = inputs_embeds.size(1) if inputs_embeds is not None else input_ids.size(1)
if per_layer_prefill_intput_threshold and per_layer_prefill_intput_threshold < seq_lenth:
seq_lenth = (
inputs_embeds.size(1) if inputs_embeds is not None else input_ids.size(1)
)
if (
per_layer_prefill_intput_threshold
and per_layer_prefill_intput_threshold < seq_lenth
):
per_layer_prefill_flag = True
for layer in self.layers:
self.load_layer_to(layer, InferenceState.UNLOAD)
self.load_layer_to(layer, InferenceState.UNLOAD)
torch.cuda.empty_cache()
else:
pass
@ -542,9 +627,13 @@ class KDeepseekV2Model(BaseInjectedModule):
past_key_values_length = past_key_values.get_usable_length(seq_length)
if cache_position is None:
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
past_seen_tokens = (
past_key_values.get_seq_length() if past_key_values is not None else 0
)
cache_position = torch.arange(
past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
past_seen_tokens,
past_seen_tokens + inputs_embeds.shape[1],
device=inputs_embeds.device,
)
if position_ids is None:
@ -556,15 +645,17 @@ class KDeepseekV2Model(BaseInjectedModule):
inputs_embeds = self.embed_tokens(input_ids)
input_ids = input_ids.to(org_device)
causal_mask = self._update_causal_mask(
attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
)
if per_layer_prefill_flag:
causal_mask = None
else:
causal_mask = self._update_causal_mask(
attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
)
# embed positions
hidden_states = inputs_embeds
if per_layer_prefill_flag:
print(f'Total length of input_ids: {hidden_states.size(1)}')
print(f"Total length of input_ids: {hidden_states.size(1)}")
# decoder layers
all_hidden_states = () if output_hidden_states else None
@ -576,7 +667,7 @@ class KDeepseekV2Model(BaseInjectedModule):
t_f = 0
for i, decoder_layer in enumerate(self.layers):
if self.transfer_map is not None and i in self.transfer_map:
if self.transfer_map is not None and i in self.transfer_map:
prev_stream = torch.cuda.current_stream()
cur_device = self.transfer_map[i]
if cur_device not in self.stream_device_map:
@ -584,10 +675,24 @@ class KDeepseekV2Model(BaseInjectedModule):
torch.cuda.set_device(cur_device)
self.stream_device_map[cur_device].wait_stream(prev_stream)
torch.cuda.set_stream(self.stream_device_map[cur_device])
hidden_states = hidden_states.to(self.transfer_map[i], non_blocking = True)
causal_mask = causal_mask.to(self.transfer_map[i], non_blocking = True) if causal_mask is not None else None
position_ids = position_ids.to(self.transfer_map[i], non_blocking = True) if position_ids is not None else None
cache_position = cache_position.to(self.transfer_map[i], non_blocking = True) if cache_position is not None else None
hidden_states = hidden_states.to(
self.transfer_map[i], non_blocking=True
)
causal_mask = (
causal_mask.to(self.transfer_map[i], non_blocking=True)
if causal_mask is not None
else None
)
position_ids = (
position_ids.to(self.transfer_map[i], non_blocking=True)
if position_ids is not None
else None
)
cache_position = (
cache_position.to(self.transfer_map[i], non_blocking=True)
if cache_position is not None
else None
)
if output_hidden_states:
all_hidden_states += (hidden_states,)
@ -622,12 +727,12 @@ class KDeepseekV2Model(BaseInjectedModule):
t5 = time.time()
if per_layer_prefill_flag:
# print(f"to cpu")
self.load_layer_to(decoder_layer, InferenceState.UNLOAD)
self.load_layer_to(decoder_layer, InferenceState.UNLOAD)
torch.cuda.empty_cache()
t6 = time.time()
t_gpu += t4-t3
t_cpu += t6-t5
t_f += t5-t4
t_gpu += t4 - t3
t_cpu += t6 - t5
t_f += t5 - t4
hidden_states = layer_outputs[0]
@ -648,7 +753,9 @@ class KDeepseekV2Model(BaseInjectedModule):
torch.cuda.empty_cache()
t7 = time.time()
print(f"total time: {t7-t3}, \n layer num{len(self.layers)}, gpu time: {t_gpu}, cpu time: {t_cpu}, forward time: {t_f}, restore time: {t7-t6}")
print(
f"total time: {t7-t3}, \n layer num{len(self.layers)}, gpu time: {t_gpu}, cpu time: {t_cpu}, forward time: {t_f}, restore time: {t7-t6}"
)
# add hidden states from the last decoder layer
if output_hidden_states:
@ -674,16 +781,18 @@ class KDeepseekV2Model(BaseInjectedModule):
attentions=all_self_attns,
)
def load_layer_to(self, layer: DeepseekV2DecoderLayer, target: InferenceState):
assert isinstance(layer, DeepseekV2DecoderLayer), "module should be nn.ModuleList of decoder layers"
def load_layer_to(self, layer: DeepseekV2DecoderLayer, target: InferenceState):
assert isinstance(
layer, DeepseekV2DecoderLayer
), "module should be nn.ModuleList of decoder layers"
# TODO Support restore to original device, not only cuda
device = "cpu" if target == InferenceState.UNLOAD else "cuda"
device = "cpu" if target == InferenceState.UNLOAD else "cuda"
# TODO Support DFS to auto use {to, set_inference_mode} according to the module type
# attn
layer.self_attn.to(device) #
layer.self_attn.to(device) #
# mlp
if isinstance(layer.mlp, DeepseekV2MoE):
@ -702,3 +811,526 @@ class KDeepseekV2Model(BaseInjectedModule):
# layer norm
layer.input_layernorm.to(device)
layer.post_attention_layernorm.to(device)
LLAMA_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`LlamaConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
LLAMA_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
`past_key_values`).
If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
information on the default strategy.
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.n_positions - 1]`.
[What are position IDs?](../glossary#position-ids)
past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
Two formats are allowed:
- a [`~cache_utils.Cache`] instance;
- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
cache format.
The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
legacy cache format will be returned.
If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
of shape `(batch_size, sequence_length)`.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
the complete sequence length.
"""
@add_start_docstrings(
"The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
LLAMA_START_DOCSTRING,
)
class LlamaPreTrainedModel(PreTrainedModel):
config_class = LlamaConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["LlamaDecoderLayer"]
_skip_keys_device_placement = ["past_key_values"]
_supports_flash_attn_2 = True
_supports_sdpa = True
_supports_cache_class = True
_supports_quantized_cache = True
_supports_static_cache = True
def _init_weights(self, module):
std = self.config.initializer_range
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
class KLlamaModel(BaseInjectedModule):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
Args:
config: LlamaConfig
"""
dynamic_sdpa = None
def __init__(
self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
per_layer_prefill_intput_threshold: int = 30000, # if None, no per-layer prefill
transfer_map: dict = None,
**kwargs,
):
BaseInjectedModule.__init__(
self, key, gguf_loader, config, orig_module, device, **kwargs
)
self.per_layer_prefill_intput_threshold = per_layer_prefill_intput_threshold
self.transfer_map = transfer_map
self.stream_device_map = dict()
        user_path: str = os.path.expanduser("~")
        localstore_path: str = os.path.join(user_path, ".ktransformers")
        config_path: str = os.path.join(localstore_path, Config.CONFIG_FILE_NAME)
        with open(config_path, "r") as file:
config_yaml = yaml.safe_load(file.read())
self.long_context_config = config_yaml.get("long_context")
self.ext_config = config_yaml.get("ext")
KLlamaModel.dynamic_sdpa = DynamicScaledDotProductAttention(
max_seq_len=self.long_context_config["max_seq_len"],
block_size=self.long_context_config["block_size"],
config=config,
device=torch.device("cuda"),
local_windows_len=self.long_context_config["local_windows_len"],
topk=self.long_context_config["second_select_num"],
threads_num=self.ext_config["cpu_infer"],
anchor_type=self.long_context_config["anchor_type"],
kv_type=self.long_context_config["kv_type"],
dense_layer_num=self.long_context_config["dense_layer_num"],
anchor_num=self.long_context_config["anchor_num"],
preselect_block=self.long_context_config["preselect_block"],
block_selection_mode=self.long_context_config["head_select_mode"],
preselect_block_count=self.long_context_config["preselect_block_count"],
layer_step=self.long_context_config["layer_step"],
token_step=self.long_context_config["token_step"],
prefill_chunk_size=self.long_context_config["chunk_size"],
use_attn_sparsity=False,
)
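For reference, a sketch of the structure that yaml.safe_load is expected to return from the ~/.ktransformers config read above. Only the keys consumed by this constructor are shown, and every value is a placeholder, not a recommended setting.

config_yaml = {
    "long_context": {
        "max_seq_len": 131072,
        "block_size": 128,
        "local_windows_len": 4096,
        "second_select_num": 16,       # becomes `topk`
        "anchor_type": "DYNAMIC",
        "kv_type": "FP16",
        "dense_layer_num": 2,
        "anchor_num": 1,
        "preselect_block": True,
        "head_select_mode": "SHARED",  # becomes `block_selection_mode`
        "preselect_block_count": 96,
        "layer_step": 1,
        "token_step": 1,
        "chunk_size": 8192,            # becomes `prefill_chunk_size`
    },
    "ext": {"cpu_infer": 48},
}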
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
@add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
) -> Union[Tuple, BaseModelOutputWithPast]:
output_attentions = (
output_attentions
if output_attentions is not None
else self.config.output_attentions
)
output_hidden_states = (
output_hidden_states
if output_hidden_states is not None
else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = (
return_dict if return_dict is not None else self.config.use_return_dict
)
if (input_ids is None) ^ (inputs_embeds is not None):
raise ValueError(
"You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
)
if self.gradient_checkpointing and self.training and use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
)
use_cache = False
return_legacy_cache = False
if (
use_cache and not isinstance(past_key_values, Cache) and not self.training
): # kept for BC (non `Cache` `past_key_values` inputs)
return_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
"We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
)
if inputs_embeds is None:
# token embeddings are kept on CPU; prefill chunks are moved to the GPU on demand
inputs_embeds = self.embed_tokens(input_ids.to("cpu"))
if cache_position is None:
past_seen_tokens = (
past_key_values.get_seq_length() if past_key_values is not None else 0
)
cache_position = torch.arange(
past_seen_tokens,
past_seen_tokens + inputs_embeds.shape[1],
device="cuda",
)
if position_ids is None:
position_ids = cache_position.unsqueeze(0)
causal_mask = None
chunk_size = self.long_context_config["chunk_size"]
cur_idx = 0
q_len = cache_position.size(0)
# generate
if q_len == 1:
x = inputs_embeds[:, -1:, :]
position_ids = position_ids[:, -1:]
return self.forward_chunk(
x,
causal_mask,
position_ids,
past_key_values,
output_attentions,
use_cache,
cache_position,
output_hidden_states,
return_dict,
)
elif q_len <= chunk_size:
inputs_embeds = inputs_embeds.to('cuda')
output = self.forward_chunk(
inputs_embeds,
causal_mask,
position_ids,
past_key_values,
output_attentions,
use_cache,
cache_position,
output_hidden_states,
return_dict,
)
KLlamaModel.dynamic_sdpa.calc_anchor(cache_position[-1] + 1)
KLlamaModel.dynamic_sdpa.clear_importance(cache_position[-1] + 1)
return output
assert not output_attentions, "output_attentions is not supported when using chunked attention"
attn_output = None
# prefill: walk the prompt in fixed-size chunks to bound peak GPU memory
KLlamaModel.dynamic_sdpa.remaining_length = q_len
while cur_idx < q_len:
print(f'current prefill length: {cur_idx}/{q_len}')
chunk_mask = None
chunk_end = min(cur_idx + chunk_size, q_len)
if inputs_embeds.device.type == 'cpu':
tmp_inputs_embeds = inputs_embeds[:, cur_idx : chunk_end].to("cuda")
else:
tmp_inputs_embeds = inputs_embeds[:, cur_idx : chunk_end]
output_with_past = self.forward_chunk(
tmp_inputs_embeds,
chunk_mask,
position_ids[:, cur_idx : chunk_end],
past_key_values,
output_attentions,
use_cache,
cache_position[cur_idx : chunk_end],
)
cur_output = output_with_past.last_hidden_state
KLlamaModel.dynamic_sdpa.remaining_length -= chunk_end - cur_idx
cur_idx += chunk_size
# only the hidden states of the last chunk are needed; earlier chunks
# exist solely to populate the sparse KV cache
attn_output = cur_output
KLlamaModel.dynamic_sdpa.calc_anchor(cache_position[-1] + 1)
KLlamaModel.dynamic_sdpa.clear_importance(cache_position[-1] + 1)
return BaseModelOutputWithPast(last_hidden_state=attn_output)
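# Minimal standalone sketch of the chunk slicing used in the prefill loop above
# (illustrative only; `q_len` and `chunk_size` below are made-up numbers):
#
#   def chunk_ranges(q_len: int, chunk_size: int):
#       cur = 0
#       while cur < q_len:
#           end = min(cur + chunk_size, q_len)
#           yield cur, end            # slice [cur:end] of embeddings / positions / cache_position
#           cur += chunk_size
#
#   assert list(chunk_ranges(10, 4)) == [(0, 4), (4, 8), (8, 10)]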
def forward_chunk(
self,
inputs_embeds,
causal_mask,
position_ids,
past_key_values,
output_attentions,
use_cache,
cache_position,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
output_hidden_states = (
output_hidden_states
if output_hidden_states is not None
else self.config.output_hidden_states
)
return_legacy_cache = False
if use_cache and not isinstance(
past_key_values, Cache
): # kept for BC (non `Cache` `past_key_values` inputs)
return_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
return_dict = (
return_dict if return_dict is not None else self.config.use_return_dict
)
hidden_states = inputs_embeds
# create position embeddings to be shared across the decoder layers
position_embeddings = self.rotary_emb(hidden_states, position_ids)
# decoder layers
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
next_decoder_cache = None
for decoder_layer in self.layers:
if output_hidden_states:
all_hidden_states += (hidden_states,)
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
decoder_layer.__call__,
hidden_states,
causal_mask,
position_ids,
past_key_values,
output_attentions,
use_cache,
cache_position,
position_embeddings,
)
else:
layer_outputs = decoder_layer(
hidden_states,
attention_mask=causal_mask,
position_ids=position_ids,
past_key_value=past_key_values,
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
position_embeddings=position_embeddings,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache = layer_outputs[2 if output_attentions else 1]
if output_attentions:
all_self_attns += (layer_outputs[1],)
hidden_states = self.norm(hidden_states)
# add hidden states from the last decoder layer
if output_hidden_states:
all_hidden_states += (hidden_states,)
next_cache = next_decoder_cache if use_cache else None
if return_legacy_cache:
next_cache = next_cache.to_legacy_cache()
if not return_dict:
return tuple(
v
for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
if v is not None
)
return BaseModelOutputWithPast(
last_hidden_state=hidden_states,
past_key_values=next_cache,
hidden_states=all_hidden_states,
attentions=all_self_attns,
)
def _update_causal_mask(
self,
attention_mask: torch.Tensor,
input_tensor: torch.Tensor,
cache_position: torch.Tensor,
past_key_values: Cache,
output_attentions: bool,
):
# TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
# KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
# (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
# `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
if self.config._attn_implementation == "flash_attention_2":
if attention_mask is not None and 0.0 in attention_mask:
return attention_mask
return None
# For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
# order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
# to infer the attention mask.
past_seen_tokens = (
past_key_values.get_seq_length() if past_key_values is not None else 0
)
using_static_cache = isinstance(past_key_values, StaticCache)
# When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
if (
self.config._attn_implementation == "sdpa"
and not using_static_cache
and not output_attentions
):
if AttentionMaskConverter._ignore_causal_mask_sdpa(
attention_mask,
inputs_embeds=input_tensor,
past_key_values_length=past_seen_tokens,
is_training=self.training,
):
return None
dtype, device = input_tensor.dtype, input_tensor.device
min_dtype = torch.finfo(dtype).min
sequence_length = input_tensor.shape[1]
if using_static_cache:
target_length = past_key_values.get_max_length()
else:
target_length = (
attention_mask.shape[-1]
if isinstance(attention_mask, torch.Tensor)
else past_seen_tokens + sequence_length + 1
)
if attention_mask is not None and attention_mask.dim() == 4:
# in this case we assume that the mask comes already in inverted form and requires no inversion or slicing
if attention_mask.max() != 0:
raise ValueError(
"Custom 4D attention mask should be passed in inverted form with max==0`"
)
causal_mask = attention_mask
else:
causal_mask = torch.full(
(sequence_length, target_length),
fill_value=min_dtype,
dtype=dtype,
device=device,
)
if sequence_length != 1:
causal_mask = torch.triu(causal_mask, diagonal=1)
causal_mask *= torch.arange(
target_length, device=device
) > cache_position.reshape(-1, 1)
causal_mask = causal_mask[None, None, :, :].expand(
input_tensor.shape[0], 1, -1, -1
)
if attention_mask is not None:
causal_mask = (
causal_mask.clone()
) # copy to contiguous memory for in-place edit
mask_length = attention_mask.shape[-1]
padding_mask = (
causal_mask[:, :, :, :mask_length]
+ attention_mask[:, None, None, :]
)
padding_mask = padding_mask == 0
causal_mask[:, :, :, :mask_length] = causal_mask[
:, :, :, :mask_length
].masked_fill(padding_mask, min_dtype)
if (
self.config._attn_implementation == "sdpa"
and attention_mask is not None
and attention_mask.device.type == "cuda"
and not output_attentions
):
# Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
# using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
# Details: https://github.com/pytorch/pytorch/issues/110213
causal_mask = AttentionMaskConverter._unmask_unattended(
causal_mask, min_dtype
)
return causal_mask
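# Worked toy example of the mask construction above, with hypothetical sizes
# (sequence_length=3, target_length=5, cache_position=[2, 3, 4]); it is not tied
# to any model config:
#
#   import torch
#   min_dtype = torch.finfo(torch.float32).min
#   cache_position = torch.tensor([2, 3, 4])
#   mask = torch.full((3, 5), min_dtype)
#   mask = torch.triu(mask, diagonal=1)                       # mask strictly-future columns
#   mask *= torch.arange(5) > cache_position.reshape(-1, 1)   # keep already-cached columns open
#   # row i can now attend to every column j <= cache_position[i];
#   # columns to its right remain at min_dtype (fully masked).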

View file

@ -225,4 +225,4 @@
class: "default"
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"
prefill_device: "cuda:3"

View file

@ -123,4 +123,4 @@
class: "default"
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
prefill_device: "cuda:1"

View file

@ -6,7 +6,7 @@
generate_device: "cuda"
prefill_device: "cuda"
- match:
name: "^model\\.layers\\.(?!.*self_attn).*$" # regular expression
name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
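# Hedged aside: the updated pattern above uses a negative lookahead, so every
# layer sub-module except self_attn.kv_b_proj is now replaced (the old pattern
# skipped all self_attn sub-modules). A minimal standalone check with plain `re`
# and hypothetical module names, not the ktransformers rule loader:
#
#   import re
#   pattern = re.compile(r"^model\.layers\.(?!.*self_attn\.kv_b_proj).*$")
#   assert pattern.match("model.layers.0.mlp.gate_proj")
#   assert pattern.match("model.layers.0.self_attn.q_proj")
#   assert not pattern.match("model.layers.0.self_attn.kv_b_proj")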
@ -41,6 +41,12 @@
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- match:
name: "^model$"
replace:
class: "ktransformers.operators.models.KDeepseekV2Model"
kwargs:
per_layer_prefill_intput_threshold: 2000 # 0 disables layer-wise prefill
- match:
name: "^model.embed_tokens"
replace:

View file

@ -123,4 +123,4 @@
class: "default"
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
prefill_device: "cuda:1"

View file

@ -0,0 +1,28 @@
- match:
class: ktransformers.models.modeling_llama.LlamaRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.RotaryEmbeddingV2
- match:
name: "^model.embed_tokens"
replace:
class: "default"
kwargs:
generate_device: "cpu"
prefill_device: "cpu"
- match:
class: ktransformers.models.modeling_llama.LlamaModel
replace:
class: ktransformers.operators.models.KLlamaModel
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
- match:
name: "^model\\.layers\\..*\\.self_attn$"
replace:
class: ktransformers.operators.attention.KLlamaAttention
kwargs:
generate_device: "cuda"
prefill_device: "cuda"

View file

@ -109,4 +109,4 @@
class: "default"
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
prefill_device: "cuda:1"

View file

@ -1,3 +1,10 @@
- match:
name: "^model\\.layers\\..*\\."
replace:
class: "default"
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- match:
class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
replace:
@ -54,4 +61,4 @@
class: "default"
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
prefill_device: "cuda"

View file

@ -5,10 +5,11 @@ Description :
Author : unicornchan
Date : 2024-06-11 16:35:42
Version : 1.0.0
LastEditors : chenxl
LastEditTime : 2024-07-27 01:55:42
LastEditors : WuHao
LastEditTime : 2024-08-12 06:31:14
'''
import os
import shutil
import yaml
from ktransformers.server.config.singleton import Singleton
@ -30,10 +31,18 @@ class Config(metaclass=Singleton):
os.path.dirname(os.path.dirname(__file__)))
config_yaml: str = os.path.join(
base_path, "configs", Config.CONFIG_FILE_NAME)
user_path: str = os.path.expanduser('~')
localstore_path: str = os.path.join(user_path,'.ktransformers')
config_path: str = os.path.join(localstore_path,Config.CONFIG_FILE_NAME)
if not os.path.exists(config_yaml):
print(f"Can't find config file, {config_yaml}")
exit(-1)
with open(config_yaml, 'r', encoding="utf-8") as fp:
if not os.path.exists(localstore_path):
os.mkdir(localstore_path)
if not os.path.exists(config_path):
shutil.copyfile(config_yaml,config_path)
with open(config_path, 'r', encoding="utf-8") as fp:
config = yaml.safe_load(fp)
return config
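# Hedged sketch of the first-run bootstrap added above: the packaged default
# config is copied into ~/.ktransformers once, and subsequent runs read (and can
# edit) the user copy. Paths mirror the code above; the helper name is illustrative.
#
#   import os, shutil, yaml
#   def load_user_config(default_yaml: str, file_name: str = "config.yaml") -> dict:
#       localstore = os.path.join(os.path.expanduser("~"), ".ktransformers")
#       user_yaml = os.path.join(localstore, file_name)
#       os.makedirs(localstore, exist_ok=True)
#       if not os.path.exists(user_yaml):
#           shutil.copyfile(default_yaml, user_yaml)   # seed the user copy once
#       with open(user_yaml, "r", encoding="utf-8") as fp:
#           return yaml.safe_load(fp)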
@ -51,6 +60,8 @@ class Config(metaclass=Singleton):
cfg = Config.load()
self.base_path = os.path.dirname(
os.path.dirname(os.path.dirname(__file__)))
self.user_path: str = os.path.expanduser('~')
self.localstore_path: str = os.path.join(self.user_path,'.ktransformers')
# log configs
self.log_dir = os.path.join(self.base_path, Config.to_path(cfg["log"]["dir"]))
self.log_file = cfg["log"]["file"]
@ -83,11 +94,20 @@ class Config(metaclass=Singleton):
self.model_name: str = self.model.get("name", "")
self.model_device: str = self.model.get("device", "cuda:0")
self.gguf_path: str = self.model.get("gguf_path", "")
self.model_cache_lens = self.model.get("cache_lens")
# web config
self.web: dict = cfg.get("web", {})
self.web_cross_domain: bool = self.web.get("open_cross_domain", True)
self.mount_web: bool = self.web.get("mount", False)
self.ext: dict = cfg.get("ext", {})
self.cpu_infer = self.ext.get("cpu_infer", 10)
#file config
self.local_store_configs: dict = cfg.get("local_store",{})
self.file_upload_dir: str = os.path.join(self.localstore_path,self.local_store_configs.get("file_upload_dir",""))
self.assistant_store_dir: str = os.path.join(self.localstore_path,self.local_store_configs.get("assistant_store_dir",""))
#long context config
self.long_context_config: dict = cfg.get("long_context",{})

View file

@ -46,7 +46,8 @@ class CUDAGraphRunner:
capture_stream.wait_stream(torch.cuda.current_stream())
torch.cuda.set_device(main_device)
torch.cuda.set_stream(capture_stream)
past_key_values.change_seq_length(-1)
if past_key_values is not None:
past_key_values.change_seq_length(-1)
torch.cuda.synchronize(self.main_device)
#self.graph.debug_dump("cuda_graph_hooked.dot")

View file

@ -6,7 +6,7 @@ Author : Azure-Tang, Boxin Zhang, chenht2022
Date : 2024-07-26 08:48:54
Version : 1.0.0
LastEditors : kkk1nak0
LastEditTime : 2024-08-12 07:21:55
LastEditTime : 2024-08-14 08:20:45
Adapted from https://github.com/99991/pygguf/blob/main/gguf.py
Copyright (c) 2023-2024 The ggml authors
Copyright (c) 2024 Thomas Germer
@ -294,7 +294,6 @@ class GGUFLoader:
else:
values = GGML_DEQUANTIZE[ggml_name](data)
values = torch.from_numpy(values)
values = values.view(shape[::-1])
if "attn_q" in name and self.gguf_file_meta['general.architecture'] in ["llama"]:
n_head = self.gguf_file_meta['llama.attention.head_count']

View file

@ -84,7 +84,8 @@ def load_weights(module:nn.Module, gguf_loader:GGUFLoader, prefix=''):
else:
module.load()
def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cuda_graph: bool = True):
def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cuda_graph: bool = True,
mode = 'normal'):
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
torch._dynamo.config.suppress_errors = True
@ -110,7 +111,8 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
cache_position=cache_position,
past_key_values=past_key_values,
return_dict=False, use_cache=True)[0]
past_key_values.change_seq_length(1)
if past_key_values is not None:
past_key_values.change_seq_length(1)
for device in all_cuda_device:
torch.cuda.synchronize(device)
#print(logits)
@ -125,18 +127,26 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
torch.cuda.set_device(torch_device)
with torch.no_grad():
stream = TextStreamer(tokenizer)
past_key_values = StaticCache(
config = model.config, max_batch_size = 1, max_cache_len = seq_length + max_new_tokens, device = device_map, dtype = model.dtype
)
if mode != 'long_context':
past_key_values = StaticCache(
config = model.config, max_batch_size = 1, max_cache_len = seq_length + max_new_tokens, device = device_map, dtype = model.dtype
)
else:
past_key_values = None
cache_position = torch.arange(seq_length, device=torch_device)
generated_ids = torch.zeros(
batch_size, seq_length + max_new_tokens + 1, dtype=torch.int, device=torch_device
)
generated_ids[:, cache_position] = inputs.to(torch_device).to(torch.int)
past_key_values.cur_idx=cache_position
if past_key_values is not None:
past_key_values.cur_idx = cache_position
start_time = time.time()
inputs_embeds = model.model.embed_tokens(inputs.to("cpu")).to(torch_device)
if mode == "long_context":
inputs_embeds = model.model.embed_tokens(inputs.to("cpu"))
else:
inputs_embeds = model.model.embed_tokens(inputs.to("cpu")).to(torch_device)
logits = model(
inputs_embeds = inputs_embeds, cache_position=cache_position, past_key_values=past_key_values, return_dict=False, use_cache=True
)[0][:,-1,:].unsqueeze(0).clone().to(torch_device)
@ -184,7 +194,7 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
tokens.append(next_token.int())
seq_length += 1
if next_token[0].item() == tokenizer.eos_token_id:
if next_token[0].item() == tokenizer.eos_token_id or tokenizer.decode(next_token) == '<|im_end|>':
print(stream.end(), end="", flush=True)
break
else:

View file

@ -27,7 +27,8 @@ dependencies = [
"wheel",
"colorlog",
"build",
"fire"
"fire",
"protobuf"
]
requires-python = ">=3.10"

View file

@ -3,4 +3,5 @@ transformers
numpy
torch>=2.3.0
packaging
cpufeature
cpufeature
protobuf