mirror of https://github.com/kvcache-ai/ktransformers.git, synced 2025-09-11 07:44:35 +00:00

update install doc and fix local_chat bug

commit 1b7672937b (parent ab0b0f4ea1)
4 changed files with 14 additions and 28 deletions
The first changed file appears to be the documentation table of contents. The first hunk drops one entry:

```diff
@@ -11,7 +11,6 @@
 - [Multi-GPU Tutorial](en/multi-gpu-tutorial.md)
 - [Use FP8 GPU Kernel](en/fp8_kernel.md)
 - [Use AMD GPU](en/ROCm.md)
-- [Use Multi-concurrency](en/balance-serve.md)
 # Server
 - [Server](en/api/server/server.md)
 - [Website](en/api/server/website.md)
```
The second hunk fixes the Benchmark entry, whose link target had been garbled into a pasted duplicate of the whole table of contents, and points it at en/benchmark.md:

```diff
@@ -24,28 +23,4 @@
 # V3 Reproduction
 - [Success List](en/V3-success.md)
 # Benchmark
-- [Benchmark](# Ktransformer
-
-[Introduction](./README.md)
-# Install
-- [Installation Guide](en/install.md)
-
-# Tutorial
-- [Deepseek-R1/V3 Show Case/Tutorial](en/DeepseekR1_V3_tutorial.md)
-- [Why KTransformers So Fast](en/deepseek-v2-injection.md)
-- [Injection Tutorial](en/injection_tutorial.md)
-- [Multi-GPU Tutorial](en/multi-gpu-tutorial.md)
-- [Use FP8 GPU Kernel](en/fp8_kernel.md)
-# Server
-- [Server](en/api/server/server.md)
-- [Website](en/api/server/website.md)
-- [Tabby](en/api/server/tabby.md)
-# For Developer
-- [Makefile Usage](en/makefile_usage.md)
-
-# FAQ
-- [FAQ](en/FAQ.md)
-# V3 Reproduction
-- [Success List](en/V3-success.md)
-# Benchmark
-- [Benchmark](
+- [Benchmark](en/benchmark.md)
```
Per the commit message, the install documentation gains a section on pulling the v0.2.4 Docker image for testing, and the following section heading is renumbered to make room for it:

````diff
@@ -41,6 +41,16 @@ Implemented **balance_serve** engine based on **FlashInfer** @qiyuxinlin @ovowei
 Implemented a **continuous batching** scheduler in C++ @ErvinXie
 release: bump version v0.2.4 by @Atream @Azure-Tang @ErvinXie @qiyuxinlin @ovowei @KMSorSMS @SkqLiao
 
+## Download the Docker image for testing v0.2.4
+Visit the [link](https://hub.docker.com/r/approachingai/ktransformers/tags) to pull the image, using `v0.2.4-AVX512` as an example.
+
+```bash
+docker pull approachingai/ktransformers:v0.2.4-AVX512
+docker run -it --gpus all --privileged --shm-size 64g --name ktrans --network=host -v /mnt:/mnt approachingai/ktransformers:v0.2.4-AVX512 /bin/bash
+# Open a new terminal
+docker exec -it ktrans bash
+```
+
 ## Installation Guide
 
 ⚠️ Please note that installing this project will replace flashinfer in your environment. It is strongly recommended to create a new conda environment!!!
````

```diff
@@ -49,7 +59,7 @@ release: bump version v0.2.4 by @Atream @Azure-Tang @ErvinXie @qiyuxinlin @ovow
 
 ⚠️ Please note that installing this project will replace flashinfer in your environment. It is strongly recommended to create a new conda environment!!!
 
-### 1. Set Up Conda Environment
+### 2. Set Up Conda Environment
 
 We recommend using Miniconda3/Anaconda3 for environment management:
 
```
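Not part of the commit, but a quick way to confirm the container actually sees the GPUs passed through by `--gpus all` before testing anything heavier. This assumes the image ships a CUDA-enabled PyTorch build, which KTransformers requires:

```python
# Run inside the `ktrans` container: checks that PyTorch can see the exposed GPUs.
import torch

print("torch", torch.__version__, "| CUDA available:", torch.cuda.is_available())
for idx in range(torch.cuda.device_count()):
    print(f"GPU {idx}: {torch.cuda.get_device_name(idx)}")
```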
The local_chat bug fix: in the injected attention module, the single-token decode path now passes one more positional argument (None) to the MLA wrapper's plan() call:

```diff
@@ -422,6 +422,7 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
         if q_len == 1:
             self.mla_wrapper.plan(None,None,None,
                                   position_ids.squeeze(1)+1,
+                                  None,
                                   self.num_heads,
                                   self.kv_lora_rank,
                                   self.qk_rope_head_dim,
```
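The diff does not name the parameter that the inserted None fills; it only shows that plan() now takes one more positional argument between the KV-length tensor (position_ids.squeeze(1)+1) and num_heads. The sketch below is a hypothetical stand-in, not the real KTransformers/FlashInfer signature, illustrating why every positional caller needs the extra None once a parameter is inserted mid-signature:

```python
# Hypothetical stand-in for the wrapper's plan(); parameter names are illustrative only.
def plan(qo_indptr, kv_indptr, kv_indices, kv_len_arr, new_param,
         num_heads, head_dim_ckv, head_dim_kpe):
    # Before the change there was no `new_param`; an old positional call either comes up
    # one argument short or, if defaults existed, silently binds num_heads to `new_param`
    # and shifts every later argument by one slot.
    return {"new_param": new_param, "num_heads": num_heads,
            "head_dim_ckv": head_dim_ckv, "head_dim_kpe": head_dim_kpe}

# Old-style call: plan(None, None, None, [8], 128, 512, 64) -> TypeError (one arg short).
# Fixed call, mirroring the diff: pass an explicit None for the new slot.
print(plan(None, None, None, [8], None, 128, 512, 64))
```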
The same change lands in the generation loop's FlashInfer MLA planning call inside prefill_and_generate():

```diff
@@ -254,7 +254,7 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
     start_time = time.time()
     for i in range(1, max_new_tokens):
         if use_flashinfer_mla:
-            MLAWrapperSingleton.plan_all(None,None,None,position_ids.squeeze(1)+1,
+            MLAWrapperSingleton.plan_all(None,None,None,position_ids.squeeze(1)+1,None,
                                          num_heads, head_dim_ckv, head_dim_kpe, past_key_values.page_size,
                                          model.model.layers[0].self_attn.softmax_scale, torch.bfloat16, torch.bfloat16)
         global warm_uped
```
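A side note on the pattern rather than on this repository's code: if such planning calls were made with keyword arguments, a newly added parameter with a default value would not shift the remaining arguments, and call sites like the two above would not need touching. A minimal sketch under that assumption:

```python
# Minimal sketch, not KTransformers code: keyword arguments keep call sites stable
# when a defaulted parameter (here `new_param`) is inserted into the signature.
def plan_all(qo_indptr=None, kv_indptr=None, kv_indices=None, kv_len_arr=None,
             new_param=None, num_heads=128, head_dim_ckv=512, head_dim_kpe=64,
             page_size=64, sm_scale=1.0):
    return num_heads, head_dim_ckv, head_dim_kpe, page_size, sm_scale

# This caller predates `new_param` and still binds every value correctly.
print(plan_all(kv_len_arr=[8], num_heads=128, head_dim_ckv=512,
               head_dim_kpe=64, page_size=64, sm_scale=0.1))
```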