From 055680e26c97b1e5353e9d989b5e087b78f38511 Mon Sep 17 00:00:00 2001 From: Alisehen <814073252@qq.com> Date: Thu, 15 May 2025 07:03:45 +0000 Subject: [PATCH] add flashinfer to cuda device --- doc/en/balance-serve.md | 2 -- doc/en/install.md | 2 -- doc/en/llama4.md | 2 -- doc/zh/DeepseekR1_V3_tutorial_zh.md | 2 -- install.sh | 4 ++++ 5 files changed, 4 insertions(+), 8 deletions(-) diff --git a/doc/en/balance-serve.md b/doc/en/balance-serve.md index 4d72fbd..5217968 100644 --- a/doc/en/balance-serve.md +++ b/doc/en/balance-serve.md @@ -100,10 +100,8 @@ git submodule update --init --recursive # Install single NUMA dependencies USE_BALANCE_SERVE=1 bash ./install.sh -pip install third_party/custom_flashinfer/ # For those who have two cpu and 1T RAM(Dual NUMA): USE_BALANCE_SERVE=1 USE_NUMA=1 bash ./install.sh -pip install third_party/custom_flashinfer/ ``` ## Running DeepSeek-R1-Q4KM Models diff --git a/doc/en/install.md b/doc/en/install.md index aee923b..031b541 100644 --- a/doc/en/install.md +++ b/doc/en/install.md @@ -117,13 +117,11 @@ Download source code and compile: ```shell USE_BALANCE_SERVE=1 bash ./install.sh - pip install third_party/custom_flashinfer/ ``` - For Multi-concurrency with two cpu and 1T RAM: ```shell USE_BALANCE_SERVE=1 USE_NUMA=1 bash ./install.sh - pip install third_party/custom_flashinfer/ ``` - For Windows (Windows native temporarily deprecated, please try WSL) diff --git a/doc/en/llama4.md b/doc/en/llama4.md index b55c32f..8592871 100644 --- a/doc/en/llama4.md +++ b/doc/en/llama4.md @@ -68,10 +68,8 @@ pip3 install torch torchvision torchaudio --index-url https://download.pytorch.o ```bash # Install single NUMA dependencies USE_BALANCE_SERVE=1 bash ./install.sh -pip install third_party/custom_flashinfer/ # For those who have two cpu and 1T RAM(Dual NUMA): USE_BALANCE_SERVE=1 USE_NUMA=1 bash ./install.sh -pip install third_party/custom_flashinfer/ ``` ### 4. Use our custom config.json diff --git a/doc/zh/DeepseekR1_V3_tutorial_zh.md b/doc/zh/DeepseekR1_V3_tutorial_zh.md index bbe109c..5645f4f 100644 --- a/doc/zh/DeepseekR1_V3_tutorial_zh.md +++ b/doc/zh/DeepseekR1_V3_tutorial_zh.md @@ -127,10 +127,8 @@ cd ktransformers git submodule update --init --recursive # 如果使用双 numa 版本 USE_BALANCE_SERVE=1 USE_NUMA=1 bash ./install.sh -pip install third_party/custom_flashinfer/ # 如果使用单 numa 版本 USE_BALANCE_SERVE=1 bash ./install.sh -pip install third_party/custom_flashinfer/ # 启动命令 python ktransformers/server/main.py --model_path --gguf_path --cpu_infer 62 --optimize_config_path --port 10002 --chunk_size 256 --max_new_tokens 1024 --max_batch_size 4 --port 10002 --cache_lens 32768 --backend_type balance_serve ``` diff --git a/install.sh b/install.sh index 260ae46..c19a18e 100644 --- a/install.sh +++ b/install.sh @@ -38,6 +38,10 @@ fi echo "Installing ktransformers" KTRANSFORMERS_FORCE_BUILD=TRUE pip install -v . --no-build-isolation +if [[ "$DEV_BACKEND" == "cuda" ]]; then + echo "Installing custom_flashinfer for CUDA backend" + pip install third_party/custom_flashinfer/ +fi # SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])") # echo "Copying thirdparty libs to $SITE_PACKAGES" # cp -a csrc/balance_serve/build/third_party/prometheus-cpp/lib/libprometheus-cpp-*.so* $SITE_PACKAGES/