add flashinfer to cuda device

This commit is contained in:
Alisehen 2025-05-15 07:03:45 +00:00
parent f3be33a313
commit 055680e26c
5 changed files with 4 additions and 8 deletions

View file

@ -100,10 +100,8 @@ git submodule update --init --recursive
# Install single NUMA dependencies # Install single NUMA dependencies
USE_BALANCE_SERVE=1 bash ./install.sh USE_BALANCE_SERVE=1 bash ./install.sh
pip install third_party/custom_flashinfer/
# For those who have two cpu and 1T RAM (Dual NUMA): # For those who have two cpu and 1T RAM (Dual NUMA):
USE_BALANCE_SERVE=1 USE_NUMA=1 bash ./install.sh USE_BALANCE_SERVE=1 USE_NUMA=1 bash ./install.sh
pip install third_party/custom_flashinfer/
``` ```
## Running DeepSeek-R1-Q4KM Models ## Running DeepSeek-R1-Q4KM Models

View file

@ -117,13 +117,11 @@ Download source code and compile:
```shell ```shell
USE_BALANCE_SERVE=1 bash ./install.sh USE_BALANCE_SERVE=1 bash ./install.sh
pip install third_party/custom_flashinfer/
``` ```
- For Multi-concurrency with two cpu and 1T RAM: - For Multi-concurrency with two cpu and 1T RAM:
```shell ```shell
USE_BALANCE_SERVE=1 USE_NUMA=1 bash ./install.sh USE_BALANCE_SERVE=1 USE_NUMA=1 bash ./install.sh
pip install third_party/custom_flashinfer/
``` ```
- For Windows (Windows native temporarily deprecated, please try WSL) - For Windows (Windows native temporarily deprecated, please try WSL)

View file

@ -68,10 +68,8 @@ pip3 install torch torchvision torchaudio --index-url https://download.pytorch.o
```bash ```bash
# Install single NUMA dependencies # Install single NUMA dependencies
USE_BALANCE_SERVE=1 bash ./install.sh USE_BALANCE_SERVE=1 bash ./install.sh
pip install third_party/custom_flashinfer/
# For those who have two cpu and 1T RAM (Dual NUMA): # For those who have two cpu and 1T RAM (Dual NUMA):
USE_BALANCE_SERVE=1 USE_NUMA=1 bash ./install.sh USE_BALANCE_SERVE=1 USE_NUMA=1 bash ./install.sh
pip install third_party/custom_flashinfer/
``` ```
### 4. Use our custom config.json ### 4. Use our custom config.json

View file

@ -127,10 +127,8 @@ cd ktransformers
git submodule update --init --recursive git submodule update --init --recursive
# 如果使用双 numa 版本 # 如果使用双 numa 版本
USE_BALANCE_SERVE=1 USE_NUMA=1 bash ./install.sh USE_BALANCE_SERVE=1 USE_NUMA=1 bash ./install.sh
pip install third_party/custom_flashinfer/
# 如果使用单 numa 版本 # 如果使用单 numa 版本
USE_BALANCE_SERVE=1 bash ./install.sh USE_BALANCE_SERVE=1 bash ./install.sh
pip install third_party/custom_flashinfer/
# 启动命令 # 启动命令
python ktransformers/server/main.py --model_path <your model path> --gguf_path <your gguf path> --cpu_infer 62 --optimize_config_path <inject rule path> --port 10002 --chunk_size 256 --max_new_tokens 1024 --max_batch_size 4 --port 10002 --cache_lens 32768 --backend_type balance_serve python ktransformers/server/main.py --model_path <your model path> --gguf_path <your gguf path> --cpu_infer 62 --optimize_config_path <inject rule path> --port 10002 --chunk_size 256 --max_new_tokens 1024 --max_batch_size 4 --port 10002 --cache_lens 32768 --backend_type balance_serve
``` ```

View file

@ -38,6 +38,10 @@ fi
echo "Installing ktransformers" echo "Installing ktransformers"
KTRANSFORMERS_FORCE_BUILD=TRUE pip install -v . --no-build-isolation KTRANSFORMERS_FORCE_BUILD=TRUE pip install -v . --no-build-isolation
if [[ "$DEV_BACKEND" == "cuda" ]]; then
echo "Installing custom_flashinfer for CUDA backend"
pip install third_party/custom_flashinfer/
fi
# SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])") # SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
# echo "Copying thirdparty libs to $SITE_PACKAGES" # echo "Copying thirdparty libs to $SITE_PACKAGES"
# cp -a csrc/balance_serve/build/third_party/prometheus-cpp/lib/libprometheus-cpp-*.so* $SITE_PACKAGES/ # cp -a csrc/balance_serve/build/third_party/prometheus-cpp/lib/libprometheus-cpp-*.so* $SITE_PACKAGES/