Mirror of https://github.com/kvcache-ai/ktransformers.git (synced 2025-09-07 04:59:55 +00:00)

Commit a289d3f598 — Merge branch 'kvcache-ai:main' into main
56 changed files with 2758 additions and 466 deletions
Dockerfile.xpu — new file (68 lines)
@@ -0,0 +1,68 @@
# Base image
FROM intel/oneapi-basekit:2025.0.1-0-devel-ubuntu22.04

ARG http_proxy
ARG https_proxy

ENV DEBIAN_FRONTEND=noninteractive
ENV CONDA_DIR=/opt/conda

# Install dependencies
RUN apt-get update && apt-get install -y \
    wget \
    curl \
    bash \
    git \
    vim \
    ca-certificates \
    binutils \
    cmake \
    g++ \
    && rm -rf /var/lib/apt/lists/*

# Install Miniforge
RUN wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -O /tmp/miniforge.sh && \
    bash /tmp/miniforge.sh -b -p $CONDA_DIR && \
    rm /tmp/miniforge.sh && \
    $CONDA_DIR/bin/conda clean -afy

# Add conda to PATH
ENV PATH=$CONDA_DIR/bin:$PATH

RUN bash -c "\
    source /opt/conda/etc/profile.d/conda.sh && \
    conda create --name ktransformers python=3.11 -y && \
    conda activate ktransformers && \
    conda env list && \
    conda install -c conda-forge libstdcxx-ng -y && \
    strings \$(find /opt/conda/envs/ktransformers/lib -name 'libstdc++.so.6') | grep GLIBCXX | grep 3.4.32 \
    "

RUN bash -c "\
    source /opt/conda/etc/profile.d/conda.sh && \
    conda activate ktransformers && \
    pip install ipex-llm[xpu_2.6]==2.3.0b20250518 --extra-index-url https://download.pytorch.org/whl/xpu && \
    pip uninstall -y torch torchvision torchaudio && \
    pip install torch==2.7+xpu torchvision torchaudio --index-url https://download.pytorch.org/whl/test/xpu && \
    pip uninstall -y intel-opencl-rt dpcpp-cpp-rt && \
    pip list \
    "

# Clone and set up the ktransformers repo
RUN bash -c "\
    source $CONDA_DIR/etc/profile.d/conda.sh && \
    conda activate ktransformers && \
    git clone https://github.com/kvcache-ai/ktransformers.git && \
    cd ktransformers && \
    git submodule update --init && \
    sed -i 's/torch\.xpu\.is_available()/True/g' setup.py && \
    bash install.sh --dev xpu \
    "

# Init conda and prepare bashrc
RUN conda init bash && \
    echo "source $CONDA_DIR/etc/profile.d/conda.sh" >> ~/.bashrc && \
    echo "conda activate ktransformers" >> ~/.bashrc

WORKDIR /ktransformers/
CMD ["bash"]
README.md — 12 lines changed

@@ -23,6 +23,8 @@ Our vision for KTransformers is to serve as a flexible platform for experimentin

<h2 id="Updates">🔥 Updates</h2>

+* **May 14, 2025**: Support Intel Arc GPU ([Tutorial](./doc/en/xpu.md)).
* **Apr 29, 2025**: Support AMX-Int8, AMX-BF16 and Qwen3MoE ([Tutorial](./doc/en/AMX.md))

https://github.com/user-attachments/assets/fafe8aec-4e22-49a8-8553-59fb5c6b00a2

@@ -116,6 +118,16 @@ https://github.com/user-attachments/assets/a865e5e4-bca3-401e-94b8-af3c080e6c12

Getting started with KTransformers is simple! Follow the steps below to set up and start using it.

+We currently support the following vendors:
+
+- Metax
+- Sanechips (ZhuFeng V1.0)
+- Intel
+- Ascend
+- Kunpeng
+- AMD

### 📥 Installation

To install KTransformers, follow the official [Installation Guide](https://kvcache-ai.github.io/ktransformers/en/install.html).
WeChatGroup.png — binary file not shown (before: 420 KiB, after: 1.1 MiB)
@@ -41,6 +41,7 @@ option(LLAMA_AVX512_FANCY_SIMD "llama: enable AVX512-VL, AVX512-BW
option(KTRANSFORMERS_USE_CUDA "ktransformers: use CUDA" ON)
option(KTRANSFORMERS_USE_MUSA "ktransformers: use MUSA" OFF)
option(KTRANSFORMERS_USE_ROCM "ktransformers: use ROCM" OFF)
+option(KTRANSFORMERS_USE_XPU "ktransformers: use XPU" OFF)

# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures

@@ -303,6 +304,8 @@ elseif (UNIX)
        message(STATUS "MUSA Toolkit found")
        add_compile_definitions(KTRANSFORMERS_USE_MUSA=1)
    endif()
+elseif (KTRANSFORMERS_USE_XPU)
+    add_compile_definitions(KTRANSFORMERS_USE_XPU=1)
else()
    find_package(CUDA REQUIRED)
    include_directories("${CUDA_INCLUDE_DIRS}")

@@ -361,6 +364,7 @@ elseif(UNIX)
        message(STATUS "Building for HIP")
    elseif(KTRANSFORMERS_USE_MUSA)
        target_link_libraries(${PROJECT_NAME} PRIVATE MUSA::musart)
+   elseif(KTRANSFORMERS_USE_XPU)
    else()
        target_link_libraries(${PROJECT_NAME} PRIVATE "${CUDAToolkit_LIBRARY_DIR}/libcudart.so")
    endif()
@@ -17,6 +17,7 @@
#include <queue>
#include <thread>
#include <vector>
+#include <stdexcept>
#ifdef KTRANSFORMERS_USE_CUDA
#include "vendors/cuda.h"
#elif KTRANSFORMERS_USE_MUSA

@@ -66,10 +67,14 @@
    }

    void submit_with_cuda_stream(intptr_t user_cuda_stream, std::pair<intptr_t, intptr_t> params) {
+#if defined(KTRANSFORMERS_USE_CUDA) || defined(KTRANSFORMERS_USE_MUSA) || defined(KTRANSFORMERS_USE_ROCM)
        void (*func)(void*) = (void (*)(void*))params.first;
        void* args = (void*)params.second;
        *((CPUInfer**)args) = this;
        cudaLaunchHostFunc((cudaStream_t)user_cuda_stream, (cudaHostFn_t)func, args);
+#else
+        throw std::runtime_error("submit_with_cuda_stream is not supported on this platform");
+#endif
    }

    static void sync_(void* cpu_infer_ptr) {

@@ -78,7 +83,11 @@
    }

    void sync_with_cuda_stream(intptr_t user_cuda_stream) {
+#if defined(KTRANSFORMERS_USE_CUDA) || defined(KTRANSFORMERS_USE_MUSA) || defined(KTRANSFORMERS_USE_ROCM)
        cudaLaunchHostFunc((cudaStream_t)user_cuda_stream, (cudaHostFn_t)&sync_, (void*)this);
+#else
+        throw std::runtime_error("sync_with_cuda_stream is not supported on this platform");
+#endif
    }

public:
@@ -1,7 +1,7 @@
import os
import sys
sys.path.insert(0,"/home/zbx/ktransformers")
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
import torch

gguf_loader_1 = GGUFLoader("/mnt/data/model/DeepseekV3-q4km-gguf")
@@ -9,7 +9,7 @@
**/
// Python bindings
#include "cpu_backend/cpuinfer.h"
-#ifndef KTRANSFORMERS_USE_ROCM
+#if !defined(KTRANSFORMERS_USE_ROCM) && !defined(KTRANSFORMERS_USE_XPU)
#include "device_launch_parameters.h"
#endif
#include "llamafile/flags.h"
@@ -843,7 +843,7 @@ inline void mat_mul(int m, int n, int k, std::shared_ptr<GemmKernel224BF::Buffer
    if (!use_amx) {
      __m512 *c512 = (__m512 *)c;
      if (k_block_begin == 0) {
-        for (int m_i = 0; m_i < m; m_i++) {
+        for (int m_i = 0; m_i < m && m_i < K::M_STEP; m_i++) {
          c512[m_i * 2] = _mm512_setzero_ps();
          c512[m_i * 2 + 1] = _mm512_setzero_ps();
        }

@@ -852,7 +852,7 @@ inline void mat_mul(int m, int n, int k, std::shared_ptr<GemmKernel224BF::Buffer
      for (int k_begin = 0; k_begin < K::K_BLOCK && k_block_begin + k_begin < k; k_begin += K::K_STEP) {
        int32_t *a32 = (int32_t *)ba->get_submat(m, k, m_begin, k_block_begin + k_begin);
        __m512bh *b512 = (__m512bh *)bb->get_submat(n, k, n_begin, k_block_begin + k_begin);
-        for (int m_i = 0; m_i < m; m_i++) {
+        for (int m_i = 0; m_i < m && m_i < K::M_STEP; m_i++) {
          for (int k_i = 0; k_i < 16; k_i++) {
            __m512bh ma = (__m512bh)_mm512_set1_epi32(a32[m_i * 16 + k_i]);
            for (int n_i = 0; n_i < 2; n_i++) {

@@ -914,7 +914,7 @@ inline void mat_mul(int m, int n, int k, std::shared_ptr<GemmKernel224Int8::Buff
    if (!use_amx) {
      __m512i *c512 = (__m512i *)c;
      if (k_block_begin == 0) {
-        for (int m_i = 0; m_i < m; m_i++) {
+        for (int m_i = 0; m_i < m && m_i < K::M_STEP; m_i++) {
          c512[m_i * 2] = _mm512_setzero_si512();
          c512[m_i * 2 + 1] = _mm512_setzero_si512();
        }

@@ -926,7 +926,7 @@ inline void mat_mul(int m, int n, int k, std::shared_ptr<GemmKernel224Int8::Buff

        int32_t *a32 = (int32_t *)ba->get_submat(m, k, m_begin, k_block_begin + k_begin);
        __m512i *b512 = (__m512i *)bb->get_submat(n, k, n_begin, k_block_begin + k_begin);
-        for (int m_i = 0; m_i < m; m_i++) {
+        for (int m_i = 0; m_i < m && m_i < K::M_STEP; m_i++) {
          for (int k_i = 0; k_i < 16; k_i++) {
            __m512i ma = _mm512_set1_epi32(a32[m_i * 16 + k_i]);
            for (int n_i = 0; n_i < 2; n_i++) {
@@ -272,8 +272,8 @@ public:

    void forward(int qlen, int k, const uint64_t *expert_ids, const float *weights, const void *input, void *output,
                 int *batch_size_tensor, Backend *backend) {
-        bool use_amx = (qlen > 4 * config_.expert_num / config_.routed_expert_num);
        qlen = batch_size_tensor[0];
+        bool use_amx = (qlen > 4 * config_.expert_num / config_.routed_expert_num);
        int activated_expert = 0;
        for (int i = 0; i < config_.expert_num; i++) {
            m_local_num_[i] = 0;

@@ -395,4 +395,4 @@ public:
    }
};

-#endif
+#endif
@@ -22,6 +22,7 @@ Our vision for KTransformers is to serve as a flexible platform for experimentin

<h2 id="Updates">🔥 Updates</h2>

+* **May 14, 2025**: Support Intel Arc GPU ([Tutorial](./en/xpu.md)).
* **Apr 9, 2025**: Experimental support for LLaMA 4 models ([Tutorial](./en/llama4.md)).
* **Apr 2, 2025**: Support Multi-concurrency. ([Tutorial](./en/balance-serve.md)).
* **Mar 27, 2025**: Support Multi-concurrency.
doc/en/Docker_xpu.md — new file (94 lines)
@@ -0,0 +1,94 @@
# Intel GPU Docker Guide (Beta)

## Prerequisites

* Docker must be installed and running on your system.
* Create a folder to store large models and intermediate files (e.g., `/mnt/models`).
* **Before proceeding, ensure the Intel GPU driver is installed correctly on your host** ([Installation Guide](./xpu.md#1-install-intel-gpu-driver)); a quick host-side check is sketched below.
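
As a quick host-side sanity check (a minimal sketch; it assumes `clinfo` is installed), confirm that the GPU device nodes and the OpenCL device are visible before building:

```bash
# /dev/dri is passed through to the container below, so it must exist on the host
ls /dev/dri
# the Arc GPU should appear as an OpenCL device (see xpu.md for the expected output)
clinfo --list | grep Device
```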

---

## Building the Docker Image Locally

1. Clone the repository and navigate to the project directory:

```bash
git clone https://github.com/kvcache-ai/ktransformers.git
cd ktransformers
```

2. Build the Docker image using the XPU-specific [Dockerfile.xpu](../../Dockerfile.xpu):

```bash
sudo http_proxy=$HTTP_PROXY \
  https_proxy=$HTTPS_PROXY \
  docker build \
  --build-arg http_proxy=$HTTP_PROXY \
  --build-arg https_proxy=$HTTPS_PROXY \
  -t kt_xpu:0.3.1 \
  -f Dockerfile.xpu \
  .
```

---

## Running the Container

### 1. Start the container

```bash
sudo docker run -td --privileged \
    --net=host \
    --device=/dev/dri \
    --shm-size="16g" \
    -v /path/to/models:/models \
    -e http_proxy=$HTTP_PROXY \
    -e https_proxy=$HTTPS_PROXY \
    --name ktransformers_xpu \
    kt_xpu:0.3.1
```

**Note**: Replace `/path/to/models` with your actual model directory path (e.g., `/mnt/models`).

---

### 2. Access the container

```bash
sudo docker exec -it ktransformers_xpu /bin/bash
```

---

### 3. Set required XPU environment variables (inside the container)

```bash
export SYCL_CACHE_PERSISTENT=1
export ONEAPI_DEVICE_SELECTOR=level_zero:0
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
```

---

### 4. Run the sample script

```bash
python ktransformers/local_chat.py \
    --model_path deepseek-ai/DeepSeek-R1 \
    --gguf_path <path_to_gguf_files> \
    --optimize_config_path ktransformers/optimize/optimize_rules/xpu/DeepSeek-V3-Chat.yaml \
    --cpu_infer <cpu_cores + 1> \
    --device xpu \
    --max_new_tokens 200
```

**Note**:

* Replace `<path_to_gguf_files>` with the path to your GGUF model files.
* Replace `<cpu_cores + 1>` with the number of CPU cores you want to use plus one (a filled-in example follows below).
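
For illustration only, with hypothetical values (a 32-core host, hence `--cpu_infer 33`, and GGUF files mounted at `/models/DeepSeek-R1-GGUF` through the `-v` option above), the filled-in command might look like:

```bash
python ktransformers/local_chat.py \
    --model_path deepseek-ai/DeepSeek-R1 \
    --gguf_path /models/DeepSeek-R1-GGUF \
    --optimize_config_path ktransformers/optimize/optimize_rules/xpu/DeepSeek-V3-Chat.yaml \
    --cpu_infer 33 \
    --device xpu \
    --max_new_tokens 200
```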

---

## Additional Information

For more configuration options and usage details, refer to the [project README](../../README.md). To run KTransformers natively on XPU (outside of Docker), please refer to [xpu.md](./xpu.md).
@@ -45,7 +45,7 @@ Some preparation:
  sudo apt-get update
  sudo apt-get install build-essential cmake ninja-build patchelf
  ```
-- We recommend using [Miniconda3](https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh) or [Anaconda3](https://repo.anaconda.com/archive/Anaconda3-2024.10-1-Linux-x86_64.sh) to create a virtual environment with Python=3.11 to run our program. Assuming your Anaconda installation directory is `~/anaconda3`, you should ensure that the version identifier of the GNU C++ standard library used by Anaconda includes `GLIBCXX-3.4.32`
+- We recommend using [Miniconda3](https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh) or [Anaconda3](https://repo.anaconda.com/archive/Anaconda3-2024.10-1-Linux-x86_64.sh) to create a virtual environment with Python=3.11 to run our program. Assuming your Anaconda installation directory is `~/anaconda3`, you should ensure that the version identifier of the GNU C++ standard library used by Anaconda includes `GLIBCXX_3.4.32`

  ```sh
  conda create --name ktransformers python=3.11
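  # A quick way to verify the GLIBCXX requirement above once the environment exists
  # (a sketch assuming the default ~/anaconda3 prefix; adjust the path to your installation):
  strings ~/anaconda3/envs/ktransformers/lib/libstdc++.so.6 | grep GLIBCXX_3.4.32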
doc/en/xpu.md — new file (134 lines)
@@ -0,0 +1,134 @@
# Intel GPU Support for KTransformers (Beta)

## Introduction

### Overview
We are excited to introduce **Intel GPU support** in KTransformers (Beta release). This implementation has been tested and developed using Intel Xeon Scalable processors and Intel Arc GPUs (such as the A770 and B580).

## Installation Guide

### 1. Install Intel GPU Driver
Begin by installing the GPU drivers for your Intel GPU:
- [Official GPU Installation Guide for Intel GPUs](https://dgpu-docs.intel.com/driver/overview.html)

To verify that the kernel and compute drivers are installed and functional:

```bash
clinfo --list | grep Device
 `-- Device #0: 13th Gen Intel(R) Core(TM) i9-13900K
 `-- Device #0: Intel(R) Arc(TM) A770 Graphics
 `-- Device #0: Intel(R) UHD Graphics 770
```

> [!Important]
> Ensure that **Resizable BAR** is enabled in your system's BIOS before proceeding. This is essential for optimal GPU performance and avoids issues such as `Bus error (core dumped)`. For detailed steps, please refer to the official guidance [here](https://www.intel.com/content/www/us/en/support/articles/000090831/graphics.html).

### 2. Set Up Conda Environment
We recommend using Miniconda3/Anaconda3 for environment management:

```bash
# Download Miniconda
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh

# Create environment
conda create --name ktransformers python=3.11
conda activate ktransformers

# Install required libraries
conda install -c conda-forge libstdcxx-ng

# Verify GLIBCXX version (should include 3.4.32)
strings ~/anaconda3/envs/ktransformers/lib/libstdc++.so.6 | grep GLIBCXX
```

> **Note:** Adjust the Anaconda path if your installation directory differs from `~/anaconda3`.

### 3. Install PyTorch and IPEX-LLM
Install PyTorch with the XPU backend and [IPEX-LLM](https://github.com/intel/ipex-llm):

```bash
pip install ipex-llm[xpu_2.6]==2.3.0b20250518 --extra-index-url https://download.pytorch.org/whl/xpu
pip uninstall torch torchvision torchaudio
pip install torch==2.7+xpu torchvision torchaudio --index-url https://download.pytorch.org/whl/test/xpu  # install torch 2.7
pip uninstall intel-opencl-rt dpcpp-cpp-rt
```

### 4. Build ktransformers

```bash
# Clone repository
git clone https://github.com/kvcache-ai/ktransformers.git
cd ktransformers
git submodule update --init

# Install dependencies
bash install.sh --dev xpu
```

## Running DeepSeek-R1 Models

### Configuration for 16GB VRAM GPUs
Use our optimized configuration for constrained VRAM:

```bash
export SYCL_CACHE_PERSISTENT=1
export ONEAPI_DEVICE_SELECTOR=level_zero:0
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1

python ktransformers/local_chat.py \
    --model_path deepseek-ai/DeepSeek-R1 \
    --gguf_path <path_to_gguf_files> \
    --optimize_config_path ktransformers/optimize/optimize_rules/xpu/DeepSeek-V3-Chat.yaml \
    --cpu_infer <cpu_cores + 1> \
    --device xpu \
    --max_new_tokens 200
```

## Known Limitations
- The serving function is not supported on the Intel GPU platform for now.

## Troubleshooting
1. Best Known Config (BKC) for best performance

   To obtain the best performance on the Intel GPU platform, we recommend locking the GPU frequency and setting the CPU to performance mode with the settings below.
   ```bash
   echo "performance" | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
   echo 0 | sudo tee /sys/devices/system/cpu/cpu*/power/energy_perf_bias
   # 2400 is the max frequency for Arc A770
   sudo xpu-smi config -d 0 -t 0 --frequencyrange 2400,2400
   # 2850 is the max frequency for Arc B580
   # sudo xpu-smi config -d 0 -t 0 --frequencyrange 2850,2850
   ```

2. Runtime error like `xpu/sycl/TensorCompareKernels.cpp:163: xxx. Aborted (core dumped)`

   This error is mostly related to the GPU driver. If you hit it, update `intel-level-zero-gpu` to `1.3.29735.27-914~22.04` (a version verified by us) with the commands below.
   ```bash
   wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | \
     sudo gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg
   echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy client" | \
     sudo tee /etc/apt/sources.list.d/intel-gpu-jammy.list
   sudo apt update
   # or: sudo apt update --allow-insecure-repositories
   sudo apt install intel-level-zero-gpu=1.3.29735.27-914~22.04
   ```

3. `ImportError: cannot import name 'intel' from 'triton._C.libtriton'`

   Installing Triton causes pytorch-triton-xpu to stop working. You can resolve the issue with the following commands:
   ```bash
   pip uninstall triton pytorch-triton-xpu
   # Reinstall the correct version of pytorch-triton-xpu
   pip install pytorch-triton-xpu==3.3.0 --index-url https://download.pytorch.org/whl/xpu
   ```

4. `ValueError: Unsupported backend: CUDA_HOME ROCM_HOME MUSA_HOME are not set and XPU is not available.`

   Ensure you have permission to access `/dev/dri/renderD*`. This typically requires your user to be in the `render` group:
   ```bash
   sudo gpasswd -a ${USER} render
   newgrp render
   ```

## Additional Information
To run KTransformers on XPU with Docker, please refer to [Docker_xpu.md](./Docker_xpu.md).
install-with-cache.sh — new executable file (26 lines)
@@ -0,0 +1,26 @@
#!/bin/bash
set -e

# clear build dirs
# rm -rf build
# rm -rf *.egg-info
# rm -rf csrc/build
# rm -rf csrc/ktransformers_ext/build
# rm -rf csrc/ktransformers_ext/cuda/build
# rm -rf csrc/ktransformers_ext/cuda/dist
# rm -rf csrc/ktransformers_ext/cuda/*.egg-info
rm -rf ~/.ktransformers
echo "Installing python dependencies from requirements.txt"
pip install -r requirements-local_chat.txt
pip install -r ktransformers/server/requirements.txt
echo "Installing ktransformers"
KTRANSFORMERS_FORCE_BUILD=TRUE USE_BALANCE_SERVE=1 pip install -v . --no-build-isolation
pip install third_party/custom_flashinfer/ -v

# SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
# echo "Copying thirdparty libs to $SITE_PACKAGES"
# cp -a csrc/balance_serve/build/third_party/prometheus-cpp/lib/libprometheus-cpp-*.so* $SITE_PACKAGES/
# patchelf --set-rpath '$ORIGIN' $SITE_PACKAGES/sched_ext.cpython*


echo "Installation completed successfully"
install.sh — 21 lines changed

@@ -1,6 +1,20 @@
#!/bin/bash
set -e

+# default backend
+DEV="cuda"
+
+# parse --dev argument
+while [[ "$#" -gt 0 ]]; do
+    case $1 in
+        --dev) DEV="$2"; shift ;;
+        *) echo "Unknown parameter passed: $1"; exit 1 ;;
+    esac
+    shift
+done
+export DEV_BACKEND="$DEV"
+echo "Selected backend: $DEV_BACKEND"
+
# clear build dirs
rm -rf build
rm -rf *.egg-info

@@ -13,14 +27,17 @@ rm -rf ~/.ktransformers
echo "Installing python dependencies from requirements.txt"
pip install -r requirements-local_chat.txt
pip install -r ktransformers/server/requirements.txt

echo "Installing ktransformers"
KTRANSFORMERS_FORCE_BUILD=TRUE pip install -v . --no-build-isolation
-pip install third_party/custom_flashinfer/
+if [[ "$DEV_BACKEND" == "cuda" ]]; then
+    echo "Installing custom_flashinfer for CUDA backend"
+    pip install third_party/custom_flashinfer/
+fi
# SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
# echo "Copying thirdparty libs to $SITE_PACKAGES"
# cp -a csrc/balance_serve/build/third_party/prometheus-cpp/lib/libprometheus-cpp-*.so* $SITE_PACKAGES/
# patchelf --set-rpath '$ORIGIN' $SITE_PACKAGES/sched_ext.cpython*


echo "Installation completed successfully"

@@ -8,4 +8,4 @@ Version : 1.0.0
LastEditors : chenxl
LastEditTime : 2025-02-15 03:53:02
'''
-__version__ = "0.3"
+__version__ = "0.3.1"
@@ -29,7 +29,7 @@ model:
  gguf_path: ./DeepSeek-V2-Lite-Chat-GGUF

device: cuda:0
-cache_lens: 8192
+cache_lens: 16384
max_new_tokens: 500
web:
  mount: False
@@ -28,7 +28,7 @@ from ktransformers.models.modeling_qwen2_moe import Qwen2MoeForCausalLM
from ktransformers.models.modeling_deepseek_v3 import DeepseekV3ForCausalLM
from ktransformers.models.modeling_llama import LlamaForCausalLM
from ktransformers.models.modeling_mixtral import MixtralForCausalLM
-from ktransformers.util.utils import prefill_and_generate, get_compute_capability
+from ktransformers.util.utils import prefill_and_generate, get_compute_capability, xpu_fp16_model
from ktransformers.server.config.config import Config
from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled
from ktransformers.util.vendors import device_manager, get_device, to_device, GPUVendor

@@ -63,18 +63,24 @@ def local_chat(
    prompt_file : str | None = None,
    mode: str = "normal",
    force_think: bool = False,
-    chunk_size: int = 8192
+    chunk_size: int = 8192,
+    device: str = "cuda"
):

    torch.set_grad_enabled(False)

    Config().cpu_infer = cpu_infer
+    Config().chunk_size = chunk_size
+    if torch.xpu.is_available():
+        use_cuda_graph = False

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    if mode == 'long_context':
        assert config.architectures[0] == "LlamaForCausalLM", "only LlamaForCausalLM support long_context mode"
        torch.set_default_dtype(torch.float16)
+    elif xpu_fp16_model(config):
+        torch.set_default_dtype(torch.float16)
    else:
        torch.set_default_dtype(config.torch_dtype)

@@ -89,11 +95,16 @@ def local_chat(
            config._attn_implementation = "eager"
        if "Mixtral" in config.architectures[0]:
            config._attn_implementation = "flash_attention_2"

+        if torch.xpu.is_available():
+            config._attn_implementation = "eager"
        model = custom_models[config.architectures[0]](config)
    else:
+        if torch.xpu.is_available():
+            attn_implementation = "eager"
+        else:
+            attn_implementation = "flash_attention_2"
        model = AutoModelForCausalLM.from_config(
-            config, trust_remote_code=True, attn_implementation="flash_attention_2"
+            config, trust_remote_code=True, attn_implementation=attn_implementation
        )

    if optimize_config_path is None:

@@ -109,7 +120,7 @@ def local_chat(
        gguf_path = input(
            "please input the path of your gguf file(gguf file in the dir containing input gguf file must all belong to current model):"
        )
-    optimize_and_load_gguf(model, optimize_config_path, gguf_path, config)
+    optimize_and_load_gguf(model, optimize_config_path, gguf_path, config, default_device=device)

    try:
        model.generation_config = GenerationConfig.from_pretrained(model_path)

@@ -172,12 +183,12 @@ def local_chat(

    if system != "Windows" and (config.architectures[0] == "DeepseekV2ForCausalLM" or config.architectures[0] == "DeepseekV3ForCausalLM") and flashinfer_enabled and get_compute_capability() >= 8 and device_manager.gpu_vendor == GPUVendor.NVIDIA:
        generated = prefill_and_generate(
-            model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode = mode, force_think = force_think, chunk_size = chunk_size,
+            model, tokenizer, input_tensor.to(device), max_new_tokens, use_cuda_graph, mode = mode, force_think = force_think, chunk_size = chunk_size,
            use_flashinfer_mla = True, num_heads = config.num_attention_heads, head_dim_ckv = config.kv_lora_rank, head_dim_kpe = config.qk_rope_head_dim, q_head_dim = config.qk_rope_head_dim + config.qk_nope_head_dim
        )
    else:
        generated = prefill_and_generate(
-            model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode = mode, force_think = force_think, chunk_size = chunk_size,
+            model, tokenizer, input_tensor.to(device), max_new_tokens, use_cuda_graph, mode = mode, force_think = force_think, chunk_size = chunk_size,
        )
@@ -66,7 +66,7 @@ class StaticCache(transformers.StaticCache):
        self.page_table_list = []
        for idx in range(config.num_hidden_layers):
            if isinstance(device, dict):
-                target_device = device[f"blk.{idx}.self_attn"]["generate_device"]
+                target_device = device[f"model.layers.{idx}.self_attn"]["generate_device"]
            else:
                target_device = device

@@ -91,7 +91,7 @@ class StaticCache(transformers.StaticCache):
        # Note: `mark_static_address` is used to tag the cache as a fixed data pointer, preventing cuda graph
        # breaks when updating the cache.
        if isinstance(device, dict):
-            target_device = device[f"blk.{idx}.self_attn"]["generate_device"]
+            target_device = device[f"model.layers.{idx}.self_attn"]["generate_device"]
        else:
            target_device = device

@@ -213,7 +213,7 @@ class KDeepSeekV3Cache(nn.Module):
        self.v_caches = []


-    def load(self, inference_context: "sched_ext.InferenceContext"):
+    def load(self, inference_context: "sched_ext.InferenceContext"):

        for i in range(self.config.num_hidden_layers):
            self.k_caches.append(

@@ -293,7 +293,7 @@ class KGQACache(nn.Module):
        self.v_caches = []


-    def load(self, inference_context: sched_ext.InferenceContext):
+    def load(self, inference_context: "sched_ext.InferenceContext"):
        print(self.config.num_hidden_layers)
        for i in range(self.config.num_hidden_layers):
            self.k_caches.append(
@@ -39,7 +39,7 @@ class KQwen2MoeForCausalLM(Qwen2MoePreTrainedModel):
        self.cache = cache
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-        self.attn = [None] * 10
+        self.attn = [None] * 100

    def init_wrapper(self, use_cuda_graph, device, max_batch_token, max_batch_size, max_pages, cuda_graph_idx = 0):
        self.attn[cuda_graph_idx] = flashInferAttn(use_cuda_graph=use_cuda_graph, max_batch_token=max_batch_token, max_batch_size=max_batch_size, max_pages=max_pages, device=device)

@@ -39,7 +39,7 @@ class KQwen3MoeForCausalLM(Qwen3MoePreTrainedModel):
        self.cache = cache
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-        self.attn = [None] * 10
+        self.attn = [None] * 100

    def init_wrapper(self, use_cuda_graph, device, max_batch_token, max_batch_size, max_pages, cuda_graph_idx = 0):
        self.attn[cuda_graph_idx] = flashInferAttn(use_cuda_graph=use_cuda_graph, max_batch_token=max_batch_token, max_batch_size=max_batch_size, max_pages=max_pages, device=device)
@@ -107,6 +107,7 @@ class DeepseekV2RMSNorm(nn.Module):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps
+        self.hidden_size = hidden_size

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
@@ -30,6 +30,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache, StaticCache
+from transformers.generation import GenerationMixin
from transformers.modeling_attn_mask_utils import (
    AttentionMaskConverter,
    _prepare_4d_attention_mask,

@@ -1598,7 +1599,7 @@ class DeepseekV3Model(DeepseekV3PreTrainedModel):

        return causal_mask

-class DeepseekV3ForCausalLM(DeepseekV3PreTrainedModel):
+class DeepseekV3ForCausalLM(DeepseekV3PreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
@@ -23,7 +23,7 @@ from ktransformers.models.modeling_deepseek import (
    yarn_find_correction_range
)
from ktransformers.operators.base_operator import BaseInjectedModule
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
from ktransformers.util.utils import InferenceState
from transformers.configuration_utils import PretrainedConfig
import torch
@@ -13,9 +13,10 @@ from ktransformers.models.configuration_deepseek import DeepseekV2Config
from ktransformers.models.configuration_llama import LlamaConfig
from ktransformers.models.modeling_llama import LlamaRotaryEmbedding
from ktransformers.models.modeling_deepseek import DeepseekV2Attention, apply_rotary_pos_emb
+from ktransformers.models.modeling_qwen3_moe import Qwen3MoeAttention
from typing import Optional, Tuple
from ktransformers.operators.base_operator import BaseInjectedModule
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
from ktransformers.util.utils import get_compute_capability
import logging
from transformers.configuration_utils import PretrainedConfig
@ -587,6 +588,100 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
|
|||
|
||||
return attn_output, None, past_key_value
|
||||
|
||||
def forward_xpu(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.LongTensor] = None,
|
||||
past_key_value: Optional[Cache] = None,
|
||||
output_attentions: bool = False,
|
||||
use_cache: bool = False,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
**kwargs,
|
||||
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
||||
if "padding_mask" in kwargs:
|
||||
warnings.warn(
|
||||
"Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
|
||||
)
|
||||
bsz, q_len, _ = hidden_states.size()
|
||||
|
||||
if self.q_lora_rank is None:
|
||||
q = self.q_proj(hidden_states)
|
||||
else:
|
||||
q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
|
||||
query_states = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
|
||||
|
||||
compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
|
||||
compressed_kv, k_pe = torch.split(
|
||||
compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
|
||||
)
|
||||
k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
|
||||
kv = (
|
||||
self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
|
||||
.view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
|
||||
.transpose(1, 2)
|
||||
)
|
||||
|
||||
k_nope, value_states = torch.split(
|
||||
kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1
|
||||
)
|
||||
kv_seq_len = value_states.shape[-2]
|
||||
if past_key_value is not None:
|
||||
if self.layer_idx is None:
|
||||
raise ValueError(
|
||||
f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
|
||||
"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
|
||||
"with a layer index."
|
||||
)
|
||||
kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
|
||||
|
||||
position_embeddings = kwargs.get("position_embeddings", None)
|
||||
if position_embeddings is not None:
|
||||
cos, sin = position_embeddings
|
||||
key_states = torch.cat(
|
||||
[k_nope, k_pe.expand([-1, self.num_heads, -1, -1])],
|
||||
dim=-1
|
||||
)
|
||||
from ipex_llm.transformers.models.common import rotary_two_with_cache_inplaced
|
||||
rotary_two_with_cache_inplaced(query_states[:, :, :, self.qk_nope_head_dim :],
|
||||
key_states[:, :, :, self.qk_nope_head_dim:],
|
||||
cos, sin, True)
|
||||
else:
|
||||
q_nope, q_pe = torch.split(
|
||||
query_states, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
|
||||
)
|
||||
cos, sin = self.rotary_emb(q_pe, position_ids)
|
||||
q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin)
|
||||
query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
|
||||
query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
|
||||
query_states[:, :, :, self.qk_nope_head_dim :] = q_pe
|
||||
|
||||
key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
|
||||
key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
|
||||
key_states[:, :, :, self.qk_nope_head_dim :] = k_pe
|
||||
|
||||
if past_key_value is not None:
|
||||
cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
|
||||
key_states, value_states = past_key_value.update(
|
||||
key_states.half(), value_states.half(), self.layer_idx, cache_kwargs
|
||||
)
|
||||
|
||||
attn_weights = None
|
||||
from ipex_llm.transformers.models.common import scaled_dot_product_attention
|
||||
attn_output = scaled_dot_product_attention(
|
||||
query_states.half(), key_states, value_states,
|
||||
attention_mask.half(), q_len == kv_seq_len, self.softmax_scale
|
||||
)
|
||||
|
||||
attn_output = attn_output.transpose(1, 2).contiguous()
|
||||
attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)
|
||||
attn_output = self.o_proj(attn_output).to(hidden_states.dtype)
|
||||
|
||||
if not output_attentions:
|
||||
attn_weights = None
|
||||
|
||||
return attn_output, attn_weights, past_key_value
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
|
@@ -598,7 +693,21 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        if os.name == 'nt' or get_compute_capability()<8 or device_manager.gpu_vendor != GPUVendor.NVIDIA:
+        if torch.xpu.is_available():
+            return self.forward_xpu(
+                hidden_states,
+                attention_mask,
+                position_ids,
+                past_key_value,
+                output_attentions,
+                use_cache,
+                cache_position,
+                **kwargs,
+            )
+        elif (os.name == 'nt'
+              or get_compute_capability() < 8
+              or hidden_states.device.type == 'cpu'
+              or device_manager.gpu_vendor != GPUVendor.NVIDIA):
            return self.forward_windows(
                hidden_states,
                attention_mask,
@ -762,3 +871,75 @@ class KLlamaAttention(BaseInjectedModule):
|
|||
attn_weights = None
|
||||
|
||||
return attn_output, attn_weights, past_key_value
|
||||
|
||||
|
||||
class KQwen3MoeAttentionIPEXLLM(BaseInjectedModule, Qwen3MoeAttention):
|
||||
def __init__(self,
|
||||
key: str,
|
||||
gguf_loader : GGUFLoader,
|
||||
config: PretrainedConfig,
|
||||
orig_module: nn.Module,
|
||||
prefill_device: str = "xpu",
|
||||
generate_device: str = "xpu",
|
||||
chunck_size: int = 1000,
|
||||
**kwargs):
|
||||
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
|
||||
self.orig_module.__init__(orig_module.config,
|
||||
orig_module.layer_idx)
|
||||
self.chunck_size = chunck_size # TODO, generate chunck_size automatically.
|
||||
assert prefill_device.lower()[:3] == "xpu", "KQwen3MoeAttentionIPEXLLM only supports XPU device"
|
||||
assert generate_device.lower()[:3] == "xpu", "KQwen3MoeAttentionIPEXLLM only supports XPU device"
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
position_ids: Optional[torch.Tensor],
|
||||
position_embeddings: Tuple[torch.Tensor, torch.Tensor],
|
||||
attention_mask: Optional[torch.Tensor],
|
||||
past_key_value: Optional[Cache] = None,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
**kwargs
|
||||
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
||||
input_shape = hidden_states.shape[:-1]
|
||||
bsz, q_len, _ = hidden_states.size()
|
||||
input_dtype = hidden_states.dtype
|
||||
hidden_shape = (*input_shape, -1, self.head_dim)
|
||||
|
||||
if not hasattr(self, 'qkv_proj'):
|
||||
from ipex_llm.transformers.models.common import merge_quantized_qkv
|
||||
merge_quantized_qkv(self.q_proj.generate_linear, self.k_proj.generate_linear, self.v_proj.generate_linear, self.orig_module)
|
||||
|
||||
qkv = self.qkv_proj(hidden_states)
|
||||
qkv = qkv.view(bsz, q_len, -1, self.head_dim)
|
||||
qkv = qkv.transpose(1, 2)
|
||||
query_states, key_states, value_states = qkv.split([self.config.num_attention_heads,
|
||||
self.config.num_key_value_heads,
|
||||
self.config.num_key_value_heads], dim=1)
|
||||
query_states = self.q_norm(query_states)
|
||||
key_states = self.k_norm(key_states)
|
||||
|
||||
if position_embeddings is None:
|
||||
position_embeddings = self.rotary_emb(hidden_states, position_ids)
|
||||
|
||||
cos, sin = position_embeddings
|
||||
|
||||
from ipex_llm.transformers.models.common import rotary_half_with_cache_inplaced
|
||||
rotary_half_with_cache_inplaced(query_states, key_states, cos, sin)
|
||||
|
||||
if past_key_value is not None:
|
||||
# sin and cos are specific to RoPE models; cache_position needed for the static cache
|
||||
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
|
||||
key_states, value_states = past_key_value.update(key_states.half(), value_states.half(),
|
||||
self.layer_idx, cache_kwargs)
|
||||
|
||||
attn_weights = None
|
||||
from ipex_llm.transformers.models.common import scaled_dot_product_attention
|
||||
attn_output = scaled_dot_product_attention(
|
||||
query_states.half(), key_states, value_states,
|
||||
attention_mask.half(), q_len == key_states.size(2), self.scaling
|
||||
)
|
||||
attn_output = attn_output.transpose(1, 2).contiguous()
|
||||
|
||||
attn_output = attn_output.reshape(*input_shape, -1).contiguous()
|
||||
attn_output = self.o_proj(attn_output).to(input_dtype)
|
||||
return attn_output, attn_weights
|
||||
|
|
|
@@ -11,7 +11,7 @@ from ktransformers.models.modeling_qwen2_moe import Qwen2MoeAttention
from ktransformers.models.modeling_qwen3_moe import Qwen3MoeAttention
from typing import Optional, Tuple
from ktransformers.operators.base_operator import BaseInjectedModule
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
import logging
from transformers.configuration_utils import PretrainedConfig
from flashinfer import BatchMLAPagedAttentionWrapper
@ -288,3 +288,170 @@ class KQwen3MoeAttention(BaseInjectedModule, Qwen3MoeAttention):
|
|||
attn_output = self.o_proj(attn_output.view(q_len, self.num_heads * self.head_dim), bsz_tensors)
|
||||
|
||||
return attn_output
|
||||
|
||||
|
||||
class deepseek_torch_attn(BaseInjectedModule, DeepseekV2Attention):
|
||||
def __init__(self,
|
||||
key: str,
|
||||
gguf_loader : GGUFLoader,
|
||||
config: PretrainedConfig,
|
||||
orig_module: nn.Module,
|
||||
prefill_device: str = "cuda",
|
||||
generate_device: str = "cuda",
|
||||
chunck_size: int = 1000,
|
||||
**kwargs):
|
||||
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
|
||||
self.orig_module.__init__(orig_module.config,
|
||||
orig_module.layer_idx)
|
||||
self.chunck_size = chunck_size # TODO, generate chunck_size automatically.
|
||||
|
||||
|
||||
def get_absorbed(self) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
if not (hasattr(self, 'q_absorb') and hasattr(self, 'out_absorb')):
|
||||
kv_b_proj = self.kv_b_proj.weight.view(self.num_heads, -1, self.kv_lora_rank)
|
||||
q_absorb = kv_b_proj[:, :self.qk_nope_head_dim, :].reshape(-1, self.kv_lora_rank)
|
||||
out_absorb = kv_b_proj[:, self.qk_nope_head_dim:, :].reshape(-1, self.kv_lora_rank)
|
||||
self.q_absorb = nn.Linear(self.kv_lora_rank, self.num_heads * self.qk_nope_head_dim,
|
||||
bias=False, dtype=q_absorb.dtype, device=q_absorb.device)
|
||||
self.q_absorb.weight.data = q_absorb
|
||||
self.out_absorb = nn.Linear(self.kv_lora_rank, self.num_heads * self.v_head_dim,
|
||||
bias=False, dtype=out_absorb.dtype, device=out_absorb.device)
|
||||
self.out_absorb.weight.data = out_absorb
|
||||
#del self.orig_module.kv_b_proj
|
||||
q_absorb = self.q_absorb.weight.view(self.num_heads, self.qk_nope_head_dim, self.kv_lora_rank)
|
||||
out_absorb = self.out_absorb.weight.view(self.num_heads, self.v_head_dim, self.kv_lora_rank)
|
||||
return q_absorb, out_absorb
|
||||
|
||||
|
||||
|
||||
def forward(self,
|
||||
hidden_states: torch.Tensor,
|
||||
kv_cache: KDeepSeekV3Cache,
|
||||
position_ids: torch.Tensor,
|
||||
wrapper: None,
|
||||
num_tokens_tensors: torch.Tensor,
|
||||
page_idx: torch.Tensor,
|
||||
page_offset: torch.Tensor,
|
||||
attention_masks: Optional[list[torch.Tensor]] = None,
|
||||
q_indptr: Optional[torch.Tensor] = None,
|
||||
kv_indices: Optional[torch.Tensor] = None,
|
||||
kv_indptr: Optional[torch.Tensor] = None,
|
||||
bsz_tensors: Optional[torch.Tensor] = None,
|
||||
last_page_len: Optional[torch.Tensor] = None,
|
||||
):
|
||||
# range bsz_tensors
|
||||
final_attention_output = torch.tensor([], device=hidden_states.device)
|
||||
for i in range(bsz_tensors[0]):
|
||||
batch_num_tokens_tensors = q_indptr[i+1] - q_indptr[i]
|
||||
batch_last_page_len = last_page_len[i]
|
||||
# kv_total_len is kv_len, batch_compressed_kv is compressed_kv, batch_k_pe is k_pe
|
||||
batch_page_idx = page_idx[q_indptr[i]:q_indptr[i+1]]
|
||||
batch_page_offset = page_offset[q_indptr[i]:q_indptr[i+1]]
|
||||
# kv_page_nums is the number of pages for the current batch
|
||||
kv_page_nums = kv_indptr[i+1] - kv_indptr[i]
|
||||
# kv_total_len is the total length of the kv cache for the current batch (kv_len for algorithm)
|
||||
kv_total_len = kv_page_nums * kv_cache.page_size
|
||||
if batch_last_page_len is not None:
|
||||
kv_total_len = kv_total_len - (kv_cache.page_size - batch_last_page_len)
|
||||
# print(f"kv_total_len's shape {kv_total_len.shape}")
|
||||
# kv_index is the index of the kv cache pages for the current batch
|
||||
kv_index = kv_indices[kv_indptr[i]:kv_indptr[i+1]]
|
||||
# we can index [kv_index, page_offset_indices] to get the kv cache for the current batch
|
||||
# from q_indptr[i] to q_indptr[i+1] is the range of the current batch
|
||||
batch_hidden_states = hidden_states[q_indptr[i]:q_indptr[i+1]]
|
||||
batch_position_ids = position_ids[q_indptr[i]:q_indptr[i+1]]
|
||||
q_len, _ = batch_hidden_states.size()
|
||||
# print("q_len -> ", q_len)
|
||||
|
||||
if self.q_lora_rank is None:
|
||||
q = self.q_proj(batch_hidden_states, batch_num_tokens_tensors)
|
||||
else:
|
||||
q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(batch_hidden_states, batch_num_tokens_tensors), batch_num_tokens_tensors), batch_num_tokens_tensors)
|
||||
# for v3, bsz, q_len, num_heads(128), qk_head_dim(192=128(nope)+64(rope))
|
||||
q = q.view(q_len, self.num_heads, self.q_head_dim)
|
||||
# q_nope is [q_len, num_heads(128), qk_nope_head_dim(128)]
|
||||
# q_pe is [q_len, num_heads(128), qk_rope_head_dim(64)]
|
||||
q_nope, q_pe = torch.split(
|
||||
q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
|
||||
)
|
||||
# compressed_kv is [q_len, kv_lora_rank(512) + rope(64)]
|
||||
compressed_kv = self.kv_a_proj_with_mqa(batch_hidden_states, batch_num_tokens_tensors)
|
||||
# compressed_kv is [q_len, kv_lora_rank(512)], k_pe is [q_len, rope(64)]
|
||||
compressed_kv, k_pe = torch.split(
|
||||
compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
|
||||
)
|
||||
compressed_kv = compressed_kv.contiguous()
|
||||
compressed_kv = self.kv_a_layernorm(compressed_kv, batch_num_tokens_tensors)
|
||||
# k_pe is [q_len, 1, qk_rope_head_dim(64)]
|
||||
k_pe = k_pe.view(q_len, 1, self.qk_rope_head_dim)
|
||||
# compressed_kv is [q_len, 1, kv_lora_rank(512)]
|
||||
compressed_kv = compressed_kv.view(q_len, 1, self.kv_lora_rank)
|
||||
|
||||
cos, sin = self.rotary_emb(q_pe, batch_position_ids.unsqueeze(0))
|
||||
# print(f"q_pe shape{q_pe.shape}, k_pe shape {k_pe.shape}")
|
||||
q_pe, k_pe = apply_rotary_pos_emb(q_pe.unsqueeze(0), k_pe.unsqueeze(0), cos, sin, unsqueeze_dim=2)
|
||||
q_pe = q_pe.squeeze(0)
|
||||
# q_pe is [num_heads(128), q_len, qk_rope_head_dim(64)]
|
||||
q_pe.transpose_(0, 1)
|
||||
if kv_cache is not None:
|
||||
cache_kwargs = {"sin": sin, "cos": cos, "page_idx": batch_page_idx, "page_offset": batch_page_offset} # Specific to RoPE models
|
||||
compressed_kv_with_k_pe = kv_cache.update(compressed_kv.unsqueeze(0), k_pe, self.layer_idx, batch_page_idx, batch_page_offset, cache_kwargs)
|
||||
compressed_kv = compressed_kv_with_k_pe [:, :, :, :self.kv_lora_rank].view(-1, kv_cache.page_size, self.kv_lora_rank)
|
||||
k_pe = compressed_kv_with_k_pe [:, :, :, self.kv_lora_rank:].view(-1, kv_cache.page_size, self.qk_rope_head_dim)
|
||||
# q_absorb is [num_heads(128), qk_nope_head_dim(128), kv_lora_rank(512)]
|
||||
# out_absorb is [num_heads(128), kv_lora_rank(512), v_head_dim(128)] v_head_dim is also the nope dim
|
||||
q_absorb, out_absorb = self.get_absorbed()
|
||||
# q_nope is [num_heads(128), q_len, qk_nope_head_dim(128)]
|
||||
q_nope = q_nope.transpose(0, 1) # q_len is 1, no GPU overhead, same below
|
||||
# q_nope is [num_heads(128), q_len, kv_lora_rank(512)]
|
||||
q_nope = torch.matmul(q_nope, q_absorb) # batched MM
|
||||
|
||||
# # q_nope is [q_len, num_heads(128), kv_lora_rank(512)]
|
||||
# q_nope = q_nope.transpose(0, 1)
|
||||
|
||||
# we need to index out the compressed_kv and k_pe for the current batch
|
||||
batch_compressed_kv = None
|
||||
batch_k_pe = None
|
||||
for page_index in kv_index:
|
||||
if kv_total_len > kv_cache.page_size:
|
||||
tmp_compressed_kv = compressed_kv[page_index, 0:kv_cache.page_size, :]
|
||||
tmp_k_pe = k_pe[page_index, 0:kv_cache.page_size, :]
|
||||
if batch_compressed_kv is None or batch_k_pe is None:
|
||||
batch_compressed_kv = tmp_compressed_kv
|
||||
batch_k_pe = tmp_k_pe
|
||||
else:
|
||||
batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0)
|
||||
batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0)
|
||||
kv_total_len -= kv_cache.page_size
|
||||
else:
|
||||
tmp_compressed_kv = compressed_kv[page_index, 0:kv_total_len, :]
|
||||
tmp_k_pe = k_pe[page_index, 0:kv_total_len, :]
|
||||
if batch_compressed_kv is None or batch_k_pe is None:
|
||||
batch_compressed_kv = tmp_compressed_kv
|
||||
batch_k_pe = tmp_k_pe
|
||||
else:
|
||||
batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0)
|
||||
batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0)
|
||||
break
|
||||
# batch_compressed_kv is [kv_total_len(k_len), kv_lora_rank(512)]
|
||||
# batch_k_pe is [kv_total_len(k_len), qk_rope_head_dim(64)]
|
||||
attention_weights = (torch.matmul(q_pe,batch_k_pe.mT) + torch.matmul(q_nope, batch_compressed_kv.mT)) * self.softmax_scale
|
||||
# attention_weights is [num_heads(128), q_len, k_len]
|
||||
|
||||
# attention_weights = attention_weights.transpose(0,1).unsqueeze(0).squeeze(-1).expand(q_len,-1,-1).transpose(0,1)
|
||||
|
||||
# attention_masks[i] is [q_len, k_len]
|
||||
|
||||
attention_weights = (attention_weights + attention_masks[i])
|
||||
# attention_weights shape is [num_heads(128), q_len, k_len]
|
||||
attention_weights = nn.functional.softmax(attention_weights,dim=-1,dtype=torch.float32).to(q_pe.dtype)
|
||||
attn_output = torch.matmul(attention_weights, batch_compressed_kv) # [num_heads(128),q_len, lora_rank(512)]
|
||||
# out_absorb shape is [num_heads(128), kv_lora_rank(512), v_head_dim(128)]
|
||||
out_absorb = out_absorb.transpose(1,2)
|
||||
# q for q_len, n for num_heads, h for v_head_dim, v for kv_lora_rank
|
||||
attn_output = torch.matmul(attn_output, out_absorb) # [num_heads(128), q_len, v_head_dim(128)]
|
||||
attn_output = attn_output.transpose(0, 1) # [q_len, num_heads(128), v_head_dim(128)]
|
||||
attn_output = attn_output.reshape(q_len, self.num_heads * self.v_head_dim)
|
||||
attn_output = self.o_proj(attn_output, batch_num_tokens_tensors)
|
||||
final_attention_output = torch.cat((final_attention_output, attn_output), dim=0)
|
||||
return final_attention_output
|
|
@@ -6,7 +6,7 @@ Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
from typing import Any
from torch import nn, Tensor
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
from transformers.configuration_utils import PretrainedConfig
import ktransformers.util.utils as utils
class BaseInjectedModule(nn.Module):
@@ -26,7 +26,8 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext
import cpuinfer_ext
from cpuinfer_ext.moe import MOEConfig, MOE
import ctypes
-from ktransformers.util.custom_gguf import GGMLQuantizationType, GGUFLoader
+from ktransformers.util.custom_gguf import GGMLQuantizationType
+from ktransformers.util.custom_loader import GGUFLoader, SafeTensorLoader, ModelLoader
from ktransformers.util.utils import InferenceState
from ktransformers.server.config.config import Config
from transformers.activations import ACT2FN

@@ -39,8 +40,21 @@ from ktransformers.operators.cpuinfer import CPUInfer

def deduplicate_and_sort(lst):
    return sorted(set(lst))
+def generate_cuda_graphs(chunk_size: int) -> list:
+    assert chunk_size <= 1024 or chunk_size % 1024 == 0, "chunk_size must <= 1024 or a multiple of 1024"
+    base_list = [1, 2, 3, Config().max_batch_size, 64, 256, 512, chunk_size]
+
+    if chunk_size <= 1024:
+        return deduplicate_and_sort(base_list)
+
+    multiples = [i for i in range(1024, chunk_size + 1, 1024)]
+
+    return deduplicate_and_sort(base_list + multiples)
#cuda_graphs = [Config().chunk_size]
-cuda_graphs = deduplicate_and_sort([1, 2, 3, Config().max_batch_size, 64, Config().chunk_size])
+if torch.cuda.is_available():
+    cuda_graphs = generate_cuda_graphs(Config().chunk_size)
+else:
+    cuda_graphs = 1
# class Base(BaseInjectedModule, ABC):
class KExpertsBase(ABC):
    def __init__(self, key: str, gguf_loader: GGUFLoader, config: PretrainedConfig, orig_module: nn.Module, device: str = "cuda", **kwargs):
@ -77,7 +91,7 @@ class KExpertsBase(ABC):
|
|||
down_type = None
|
||||
|
||||
for key in keys:
|
||||
if key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
|
||||
if self.gguf_loader.has_tensor(key + ".ffn_gate_exps.weight"):
|
||||
targets = [".ffn_gate_exps.weight", ".ffn_up_exps.weight", ".ffn_down_exps.weight" ]
|
||||
tensors = self.load_multi(key, targets, device=device)
|
||||
gate = tensors[".ffn_gate_exps.weight"]
|
||||
|
@ -86,7 +100,7 @@ class KExpertsBase(ABC):
|
|||
gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate_exps.weight"]["ggml_type"]
|
||||
up_type = self.gguf_loader.tensor_info[key + ".ffn_up_exps.weight"]["ggml_type"]
|
||||
down_type = self.gguf_loader.tensor_info[key + ".ffn_down_exps.weight"]["ggml_type"]
|
||||
elif key + ".ffn_down.0.weight" in self.gguf_loader.tensor_info:
|
||||
elif self.gguf_loader.has_tensor(key + ".ffn_down.0.weight"):
|
||||
# for supporting Mixtral-8x7B-Instuct
|
||||
gate = []
|
||||
up = []
|
||||
|
@ -166,6 +180,11 @@ class KExpertsCPU(KExpertsBase):
|
|||
n_routed_experts = self.n_routed_experts
|
||||
self.cpu_infer = KExpertsCPU.CPU_INFER
|
||||
# n_routed_experts = len(self.orig_module)
|
||||
model_dtype = torch.get_default_dtype()
|
||||
if torch.xpu.is_available() and model_dtype == torch.float16:
|
||||
hidden_type = 1 # fp16
|
||||
else:
|
||||
hidden_type = 30 # bf16
|
||||
if self.backend == "llamafile":
|
||||
moe_config = MOEConfig(
|
||||
n_routed_experts,
|
||||
|
@ -181,7 +200,7 @@ class KExpertsCPU(KExpertsBase):
|
|||
self.gate_type,
|
||||
self.up_type,
|
||||
self.down_type,
|
||||
30, # TODO: get from model.dtype
|
||||
hidden_type, # TODO: get from model.dtype
|
||||
)
|
||||
self.moe = MOE(moe_config)
|
||||
elif self.backend == "AMXBF16":
|
||||
|
@ -194,7 +213,7 @@ class KExpertsCPU(KExpertsBase):
|
|||
self.config.num_experts_per_tok,
|
||||
self.config.hidden_size,
|
||||
self.config.moe_intermediate_size,
|
||||
25600,
|
||||
max(cuda_graphs) if isinstance(cuda_graphs, list) else Config().chunk_size,
|
||||
gate_ptr,
|
||||
up_ptr,
|
||||
down_ptr,
|
||||
|
@ -212,7 +231,7 @@ class KExpertsCPU(KExpertsBase):
|
|||
self.config.num_experts_per_tok,
|
||||
self.config.hidden_size,
|
||||
self.config.moe_intermediate_size,
|
||||
25600,
|
||||
max(cuda_graphs) if isinstance(cuda_graphs, list) else Config().chunk_size,
|
||||
gate_ptr,
|
||||
up_ptr,
|
||||
down_ptr,
|
||||
|
@ -241,8 +260,12 @@ class KExpertsCPU(KExpertsBase):
|
|||
KExpertsCPU.input_tensor_cpu = torch.zeros((cuda_graphs, self.config.hidden_size), device="cpu", pin_memory=True)
|
||||
KExpertsCPU.expert_ids_cpu = torch.zeros((cuda_graphs, num_experts_per_tok), device="cpu", dtype=torch.long, pin_memory=True)
|
||||
KExpertsCPU.weights_cpu = torch.zeros((cuda_graphs, num_experts_per_tok), device="cpu", dtype=torch.float32, pin_memory=True)
|
||||
KExpertsCPU.output_cpu = torch.zeros((cuda_graphs, self.config.hidden_size), device="cpu", pin_memory=True, dtype=torch.bfloat16)
|
||||
KExpertsCPU.bsz_tensor_cpu = torch.zeros((1), device="cpu", dtype=torch.int32, pin_memory=True)
|
||||
if torch.xpu.is_available():
|
||||
KExpertsCPU.output_cpu = torch.zeros((cuda_graphs, self.config.hidden_size), device="cpu", pin_memory=True, dtype=model_dtype)
|
||||
KExpertsCPU.bsz_tensor_cpu = torch.ones((1), device="cpu", dtype=torch.int32, pin_memory=True)
|
||||
else:
|
||||
KExpertsCPU.output_cpu = torch.zeros((cuda_graphs, self.config.hidden_size), device="cpu", pin_memory=True, dtype=torch.bfloat16)
|
||||
KExpertsCPU.bsz_tensor_cpu = torch.zeros((1), device="cpu", dtype=torch.int32, pin_memory=True)
|
||||
|
||||
    def submit_for_one_decode(self, input_tensor, expert_ids, weights, bsz_tensor=None, cuda_graph_idx=0):
        if bsz_tensor is None:

@@ -274,9 +297,9 @@ class KExpertsCPU(KExpertsBase):
    def forward(self, input_tensor, expert_ids, weights, bsz_tensor=None, cuda_graph_idx=0):
        # generate, capture and run cuda graph
        # print(expert_ids)
        if bsz_tensor is None:
        if bsz_tensor is None and (not torch.xpu.is_available() or input_tensor.size(0) > 1):
            bsz_tensor = torch.tensor([input_tensor.size(0)], device=input_tensor.device, dtype=torch.int32)
        if torch.cuda.is_current_stream_capturing():
        if torch.cuda.is_available() and torch.cuda.is_current_stream_capturing():
            if cuda_graph_idx != -1:
                KExpertsCPU.input_tensor_cpu[cuda_graph_idx].copy_(input_tensor, non_blocking=True)
                KExpertsCPU.expert_ids_cpu[cuda_graph_idx].copy_(expert_ids, non_blocking=True)

@@ -296,6 +319,15 @@ class KExpertsCPU(KExpertsBase):
            self.cpu_infer.sync_with_cuda_stream(torch.cuda.current_stream().cuda_stream)
            KExpertsCPU.output_gpu_map[self.out_device].copy_(KExpertsCPU.output_cpu, non_blocking=True)
            return KExpertsCPU.output_gpu_map[self.out_device]
        elif input_tensor.size(0)==1 and torch.xpu.is_available():
            KExpertsCPU.input_tensor_cpu.copy_(input_tensor.view(-1), non_blocking=True)
            KExpertsCPU.expert_ids_cpu.copy_(expert_ids.view(-1), non_blocking=True)
            KExpertsCPU.weights_cpu.copy_(weights.view(-1), non_blocking=True)
            # KExpertsCPU.bsz_tensor_cpu.copy_(bsz_tensor.view(-1), non_blocking=True)
            self.cpu_infer.submit(self.moe.forward(expert_ids.size(0), expert_ids.size(1), KExpertsCPU.expert_ids_cpu.data_ptr(), KExpertsCPU.weights_cpu.data_ptr(), KExpertsCPU.input_tensor_cpu.data_ptr(), KExpertsCPU.output_cpu.data_ptr(), KExpertsCPU.bsz_tensor_cpu.data_ptr()))
            self.cpu_infer.sync()
            KExpertsCPU.output_gpu_map[self.out_device].copy_(KExpertsCPU.output_cpu, non_blocking=True)
            return KExpertsCPU.output_gpu_map[self.out_device].view(1, -1)
        else:
            input_tensor = input_tensor.contiguous().cpu()
            expert_ids = expert_ids.contiguous().cpu()
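Aside on the new XPU branch above: for a single decode token on an XPU system, the activations are staged through the class-level pinned CPU buffers, the CPU MoE kernel runs synchronously, and the result is copied back to out_device. A self-contained illustration of that pinned-buffer staging pattern in plain PyTorch (the buffer shape and the cpu_experts callable are assumptions, not the project's CPUInfer API):

import torch

hidden_size = 7168  # assumed model width
input_cpu = torch.zeros((1, hidden_size), device="cpu", pin_memory=True)
output_cpu = torch.zeros((1, hidden_size), device="cpu", pin_memory=True)

def run_experts_on_cpu(x_dev, cpu_experts):
    input_cpu.copy_(x_dev, non_blocking=True)        # device -> pinned host
    output_cpu.copy_(cpu_experts(input_cpu))         # expert computation on CPU
    return output_cpu.to(x_dev.device, non_blocking=True)  # pinned host -> device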
@@ -325,14 +357,19 @@ class KExpertsCPU(KExpertsBase):
down_type = None
|
||||
|
||||
for key in keys:
|
||||
if self.gguf_loader.safetensor_loader is not None:
|
||||
# using a temp ugly way to temprary load the tensor
|
||||
gate = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_gate_exps.weight").numpy()
|
||||
up = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_up_exps.weight").numpy()
|
||||
down = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_down_exps.weight").numpy()
|
||||
gate_type = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_gate_exps.ggml_type").item()
|
||||
up_type = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_up_exps.ggml_type").item()
|
||||
down_type = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_down_exps.ggml_type").item()
|
||||
if isinstance(self.gguf_loader, SafeTensorLoader):
|
||||
res = self.gguf_loader.load_experts(key)
|
||||
return {key: res}
|
||||
elif self.gguf_loader.has_tensor(key + ".ffn_gate_exps.weight"):
|
||||
gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
|
||||
up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight")
|
||||
down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight")
|
||||
# gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate_exps.weight"]["ggml_type"]
|
||||
# up_type = self.gguf_loader.tensor_info[key + ".ffn_up_exps.weight"]["ggml_type"]
|
||||
# down_type = self.gguf_loader.tensor_info[key + ".ffn_down_exps.weight"]["ggml_type"]
|
||||
gate_type = self.gguf_loader.get_ggml_type(key + ".ffn_gate_exps.weight")
|
||||
up_type = self.gguf_loader.get_ggml_type(key + ".ffn_up_exps.weight")
|
||||
down_type = self.gguf_loader.get_ggml_type(key + ".ffn_down_exps.weight")
|
||||
|
||||
elif key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
|
||||
gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
|
||||
|
@ -356,9 +393,9 @@ class KExpertsCPU(KExpertsBase):
|
|||
gate = np.stack(gate)
|
||||
up = np.stack(up)
|
||||
down = np.stack(down)
|
||||
gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate.0.weight"]["ggml_type"]
|
||||
up_type = self.gguf_loader.tensor_info[key + ".ffn_up.0.weight"]["ggml_type"]
|
||||
down_type = self.gguf_loader.tensor_info[key + ".ffn_down.0.weight"]["ggml_type"]
|
||||
gate_type = self.gguf_loader.get_ggml_type(key + ".ffn_gate.0.weight")
|
||||
up_type = self.gguf_loader.get_ggml_type(key + ".ffn_up.0.weight")
|
||||
down_type = self.gguf_loader.get_ggml_type(key + ".ffn_down.0.weight")
|
||||
else:
|
||||
raise ValueError(f"Experts {key} not found in gguf_loader")
|
||||
res = {key:{"gate": gate, "up": up, "down": down, "gate_type": gate_type, "up_type": up_type, "down_type": down_type}}
|
||||
|
@ -445,7 +482,7 @@ class KExpertsMarlin(KExpertsBase):
|
|||
down = None
|
||||
|
||||
for key in keys:
|
||||
if key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
|
||||
if self.gguf_loader.has_tensor(key + ".ffn_gate_exps.weight"):
|
||||
gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
|
||||
up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight")
|
||||
down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight")
|
||||
|
@ -806,7 +843,7 @@ class KDeepseekV2MoE(BaseInjectedModule, DeepseekV2MoE):
|
|||
topk_idx, topk_weight, aux_loss = self.gate(hidden_states)
|
||||
hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
|
||||
|
||||
if sequence_length == 1 and hasattr(self.experts.generate_experts, "submit_for_one_decode") and torch.cuda.is_current_stream_capturing():
|
||||
if sequence_length == 1 and hasattr(self.experts.generate_experts, "submit_for_one_decode") and torch.cuda.is_available() and torch.cuda.is_current_stream_capturing():
|
||||
self.experts.generate_experts.submit_for_one_decode(hidden_states[0], topk_idx[0], topk_weight[0])
|
||||
if self.config.n_shared_experts is not None:
|
||||
y_ = self.shared_experts(identity).squeeze(0)
|
||||
|
@ -906,7 +943,7 @@ class KDeepseekV3MoE(BaseInjectedModule, DeepseekV3MoE):
|
|||
hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
|
||||
|
||||
# only for generate phase
|
||||
if sequence_length == 1 and hasattr(self.experts.generate_experts, "submit_for_one_decode") and torch.cuda.is_current_stream_capturing():
|
||||
if sequence_length == 1 and hasattr(self.experts.generate_experts, "submit_for_one_decode") and torch.cuda.is_available() and torch.cuda.is_current_stream_capturing():
|
||||
self.experts.generate_experts.submit_for_one_decode(hidden_states[0], topk_idx[0], topk_weight[0])
|
||||
if self.config.n_shared_experts is not None:
|
||||
y_ = self.shared_experts(identity).squeeze(0)
|
||||
|
@ -1106,7 +1143,7 @@ class KDeepseekV3MoEV2(BaseInjectedModule, DeepseekV3MoE):
|
|||
|
||||
|
||||
# only for generate phase
|
||||
if hasattr(self.experts.generate_experts, "submit_for_one_decode") and torch.cuda.is_current_stream_capturing(): # TODO: this branch cause jit bug
|
||||
if hasattr(self.experts.generate_experts, "submit_for_one_decode") and torch.cuda.is_available() and torch.cuda.is_current_stream_capturing(): # TODO: this branch cause jit bug
|
||||
self.experts.generate_experts.submit_for_one_decode(hidden_states, topk_idx, topk_weight, bsz_tensor, cuda_graph_idx)
|
||||
if self.config.n_shared_experts is not None:
|
||||
y_ = self.shared_experts(identity, bsz_tensor).squeeze(0)
|
||||
|
@ -1288,7 +1325,7 @@ class KQwen2MoeSparseMoeBlockV2(BaseInjectedModule, Qwen2MoeSparseMoeBlock):
|
|||
routing_weights = routing_weights.to(hidden_states.dtype)
|
||||
|
||||
# only for generate phase
|
||||
if hasattr(self.experts.generate_experts, "submit_for_one_decode") and torch.cuda.is_current_stream_capturing(): # TODO: this branch cause jit bug
|
||||
if hasattr(self.experts.generate_experts, "submit_for_one_decode") and torch.cuda.is_available() and torch.cuda.is_current_stream_capturing(): # TODO: this branch cause jit bug
|
||||
self.experts.generate_experts.submit_for_one_decode(hidden_states, selected_experts, routing_weights, bsz_tensor, cuda_graph_idx)
|
||||
y_ = self.shared_expert(hidden_states, bsz_tensor).squeeze(0)
|
||||
y_ = F.sigmoid(self.shared_expert_gate(hidden_states)) * y_
|
||||
|
@ -1384,24 +1421,33 @@ class KQwen2MoeSparseMoeBlockV2(BaseInjectedModule, Qwen2MoeSparseMoeBlock):
|
|||
return final_out
|
||||
|
||||
class KQwen3MoeSparseMoeBlockV2(BaseInjectedModule, Qwen3MoeSparseMoeBlock):
|
||||
def forward(self, hidden_states, bsz_tensor, cuda_graph_idx=0):
|
||||
def forward(self, hidden_states, bsz_tensor=None, cuda_graph_idx=0):
|
||||
|
||||
orig_shape = hidden_states.shape
|
||||
sequence_length = orig_shape[1]
|
||||
|
||||
hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
|
||||
|
||||
router_logits = self.gate(hidden_states, bsz_tensor)
|
||||
if bsz_tensor is None:
|
||||
router_logits = self.gate(hidden_states)
|
||||
else:
|
||||
router_logits = self.gate(hidden_states, bsz_tensor)
|
||||
|
||||
routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
|
||||
routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
|
||||
if self.norm_topk_prob:
|
||||
routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
|
||||
if router_logits.device.type == "xpu":
|
||||
from ipex_llm.transformers.models.common import moe_softmax_topk
|
||||
selected_experts, routing_weights = moe_softmax_topk(
|
||||
router_logits.half(), self.top_k, self.norm_topk_prob
|
||||
)
|
||||
else:
|
||||
routing_weights = torch.nn.functional.softmax(router_logits, dim=1, dtype=torch.float)
|
||||
routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
|
||||
if self.norm_topk_prob:
|
||||
routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
|
||||
# we cast back to the input dtype
|
||||
routing_weights = routing_weights.to(hidden_states.dtype)
|
||||
|
||||
# only for generate phase
|
||||
if hasattr(self.experts.generate_experts, "submit_for_one_decode") and torch.cuda.is_current_stream_capturing(): # TODO: this branch cause jit bug
|
||||
if hasattr(self.experts.generate_experts, "submit_for_one_decode") and torch.cuda.is_available() and torch.cuda.is_current_stream_capturing(): # TODO: this branch cause jit bug
|
||||
self.experts.generate_experts.submit_for_one_decode(hidden_states, selected_experts, routing_weights, bsz_tensor, cuda_graph_idx)
|
||||
# y_ = self.shared_expert(hidden_states, bsz_tensor).squeeze(0)
|
||||
# y_ = F.sigmoid(self.shared_expert_gate(hidden_states)) * y_
|
||||
|
|
|
@ -40,7 +40,7 @@ class flashInferAttn():
|
|||
self.kv_layout = kv_layout
|
||||
self.use_cuda_graph = use_cuda_graph
|
||||
if flashInferAttn.float_workspace_buffer is None:
|
||||
flashInferAttn.float_workspace_buffer = torch.empty(1024 * 1024 * 1024, dtype=torch.uint8, device=device)
|
||||
flashInferAttn.float_workspace_buffer = torch.empty(max_batch_token * 1024 * 1024, dtype=torch.uint8, device=device)
|
||||
self.qo_indptr_buf = torch.empty((max_batch_size+1,), dtype=torch.int32, device=device)
|
||||
self.paged_kv_indptr_buf = torch.empty((max_batch_size+1,), dtype=torch.int32, device=device)
|
||||
self.paged_kv_indices_buf = torch.empty((max_pages,), dtype=torch.int32, device=device)
|
||||
|
|
|
@ -6,7 +6,7 @@ import os
|
|||
from ktransformers.operators.base_operator import BaseInjectedModule
|
||||
from ktransformers.operators.base_operator import BaseInjectedModule
|
||||
from ktransformers.operators.linear import KTransformersLinear
|
||||
from ktransformers.util.custom_gguf import GGUFLoader
|
||||
from ktransformers.util.custom_loader import GGUFLoader, ModelLoader, SafeTensorLoader
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
|
@ -55,24 +55,20 @@ class KMoEGateBase(ABC):
|
|||
down_type = None
|
||||
|
||||
for key in keys:
|
||||
key = ".".join(key.split(".")[:-1])
|
||||
if self.gguf_loader.safetensor_loader is not None:
|
||||
targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]
|
||||
weight = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_gate_inp.weight")
|
||||
e_score_correction_bias = self.gguf_loader.safetensor_loader.load_tensor(key + ".exp_probs_b.bias")
|
||||
weight_type = weight.dtype
|
||||
e_score_correction_bias_type = e_score_correction_bias.dtype
|
||||
res = {"weight": weight, "e_score_correction_bias": e_score_correction_bias, "weight_type": weight_type, "e_score_correction_bias_type": e_score_correction_bias_type}
|
||||
elif key + ".ffn_gate_inp.weight" in self.gguf_loader.tensor_info:
|
||||
targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]
|
||||
# key = ".".join(key.split(".")[:-1])
|
||||
if isinstance(self.gguf_loader, SafeTensorLoader):
|
||||
res = self.gguf_loader.load_gate(key, device=device)
|
||||
elif self.gguf_loader.has_tensor(key+".weight"):
|
||||
# targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]
|
||||
targets = [".weight", ".e_score_correction_bias"]
|
||||
tensors = self.load_multi(key, targets, device=device)
|
||||
weight = tensors[".ffn_gate_inp.weight"]
|
||||
e_score_correction_bias = tensors[".exp_probs_b.bias"]
|
||||
weight_type = self.gguf_loader.tensor_info[key + ".ffn_gate_inp.weight"]["ggml_type"]
|
||||
e_score_correction_bias_type = self.gguf_loader.tensor_info[key + ".exp_probs_b.bias"]["ggml_type"]
|
||||
weight = tensors[".weight"]
|
||||
e_score_correction_bias = tensors[".e_score_correction_bias"]
|
||||
# weight_type = self.gguf_loader.tensor_info[key + ".weight"]["ggml_type"]
|
||||
res = {"weight": weight, "e_score_correction_bias": e_score_correction_bias}
|
||||
else:
|
||||
raise ValueError(f"Experts {key} not found in gguf_loader")
|
||||
res = {"weight": weight, "e_score_correction_bias": e_score_correction_bias, "weight_type": weight_type, "e_score_correction_bias_type": e_score_correction_bias_type}
|
||||
|
||||
return res
|
||||
|
||||
def load_multi(self, key: str, keys: list[str], device: str = "cpu"):
|
||||
|
@ -106,8 +102,6 @@ class KMoEGate(BaseInjectedModule, KMoEGateBase):
|
|||
if w is None: w = self.load_weights(device=device)
|
||||
|
||||
if isinstance(w, dict):
|
||||
self.weight_type = w["weight_type"]
|
||||
self.e_score_correction_bias_type = w["e_score_correction_bias_type"]
|
||||
self.orig_module.weight = nn.Parameter(w["weight"])
|
||||
self.orig_module.e_score_correction_bias = nn.Parameter(w["e_score_correction_bias"])
|
||||
else:
|
||||
|
@ -175,8 +169,6 @@ class KMoEGateQwen2Moe(BaseInjectedModule, KMoEGateBase):
|
|||
if w is None: w = self.load_weights(device=device)
|
||||
|
||||
if isinstance(w, dict):
|
||||
self.weight_type = w["weight_type"]
|
||||
self.e_score_correction_bias_type = w["e_score_correction_bias_type"]
|
||||
self.orig_module.weight = nn.Parameter(w["weight"])
|
||||
self.orig_module.e_score_correction_bias = nn.Parameter(w["e_score_correction_bias"])
|
||||
else:
|
||||
|
@ -190,4 +182,34 @@ class KMoEGateQwen2Moe(BaseInjectedModule, KMoEGateBase):
|
|||
if self.weight is not None:
|
||||
self.weight = None
|
||||
if self.e_score_correction_bias is not None:
|
||||
self.e_score_correction_bias = None
|
||||
self.e_score_correction_bias = None
|
||||
|
||||
|
||||
class KMoEGateIPEXLLM(KMoEGate):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module = None,
        generate_device: str = "xpu",
        prefill_device: str = "xpu",
        **kwargs,
    ):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs)
        KMoEGate.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
        self.generate_device = generate_device
        self.prefill_device = prefill_device

    def forward(self, hidden_states) -> torch.Tensor:
        x = hidden_states.view(-1, hidden_states.size(-1))
        logits = torch.nn.functional.linear(
            x.type(torch.float32), self.orig_module.weight.type(torch.float32), None
        )
        scores = logits.sigmoid()

        from ipex_llm.transformers.models.common import moe_group_topk
        topk_idx, topk_weight = moe_group_topk(scores, self.orig_module.e_score_correction_bias,
                                               self.n_group, self.topk_group, self.top_k,
                                               self.norm_topk_prob, self.routed_scaling_factor)
        return topk_idx, topk_weight.to(x.dtype)
|
|
@@ -29,11 +29,12 @@ from ktransformers.models.modeling_deepseek_v3 import DeepseekV3RMSNorm
|
|||
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeRMSNorm
|
||||
from ktransformers.models.modeling_qwen3_moe import Qwen3MoeRMSNorm
|
||||
from ktransformers.operators.base_operator import BaseInjectedModule
|
||||
from ktransformers.util.custom_gguf import GGUFLoader
|
||||
from flashinfer.norm import (
|
||||
fused_add_rmsnorm,
|
||||
rmsnorm,
|
||||
)
|
||||
from ktransformers.util.custom_loader import GGUFLoader
|
||||
if not torch.xpu.is_available():
|
||||
from flashinfer.norm import (
|
||||
fused_add_rmsnorm,
|
||||
rmsnorm,
|
||||
)
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
@@ -163,3 +164,62 @@ class KQwen3MoeRMSNorm(Qwen3MoeRMSNorm, BaseInjectedModule):
|
|||
variance = hidden_states.pow(2).mean(-1, keepdim=True)
|
||||
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
|
||||
return self.weight * hidden_states.to(input_dtype)
|
||||
|
||||
class DeepseekV3RMSNormTorch(DeepseekV3RMSNorm, BaseInjectedModule):
|
||||
def __init__(self,
|
||||
key: str,
|
||||
gguf_loader : GGUFLoader,
|
||||
config: PretrainedConfig,
|
||||
orig_module: nn.Module,
|
||||
prefill_device: str = "cuda",
|
||||
generate_device: str = "cuda",
|
||||
**kwargs):
|
||||
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
|
||||
self.orig_module.__init__(orig_module.hidden_size,
|
||||
orig_module.variance_epsilon)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
x,
|
||||
batch_size_tensor: torch.Tensor = None,
|
||||
residual: Optional[torch.Tensor] = None,
|
||||
)-> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
|
||||
if residual is not None:
|
||||
x = x + residual
|
||||
residual = x
|
||||
# range batch_size_tensor for x
|
||||
input_dtype = x.dtype
|
||||
x = x.to(torch.float32)
|
||||
variance = x.pow(2).mean(-1, keepdim=True)
|
||||
x = x * torch.rsqrt(variance + self.variance_epsilon)
|
||||
if residual is not None:
|
||||
return self.weight * x.to(input_dtype), residual
|
||||
return self.weight * x.to(input_dtype)
|
||||
|
||||
|
||||
class KDeepseekRMSNormIPEXLLM(DeepseekV3RMSNorm, BaseInjectedModule):
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "xpu",
                 generate_device: str = "xpu",
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(orig_module.weight.shape[0],
                                  orig_module.variance_epsilon)
        self.eps = orig_module.variance_epsilon

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        from ipex_llm.transformers.models.common import rms_norm_forward
        if x.dtype not in [torch.float32, torch.float16]:
            output = rms_norm_forward(self, x.float())
        else:
            output = rms_norm_forward(self, x)
        return output.to(x.dtype)

    def load(self):
        BaseInjectedModule.load(self)
        if self.weight.dtype not in [torch.float32, torch.float16]:
            self.weight = self.weight.float()
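For reference, the fused rms_norm_forward call above is expected to compute the same normalization as the plain-PyTorch DeepseekV3RMSNormTorch.forward earlier in this file; a standalone equivalent:

import torch

def rms_norm_reference(x: torch.Tensor, weight: torch.Tensor, eps: float) -> torch.Tensor:
    # same math as DeepseekV3RMSNormTorch.forward, without the fused IPEX-LLM kernel
    input_dtype = x.dtype
    x = x.to(torch.float32)
    variance = x.pow(2).mean(-1, keepdim=True)
    x = x * torch.rsqrt(variance + eps)
    return weight * x.to(input_dtype)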
|
|
@@ -14,18 +14,20 @@ Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
|||
import ctypes
|
||||
import torch
|
||||
from torch import Tensor, nn
|
||||
import KTransformersOps
|
||||
import vLLMMarlin
|
||||
from ktransformers.util.custom_gguf import GGUFLoader
|
||||
if not torch.xpu.is_available():
|
||||
import KTransformersOps
|
||||
import vLLMMarlin
|
||||
from ktransformers.util.custom_loader import GGUFLoader, SafeTensorLoader
|
||||
from ktransformers.util.utils import InferenceState
|
||||
from ktransformers.ktransformers_ext.operators.custom_marlin.quantize.utils.marlin_utils import (
|
||||
MarlinWorkspace,
|
||||
marlin_quantize,
|
||||
GPTQ_MARLIN_MIN_THREAD_N,
|
||||
GPTQ_MARLIN_MIN_THREAD_K,
|
||||
GPTQ_MARLIN_MAX_PARALLEL,
|
||||
vllm_marlin_quantize
|
||||
)
|
||||
if not torch.xpu.is_available():
|
||||
from ktransformers.ktransformers_ext.operators.custom_marlin.quantize.utils.marlin_utils import (
|
||||
MarlinWorkspace,
|
||||
marlin_quantize,
|
||||
GPTQ_MARLIN_MIN_THREAD_N,
|
||||
GPTQ_MARLIN_MIN_THREAD_K,
|
||||
GPTQ_MARLIN_MAX_PARALLEL,
|
||||
vllm_marlin_quantize
|
||||
)
|
||||
from ktransformers.operators.base_operator import BaseInjectedModule
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant
|
||||
|
@ -83,15 +85,15 @@ class KLinearBase(ABC):
|
|||
keys = [self.key]
|
||||
|
||||
for key in keys:
|
||||
if self.gguf_loader.safetensor_loader is not None:
|
||||
if isinstance(self.gguf_loader, SafeTensorLoader):
|
||||
# using safetensor_loader
|
||||
tensor = self.gguf_loader.safetensor_loader.load_tensor(key+'.weight')
|
||||
if key+'.weight_scale_inv' in self.gguf_loader.safetensor_loader.tensor_file_map:
|
||||
weight_scale_inv = self.gguf_loader.safetensor_loader.load_tensor(key+'.weight_scale_inv')
|
||||
tensor = self.gguf_loader.load_tensor(key+'.weight')
|
||||
if self.gguf_loader.has_tensor(key+'.weight_scale_inv'):
|
||||
weight_scale_inv = self.gguf_loader.load_tensor(key+'.weight_scale_inv')
|
||||
return nn.Parameter(tensor), nn.Parameter(weight_scale_inv)
|
||||
return nn.Parameter(tensor)
|
||||
|
||||
elif key + ".weight" in self.gguf_loader.tensor_file_map:
|
||||
elif self.gguf_loader.has_tensor(key + ".weight") or "kv_b_proj" in key:
|
||||
if key + ".bias" in self.gguf_loader.tensor_file_map:
|
||||
tensors = self.load_multi(key, ["weight", "bias"], device=device)
|
||||
tensor = tensors["weight"]
|
||||
|
@ -99,6 +101,19 @@ class KLinearBase(ABC):
|
|||
# self.qtype = GGML_TYPE_QTYPE_MAP[tensorinfo[key + ".weight"]["ggml_type"]]
|
||||
# print(torch.isinf(tensor).any(), torch.isinf(bias).any())
|
||||
return nn.Parameter(tensor), nn.Parameter(bias)
|
||||
elif "kv_b_proj" in key and not self.gguf_loader.has_tensor(key + ".weight"):
|
||||
attn_k_b_tensors = self.load_multi(key.replace("self_attn.kv_b_proj", "attn_k_b"), ["weight"], device=device)
|
||||
attn_k_b = attn_k_b_tensors["weight"]
|
||||
del attn_k_b_tensors
|
||||
attn_k_b = attn_k_b.transpose(1, 2).contiguous()
|
||||
attn_v_b_tensors = self.load_multi(key.replace("self_attn.kv_b_proj", "attn_v_b"), ["weight"], device=device)
|
||||
attn_v_b = attn_v_b_tensors["weight"]
|
||||
del attn_v_b_tensors
|
||||
kv_b_proj = torch.cat((attn_k_b, attn_v_b), dim=1)
|
||||
kv_b_proj = kv_b_proj.contiguous() if kv_b_proj.ndim == 2 else kv_b_proj.flatten(0, 1).contiguous()
|
||||
del attn_k_b
|
||||
del attn_v_b
|
||||
return nn.Parameter(kv_b_proj)
|
||||
else:
|
||||
tensors = self.load_multi(key, ["weight"], device=device)
|
||||
tensor = tensors["weight"]
|
||||
|
@ -502,6 +517,9 @@ class VLinearMarlin(KLinearBase):
|
|||
marlin_s = self.marlin_s.to(x.dtype)
|
||||
sms = -1
|
||||
|
||||
# padding x.shape[0] to avoid CUDA illegal memory access error
|
||||
x, orig_size_m = self._pad_input(x)
|
||||
|
||||
x = vLLMMarlin.gptq_marlin_gemm(
|
||||
x,
|
||||
self.marlin_q_w,
|
||||
|
@ -511,26 +529,15 @@ class VLinearMarlin(KLinearBase):
|
|||
self.workspace.scratch,
|
||||
self.num_bits,
|
||||
bsz_tensor,
|
||||
# torch.tensor([x.shape[0]], dtype=torch.int32, device=self.device),
|
||||
x.shape[0],
|
||||
self.n,
|
||||
x.shape[-1],
|
||||
sms,
|
||||
self.is_k_full,
|
||||
)
|
||||
# x = KTransformersOps.gptq_marlin_gemm(
|
||||
# x,
|
||||
# self.marlin_q_w,
|
||||
# marlin_s,
|
||||
# self.g_idx,
|
||||
# self.sort_indices,
|
||||
# self.workspace.scratch,
|
||||
# self.num_bits,
|
||||
# x.shape[0],
|
||||
# self.n,
|
||||
# x.shape[-1],
|
||||
# self.is_k_full,
|
||||
# )
|
||||
|
||||
x = x[:orig_size_m]
|
||||
|
||||
if self.has_bias:
|
||||
x = x + self.bias
|
||||
orig_shape[-1] = self.n
|
||||
|
@ -546,6 +553,27 @@ class VLinearMarlin(KLinearBase):
|
|||
self.sort_indices = None
|
||||
self.workspace = None
|
||||
|
||||
    def _pad_input(self, x):

        size_m = x.shape[0]
        size_k = x.shape[1]

        # size_m and align value depends on VLinearMarlin implementation
        if size_m > 1024:
            align = 1024
        elif size_m > 64:
            align = 64
        else:
            align = 1

        padded_size_m = ((size_m + align - 1) // align) * align

        if padded_size_m > size_m:
            pad_len = padded_size_m - size_m
            pad_tensor = torch.zeros((pad_len, size_k), dtype=x.dtype, device=x.device)
            x = torch.cat([x, pad_tensor], dim = 0).contiguous()
        return x, size_m

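Quick illustration of the alignment rule in _pad_input above, outside the patch: size_m is rounded up to a multiple of 1, 64, or 1024 depending on its magnitude, and the zero padding rows are sliced off again after the GEMM.

def round_up_m(size_m: int) -> int:
    # same alignment rule as VLinearMarlin._pad_input
    align = 1024 if size_m > 1024 else 64 if size_m > 64 else 1
    return ((size_m + align - 1) // align) * align

assert round_up_m(1) == 1        # tiny decode batches stay unpadded
assert round_up_m(100) == 128    # mid-sized batches round up to a multiple of 64
assert round_up_m(1500) == 2048  # large prefill chunks round up to a multiple of 1024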
class KLinearMarlin(KLinearBase):
|
||||
marlin_q_w: torch.Tensor
|
||||
marlin_s: torch.Tensor
|
||||
|
@@ -760,7 +788,7 @@ class KLinearCPUInfer(KLinearBase):
|
|||
self.output_gpu = torch.zeros((1, 1, self.out_features), device=self.out_device)
|
||||
|
||||
def load_weights(self, w: dict | nn.Parameter | tuple | None = None, device: str = "cpu"):
|
||||
if self.key + ".weight" in self.gguf_loader.tensor_info:
|
||||
if self.gguf_loader.has_tensor(self.key + ".weight"):
|
||||
if self.key + ".bias" in self.gguf_loader.tensor_file_map:
|
||||
self.weight = self.gguf_loader.get_mmap_tensor(self.key + ".weight")
|
||||
self.weight_type = self.gguf_loader.tensor_info[self.key + ".weight"]["ggml_type"]
|
||||
|
@@ -778,6 +806,75 @@ class KLinearCPUInfer(KLinearBase):
|
|||
if self.has_bias:
|
||||
self.bias = None
|
||||
|
||||
class KLinearIPEXLLM(KLinearBase):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module = None,
        device: str = "xpu",
        precision: str = "sym_int4",
        **kwargs,
    ):
        super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
        self.has_bias = False
        self.dtype = torch.get_default_dtype()
        self.weight = None
        self.has_bias = False
        self.precision = precision
        self.qtype = None

    def forward(self, x: torch.Tensor, bsz_tensor: torch.Tensor = None) -> torch.Tensor:
        dtype = x.dtype
        out_device = x.device
        from ipex_llm.transformers.models.common import linear_forward
        x = linear_forward(x.half(), self.weight, self.qtype, self.out_features)

        if self.has_bias:
            x = x + self.bias
        x = x.to(dtype=dtype, device=out_device)
        return x

    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None):
        if self.loaded: return
        if device is None: device = self.device
        assert device.lower()[:3] == "xpu", "IPEX-LLM quantized linear only supports XPU device"
        if w is None: w = self.load_weight(device=device)

        if isinstance(w, nn.Parameter):
            try:
                weight = w.to(dtype=self.dtype).view(self.out_features, self.in_features).T
            except:
                weight = w.to(dtype=self.dtype).T
            self.has_bias = False
        elif isinstance(w, tuple):
            try:
                weight = w[0].to(dtype=self.dtype).view(self.out_features, self.in_features).T
            except:
                weight = w[0].to(dtype=self.dtype).T
            self.bias = w[1].to(dtype=self.dtype)
            self.has_bias = True
        else:
            raise ValueError("Invalid weight type")
        weight = weight.to("cpu").float().transpose(0, 1).contiguous()

        if self.has_bias:
            self.bias = self.bias.to(device)

        # quantize linear weight
        from ipex_llm.transformers.models.common import quantize_linear
        paramsLowBit, qtype = quantize_linear(weight, self.in_features, self.precision)
        self.weight = paramsLowBit.to(device)
        self.qtype = qtype
        self.loaded = True

    def unload(self):
        if self.weight is not None:
            self.weight = None
        if self.has_bias:
            self.bias = None

LINEAR_MAP = {
|
||||
"KLinearMarlin": KLinearMarlin,
|
||||
"KLinearTorch": KLinearTorch,
|
||||
|
@@ -785,6 +882,7 @@ LINEAR_MAP = {
|
|||
"VLinearMarlin": VLinearMarlin,
|
||||
"KLinearFP8": KLinearFP8,
|
||||
"KLinearQ8": KLinearQ8,
|
||||
"KLinearIPEXLLM": KLinearIPEXLLM,
|
||||
}
|
||||
|
||||
class KTransformersLinear(BaseInjectedModule, KLinearBase):
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
|
||||
from ktransformers.operators.base_operator import BaseInjectedModule
|
||||
from ktransformers.util.custom_gguf import GGUFLoader
|
||||
from ktransformers.util.custom_loader import GGUFLoader
|
||||
from transformers import PretrainedConfig
|
||||
import torch.nn as nn
|
||||
from ktransformers.models.modeling_deepseek_v3 import DeepseekV3MLP
|
||||
|
|
|
@ -58,7 +58,7 @@ from transformers.models.qwen2_moe.configuration_qwen2_moe import Qwen2MoeConfig
|
|||
from ktransformers.models.configuration_llama import LlamaConfig
|
||||
from ktransformers.operators.base_operator import BaseInjectedModule
|
||||
from ktransformers.util.utils import InferenceState, get_compute_capability
|
||||
from ktransformers.util.custom_gguf import GGUFLoader
|
||||
from ktransformers.util.custom_loader import GGUFLoader
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from ktransformers.models.modeling_llama import (
|
||||
LlamaDecoderLayer,
|
||||
|
@ -306,6 +306,12 @@ class KQwen2MoeModel(BaseInjectedModule):
|
|||
|
||||
hidden_states = inputs_embeds
|
||||
|
||||
# create position embeddings to be shared across the decoder layers
|
||||
if torch.xpu.is_available() and inputs_embeds.device.type == "xpu":
|
||||
position_embeddings = self.rotary_emb(hidden_states, position_ids)
|
||||
else:
|
||||
position_embeddings = None
|
||||
|
||||
# decoder layers
|
||||
all_hidden_states = () if output_hidden_states else None
|
||||
all_self_attns = () if output_attentions else None
|
||||
|
@ -369,6 +375,7 @@ class KQwen2MoeModel(BaseInjectedModule):
|
|||
output_router_logits=output_router_logits,
|
||||
use_cache=use_cache,
|
||||
cache_position=cache_position,
|
||||
position_embeddings=position_embeddings,
|
||||
)
|
||||
if per_layer_prefill_flag:
|
||||
# print(f"to cpu")
|
||||
|
@ -376,8 +383,10 @@ class KQwen2MoeModel(BaseInjectedModule):
|
|||
torch.cuda.empty_cache()
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
if use_cache:
|
||||
if use_cache and len(layer_outputs) > 1:
|
||||
next_decoder_cache = layer_outputs[2 if output_attentions else 1]
|
||||
else:
|
||||
next_decoder_cache = None
|
||||
|
||||
if output_attentions:
|
||||
all_self_attns += (layer_outputs[1],)
|
||||
|
@ -396,11 +405,14 @@ class KQwen2MoeModel(BaseInjectedModule):
|
|||
|
||||
next_cache = None
|
||||
if use_cache:
|
||||
next_cache = (
|
||||
next_decoder_cache.to_legacy_cache()
|
||||
if use_legacy_cache
|
||||
else next_decoder_cache
|
||||
)
|
||||
if next_decoder_cache is not None:
|
||||
next_cache = (
|
||||
next_decoder_cache.to_legacy_cache()
|
||||
if use_legacy_cache
|
||||
else next_decoder_cache
|
||||
)
|
||||
else:
|
||||
next_cache = past_key_values
|
||||
|
||||
if not return_dict:
|
||||
return tuple(
|
||||
|
@ -647,10 +659,20 @@ class KDeepseekV2Model(BaseInjectedModule):
|
|||
if position_ids is None:
|
||||
position_ids = cache_position.unsqueeze(0)
|
||||
|
||||
if inputs_embeds.device.type == "xpu" and position_ids is not None:
|
||||
cos, sin = self.layers[0].self_attn.rotary_emb(inputs_embeds,
|
||||
position_ids)
|
||||
position_embeddings = (cos, sin)
|
||||
else:
|
||||
position_embeddings = None
|
||||
|
||||
if per_layer_prefill_flag:
|
||||
causal_mask = None
|
||||
else:
|
||||
if os.name == 'nt' or get_compute_capability()<8 or device_manager.gpu_vendor != GPUVendor.NVIDIA:
|
||||
if (os.name == 'nt'
|
||||
or get_compute_capability() < 8
|
||||
or (self.transfer_map is not None and 'cpu' in self.transfer_map.values())
|
||||
or device_manager.gpu_vendor != GPUVendor.NVIDIA):
|
||||
# print("for Windows or GPU before ampere, use forward_windows")
|
||||
# only use mask in forward windows or can't flash attn
|
||||
causal_mask = self._update_causal_mask(
|
||||
|
@ -734,6 +756,7 @@ class KDeepseekV2Model(BaseInjectedModule):
|
|||
output_attentions=output_attentions,
|
||||
use_cache=use_cache,
|
||||
cache_position=cache_position,
|
||||
position_embeddings=position_embeddings,
|
||||
)
|
||||
t5 = time.time()
|
||||
if per_layer_prefill_flag:
|
||||
|
|
|
@ -12,7 +12,7 @@ from torch import nn
|
|||
from transformers import AutoConfig
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
# from operators import BaseInjectedModule
|
||||
from ktransformers.util.custom_gguf import GGUFLoader, translate_name_to_gguf
|
||||
from ktransformers.util.custom_loader import GGUFLoader, ModelLoaderFactory
|
||||
from ktransformers.util.utils import set_module, load_weights
|
||||
import itertools
|
||||
import copy
|
||||
|
@ -54,7 +54,7 @@ def del_meta(module:nn.Module):
|
|||
|
||||
def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, prefix: str="", default_device: str = "cuda:0"):
|
||||
module_name = prefix[:-1]
|
||||
translated_name = translate_name_to_gguf(prefix)[:-1]
|
||||
# translated_name = translate_name_to_gguf(prefix)[:-1]
|
||||
#print("gen_optimize_config", prefix, module_name, translated_name)
|
||||
recursive = True
|
||||
for rule in rule_list:
|
||||
|
@ -76,7 +76,7 @@ def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, p
|
|||
if "replace" in rule:
|
||||
replace_meta = rule["replace"]
|
||||
if module_name not in out_data:
|
||||
out_data[module_name]={"key": translated_name,
|
||||
out_data[module_name]={"key": module_name,
|
||||
"class": replace_meta["class"] if "class" in replace_meta else "default",
|
||||
# "device": replace_meta["device"] if "device" in replace_meta else default_device,
|
||||
"kwargs": copy.deepcopy(replace_meta["kwargs"]) if "kwargs" in replace_meta else dict()}
|
||||
|
@ -91,7 +91,7 @@ def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, p
|
|||
if module_name not in out_data:
|
||||
out_data[module_name]= {
|
||||
"class": "default",
|
||||
"key": translated_name,
|
||||
"key": module_name,
|
||||
"kwargs": {"generate_device": default_device,
|
||||
"prefill_device": default_device}
|
||||
}
|
||||
|
@ -103,7 +103,7 @@ def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, p
|
|||
for name, child in module._modules.items():
|
||||
if child is not None:
|
||||
child_prefix = prefix + name + "."
|
||||
gen_optimize_config(child, out_data, rule_list, child_prefix)
|
||||
gen_optimize_config(child, out_data, rule_list, child_prefix, default_device = default_device)
|
||||
|
||||
|
||||
def translate_model_config(model_config: PretrainedConfig):
|
||||
|
@ -123,12 +123,15 @@ def optimize_and_load_gguf(module: nn.Module, rule_file: str, gguf_path: str, mo
|
|||
|
||||
model_config = translate_model_config(model_config)
|
||||
|
||||
gguf_loader=GGUFLoader(gguf_path)
|
||||
weights_loader = ModelLoaderFactory.create_loader(gguf_path)
|
||||
with torch.device("meta"):
|
||||
inject(module, optimize_config, model_config, gguf_loader)
|
||||
inject(module, optimize_config, model_config, weights_loader)
|
||||
# pre load lm_head because its big inter result
|
||||
load_weights(module.lm_head, gguf_loader, "lm_head.")
|
||||
load_weights(module, gguf_loader)
|
||||
module.gguf_loader = gguf_loader
|
||||
load_weights(module.lm_head, weights_loader, "lm_head.", device=default_device)
|
||||
load_weights(module, weights_loader, device=default_device)
|
||||
module.gguf_loader = weights_loader
|
||||
del_meta(module)
|
||||
torch.cuda.empty_cache()
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.empty_cache()
|
||||
elif torch.xpu.is_available():
|
||||
torch.xpu.empty_cache()
|
||||
|
|
|
@@ -0,0 +1,184 @@
|
|||
- match:
|
||||
name: "^model.embed_tokens"
|
||||
replace:
|
||||
class: "default"
|
||||
kwargs:
|
||||
generate_device: "cpu"
|
||||
prefill_device: "cpu"
|
||||
|
||||
# === Rotary Embedding Replacement ===
|
||||
|
||||
# GPU 0: layers 0–9
|
||||
- match:
|
||||
name: "^model\\.layers\\.(0|[1-9])\\."
|
||||
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
|
||||
replace:
|
||||
class: ktransformers.operators.RoPE.YarnRotaryEmbedding
|
||||
kwargs:
|
||||
generate_device: "cuda:0"
|
||||
prefill_device: "cuda:0"
|
||||
# CPU: layers 10-29
|
||||
- match:
|
||||
name: "^model\\.layers\\.([12][0-9])\\."
|
||||
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
|
||||
replace:
|
||||
class: ktransformers.operators.RoPE.YarnRotaryEmbedding
|
||||
kwargs:
|
||||
generate_device: "cpu"
|
||||
prefill_device: "cpu"
|
||||
|
||||
# === Linear Layers Replacement (excluding self_attn) ===
|
||||
|
||||
# GPU 0: layers 0–9
|
||||
- match:
|
||||
name: "^model\\.layers\\.(0|[1-9])\\.(?!self_attn).*$" # regular expression
|
||||
class: torch.nn.Linear # only match modules matching name and class simultaneously
|
||||
replace:
|
||||
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
|
||||
kwargs:
|
||||
generate_device: "cuda:0"
|
||||
prefill_device: "cuda:0"
|
||||
generate_op: "KLinearMarlin"
|
||||
prefill_op: "KLinearTorch"
|
||||
# CPU: layers 10-29
|
||||
- match:
|
||||
name: "^model\\.layers\\.([12][0-9])\\.(?!self_attn).*$" # regular expression
|
||||
class: torch.nn.Linear # only match modules matching name and class simultaneously
|
||||
replace:
|
||||
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
|
||||
kwargs:
|
||||
generate_device: "cpu"
|
||||
prefill_device: "cpu"
|
||||
generate_op: "KLinearCPUInfer"
|
||||
prefill_op: "KLinearTorch"
|
||||
out_device: "cpu"
|
||||
|
||||
# === MLP (MoE) Replacement ===
|
||||
|
||||
# GPU 0: layers 0–9
|
||||
- match:
|
||||
name: "^model\\.layers\\.(0|[1-9])\\.mlp$"
|
||||
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
|
||||
replace:
|
||||
class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function
|
||||
kwargs:
|
||||
generate_device: "cuda:0"
|
||||
prefill_device: "cuda:0"
|
||||
# CPU: layers 10-29
|
||||
- match:
|
||||
name: "^model\\.layers\\.([12][0-9])\\.mlp$"
|
||||
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
|
||||
replace:
|
||||
class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function
|
||||
kwargs:
|
||||
generate_device: "cpu"
|
||||
prefill_device: "cpu"
|
||||
|
||||
# === MLP Gate Replacement ===
|
||||
|
||||
# GPU 0: layers 0–9
|
||||
- match:
|
||||
name: "^model\\.layers\\.(0|[1-9])\\.mlp\\.gate$"
|
||||
class: ktransformers.models.modeling_deepseek_v3.MoEGate
|
||||
replace:
|
||||
class: ktransformers.operators.gate.KMoEGate
|
||||
kwargs:
|
||||
generate_device: "cuda:0"
|
||||
prefill_device: "cuda:0"
|
||||
# CPU: layers 10-29
|
||||
- match:
|
||||
name: "^model\\.layers\\.([12][0-9])\\.mlp\\.gate$"
|
||||
class: ktransformers.models.modeling_deepseek_v3.MoEGate
|
||||
replace:
|
||||
class: ktransformers.operators.gate.KMoEGate
|
||||
kwargs:
|
||||
generate_device: "cpu"
|
||||
prefill_device: "cpu"
|
||||
|
||||
# === MLP Experts Replacement ===
|
||||
|
||||
# GPU 0: layers 0–9
|
||||
- match:
|
||||
name: "^model\\.layers\\.(0|[1-9])\\.mlp\\.experts$"
|
||||
replace:
|
||||
class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism
|
||||
kwargs:
|
||||
prefill_device: "cuda:0"
|
||||
prefill_op: "KExpertsTorch"
|
||||
generate_device: "cpu"
|
||||
generate_op: "KExpertsCPU"
|
||||
out_device: "cuda:0"
|
||||
recursive: False # don't recursively inject submodules of this module
|
||||
# CPU: layers 10-29
|
||||
- match:
|
||||
name: "^model\\.layers\\.([12][0-9])\\.mlp\\.experts$"
|
||||
replace:
|
||||
class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism
|
||||
kwargs:
|
||||
prefill_device: "cpu"
|
||||
prefill_op: "KExpertsTorch"
|
||||
generate_device: "cpu"
|
||||
generate_op: "KExpertsCPU"
|
||||
out_device: "cpu"
|
||||
recursive: False # don't recursively inject submodules of this module
|
||||
|
||||
# === Self-Attention Replacement ===
|
||||
|
||||
# GPU 0: layers 0–9
|
||||
- match:
|
||||
name: "^model\\.layers\\.(0|[1-9])\\.self_attn$"
|
||||
replace:
|
||||
class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
|
||||
kwargs:
|
||||
generate_device: "cuda:0"
|
||||
prefill_device: "cuda:0"
|
||||
# CPU: layers 10-29
|
||||
- match:
|
||||
name: "^model\\.layers\\.([12][0-9])\\.self_attn$"
|
||||
replace:
|
||||
class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
|
||||
kwargs:
|
||||
generate_device: "cpu"
|
||||
prefill_device: "cpu"
|
||||
|
||||
# === Overall Model Replacement with Transfer Map ===
|
||||
|
||||
- match:
|
||||
name: "^model$"
|
||||
replace:
|
||||
class: "ktransformers.operators.models.KDeepseekV2Model"
|
||||
kwargs:
|
||||
per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
|
||||
transfer_map:
|
||||
10: "cpu"
|
||||
|
||||
# === Default Catch-All for Other Modules ===#
|
||||
# GPU 0: layers 0–9
|
||||
- match:
|
||||
name: "^model\\.layers\\.(0|[1-9])\\."
|
||||
replace:
|
||||
class: "default"
|
||||
kwargs:
|
||||
generate_device: "cuda:0"
|
||||
prefill_device: "cuda:0"
|
||||
|
||||
#lmm_head on GPU 0
|
||||
- match:
|
||||
name: "^lm_head"
|
||||
class: torch.nn.Linear
|
||||
replace:
|
||||
class: ktransformers.operators.linear.KTransformersLinear
|
||||
kwargs:
|
||||
generate_device: "cuda:0"
|
||||
prefill_device: "cuda:0"
|
||||
generate_op: "KLinearMarlin"
|
||||
prefill_op: "KLinearTorch"
|
||||
|
||||
# CPU: layers 10-29
|
||||
- match:
|
||||
name: "(^model\\.layers\\.([12][0-9])\\.)|(model.norm)"
|
||||
replace:
|
||||
class: "default"
|
||||
kwargs:
|
||||
generate_device: "cpu"
|
||||
prefill_device: "cpu"
|
|
@@ -0,0 +1,91 @@
|
|||
- match:
|
||||
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
|
||||
replace:
|
||||
class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
|
||||
kwargs:
|
||||
generate_device: "cuda"
|
||||
prefill_device: "cuda"
|
||||
- match:
|
||||
name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression
|
||||
class: torch.nn.Linear # only match modules matching name and class simultaneously
|
||||
replace:
|
||||
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
|
||||
kwargs:
|
||||
generate_device: "cuda"
|
||||
prefill_device: "cuda"
|
||||
generate_op: "KLinearFP8"
|
||||
prefill_op: "KLinearTorch"
|
||||
- match:
|
||||
name: "^model\\.layers\\..*\\.mlp$"
|
||||
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
|
||||
replace:
|
||||
class: ktransformers.operators.experts.KDeepseekV3MoEV2 # mlp module with custom forward function
|
||||
kwargs:
|
||||
generate_device: "cuda"
|
||||
prefill_device: "cuda"
|
||||
- match:
|
||||
class: ktransformers.models.modeling_deepseek_v3.MoEGate
|
||||
replace:
|
||||
class: ktransformers.operators.gate.KMoEGate
|
||||
kwargs:
|
||||
generate_device: "cuda:0"
|
||||
prefill_device: "cuda:0"
|
||||
- match:
|
||||
name: "^model\\.layers\\..*\\.mlp\\.experts$"
|
||||
replace:
|
||||
class: ktransformers.operators.experts.KTransformersExpertsV2 # custom MoE Kernel with expert paralleism
|
||||
kwargs:
|
||||
prefill_device: "cuda"
|
||||
prefill_op: "KExpertsTorch"
|
||||
generate_device: "cpu"
|
||||
generate_op: "KExpertsCPU"
|
||||
out_device: "cuda"
|
||||
backend: "llamafile"
|
||||
recursive: False # don't recursively inject submodules of this module
|
||||
- match:
|
||||
name: "^model\\.layers\\..*\\.self_attn$"
|
||||
replace:
|
||||
class: ktransformers.operators.balance_serve_attention.flashinfer_attn # optimized MLA implementation
|
||||
kwargs:
|
||||
generate_device: "cuda"
|
||||
prefill_device: "cuda"
|
||||
- match:
|
||||
name: "^model$"
|
||||
replace:
|
||||
class: "ktransformers.operators.models.KDeepseekV2Model"
|
||||
kwargs:
|
||||
per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
|
||||
- match:
|
||||
name: "^model.embed_tokens"
|
||||
replace:
|
||||
class: "default"
|
||||
kwargs:
|
||||
generate_device: "cpu"
|
||||
prefill_device: "cpu"
|
||||
|
||||
- match:
|
||||
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RMSNorm
|
||||
replace:
|
||||
class: ktransformers.operators.layernorm.RMSNorm
|
||||
kwargs:
|
||||
generate_device: "cuda"
|
||||
prefill_device: "cuda"
|
||||
|
||||
- match:
|
||||
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MLP
|
||||
replace:
|
||||
class: ktransformers.operators.mlp.kDeepseekV3MLP
|
||||
kwargs:
|
||||
generate_device: "cuda"
|
||||
prefill_device: "cuda"
|
||||
|
||||
- match:
|
||||
name: "^lm_head$" # regular expression
|
||||
class: torch.nn.Linear # only match modules matching name and class simultaneously
|
||||
replace:
|
||||
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
|
||||
kwargs:
|
||||
generate_device: "cuda"
|
||||
prefill_device: "cuda"
|
||||
generate_op: "VLinearMarlin"
|
||||
prefill_op: "KLinearTorch"
|
|
@@ -0,0 +1,64 @@
|
|||
- match:
|
||||
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
|
||||
replace:
|
||||
class: ktransformers.operators.RoPE.YarnRotaryEmbedding
|
||||
kwargs:
|
||||
generate_device: "xpu"
|
||||
prefill_device: "xpu"
|
||||
- match:
|
||||
name: "^model\\.layers\\..*" # regular expression
|
||||
class: torch.nn.Linear # only match modules matching name and class simultaneously
|
||||
replace:
|
||||
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
|
||||
kwargs:
|
||||
generate_device: "xpu"
|
||||
prefill_device: "xpu"
|
||||
generate_op: "KLinearIPEXLLM"
|
||||
prefill_op: "KLinearIPEXLLM"
|
||||
- match:
|
||||
name: "^model\\.layers\\..*\\.mlp$"
|
||||
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
|
||||
replace:
|
||||
class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function
|
||||
kwargs:
|
||||
generate_device: "xpu"
|
||||
prefill_device: "xpu"
|
||||
- match:
|
||||
name: "^model\\.layers\\..*\\.mlp\\.experts$"
|
||||
replace:
|
||||
class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism
|
||||
kwargs:
|
||||
prefill_device: "xpu"
|
||||
prefill_op: "KExpertsTorch"
|
||||
generate_device: "cpu"
|
||||
generate_op: "KExpertsCPU"
|
||||
out_device: "xpu"
|
||||
recursive: False # don't recursively inject submodules of this module
|
||||
- match:
|
||||
class: ktransformers.models.modeling_deepseek.DeepseekV2RMSNorm
|
||||
replace:
|
||||
class: ktransformers.operators.layernorm.KDeepseekRMSNormIPEXLLM
|
||||
kwargs:
|
||||
generate_device: "xpu"
|
||||
prefill_device: "xpu"
|
||||
- match:
|
||||
name: "^model\\.layers\\..*\\.self_attn$"
|
||||
replace:
|
||||
class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
|
||||
kwargs:
|
||||
generate_device: "xpu"
|
||||
prefill_device: "xpu"
|
||||
- match:
|
||||
name: "^model$"
|
||||
replace:
|
||||
class: "ktransformers.operators.models.KDeepseekV2Model"
|
||||
kwargs:
|
||||
per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
|
||||
device: "xpu"
|
||||
- match:
|
||||
name: "^model.embed_tokens"
|
||||
replace:
|
||||
class: "default"
|
||||
kwargs:
|
||||
generate_device: "cpu"
|
||||
prefill_device: "cpu"
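A short note on how rule entries like the ones above are consumed: gen_optimize_config (shown earlier in this diff) walks the module tree and applies a rule when its name regex matches the module path, recording the replacement class and kwargs for injection. A simplified, standalone sketch of that matching step (the rule dict mirrors the YAML shape; this is not the project's exact code):

import re

rule = {
    "match": {"name": r"^model\.layers\..*\.self_attn$"},
    "replace": {
        "class": "ktransformers.operators.attention.KDeepseekV2Attention",
        "kwargs": {"generate_device": "xpu", "prefill_device": "xpu"},
    },
}

def rule_applies(module_name: str, rule: dict) -> bool:
    pattern = rule["match"].get("name")
    return pattern is not None and re.search(pattern, module_name) is not None

print(rule_applies("model.layers.3.self_attn", rule))    # True
print(rule_applies("model.layers.3.mlp.experts", rule))  # False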
|
|
@@ -0,0 +1,81 @@
|
|||
- match:
|
||||
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
|
||||
replace:
|
||||
class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
|
||||
kwargs:
|
||||
generate_device: "xpu"
|
||||
prefill_device: "xpu"
|
||||
- match:
|
||||
name: "^lm_head$" # regular expression
|
||||
class: torch.nn.Linear # only match modules matching name and class simultaneously
|
||||
replace:
|
||||
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
|
||||
kwargs:
|
||||
generate_device: "xpu"
|
||||
prefill_device: "xpu"
|
||||
generate_op: "KLinearIPEXLLM"
|
||||
prefill_op: "KLinearIPEXLLM"
|
||||
- match:
|
||||
name: "^model\\.layers\\..*" # regular expression
|
||||
class: torch.nn.Linear # only match modules matching name and class simultaneously
|
||||
replace:
|
||||
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
|
||||
kwargs:
|
||||
generate_device: "xpu"
|
||||
prefill_device: "xpu"
|
||||
generate_op: "KLinearIPEXLLM"
|
||||
prefill_op: "KLinearIPEXLLM"
|
||||
- match:
|
||||
name: "^model\\.layers\\..*\\.mlp$"
|
||||
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
|
||||
replace:
|
||||
class: ktransformers.operators.experts.KDeepseekV3MoE # mlp module with custom forward function
|
||||
kwargs:
|
||||
generate_device: "xpu"
|
||||
prefill_device: "xpu"
|
||||
- match:
|
||||
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RMSNorm
|
||||
replace:
|
||||
class: ktransformers.operators.layernorm.KDeepseekRMSNormIPEXLLM
|
||||
kwargs:
|
||||
generate_device: "xpu"
|
||||
prefill_device: "xpu"
|
||||
- match:
|
||||
class: ktransformers.models.modeling_deepseek_v3.MoEGate
|
||||
replace:
|
||||
class: ktransformers.operators.gate.KMoEGateIPEXLLM
|
||||
kwargs:
|
||||
generate_device: "xpu:0"
|
||||
prefill_device: "xpu:0"
|
||||
- match:
|
||||
name: "^model\\.layers\\..*\\.mlp\\.experts$"
|
||||
replace:
|
||||
class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism
|
||||
kwargs:
|
||||
prefill_device: "xpu"
|
||||
prefill_op: "KExpertsTorch"
|
||||
generate_device: "cpu"
|
||||
generate_op: "KExpertsCPU"
|
||||
out_device: "xpu"
|
||||
recursive: False # don't recursively inject submodules of this module
|
||||
- match:
|
||||
name: "^model\\.layers\\..*\\.self_attn$"
|
||||
replace:
|
||||
class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
|
||||
kwargs:
|
||||
generate_device: "xpu"
|
||||
prefill_device: "xpu"
|
||||
absorb_for_prefill: False # change this to True to enable long context(prefill may slower).
|
||||
- match:
|
||||
name: "^model$"
|
||||
replace:
|
||||
class: "ktransformers.operators.models.KDeepseekV2Model"
|
||||
kwargs:
|
||||
per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
|
||||
- match:
|
||||
name: "^model.embed_tokens"
|
||||
replace:
|
||||
class: "default"
|
||||
kwargs:
|
||||
generate_device: "cpu"
|
||||
prefill_device: "cpu"
|
80
ktransformers/optimize/optimize_rules/xpu/Qwen3Moe-Chat.yaml
Normal file
@@ -0,0 +1,80 @@
- match:
|
||||
name: "rotary_emb$"
|
||||
replace:
|
||||
class: ktransformers.operators.RoPE.KQwen3MoeRotaryEmbedding
|
||||
kwargs:
|
||||
generate_device: "xpu"
|
||||
prefill_device: "xpu"
|
||||
- match:
|
||||
name: "^lm_head$" # regular expression
|
||||
class: torch.nn.Linear # only match modules matching name and class simultaneously
|
||||
replace:
|
||||
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
|
||||
kwargs:
|
||||
generate_device: "xpu"
|
||||
prefill_device: "xpu"
|
||||
generate_op: "KLinearIPEXLLM"
|
||||
prefill_op: "KLinearIPEXLLM"
|
||||
- match:
|
||||
name: "^model\\.layers\\.(?!.*mlp\\.gate).*$" # regular expression
|
||||
class: torch.nn.Linear # only match modules matching name and class simultaneously
|
||||
replace:
|
||||
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
|
||||
kwargs:
|
||||
generate_device: "xpu"
|
||||
prefill_device: "xpu"
|
||||
generate_op: "KLinearIPEXLLM"
|
||||
prefill_op: "KLinearIPEXLLM"
|
||||
- match:
|
||||
name: "^model\\.layers\\..*\\.mlp$"
|
||||
class: transformers.models.qwen3_moe.modeling_qwen3_moe.Qwen3MoeSparseMoeBlock
|
||||
replace:
|
||||
class: ktransformers.operators.experts.KQwen3MoeSparseMoeBlockV2 # mlp module with custom forward function
|
||||
kwargs:
|
||||
generate_device: "xpu"
|
||||
prefill_device: "xpu"
|
||||
- match:
|
||||
name: "^model\\.layers\\..*\\.mlp\\.experts$"
|
||||
replace:
|
||||
class: ktransformers.operators.experts.KTransformersExpertsV2 # custom MoE Kernel with expert paralleism
|
||||
kwargs:
|
||||
prefill_device: "xpu"
|
||||
prefill_op: "KExpertsTorch"
|
||||
generate_device: "cpu"
|
||||
generate_op: "KExpertsCPU"
|
||||
out_device: "xpu"
|
||||
recursive: False # don't recursively inject submodules of this module
|
||||
- match:
|
||||
name: "^model\\.layers\\..*\\.self_attn$"
|
||||
replace:
|
||||
class: ktransformers.operators.attention.KQwen3MoeAttentionIPEXLLM
|
||||
kwargs:
|
||||
generate_device: "xpu"
|
||||
prefill_device: "xpu"
|
||||
- match:
|
||||
name: "^model$"
|
||||
replace:
|
||||
class: "ktransformers.operators.models.KQwen2MoeModel"
|
||||
kwargs:
|
||||
per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
|
||||
- match:
|
||||
name: "^model.embed_tokens"
|
||||
replace:
|
||||
class: "default"
|
||||
kwargs:
|
||||
generate_device: "cpu"
|
||||
prefill_device: "cpu"
|
||||
- match:
|
||||
class: transformers.models.qwen3_moe.modeling_qwen3_moe.Qwen3MoeRMSNorm
|
||||
replace:
|
||||
class: ktransformers.operators.layernorm.KDeepseekRMSNormIPEXLLM
|
||||
kwargs:
|
||||
generate_device: "xpu"
|
||||
prefill_device: "xpu"
|
||||
- match:
|
||||
class: transformers.models.qwen3_moe.modeling_qwen3_moe.Qwen3MoeMLP
|
||||
replace:
|
||||
class: ktransformers.operators.mlp.KQwen2MoeMLP
|
||||
kwargs:
|
||||
generate_device: "xpu"
|
||||
prefill_device: "xpu"
|
|
@@ -128,10 +128,7 @@ class ArgumentParser:
        else:
            args.model_dir = self.cfg.model_dir
            args.model_path = self.cfg.model_path
        # set config from args
        for key, value in vars(args).items():
            if value is not None and hasattr(self.cfg, key):
                setattr(self.cfg, key, value)

        # we add the name not match args individually
        self.cfg.model_device = args.device
        self.cfg.mount_web = args.web

@@ -140,10 +137,15 @@ class ArgumentParser:
        self.cfg.user_force_think = args.force_think

        model_config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=True)
        if args.architectures == "Qwen3MoeForCausalLM" or args.architectures == "Qwen2MoeForCausalLM" :
        if model_config.architectures[0] == "Qwen3MoeForCausalLM" or model_config.architectures[0] == "Qwen2MoeForCausalLM" :
            args.gpu_memory_size = args.cache_lens*2*2*model_config.num_hidden_layers*model_config.num_key_value_heads*model_config.head_dim
            args.architectures = model_config.architectures[0]
        else:
            args.gpu_memory_size = args.cache_lens*2*576*61
        # set config from args
        for key, value in vars(args).items():
            if value is not None and hasattr(self.cfg, key):
                setattr(self.cfg, key, value)
        self.cfg.gpu_memory_size = args.gpu_memory_size
        free_ports = get_free_ports(3, [args.port])
        args.sched_port = free_ports[0]
|
||||
|
|
|
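The Qwen MoE branch above sizes the GPU KV cache as `cache_lens * 2 * 2 * num_hidden_layers * num_key_value_heads * head_dim`. Here is a minimal sketch under the assumption that one factor of 2 is for the K and V caches and the other is the 2-byte fp16 element size; the config values used below are purely illustrative.

```python
def kv_cache_bytes(cache_lens: int, num_hidden_layers: int,
                   num_key_value_heads: int, head_dim: int) -> int:
    """K and V (x2) stored as 2-byte fp16 elements for every layer and KV head."""
    return cache_lens * 2 * 2 * num_hidden_layers * num_key_value_heads * head_dim

# Purely illustrative config values:
print(kv_cache_bytes(cache_lens=8192, num_hidden_layers=48,
                     num_key_value_heads=4, head_dim=128) / 2**30, "GiB")  # 0.75 GiB
```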
@ -195,13 +195,13 @@ class Engine:
|
|||
|
||||
|
||||
self.block_num = inference_context.k_cache[0].size(1)
|
||||
self.model_runner = ModelRunner(self.model, self.device, self.args.use_cuda_graph, page_size = args.page_size, block_num=self.block_num)
|
||||
#@TODO add config
|
||||
if config.architectures[0] == "Qwen2MoeForCausalLM" or config.architectures[0] == "Qwen3MoeForCausalLM":
|
||||
self.model.init_wrapper(self.args.use_cuda_graph, self.device, 1024 ,args.max_batch_size, self.block_num) # TODO: 1024 is a magic number(max_batch_tokens)
|
||||
self.model.init_wrapper(self.args.use_cuda_graph, self.device, max(self.model_runner.cuda_graphs), args.max_batch_size, self.block_num)
|
||||
else:
|
||||
self.model.init_wrapper(self.args.use_cuda_graph, self.device, args.max_batch_size, self.block_num)
|
||||
|
||||
self.model_runner = ModelRunner(self.model, self.device, self.args.use_cuda_graph, page_size = args.page_size, block_num=self.block_num)
|
||||
self.sampler = Sampler()
|
||||
self.query_manager = QueryManager(device = self.device, page_size = args.page_size)
|
||||
|
||||
|
|
|
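The Engine change above replaces the hard-coded 1024 `max_batch_tokens` with the largest captured CUDA graph size. A minimal sketch of that relationship; the variable values are illustrative.

```python
# Illustrative only: the wrapper's token budget now follows the largest CUDA graph.
cuda_graphs = [1, 2, 3, 4, 64, 256, 512, 1024]  # e.g. output of generate_cuda_graphs(1024)
max_batch_tokens = max(cuda_graphs)              # 1024 here, but scales with chunk_size
```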
@ -11,6 +11,14 @@ from transformers import (
|
|||
StaticCache,
|
||||
AutoModelForCausalLM,
|
||||
BitsAndBytesConfig,
|
||||
LogitsProcessorList,
|
||||
TemperatureLogitsWarper,
|
||||
TopKLogitsWarper,
|
||||
TopPLogitsWarper,
|
||||
MinPLogitsWarper,
|
||||
TypicalLogitsWarper,
|
||||
EpsilonLogitsWarper,
|
||||
EtaLogitsWarper,
LogitNormalization,
|
||||
)
|
||||
|
||||
from ktransformers.server.config.config import Config
|
||||
|
@ -206,6 +214,58 @@ class TransformersInterface(BackendInterfaceBase):
|
|||
self.seq_length += 1
|
||||
return self.streamer.put(new_tokens)
|
||||
|
||||
@staticmethod
|
||||
def tf_logits_warper(generation_config):
|
||||
"""
|
||||
This method returns a [`LogitsProcessorList`] that contains all relevant [`LogitsWarper`] instances
|
||||
used for multinomial sampling.
|
||||
"""
|
||||
|
||||
# instantiate warpers list
|
||||
warpers = LogitsProcessorList()
|
||||
|
||||
# In beam methods, we need to keep at least one non-eos token to explore continuations that might have a
|
||||
# better score (i.e. keep len(list(generation_config._eos_token_tensor)) + 1)
|
||||
if generation_config.num_beams > 1:
|
||||
if isinstance(generation_config._eos_token_tensor, list):
|
||||
min_tokens_to_keep = len(generation_config._eos_token_tensor) + 1
|
||||
elif isinstance(generation_config._eos_token_tensor, torch.Tensor):
|
||||
min_tokens_to_keep = generation_config._eos_token_tensor.shape[0] + 1
|
||||
else:
|
||||
min_tokens_to_keep = 2
|
||||
else:
|
||||
min_tokens_to_keep = 1
|
||||
|
||||
# the following idea is largely copied from this PR: https://github.com/huggingface/transformers/pull/5420/files
|
||||
# all samplers can be found in `generation_utils_samplers.py`
|
||||
if generation_config.temperature is not None and generation_config.temperature != 1.0:
|
||||
warpers.append(TemperatureLogitsWarper(generation_config.temperature))
|
||||
if generation_config.top_k is not None and generation_config.top_k != 0:
|
||||
warpers.append(TopKLogitsWarper(top_k=generation_config.top_k, min_tokens_to_keep=min_tokens_to_keep))
|
||||
if generation_config.top_p is not None and generation_config.top_p < 1.0:
|
||||
warpers.append(TopPLogitsWarper(top_p=generation_config.top_p, min_tokens_to_keep=min_tokens_to_keep))
|
||||
if generation_config.min_p is not None:
|
||||
# Applied after temperature scaling (see https://github.com/ggerganov/llama.cpp/pull/3841#issuecomment-2073826084)
|
||||
warpers.append(MinPLogitsWarper(min_p=generation_config.min_p, min_tokens_to_keep=min_tokens_to_keep))
|
||||
if generation_config.typical_p is not None and generation_config.typical_p < 1.0:
|
||||
warpers.append(
|
||||
TypicalLogitsWarper(mass=generation_config.typical_p, min_tokens_to_keep=min_tokens_to_keep)
|
||||
)
|
||||
if generation_config.epsilon_cutoff is not None and 0.0 < generation_config.epsilon_cutoff < 1.0:
|
||||
warpers.append(
|
||||
EpsilonLogitsWarper(epsilon=generation_config.epsilon_cutoff, min_tokens_to_keep=min_tokens_to_keep)
|
||||
)
|
||||
if generation_config.eta_cutoff is not None and 0.0 < generation_config.eta_cutoff < 1.0:
|
||||
warpers.append(
|
||||
EtaLogitsWarper(
|
||||
epsilon=generation_config.eta_cutoff, min_tokens_to_keep=min_tokens_to_keep
|
||||
)
|
||||
)
|
||||
# `LogitNormalization` should always be the last logit processor, when present
|
||||
if generation_config.renormalize_logits is True:
|
||||
warpers.append(LogitNormalization())
|
||||
return warpers
|
||||
|
||||
def prepare_logits_wrapper(self, inputs, device, temperature: Optional[float] = None, top_p: Optional[float] = None):
|
||||
if temperature is None or temperature == 0:
|
||||
temperature = self.model.generation_config.temperature
|
||||
|
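`tf_logits_warper` replaces the removed call to the private `model._get_logits_warper` (dropped in newer transformers releases); the returned `LogitsProcessorList` is applied to the raw logits before sampling. A minimal usage sketch follows; the module path of `TransformersInterface` is an assumption for illustration, and the sampling parameters are arbitrary.

```python
import torch
from transformers import GenerationConfig

# Module path assumed for illustration; the method is the staticmethod shown above.
from ktransformers.server.backend.interfaces.transformers import TransformersInterface

generation_config = GenerationConfig(do_sample=True, temperature=0.7, top_k=40, top_p=0.9)
warpers = TransformersInterface.tf_logits_warper(generation_config)

input_ids = torch.tensor([[1, 5, 7]])
logits = torch.randn(1, 32000)
filtered = warpers(input_ids, logits)  # logits outside top-k/top-p are masked to -inf
next_token = torch.multinomial(torch.softmax(filtered, dim=-1), num_samples=1)
```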
@ -222,14 +282,8 @@ class TransformersInterface(BackendInterfaceBase):
|
|||
repetition_penalty=self.args.repetition_penalty # change this to modify generate config
|
||||
)
|
||||
self.inputs = inputs
|
||||
try: # transformers==4.43
|
||||
self.logits_warper = (
|
||||
self.model._get_logits_warper(generation_config, device=device)
|
||||
)
|
||||
except:
|
||||
self.logits_warper = (
|
||||
self.model._get_logits_warper(generation_config)
|
||||
)
|
||||
|
||||
self.logits_warper = self.tf_logits_warper(generation_config)
|
||||
|
||||
def logits_to_token(self, logits: torch.Tensor):
|
||||
logits = self.logits_warper(self.inputs.view(1, -1), logits.view(1, -1))
|
||||
|
|
|
@ -200,7 +200,7 @@ class ForwardBatchInput:
|
|||
device=None,
|
||||
tokens: torch.Tensor = None,
|
||||
num_mini_batches: int = 1,
|
||||
max_seq_length: int = 1024, # TODO: add to yaml
|
||||
max_seq_length: int = 4096, # TODO: add to yaml
|
||||
prefill_query_length: int = (Config().chunk_size - Config().max_decode_batch_size) // Config().max_prefill_batch_size, # TODO: use config
|
||||
prefill_active_length: int = (Config().chunk_size - Config().max_decode_batch_size) // Config().max_prefill_batch_size,
|
||||
gen_prefill: bool = True,
|
||||
|
@ -223,12 +223,12 @@ class ForwardBatchInput:
|
|||
|
||||
decode_querys_info = []
|
||||
for i in range(min(decode_batch_size, cuda_lens)):
|
||||
query_info = QueryInfo(i+Config().max_prefill_batch_size, prefill_query_length, max_seq_length, page_size, device, is_prefill=False, offset=offset)
|
||||
query_info = QueryInfo(i+Config().max_prefill_batch_size, prefill_query_length, 256, page_size, device, is_prefill=False, offset=offset)
|
||||
offset += max_seq_length // page_size
|
||||
if tokens is not None:
|
||||
query_info.query_tokens[prefill_active_length:prefill_active_length + 1].copy_(tokens)
|
||||
if decode_active_position is None:
|
||||
query_info.active_position = prefill_active_length
|
||||
query_info.active_position = 255
|
||||
else:
|
||||
query_info.active_position = decode_active_position[i]
|
||||
|
||||
|
|
|
@ -39,6 +39,17 @@ def pad_num_tokens(num_tokens):
|
|||
|
||||
def deduplicate_and_sort(lst):
|
||||
return sorted(set(lst))
|
||||
def generate_cuda_graphs(chunk_size: int) -> list:
|
||||
# assert if the input does not meet the requirements
|
||||
assert chunk_size <= 1024 or chunk_size % 1024 == 0, "chunk_size must be <= 1024 or a multiple of 1024"
|
||||
base_list = [1, 2, 3, Config().max_batch_size, 64, 256, 512, chunk_size]
|
||||
|
||||
if chunk_size <= 1024:
|
||||
return deduplicate_and_sort(base_list)
|
||||
|
||||
multiples = [i for i in range(1024, chunk_size + 1, 1024)]
|
||||
|
||||
return deduplicate_and_sort(base_list + multiples)
|
||||
class ModelRunner:
|
||||
"""A CudaGraphRunner runs the forward pass of a model with CUDA graph and torch.compile."""
|
||||
|
||||
|
@ -56,7 +67,7 @@ class ModelRunner:
|
|||
self.features_buf = None
|
||||
self.output = None
|
||||
self.graph_memory_pool = None
|
||||
self.cuda_graphs = deduplicate_and_sort([1, 2, 3, Config().max_batch_size, 64, Config().chunk_size])
|
||||
self.cuda_graphs = generate_cuda_graphs(Config().chunk_size)
|
||||
self.use_cuda_graph = use_cuda_graph
|
||||
self.model_time = 0
|
||||
self.page_size = page_size
|
||||
|
|
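`generate_cuda_graphs` chooses which batch/token sizes get a captured CUDA graph: a fixed set of small decode sizes plus, for large chunks, every multiple of 1024 up to `chunk_size`. A standalone sketch of the same logic, with `Config().max_batch_size` assumed to be 4 for illustration:

```python
def deduplicate_and_sort(lst):
    return sorted(set(lst))

def generate_cuda_graphs(chunk_size: int, max_batch_size: int = 4) -> list:
    # max_batch_size stands in for Config().max_batch_size in this sketch.
    assert chunk_size <= 1024 or chunk_size % 1024 == 0, \
        "chunk_size must be <= 1024 or a multiple of 1024"
    base_list = [1, 2, 3, max_batch_size, 64, 256, 512, chunk_size]
    if chunk_size <= 1024:
        return deduplicate_and_sort(base_list)
    multiples = list(range(1024, chunk_size + 1, 1024))
    return deduplicate_and_sort(base_list + multiples)

print(generate_cuda_graphs(512))   # [1, 2, 3, 4, 64, 256, 512]
print(generate_cuda_graphs(4096))  # [1, 2, 3, 4, 64, 256, 512, 1024, 2048, 3072, 4096]
```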
|
@ -7,7 +7,7 @@ sys.path.append(current_path+"/../..")
|
|||
import numpy as np
|
||||
# from ktransformers.operators.linear import KTransformersLinear, KLinearMarlin
|
||||
# from ktransformers.operators.experts import KTransformersExperts, KExpertsTorch
|
||||
from ktransformers.util.custom_gguf import GGUFLoader
|
||||
from ktransformers.util.custom_loader import GGUFLoader
|
||||
import torch
|
||||
import KTransformersOps
|
||||
torch.set_default_dtype(torch.bfloat16)
|
||||
|
|
|
@ -9,7 +9,7 @@ from pycuda.compiler import SourceModule
|
|||
import numpy as np
|
||||
from ktransformers.operators.linear import KTransformersLinear, KLinearMarlin
|
||||
from ktransformers.operators.experts import KTransformersExperts, KExpertsTorch
|
||||
from ktransformers.util.custom_gguf import GGUFLoader, dequantize_q4_k_gpu, dequantize_q4_k
|
||||
from ktransformers.util.custom_loader import GGUFLoader, dequantize_q4_k_gpu, dequantize_q4_k
|
||||
import torch
|
||||
import KTransformersOps
|
||||
torch.set_default_dtype(torch.bfloat16)
|
||||
|
|
|
@ -159,5 +159,7 @@ if __name__ == "__main__":
|
|||
prompt = ktansformer_prompt1024
|
||||
elif args.prompt_lens == 2048:
|
||||
prompt = ktansformer_prompt1024 * 2
|
||||
elif args.prompt_lens == 4096:
|
||||
prompt = ktansformer_prompt1024 * 4
|
||||
asyncio.run(main(args.concurrent, prompt, max_tokens, model))
|
||||
|
||||
|
|
|
@ -24,8 +24,8 @@ from typing import Sequence
|
|||
import os
|
||||
from enum import IntEnum
|
||||
import torch
|
||||
import KTransformersOps
|
||||
from .custom_loader import SafeTensorLoader
|
||||
if not torch.xpu.is_available():
|
||||
import KTransformersOps
|
||||
import ctypes
|
||||
import math
|
||||
|
||||
|
@ -166,238 +166,6 @@ DATA_TYPES = {
|
|||
"FP8": 13,
|
||||
}
|
||||
|
||||
class GGUFLoader:
|
||||
tensor_info: dict
|
||||
gguf_path: str
|
||||
tensor_file_map: dict # {tensor_name: tensor_file_path}
|
||||
gguf_file_meta: dict
|
||||
safetensor_loader: SafeTensorLoader
|
||||
def __init__(self, gguf_path: str):
|
||||
# Check dir exist
|
||||
if not os.path.exists(gguf_path):
|
||||
raise FileNotFoundError(f"GGUF dir not found: {gguf_path}")
|
||||
if os.path.isfile(gguf_path):
|
||||
gguf_path = os.path.dirname(gguf_path)
|
||||
|
||||
self.safetensor_loader = None
|
||||
|
||||
self.tensor_info = {}
|
||||
self.gguf_path = gguf_path
|
||||
self.tensor_file_map = {}
|
||||
self.file_data_map = {}
|
||||
self.gguf_file_meta = {}
|
||||
self.tensor_device_map = {}
|
||||
|
||||
# I know this is ugly, but I don't want to change the original code too much
|
||||
# TODO: merge gguf load and other loads.
|
||||
safetensor_loader = SafeTensorLoader(gguf_path)
|
||||
if safetensor_loader.tensor_file_map:
|
||||
self.safetensor_loader = safetensor_loader
|
||||
return
|
||||
# Walk through all the .gguf files in the directory
|
||||
found_gguf = False
|
||||
for root, dirs, files in os.walk(gguf_path):
|
||||
for file in files:
|
||||
if file.endswith(".gguf"):
|
||||
found_gguf = True
|
||||
file_name = os.path.join(root, file)
|
||||
with open(file_name, "rb") as f:
|
||||
self.load_gguf(f)
|
||||
if file_name not in self.file_data_map:
|
||||
self.file_data_map[file_name] = np.memmap(file_name, mode = 'r')
|
||||
if not found_gguf:
|
||||
raise FileNotFoundError(f"Cannot find any .gguf files in: {gguf_path}")
|
||||
|
||||
def load_gguf(self, f):
|
||||
f.seek(0)
|
||||
assert f.read(4) == b'GGUF'
|
||||
values = struct.unpack("<IQQ", f.read(4+8+8))
|
||||
version, n_tensors, n_kv = values
|
||||
if version != 3:
|
||||
warnings.warn(f"Version {version} has never been tested, might not work")
|
||||
|
||||
info = {}
|
||||
for _ in range(n_kv):
|
||||
name = read_value(f, DATA_TYPES["string"])
|
||||
|
||||
data_type = struct.unpack("<I", f.read(4))[0]
|
||||
|
||||
info[name] = read_value(f, data_type)
|
||||
|
||||
tensor_info = {}
|
||||
for _ in range(n_tensors):
|
||||
name = read_value(f, DATA_TYPES["string"])
|
||||
shape_len = read_value(f, DATA_TYPES["uint32"])
|
||||
shape = [read_value(f, DATA_TYPES["uint64"]) for _ in range(shape_len)]
|
||||
ggml_type = read_value(f, DATA_TYPES["uint32"])
|
||||
bad_offset = read_value(f, DATA_TYPES["uint64"])
|
||||
n_elems = int(math.prod(shape))
|
||||
block_size, type_size = GGML_QUANT_SIZES[ggml_type]
|
||||
n_bytes = n_elems * type_size // block_size
|
||||
np_dims = tuple(reversed(shape))
|
||||
|
||||
item_type: npt.DTypeLike
|
||||
if ggml_type == GGMLQuantizationType.F16:
|
||||
item_count = n_elems
|
||||
item_type = np.float16
|
||||
elif ggml_type == GGMLQuantizationType.F32:
|
||||
item_count = n_elems
|
||||
item_type = np.float32
|
||||
elif ggml_type == GGMLQuantizationType.F64:
|
||||
item_count = n_elems
|
||||
item_type = np.float64
|
||||
elif ggml_type == GGMLQuantizationType.I8:
|
||||
item_count = n_elems
|
||||
item_type = np.int8
|
||||
elif ggml_type == GGMLQuantizationType.I16:
|
||||
item_count = n_elems
|
||||
item_type = np.int16
|
||||
elif ggml_type == GGMLQuantizationType.I32:
|
||||
item_count = n_elems
|
||||
item_type = np.int32
|
||||
elif ggml_type == GGMLQuantizationType.I64:
|
||||
item_count = n_elems
|
||||
item_type = np.int64
|
||||
else:
|
||||
item_count = n_bytes
|
||||
item_type = np.uint8
|
||||
np_dims = quant_shape_to_byte_shape(np_dims, ggml_type)
|
||||
|
||||
tensor_info[name] = {
|
||||
"ggml_type": ggml_type,
|
||||
"shape": shape,
|
||||
"bad_offset": bad_offset,
|
||||
"item_type": item_type,
|
||||
"item_count": item_count,
|
||||
"np_dims": np_dims
|
||||
}
|
||||
|
||||
start = f.tell()
|
||||
# Alignment is 32 by default.
|
||||
# https://github.com/ggerganov/ggml/blob/e1daebbf9d38d510ba456c4d50b4500a73ac2b14/docs/gguf.md?plain=1#L253
|
||||
alignment = info.get("general.alignment", 32)
|
||||
|
||||
# Inconveniently, the offset defined in gguf files is relative to the
|
||||
# end of the header and is unaligned.
|
||||
# We need to compute the absolute file offset ourselves instead.
|
||||
for t in tensor_info.values():
|
||||
offset = start + t["bad_offset"]
|
||||
offset += (alignment - offset % alignment) % alignment
|
||||
t["offset"] = offset
|
||||
|
||||
for name in tensor_info:
|
||||
self.tensor_file_map[name] = f.name
|
||||
self.tensor_info.update(tensor_info)
|
||||
self.gguf_file_meta.update(info)
|
||||
|
||||
def get_mmap_tensor(self, name):
|
||||
t = self.tensor_info[name]
|
||||
mmap_data = self.file_data_map[ self.tensor_file_map[name] ]
|
||||
|
||||
offset = t["offset"]
|
||||
item_type = t["item_type"]
|
||||
item_count = t["item_count"]
|
||||
itemsize = int(np.empty([], dtype = item_type).itemsize)
|
||||
return mmap_data[offset : offset + itemsize * item_count]
|
||||
|
||||
def get_undequanted_tensor_and_ggml_type(self, name):
|
||||
t = self.tensor_info[name]
|
||||
data = self.get_mmap_tensor(name)
|
||||
ggml_type = t["ggml_type"]
|
||||
data = torch.from_numpy(data)
|
||||
return data, ggml_type
|
||||
|
||||
def load_expert_tensor(self, name, data, expert_id, elements_per_expert, device = "cuda", target_dtype = torch.get_default_dtype())->torch.Tensor:
|
||||
t = self.tensor_info[name]
|
||||
if device.lower() == "cpu":
|
||||
print(f"loading expert {expert_id} of {name} with CPU")
|
||||
shape = t["shape"]
|
||||
ggml_type = t["ggml_type"]
|
||||
if ggml_type not in GGML_NAMES:
|
||||
raise NotImplementedError(f"ggml_type {ggml_type} not implemented")
|
||||
ggml_name = GGML_NAMES[ggml_type]
|
||||
|
||||
# TODO: experts may fused in quant block, split it
|
||||
assert elements_per_expert % GGML_ELEMENTS_PER_BLOCK[ggml_name] == 0, "experts may fused in quant block, please use CPU dequant"
|
||||
|
||||
blocks_per_experts = elements_per_expert // GGML_ELEMENTS_PER_BLOCK[ggml_name]
|
||||
block_size = GGML_BLOCK_SIZES[ggml_name]
|
||||
offset = expert_id * block_size * blocks_per_experts
|
||||
data = data[offset: offset + block_size * blocks_per_experts]
|
||||
|
||||
if "cuda" in device.lower():
|
||||
values = GGML_DEQUANTIZE_GPU[ggml_name](data, device, target_dtype)
|
||||
else:
|
||||
values = GGML_DEQUANTIZE[ggml_name](data)
|
||||
values = torch.from_numpy(values.copy())
|
||||
|
||||
if ggml_name == "BF16":
|
||||
values = values.view(torch.bfloat16)
|
||||
values = values.view(shape[-2::-1])
|
||||
|
||||
return values
|
||||
|
||||
def load_gguf_tensor(self, name: str, device:str = "cpu", target_dtype = None)->torch.Tensor:
|
||||
t = self.tensor_info[name]
|
||||
if device.lower() == "cpu":
|
||||
print(f"loading {name} with CPU")
|
||||
if target_dtype == None:
|
||||
target_dtype = torch.get_default_dtype()
|
||||
|
||||
shape = t["shape"]
|
||||
ggml_type = t["ggml_type"]
|
||||
|
||||
if ggml_type not in GGML_NAMES:
|
||||
raise NotImplementedError(f"ggml_type {ggml_type} not implemented")
|
||||
|
||||
ggml_name = GGML_NAMES[ggml_type]
|
||||
|
||||
data = self.get_mmap_tensor(name)
|
||||
|
||||
block_size = GGML_BLOCK_SIZES[ggml_name]
|
||||
elements_per_block = GGML_ELEMENTS_PER_BLOCK[ggml_name]
|
||||
num_elements = int(np.prod(shape))
|
||||
num_blocks = num_elements // elements_per_block
|
||||
|
||||
blocks_per_iter = 16384
|
||||
if num_blocks > blocks_per_iter: # dequant large tensor
|
||||
values = torch.empty((num_blocks, elements_per_block), dtype=target_dtype, device=device)
|
||||
for i in range( (num_blocks + blocks_per_iter - 1) // blocks_per_iter):
|
||||
blocks_begin = i * blocks_per_iter
|
||||
blocks_end = min(blocks_begin + blocks_per_iter, num_blocks)
|
||||
if "cuda" in device.lower():
|
||||
cur_values = GGML_DEQUANTIZE_GPU[ggml_name](data[blocks_begin*block_size : blocks_end*block_size], device, target_dtype)
|
||||
else:
|
||||
cur_values = GGML_DEQUANTIZE[ggml_name](data[blocks_begin*block_size : blocks_end*block_size])
|
||||
cur_values = torch.from_numpy(cur_values.copy())
|
||||
|
||||
cur_values = cur_values.view(-1, elements_per_block)
|
||||
if ggml_name == "BF16":
|
||||
cur_values = cur_values.view(torch.bfloat16)
|
||||
values[blocks_begin : blocks_end] = cur_values
|
||||
else:
|
||||
if "cuda" in device.lower():
|
||||
values = GGML_DEQUANTIZE_GPU[ggml_name](data, device)
|
||||
else:
|
||||
values = GGML_DEQUANTIZE[ggml_name](data)
|
||||
values = torch.from_numpy(values)
|
||||
|
||||
if ggml_name == "BF16":
|
||||
values = values.view(torch.bfloat16)
|
||||
|
||||
|
||||
values = values.view(shape[::-1])
|
||||
if "attn_q" in name and self.gguf_file_meta['general.architecture'] in ["llama"]:
|
||||
n_head = self.gguf_file_meta['llama.attention.head_count']
|
||||
values = (values.reshape(n_head, values.shape[0] // n_head // 2, 2, *values.shape[1:])
|
||||
.swapaxes(1, 2)
|
||||
.reshape(values.shape))
|
||||
elif "attn_k" in name and self.gguf_file_meta['general.architecture'] in ["llama"]:
|
||||
n_head = self.gguf_file_meta['llama.attention.head_count_kv']
|
||||
values = (values.reshape(n_head, values.shape[0] // n_head // 2, 2, *values.shape[1:])
|
||||
.swapaxes(1, 2)
|
||||
.reshape(values.shape))
|
||||
return values
|
||||
|
||||
def read_value(f, data_type):
|
||||
if data_type == DATA_TYPES["string"]:
|
||||
|
@ -921,6 +689,7 @@ def translate_name_to_gguf(name):
|
|||
name = name.replace(".gate_up_proj.", ".up_proj")
|
||||
|
||||
name = name.replace(".mlp.shared_experts.down_proj", ".ffn_down_shexp")
|
||||
name = name.replace(".mlp.gate.e_score_correction_bias", ".exp_probs_b.bias")
|
||||
name = name.replace(".mlp.gate", ".ffn_gate_inp")
|
||||
name = name.replace(".mlp.shared_experts.gate_proj", ".ffn_gate_shexp")
|
||||
name = name.replace(".mlp.shared_experts.up_proj", ".ffn_up_shexp")
|
||||
|
|
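For reference, the substitutions above are plain string replacements that move HF-style module paths toward GGUF tensor names. A small illustration of one of them; the full `translate_name_to_gguf` also applies further rewrites (e.g. of the layer prefix) outside this hunk.

```python
name = "model.layers.3.mlp.shared_experts.down_proj.weight"
name = name.replace(".mlp.shared_experts.down_proj", ".ffn_down_shexp")
# -> "model.layers.3.ffn_down_shexp.weight"; further rewrites happen outside this hunk
```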
|
@ -7,15 +7,39 @@ from typing import Sequence
|
|||
import os
|
||||
from enum import IntEnum
|
||||
import torch
|
||||
import KTransformersOps
|
||||
if not torch.xpu.is_available():
|
||||
import KTransformersOps
|
||||
from safetensors import safe_open
|
||||
from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant
|
||||
from ktransformers.util.custom_gguf import *
|
||||
from safetensors.torch import save_file
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, Any, Optional, Union
|
||||
|
||||
class SafeTensorLoader:
|
||||
class ModelLoader(ABC):
|
||||
"""
|
||||
Abstract base class for model loaders.
|
||||
Defines the interface that all model loaders must implement.
|
||||
"""
|
||||
tensor_file_map = {}
|
||||
tensor_type_map = {}
|
||||
file_handle_map = {}
|
||||
@abstractmethod
|
||||
def has_tensor(cls, name: str):
|
||||
"""
|
||||
Check if the tensor exists in the loader.
|
||||
|
||||
Args:
|
||||
name: Name of the tensor to check
|
||||
|
||||
Returns:
|
||||
bool: True if the tensor exists, False otherwise
|
||||
"""
|
||||
pass
|
||||
|
||||
class SafeTensorLoader(ModelLoader):
|
||||
tensor_file_map: dict
|
||||
tensor_type_map: dict
|
||||
file_handle_map: dict
|
||||
tensor_device_map: dict
|
||||
|
||||
def __init__(self, file_path: str):
|
||||
self.__load_tensor_file_map(file_path)
|
||||
|
@ -28,6 +52,10 @@ class SafeTensorLoader:
|
|||
folder_path = os.path.dirname(file_path)
|
||||
else:
|
||||
folder_path = file_path
|
||||
self.file_handle_map = {}
|
||||
self.tensor_file_map = {}
|
||||
self.tensor_type_map = {}
|
||||
self.tensor_device_map = {}
|
||||
|
||||
found_safetensor = False
|
||||
for root, _, files in os.walk(folder_path):
|
||||
|
@ -57,7 +85,11 @@ class SafeTensorLoader:
|
|||
# raise FileNotFoundError(f"No Safetensor files found in {folder_path}")
|
||||
|
||||
def load_tensor(self, key: str, device: str="cpu"):
|
||||
if key not in self.tensor_file_map:
|
||||
if translate_name_to_gguf(key) in self.tensor_file_map:
|
||||
key = translate_name_to_gguf(key)
|
||||
elif key in self.tensor_file_map:
|
||||
pass
|
||||
else:
|
||||
raise KeyError(f"Key {key} not found in Safetensor files")
|
||||
file = self.tensor_file_map[key]
|
||||
f = self.file_handle_map.get(file)
|
||||
|
@ -66,13 +98,145 @@ class SafeTensorLoader:
|
|||
tensor = f.get_tensor(key)
|
||||
return tensor.to(device)
|
||||
|
||||
def load_experts(self, key: str, device: str="cpu"):
|
||||
'''
|
||||
Load experts from safetensor
|
||||
key: the name of the experts
|
||||
device: the device to load the experts to
|
||||
return: dict,
|
||||
{up: tensor, down: tensor, gate: tensor, up_type: int, down_type: int, gate_type: int}
|
||||
{xxx}_type: the GGML type code of the corresponding tensor
|
||||
'''
|
||||
if self.has_tensor(translate_name_to_gguf(key)+".ffn_gate_exps.weight"):
|
||||
# legacy branch for loading hybrid model
|
||||
base_key = translate_name_to_gguf(key)
|
||||
# Load experts from safetensor
|
||||
gate_key = f"{base_key}.ffn_gate_exps.weight"
|
||||
gate_type_key = f"{base_key}.ffn_gate_exps.ggml_type"
|
||||
up_key = f"{base_key}.ffn_up_exps.weight"
|
||||
up_type_key = f"{base_key}.ffn_up_exps.ggml_type"
|
||||
down_key = f"{base_key}.ffn_down_exps.weight"
|
||||
down_type_key = f"{base_key}.ffn_down_exps.ggml_type"
|
||||
gate_tensor = self.load_tensor(gate_key, device).numpy()
|
||||
up_tensor = self.load_tensor(up_key, device).numpy()
|
||||
down_tensor = self.load_tensor(down_key, device).numpy()
|
||||
gate_type = self.load_tensor(gate_type_key, device).item()
|
||||
up_type = self.load_tensor(up_type_key, device).item()
|
||||
down_type = self.load_tensor(down_type_key, device).item()
|
||||
|
||||
return {
|
||||
"up": up_tensor,
|
||||
"gate": gate_tensor,
|
||||
"down": down_tensor,
|
||||
"up_type": up_type,
|
||||
"gate_type": gate_type,
|
||||
"down_type": down_type
|
||||
}
|
||||
|
||||
else:
|
||||
# Load experts from safetensor
|
||||
base_key = key # e.g. "model.layers.3.mlp.experts"
|
||||
experts_count = 0
|
||||
|
||||
# First, count how many experts we have by checking for expert 0's up_proj
|
||||
while self.has_tensor(f"{base_key}.{experts_count}.up_proj.weight"):
|
||||
experts_count += 1
|
||||
|
||||
if experts_count == 0:
|
||||
raise ValueError(f"No experts found for key {base_key}")
|
||||
|
||||
# Initialize empty lists to store tensors for each projection type
|
||||
up_projs = []
|
||||
gate_projs = []
|
||||
down_projs = []
|
||||
|
||||
# Load all expert weights
|
||||
for expert_id in range(experts_count):
|
||||
up_key = f"{base_key}.{expert_id}.up_proj.weight"
|
||||
gate_key = f"{base_key}.{expert_id}.gate_proj.weight"
|
||||
down_key = f"{base_key}.{expert_id}.down_proj.weight"
|
||||
|
||||
up_tensor = self.load_tensor(up_key, device)
|
||||
gate_tensor = self.load_tensor(gate_key, device)
|
||||
down_tensor = self.load_tensor(down_key, device)
|
||||
|
||||
up_projs.append(up_tensor)
|
||||
gate_projs.append(gate_tensor)
|
||||
down_projs.append(down_tensor)
|
||||
|
||||
# Stack the tensors along a new dimension
|
||||
up_tensor = torch.stack(up_projs, dim=0)
|
||||
gate_tensor = torch.stack(gate_projs, dim=0)
|
||||
down_tensor = torch.stack(down_projs, dim=0)
|
||||
|
||||
# Get original dtype for GGML type determination
|
||||
orig_up_dtype = up_tensor.dtype
|
||||
orig_gate_dtype = gate_tensor.dtype
|
||||
orig_down_dtype = down_tensor.dtype
|
||||
|
||||
# Convert to numpy with proper bfloat16 support
|
||||
up_numpy = up_tensor.view(torch.uint16).numpy()
|
||||
gate_numpy = gate_tensor.view(torch.uint16).numpy()
|
||||
down_numpy = down_tensor.view(torch.uint16).numpy()
|
||||
|
||||
# Determine tensor data types for GGML conversion
|
||||
def get_ggml_type(dtype):
|
||||
if dtype == torch.float32:
|
||||
return GGMLQuantizationType.F32
|
||||
elif dtype == torch.float16:
|
||||
return GGMLQuantizationType.F16
|
||||
elif dtype == torch.bfloat16:
|
||||
return GGMLQuantizationType.BF16
|
||||
else:
|
||||
raise ValueError(f"Unsupported tensor dtype: {dtype}")
|
||||
|
||||
return {
|
||||
"up": up_numpy,
|
||||
"gate": gate_numpy,
|
||||
"down": down_numpy,
|
||||
"up_type": get_ggml_type(orig_up_dtype),
|
||||
"gate_type": get_ggml_type(orig_gate_dtype),
|
||||
"down_type": get_ggml_type(orig_down_dtype)
|
||||
}
|
||||
|
||||
def load_gate(self, key: str, device: str="cpu"):
|
||||
'''
|
||||
Load gate from safetensor
|
||||
key: the name of the gate
|
||||
device: the device to load the gate to
|
||||
return: dict,
|
||||
{'weight': tensor, 'e_score_correction_bias': tensor}
|
||||
'''
|
||||
target = ["weight", "e_score_correction_bias"]
|
||||
res = {'weight': None, 'e_score_correction_bias': None}
|
||||
if self.has_tensor(translate_name_to_gguf(key)+".ffn_gate_exps.weight"):
|
||||
# legacy branch for loading hybrid model
|
||||
base_key = key
|
||||
for k in target:
|
||||
translated_key = translate_name_to_gguf(f"{base_key}.{k}")
|
||||
if self.has_tensor(translated_key):
|
||||
tensor = self.load_tensor(translated_key, device)
|
||||
res[k] = tensor
|
||||
else:
|
||||
# Load gate from safetensor
|
||||
base_key = key
|
||||
for k in target:
|
||||
if self.has_tensor(f"{base_key}.{k}"):
|
||||
tensor = self.load_tensor(f"{base_key}.{k}", device)
|
||||
res[k] = tensor
|
||||
return res
|
||||
|
||||
def close_all_handles(self):
|
||||
for handle in self.file_handle_map.values():
|
||||
handle.close()
|
||||
self.file_handle_map.clear()
|
||||
|
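`load_experts` returns the three expert projections stacked along a leading expert dimension, together with their GGML type codes. A minimal usage sketch; the model path, layer index, and shape comment are illustrative assumptions.

```python
from ktransformers.util.custom_loader import SafeTensorLoader

loader = SafeTensorLoader("/path/to/model")  # illustrative path
experts = loader.load_experts("model.layers.3.mlp.experts", device="cpu")

up, gate, down = experts["up"], experts["gate"], experts["down"]
# In the non-legacy branch each array is stacked per expert, roughly
#   up.shape == (num_experts, moe_intermediate_size, hidden_size)
print(up.shape, experts["up_type"], experts["gate_type"], experts["down_type"])
```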
||||
def load_dequantized_tensor(self, key:str, device: str="cpu"):
|
||||
if key not in self.tensor_file_map:
|
||||
if key in self.tensor_file_map and translate_name_to_gguf(key):
|
||||
pass
|
||||
elif translate_name_to_gguf(key) in self.tensor_file_map:
|
||||
key = translate_name_to_gguf(key)
|
||||
else:
|
||||
raise KeyError(f"Key {key} not found in Safetensor files")
|
||||
file = self.tensor_file_map[key]
|
||||
f = self.file_handle_map.get(file)
|
||||
|
@ -83,4 +247,320 @@ class SafeTensorLoader:
|
|||
if key[:-7] + ".weight_scale_inv" in self.tensor_file_map:
|
||||
weight_scale_inv = f.get_tensor(key[:-7] + ".weight_scale_inv").to(device)
|
||||
tensor = weight_dequant(tensor, weight_scale_inv)
|
||||
return tensor.to(device)
|
||||
return tensor.to(device)
|
||||
|
||||
def has_tensor(self, name: str):
|
||||
return name in self.tensor_file_map or translate_name_to_gguf(name) in self.tensor_file_map
|
||||
|
||||
class GGUFLoader(ModelLoader):
|
||||
tensor_info: dict
|
||||
gguf_path: str
|
||||
tensor_file_map: dict # {tensor_name: tensor_file_path}
|
||||
gguf_file_meta: dict
|
||||
safetensor_loader: SafeTensorLoader
|
||||
def __init__(self, gguf_path: str):
|
||||
# Check dir exist
|
||||
if not os.path.exists(gguf_path):
|
||||
raise FileNotFoundError(f"GGUF dir not found: {gguf_path}")
|
||||
if os.path.isfile(gguf_path):
|
||||
gguf_path = os.path.dirname(gguf_path)
|
||||
|
||||
self.safetensor_loader = None
|
||||
|
||||
self.tensor_info = {}
|
||||
self.gguf_path = gguf_path
|
||||
self.tensor_file_map = {}
|
||||
self.file_data_map = {}
|
||||
self.gguf_file_meta = {}
|
||||
self.tensor_device_map = {}
|
||||
|
||||
# Walk through all the .gguf files in the directory
|
||||
found_gguf = False
|
||||
for root, dirs, files in os.walk(gguf_path):
|
||||
for file in files:
|
||||
if file.endswith(".gguf"):
|
||||
found_gguf = True
|
||||
file_name = os.path.join(root, file)
|
||||
with open(file_name, "rb") as f:
|
||||
self.load_gguf(f)
|
||||
if file_name not in self.file_data_map:
|
||||
self.file_data_map[file_name] = np.memmap(file_name, mode = 'r')
|
||||
if not found_gguf:
|
||||
raise FileNotFoundError(f"Cannot find any .gguf files in: {gguf_path}")
|
||||
|
||||
def load_gguf(self, f):
|
||||
f.seek(0)
|
||||
assert f.read(4) == b'GGUF'
|
||||
values = struct.unpack("<IQQ", f.read(4+8+8))
|
||||
version, n_tensors, n_kv = values
|
||||
if version != 3:
|
||||
warnings.warn(f"Version {version} has never been tested, might not work")
|
||||
|
||||
info = {}
|
||||
for _ in range(n_kv):
|
||||
name = read_value(f, DATA_TYPES["string"])
|
||||
|
||||
data_type = struct.unpack("<I", f.read(4))[0]
|
||||
|
||||
info[name] = read_value(f, data_type)
|
||||
|
||||
tensor_info = {}
|
||||
for _ in range(n_tensors):
|
||||
name = read_value(f, DATA_TYPES["string"])
|
||||
shape_len = read_value(f, DATA_TYPES["uint32"])
|
||||
shape = [read_value(f, DATA_TYPES["uint64"]) for _ in range(shape_len)]
|
||||
ggml_type = read_value(f, DATA_TYPES["uint32"])
|
||||
bad_offset = read_value(f, DATA_TYPES["uint64"])
|
||||
n_elems = int(math.prod(shape))
|
||||
block_size, type_size = GGML_QUANT_SIZES[ggml_type]
|
||||
n_bytes = n_elems * type_size // block_size
|
||||
np_dims = tuple(reversed(shape))
|
||||
|
||||
item_type: npt.DTypeLike
|
||||
if ggml_type == GGMLQuantizationType.F16:
|
||||
item_count = n_elems
|
||||
item_type = np.float16
|
||||
elif ggml_type == GGMLQuantizationType.F32:
|
||||
item_count = n_elems
|
||||
item_type = np.float32
|
||||
elif ggml_type == GGMLQuantizationType.F64:
|
||||
item_count = n_elems
|
||||
item_type = np.float64
|
||||
elif ggml_type == GGMLQuantizationType.I8:
|
||||
item_count = n_elems
|
||||
item_type = np.int8
|
||||
elif ggml_type == GGMLQuantizationType.I16:
|
||||
item_count = n_elems
|
||||
item_type = np.int16
|
||||
elif ggml_type == GGMLQuantizationType.I32:
|
||||
item_count = n_elems
|
||||
item_type = np.int32
|
||||
elif ggml_type == GGMLQuantizationType.I64:
|
||||
item_count = n_elems
|
||||
item_type = np.int64
|
||||
else:
|
||||
item_count = n_bytes
|
||||
item_type = np.uint8
|
||||
np_dims = quant_shape_to_byte_shape(np_dims, ggml_type)
|
||||
|
||||
tensor_info[name] = {
|
||||
"ggml_type": ggml_type,
|
||||
"shape": shape,
|
||||
"bad_offset": bad_offset,
|
||||
"item_type": item_type,
|
||||
"item_count": item_count,
|
||||
"np_dims": np_dims
|
||||
}
|
||||
|
||||
start = f.tell()
|
||||
# Alignment is 32 by default.
|
||||
# https://github.com/ggerganov/ggml/blob/e1daebbf9d38d510ba456c4d50b4500a73ac2b14/docs/gguf.md?plain=1#L253
|
||||
alignment = info.get("general.alignment", 32)
|
||||
|
||||
# Inconveniently, the offset defined in gguf files is relative to the
|
||||
# end of the header and is unaligned.
|
||||
# We need to compute the absolute file offset ourselves instead.
|
||||
for t in tensor_info.values():
|
||||
offset = start + t["bad_offset"]
|
||||
offset += (alignment - offset % alignment) % alignment
|
||||
t["offset"] = offset
|
||||
|
||||
for name in tensor_info:
|
||||
self.tensor_file_map[name] = f.name
|
||||
self.tensor_info.update(tensor_info)
|
||||
self.gguf_file_meta.update(info)
|
||||
|
||||
def get_mmap_tensor(self, name):
|
||||
name = translate_name_to_gguf(name)
|
||||
t = self.tensor_info[name]
|
||||
mmap_data = self.file_data_map[ self.tensor_file_map[name] ]
|
||||
|
||||
offset = t["offset"]
|
||||
item_type = t["item_type"]
|
||||
item_count = t["item_count"]
|
||||
itemsize = int(np.empty([], dtype = item_type).itemsize)
|
||||
return mmap_data[offset : offset + itemsize * item_count]
|
||||
|
||||
def get_undequanted_tensor_and_ggml_type(self, name):
|
||||
name = translate_name_to_gguf(name)
|
||||
t = self.tensor_info[name]
|
||||
data = self.get_mmap_tensor(name)
|
||||
ggml_type = t["ggml_type"]
|
||||
data = torch.from_numpy(data)
|
||||
return data, ggml_type
|
||||
|
||||
def load_expert_tensor(self, name, data, expert_id, elements_per_expert, device = "cuda", target_dtype = torch.get_default_dtype())->torch.Tensor:
|
||||
name = translate_name_to_gguf(name)
|
||||
t = self.tensor_info[name]
|
||||
shape = t["shape"]
|
||||
ggml_type = t["ggml_type"]
|
||||
if ggml_type not in GGML_NAMES:
|
||||
raise NotImplementedError(f"ggml_type {ggml_type} not implemented")
|
||||
ggml_name = GGML_NAMES[ggml_type]
|
||||
|
||||
# TODO: experts may be fused in a quant block, split it
|
||||
assert elements_per_expert % GGML_ELEMENTS_PER_BLOCK[ggml_name] == 0, "experts may be fused in a quant block, please use CPU dequant"
|
||||
|
||||
blocks_per_experts = elements_per_expert // GGML_ELEMENTS_PER_BLOCK[ggml_name]
|
||||
block_size = GGML_BLOCK_SIZES[ggml_name]
|
||||
offset = expert_id * block_size * blocks_per_experts
|
||||
data = data[offset: offset + block_size * blocks_per_experts]
|
||||
|
||||
if "cuda" in device.lower():
|
||||
values = GGML_DEQUANTIZE_GPU[ggml_name](data, device, target_dtype)
|
||||
else:
|
||||
values = GGML_DEQUANTIZE[ggml_name](data)
|
||||
values = torch.from_numpy(values.copy())
|
||||
|
||||
if ggml_name == "BF16":
|
||||
values = values.view(torch.bfloat16)
|
||||
values = values.view(shape[-2::-1])
|
||||
|
||||
return values
|
||||
|
||||
def load_gguf_tensor(self, name: str, device:str = "cpu", target_dtype = None)->torch.Tensor:
|
||||
name = translate_name_to_gguf(name)
|
||||
t = self.tensor_info[name]
|
||||
if target_dtype == None:
|
||||
target_dtype = torch.get_default_dtype()
|
||||
|
||||
shape = t["shape"]
|
||||
ggml_type = t["ggml_type"]
|
||||
|
||||
if ggml_type not in GGML_NAMES:
|
||||
raise NotImplementedError(f"ggml_type {ggml_type} not implemented")
|
||||
|
||||
ggml_name = GGML_NAMES[ggml_type]
|
||||
|
||||
data = self.get_mmap_tensor(name)
|
||||
|
||||
block_size = GGML_BLOCK_SIZES[ggml_name]
|
||||
elements_per_block = GGML_ELEMENTS_PER_BLOCK[ggml_name]
|
||||
num_elements = int(np.prod(shape))
|
||||
num_blocks = num_elements // elements_per_block
|
||||
|
||||
blocks_per_iter = 16384
|
||||
if num_blocks > blocks_per_iter: # dequant large tensor
|
||||
values = torch.empty((num_blocks, elements_per_block), dtype=target_dtype, device=device)
|
||||
for i in range( (num_blocks + blocks_per_iter - 1) // blocks_per_iter):
|
||||
blocks_begin = i * blocks_per_iter
|
||||
blocks_end = min(blocks_begin + blocks_per_iter, num_blocks)
|
||||
if "cuda" in device.lower():
|
||||
try:
|
||||
cur_values = GGML_DEQUANTIZE_GPU[ggml_name](data[blocks_begin*block_size : blocks_end*block_size], device, target_dtype)
|
||||
except:
|
||||
cur_values = GGML_DEQUANTIZE[ggml_name](data[blocks_begin*block_size : blocks_end*block_size])
|
||||
cur_values = torch.from_numpy(cur_values.copy()).to(device)
|
||||
else:
|
||||
cur_values = GGML_DEQUANTIZE[ggml_name](data[blocks_begin*block_size : blocks_end*block_size])
|
||||
cur_values = torch.from_numpy(cur_values.copy())
|
||||
|
||||
cur_values = cur_values.view(-1, elements_per_block)
|
||||
if ggml_name == "BF16":
|
||||
cur_values = cur_values.view(torch.bfloat16)
|
||||
values[blocks_begin : blocks_end] = cur_values
|
||||
else:
|
||||
if "cuda" in device.lower():
|
||||
values = GGML_DEQUANTIZE_GPU[ggml_name](data, device)
|
||||
else:
|
||||
np_values = np.copy(GGML_DEQUANTIZE[ggml_name](data))
|
||||
values = torch.from_numpy(np_values).to(device)
|
||||
del np_values
|
||||
|
||||
if ggml_name == "BF16":
|
||||
values = values.view(torch.bfloat16)
|
||||
|
||||
|
||||
values = values.view(shape[::-1])
|
||||
if "attn_q" in name and self.gguf_file_meta['general.architecture'] in ["llama"]:
|
||||
n_head = self.gguf_file_meta['llama.attention.head_count']
|
||||
values = (values.reshape(n_head, values.shape[0] // n_head // 2, 2, *values.shape[1:])
|
||||
.swapaxes(1, 2)
|
||||
.reshape(values.shape))
|
||||
elif "attn_k" in name and self.gguf_file_meta['general.architecture'] in ["llama"]:
|
||||
n_head = self.gguf_file_meta['llama.attention.head_count_kv']
|
||||
values = (values.reshape(n_head, values.shape[0] // n_head // 2, 2, *values.shape[1:])
|
||||
.swapaxes(1, 2)
|
||||
.reshape(values.shape))
|
||||
return values
|
||||
def has_tensor(self, name: str):
|
||||
name = translate_name_to_gguf(name)
|
||||
return name in self.tensor_info
|
||||
|
||||
def get_ggml_type(self, name: str):
|
||||
name = translate_name_to_gguf(name)
|
||||
if name not in self.tensor_info:
|
||||
raise KeyError(f"Key {name} not found in GGUF files")
|
||||
return self.tensor_info[name]["ggml_type"]
|
||||
|
||||
class ModelLoaderFactory:
|
||||
"""
|
||||
Factory class for creating model loaders.
|
||||
Automatically detects the model format based on file extensions in the directory.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def create_loader(path: str):
|
||||
"""
|
||||
Create a model loader for the given path by detecting the model format.
|
||||
The function checks for the presence of .safetensors or .gguf files
|
||||
in the specified path and creates the appropriate loader.
|
||||
|
||||
Args:
|
||||
path: Path to the model directory or file
|
||||
|
||||
Returns:
|
||||
An appropriate ModelLoader instance (SafeTensorLoader or GGUFLoader)
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If no supported model files are found in the path
|
||||
"""
|
||||
if not os.path.exists(path):
|
||||
raise FileNotFoundError(f"Path not found: {path}")
|
||||
|
||||
# Normalize to directory path if a file was provided
|
||||
if os.path.isfile(path):
|
||||
if path.endswith(".safetensors"):
|
||||
return SafeTensorLoader(path)
|
||||
elif path.endswith(".gguf"):
|
||||
return GGUFLoader(path)
|
||||
else:
|
||||
folder_path = os.path.dirname(path)
|
||||
else:
|
||||
folder_path = path
|
||||
|
||||
# Check for safetensors files
|
||||
has_safetensors = False
|
||||
has_gguf = False
|
||||
|
||||
for root, _, files in os.walk(folder_path):
|
||||
for file in files:
|
||||
if file.endswith(".safetensors"):
|
||||
has_safetensors = True
|
||||
break
|
||||
elif file.endswith(".gguf"):
|
||||
has_gguf = True
|
||||
break
|
||||
if has_safetensors or has_gguf:
|
||||
break
|
||||
|
||||
# Create the appropriate loader based on detected file types
|
||||
# Prioritize SafeTensor over GGUF if both are present
|
||||
if has_safetensors:
|
||||
try:
|
||||
return SafeTensorLoader(folder_path)
|
||||
except Exception as e:
|
||||
print(f"Failed to create SafeTensorLoader: {e}")
|
||||
# Fall through to try GGUF if SafeTensor fails
|
||||
if not has_gguf:
|
||||
raise
|
||||
|
||||
if has_gguf:
|
||||
try:
|
||||
return GGUFLoader(folder_path)
|
||||
except Exception as e:
|
||||
print(f"Failed to create GGUFLoader: {e}")
|
||||
raise
|
||||
|
||||
# No supported model files found
|
||||
raise FileNotFoundError(f"No .safetensors or .gguf files found in: {folder_path}")
|
|
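`ModelLoaderFactory.create_loader` scans the path and prefers safetensors over GGUF when both formats are present. A minimal usage sketch; the model path and tensor name are illustrative.

```python
from ktransformers.util.custom_loader import ModelLoaderFactory, SafeTensorLoader

loader = ModelLoaderFactory.create_loader("/path/to/model")  # illustrative path

if isinstance(loader, SafeTensorLoader):
    w = loader.load_dequantized_tensor("model.embed_tokens.weight", device="cpu")
else:  # GGUFLoader: HF-style names are translated to GGUF names internally
    w = loader.load_gguf_tensor("model.embed_tokens.weight", device="cpu")
```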
@ -11,13 +11,24 @@ from torch import nn
|
|||
import itertools
|
||||
import time
|
||||
import enum
|
||||
from ktransformers.util.custom_gguf import translate_name_to_gguf
|
||||
from ktransformers.util.custom_gguf import GGUFLoader
|
||||
from transformers import (
|
||||
LogitsProcessorList,
|
||||
TemperatureLogitsWarper,
|
||||
TopKLogitsWarper,
|
||||
TopPLogitsWarper,
|
||||
MinPLogitsWarper,
|
||||
TypicalLogitsWarper,
|
||||
EpsilonLogitsWarper,
|
||||
EtaLogitsWarper,
|
||||
)
|
||||
|
||||
from ktransformers.util.custom_loader import ModelLoaderFactory, ModelLoader, SafeTensorLoader, GGUFLoader, translate_name_to_gguf
|
||||
from ktransformers.operators import base_operator
|
||||
from ktransformers.models.custom_cache import StaticCache
|
||||
from ktransformers.util.cuda_graph_runner import CUDAGraphRunner
|
||||
from ktransformers.util.textstream import TextStreamer
|
||||
from ktransformers.operators.flashinfer_wrapper import MLAWrapperSingleton
|
||||
if not torch.xpu.is_available():
|
||||
from ktransformers.operators.flashinfer_wrapper import MLAWrapperSingleton
|
||||
import socket
|
||||
|
||||
warm_uped = False
|
||||
|
@ -49,6 +60,8 @@ def get_compute_capability(device:torch.device = None):
|
|||
return min_compute_capability_major
|
||||
else:
|
||||
return torch.cuda.get_device_properties(device)
|
||||
else:
|
||||
return 0
|
||||
|
||||
def set_module(model, submodule_key, module):
|
||||
tokens = submodule_key.split('.')
|
||||
|
@ -87,45 +100,132 @@ def get_all_used_cuda_device(device_map:dict):
|
|||
all_device_list = list(all_device_list)
|
||||
return all_device_list
|
||||
|
||||
def load_cur_state_dict(module: nn.Module, gguf_loader: GGUFLoader, prefix: str = ""):
|
||||
def load_cur_state_dict(module: nn.Module, gguf_loader: ModelLoader, prefix: str = "", device="cuda"):
|
||||
prefix = prefix.replace("orig_module.", "")
|
||||
persistent_buffers = {k: v for k, v in module._buffers.items() if k not in module._non_persistent_buffers_set}
|
||||
local_name_params = itertools.chain(module._parameters.items(), persistent_buffers.items())
|
||||
local_state = {k: v for k, v in local_name_params if v is not None}
|
||||
for name, param in local_state.items():
|
||||
key = prefix + name
|
||||
translated_key = translate_name_to_gguf(key)
|
||||
translated_key = key
|
||||
|
||||
# TODO: Merge all loader.
|
||||
# I know this is ugly but let's do it for now.
|
||||
if gguf_loader.safetensor_loader is not None:
|
||||
load_dequantized_tensor = gguf_loader.safetensor_loader.load_dequantized_tensor
|
||||
tensor_file_map = gguf_loader.safetensor_loader.tensor_file_map
|
||||
if isinstance(gguf_loader, SafeTensorLoader):
|
||||
load_dequantized_tensor = gguf_loader.load_dequantized_tensor
|
||||
else:
|
||||
load_dequantized_tensor = gguf_loader.load_gguf_tensor
|
||||
tensor_file_map = gguf_loader.tensor_file_map
|
||||
|
||||
if translated_key in tensor_file_map:
|
||||
if gguf_loader.has_tensor(translated_key) or "kv_b_proj" in translated_key:
|
||||
target_dtype = torch.get_default_dtype()
|
||||
device = get_device(translated_key[:translated_key.rfind(".")], gguf_loader.tensor_device_map)
|
||||
print(f"loading {translated_key} to {device}")
|
||||
torch.cuda.empty_cache()
|
||||
weights = load_dequantized_tensor(translated_key, device=device).to(dtype=target_dtype)
|
||||
set_param(module, name, weights)
|
||||
del weights
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.empty_cache()
|
||||
elif torch.xpu.is_available():
|
||||
torch.xpu.empty_cache()
|
||||
if "kv_b_proj" in translated_key and not gguf_loader.has_tensor(translated_key):
|
||||
attn_k_b = load_dequantized_tensor(translated_key.replace("self_attn.kv_b_proj", "attn_k_b"), device=device).to(dtype=target_dtype)
|
||||
attn_k_b = attn_k_b.transpose(1, 2).contiguous()
|
||||
attn_v_b = load_dequantized_tensor(translated_key.replace("self_attn.kv_b_proj", "attn_v_b"), device=device).to(dtype=target_dtype)
|
||||
kv_b_proj = torch.cat((attn_k_b, attn_v_b), dim=1)
|
||||
kv_b_proj = kv_b_proj.contiguous() if kv_b_proj.ndim == 2 else kv_b_proj.flatten(0, 1).contiguous()
|
||||
set_param(module, name, kv_b_proj)
|
||||
del attn_k_b
|
||||
del attn_v_b
|
||||
else:
|
||||
weights = load_dequantized_tensor(translated_key, device=device).to(dtype=target_dtype)
|
||||
set_param(module, name, weights)
|
||||
del weights
|
||||
else:
|
||||
#print(load_config.tensor_file_map.keys())
|
||||
raise Exception(f"can't find {translated_key} in GGUF file!")
|
||||
|
||||
def load_weights(module:nn.Module, gguf_loader:GGUFLoader, prefix=''):
|
||||
|
||||
def sync_all_device(all_device_list):
|
||||
for device in all_device_list:
|
||||
if "cuda" in device.lower():
|
||||
torch.cuda.synchronize(device)
|
||||
elif "xpu" in device.lower():
|
||||
torch.xpu.synchronize(device)
|
||||
else:
|
||||
raise RuntimeError("The device {} is not available".format(device))
|
||||
|
||||
torch_device_mapping ={"cuda": "cuda:0", "xpu": "xpu:0"}
|
||||
|
||||
def xpu_fp16_model(config):
|
||||
# Check whether this model should run on XPU with FP16 dtype
|
||||
if not torch.xpu.is_available():
|
||||
return False
|
||||
if config.architectures[0] == "DeepseekV3ForCausalLM":
|
||||
return True
|
||||
if config.architectures[0] == "Qwen3MoeForCausalLM" and config.hidden_size == 4096:
|
||||
# Qwen3-30B seems to have a precision issue with FP16
|
||||
# so we only use FP16 for Qwen3-235B now
|
||||
return True
|
||||
return False
|
||||
|
||||
def load_weights(module:nn.Module, gguf_loader:ModelLoader, prefix='', device="cuda"):
|
||||
#print(f"recursively loading weights {prefix}")
|
||||
if not isinstance(module, base_operator.BaseInjectedModule):
|
||||
load_cur_state_dict(module, gguf_loader, prefix)
|
||||
load_cur_state_dict(module, gguf_loader, prefix, device=device)
|
||||
for name, child in module._modules.items():
|
||||
load_weights(child, gguf_loader, prefix+name+".")
|
||||
load_weights(child, gguf_loader, prefix+name+".", device=device)
|
||||
else:
|
||||
module.load()
|
||||
|
||||
def tf_logits_warper(generation_config):
|
||||
"""
|
||||
This function returns a [`LogitsProcessorList`] that contains all relevant [`LogitsWarper`] instances
|
||||
used for multinomial sampling.
|
||||
"""
|
||||
|
||||
# instantiate warpers list
|
||||
warpers = LogitsProcessorList()
|
||||
|
||||
# In beam methods, we need to keep at least one non-eos token to explore continuations that might have a
|
||||
# better score (i.e. keep len(list(generation_config._eos_token_tensor)) + 1)
|
||||
if generation_config.num_beams > 1:
|
||||
if isinstance(generation_config._eos_token_tensor, list):
|
||||
min_tokens_to_keep = len(generation_config._eos_token_tensor) + 1
|
||||
elif isinstance(generation_config._eos_token_tensor, torch.Tensor):
|
||||
min_tokens_to_keep = generation_config._eos_token_tensor.shape[0] + 1
|
||||
else:
|
||||
min_tokens_to_keep = 2
|
||||
else:
|
||||
min_tokens_to_keep = 1
|
||||
|
||||
# the following idea is largely copied from this PR: https://github.com/huggingface/transformers/pull/5420/files
|
||||
# all samplers can be found in `generation_utils_samplers.py`
|
||||
if generation_config.temperature is not None and generation_config.temperature != 1.0:
|
||||
warpers.append(TemperatureLogitsWarper(generation_config.temperature))
|
||||
if generation_config.top_k is not None and generation_config.top_k != 0:
|
||||
warpers.append(TopKLogitsWarper(top_k=generation_config.top_k, min_tokens_to_keep=min_tokens_to_keep))
|
||||
if generation_config.top_p is not None and generation_config.top_p < 1.0:
|
||||
warpers.append(TopPLogitsWarper(top_p=generation_config.top_p, min_tokens_to_keep=min_tokens_to_keep))
|
||||
if generation_config.min_p is not None:
|
||||
# Applied after temperature scaling (see https://github.com/ggerganov/llama.cpp/pull/3841#issuecomment-2073826084)
|
||||
warpers.append(MinPLogitsWarper(min_p=generation_config.min_p, min_tokens_to_keep=min_tokens_to_keep))
|
||||
if generation_config.typical_p is not None and generation_config.typical_p < 1.0:
|
||||
warpers.append(
|
||||
TypicalLogitsWarper(mass=generation_config.typical_p, min_tokens_to_keep=min_tokens_to_keep)
|
||||
)
|
||||
if generation_config.epsilon_cutoff is not None and 0.0 < generation_config.epsilon_cutoff < 1.0:
|
||||
warpers.append(
|
||||
EpsilonLogitsWarper(epsilon=generation_config.epsilon_cutoff, min_tokens_to_keep=min_tokens_to_keep)
|
||||
)
|
||||
if generation_config.eta_cutoff is not None and 0.0 < generation_config.eta_cutoff < 1.0:
|
||||
warpers.append(
|
||||
EtaLogitsWarper(
|
||||
epsilon=generation_config.eta_cutoff, min_tokens_to_keep=min_tokens_to_keep
|
||||
)
|
||||
)
|
||||
# `LogitNormalization` should always be the last logit processor, when present
|
||||
if generation_config.renormalize_logits is True:
|
||||
warpers.append(LogitNormalization())
|
||||
return warpers
|
||||
|
||||
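`xpu_fp16_model` gates which architectures run in FP16 on XPU (DeepseekV3, and only the 235B Qwen3 MoE variant). A hedged sketch of how its result could drive dtype selection; the call site below is an assumption, only the predicate itself comes from the code above.

```python
import torch

def pick_dtype(config, xpu_fp16_model) -> torch.dtype:
    """Use FP16 only for the architectures the predicate whitelists; otherwise BF16 (assumed default)."""
    return torch.float16 if xpu_fp16_model(config) else torch.bfloat16
```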
def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cuda_graph: bool = True,
|
||||
mode = 'normal', force_think: bool = False, chunk_size = 16384, use_flashinfer_mla = False,
|
||||
num_heads = None, head_dim_ckv = None, head_dim_kpe = None, q_head_dim = None):
|
||||
|
@ -134,8 +234,8 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
|
|||
torch._dynamo.config.suppress_errors = True
|
||||
batch_size, seq_length = inputs.shape
|
||||
device_map = model.gguf_loader.tensor_device_map
|
||||
torch_device = get_device('blk.0.self_attn', device_map)
|
||||
torch_device = "cuda:0" if torch_device == "cuda" else torch_device
|
||||
torch_device = get_device('model.layers.0.self_attn', device_map)
|
||||
torch_device = torch_device_mapping[torch_device] if torch_device in torch_device_mapping else torch_device
|
||||
inputs = inputs.to(torch_device)
|
||||
all_cuda_device = get_all_used_cuda_device(device_map)
|
||||
|
||||
|
@ -148,7 +248,12 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
|
|||
logits = cuda_graph_runner(cur_token, position_ids, cache_position)
|
||||
else:
|
||||
# custom_stream = torch.cuda.Stream()
|
||||
torch.cuda.set_device(torch_device)
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.set_device(torch_device)
|
||||
elif torch.xpu.is_available():
|
||||
torch.xpu.set_device(torch_device)
|
||||
else:
|
||||
raise RuntimeError(f"The device: {torch_device} is not available")
|
||||
inputs_embeds = model.model.embed_tokens(cur_token.to("cpu")).to(torch_device)
|
||||
# with torch.cuda.stream(custom_stream):
|
||||
logits=model(inputs_embeds=inputs_embeds,
|
||||
|
@ -156,10 +261,9 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
|
|||
cache_position=cache_position,
|
||||
past_key_values=past_key_values,
|
||||
return_dict=False, use_cache=True)[0]
|
||||
if past_key_values != None:
|
||||
if past_key_values != None and isinstance(past_key_values, StaticCache):
|
||||
past_key_values.change_seq_length(1)
|
||||
for device in all_cuda_device:
|
||||
torch.cuda.synchronize(device)
|
||||
sync_all_device(all_cuda_device)
|
||||
#print(logits)
|
||||
next_token_scores = logits_warper(inputs, logits[:, -1, :])
|
||||
if generation_config.do_sample:
|
||||
|
@ -185,11 +289,22 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
|
|||
|
||||
return logits
|
||||
|
||||
torch.cuda.set_device(torch_device)
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.set_device(torch_device)
|
||||
elif torch.xpu.is_available():
|
||||
torch.xpu.set_device(torch_device)
|
||||
else:
|
||||
raise RuntimeError(f"The device: {torch_device} is not available")
|
||||
with torch.no_grad():
|
||||
|
||||
stream = TextStreamer(tokenizer)
|
||||
if mode != 'long_context':
|
||||
if torch.xpu.is_available():
|
||||
from ipex_llm.transformers.kv import DynamicUnbalancedFp8Cache, DynamicNormalCache
|
||||
if model.config.architectures[0] in ["DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM"]:
|
||||
past_key_values = DynamicUnbalancedFp8Cache.from_legacy_cache(None)
|
||||
else:
|
||||
past_key_values = DynamicNormalCache.from_legacy_cache(None)
|
||||
elif mode != 'long_context':
|
||||
past_key_values = StaticCache(
|
||||
config = model.config, max_batch_size = 1, max_cache_len = seq_length + max_new_tokens, device = device_map, dtype = model.dtype
|
||||
)
|
||||
|
@ -201,14 +316,8 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
|
|||
# change this to modify generate config
|
||||
#top_k=5, top_p=0.85, temperature=0.1
|
||||
)
|
||||
try: # transformers==4.43
|
||||
logits_warper = (
|
||||
model._get_logits_warper(generation_config,device=inputs.device)
|
||||
)
|
||||
except:
|
||||
logits_warper = (
|
||||
model._get_logits_warper(generation_config)
|
||||
)
|
||||
|
||||
logits_warper = tf_logits_warper(generation_config)
|
||||
|
||||
cache_position = torch.arange(seq_length, device=torch_device, dtype=torch.int32)
|
||||
generated_ids = torch.zeros(
|
||||
|
|
367
ktransformers/util/weight_loader.py
Normal file
367
ktransformers/util/weight_loader.py
Normal file
|
@ -0,0 +1,367 @@
|
|||
from abc import ABC, abstractmethod
|
||||
import os
|
||||
import torch
|
||||
import numpy as np
|
||||
from safetensors import safe_open
|
||||
from typing import Dict, Any, Optional, Union
|
||||
|
||||
class ModelLoader(ABC):
|
||||
"""
|
||||
Abstract base class for model loaders.
|
||||
Defines the interface that all model loaders must implement.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def load_tensor(self, name: str, device: str = "cpu") -> torch.Tensor:
|
||||
"""
|
||||
Load a tensor by name.
|
||||
|
||||
Args:
|
||||
name: Name of the tensor to load
|
||||
device: Device to load the tensor to
|
||||
|
||||
Returns:
|
||||
The loaded tensor
|
||||
"""
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
@abstractmethod
|
||||
def supports_format(cls, path: str) -> bool:
|
||||
"""
|
||||
Check if this loader supports the given path format.
|
||||
|
||||
Args:
|
||||
path: Path to check
|
||||
|
||||
Returns:
|
||||
True if this loader supports the given path, False otherwise
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class SafeTensorLoader(ModelLoader):
|
||||
"""
|
||||
Loader for SafeTensor format models.
|
||||
"""
|
||||
|
||||
def __init__(self, path: str):
|
||||
"""
|
||||
Initialize the SafeTensor loader.
|
||||
|
||||
Args:
|
||||
path: Path to the model directory or file
|
||||
"""
|
||||
self.tensor_file_map = {} # Maps tensor names to file paths
|
||||
self.file_handle_map = {} # Maps file names to file handles
|
||||
self._load_tensor_file_map(path)
|
||||
|
||||
def _load_tensor_file_map(self, path: str) -> None:
|
||||
"""
|
||||
Load the tensor file map from the given path.
|
||||
|
||||
Args:
|
||||
path: Path to the model directory or file
|
||||
"""
|
||||
# Normalize path to directory
|
||||
if not os.path.exists(path):
|
||||
raise FileNotFoundError(f"Path not found: {path}")
|
||||
if os.path.isfile(path):
|
||||
folder_path = os.path.dirname(path)
|
||||
else:
|
||||
folder_path = path
|
||||
|
||||
found_safetensor = False
|
||||
for root, _, files in os.walk(folder_path):
|
||||
files = sorted(files)
|
||||
for file in files:
|
||||
if file.endswith(".safetensors"):
|
||||
found_safetensor = True
|
||||
file_path = os.path.join(root, file)
|
||||
if file not in self.file_handle_map:
|
||||
try:
|
||||
handle = safe_open(file_path, framework="pt")
|
||||
self.file_handle_map[file] = handle
|
||||
except Exception as e:
|
||||
print(f"Error opening Safetensor file {file_path}: {e}")
|
||||
continue
|
||||
|
||||
f = self.file_handle_map.get(file)
|
||||
if f is None:
|
||||
continue
|
||||
try:
|
||||
for key in f.keys():
|
||||
self.tensor_file_map[key] = file
|
||||
except Exception as e:
|
||||
print(f"Error reading Safetensor file {file_path}: {e}")
|
||||
|
||||
if not found_safetensor:
|
||||
# Not raising an error here allows the factory to try other loaders
|
||||
print(f"No Safetensor files found in {folder_path}")
|
||||
|
||||
def load_tensor(self, name: str, device: str = "cpu") -> torch.Tensor:
|
||||
"""
|
||||
Load a tensor by name.
|
||||
|
||||
Args:
|
||||
name: Name of the tensor to load
|
||||
device: Device to load the tensor to
|
||||
|
||||
Returns:
|
||||
The loaded tensor
|
||||
"""
|
||||
if name not in self.tensor_file_map:
|
||||
raise KeyError(f"Key {name} not found in Safetensor files")
|
||||
file = self.tensor_file_map[name]
|
||||
f = self.file_handle_map.get(file)
|
||||
if f is None:
|
||||
raise FileNotFoundError(f"File {file} not found in Safetensor files")
|
||||
tensor = f.get_tensor(name)
|
||||
return tensor.to(device)
|
||||
|
||||
def load_dequantized_tensor(self, name: str, device: str = "cpu") -> torch.Tensor:
|
||||
"""
|
||||
Load and dequantize a tensor.
|
||||
|
||||
Args:
|
||||
name: Name of the tensor to load
|
||||
device: Device to load the tensor to
|
||||
|
||||
Returns:
|
||||
The dequantized tensor
|
||||
"""
|
||||
if name not in self.tensor_file_map:
|
||||
raise KeyError(f"Key {name} not found in Safetensor files")
|
||||
file = self.tensor_file_map[name]
|
||||
f = self.file_handle_map.get(file)
|
||||
if f is None:
|
||||
raise FileNotFoundError(f"File {file} not found in Safetensor files")
|
||||
tensor = f.get_tensor(name).to(device)
|
||||
if name.endswith(".weight"):
|
||||
if name[:-7] + ".weight_scale_inv" in self.tensor_file_map:
|
||||
weight_scale_inv = f.get_tensor(name[:-7] + ".weight_scale_inv").to(device)
|
||||
# Assuming weight_dequant function is imported
|
||||
from ktransformers.ktransformers_ext.triton.fp8gemm import weight_dequant
|
||||
tensor = weight_dequant(tensor, weight_scale_inv)
|
||||
return tensor.to(device)
|
||||
|
||||
def close_all_handles(self) -> None:
|
||||
"""
|
||||
Close all file handles.
|
||||
"""
|
||||
for handle in self.file_handle_map.values():
|
||||
handle.close()
|
||||
self.file_handle_map.clear()
|
||||
|
||||
@classmethod
|
||||
def supports_format(cls, path: str) -> bool:
|
||||
"""
|
||||
Check if this loader supports the given path format.
|
||||
|
||||
Args:
|
||||
path: Path to check
|
||||
|
||||
Returns:
|
||||
True if safetensor files are found in the path, False otherwise
|
||||
"""
|
||||
# Normalize path to directory
|
||||
if not os.path.exists(path):
|
||||
return False
|
||||
if os.path.isfile(path):
|
||||
if path.endswith(".safetensors"):
|
||||
return True
|
||||
folder_path = os.path.dirname(path)
|
||||
else:
|
||||
folder_path = path
|
||||
|
||||
# Check if any safetensor files exist in the folder
|
||||
for root, _, files in os.walk(folder_path):
|
||||
for file in files:
|
||||
if file.endswith(".safetensors"):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
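# Illustrative usage of SafeTensorLoader (a sketch; the directory and tensor
# name below are placeholder assumptions, not values from this repository):
#
#     loader = SafeTensorLoader("/path/to/model_dir")
#     if "model.embed_tokens.weight" in loader.tensor_file_map:
#         w = loader.load_tensor("model.embed_tokens.weight", device="cpu")
#         # If a matching ".weight_scale_inv" tensor ships with the checkpoint,
#         # load_dequantized_tensor() returns the FP8-dequantized weight instead.
#         w = loader.load_dequantized_tensor("model.embed_tokens.weight", device="cpu")
#     loader.close_all_handles()
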
class GGUFLoader(ModelLoader):
    """
    Loader for GGUF format models.
    """

    def __init__(self, path: str):
        """
        Initialize the GGUF loader.

        Args:
            path: Path to the model directory or file
        """
        # Check if path exists
        if not os.path.exists(path):
            raise FileNotFoundError(f"GGUF dir not found: {path}")
        if os.path.isfile(path):
            self.gguf_path = os.path.dirname(path)
        else:
            self.gguf_path = path

        self.tensor_info = {}      # Stores tensor metadata
        self.tensor_file_map = {}  # Maps tensor names to file paths
        self.file_data_map = {}    # Maps file paths to memory-mapped data
        self.gguf_file_meta = {}   # Stores GGUF metadata

        # For compatibility with the factory pattern
        self.safetensor_loader = None

        # Scan all GGUF files in the directory
        found_gguf = False
        for root, _, files in os.walk(self.gguf_path):
            for file in files:
                if file.endswith(".gguf"):
                    found_gguf = True
                    file_path = os.path.join(root, file)
                    with open(file_path, "rb") as f:
                        self._load_gguf(f)
                    if file_path not in self.file_data_map:
                        self.file_data_map[file_path] = np.memmap(file_path, mode='r')

        if not found_gguf:
            raise FileNotFoundError(f"Cannot find any .gguf files in: {self.gguf_path}")

    def _load_gguf(self, f) -> None:
        """
        Load GGUF file metadata and tensor info.

        Args:
            f: File handle of the GGUF file
        """
        # Implementation should follow the original GGUFLoader._load_gguf
        # This is a simplified version for illustration
        f.seek(0)
        assert f.read(4) == b'GGUF'

        # Read header
        values = struct.unpack("<IQQ", f.read(4+8+8))
        version, n_tensors, n_kv = values
        if version != 3:
            warnings.warn(f"Version {version} has never been tested, might not work")

        # Read key-value pairs
        info = {}
        for _ in range(n_kv):
            name = self._read_value(f, 8)  # DATA_TYPES["string"]
            data_type = struct.unpack("<I", f.read(4))[0]
            info[name] = self._read_value(f, data_type)

        # Read tensor info
        tensor_info = {}
        for _ in range(n_tensors):
            name = self._read_value(f, 8)                                # DATA_TYPES["string"]
            shape_len = self._read_value(f, 4)                           # DATA_TYPES["uint32"]
            shape = [self._read_value(f, 10) for _ in range(shape_len)]  # DATA_TYPES["uint64"]
            ggml_type = self._read_value(f, 4)                           # DATA_TYPES["uint32"]
            offset = self._read_value(f, 10)                             # DATA_TYPES["uint64"]

            # Additional tensor metadata would be calculated here
            # For brevity, we're omitting the detailed tensor metadata calculation
            tensor_info[name] = {
                "ggml_type": ggml_type,
                "shape": shape,
                "offset": offset,
                # ... other tensor metadata
            }

        start = f.tell()
        alignment = info.get("general.alignment", 32)

        # Calculate actual file offsets
        for t in tensor_info.values():
            offset = start + t["offset"]
            offset += (alignment - offset % alignment) % alignment
            t["offset"] = offset

        # Update file maps
        for name in tensor_info:
            self.tensor_file_map[name] = f.name

        self.tensor_info.update(tensor_info)
        self.gguf_file_meta.update(info)

    def _read_value(self, f, data_type) -> Any:
        """
        Read a value from the file according to its data type.

        Args:
            f: File handle
            data_type: Type of data to read

        Returns:
            The read value
        """
        # Simplified implementation
        # In a complete implementation, this would handle all data types
        if data_type == 8:     # DATA_TYPES["string"]
            length = struct.unpack("<Q", f.read(8))[0]
            return f.read(length).decode("utf-8")
        elif data_type == 4:   # DATA_TYPES["uint32"]
            return struct.unpack("<I", f.read(4))[0]
        elif data_type == 10:  # DATA_TYPES["uint64"]
            return struct.unpack("<Q", f.read(8))[0]
        # ... handling for other data types
        return None

    def load_tensor(self, name: str, device: str = "cpu") -> torch.Tensor:
        """
        Load a tensor by name.

        Args:
            name: Name of the tensor to load
            device: Device to load the tensor to

        Returns:
            The loaded tensor
        """
        # This should call load_gguf_tensor with the appropriate parameters
        return self.load_gguf_tensor(name, device)

    def load_gguf_tensor(self, name: str, device: str = "cpu", target_dtype=None) -> torch.Tensor:
        """
        Load a GGUF tensor by name.

        Args:
            name: Name of the tensor to load
            device: Device to load the tensor to
            target_dtype: Target data type for the tensor

        Returns:
            The loaded tensor
        """
        # Implementation would follow the original GGUFLoader.load_gguf_tensor
        # This is a placeholder for illustration
        if name not in self.tensor_info:
            raise KeyError(f"Tensor {name} not found")

        # Actual implementation would dequantize the tensor data
        # and return a torch.Tensor
        return torch.zeros(1, device=device)  # Placeholder
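# Sketch of the factory-style selection these loaders are written for
# ("ModelLoaderFactory" is an assumed helper name, not a class in this diff):
# each candidate loader is probed via supports_format() and the first match
# is instantiated.
#
#     class ModelLoaderFactory:
#         loaders = [SafeTensorLoader, GGUFLoader]
#
#         @classmethod
#         def create_loader(cls, path: str) -> ModelLoader:
#             for loader_cls in cls.loaders:
#                 if loader_cls.supports_format(path):
#                     return loader_cls(path)
#             raise ValueError(f"No loader supports path: {path}")
#
#     loader = ModelLoaderFactory.create_loader("/path/to/weights")
#     tensor = loader.load_tensor("some.tensor.name", device="cpu")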
@ -6,7 +6,7 @@ import sys
# sys.path.insert(0, "/home/azure/ktransformers")
import argparse
import torch
from ktransformers.util.custom_gguf import GGUFLoader, translate_name_to_gguf
from ktransformers.util.custom_loader import GGUFLoader, translate_name_to_gguf
from safetensors import safe_open
from safetensors.torch import save_file
import re
@ -30,7 +30,6 @@ dependencies = [
    "build",
    "fire",
    "protobuf",
    "triton >= 3.2"
]

requires-python = ">=3.10"
@ -70,7 +69,7 @@ ktransformers = "ktransformers.server.main:main"
[tool.setuptools.packages.find]
where = ["./", ]
include = ["ktransformers"]
include = ["ktransformers","ktransformers.*"]
[tool.black]
line-length = 120
preview = true
@ -7,4 +7,3 @@ cpufeature; sys_platform == 'win32' or sys_platform == 'Windows'
protobuf
tiktoken
blobfile
triton>=3.2
70
setup.py
@ -39,7 +39,17 @@ try:
    from torch_musa.utils.musa_extension import BuildExtension, MUSAExtension, MUSA_HOME
except ImportError:
    MUSA_HOME=None

KTRANSFORMERS_BUILD_XPU = torch.xpu.is_available()

# Detect the DEV_BACKEND environment variable
dev_backend = os.environ.get("DEV_BACKEND", "").lower()
if dev_backend == "xpu":
    triton_dep = [
        "pytorch-triton-xpu==3.3.0"
    ]
else:
    triton_dep = ["triton>=3.2"]

with_balance = os.environ.get("USE_BALANCE_SERVE", "0") == "1"

class CpuInstructInfo:
@ -241,8 +251,10 @@ class VersionInfo:
            backend_version = f"mu{self.get_musa_bare_metal_version(MUSA_HOME)}"
        elif ROCM_HOME is not None:
            backend_version = f"rocm{self.get_rocm_bare_metal_version(ROCM_HOME)}"
        elif torch.xpu.is_available():
            backend_version = f"xpu"
        else:
            raise ValueError("Unsupported backend: CUDA_HOME MUSA_HOME ROCM_HOME all not set.")
            raise ValueError("Unsupported backend: CUDA_HOME MUSA_HOME ROCM_HOME all not set and XPU is not available.")
        package_version = f"{flash_version}+{backend_version}torch{torch_version}{cpu_instruct}"
        if full_version:
            return package_version
@ -511,8 +523,10 @@ class CMakeBuild(BuildExtension):
            cmake_args += ["-DKTRANSFORMERS_USE_MUSA=ON"]
        elif ROCM_HOME is not None:
            cmake_args += ["-DKTRANSFORMERS_USE_ROCM=ON"]
        elif KTRANSFORMERS_BUILD_XPU:
            cmake_args += ["-DKTRANSFORMERS_USE_XPU=ON", "-DKTRANSFORMERS_USE_CUDA=OFF"]
        else:
            raise ValueError("Unsupported backend: CUDA_HOME, MUSA_HOME, and ROCM_HOME are not set.")
            raise ValueError("Unsupported backend: CUDA_HOME, MUSA_HOME, and ROCM_HOME are not set and XPU is not available.")

        cmake_args = get_cmake_abi_args(cmake_args)
        # log cmake_args
@ -636,33 +650,41 @@ elif MUSA_HOME is not None:
            ]
        }
    )
elif torch.xpu.is_available(): #XPUExtension is not available now.
    ops_module = None
else:
    raise ValueError("Unsupported backend: CUDA_HOME and MUSA_HOME are not set.")
    raise ValueError("Unsupported backend: CUDA_HOME ROCM_HOME MUSA_HOME are not set and XPU is not available.")

ext_modules = [
    CMakeExtension("cpuinfer_ext", os.fspath(Path("").resolve() / "csrc" / "ktransformers_ext")),
    ops_module,
    CUDAExtension(
        'vLLMMarlin', [
            'csrc/custom_marlin/binding.cpp',
            'csrc/custom_marlin/gptq_marlin/gptq_marlin.cu',
            'csrc/custom_marlin/gptq_marlin/gptq_marlin_repack.cu',
        ],
        extra_compile_args={
            'cxx': ['-O3'],
            'nvcc': ['-O3', '-Xcompiler', '-fPIC'],
        },
    )
]
if with_balance:
    print("using balance_serve")
    ext_modules.append(
        CMakeExtension("balance_serve", os.fspath(Path("").resolve()/ "csrc"/ "balance_serve"))
    )
if not torch.xpu.is_available():
    ext_modules = [
        CMakeExtension("cpuinfer_ext", os.fspath(Path("").resolve() / "csrc" / "ktransformers_ext")),
        ops_module,
        CUDAExtension(
            'vLLMMarlin', [
                'csrc/custom_marlin/binding.cpp',
                'csrc/custom_marlin/gptq_marlin/gptq_marlin.cu',
                'csrc/custom_marlin/gptq_marlin/gptq_marlin_repack.cu',
            ],
            extra_compile_args={
                'cxx': ['-O3'],
                'nvcc': ['-O3', '-Xcompiler', '-fPIC'],
            },
        )
    ]
    if with_balance:
        print("using balance_serve")
        ext_modules.append(
            CMakeExtension("balance_serve", os.fspath(Path("").resolve()/ "csrc"/ "balance_serve"))
        )
else:
    ext_modules = [
        CMakeExtension("cpuinfer_ext", os.fspath(Path("").resolve() / "csrc" / "ktransformers_ext")),
    ]

setup(
    name=VersionInfo.PACKAGE_NAME,
    version=VersionInfo().get_package_version(),
    install_requires=triton_dep,
    cmdclass={"bdist_wheel":BuildWheelsCommand ,"build_ext": CMakeBuild},
    ext_modules=ext_modules
)