mirror of
https://github.com/kvcache-ai/ktransformers.git
synced 2025-09-07 13:09:50 +00:00
Merge branch 'kvcache-ai:main' into main
This commit is contained in:
commit
877aec858e
251 changed files with 47224 additions and 749 deletions
2
.github/workflows/package_wheel_release.yml
vendored
2
.github/workflows/package_wheel_release.yml
vendored
|
@ -163,6 +163,8 @@ jobs:
|
|||
|
||||
- name: build for cuda
|
||||
if: matrix.cuda != ''
|
||||
env:
|
||||
USE_BALANCE_SERVE: "1"
|
||||
run: |
|
||||
git submodule init
|
||||
git submodule update
|
||||
|
|
13
.gitmodules
vendored
13
.gitmodules
vendored
|
@ -4,3 +4,16 @@
|
|||
[submodule "third_party/pybind11"]
|
||||
path = third_party/pybind11
|
||||
url = https://github.com/pybind/pybind11.git
|
||||
[submodule "third_party/spdlog"]
|
||||
path = third_party/spdlog
|
||||
url = https://github.com/gabime/spdlog.git
|
||||
[submodule "third_party/custom_flashinfer"]
|
||||
path = third_party/custom_flashinfer
|
||||
url = https://github.com/kvcache-ai/custom_flashinfer.git
|
||||
branch = fix-precision-mla-merge-main
|
||||
[submodule "third_party/xxHash"]
|
||||
path = third_party/xxHash
|
||||
url = https://github.com/Cyan4973/xxHash.git
|
||||
[submodule "third_party/prometheus-cpp"]
|
||||
path = third_party/prometheus-cpp
|
||||
url = https://github.com/jupp0r/prometheus-cpp
|
||||
|
|
84
Dockerfile
84
Dockerfile
|
@ -1,38 +1,64 @@
|
|||
FROM node:20.16.0 as web_compile
|
||||
WORKDIR /home
|
||||
RUN <<EOF
|
||||
git clone https://github.com/kvcache-ai/ktransformers.git &&
|
||||
cd ktransformers/ktransformers/website/ &&
|
||||
npm install @vue/cli &&
|
||||
npm run build &&
|
||||
rm -rf node_modules
|
||||
EOF
|
||||
|
||||
|
||||
|
||||
FROM pytorch/pytorch:2.5.1-cuda12.1-cudnn9-devel as compile_server
|
||||
|
||||
|
||||
ARG CPU_INSTRUCT=NATIVE
|
||||
|
||||
# 设置工作目录和 CUDA 路径
|
||||
WORKDIR /workspace
|
||||
ENV CUDA_HOME /usr/local/cuda
|
||||
COPY --from=web_compile /home/ktransformers /workspace/ktransformers
|
||||
RUN <<EOF
|
||||
apt update -y && apt install -y --no-install-recommends \
|
||||
ENV CUDA_HOME=/usr/local/cuda
|
||||
|
||||
|
||||
|
||||
# 安装依赖
|
||||
RUN apt update -y
|
||||
RUN apt install -y --no-install-recommends \
|
||||
libtbb-dev \
|
||||
libssl-dev \
|
||||
libcurl4-openssl-dev \
|
||||
libaio1 \
|
||||
libaio-dev \
|
||||
libfmt-dev \
|
||||
libgflags-dev \
|
||||
zlib1g-dev \
|
||||
patchelf \
|
||||
git \
|
||||
wget \
|
||||
vim \
|
||||
gcc \
|
||||
g++ \
|
||||
cmake &&
|
||||
rm -rf /var/lib/apt/lists/* &&
|
||||
cd ktransformers &&
|
||||
git submodule init &&
|
||||
git submodule update &&
|
||||
pip install --upgrade pip &&
|
||||
pip install ninja pyproject numpy cpufeature &&
|
||||
pip install flash-attn &&
|
||||
CPU_INSTRUCT=${CPU_INSTRUCT} KTRANSFORMERS_FORCE_BUILD=TRUE TORCH_CUDA_ARCH_LIST="8.0;8.6;8.7;8.9;9.0+PTX" pip install . --no-build-isolation --verbose &&
|
||||
pip cache purge &&
|
||||
cp /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /opt/conda/lib/
|
||||
EOF
|
||||
cmake
|
||||
# 拷贝代码
|
||||
RUN git clone https://github.com/kvcache-ai/ktransformers.git
|
||||
# 清理 apt 缓存
|
||||
RUN rm -rf /var/lib/apt/lists/*
|
||||
|
||||
ENTRYPOINT ["tail", "-f", "/dev/null"]
|
||||
# 进入项目目录
|
||||
WORKDIR /workspace/ktransformers
|
||||
# 初始化子模块
|
||||
RUN git submodule update --init --recursive
|
||||
|
||||
# 升级 pip
|
||||
RUN pip install --upgrade pip
|
||||
|
||||
# 安装构建依赖
|
||||
RUN pip install ninja pyproject numpy cpufeature aiohttp zmq openai
|
||||
|
||||
# 安装 flash-attn(提前装可以避免后续某些编译依赖出错)
|
||||
RUN pip install flash-attn
|
||||
|
||||
# 安装 ktransformers 本体(含编译)
|
||||
RUN CPU_INSTRUCT=${CPU_INSTRUCT} \
|
||||
USE_BALANCE_SERVE=1 \
|
||||
KTRANSFORMERS_FORCE_BUILD=TRUE \
|
||||
TORCH_CUDA_ARCH_LIST="8.0;8.6;8.7;8.9;9.0+PTX" \
|
||||
pip install . --no-build-isolation --verbose
|
||||
|
||||
RUN pip install third_party/custom_flashinfer/
|
||||
# 清理 pip 缓存
|
||||
RUN pip cache purge
|
||||
|
||||
# 拷贝 C++ 运行时库
|
||||
RUN cp /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /opt/conda/lib/
|
||||
|
||||
# 保持容器运行(调试用)
|
||||
ENTRYPOINT ["tail", "-f", "/dev/null"]
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
graft third_party
|
||||
graft ktransformers
|
||||
graft local_chat.py
|
||||
graft csrc
|
||||
include LICENSE README.md
|
||||
prune ktransformers/website
|
||||
prune ktransformers/logs
|
||||
|
@ -9,3 +10,4 @@ prune third_party/llama.cpp/models
|
|||
graft ktransformers/website/dist
|
||||
global-exclude __pycache__
|
||||
include KTransformersOps.*.so
|
||||
include cpuinfer_ext.*.so
|
||||
|
|
2
Makefile
2
Makefile
|
@ -29,4 +29,4 @@ clean:
|
|||
install_numa:
|
||||
USE_NUMA=1 make dev_install
|
||||
install_no_numa:
|
||||
env -u USE_NUMA make dev_install
|
||||
env -u USE_NUMA make dev_install
|
32
README.md
32
README.md
|
@ -23,17 +23,23 @@ Our vision for KTransformers is to serve as a flexible platform for experimentin
|
|||
|
||||
<h2 id="Updates">🔥 Updates</h2>
|
||||
|
||||
* **Apr 2, 2025**: Support Multi-concurrency. ([Tutorial](./doc/en/balance-serve.md)).
|
||||
|
||||
https://github.com/user-attachments/assets/faa3bda2-928b-45a7-b44f-21e12ec84b8a
|
||||
|
||||
* **Mar 15, 2025**: Support ROCm on AMD GPU ([Tutorial](./doc/en/ROCm.md)).
|
||||
* **Mar 5, 2025**: Support unsloth 1.58/2.51 bits weights and [IQ1_S/FP8 hybrid](./doc/en/fp8_kernel.md) weights. Support 139K [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022--v023-longer-context--fp8-kernel) for DeepSeek-V3 and R1 in 24GB VRAM.
|
||||
* **Feb 25, 2025**: Support [FP8 GPU kernel](./doc/en/fp8_kernel.md) for DeepSeek-V3 and R1; [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context).
|
||||
* **Feb 15, 2025**: Longer Context (from 4K to 8K for 24GB VRAM) & Slightly Faster Speed (+15%, up to 16 Tokens/s), update [docs](./doc/en/DeepseekR1_V3_tutorial.md) and [online books](https://kvcache-ai.github.io/ktransformers/).
|
||||
* **Feb 10, 2025**: Support Deepseek-R1 and V3 on single (24GB VRAM)/multi gpu and 382G DRAM, up to 3~28x speedup. For detailed show case and reproduction tutorial, see [here](./doc/en/DeepseekR1_V3_tutorial.md).
|
||||
* **Aug 28, 2024**: Decrease DeepseekV2's required VRAM from 21G to 11G.
|
||||
* **Aug 15, 2024**: Update detailed [tutorial](doc/en/injection_tutorial.md) for injection and multi-GPU.
|
||||
* **Aug 14, 2024**: Support llamfile as linear backend.
|
||||
* **Aug 15, 2024**: Update detailed [tutorial](doc/en/injection_tutorial.md) for injection and multi-GPU.
|
||||
* **Aug 14, 2024**: Support llamfile as linear backend.
|
||||
* **Aug 12, 2024**: Support multiple GPU; Support new model: mixtral 8\*7B and 8\*22B; Support q2k, q3k, q5k dequant on gpu.
|
||||
* **Aug 9, 2024**: Support windows native.
|
||||
|
||||
<!-- * **Aug 28, 2024**: Support 1M context under the InternLM2.5-7B-Chat-1M model, utilizing 24GB of VRAM and 150GB of DRAM. The detailed tutorial is [here](./doc/en/long_context_tutorial.md). -->
|
||||
|
||||
<h2 id="show-cases">🌟 Show Cases</h2>
|
||||
|
||||
<div>
|
||||
|
@ -45,16 +51,16 @@ https://github.com/user-attachments/assets/ebd70bfa-b2c1-4abb-ae3b-296ed38aa285
|
|||
</p>
|
||||
|
||||
- **[NEW!!!] Local 671B DeepSeek-Coder-V3/R1:** Running its Q4_K_M version using only 14GB VRAM and 382GB DRAM([Tutorial](./doc/en/DeepseekR1_V3_tutorial.md)).
|
||||
- Prefill Speed (tokens/s):
|
||||
- KTransformers: 54.21 (32 cores) → 74.362 (dual-socket, 2×32 cores) → 255.26 (optimized AMX-based MoE kernel, V0.3 only) → 286.55 (selectively using 6 experts, V0.3 only)
|
||||
- Compared to 10.31 tokens/s in llama.cpp with 2×32 cores, achieving up to **27.79× speedup**.
|
||||
- Decode Speed (tokens/s):
|
||||
- KTransformers: 8.73 (32 cores) → 11.26 (dual-socket, 2×32 cores) → 13.69 (selectively using 6 experts, V0.3 only)
|
||||
- Compared to 4.51 tokens/s in llama.cpp with 2×32 cores, achieving up to **3.03× speedup**.
|
||||
- Upcoming Open Source Release:
|
||||
- AMX optimizations and selective expert activation will be open-sourced in V0.3.
|
||||
- Currently available only in preview binary distribution, which can be downloaded [here](./doc/en/DeepseekR1_V3_tutorial.md).
|
||||
|
||||
- Prefill Speed (tokens/s):
|
||||
- KTransformers: 54.21 (32 cores) → 74.362 (dual-socket, 2×32 cores) → 255.26 (optimized AMX-based MoE kernel, V0.3 only) → 286.55 (selectively using 6 experts, V0.3 only)
|
||||
- Compared to 10.31 tokens/s in llama.cpp with 2×32 cores, achieving up to **27.79× speedup**.
|
||||
- Decode Speed (tokens/s):
|
||||
- KTransformers: 8.73 (32 cores) → 11.26 (dual-socket, 2×32 cores) → 13.69 (selectively using 6 experts, V0.3 only)
|
||||
- Compared to 4.51 tokens/s in llama.cpp with 2×32 cores, achieving up to **3.03× speedup**.
|
||||
- Upcoming Open Source Release:
|
||||
- AMX optimizations and selective expert activation will be open-sourced in V0.3.
|
||||
- Currently available only in preview binary distribution, which can be downloaded [here](./doc/en/DeepseekR1_V3_tutorial.md).
|
||||
- **Local 236B DeepSeek-Coder-V2:** Running its Q4_K_M version using only 21GB VRAM and 136GB DRAM, attainable on a local desktop machine, which scores even better than GPT4-0613 in [BigCodeBench](https://huggingface.co/blog/leaderboard-bigcodebench).
|
||||
|
||||
<p align="center">
|
||||
|
@ -96,19 +102,16 @@ https://github.com/user-attachments/assets/a865e5e4-bca3-401e-94b8-af3c080e6c12
|
|||
* **Flexible Sparse Attention Framework**: Offers a flexible block sparse attention framework for CPU offloaded decoding. Compatible with SnapKV, Quest, and InfLLm. Further information is available [here](./doc/en/long_context_introduction.md).
|
||||
-->
|
||||
|
||||
|
||||
<strong>More advanced features will coming soon, so stay tuned!</strong>
|
||||
|
||||
<h2 id="quick-start">🚀 Quick Start</h2>
|
||||
|
||||
|
||||
Getting started with KTransformers is simple! Follow the steps below to set up and start using it.
|
||||
|
||||
### 📥 Installation
|
||||
|
||||
To install KTransformers, follow the official [Installation Guide](https://kvcache-ai.github.io/ktransformers/en/install.html).
|
||||
|
||||
|
||||
<h2 id="tutorial">📃 Brief Injection Tutorial</h2>
|
||||
At the heart of KTransformers is a user-friendly, template-based injection framework.
|
||||
This allows researchers to easily replace original torch modules with optimized variants. It also simplifies the process of combining multiple optimizations, allowing the exploration of their synergistic effects.
|
||||
|
@ -167,7 +170,6 @@ The development of KTransformers is based on the flexible and versatile framewor
|
|||
|
||||
KTransformers is actively maintained and developed by contributors from the <a href="https://madsys.cs.tsinghua.edu.cn/">MADSys group</a> at Tsinghua University and members from <a href="http://approaching.ai/">Approaching.AI</a>. We welcome new contributors to join us in making KTransformers faster and easier to use.
|
||||
|
||||
|
||||
<h2 id="ack">Discussion</h2>
|
||||
|
||||
If you have any questions, feel free to open an issue. Alternatively, you can join our WeChat group for further discussion. QR Code: [WeChat Group](WeChatGroup.png)
|
||||
|
|
67
csrc/balance_serve/CMakeLists.txt
Normal file
67
csrc/balance_serve/CMakeLists.txt
Normal file
|
@ -0,0 +1,67 @@
|
|||
|
||||
cmake_minimum_required(VERSION 3.21)
|
||||
find_program(GCC_COMPILER NAMES g++-13 g++-12 g++-11 REQUIRED)
|
||||
set(CMAKE_CXX_COMPILER ${GCC_COMPILER})
|
||||
|
||||
# 显示选定的编译器
|
||||
message(STATUS "Using compiler: ${CMAKE_CXX_COMPILER}")
|
||||
|
||||
|
||||
project(balance_serve VERSION 0.1.0)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 20)
|
||||
# set(CMAKE_CXX_FLAGS "-Og -march=native -Wall -Wextra -g -fPIC")
|
||||
# set(CMAKE_BUILD_TYPE "Debug")
|
||||
set(CMAKE_CXX_FLAGS "-O3 -march=native -Wall -Wextra -fPIC")
|
||||
add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI})
|
||||
set(CMAKE_BUILD_TYPE "Release")
|
||||
|
||||
file(GLOB_RECURSE FMT_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/*.hpp" "${CMAKE_CURRENT_SOURCE_DIR}/*.h")
|
||||
|
||||
add_custom_target(
|
||||
format
|
||||
COMMAND clang-format
|
||||
-i
|
||||
-style=file
|
||||
${FMT_SOURCES}
|
||||
COMMENT "Running clang-format on all source files"
|
||||
)
|
||||
|
||||
set(BUILD_SHARED_LIBS ON)
|
||||
set(ENABLE_PUSH OFF)
|
||||
set(ENABLE_COMPRESSION OFF)
|
||||
|
||||
# set(CMAKE_BUILD_TYPE "Release")
|
||||
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
||||
|
||||
set(THIRD_PARTY_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../third_party)
|
||||
set(THIRD_PARTY_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/third_party)
|
||||
add_subdirectory(${THIRD_PARTY_DIR}/prometheus-cpp ${THIRD_PARTY_BUILD_DIR}/prometheus-cpp EXCLUDE_FROM_ALL)
|
||||
add_subdirectory(${THIRD_PARTY_DIR}/xxHash/cmake_unofficial ${THIRD_PARTY_BUILD_DIR}/xxHash EXCLUDE_FROM_ALL)
|
||||
|
||||
# add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third_party/prometheus-cpp ${CMAKE_CURRENT_BINARY_DIR}/third_party/prometheus-cpp)
|
||||
set(SPDLOG_DIR ${THIRD_PARTY_DIR}/spdlog)
|
||||
set(FMT_DIR ${THIRD_PARTY_DIR}/fmt)
|
||||
|
||||
set(KVC2_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/kvc2/src)
|
||||
|
||||
include_directories(${THIRD_PARTY_DIR})
|
||||
|
||||
add_subdirectory(${THIRD_PARTY_DIR}/pybind11 ${THIRD_PARTY_BUILD_DIR}/pybind11)
|
||||
|
||||
execute_process(
|
||||
COMMAND python3 -c "import torch; print(torch.__path__[0])"
|
||||
OUTPUT_VARIABLE TORCH_INSTALL_PREFIX
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE
|
||||
)
|
||||
|
||||
message(STATUS "Found PyTorch at: ${TORCH_INSTALL_PREFIX}")
|
||||
|
||||
# set(TORCH_INSTALL_PREFIX "/home/xwy/.conda/envs/kvc/lib/python3.12/site-packages/torch")
|
||||
find_library(TORCH_PYTHON_LIBRARY torch_python PATH "${TORCH_INSTALL_PREFIX}/lib")
|
||||
find_package(Torch REQUIRED PATHS "${TORCH_INSTALL_PREFIX}/share/cmake/Torch" NO_DEFAULT_PATH)
|
||||
|
||||
add_subdirectory(kvc2)
|
||||
add_subdirectory(sched)
|
||||
|
||||
# add_subdirectory(test)
|
25
csrc/balance_serve/kvc2/.clang-format
Normal file
25
csrc/balance_serve/kvc2/.clang-format
Normal file
|
@ -0,0 +1,25 @@
|
|||
Language: Cpp
|
||||
# 格式化风格,可以是LLVM, Google, Chromium, Mozilla, WebKit等,或者自定义
|
||||
BasedOnStyle: Google
|
||||
|
||||
# 缩进设置
|
||||
IndentWidth: 2
|
||||
TabWidth: 2
|
||||
UseTab: Never
|
||||
|
||||
# 换行相关设置
|
||||
BreakBeforeBraces: Attach
|
||||
AllowShortIfStatementsOnASingleLine: false
|
||||
AllowShortFunctionsOnASingleLine: Inline
|
||||
AllowShortLoopsOnASingleLine: false
|
||||
|
||||
# 类与结构体
|
||||
DerivePointerAlignment: false
|
||||
PointerAlignment: Left
|
||||
|
||||
# 包含文件的排序和分组
|
||||
IncludeBlocks: Preserve
|
||||
SortIncludes: true
|
||||
|
||||
# 控制最大行宽
|
||||
ColumnLimit: 120
|
103
csrc/balance_serve/kvc2/CMakeLists.txt
Normal file
103
csrc/balance_serve/kvc2/CMakeLists.txt
Normal file
|
@ -0,0 +1,103 @@
|
|||
cmake_minimum_required(VERSION 3.21)
|
||||
|
||||
find_program(GCC_COMPILER NAMES g++-13 g++-12 g++-11 REQUIRED)
|
||||
set(CMAKE_CXX_COMPILER ${GCC_COMPILER})
|
||||
|
||||
project(kvcache-manager VERSION 0.1.0)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 20)
|
||||
|
||||
# set(CMAKE_CXX_FLAGS "-fPIC -O3 -ffast-math -march=native -Wall -Wextra -Wpedantic -fvisibility=hidden -s")
|
||||
# set(CMAKE_CXX_FLAGS "-Og -march=native -Wall -Wextra -Wpedantic -g -fsanitize=address")
|
||||
# set(CMAKE_CXX_FLAGS "-march=native -Wall -Wextra -Wpedantic -g")
|
||||
# set(CMAKE_CXX_FLAGS "-fPIC -O3 -ffast-math -march=native -Wall -Wextra -g")
|
||||
set(CMAKE_BUILD_TYPE "Release")
|
||||
add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI})
|
||||
# set(CMAKE_BUILD_TYPE "Debug")
|
||||
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
||||
set(BUILD_TEST OFF)
|
||||
set(BUILD_PYTHON_EXT OFF)
|
||||
|
||||
# set(USE_IO_URING ON)
|
||||
if(USE_IO_URING)
|
||||
message(STATUS "Using io_uring")
|
||||
add_compile_definitions(USE_IO_URING)
|
||||
else()
|
||||
message(STATUS "Using aio")
|
||||
endif()
|
||||
|
||||
file(GLOB_RECURSE ALL_SOURCE_FILES src/*.cpp src/*.h test/*.cpp test/*.h test/*.hpp)
|
||||
|
||||
# 添加一个自定义目标来格式化所有代码
|
||||
if(NOT TARGET format)
|
||||
add_custom_target(
|
||||
format
|
||||
COMMAND clang-format
|
||||
-i
|
||||
-style=file
|
||||
${ALL_SOURCE_FILES}
|
||||
COMMENT "Running clang-format on all source files"
|
||||
)
|
||||
endif()
|
||||
|
||||
execute_process(
|
||||
COMMAND python3 -c "import torch; print(torch.__path__[0])"
|
||||
OUTPUT_VARIABLE TORCH_INSTALL_PREFIX
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE
|
||||
)
|
||||
|
||||
message(STATUS "Found PyTorch at: ${TORCH_INSTALL_PREFIX}")
|
||||
|
||||
# set(TORCH_INSTALL_PREFIX "/home/xwy/.conda/envs/kvc/lib/python3.12/site-packages/torch")
|
||||
find_library(TORCH_PYTHON_LIBRARY torch_python PATH "${TORCH_INSTALL_PREFIX}/lib")
|
||||
find_package(Torch REQUIRED PATHS "${TORCH_INSTALL_PREFIX}/share/cmake/Torch" NO_DEFAULT_PATH)
|
||||
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party)
|
||||
|
||||
find_package(TBB REQUIRED)
|
||||
find_package(CUDA REQUIRED)
|
||||
|
||||
# find_package(prometheus-cpp CONFIG REQUIRED)
|
||||
if(NOT TARGET prometheus-cpp::pull)
|
||||
message(FATAL_ERROR "prometheus-cpp::pull not found")
|
||||
else()
|
||||
message(STATUS "prometheus Found!")
|
||||
endif()
|
||||
|
||||
if(CUDA_FOUND)
|
||||
message(STATUS "CUDA Found!")
|
||||
message(STATUS "CUDA Version: ${CUDA_VERSION_STRING}")
|
||||
message(STATUS "CUDA Toolkit Root: ${CUDA_TOOLKIT_ROOT_DIR}")
|
||||
else()
|
||||
message(FATAL_ERROR "CUDA not found!")
|
||||
endif()
|
||||
|
||||
add_subdirectory(src)
|
||||
|
||||
if(BUILD_TEST)
|
||||
add_subdirectory(test)
|
||||
endif()
|
||||
|
||||
message(STATUS "BUILD_PYTHON_EXT: ${BUILD_PYTHON_EXT}")
|
||||
|
||||
if(BUILD_PYTHON_EXT)
|
||||
if(NOT TARGET pybind11::pybind11)
|
||||
add_subdirectory(${THIRD_PARTY_DIR}/pybind11 ${THIRD_PARTY_BUILD_DIR}/pybind11)
|
||||
endif()
|
||||
|
||||
pybind11_add_module(kvc2_ext src/bind.cpp)
|
||||
|
||||
# EXAMPLE_VERSION_INFO is defined by setup.py and passed into the C++ code as a
|
||||
# define (VERSION_INFO) here.
|
||||
target_compile_definitions(kvc2_ext PRIVATE VERSION_INFO=${EXAMPLE_VERSION_INFO})
|
||||
message(STATUS "CMAKE_CURRENT_SOURCE_DIR: ${CMAKE_CURRENT_SOURCE_DIR}")
|
||||
target_include_directories(kvc2_ext PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/spdlog/include)
|
||||
|
||||
target_link_libraries(kvc2_ext PUBLIC kvc2 async_store)
|
||||
|
||||
install(TARGETS kvc2_ext LIBRARY
|
||||
DESTINATION ${CMAKE_BINARY_DIR}/output)
|
||||
install(FILES src/kvc2_utils.py
|
||||
DESTINATION ${CMAKE_BINARY_DIR}/output)
|
||||
endif()
|
||||
|
38
csrc/balance_serve/kvc2/README.md
Normal file
38
csrc/balance_serve/kvc2/README.md
Normal file
|
@ -0,0 +1,38 @@
|
|||
# KVC2
|
||||
|
||||
# Build
|
||||
运行以下命令编译kvc2,注意可能需要 sudo 权限安装一些依赖
|
||||
```shell
|
||||
git clone https://github.com/kvcache-ai/kvc2
|
||||
cd kvc2
|
||||
./install_deps.sh
|
||||
mkdir build
|
||||
cd build
|
||||
cmake ..
|
||||
make -j && make install
|
||||
```
|
||||
编译完成后会生成`build/output`,包含`kvc2_ext.cpython-312-x86_64-linux-gnu.so`和`kvc2_utils.py`方便调用。
|
||||
|
||||
<!-- # Test
|
||||
运行以下命令测试kvc2,需要指定一个 disk path 作为测试目录。
|
||||
```shell
|
||||
./unit_test.sh ${DISK_PATH}
|
||||
```
|
||||
或者运行 python 的测试文件
|
||||
```shell
|
||||
python test/pytest_mem_read.py
|
||||
``` -->
|
||||
|
||||
# Troubleshooting
|
||||
在 Python 环境运行时,可以需要在 conda 中安装相关的依赖。
|
||||
```shell
|
||||
conda install -c conda-forge gcc_linux-64 gxx_linux-64
|
||||
```
|
||||
|
||||
也可以尝试设置一下环境变量,然后再运行。
|
||||
```shell
|
||||
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
|
||||
export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libffi.so.7
|
||||
```
|
||||
|
||||
|
42
csrc/balance_serve/kvc2/config/model_configs.json
Normal file
42
csrc/balance_serve/kvc2/config/model_configs.json
Normal file
|
@ -0,0 +1,42 @@
|
|||
{
|
||||
"DeepSeek-Coder-V2-Instruct": {
|
||||
"hidden_size": 5120,
|
||||
"intermediate_size": 12288,
|
||||
"max_position_embeddings": 163840,
|
||||
"model_type": "deepseek_v2",
|
||||
"num_attention_heads": 128,
|
||||
"num_hidden_layers": 60,
|
||||
"num_key_value_heads": 128,
|
||||
"vocab_size": 102400
|
||||
},
|
||||
"LLaMA-2-7B-32K": {
|
||||
"hidden_size": 4096,
|
||||
"intermediate_size": 11008,
|
||||
"max_position_embeddings": 32768,
|
||||
"model_type": "llama",
|
||||
"num_attention_heads": 32,
|
||||
"num_hidden_layers": 32,
|
||||
"num_key_value_heads": 32,
|
||||
"vocab_size": 32000
|
||||
},
|
||||
"Qwen2.5-7B-Instruct": {
|
||||
"hidden_size": 3584,
|
||||
"intermediate_size": 18944,
|
||||
"max_position_embeddings": 32768,
|
||||
"model_type": "qwen2",
|
||||
"num_attention_heads": 28,
|
||||
"num_hidden_layers": 28,
|
||||
"num_key_value_heads": 4,
|
||||
"vocab_size": 152064
|
||||
},
|
||||
"qwen2-72b-instruct": {
|
||||
"hidden_size": 8192,
|
||||
"intermediate_size": 29568,
|
||||
"max_position_embeddings": 32768,
|
||||
"model_type": "qwen2",
|
||||
"num_attention_heads": 64,
|
||||
"num_hidden_layers": 80,
|
||||
"num_key_value_heads": 8,
|
||||
"vocab_size": 152064
|
||||
}
|
||||
}
|
57
csrc/balance_serve/kvc2/config/quant_configs.json
Normal file
57
csrc/balance_serve/kvc2/config/quant_configs.json
Normal file
|
@ -0,0 +1,57 @@
|
|||
{
|
||||
"BF16": {
|
||||
"block_element_count": 1,
|
||||
"block_element_size": 2,
|
||||
"bytes_per_element": 2.0,
|
||||
"can_be_used_as_vector": true,
|
||||
"has_min": false,
|
||||
"has_scale": false,
|
||||
"name": "BF16",
|
||||
"reference": "",
|
||||
"type_of_dot_vector": "BF16"
|
||||
},
|
||||
"FP16": {
|
||||
"block_element_count": 1,
|
||||
"block_element_size": 2,
|
||||
"bytes_per_element": 2.0,
|
||||
"can_be_used_as_vector": true,
|
||||
"has_min": false,
|
||||
"has_scale": false,
|
||||
"name": "FP16",
|
||||
"reference": "",
|
||||
"type_of_dot_vector": "FP16"
|
||||
},
|
||||
"FP32": {
|
||||
"block_element_count": 1,
|
||||
"block_element_size": 4,
|
||||
"bytes_per_element": 4.0,
|
||||
"can_be_used_as_vector": true,
|
||||
"has_min": false,
|
||||
"has_scale": false,
|
||||
"name": "FP32",
|
||||
"reference": "",
|
||||
"type_of_dot_vector": "FP32"
|
||||
},
|
||||
"Q4_0": {
|
||||
"block_element_count": 32,
|
||||
"block_element_size": 18,
|
||||
"bytes_per_element": 0.5625,
|
||||
"can_be_used_as_vector": false,
|
||||
"has_min": false,
|
||||
"has_scale": true,
|
||||
"name": "Q4_0",
|
||||
"reference": "https://huggingface.co/docs/hub/gguf",
|
||||
"type_of_dot_vector": "Q8_0"
|
||||
},
|
||||
"Q8_0": {
|
||||
"block_element_count": 32,
|
||||
"block_element_size": 34,
|
||||
"bytes_per_element": 1.0625,
|
||||
"can_be_used_as_vector": true,
|
||||
"has_min": false,
|
||||
"has_scale": true,
|
||||
"name": "Q8_0",
|
||||
"reference": "https://huggingface.co/docs/hub/gguf",
|
||||
"type_of_dot_vector": "Q8_0"
|
||||
}
|
||||
}
|
2
csrc/balance_serve/kvc2/export_envs_before_run.sh
Executable file
2
csrc/balance_serve/kvc2/export_envs_before_run.sh
Executable file
|
@ -0,0 +1,2 @@
|
|||
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
|
||||
export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libffi.so.7
|
15
csrc/balance_serve/kvc2/install_deps.sh
Executable file
15
csrc/balance_serve/kvc2/install_deps.sh
Executable file
|
@ -0,0 +1,15 @@
|
|||
#!/bin/bash
|
||||
|
||||
cd "${0%/*}"
|
||||
git submodule update --init --recursive
|
||||
|
||||
sudo apt update
|
||||
sudo apt install libtbb-dev
|
||||
sudo apt install libcurl4-openssl-dev
|
||||
sudo apt install libaio-dev
|
||||
|
||||
cd third_party/xxHash/
|
||||
make -j
|
||||
sudo make install
|
||||
cd ../..
|
||||
|
4
csrc/balance_serve/kvc2/mkfs.sh
Executable file
4
csrc/balance_serve/kvc2/mkfs.sh
Executable file
|
@ -0,0 +1,4 @@
|
|||
sudo umount /mnt/xwy
|
||||
sudo mkfs.xfs /dev/nvme0n1 -f
|
||||
sudo mount /dev/nvme0n1 /mnt/xwy
|
||||
sudo chown -R xwy /mnt/xwy/
|
45
csrc/balance_serve/kvc2/src/CMakeLists.txt
Normal file
45
csrc/balance_serve/kvc2/src/CMakeLists.txt
Normal file
|
@ -0,0 +1,45 @@
|
|||
include_directories(${THIRD_PARTY_DIR}/asyncio/include)
|
||||
|
||||
add_library(kvc2_metrics STATIC metrics.cpp)
|
||||
target_link_libraries(kvc2_metrics PUBLIC prometheus-cpp::pull)
|
||||
|
||||
add_library(page_aligned_memory_pool page_aligned_memory_pool.cpp)
|
||||
target_include_directories(page_aligned_memory_pool PRIVATE ${THIRD_PARTY_DIR}/spdlog/include)
|
||||
|
||||
function(add_third_party_includes TARGET_NAME)
|
||||
target_include_directories(${TARGET_NAME} PRIVATE
|
||||
${THIRD_PARTY_BUILD_DIR}/prometheus-cpp/core/include
|
||||
${THIRD_PARTY_BUILD_DIR}/prometheus-cpp/pull/include
|
||||
${THIRD_PARTY_DIR}/prometheus-cpp/core/include
|
||||
${THIRD_PARTY_DIR}/prometheus-cpp/pull/include
|
||||
${THIRD_PARTY_DIR}/spdlog/include
|
||||
)
|
||||
endfunction()
|
||||
|
||||
|
||||
add_library(cache_entry cache_entry.cpp)
|
||||
add_third_party_includes(cache_entry)
|
||||
target_link_libraries(cache_entry PUBLIC gpu_cache)
|
||||
|
||||
add_library(gpu_cache gpu_cache.cpp)
|
||||
add_third_party_includes(gpu_cache)
|
||||
target_link_libraries(gpu_cache PUBLIC xxHash::xxhash ${TORCH_LIBRARIES} cuda_stream_manager)
|
||||
|
||||
add_library(kvc2 prefix.cpp)
|
||||
target_include_directories(kvc2 PRIVATE ${THIRD_PARTY_DIR}/nlohmann/single_include)
|
||||
add_third_party_includes(kvc2)
|
||||
target_link_libraries(kvc2 PUBLIC TBB::tbb xxHash::xxhash cache_entry cuda_stream_manager page_aligned_memory_pool ${TORCH_LIBRARIES} prometheus-cpp::pull kvc2_metrics)
|
||||
|
||||
message(STATUS "CMAKE_SOURCE_DIR: " ${CMAKE_SOURCE_DIR})
|
||||
add_library(async_store async_store.cpp)
|
||||
target_include_directories(async_store PRIVATE ${THIRD_PARTY_DIR}/nlohmann/single_include)
|
||||
target_include_directories(async_store PRIVATE ${THIRD_PARTY_DIR}/spdlog/include)
|
||||
target_link_libraries(async_store PUBLIC pthread)
|
||||
|
||||
|
||||
|
||||
add_library(cuda_stream_manager cuda_stream_manager.cpp)
|
||||
target_include_directories(cuda_stream_manager PUBLIC ${THIRD_PARTY_DIR}/nlohmann/single_include)
|
||||
target_include_directories(cuda_stream_manager PUBLIC ${THIRD_PARTY_DIR}/spdlog/include)
|
||||
target_include_directories(cuda_stream_manager PUBLIC ${CUDAToolkit_INCLUDE_DIRS})
|
||||
target_link_libraries(cuda_stream_manager PUBLIC CUDA::cudart)
|
137
csrc/balance_serve/kvc2/src/async_store.cpp
Normal file
137
csrc/balance_serve/kvc2/src/async_store.cpp
Normal file
|
@ -0,0 +1,137 @@
|
|||
|
||||
#include <fcntl.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <filesystem>
|
||||
#include <future>
|
||||
#include <iostream>
|
||||
#include <nlohmann/json.hpp>
|
||||
#include <optional>
|
||||
#include <queue>
|
||||
#include <thread>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "utils/lock_free_queue.hpp"
|
||||
|
||||
#include "async_store.hh"
|
||||
|
||||
namespace async_store {
|
||||
|
||||
struct ArrayStore {
|
||||
static const size_t DeviceBlockSize = 512;
|
||||
|
||||
const size_t element_size;
|
||||
const size_t element_size_aligned;
|
||||
|
||||
size_t size;
|
||||
|
||||
size_t size_in_bytes() { return size * element_size_aligned; }
|
||||
|
||||
std::filesystem::path data_path;
|
||||
|
||||
void extend(size_t to) {
|
||||
if (to <= size) {
|
||||
return;
|
||||
}
|
||||
// TODO: extend file
|
||||
size = to;
|
||||
// LOG_INFO("Extend file to `, size `", to, size_in_bytes());
|
||||
}
|
||||
|
||||
ArrayStore(size_t element_size, size_t size, std::filesystem::path data_path)
|
||||
: element_size(element_size),
|
||||
element_size_aligned((element_size + DeviceBlockSize - 1) / DeviceBlockSize),
|
||||
data_path(data_path) {
|
||||
// TODO: prefix cache
|
||||
}
|
||||
|
||||
void read(size_t index, void* buffer) {
|
||||
// TODO: read from file
|
||||
}
|
||||
void write(size_t index, void* buffer) {
|
||||
// TODO: write to file
|
||||
}
|
||||
};
|
||||
|
||||
ArrayStore* create_or_open_store(size_t element_size, size_t size, std::filesystem::path data_path) {
|
||||
return new ArrayStore(element_size, size, data_path);
|
||||
}
|
||||
|
||||
void close_store(ArrayStore* store) {
|
||||
delete store;
|
||||
}
|
||||
|
||||
size_t capacity(ArrayStore* store) {
|
||||
return store->size;
|
||||
}
|
||||
|
||||
void extend(ArrayStore* store, size_t to) {
|
||||
store->extend(to);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
struct ArrayStoreT {
|
||||
ArrayStore store;
|
||||
ArrayStoreT(size_t element_count, std::filesystem::path data_path) : store(sizeof(T), element_count, data_path) {}
|
||||
|
||||
void read(size_t index, void* output) { store.read(index, output); }
|
||||
|
||||
void write(size_t index, T& value) { store.write(index, &value); }
|
||||
void write(size_t index, void* value) { store.write(index, value); }
|
||||
};
|
||||
|
||||
std::string request_to_string(IORequest* req) {
|
||||
return fmt::format("IOReqeust {} {} to {}[{}]", req->write ? "Write" : "Read ", req->data,
|
||||
req->store->data_path.c_str(), req->index);
|
||||
}
|
||||
|
||||
struct IODealerImpl {
|
||||
MPSCQueue<IORequest> ioQueue;
|
||||
uint64_t io_cnt = 0;
|
||||
size_t io_amount = 0;
|
||||
bool use_io_uring;
|
||||
int IO_DEPTH;
|
||||
|
||||
bool stop = false;
|
||||
IODealerImpl(bool use_io_uring, int IO_DEPTH) : use_io_uring(use_io_uring), IO_DEPTH(IO_DEPTH) {}
|
||||
|
||||
void queue_consumer() {
|
||||
// TODO:
|
||||
}
|
||||
|
||||
void io_perf() {
|
||||
// TODO:
|
||||
}
|
||||
|
||||
void io_dealer() {
|
||||
// TODO:
|
||||
}
|
||||
};
|
||||
|
||||
IODealer::IODealer(bool use_io_uring, int IO_DEPTH) {
|
||||
io_impl = new IODealerImpl(use_io_uring, IO_DEPTH);
|
||||
}
|
||||
|
||||
IODealer::~IODealer() {
|
||||
stop();
|
||||
delete io_impl;
|
||||
}
|
||||
|
||||
void IODealer::enqueue(std::shared_ptr<IORequest> req) {
|
||||
io_impl->ioQueue.enqueue(req);
|
||||
}
|
||||
|
||||
std::thread IODealer::start_io_thread() {
|
||||
return std::thread([this]() { io_impl->io_dealer(); });
|
||||
}
|
||||
void IODealer::stop() {
|
||||
if (io_impl->stop) {
|
||||
return;
|
||||
}
|
||||
// LOG_INFO("Stopping IO Dealer");
|
||||
io_impl->stop = true;
|
||||
}
|
||||
|
||||
} // namespace async_store
|
51
csrc/balance_serve/kvc2/src/async_store.hh
Normal file
51
csrc/balance_serve/kvc2/src/async_store.hh
Normal file
|
@ -0,0 +1,51 @@
|
|||
#pragma once
|
||||
#include <cstddef>
|
||||
#include <filesystem>
|
||||
|
||||
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
|
||||
#define FMT_HEADER_ONLY
|
||||
#include "spdlog/spdlog.h"
|
||||
|
||||
#include "io_helper.hpp"
|
||||
|
||||
namespace async_store {
|
||||
|
||||
struct ArrayStore;
|
||||
|
||||
ArrayStore* create_or_open_store(size_t element_size, size_t size, std::filesystem::path data_path);
|
||||
void close_store(ArrayStore* store);
|
||||
size_t capacity(ArrayStore* store);
|
||||
void extend(ArrayStore* store, size_t to);
|
||||
|
||||
|
||||
|
||||
struct IORequest {
|
||||
ArrayStore* store;
|
||||
bool write;
|
||||
void* data;
|
||||
size_t index;
|
||||
|
||||
// for sync
|
||||
bool need_promise = false;
|
||||
BatchPromise* promise;
|
||||
};
|
||||
|
||||
std::string request_to_string(IORequest* req);
|
||||
|
||||
struct IODealerImpl;
|
||||
struct IODealer {
|
||||
IODealerImpl* io_impl;
|
||||
|
||||
IODealer(bool use_io_uring = false, int IO_DEPTH = 128);
|
||||
~IODealer();
|
||||
IODealer(const IODealer&) = delete;
|
||||
IODealer& operator=(const IODealer&) = delete;
|
||||
IODealer(IODealer&&) = default;
|
||||
IODealer& operator=(IODealer&&) = default;
|
||||
|
||||
void enqueue(std::shared_ptr<IORequest> req);
|
||||
std::thread start_io_thread();
|
||||
void stop();
|
||||
};
|
||||
|
||||
} // namespace async_store
|
53
csrc/balance_serve/kvc2/src/bind.cpp
Normal file
53
csrc/balance_serve/kvc2/src/bind.cpp
Normal file
|
@ -0,0 +1,53 @@
|
|||
// #include <pybind11/functional.h>
|
||||
// #include <pybind11/pybind11.h>
|
||||
// #include <pybind11/stl.h>
|
||||
// #include <memory>
|
||||
// #include <thread>
|
||||
// #include <vector>
|
||||
// #include "kvc2.h"
|
||||
// #define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
|
||||
// #define FMT_HEADER_ONLY
|
||||
// #include "spdlog/spdlog.h"
|
||||
// #include "utils/arithmetic.hpp"
|
||||
|
||||
// namespace py = pybind11;
|
||||
|
||||
// PYBIND11_MODULE(kvc2_ext, m) {
|
||||
// // Bind KVC2Config struct
|
||||
// py::class_<kvc2::KVC2Config>(m, "KVC2Config")
|
||||
// .def(py::init<>())
|
||||
// .def_readwrite("path", &kvc2::KVC2Config::path)
|
||||
// .def_readwrite("block_length", &kvc2::KVC2Config::num_token_per_page)
|
||||
// .def_readwrite("memory_pool_size", &kvc2::KVC2Config::memory_pool_size)
|
||||
// .def_readwrite("evict_count", &kvc2::KVC2Config::evict_count);
|
||||
|
||||
// // Bind CacheInfo struct
|
||||
// py::class_<kvc2::CacheInfo>(m, "CacheInfo")
|
||||
// .def(py::init<>())
|
||||
// .def_readwrite("model_name", &kvc2::CacheInfo::model_name)
|
||||
// .def_readwrite("is_key_cache", &kvc2::CacheInfo::is_key_cache)
|
||||
// .def_readwrite("quant_type", &kvc2::CacheInfo::quant_type)
|
||||
// .def("hidden_layer_count", &kvc2::CacheInfo::hidden_layer_count)
|
||||
// .def("path", &kvc2::CacheInfo::path, py::arg("which_layer") = std::nullopt)
|
||||
// .def("__eq__", &kvc2::CacheInfo::operator==)
|
||||
// .def("element_size", &kvc2::CacheInfo::element_size)
|
||||
// .def("hash_value", &kvc2::CacheInfo::hash_value);
|
||||
|
||||
// // Bind KVC2HandleInterface class
|
||||
// py::class_<kvc2::KVC2HandleInterface, std::shared_ptr<kvc2::KVC2HandleInterface>>(m, "KVC2HandleInterface")
|
||||
// .def("matched_length", &kvc2::SingleCacheHandleInterface::matched_length)
|
||||
// .def("handle_data", &kvc2::KVC2HandleInterface::handle_data);
|
||||
|
||||
// // Bind KVC2Interface class
|
||||
// py::class_<kvc2::KVC2Interface, std::shared_ptr<kvc2::KVC2Interface>>(m, "KVC2Interface")
|
||||
// .def("start_io_thread", [](kvc2::KVC2Interface& self) { self.start_io_thread(); })
|
||||
// .def("stop_io_thread", &kvc2::KVC2Interface::stop_io_thread)
|
||||
// .def("load", &kvc2::KVC2Interface::load)
|
||||
// .def("save", &kvc2::KVC2Interface::save)
|
||||
// .def("raw_insert", &kvc2::KVC2Interface::raw_insert)
|
||||
// .def("raw_read", &kvc2::KVC2Interface::raw_read)
|
||||
// .def("lookup", &kvc2::KVC2Interface::lookup);
|
||||
|
||||
// // Bind create_kvc2 function
|
||||
// m.def("create_kvc2", &kvc2::create_kvc2, py::arg("config"));
|
||||
// }
|
263
csrc/balance_serve/kvc2/src/cache_entry.cpp
Normal file
263
csrc/balance_serve/kvc2/src/cache_entry.cpp
Normal file
|
@ -0,0 +1,263 @@
|
|||
#include "cache_entry.hh"
|
||||
#include <mutex>
|
||||
|
||||
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
|
||||
#define FMT_HEADER_ONLY
|
||||
#include "spdlog/spdlog.h"
|
||||
|
||||
#include "gpu_cache.hh"
|
||||
|
||||
namespace kvc2 {
|
||||
|
||||
bool ConcurrentControlUnit::can_desert() {
|
||||
if (ref_count.load() == 0 && dirty.load() == false) {
|
||||
tc.reset();
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
// Log this unit's refcount/dirty/transfer state at debug level.
void ConcurrentControlUnit::debug() {
  SPDLOG_DEBUG("ref count {}, dirty {}, {}", ref_count.load(), dirty.load(), tc.debug());
}
|
||||
|
||||
// Destructor safety net: blocks are expected to be freed explicitly, so if
// the CPU buffer is still live here we warn and return it to the pool.
CacheBlockEntry::~CacheBlockEntry() {
  if (data != nullptr && manager && manager->pool) {
    SPDLOG_WARN("Free {} when destruct", data);
    free_on_cpu();
  }
}
|
||||
|
||||
bool CacheBlockEntry::alloc_on_cpu() {
|
||||
assert(data == nullptr);
|
||||
data = manager->pool->alloc(size);
|
||||
if (data == nullptr) {
|
||||
manager->evict_for_cpu_cache();
|
||||
data = manager->pool->alloc(size);
|
||||
if (data == nullptr) {
|
||||
SPDLOG_ERROR("Not enough memory for Block Cache");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Return the CPU buffer to the pool and mark this entry as not resident.
// Caller must hold `lock` and ensure no references remain.
void CacheBlockEntry::free_on_cpu() {
  manager->pool->free(data, size);
  data = nullptr;
}
|
||||
|
||||
bool CacheBlockEntry::alloc_on_cpu_no_lock() {
|
||||
if (data == nullptr) {
|
||||
if (alloc_on_cpu() == false) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool CacheBlockEntry::inc_ref_or_alloc_on_cpu() {
|
||||
std::lock_guard<CacheBlockEntry::MutexT> lg(lock);
|
||||
if (data == nullptr) {
|
||||
if (alloc_on_cpu()) {
|
||||
cpu_cc.ref_count.fetch_add(1);
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
cpu_cc.ref_count.fetch_add(1);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Non-blocking lock attempt; caller must check owns_lock() on the result.
std::unique_lock<CacheBlockEntry::MutexT> CacheBlockEntry::try_lock() {
  return std::unique_lock<CacheBlockEntry::MutexT>(lock, std::try_to_lock);
}

// Blocking lock; relies on C++17 guaranteed copy elision to return the
// non-movable lock_guard by value.
std::lock_guard<CacheBlockEntry::MutexT> CacheBlockEntry::lock_guard() {
  return std::lock_guard<CacheBlockEntry::MutexT>(lock);
}
|
||||
|
||||
// Log the full entry state: disk location, key, CPU/GPU residency,
// refcounts and transfer-control state.
void CacheBlockEntry::debug() {
  SPDLOG_DEBUG(
      "CacheBlockEntry: disk[{:4},{:7}], with key {}, hash {:016x}, data: {}, ref_count: {}, size: {}, cpu tc: {}, "
      "in page cache: {}, gpu ref count:{}, gpu tc: {}",
      layer, idx, with_key, hash, data, cpu_cc.ref_count.load(), size, cpu_cc.tc.debug(), manager != nullptr,
      gpu_cc.ref_count.load(), gpu_cc.tc.debug());
}
|
||||
|
||||
// Scope-exit helper: collects raw entry pointers and runs `exit_fn` on each
// of them when the collector is destroyed.
CacheBlockEntryCollector::CacheBlockEntryCollector(std::function<void(CacheBlockEntry*)> exit_fn) : exit_fn(exit_fn) {}

CacheBlockEntryCollector::~CacheBlockEntryCollector() {
  // SPDLOG_DEBUG("Collector Destruct");
  for (auto& e : entries) {
    exit_fn(e);
  }
}
|
||||
|
||||
// Enqueue one disk read/write of this block against `store` at
// (layer, index). Completion is reported through io_helper's BatchPromise.
// For IO_Read the transfer-control state decides whether the read is
// actually needed; the Force* variants skip that check. Caller is expected
// to hold this entry's lock (see declaration: "will not get lock").
void CacheBlockEntry::io_with(async_store::IODealer* dealer, IO_Helper<CacheBlockEntry>& io_helper,
                              async_store::ArrayStore* store, size_t layer, size_t index, IOOption option) {
  bool write;

  auto& batch_promise = io_helper.batch_promise;

  switch (option) {
    case IO_Read: {
      write = false;
      if (io_helper.absorb_tc(this, cpu_cc.tc)) {
        // need read
      } else {
        // Transfer already done or claimed by another party; nothing to do.
        return;
      }
      break;
    }
    case IO_ForceRead: {
      // Not change
      write = false;
      break;
    }
    case IO_ForceWrite: {
      // Not change
      write = true;
      break;
    }
    case IO_Write: {
      write = true;
      break;
    }
    default: {
      // NOTE(review): in NDEBUG builds this path leaves `write`
      // uninitialized and falls through — consider failing hard here.
      assert(0);
    }
  }
  io_helper.new_task();
  this->layer = layer;
  this->idx = index;

  auto req = std::make_shared<async_store::IORequest>();
  req->store = store;
  req->data = data;
  req->index = index;
  req->write = write;
  req->need_promise = true;
  req->promise = &batch_promise;

  SPDLOG_TRACE("Submitting {}", async_store::request_to_string(req.get()));
  dealer->enqueue(std::move(req));
}
|
||||
|
||||
CacheEntryManager::CacheEntryManager(CacheEntryManagerConfig config) : config(config) {}

// Free the CPU buffers of cold blocks so the pool can satisfy a new
// allocation. A block is reclaimable only when both its CPU and GPU control
// units are unreferenced and clean. Sweeps the whole LRU list (the stop
// condition never fires).
void CacheEntryManager::evict_for_cpu_cache() {
  size_t count = 0;
  evict(
      [&count](const BlockPtr& block) {
        // here we assume each with gpu must resides on cpu
        if (block->data != nullptr && block->cpu_cc.can_desert() &&
            block->gpu_cc.can_desert() /*For now If A Cache Entry Block is on GPU, it must on cpu. */) {
          block->free_on_cpu();
          count += 1;
          return true;
        } else {
          return false;
        }
      },
      [&count, this]() {
        // Never stop early: sweep the entire list.
        return false;
        // return count == this->config.evict_count;
      });
}
|
||||
|
||||
// Register a keyed entry as most-recently-used.
// Preconditions: the entry carries a key and that key is not yet tracked.
void CacheEntryManager::insert(BlockPtr entry) {
  assert(entry->with_key);
  const auto key = entry->hash;
  assert(key_entry_map.count(key) == 0);
  usage_list.push_front(std::move(entry));
  key_entry_map[key] = usage_list.begin();
}
|
||||
|
||||
// Move the entry for `key` (which must exist; throws std::out_of_range
// otherwise) to the most-recently-used position and return it.
// Uses list::splice instead of erase+push_front: the node is relinked in
// place — O(1), no allocation, and no other iterators are invalidated.
CacheEntryManager::BlockPtr CacheEntryManager::access(const Key& key) {
  auto it = key_entry_map.at(key);
  usage_list.splice(usage_list.begin(), usage_list, it);
  // `it` remains valid after splice and now sits at the front.
  key_entry_map[key] = usage_list.begin();
  return usage_list.front();
}
|
||||
|
||||
// void CacheEntryManager::remove(const Key& key) {
|
||||
// auto it = key_entry_map[key];
|
||||
// usage_list.erase(it);
|
||||
// key_entry_map.erase(key);
|
||||
// }
|
||||
|
||||
// Walk the LRU list from least- to most-recently-used, removing every entry
// that `filter` accepts (the filter may also free the entry's resources and
// must return true to drop it from the manager). Stops early once
// `stop_condition` returns true. Entries whose lock cannot be taken without
// blocking are skipped. Holds the manager lock for the whole scan.
void CacheEntryManager::evict(std::function<bool(const BlockPtr&)> filter, std::function<bool()> stop_condition) {
  auto evict_count = 0;
  auto inspect_count = 0;

  std::lock_guard<std::mutex> lg(lock);
  for (auto it = usage_list.rbegin(); it != usage_list.rend();) {
    inspect_count += 1;
    // SPDLOG_DEBUG("Map Size {}, List Size {}, Evicted {} blocks, Inspected {}, {}", key_entry_map.size(),
    // usage_list.size(), evict_count, inspect_count, pool->debug());
    // (*it)->debug();
    if (stop_condition())
      break;
    auto entry_ul = (*it)->try_lock();
    if (entry_ul.owns_lock() == false) {
      ++it;  // Ensure iterator advances when locking fails
      continue;
    }
    if (filter(*it)) {
      // SPDLOG_DEBUG("Evicting {}", fmt::ptr(it->get()));
      evict_count++;
      if ((*it)->with_key)
        key_entry_map.erase((*it)->hash);
      it = decltype(it)(usage_list.erase(std::next(it).base()));  // Use base() to adjust for reverse iterator
    } else {
      ++it;  // Ensure iterator advances when filter fails
    }
  }

  if (evict_count > 0) {
    SPDLOG_DEBUG("Map Size {}, List Size {}, Evicted {} blocks, Inspected {}, {}", key_entry_map.size(),
                 usage_list.size(), evict_count, inspect_count, pool->debug());
  }
}
|
||||
|
||||
// Return the block for `key`, creating a fresh (keyed or anonymous) entry
// when absent. `is_new` reports whether the caller must populate the block.
// Never returns nullptr; allocation of the block's buffer happens later.
CacheEntryManager::BlockPtr CacheEntryManager::get(bool& is_new, size_t size, std::optional<Key> key) {
  std::unique_lock<std::mutex> ul(lock);
  if (key.has_value() && key_entry_map.count(key.value()) > 0) {
    is_new = false;
    return access(key.value());
  }

  // Miss (or anonymous request): build a fresh entry.
  auto entry = std::make_shared<CacheBlockEntry>();
  entry->size = size;
  entry->manager = this;
  entry->with_key = key.has_value();
  if (key.has_value()) {
    entry->hash = key.value();
    insert(entry);  // only keyed entries are tracked in the LRU structures
  }
  is_new = true;
  return entry;
}
|
||||
|
||||
void CacheEntryManager::debug() {
|
||||
fmt::print("Cache Manager: {} entries\n", key_entry_map.size());
|
||||
pool->debug();
|
||||
fmt::print("Layer 0 Entries in Order\n", key_entry_map.size());
|
||||
for (auto& it : usage_list) {
|
||||
if (it->layer == 0)
|
||||
it->debug();
|
||||
}
|
||||
}
|
||||
|
||||
}; // namespace kvc2
|
182
csrc/balance_serve/kvc2/src/cache_entry.hh
Normal file
182
csrc/balance_serve/kvc2/src/cache_entry.hh
Normal file
|
@ -0,0 +1,182 @@
|
|||
#ifndef __CACHE_ENTRY_HH_
#define __CACHE_ENTRY_HH_
#include "async_store.hh"
#include "cuda_stream_manager.hh"
#include "defs.h"
#include "hasher.hpp"
#include "io_helper.hpp"
#include "page_aligned_memory_pool.h"
#include "utils/periodic_task.hpp"

#include <atomic>
#include <list>
#include <memory>
#include "utils/mutex_extend.hpp"

namespace kvc2 {
// Cache blocks are keyed by the hash of the token prefix they cover.
using CacheBlockKey = TokensHash;

class CacheEntryManager;
struct DoubleVerticalBlocksHandle;
class GPUPageCache;

// Reference counting + dirty tracking + transfer state for one residency
// domain (CPU or GPU) of a cache block.
struct ConcurrentControlUnit {
  std::atomic_size_t ref_count = 0;  // live users in this domain
  std::atomic_bool dirty = false;    // unflushed modifications present
  TransferControl<std::mutex> tc;    // coordinates in-flight transfers

  // True (and resets tc) when unreferenced and clean, i.e. reclaimable.
  bool can_desert();
  void debug();
};
|
||||
|
||||
// Disk IO modes for CacheBlockEntry::io_with. The Force* variants bypass
// the transfer-control check and always issue the IO.
enum IOOption {
  IO_ForceRead,
  IO_ForceWrite,
  IO_Read,
  IO_Write,
};

// Human-readable name of an IOOption ("Unknown" for out-of-range values).
inline std::string to_string(IOOption op) {
  if (op == IO_ForceRead)
    return "IO_ForceRead";
  if (op == IO_ForceWrite)
    return "IO_ForceWrite";
  if (op == IO_Read)
    return "IO_Read";
  if (op == IO_Write)
    return "IO_Write";
  return "Unknown";
}
|
||||
|
||||
// One fixed-size cache block. It may simultaneously have a disk location
// (layer/idx), a CPU buffer (data) and a GPU page (gpu_block_idx); each of
// the CPU and GPU residencies has its own ConcurrentControlUnit.
struct CacheBlockEntry {
  friend CacheEntryManager;
  using MutexT = non_recursive_mutex;
  // using MutexT = std::mutex;
  MutexT lock;  // protects the non-atomic fields below

  // for cache
  bool with_key = true;          // false for anonymous (untracked) blocks
  CacheBlockKey hash = 0;        // key identifying this block in the manager
  CacheBlockKey hash_check = 0;  // secondary hash (presumably a collision check — confirm)

  CacheInfo cache_info;
  CacheEntryManager* manager = nullptr;  // owning manager, if any

  // for memory pool
  void* data = nullptr;  // CPU buffer from manager->pool; null when not resident
  size_t size = 0;       // buffer size in bytes

  ConcurrentControlUnit cpu_cc;

  // for disk
  size_t layer = -1;
  size_t idx = -1;

  // for gpu

  std::optional<size_t> gpu_block_idx = std::nullopt;  // column in the GPU page cache
  ConcurrentControlUnit gpu_cc;

  // Pinned in place: entries are shared via pointers, so copies/moves are
  // forbidden.
  CacheBlockEntry() =default;
  CacheBlockEntry(const CacheBlockEntry& other) = delete;
  CacheBlockEntry& operator=(const CacheBlockEntry& other) = delete;
  CacheBlockEntry(CacheBlockEntry&& other) = delete;
  CacheBlockEntry& operator=(CacheBlockEntry&& other) = delete;
  ~CacheBlockEntry();

 private:
  bool alloc_on_cpu();


 public:
  void free_on_cpu();
  bool alloc_on_cpu_no_lock();

  bool inc_ref_or_alloc_on_cpu();
  void set_key(TokensHash key, std::shared_ptr<CacheBlockEntry> me);

  std::unique_lock<MutexT> try_lock();
  std::lock_guard<MutexT> lock_guard();

  // will not get lock
  void io_with(async_store::IODealer* dealer, IO_Helper<CacheBlockEntry>& io_helper, async_store::ArrayStore* store,
               size_t layer, size_t index, IOOption option);
  void flush_back_async(IO_Helper<CacheBlockEntry>& helper, std::vector<std::atomic_bool*>& dirty_flags);

  void debug();
};

// Scope-exit helper: runs exit_fn on every collected entry at destruction.
struct CacheBlockEntryCollector{

  std::vector<CacheBlockEntry*> entries;
  std::function<void(CacheBlockEntry*)> exit_fn;

  CacheBlockEntryCollector(std::function<void(CacheBlockEntry*)> exit_fn);
  ~CacheBlockEntryCollector();

  CacheBlockEntryCollector(const CacheBlockEntryCollector& other) = delete;
  CacheBlockEntryCollector(CacheBlockEntryCollector&& other) = delete;
  CacheBlockEntryCollector& operator=(const CacheBlockEntryCollector& other) = delete;
  CacheBlockEntryCollector& operator=(CacheBlockEntryCollector&& other) = delete;



};


struct KVC2;
struct CacheEntryManagerConfig {
  size_t evict_count = 100;   // batch size hint for eviction (currently unused in evict_for_cpu_cache)
  KVC2* kvc2_top = nullptr;   // back-pointer to the owning KVC2 instance
};

// LRU cache of CacheBlockEntry objects: a usage list ordered most- to
// least-recently-used plus a key -> list-node index.
class CacheEntryManager {
 public:
  using Key = CacheBlockKey;
  using BlockPtr = std::shared_ptr<CacheBlockEntry>;

 private:
  friend CacheBlockEntry;

  CacheEntryManagerConfig config;

  std::mutex lock;  // guards usage_list and key_entry_map
  std::list<BlockPtr> usage_list;
  std::unordered_map<Key, std::list<BlockPtr>::iterator> key_entry_map;

  // Register a keyed entry as most-recently-used (key must be absent).
  void insert(BlockPtr entry);
  // Promote an existing key to most-recently-used and return its entry.
  BlockPtr access(const Key& key);

  // void remove(const Key& key);
  // Sweep from LRU end, dropping entries accepted by `filter` until
  // `stop_condition` fires.
  void evict(std::function<bool(const BlockPtr&)> filter, std::function<bool()> stop_condition);


 public:
  std::unique_ptr<periodic::PeriodicTask> background_flush_back=nullptr;
  std::shared_ptr<PageAlignedMemoryPool> pool;   // CPU memory pool backing the blocks
  std::shared_ptr<GPUPageCache> gpu_cache;

  CacheEntryManager(CacheEntryManagerConfig config);

  // disable all move and copy
  CacheEntryManager(const CacheEntryManager& other) = delete;
  CacheEntryManager& operator=(const CacheEntryManager& other) = delete;
  CacheEntryManager(CacheEntryManager&& other) = delete;
  CacheEntryManager& operator=(CacheEntryManager&& other) = delete;

  void cpu_background_flush();

  // Free cold CPU buffers so the pool can satisfy a new allocation.
  void evict_for_cpu_cache();

  // just get block pointers, not allocate them, will not return nullptr
  BlockPtr get(bool& is_new,size_t size, std::optional<Key> key = std::nullopt);

  void debug();
};
|
||||
|
||||
} // namespace kvc2
|
||||
|
||||
#endif
|
0
csrc/balance_serve/kvc2/src/common.h
Normal file
0
csrc/balance_serve/kvc2/src/common.h
Normal file
135
csrc/balance_serve/kvc2/src/cuda_stream_manager.cpp
Normal file
135
csrc/balance_serve/kvc2/src/cuda_stream_manager.cpp
Normal file
|
@ -0,0 +1,135 @@
|
|||
#include "cuda_stream_manager.hh"
|
||||
#include <cuda_runtime.h>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <stdexcept>
|
||||
#include <vector>
|
||||
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO
|
||||
// #define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
|
||||
#define FMT_HEADER_ONLY
|
||||
#include "spdlog/spdlog.h"
|
||||
|
||||
// Create `num_streams_per_device` CUDA streams and one worker thread per
// requested device. Throws std::runtime_error when a device cannot be
// selected or a stream cannot be created.
CudaStreamManager::CudaStreamManager(const std::vector<size_t>& device_ids, int num_streams_per_device) {
  for (int device_id : device_ids) {
    auto x = std::unique_ptr<DeviceInfo>(new DeviceInfo);
    DeviceInfo& device_info = *x;
    device_info.device_id = device_id;
    device_info.next_stream_index = 0;
    device_info.stop_flag = false;

    // Select the device before creating its streams.
    cudaError_t err = cudaSetDevice(device_id);
    if (err != cudaSuccess) {
      SPDLOG_WARN("cudaSetDevice failed on device {}: {}", device_id, cudaGetErrorString(err));
      throw std::runtime_error("cudaSetDevice failed");
    }

    // Create this device's CUDA streams.
    device_info.streams.resize(num_streams_per_device);
    for (int i = 0; i < num_streams_per_device; ++i) {
      err = cudaStreamCreate(&device_info.streams[i]);
      if (err != cudaSuccess) {
        SPDLOG_WARN("Failed to create CUDA stream on device {}: {}", device_id, cudaGetErrorString(err));
        throw std::runtime_error("Failed to create CUDA stream");
      }
    }

    // Start the per-device worker thread that drains the request queue.
    device_info.worker_thread = std::thread(&CudaStreamManager::deviceWorker, this, std::ref(device_info));

    devices_.push_back(std::move(x));
  }
}
|
||||
|
||||
// Shut down: set every worker's stop flag, enqueue a sentinel exit request
// so a blocked dequeue wakes up, join the threads, then destroy the streams.
CudaStreamManager::~CudaStreamManager() {
  // Tell all device worker threads to stop.
  for (auto& device_info : devices_) {
    device_info->stop_flag.store(true);
    auto request = std::shared_ptr<Request>(new Request);
    request->should_exit = true;
    device_info->request_queue.enqueue(std::move(request));
  }

  // Wait for every worker thread to finish.
  for (auto& device_info : devices_) {
    if (device_info->worker_thread.joinable()) {
      device_info->worker_thread.join();
    }

    // Destroy this device's CUDA streams.
    cudaSetDevice(device_info->device_id);
    for (auto& stream : device_info->streams) {
      cudaStreamDestroy(stream);
    }
  }
}
|
||||
|
||||
void CudaStreamManager::submitRequest(std::shared_ptr<Request> request) {
|
||||
// 找到对应的设备
|
||||
for (auto& device_info : devices_) {
|
||||
if (device_info->device_id == request->device_id) {
|
||||
device_info->request_queue.enqueue(request);
|
||||
return;
|
||||
}
|
||||
}
|
||||
throw std::runtime_error("Invalid device ID in request");
|
||||
}
|
||||
|
||||
// Per-device worker loop: bind the thread to its device, then drain the
// request queue, issuing each batch as async memcpys on a round-robin
// stream followed by a host-function callback enqueued on the same stream.
void CudaStreamManager::deviceWorker(DeviceInfo& device_info) {
  // Bind this thread to the device.
  cudaError_t err = cudaSetDevice(device_info.device_id);
  if (err != cudaSuccess) {
    SPDLOG_WARN("cudaSetDevice failed in worker thread for device {}: {}", device_info.device_id,
                cudaGetErrorString(err));
    return;
  }

  while (device_info.stop_flag.load() == false) {
    auto request = device_info.request_queue.dequeue();
    if (request->should_exit) {
      return;
    }
    // Handle the request on the next stream (round robin).
    SPDLOG_DEBUG("Getting request on device {}, count {}", device_info.device_id, request->host_mem_addresses.size());
    int stream_index = device_info.next_stream_index;
    cudaStream_t stream = device_info.streams[stream_index];
    device_info.next_stream_index = (device_info.next_stream_index + 1) % device_info.streams.size();

    size_t num_transfers = request->host_mem_addresses.size();
    for (size_t i = 0; i < num_transfers; ++i) {
      // Addresses are stored as (host, device) pairs; swap for D2H copies.
      void* dst = request->device_mem_addresses[i];
      void* src = request->host_mem_addresses[i];
      if (request->direction == cudaMemcpyDeviceToHost) {
        std::swap(dst, src);
      }

      cudaError_t err = cudaMemcpyAsync(dst, src, request->sizes[i], request->direction, stream);
      if (err != cudaSuccess) {
        SPDLOG_WARN("cudaMemcpyAsync failed on device {}: {}", device_info.device_id, cudaGetErrorString(err));
        // Best-effort: log the failure and keep issuing the remaining copies.
        continue;
      }
    }

    // The callback fires asynchronously on the stream, so wrap it in a
    // heap-allocated holder that the trampoline deletes after invoking it.
    struct CallbackData {
      std::function<void()> callback;
    };
    CallbackData* cb_data = new CallbackData{request->callback};

    err = cudaLaunchHostFunc(
        stream,
        [](void* data) {
          // SPDLOG_DEBUG("Callback function called");
          CallbackData* cb_data = static_cast<CallbackData*>(data);
          cb_data->callback();
          delete cb_data;
        },
        cb_data);

    if (err != cudaSuccess) {
      SPDLOG_WARN("cudaLaunchHostFunc failed on device {}: {}", device_info.device_id, cudaGetErrorString(err));
      // NOTE(review): on failure cb_data leaks and the callback never runs.
    }
  }
}
|
54
csrc/balance_serve/kvc2/src/cuda_stream_manager.hh
Normal file
54
csrc/balance_serve/kvc2/src/cuda_stream_manager.hh
Normal file
|
@ -0,0 +1,54 @@
|
|||
/*
|
||||
* @Author: Xie Weiyu ervinxie@qq.com
|
||||
* @Date: 2024-11-19 09:24:47
|
||||
* @LastEditors: Xie Weiyu ervinxie@qq.com
|
||||
* @LastEditTime: 2024-11-20 02:55:49
|
||||
* @FilePath: /kvc2/src/cuda_stream_manager.hh
|
||||
* @Description: 这是默认设置,请设置`customMade`, 打开koroFileHeader查看配置 进行设置: https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE
|
||||
*/
|
||||
#pragma once

#include <cuda_runtime.h>
#include <atomic>
#include <functional>
#include <memory>
#include <thread>
#include <vector>
#include "utils/mpsc.hpp"

// Dispatches batched host<->device memcpy requests onto a pool of CUDA
// streams, with one worker thread per managed device.
class CudaStreamManager {
 public:
  // Takes the list of device IDs to manage and the number of streams
  // created (and round-robined over) on each device.
  CudaStreamManager(const std::vector<size_t>& device_ids, int num_streams_per_device);
  ~CudaStreamManager();

  // One batched copy request: parallel arrays of host/device addresses and
  // sizes, a single direction for the whole batch, and a callback fired on
  // the stream after the copies are enqueued.
  struct Request {
    bool should_exit = false;  // internal sentinel used to wake/stop a worker
    int device_id;
    std::vector<void*> host_mem_addresses;
    std::vector<void*> device_mem_addresses;
    std::vector<size_t> sizes;
    cudaMemcpyKind direction;
    std::function<void()> callback;
  };

  // Queue a request on the worker owning request->device_id; throws
  // std::runtime_error for an unknown device.
  void submitRequest(std::shared_ptr<Request> request);

 private:
  // Per-device state: worker thread, its streams, and its request queue.
  struct DeviceInfo {
    int device_id;
    std::thread worker_thread;
    std::vector<cudaStream_t> streams;
    int next_stream_index;  // round-robin cursor over `streams`
    MPSCQueueConsumerLock<std::shared_ptr<Request>> request_queue;
    std::atomic_bool stop_flag;
  };

  // Managed devices, in construction order (searched linearly by device_id).
  std::vector<std::unique_ptr<DeviceInfo>> devices_;

  // Worker loop: drains request_queue and issues async copies.
  void deviceWorker(DeviceInfo& device_info);
};
|
35
csrc/balance_serve/kvc2/src/defs.h
Normal file
35
csrc/balance_serve/kvc2/src/defs.h
Normal file
|
@ -0,0 +1,35 @@
|
|||
#ifndef __DEFS_H_
#define __DEFS_H_

#include <cstdint>
#include <optional>
#include <vector>
#include "model_config.h"

namespace kvc2 {
// Opaque handle types exposed through the C-style interface.
using kvc2_ptr = void*;
// using data_block_ptr = std::intptr_t;
using data_block_ptr = void*;
// One raw block pointer per page, for a single hidden layer.
using layer_data = std::vector<data_block_ptr>;
using kvc2_handle = void*;

using Token = uint32_t;
using Tokens = std::vector<Token>;
using TokenPtr = std::intptr_t;
using TokenLength = size_t;  // length counted in tokens
using BlockLength = size_t;  // length counted in blocks/pages

// Identifies one family of persisted KV-cache data
// (model x key-or-value x quantization).
struct CacheInfo {
  ModelName model_name;
  bool is_key_cache;  // true: K cache, false: V cache
  QuantType quant_type;

  size_t hidden_layer_count();
  // On-disk path; presumably per-layer when `which_layer` is given,
  // otherwise the family root — confirm against the implementation.
  std::filesystem::path path(std::optional<size_t> which_layer = std::nullopt);
  bool operator==(const CacheInfo& other) const;
  size_t element_size(size_t block_length);
  size_t hash_value() const;
};

};  // namespace kvc2
#endif
|
282
csrc/balance_serve/kvc2/src/gpu_cache.cpp
Normal file
282
csrc/balance_serve/kvc2/src/gpu_cache.cpp
Normal file
|
@ -0,0 +1,282 @@
|
|||
#include "gpu_cache.hh"
|
||||
|
||||
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
|
||||
#define FMT_HEADER_ONLY
|
||||
#include "spdlog/spdlog.h"
|
||||
|
||||
#include "cache_entry.hh"
|
||||
#include "utils/arithmetic.hpp"
|
||||
|
||||
namespace kvc2 {
|
||||
|
||||
// Allocate the GPU KV page-cache tensors on every configured device and
// start the CUDA stream manager used for host<->device page transfers.
// Tensor layout per device: (layer, page, token, k_head, head_dim); the
// head dimension is sharded across GPUs unless full_kv_cache_on_each_gpu.
// Exits the process when CUDA or enough GPUs are unavailable.
// (Also fixes the "Disalbe" typo in two log messages.)
GPUPageCache::GPUPageCache(GPUPageCacheConfig& config) : config(config) {
  if (torch::cuda::is_available()) {
    size_t gpu_count = torch::cuda::device_count();
    SPDLOG_INFO("Number of available GPUs: {}, want {}", gpu_count, config.gpu_devices_id.size());
    if (gpu_count < config.gpu_devices_id.size()) {
      SPDLOG_ERROR("Not enough GPUs available.");
      exit(0);
    }
    for (auto x : config.gpu_devices_id) {
      gpu_devices.push_back(torch::Device(torch::kCUDA, x));
    }
  } else {
    SPDLOG_ERROR("CUDA is not available on this system.");
    exit(0);
  }

  SPDLOG_WARN("Creating GPU Cache");
  shape.push_back(config.layer_count);
  shape.push_back(config.total_kvcache_pages);
  shape.push_back(config.num_token_per_page);
  if (config.full_kv_cache_on_each_gpu) {
    if (config.gpu_devices_id.size() > 1) {
      SPDLOG_WARN("Replicated KVCache on multiple gpu");
    }
    shape.push_back(config.num_k_heads);
  } else {
    // Tensor-parallel: each GPU holds an equal slice of the heads.
    shape.push_back(config.num_k_heads / config.gpu_devices_id.size());
  }
  shape.push_back(config.k_head_dim);
  tensor_size = torch::elementSize(config.tensor_type);
  for (auto& s : shape) {
    tensor_size *= s;
  }
  SPDLOG_INFO("Creating KV Page Cache, Shape ({},{},{},{},{}), Size {} MiB", shape[0], shape[1], shape[2], shape[3],
              shape[4], tensor_size / (1 << 20));
  if (config.k_cache_on) {
    for (size_t i = 0; i < config.gpu_devices_id.size(); i++) {
      auto k = torch::zeros(shape, torch::TensorOptions().dtype(config.tensor_type));
      k = k.to(gpu_devices[i]);

      k_cache.push_back(k);

      SPDLOG_INFO("K Page Cache of GPU {} is created", config.gpu_devices_id[i]);
    }
    occupations.resize(config.layer_count);
  } else {
    SPDLOG_WARN("Disable K Cache");
    assert(config.gpu_only);
  }

  if (config.v_cache_on) {
    for (size_t i = 0; i < config.gpu_devices_id.size(); i++) {
      auto v = torch::zeros(shape, torch::TensorOptions().dtype(config.tensor_type));
      v = v.to(gpu_devices[i]);
      v_cache.push_back(v);

      SPDLOG_INFO("V Page Cache of GPU {} is created", config.gpu_devices_id[i]);
    }
    v_occupations.resize(config.layer_count);
  } else {
    SPDLOG_WARN("Disable V Cache");
    // assert(config.gpu_only); // should not assert
  }

  if (config.gpu_only) {
    gpu_only_occupations.resize(config.total_kvcache_pages, false);
  }

  num_free_pages = config.total_kvcache_pages;
  for (size_t i = 0; i < config.layer_count; i++) {
    if (config.k_cache_on)
      occupations[i].resize(config.total_kvcache_pages, nullptr);
    if (config.v_cache_on)
      v_occupations[i].resize(config.total_kvcache_pages, nullptr);
  }

  // Per-GPU byte size and offset of one page's head-slice, used when
  // scattering a host page across the tensor-parallel devices.
  tp_size.resize(config.gpu_devices_id.size(), shape[2] * shape[3] * shape[4] * c10::elementSize(config.tensor_type));
  tp_offset.resize(config.gpu_devices_id.size(), 0);
  for (size_t i = 1; i < tp_offset.size(); i++) {
    tp_offset[i] = tp_offset[i - 1] + tp_size[i - 1];
  }

  stream_manager =
      std::unique_ptr<CudaStreamManager>(new CudaStreamManager(config.gpu_devices_id, config.num_streams_per_device));
}
|
||||
|
||||
// Claim one free GPU page column for the token-block at position `at` and
// record every layer's K/V entries in the occupation tables. The column
// index is stored on the layer-0 K entry. Caller must hold the entry locks.
// Returns false when no column is free even after eviction.
bool GPUPageCache::alloc_col(std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& k_entries,
                             std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& v_entries, size_t at) {
  std::lock_guard<std::mutex> lg(lock);
  auto idx = next_empty_col();
  if (idx.has_value()) {
    // must have entry lock
    auto& k0_entry = k_entries[0][at];
    k0_entry->gpu_block_idx = idx;

    for (size_t l = 0; l < config.layer_count; l++) {
      if (config.k_cache_on) {
        assert(k_entries[l][at]->data != nullptr);
        occupations[l][idx.value()] = k_entries[l][at];
      }
      if (config.v_cache_on) {
        assert(v_entries[l][at]->data != nullptr);
        v_occupations[l][idx.value()] = v_entries[l][at];
      }
    }
    return true;
  } else {
    return false;
  }
}
|
||||
|
||||
// GPU-only mode: grab `count` free page columns in one shot. Returns their
// indices, or an empty vector (allocating nothing) when fewer than `count`
// columns are free.
std::vector<size_t> GPUPageCache::gpu_only_alloc_col(size_t count) {
  assert(config.gpu_only);
  std::lock_guard<std::mutex> lg(lock);
  std::vector<size_t> re;

  for (size_t i = 0; i < config.total_kvcache_pages; i++) {
    if (gpu_only_occupations[i] == false) {
      re.push_back(i);
      if (re.size() == count) {
        break;
      }
    }
  }

  if (re.size() == count) {
    // Commit only when the request is fully satisfiable.
    for (auto at : re) {
      gpu_only_occupations[at] = true;
    }
  } else {
    SPDLOG_WARN("GPU ONLY: Cannot allocate {} cols", count);
    re.clear();
  }
  return re;
}
|
||||
|
||||
void GPUPageCache::gpu_only_free_cols(std::vector<size_t> cols) {
|
||||
assert(config.gpu_only);
|
||||
std::lock_guard<std::mutex> lg(lock);
|
||||
for (auto at : cols) {
|
||||
assert(gpu_only_occupations[at]);
|
||||
gpu_only_occupations[at] = false;
|
||||
}
|
||||
}
|
||||
|
||||
// Find a free column, running eviction first when none is free. Must be
// called with `lock` held. Scans circularly from the previous position
// using the layer-0 occupation row as the free/used marker. Returns
// nullopt when eviction freed nothing.
std::optional<size_t> GPUPageCache::next_empty_col() {
  if (num_free_pages == 0) {
    evict_cols();
    if (num_free_pages == 0) {
      return std::nullopt;
    }
  }
  while (occupations[0][_col_idx] != nullptr) {
    _col_idx = (_col_idx + 1) % config.total_kvcache_pages;
  }
  num_free_pages -= 1;
  return _col_idx;
}
|
||||
|
||||
// Release every column whose layer-0 entry is unreferenced and clean.
// Clearing the layer-0 slot is what marks the column free; the other
// layers' slots are simply overwritten on reuse.
void GPUPageCache::evict_cols() {
  auto evicted_count = 0;
  for (size_t i = 0; i < config.total_kvcache_pages; i++) {
    auto& h = occupations[0][i];
    if (h == nullptr) {
      continue;
    }
    auto lg = h->lock_guard();
    if (h->gpu_cc.can_desert()) {
      h->gpu_cc.tc.reset();
      h = nullptr;
      num_free_pages += 1;
      evicted_count += 1;
    }
  }
  if (evicted_count > 0)
    SPDLOG_INFO("GPU: Evicted {} GPU pages", evicted_count);
}
|
||||
|
||||
// Try to lock every K and V entry occupying column `at` without blocking.
// Returns all the acquired locks on success; returns an empty vector (thus
// releasing anything already acquired) when any slot is empty or any lock
// attempt fails.
std::vector<std::unique_lock<CacheBlockEntry::MutexT>> GPUPageCache::try_lock_col(size_t at) {
  std::vector<std::unique_lock<CacheBlockEntry::MutexT>> locks;
  auto lock_layers = [&](auto& table) {
    for (size_t l = 0; l < config.layer_count; l++) {
      auto& entry = table[l][at];
      if (entry == nullptr) {
        return false;
      }
      auto ul = entry->try_lock();
      if (!ul.owns_lock()) {
        return false;
      }
      locks.push_back(std::move(ul));
    }
    return true;
  };
  if (config.k_cache_on && !lock_layers(occupations)) {
    return {};
  }
  if (config.v_cache_on && !lock_layers(v_occupations)) {
    return {};
  }
  return locks;
}
|
||||
|
||||
// Build one empty transfer request per configured GPU device, all sharing
// the same copy direction and completion callback. Copy descriptors are
// appended later via append_col_to_request.
std::vector<std::shared_ptr<CudaStreamManager::Request>> GPUPageCache::basic_request(cudaMemcpyKind direction,
                                                                                     std::function<void()> callback) {
  std::vector<std::shared_ptr<CudaStreamManager::Request>> re;
  re.reserve(config.gpu_devices_id.size());
  for (size_t i = 0; i < config.gpu_devices_id.size(); i++) {
    // std::make_shared: single allocation and exception-safe
    // (previously a raw `new` wrapped in shared_ptr after a resize(nullptr)).
    auto req = std::make_shared<CudaStreamManager::Request>();
    req->direction = direction;
    req->device_id = config.gpu_devices_id[i];
    req->callback = callback;
    re.push_back(std::move(req));
  }
  return re;
}
|
||||
|
||||
// Hand every prepared transfer request over to the CUDA stream manager.
void GPUPageCache::submit_requests(std::vector<std::shared_ptr<CudaStreamManager::Request>> reqs) {
  for (const auto& request : reqs) {
    stream_manager->submitRequest(request);
  }
}
|
||||
|
||||
// Append the host<->device copy descriptors for page column `at` (all
// layers, split across GPUs by tp_offset/tp_size) to the per-GPU requests
// produced by basic_request.
// NOTE(review): gpu_block_idx is read from k_handles even when only the V
// cache is enabled — confirm k_handles is always populated in that mode.
void GPUPageCache::append_col_to_request(std::vector<std::shared_ptr<CudaStreamManager::Request>>& reqs,
                                         std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& k_handles,
                                         std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& v_handles,
                                         size_t at) {
  if (config.k_cache_on == false && config.v_cache_on == false) {
    return;
  }
  // All layers of a column share a single GPU block index; layer 0 is
  // representative.
  auto gpu_block_idx = k_handles[0][at]->gpu_block_idx.value();
  for (size_t layer = 0; layer < config.layer_count; layer++) {
    for (size_t which_gpu = 0; which_gpu < config.gpu_devices_id.size(); which_gpu++) {
      if (config.k_cache_on) {
        assert(k_handles[layer][at]->data != nullptr);
        // Each GPU copies its slice of the host page: tp_size bytes
        // starting at tp_offset.
        reqs[which_gpu]->sizes.push_back(tp_size[which_gpu]);
        reqs[which_gpu]->host_mem_addresses.push_back(
            offset_by_bytes(k_handles[layer][at]->data, tp_offset[which_gpu]));
        reqs[which_gpu]->device_mem_addresses.push_back(k_cache[which_gpu][layer][gpu_block_idx].data_ptr());
      }

      if (config.v_cache_on) {
        assert(v_handles[layer][at]->data != nullptr);
        reqs[which_gpu]->sizes.push_back(tp_size[which_gpu]);
        reqs[which_gpu]->host_mem_addresses.push_back(
            offset_by_bytes(v_handles[layer][at]->data, tp_offset[which_gpu]));
        reqs[which_gpu]->device_mem_addresses.push_back(v_cache[which_gpu][layer][gpu_block_idx].data_ptr());
      }
    }
  }
  // SPDLOG_DEBUG("GPU: Appended Vertical Handle to Request, count {}", reqs[0]->sizes.size());
}
|
||||
|
||||
void GPUPageCache::debug() {
|
||||
size_t count = 0;
|
||||
for (size_t i = 0; i < config.total_kvcache_pages; i++) {
|
||||
if (occupations[0][i] == nullptr) {
|
||||
count += 1;
|
||||
} else {
|
||||
// occupations[0][i]->gpu_cc.debug();
|
||||
}
|
||||
}
|
||||
SPDLOG_DEBUG("Free Page: {}/{}", count, config.total_kvcache_pages);
|
||||
}
|
||||
|
||||
} // namespace kvc2
|
74
csrc/balance_serve/kvc2/src/gpu_cache.hh
Normal file
74
csrc/balance_serve/kvc2/src/gpu_cache.hh
Normal file
|
@ -0,0 +1,74 @@
|
|||
#ifndef __GPU_CACHE_HH_
#define __GPU_CACHE_HH_

#include <torch/torch.h>
#include "cache_entry.hh"
#include "cuda_stream_manager.hh"
#include "defs.h"
#include "kvc2.h"
#include "metrics.h"
#include "utils/periodic_task.hpp"

namespace kvc2 {

// Page table of KV-cache pages resident on the GPU(s). Each "column" is one
// page slot replicated across all layers (and across K/V when both caches
// are enabled); occupation tables map a column back to the host-side
// CacheBlockEntry that owns the data.
class GPUPageCache {
  std::vector<torch::Device> gpu_devices;

  std::vector<int64_t> shape;     // presumably the per-page tensor shape — confirm in gpu_cache.cpp
  size_t tensor_size;             // bytes of one page tensor — TODO confirm
  std::vector<size_t> tp_offset;  // per-GPU byte offset into a host page (see append_col_to_request)
  std::vector<size_t> tp_size;    // per-GPU byte count of a host page (see append_col_to_request)



  // met
  std::shared_ptr<Metrics> met;

  // states
  std::mutex lock;                        // guards the occupation state below
  size_t num_free_pages;
  std::vector<bool> gpu_only_occupations;  // used only in gpu_only mode
  // [layer][column] -> owning host entry; nullptr means the column is free.
  std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>> occupations,v_occupations;
  size_t _col_idx = 0;                    // round-robin cursor for next_empty_col


  // cuda stream manager
  std::optional<size_t> next_empty_col();

 public:
  GPUPageCacheConfig config;
  std::unique_ptr<CudaStreamManager> stream_manager;
  std::vector<torch::Tensor> k_cache;  // per-GPU, per-layer page tensors
  std::vector<torch::Tensor> v_cache;
  std::unique_ptr<periodic::PeriodicTask> background_flush_back =nullptr;

  GPUPageCache(GPUPageCacheConfig& config);

  // gpu_only mode: reserve / release raw page columns without host entries.
  std::vector<size_t> gpu_only_alloc_col(size_t count);
  void gpu_only_free_cols(std::vector<size_t> cols);


  void gpu_background_flush();


  // Bind the given host entries to GPU page column `at`.
  bool alloc_col(std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& k_entries,
                 std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& v_entries, size_t at);
  void evict_cols();
  void flush_col(size_t at);
  std::vector<std::unique_lock<CacheBlockEntry::MutexT>> try_lock_col(size_t at);

  void free_col(size_t at);

  // Build one empty per-GPU transfer request (direction + callback only).
  std::vector<std::shared_ptr<CudaStreamManager::Request>> basic_request(cudaMemcpyKind direction,
                                                                         std::function<void()> callback);

  void submit_requests(std::vector<std::shared_ptr<CudaStreamManager::Request>> reqs);

  // Append column `at`'s copy descriptors to previously built requests.
  void append_col_to_request(std::vector<std::shared_ptr<CudaStreamManager::Request>>& reqs,
                             std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& k_handles,
                             std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& v_handles, size_t at);

  void debug();
};
}  // namespace kvc2
#endif
|
40
csrc/balance_serve/kvc2/src/hasher.hpp
Normal file
40
csrc/balance_serve/kvc2/src/hasher.hpp
Normal file
|
@ -0,0 +1,40 @@
|
|||
#ifndef __HASHER_HPP_
#define __HASHER_HPP_

#include "defs.h"
#include "xxhash.h"

namespace kvc2 {

// Seeds for the content hash and the independent verification hash.
const uint64_t hash_seed = 4123512;
const uint64_t check_hash_seed = 1025753;

using TokensHash = XXH64_hash_t;
// Streaming XXH64 hasher over token sequences. Non-copyable and non-movable
// because it owns a raw xxHash state.
struct TokensHasher {
  XXH64_state_t* state;
  TokensHasher() {
    state = XXH64_createState();
    reset();
  }
  ~TokensHasher() { XXH64_freeState(state); }

  TokensHasher(TokensHasher& other) = delete;
  TokensHasher& operator=(TokensHasher& other) = delete;
  TokensHasher(TokensHasher&& other) = delete;
  TokensHasher& operator=(TokensHasher&& other) = delete;
  // Digest of everything fed in since the last reset().
  TokensHash get() { return XXH64_digest(state); }
  void reset(size_t seed = hash_seed) { XXH64_reset(state, seed); }
  // Feed `length` tokens and return the running digest.
  TokensHash update(Token* data, TokenLength length) {
    XXH64_update(state, data, length * sizeof(Token));
    return get();
  }

  // Feed raw bytes and return the running digest.
  TokensHash update_raw(void* data, size_t size) {
    XXH64_update(state, data, size);
    return get();
  }

  // One-shot hash of a token sequence with the default seed.
  static TokensHash hash(Token* data, TokenLength length) { return XXH64(data, length * sizeof(Token), hash_seed); }
};
}  // namespace kvc2
#endif
|
155
csrc/balance_serve/kvc2/src/io_helper.hpp
Normal file
155
csrc/balance_serve/kvc2/src/io_helper.hpp
Normal file
|
@ -0,0 +1,155 @@
|
|||
/**
|
||||
* @Description :
|
||||
* @Author : Xie Weiyu
|
||||
* @Date : 2024-12-11 06:35:31
|
||||
* @Version : 1.0.0
|
||||
* @LastEditors : Xie Weiyu
|
||||
* @LastEditTime : 2024-12-11 06:50:55
|
||||
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
**/
|
||||
|
||||
#pragma once
|
||||
#include <atomic>
|
||||
#include <future>
|
||||
#include <iostream>
|
||||
#include <mutex>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// Countdown latch built on std::promise: the shared future becomes ready
// once set() has been called as many times as the tracked count.
struct BatchPromise {
  std::promise<void> promise;
  std::shared_future<void> fut;
  std::atomic_size_t count;

  // Start with `initial` outstanding completions.
  inline BatchPromise(size_t initial) : fut(promise.get_future().share()), count(initial) {}

  // Register `n` additional completions that must arrive before readiness.
  inline void inc(size_t n = 1) { count.fetch_add(n, std::memory_order_seq_cst); }

  // Signal one completion; the final one fulfils the promise.
  inline void set() {
    if (count.fetch_sub(1, std::memory_order_seq_cst) == 1) {
      promise.set_value();
    }
  }
  inline std::shared_future<void> get_shared_fut() { return fut; }
};
||||
|
||||
// Tracks whether a block's payload is present and whether a transfer that
// will produce it is already in flight. All state is guarded by `lock`
// (Lock is the mutex type, e.g. std::mutex).
template <typename Lock>
struct TransferControl {
  Lock lock;

  std::optional<std::shared_future<void>> transfer_ok = std::nullopt;
  bool has_data = false;

  TransferControl() {}

  /*
  true, std::nullopt : Already has data
  false, shared_future : Transfer already started, should wait for the future
  false, std::nullopt : should transfer by you
  true, shared_future: Should not appear
  */
  std::pair<bool, std::optional<std::shared_future<void>>> has_data_or_transfer(std::shared_future<void> shared_fut) {
    std::lock_guard<Lock> lg(lock);
    if (has_data) {
      return {true, std::nullopt};
    }
    if (transfer_ok.has_value()) {
      return {false, transfer_ok};
    }
    // No data and no transfer in flight: register the caller's future and
    // tell the caller to perform the transfer itself.
    transfer_ok = shared_fut;
    return {false, std::nullopt};
  }

  // Mark the payload as present and clear the in-flight-transfer record.
  void set_has_data() {
    std::lock_guard<Lock> lg(lock);
    has_data = true;
    transfer_ok = std::nullopt;
  }

  bool get_has_data() {
    std::lock_guard<Lock> lg(lock);
    return has_data;
  }

  // Forget both payload and in-flight transfer (e.g. on eviction).
  void reset() {
    std::lock_guard<Lock> lg(lock);
    transfer_ok = std::nullopt;
    has_data = false;
  }

  std::string debug() {
    std::lock_guard<Lock> lg(lock);
    return std::string("") + (has_data ? "has data" : "no data") + " " +
           (transfer_ok.has_value() ? "transfer " : "no transfer");
  }
};
|
||||
|
||||
// Per-entry concurrency state: dirty flag, active reference count, and the
// transfer-control record guarding the entry's payload.
struct ConcurrentController {
  std::atomic_bool dirty = false;     // payload modified since last flush
  std::atomic_size_t ref_count = 0;   // outstanding users of this entry
  TransferControl<std::mutex> tc;     // presence / in-flight-transfer state
};
|
||||
|
||||
// Coordinates a batch of asynchronous IO tasks over `Unit` objects. Tasks
// registered via new_task/finish_add_taks are counted down through the
// underlying BatchPromise; wait() blocks until all tasks — both ours and
// those owned by other parties — complete, then runs the callbacks.
template <typename Unit>
struct IO_Helper {
  BatchPromise batch_promise;
  std::function<void(Unit*)> call_back_on_unit = nullptr;  // run once per unit transferred by us
  std::function<void()> call_back = nullptr;               // run once after the whole batch completes

  std::vector<std::shared_future<void>> futs;   // transfers owned by others that we must wait on
  std::vector<Unit*> units_by_myself;           // units whose transfer we perform ourselves

  // batch_promise starts at 1: finish_add_taks() drops that sentinel.
  IO_Helper(std::function<void(Unit*)> call_back_on_unit, std::function<void()> call_back = nullptr)
      : batch_promise(1), call_back_on_unit(call_back_on_unit), call_back(call_back) {}

  IO_Helper(const IO_Helper& other) = delete;
  IO_Helper& operator=(const IO_Helper& other) = delete;
  IO_Helper(IO_Helper&& other) = delete;
  IO_Helper& operator=(IO_Helper&& other) = delete;
  ~IO_Helper() {
    // std::cout<<"Destory IO helper"<<std::endl;
  }

  size_t total_task_count = 0;
  // NOTE(review): adds `count` pending completions to the batch promise but
  // always bumps total_task_count by exactly 1 — confirm this asymmetry is
  // intended when count > 1.
  void new_task(size_t count = 1) {
    total_task_count += 1;
    batch_promise.inc(count);
  }
  // Drops the constructor's sentinel count so the batch can complete.
  // (Name kept as-is: "taks" typo is part of the public interface.)
  void finish_add_taks() { batch_promise.set(); }

  // Claim or observe the transfer state of `unit`.
  // Returns true iff the caller must perform the transfer itself.
  bool absorb_tc(Unit* unit, TransferControl<std::mutex>& tc) {
    auto [ok, fut] = tc.has_data_or_transfer(batch_promise.get_shared_fut());
    if (ok) {
      return false;  // data already present, nothing to do
    } else {
      if (fut.has_value()) {
        futs.push_back(fut.value());
        // printf("Transfer started\n");
        return false;  // someone else is transferring; we just wait on it
      } else {
        units_by_myself.push_back(unit);
        // printf("Not Transfer\n");
        return true;   // we registered our future; we perform the transfer
      }
    }
  }

  // Block until every transfer (ours and others') finished, then fire the
  // per-unit and final callbacks.
  void wait() {
    for (auto& fut : futs) {
      fut.wait();
    }
    batch_promise.get_shared_fut().wait();
    for (auto& b : units_by_myself) {
      call_back_on_unit(b);
    }
    if (call_back)
      call_back();
  }
};
|
138
csrc/balance_serve/kvc2/src/kvc2.h
Normal file
138
csrc/balance_serve/kvc2/src/kvc2.h
Normal file
|
@ -0,0 +1,138 @@
|
|||
#pragma once
|
||||
#include <torch/torch.h>
|
||||
#include <cstdint>
|
||||
#include <optional>
|
||||
#include <vector>
|
||||
#include "defs.h"
|
||||
#include "model_config.h"
|
||||
|
||||
namespace kvc2 {
|
||||
// Configuration of the GPU-resident KV-cache page pool.
struct GPUPageCacheConfig {
  bool gpu_only;                       // pages live only on GPU (no host/disk backing)
  std::vector<size_t> gpu_devices_id;  // CUDA device ids participating in the cache

  size_t layer_count;
  size_t total_kvcache_pages;          // number of page columns
  size_t num_token_per_page;
  size_t num_k_heads;
  size_t k_head_dim;

  // presumably replicates the full cache per GPU instead of splitting it — TODO confirm
  bool full_kv_cache_on_each_gpu = false;
  bool k_cache_on = true;
  bool v_cache_on = true;
  torch::ScalarType tensor_type;       // dtype of the page tensors

  // for cuda stream manager
  size_t num_streams_per_device = 4;
};
|
||||
|
||||
struct KVC2Config {
|
||||
bool k_cache_on = true;
|
||||
bool v_cache_on = true;
|
||||
bool gpu_only = false;
|
||||
bool load_from_disk = true;
|
||||
bool save_to_disk = true;
|
||||
std::string path;
|
||||
std::string config_path;
|
||||
TokenLength num_token_per_page = 256;
|
||||
size_t memory_pool_size = 10e9;
|
||||
size_t evict_count = 20;
|
||||
std::optional<GPUPageCacheConfig> gpu_cache_config = std::nullopt;
|
||||
size_t metrics_port;
|
||||
double recompute_ratio = 0.2;
|
||||
};
|
||||
|
||||
class DoubleCacheHandleInterface;
// Public entry point of KVC2: a tiered (disk / host / GPU) KV-cache store.
class KVC2Interface {
 public:
  virtual ~KVC2Interface() = default;

  // Load / persist the on-disk index and metadata.
  virtual void load() = 0;
  virtual void save() = 0;
  /*
  Raw Insert
  Insert kvcache from kvcache_data to disk.

  info: cache info
  id: start pointer of token array
  length: length of token array
  kvcache_data: data of kvcache

  This will firstly match the ID array with the existing kvcache, and then insert the unmatched kvcache to disk.
  */
  virtual void raw_insert(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
                          const std::vector<layer_data>& k_cache, const std::vector<layer_data>& v_cache) = 0;

  /*
  Raw Read
  Read kvcache from disk to user specified pointers.

  info: cache info
  id: start pointer of token array
  length: length of token array
  kvcache_data: data of kvcache
  Return: matched length of prefix, in tokens

  This will not read from memory pool, it directly read from disk.
  */
  virtual TokenLength raw_read(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
                               const std::vector<layer_data>& k_cache, const std::vector<layer_data>& v_cache) = 0;

  /*
  Lookup
  Lookup kvcache and load it from disk to memory pool if needed.

  info: cache info
  id: start pointer of token array
  length: length of token array

  Return: kvc2_handle, holds kvcache until being released.
  if not found, matched_length will return 0.
  if memory pool is full, return nullptr
  */
  virtual std::shared_ptr<DoubleCacheHandleInterface> lookup(ModelName model_name, QuantType quant_type, Token* id,
                                                             TokenLength length, TokenLength estimated_length) = 0;

  /*
  Lookup and allocate to gpu
  info.is_k_cache does not matter here
  */
  virtual std::shared_ptr<DoubleCacheHandleInterface> lookup_to_gpu(ModelName model_name, QuantType quant_type,
                                                                    Token* id, TokenLength length,
                                                                    TokenLength estimated_length) = 0;

  // Asynchronous variant of lookup_to_gpu; call_back receives the handle
  // (or nullptr on failure) when the lookup/transfer completes.
  virtual void lookup_to_gpu_async(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
                                   TokenLength estimated_length,
                                   std::function<void(std::shared_ptr<DoubleCacheHandleInterface>)> call_back) = 0;

  // Returns the underlying (k_cache, v_cache) GPU page tensors.
  virtual std::pair<std::vector<torch::Tensor>, std::vector<torch::Tensor>> get_kvcache() = 0;

  virtual void debug() = 0;
};
|
||||
|
||||
// Factory: build a KVC2 instance from the given configuration.
std::shared_ptr<KVC2Interface> create_kvc2(KVC2Config config);

// Per-block match result of a lookup against the cached prefixes.
enum MatchStatus {
  Exact,
  Partial,
  NotMatchExact,
  NotMatchPartial,
};

// Handle to a matched K/V cache span; keeps the underlying pages alive
// until the handle is released.
class DoubleCacheHandleInterface {
 public:
  virtual ~DoubleCacheHandleInterface() = default;
  // Length of the matched token prefix.
  virtual TokenLength matched_length() = 0;
  virtual std::vector<MatchStatus> matched_status() = 0;
  // Per-layer data pointers for the key or value cache.
  virtual std::vector<layer_data> handle_data(bool is_key_cache) = 0;
  // Ensure the matched span is resident on GPU; returns false on failure.
  virtual bool to_gpu() = 0;
  virtual void to_gpu_async(std::function<void(bool)> call_back) = 0;
  virtual std::vector<size_t> get_gpu_block_idx() = 0;
  virtual std::vector<size_t> get_gpu_attached_block_idx() = 0;

  virtual void append_tokens(Token* tokens, TokenLength length) = 0;  // update generated tokens

  virtual void debug() = 0;
};
|
||||
|
||||
}; // namespace kvc2
|
64
csrc/balance_serve/kvc2/src/kvc2_utils.py
Normal file
64
csrc/balance_serve/kvc2/src/kvc2_utils.py
Normal file
|
@ -0,0 +1,64 @@
|
|||
import torch
|
||||
import ctypes
|
||||
|
||||
def aligned_tensor(size, alignment=4096):
    """Allocate `size` bytes aligned to `alignment` via posix_memalign and
    wrap them in an int8 torch tensor.

    Returns (tensor, mem): `mem` is the ctypes pointer that must later be
    released with free() (see dealloc_aligned_cache); the tensor borrows the
    memory and does not own it.
    """
    mem = ctypes.c_void_p()
    rc = ctypes.CDLL(None).posix_memalign(
        ctypes.byref(mem), ctypes.c_size_t(alignment), ctypes.c_size_t(size)
    )

    if rc != 0:
        raise MemoryError(f"posix_memalign failed with error code {rc}")

    raw_array = (ctypes.c_int8 * size).from_address(mem.value)

    tensor = torch.frombuffer(raw_array, dtype=torch.int8)

    # posix_memalign guarantees alignment; this is a defensive re-check.
    if tensor.data_ptr() % alignment != 0:
        raise ValueError(f"Tensor data_ptr {tensor.data_ptr()} is not aligned to {alignment} bytes")

    return tensor, mem
|
||||
|
||||
def alloc_aligned_cache(layer_count, block_count, element_size):
    """Allocate a [layer][block] grid of page-aligned int8 tensors.

    Returns (cache, cache_mem): the tensors and the matching ctypes pointers
    required later by dealloc_aligned_cache.
    """
    cache = []
    cache_mem = []
    for _layer in range(layer_count):
        tensors = []
        mems = []
        for _block in range(block_count):
            tensor, mem_ptr = aligned_tensor(element_size, alignment=4096)
            tensors.append(tensor)
            mems.append(mem_ptr)
        cache.append(tensors)
        cache_mem.append(mems)
    return cache, cache_mem
|
||||
|
||||
def dealloc_aligned_cache(cache_mem):
    """Free every ctypes pointer previously returned by alloc_aligned_cache.

    Any tensors built over this memory become invalid afterwards.
    """
    libc = ctypes.CDLL(None)
    for layer_mem in cache_mem:
        for mem_ptr in layer_mem:
            libc.free(mem_ptr)
|
||||
|
||||
def get_tensor_ptr(tensors):
    """Map a [layer][block] grid of tensors to their raw data pointers."""
    return [[block.data_ptr() for block in layer] for layer in tensors]
|
||||
|
||||
def get_tensor_from_data_ptr(matched_data, element_size):
    """Rebuild int8 tensor views from a [layer][block] grid of raw addresses.

    Each returned tensor borrows `element_size` bytes at its address; the
    caller must keep the backing memory alive.
    """
    result = []
    for layer in matched_data:
        row = []
        for data_ptr in layer:
            buffer = (ctypes.c_int8 * element_size).from_address(data_ptr)
            row.append(torch.frombuffer(buffer, dtype=torch.int8))
        result.append(row)
    return result
|
||||
if __name__ == "__main__":
|
||||
pass
|
141
csrc/balance_serve/kvc2/src/metrics.cpp
Normal file
141
csrc/balance_serve/kvc2/src/metrics.cpp
Normal file
|
@ -0,0 +1,141 @@
|
|||
#include "metrics.h"
|
||||
|
||||
namespace kvc2 {
|
||||
|
||||
// Register every KVC2 metric with a fresh registry and start exposing them
// over HTTP at config.endpoint. The raw metric pointers stored in members
// are owned by the registry and stay valid for its lifetime.
Metrics::Metrics(const MetricsConfig& config)
    : registry_(std::make_shared<prometheus::Registry>()), exposer_(config.endpoint) {
  // Register the prefix_nodes counter
  auto& prefix_nodes_family = prometheus::BuildCounter()
                                  .Name(std::string(METRIC_PREFIX) + "_prefix_nodes")
                                  .Help("Number of prefix nodes")
                                  .Register(*registry_);
  prefix_nodes = &prefix_nodes_family.Add({});

  // Register the prefix_block_count counter
  auto& prefix_block_count_family = prometheus::BuildCounter()
                                        .Name(std::string(METRIC_PREFIX) + "_prefix_block_count")
                                        .Help("Number of prefix blocks")
                                        .Register(*registry_);
  prefix_block_count = &prefix_block_count_family.Add({});

  // Shared histogram buckets, capped at 10000 ms (10 s)
  std::vector<double> common_buckets = {1.0, 5.0, 10.0, 50.0, 100.0, 500.0, 1000.0, 5000.0, 10000.0};

  // Register the raw_insert_time_ms histogram
  auto& raw_insert_time_ms_family = prometheus::BuildHistogram()
                                        .Name(std::string(METRIC_PREFIX) + "_raw_insert_time_ms")
                                        .Help("function raw insert's time in milliseconds")
                                        .Register(*registry_);
  raw_insert_time_ms = &raw_insert_time_ms_family.Add({}, common_buckets);

  // Register the lookup_time_ms histogram
  auto& lookup_time_ms_family = prometheus::BuildHistogram()
                                    .Name(std::string(METRIC_PREFIX) + "_lookup_time_ms")
                                    .Help("function lookup's time in milliseconds")
                                    .Register(*registry_);
  lookup_time_ms = &lookup_time_ms_family.Add({}, common_buckets);

  // Register the lookup_prefixmatch_length histogram
  // NOTE(review): reuses the millisecond buckets for a length metric —
  // confirm the bucket boundaries are intended here.
  auto& lookup_prefixmatch_length_family = prometheus::BuildHistogram()
                                               .Name(std::string(METRIC_PREFIX) + "_lookup_prefixmatch_length")
                                               .Help("function lookup's prefix match length")
                                               .Register(*registry_);
  lookup_prefixmatch_length = &lookup_prefixmatch_length_family.Add({}, common_buckets);

  // Register the matched_length_percentage histogram
  // NOTE(review): same bucket-reuse caveat as above for a percentage metric.
  auto& matched_length_percentage_family = prometheus::BuildHistogram()
                                               .Name(std::string(METRIC_PREFIX) + "_matched_length_percentage")
                                               .Help("function matched length percentage")
                                               .Register(*registry_);
  matched_length_percentage = &matched_length_percentage_family.Add({}, common_buckets);

  // Register the disk_usage gauge
  auto& disk_usage_family =
      prometheus::BuildGauge().Name(std::string(METRIC_PREFIX) + "_disk_usage").Help("disk usage").Register(*registry_);
  disk_usage = &disk_usage_family.Add({});

  // Register the memory_pool_size gauge family (per-type series added lazily)
  memory_pool_size_family_ = &prometheus::BuildGauge()
                                  .Name(std::string(METRIC_PREFIX) + "_memory_pool_size")
                                  .Help("memory pool size")
                                  .Register(*registry_);

  // Register the memory_pool_node_count gauge family
  memory_pool_node_count_family_ = &prometheus::BuildGauge()
                                        .Name(std::string(METRIC_PREFIX) + "_memory_pool_node_count")
                                        .Help("memory pool node count")
                                        .Register(*registry_);

  // Register the lru_entry_count gauge family
  lru_entry_count_family_ = &prometheus::BuildGauge()
                                 .Name(std::string(METRIC_PREFIX) + "_lru_entry_count")
                                 .Help("lru entry count")
                                 .Register(*registry_);

  // Register the gpu_page_count gauge family
  gpu_page_count_family_ = &prometheus::BuildGauge()
                                .Name(std::string(METRIC_PREFIX) + "_gpu_page_count")
                                .Help("gpu page count")
                                .Register(*registry_);

  // Register the append_tokens_time_ms histogram
  auto& append_tokens_time_ms_family = prometheus::BuildHistogram()
                                           .Name(std::string(METRIC_PREFIX) + "_append_tokens_time_ms")
                                           .Help("append tokens time in milliseconds")
                                           .Register(*registry_);
  append_tokens_time_ms = &append_tokens_time_ms_family.Add({}, common_buckets);

  // Register the gpu_flush_back_time_ms histogram
  auto& gpu_flush_back_time_ms_family = prometheus::BuildHistogram()
                                            .Name(std::string(METRIC_PREFIX) + "_gpu_flush_back_time_ms")
                                            .Help("gpu flush back time in milliseconds")
                                            .Register(*registry_);
  gpu_flush_back_time_ms = &gpu_flush_back_time_ms_family.Add({}, common_buckets);

  // Register the cpu_flush_back_time_ms histogram
  auto& cpu_flush_back_time_ms_family = prometheus::BuildHistogram()
                                            .Name(std::string(METRIC_PREFIX) + "_cpu_flush_back_time_ms")
                                            .Help("cpu flush back time in milliseconds")
                                            .Register(*registry_);
  cpu_flush_back_time_ms = &cpu_flush_back_time_ms_family.Add({}, common_buckets);

  // Expose the registry over the HTTP endpoint.
  exposer_.RegisterCollectable(registry_);
}
|
||||
|
||||
// Destructor. The exposer and registry tear themselves down; the explicit
// stop call is intentionally left disabled.
Metrics::~Metrics() {
  // Stop exposing metrics
  // exposer_.Stop();
}
|
||||
|
||||
// Get (or lazily create) the memory_pool_size gauge for `type`.
// Family::Add returns the existing series for identical labels, so repeated
// calls with the same type yield the same gauge.
prometheus::Gauge* Metrics::memory_pool_size(const std::string& type) {
  return &memory_pool_size_family_->Add({{"type", type}});
}

// Get (or lazily create) the memory_pool_node_count gauge for `type`.
prometheus::Gauge* Metrics::memory_pool_node_count(const std::string& type) {
  return &memory_pool_node_count_family_->Add({{"type", type}});
}

// Get (or lazily create) the lru_entry_count gauge for `type`.
prometheus::Gauge* Metrics::lru_entry_count(const std::string& type) {
  return &lru_entry_count_family_->Add({{"type", type}});
}

// Get (or lazily create) the gpu_page_count gauge for `type`.
prometheus::Gauge* Metrics::gpu_page_count(std::string type) {
  return &gpu_page_count_family_->Add({{"type", type}});
}
|
||||
|
||||
// RAII scope timer: starts timing on construction and records the elapsed
// duration (in milliseconds) into the histogram on destruction.
TimeObserver::TimeObserver(prometheus::Histogram* h) {
  histogram_ = h;
  timer_.start();
}

TimeObserver::~TimeObserver() {
  timer_.stop();
  histogram_->Observe(timer_.elapsedNs() / 1e6);  // ns -> ms
}
|
||||
|
||||
} // namespace kvc2
|
77
csrc/balance_serve/kvc2/src/metrics.h
Normal file
77
csrc/balance_serve/kvc2/src/metrics.h
Normal file
|
@ -0,0 +1,77 @@
|
|||
#pragma once

#include <atomic>
#include <chrono>
#include <memory>
#include <string>
#include <thread>
#include <vector>
#include "prometheus/counter.h"
#include "prometheus/exposer.h"
#include "prometheus/gauge.h"
#include "prometheus/histogram.h"
#include "prometheus/registry.h"

#include "utils/timer.hpp"

namespace kvc2 {

// Metric name prefix macro
#define METRIC_PREFIX "kvc2"

struct MetricsConfig {
  std::string endpoint;  // listen endpoint, e.g. "0.0.0.0:8080"
};

// Owns the prometheus registry/exposer and all KVC2 metric handles.
class Metrics {
 public:
  // Construct from a MetricsConfig; starts exposing on its endpoint.
  Metrics(const MetricsConfig& config);
  ~Metrics();

  // Non-copyable, non-assignable
  Metrics(const Metrics&) = delete;
  Metrics& operator=(const Metrics&) = delete;

  // Metric pointers (owned by the registry; valid for its lifetime)
  prometheus::Counter* prefix_nodes;
  prometheus::Counter* prefix_block_count;

  prometheus::Histogram* raw_insert_time_ms;
  prometheus::Histogram* lookup_time_ms;
  prometheus::Histogram* lookup_prefixmatch_length;
  prometheus::Histogram* matched_length_percentage;

  prometheus::Gauge* disk_usage;

  // Labelled gauges: each call resolves (or creates) the {"type", type} series.
  prometheus::Gauge* memory_pool_size(const std::string& type);
  prometheus::Gauge* memory_pool_node_count(const std::string& type);

  prometheus::Gauge* lru_entry_count(const std::string& type);
  prometheus::Gauge* gpu_page_count(std::string type);

  prometheus::Histogram* append_tokens_time_ms;
  prometheus::Histogram* gpu_flush_back_time_ms;
  prometheus::Histogram* cpu_flush_back_time_ms;

 private:
  std::shared_ptr<prometheus::Registry> registry_;
  prometheus::Exposer exposer_;

  prometheus::Family<prometheus::Gauge>* memory_pool_size_family_;
  prometheus::Family<prometheus::Gauge>* memory_pool_node_count_family_;
  prometheus::Family<prometheus::Gauge>* lru_entry_count_family_;
  prometheus::Family<prometheus::Gauge>* gpu_page_count_family_;
};

// RAII scope timer: observes the elapsed time (ms) into the given histogram
// when it goes out of scope.
class TimeObserver {
 public:
  TimeObserver(prometheus::Histogram* h);
  ~TimeObserver();

 private:
  Timer timer_;
  prometheus::Histogram* histogram_;
};

}  // namespace kvc2
|
119
csrc/balance_serve/kvc2/src/model_config.h
Normal file
119
csrc/balance_serve/kvc2/src/model_config.h
Normal file
|
@ -0,0 +1,119 @@
|
|||
#ifndef __MODEL_CONFIG_HPP_
#define __MODEL_CONFIG_HPP_

#include "nlohmann/json.hpp"
#include <iostream>

#include <filesystem>
#include <fstream>
// NOTE(review): std::map / std::string are used below via transitive
// includes only; consider adding <map> and <string> explicitly.

using DimSize = size_t;
using URL = std::string;
using ModelName = std::string;

// We must assure this can be load by config.json
class ModelConfig {
public:
  DimSize hidden_size;
  DimSize intermediate_size;
  size_t max_position_embeddings;
  std::string model_type;
  size_t num_attention_heads;
  size_t num_hidden_layers;
  size_t num_key_value_heads;
  size_t vocab_size;

  NLOHMANN_DEFINE_TYPE_INTRUSIVE(ModelConfig, hidden_size, intermediate_size,
                                 max_position_embeddings, model_type,
                                 num_attention_heads, num_hidden_layers,
                                 num_key_value_heads, vocab_size);

  // Populate this config from a HuggingFace-style config.json file.
  void load_from(std::filesystem::path path) {
    std::cout << "Load from " << path << std::endl;
    std::ifstream i(path);
    nlohmann::json j;
    i >> j;
    *this = j.get<ModelConfig>();
  }
};

using QuantType = std::string;
static const QuantType NoQuantType = "";

// Describes one quantization format and its storage layout.
class QuantConfig {
public:
  QuantType name;

  // For GEMV
  QuantType type_of_dot_vector = NoQuantType;
  // A quant type can serve as a matrix operand only if it declares a
  // companion vector type for the dot product.
  inline bool can_be_used_as_matrix() {
    return type_of_dot_vector != NoQuantType;
  }

  bool can_be_used_as_vector;

  double bytes_per_element;
  bool has_scale;
  bool has_min;

  size_t block_element_count;  // elements per quantization block
  size_t block_element_size;   // bytes per quantization block

  URL reference = "";

  NLOHMANN_DEFINE_TYPE_INTRUSIVE_WITH_DEFAULT(QuantConfig, name,
                                              type_of_dot_vector,
                                              can_be_used_as_vector,
                                              bytes_per_element, has_scale,
                                              has_min, block_element_count,
                                              block_element_size, reference);
};

// Global registries, populated by the load_* helpers below.
inline std::map<QuantType, QuantConfig> quant_configs;
inline std::map<ModelName, ModelConfig> model_configs;

// Load the quant-config registry from a JSON file; a missing file only logs.
inline void load_quant_configs(std::filesystem::path path) {
  nlohmann::json j;
  if (std::filesystem::exists(path)) {
    std::cout << __FUNCTION__ << " from " << path << std::endl;
    std::ifstream i(path);
    i >> j;
    quant_configs = j.get<std::map<QuantType, QuantConfig>>();
    std::cout << "Loaded Quant Configs" << std::endl;
    for (auto &[k, v] : quant_configs) {
      std::cout << " - " << k << std::endl;
    }
  } else {
    std::cout << __FUNCTION__ << " no file at " << path << std::endl;
  }
}

// Serialize the quant-config registry to a JSON file.
inline void dump_quant_configs(std::filesystem::path path) {
  std::ofstream o(path);
  nlohmann::json j = quant_configs;
  o << j.dump(4);
}

// Load the model-config registry from a JSON file; a missing file only logs.
inline void load_model_configs(std::filesystem::path path) {
  nlohmann::json j;
  if (std::filesystem::exists(path)) {
    std::cout << __FUNCTION__ << " from " << path << std::endl;
    std::ifstream i(path);
    i >> j;
    model_configs = j.get<std::map<ModelName, ModelConfig>>();
    std::cout << "Loaded Model Configs" << std::endl;
    for (auto &[k, v] : model_configs) {
      std::cout << " - " << k << std::endl;
    }
  } else {
    std::cout << __FUNCTION__ << " no file at " << path << std::endl;
  }
}

// Serialize the model-config registry to a JSON file.
inline void dump_model_configs(std::filesystem::path path) {
  std::ofstream o(path);
  nlohmann::json j = model_configs;
  o << j.dump(4);
}

#endif
|
125
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.cpp
Normal file
125
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.cpp
Normal file
|
@ -0,0 +1,125 @@
|
|||
#include "page_aligned_memory_pool.h"
|
||||
|
||||
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
|
||||
#define FMT_HEADER_ONLY
|
||||
#include "spdlog/spdlog.h"
|
||||
|
||||
#include "utils/arithmetic.hpp"
|
||||
#include "utils/easy_format.hpp"
|
||||
|
||||
/// Constructor: round `size_in_bytes` down to whole pages and carve the
/// allocation into `Blocks` sub-pools, each with its own bitmap (and lock,
/// used by the allocation paths).
PageAlignedMemoryPool::PageAlignedMemoryPool(size_t size_in_bytes) {
  total_size = (size_in_bytes / PageSize) * PageSize;
  // Aligned allocation. C++17 aligned operator new; switch to another
  // mechanism if the compiler lacks support.
  data = ::operator new[](total_size, std::align_val_t(PageSize));
  total_pages = total_size / PageSize;

  assert(total_pages >= Blocks);
  page_per_block = total_pages / Blocks;

  for (size_t block_index = 0; block_index < Blocks; block_index++) {
    first_page[block_index] = reinterpret_cast<void*>(reinterpret_cast<intptr_t>(data) +
                                                      static_cast<intptr_t>(block_index) * page_per_block * PageSize);
    // The last block absorbs the remainder pages.
    count_page[block_index] =
        block_index == Blocks - 1 ? (total_pages - page_per_block * (Blocks - 1)) : page_per_block;
    SPDLOG_DEBUG("first_page[{}] = {}, count_page[{}] = {}", block_index,
                 reinterpret_cast<intptr_t>(first_page[block_index]) - reinterpret_cast<intptr_t>(data), block_index,
                 count_page[block_index]);
    bitmap[block_index].resize(count_page[block_index], 0);
  }
  SPDLOG_INFO("PageAlignedMemoryPool with size {} Mbytes, {} pages", total_size / (1 << 20), page_count());
}
|
||||
|
||||
/// 析构函数
|
||||
PageAlignedMemoryPool::~PageAlignedMemoryPool() {
|
||||
if (data) {
|
||||
// 注意:需要与分配时的对齐方式对应
|
||||
::operator delete[](data, std::align_val_t(PageSize));
|
||||
data = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
/// Total number of pages managed by the pool.
size_t PageAlignedMemoryPool::page_count() {
  return total_size / PageSize;
}
|
||||
|
||||
/// Round `size` up to the next whole-page multiple, in bytes.
size_t PageAlignedMemoryPool::page_padded_size(size_t size) {
  return div_up(size, PageSize) * PageSize;
}
|
||||
|
||||
/// First-fit scan for `alloc_size` contiguous free pages inside one block,
/// performed under that block's mutex. On success the pages are marked used
/// and the start address is returned; nullptr means no contiguous run of the
/// required length exists in this block.
void* PageAlignedMemoryPool::alloc_in_block(size_t block_index, size_t alloc_size) {
  std::lock_guard<std::mutex> guard(lock[block_index]);
  size_t free_pages = 0;  // length of the current run of consecutive free pages
  for (size_t i = 0; i < count_page[block_index]; i++) {
    if (bitmap[block_index][i] == 0) {
      free_pages++;
      if (free_pages == alloc_size) {
        // Run found: it ends at page i, so it starts at i + 1 - alloc_size.
        size_t page_index = i + 1 - free_pages;
        for (size_t page = page_index; page < page_index + alloc_size; page++) {
          bitmap[block_index][page] = 1;
          // SPDLOG_DEBUG("alloc page {} in block {}", page, block_index);
        }
        return reinterpret_cast<void*>(reinterpret_cast<intptr_t>(first_page[block_index]) + page_index * PageSize);
      }
    } else {
      free_pages = 0;  // run broken by an in-use page
    }
  }
  return nullptr;
}
|
||||
|
||||
/// Allocate `size` bytes (rounded up to whole pages) from any block. The
/// starting block rotates via the relaxed `now_block` counter to spread
/// contention across block mutexes; every block is tried once. Returns
/// nullptr when no block has a large-enough contiguous free run (an
/// allocation never spans two blocks).
void* PageAlignedMemoryPool::alloc(size_t size) {
  size_t alloc_size = div_up(size, PageSize);  // pages needed
  auto cnt = now_block.fetch_add(1, std::memory_order_relaxed);
  for (size_t i = 0; i < Blocks; i++) {
    auto result = alloc_in_block((i + cnt) % Blocks, alloc_size);
    if (result != nullptr) {
      allocated.fetch_add(alloc_size * PageSize, std::memory_order_relaxed);
      alloc_count.fetch_add(1, std::memory_order_relaxed);
      return result;
    }
  }
  return nullptr;
}
|
||||
|
||||
/// 释放函数
|
||||
void PageAlignedMemoryPool::free(void* p, size_t size) {
|
||||
auto alloc_size = div_up(size, PageSize);
|
||||
size_t block_index = (reinterpret_cast<intptr_t>(p) - reinterpret_cast<intptr_t>(data)) / page_per_block / PageSize;
|
||||
size_t page_index = (reinterpret_cast<intptr_t>(p) - reinterpret_cast<intptr_t>(first_page[block_index])) / PageSize;
|
||||
|
||||
std::lock_guard<std::mutex> guard(lock[block_index]);
|
||||
|
||||
for (size_t page = page_index; page < page_index + alloc_size; page++)
|
||||
bitmap[block_index][page] = 0;
|
||||
|
||||
allocated.fetch_sub(alloc_size * PageSize, std::memory_order_relaxed);
|
||||
free_count.fetch_add(1, std::memory_order_relaxed);
|
||||
}
|
||||
// TODO: too slow
|
||||
std::vector<void*> PageAlignedMemoryPool::alloc_multiple(size_t size, size_t count) {
|
||||
std::vector<void*> result;
|
||||
for (size_t i = 0; i < count; i++) {
|
||||
auto p = alloc(size);
|
||||
if (p == nullptr) {
|
||||
for (auto ptr : result) {
|
||||
free(ptr, size);
|
||||
}
|
||||
return {};
|
||||
}
|
||||
result.push_back(p);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/// No-op placeholder; page compaction is not implemented.
void PageAlignedMemoryPool::defragment() {}
|
||||
|
||||
/// One-line human-readable summary of pool usage.
/// NOTE(review): readable_number() already appends a K/M/G suffix, so the
/// literal "MB" label that follows it is likely misleading — confirm the
/// intended output format.
std::string PageAlignedMemoryPool::debug() {
  return fmt::format("PageAlignedMemoryPool: total_size: {}MB, allocated: {}, alloc/free count: {}/{}\n",
                     readable_number(total_size), readable_number(size_t(allocated)), size_t(alloc_count),
                     size_t(free_count));
}
|
54
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.h
Normal file
54
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.h
Normal file
|
@ -0,0 +1,54 @@
|
|||
#pragma once
|
||||
|
||||
#include <assert.h>
|
||||
#include <algorithm> // std::sort
|
||||
#include <atomic>
|
||||
#include <bitset>
|
||||
#include <cstddef> // size_t
|
||||
#include <mutex> // std::mutex
|
||||
#include <vector>
|
||||
|
||||
constexpr size_t PageSize = 4096;
|
||||
|
||||
/// Thread-safe fixed-capacity pool handing out page-aligned, page-granular
/// allocations from one big backing buffer. The buffer is split into `Blocks`
/// sub-regions, each guarded by its own mutex and tracked by a per-page use
/// bitmap, so allocations in different blocks do not contend on a lock.
struct PageAlignedMemoryPool {
 private:
  constexpr static size_t Blocks = 16;  // number of independently locked sub-regions

  void* data = nullptr;  // backing buffer, aligned to PageSize

  size_t total_size = 0, total_pages = 0;  // bytes / pages actually managed

  std::atomic_size_t now_block = 0;    // round-robin cursor used by alloc()
  std::atomic_size_t allocated = 0;    // allocated_size: bytes currently handed out
  std::atomic_size_t alloc_count = 0;  // lifetime count of successful alloc() calls
  std::atomic_size_t free_count = 0;   // lifetime count of free() calls

  std::mutex lock[Blocks];             // one mutex per block
  size_t page_per_block = 0;           // pages per block (last block may own more)
  void* first_page[Blocks];            // start address of each block
  size_t count_page[Blocks];           // number of pages owned by each block
  std::vector<int8_t> bitmap[Blocks];  // per-page use flags: 0 = free, 1 = used
  void* alloc_in_block(size_t block_index, size_t alloc_size);

 public:
  /// Constructor and destructor
  explicit PageAlignedMemoryPool(size_t size_in_bytes);
  ~PageAlignedMemoryPool();

  /// Non-copyable and non-movable: owns raw memory and an array of mutexes.
  PageAlignedMemoryPool(PageAlignedMemoryPool&& other) = delete;
  PageAlignedMemoryPool& operator=(PageAlignedMemoryPool&& other) = delete;
  PageAlignedMemoryPool(const PageAlignedMemoryPool& other) = delete;
  PageAlignedMemoryPool& operator=(const PageAlignedMemoryPool& other) = delete;

  /// Member functions
  size_t page_count();
  size_t page_padded_size(size_t size);

  void* alloc(size_t size);
  std::vector<void*> alloc_multiple(size_t size, size_t count);
  void free(void* data, size_t size);
  void defragment();
  std::string debug();
};
|
1744
csrc/balance_serve/kvc2/src/prefix.cpp
Normal file
1744
csrc/balance_serve/kvc2/src/prefix.cpp
Normal file
File diff suppressed because it is too large
Load diff
3
csrc/balance_serve/kvc2/src/utils/all.hpp
Normal file
3
csrc/balance_serve/kvc2/src/utils/all.hpp
Normal file
|
@ -0,0 +1,3 @@
|
|||
#pragma once
|
||||
#include "easy_format.hpp"
|
||||
#include "timer.hpp"
|
14
csrc/balance_serve/kvc2/src/utils/arithmetic.hpp
Normal file
14
csrc/balance_serve/kvc2/src/utils/arithmetic.hpp
Normal file
|
@ -0,0 +1,14 @@
|
|||
#include <cstdint>
#include <memory>
#include <type_traits>
|
||||
|
||||
// Integer ceiling division: smallest integer q with q * by >= x
// (for non-negative x and positive by).
template <typename T, typename U>
T div_up(T x, U by) {
  static_assert(std::is_integral_v<T>);
  static_assert(std::is_integral_v<U>);
  T quotient = (x + by - 1) / by;
  return quotient;
}
|
||||
|
||||
// Advance pointer `t` by `n` raw bytes, preserving the pointee type.
// Round-trips through uintptr_t rather than size_t: uintptr_t is the integer
// type guaranteed wide enough to hold a pointer value.
template <typename T>
T* offset_by_bytes(T* t, size_t n) {
  return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(t) + n);
}
|
37
csrc/balance_serve/kvc2/src/utils/easy_format.hpp
Normal file
37
csrc/balance_serve/kvc2/src/utils/easy_format.hpp
Normal file
|
@ -0,0 +1,37 @@
|
|||
#ifndef __EASY_FORMAT_HPP_
|
||||
#define __EASY_FORMAT_HPP_
|
||||
#include <array>
|
||||
#include <iomanip>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
#include <vector>
|
||||
|
||||
// Join the elements of `v` with ", ". An empty vector renders as "[]".
// NOTE(review): non-empty output carries no surrounding brackets, so the two
// cases render inconsistently — confirm whether that is intended.
template <typename T>
inline std::string format_vector(const std::vector<T>& v) {
  if (v.empty())
    return "[]";
  std::ostringstream joined;
  bool first = true;
  for (const auto& item : v) {
    if (!first)
      joined << ", ";  // comma separator between elements
    joined << item;
    first = false;
  }
  return joined.str();
}
|
||||
|
||||
inline std::array<std::string, 7> units = {"", "K", "M", "G", "T", "P", "E"};

// Render `size` with two decimals and a metric suffix (K/M/G/...),
// scaling by powers of 1000.
inline std::string readable_number(size_t size) {
  double value = static_cast<double>(size);
  size_t unit = 0;
  for (; value >= 1000 && unit + 1 < units.size(); ++unit) {
    value /= 1000;
  }
  std::ostringstream formatted;
  formatted << std::fixed << std::setprecision(2) << value;
  return formatted.str() + units[unit];
}
|
||||
#endif
|
60
csrc/balance_serve/kvc2/src/utils/lock_free_queue.hpp
Normal file
60
csrc/balance_serve/kvc2/src/utils/lock_free_queue.hpp
Normal file
|
@ -0,0 +1,60 @@
|
|||
#include <atomic>
|
||||
#include <future>
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
/// Multi-producer single-consumer FIFO queue over a singly linked list with a
/// dummy node. Producers atomically swing `head` to claim a slot and then
/// publish the link from the previous head; the single consumer chases
/// `tail`. dequeue() must only ever be called from one thread.
/// NOTE(review): utils/mpsc.hpp declares a different class with this same
/// name `MPSCQueue`; including both headers in one translation unit would
/// violate the ODR — confirm they are never combined. This header also has no
/// include guard / #pragma once.
template <typename T>
class MPSCQueue {
  struct Node {
    std::shared_ptr<T> data;
    std::atomic<Node*> next;  // link to the next (newer) node; nullptr at the head

    Node() : next(nullptr) {}
    Node(std::shared_ptr<T> data_) : data(std::move(data_)), next(nullptr) {}
  };

  std::atomic<Node*> head;  // newest node; producers exchange this
  Node* tail;               // oldest (dummy) node; touched only by the consumer

 public:
  std::atomic_size_t enqueue_count = 0;  // total items enqueued (all producers)
  size_t dequeue_count = 0;              // total items dequeued (consumer only)
  MPSCQueue() {
    Node* dummy = new Node();
    head.store(dummy, std::memory_order_relaxed);
    tail = dummy;
  }

  ~MPSCQueue() {
    // Reclaim any nodes still linked in the queue.
    Node* node = tail;
    while (node) {
      Node* next = node->next.load(std::memory_order_relaxed);
      delete node;
      node = next;
    }
  }

  // Called by producers (any thread).
  void enqueue(std::shared_ptr<T> data) {
    enqueue_count.fetch_add(1);
    Node* node = new Node(std::move(data));
    // Claim the head slot, then publish the link from the previous head.
    Node* prev_head = head.exchange(node, std::memory_order_acq_rel);
    prev_head->next.store(node, std::memory_order_release);
  }

  // Called by the single consumer. Returns nullptr when the queue looks empty
  // (including the window where a producer has exchanged head but not yet
  // stored the link).
  std::shared_ptr<T> dequeue() {
    Node* next = tail->next.load(std::memory_order_acquire);
    if (next) {
      std::shared_ptr<T> res = std::move(next->data);
      delete tail;  // the old dummy/consumed node is retired here
      tail = next;
      dequeue_count += 1;
      return res;
    }
    return nullptr;
  }
};
|
90
csrc/balance_serve/kvc2/src/utils/mpsc.hpp
Normal file
90
csrc/balance_serve/kvc2/src/utils/mpsc.hpp
Normal file
|
@ -0,0 +1,90 @@
|
|||
#include <atomic>
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
#include <optional>
|
||||
#include <semaphore>
|
||||
|
||||
/// Multi-producer single-consumer FIFO queue storing T by value, built on a
/// singly linked list with a dummy node; all atomic operations use seq_cst
/// ordering. dequeue() must only ever be called from one thread.
/// NOTE(review): utils/lock_free_queue.hpp declares a different class with
/// this same name `MPSCQueue`; including both headers in one translation unit
/// would violate the ODR — confirm they are never combined. This header also
/// has no include guard / #pragma once.
template <typename T>
class MPSCQueue {
  struct Node {
    T data;
    std::atomic<Node*> next;  // link to the next (newer) node; nullptr at the head

    Node() : next(nullptr) {}
    Node(T data_) : data(std::move(data_)), next(nullptr) {}
  };

  std::atomic<Node*> head;  // newest node; producers exchange this
  Node* tail;               // oldest (dummy) node; touched only by the consumer

 public:
  std::atomic_size_t enqueue_count = 0;  // total items enqueued (all producers)
  size_t dequeue_count = 0;              // total items dequeued (consumer only)
  MPSCQueue() {
    Node* dummy = new Node();
    head.store(dummy, std::memory_order_seq_cst);
    tail = dummy;
  }

  ~MPSCQueue() {
    // Reclaim any nodes still linked in the queue.
    Node* node = tail;
    while (node) {
      Node* next = node->next.load(std::memory_order_seq_cst);
      delete node;
      node = next;
    }
  }

  // Called by producers (any thread).
  void enqueue(T data) {
    enqueue_count.fetch_add(1);
    Node* node = new Node(std::move(data));
    // Claim the head slot, then publish the link from the previous head.
    Node* prev_head = head.exchange(node, std::memory_order_seq_cst);
    prev_head->next.store(node, std::memory_order_seq_cst);
  }

  // Called by the single consumer; nullopt when nothing is linked in yet.
  std::optional<T> dequeue() {
    Node* next = tail->next.load(std::memory_order_seq_cst);
    if (next) {
      T res = std::move(next->data);
      delete tail;  // the old dummy/consumed node is retired here
      tail = next;
      dequeue_count += 1;
      return res;
    }
    return std::nullopt;
  }

  // Approximate item count; racy, as enqueue_count may advance concurrently.
  size_t size() { return enqueue_count.load() - dequeue_count; }
};
|
||||
|
||||
/// Blocking wrapper over MPSCQueue: a counting semaphore mirrors the number
/// of published items so the single consumer can sleep in dequeue() until an
/// item is available, while producers stay non-blocking.
template <typename T>
class MPSCQueueConsumerLock {
  MPSCQueue<T> queue;
  std::counting_semaphore<> sema{0};  // permits = items published but not yet consumed

 public:
  // Producer side: enqueue the item, then release one permit.
  void enqueue(T data) {
    queue.enqueue(std::move(data));
    // std::atomic_thread_fence(std::memory_order_seq_cst);// Inserting this because the memory order might be wrong, I
    // am also not that sure about this.
    sema.release();
  }

  // Consumer side: blocks until an item is available.
  T dequeue() {
    auto re = queue.dequeue();
    if (re.has_value()) {
      // An item was visible, so its matching permit has been (or is about to
      // be) released; spin until we can consume that permit to keep the
      // semaphore count in step with the queue.
      while (sema.try_acquire() == false) {
        std::cerr << __FILE__ << ":" << __FUNCTION__ << " sema try acquire should be success, retrying, please check"
                  << std::endl;
        // assert(false);
      }
      return re.value();
    }
    // Queue looked empty: block until a producer releases a permit.
    // NOTE(review): if the producer's link store were not yet visible after
    // acquire(), this dequeue() could still return nullopt and .value() would
    // throw — confirm the ordering makes that impossible.
    sema.acquire();
    return queue.dequeue().value();
  }

  size_t size() { return queue.size(); }
};
|
70
csrc/balance_serve/kvc2/src/utils/mutex_extend.hpp
Normal file
70
csrc/balance_serve/kvc2/src/utils/mutex_extend.hpp
Normal file
|
@ -0,0 +1,70 @@
|
|||
#ifndef __MUTEX_EXTEND_HPP_
|
||||
#define __MUTEX_EXTEND_HPP_
|
||||
|
||||
#include <atomic>
|
||||
#include <chrono>
|
||||
#include <iostream>
|
||||
#include <thread>
|
||||
|
||||
/// Mutex that detects (instead of deadlocking on) re-entry by the owning
/// thread: try_lock() fails and lock() throws if the calling thread already
/// holds it, and unlock() throws if called by a non-owner. Ownership is
/// tracked in an atomic thread id alongside the real std::mutex.
class non_recursive_mutex {
 public:
  non_recursive_mutex() = default;

  // Non-blocking acquire that refuses recursive use by the same thread.
  bool try_lock() {
    std::thread::id this_id = std::this_thread::get_id();

    // Check whether the current thread already holds the lock.
    if (owner.load(std::memory_order_acquire) == this_id) {
      return false;  // recursive attempt: report failure
    }

    // Attempt to take the underlying lock.
    if (mtx.try_lock()) {
      owner.store(this_id, std::memory_order_release);  // record the new owner
      return true;
    }

    return false;
  }

  // Blocks until the lock is acquired; throws on recursive use.
  void lock() {
    std::thread::id this_id = std::this_thread::get_id();

    while (true) {
      // Check whether the current thread already holds the lock.
      if (owner.load(std::memory_order_acquire) == this_id) {
        throw std::runtime_error("Thread is trying to lock a mutex it already holds");
      }

      // Attempt to take the underlying lock.
      if (mtx.try_lock()) {
        owner.store(this_id, std::memory_order_release);  // record the new owner
        return;
      }

      // Lock not acquired: yield briefly to avoid busy-waiting.
      std::this_thread::yield();
    }
  }

  // Release the lock; only the owning thread may unlock.
  void unlock() {
    std::thread::id this_id = std::this_thread::get_id();

    // Ensure only the holder releases the lock.
    if (owner.load(std::memory_order_acquire) == this_id) {
      owner.store(std::thread::id(), std::memory_order_release);  // clear the owner
      mtx.unlock();
    } else {
      throw std::runtime_error("Thread attempting to unlock a mutex it doesn't own");
    }
  }

 private:
  std::mutex mtx;                      // the underlying mutex
  std::atomic<std::thread::id> owner;  // id of the thread currently holding mtx
};
|
||||
|
||||
#endif
|
102
csrc/balance_serve/kvc2/src/utils/periodic_task.hpp
Normal file
102
csrc/balance_serve/kvc2/src/utils/periodic_task.hpp
Normal file
|
@ -0,0 +1,102 @@
|
|||
#ifndef PERIODIC_TASK_HPP
|
||||
#define PERIODIC_TASK_HPP
|
||||
|
||||
#include <atomic>
|
||||
#include <chrono>
|
||||
#include <condition_variable>
|
||||
#include <cstdio>
|
||||
#include <functional>
|
||||
#include <future>
|
||||
#include <iostream>
|
||||
#include <mutex>
|
||||
#include <stop_token>
|
||||
#include <thread>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace periodic {
|
||||
|
||||
/// Runs `func` on a background std::jthread every `interval_ms`. Two extra
/// entry points: wakeUp() triggers an immediate run, and wakeUpWait() returns
/// a future fulfilled after the next run completes. The worker stops
/// cooperatively via the jthread's stop_token when the task is destroyed.
class PeriodicTask {
 public:
  explicit PeriodicTask(std::function<void()> func,
                        std::chrono::milliseconds interval_ms = std::chrono::milliseconds(100))
      : func_(std::move(func)), interval_(interval_ms), worker_([this](std::stop_token stoken) { this->run(stoken); }) {
    // std::cout << "PeriodicTask created with interval: " << interval_.count() << " ms" << std::endl;
  }

  ~PeriodicTask() {
    worker_.request_stop();
    cv_.notify_one();  // Ensure worker wakes up when destroyed
    // std::cout << "PeriodicTask destructor called, stopping worker." << std::endl;
  }

  /// Request an immediate run of the task (non-blocking).
  void wakeUp() {
    {
      std::lock_guard<std::mutex> lock(wakeup_mutex_);
      wake_up_requested_ = true;
    }
    cv_.notify_one();  // Notify worker thread to wake up immediately
    // std::cout << "wakeUp() called: worker thread will wake up." << std::endl;
  }

  /// Request an immediate run and obtain a future that is fulfilled after the
  /// next run (and its promise notification) finishes.
  std::future<void> wakeUpWait() {
    std::promise<void> promise;
    std::future<void> future = promise.get_future();
    {
      std::lock_guard<std::mutex> lock(promise_mutex_);
      wakeup_promises_.push_back(std::move(promise));
    }
    wakeUp();
    return future;
  }

 private:
  // Worker loop: wait out the interval (or an explicit wake-up), run the
  // task, then fulfil any promises registered via wakeUpWait().
  // NOTE(review): cv_.wait_for() waits on mutex_ while wake_up_requested_ is
  // guarded by wakeup_mutex_; a wake-up landing between the predicate check
  // and the wait could, in principle, be delayed by up to one interval —
  // confirm that latency is acceptable.
  void run(std::stop_token stoken) {
    while (!stoken.stop_requested()) {
      std::unique_lock lock(mutex_);
      // Wait for either the time interval or a wake-up signal
      cv_.wait_for(lock, interval_, [this] { return wake_up_requested_.load(); });

      if (stoken.stop_requested())
        break;

      // If the wake-up was triggered, reset the flag and process the task
      {
        std::lock_guard<std::mutex> lock(wakeup_mutex_);
        wake_up_requested_ = false;
      }

      try {
        // std::cout << "Running task function." << std::endl;
        func_();
      } catch (...) {
        std::cerr << "Error in task function." << std::endl;
      }

      notifyPromises();
    }
  }

  // Fulfil and clear every promise handed out by wakeUpWait().
  void notifyPromises() {
    std::lock_guard<std::mutex> lock(promise_mutex_);
    // std::cout << "Notifying all waiting promises." << std::endl;
    for (auto& promise : wakeup_promises_) {
      promise.set_value();
    }
    wakeup_promises_.clear();
  }

  std::function<void()> func_;          // the task body
  std::chrono::milliseconds interval_;  // periodic execution interval
  std::mutex mutex_;                    // paired with cv_ for the timed wait
  std::condition_variable cv_;
  std::vector<std::promise<void>> wakeup_promises_;  // fulfilled after each run
  std::mutex promise_mutex_;                         // guards wakeup_promises_
  std::mutex wakeup_mutex_;                          // guards wake_up_requested_
  std::atomic<bool> wake_up_requested_ = false;
  std::jthread worker_;                              // auto-joining worker thread
};
|
||||
|
||||
} // namespace periodic
|
||||
|
||||
#endif // PERIODIC_TASK_HPP
|
36
csrc/balance_serve/kvc2/src/utils/spin_lock.hpp
Normal file
36
csrc/balance_serve/kvc2/src/utils/spin_lock.hpp
Normal file
|
@ -0,0 +1,36 @@
|
|||
/*
|
||||
* @Author: Xie Weiyu ervinxie@qq.com
|
||||
* @Date: 2024-11-21 06:35:47
|
||||
* @LastEditors: Xie Weiyu ervinxie@qq.com
|
||||
* @LastEditTime: 2024-11-21 06:35:50
|
||||
* @FilePath: /kvc2/src/utils/spin_lock.hpp
|
||||
* @Description: 这是默认设置,请设置`customMade`, 打开koroFileHeader查看配置 进行设置:
|
||||
* https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE
|
||||
*/
|
||||
|
||||
#include <atomic>
|
||||
#include <chrono>
|
||||
#include <thread>
|
||||
|
||||
// Test-and-set spin lock that sleeps with exponential backoff (capped at
// 1024 us) while contended, trading latency for reduced CPU burn.
class SpinLock {
 public:
  SpinLock() { flag.clear(); }

  void lock() {
    constexpr int kMaxBackoffUs = 1024;  // backoff ceiling, microseconds
    int backoff_us = 1;                  // current sleep, microseconds

    while (flag.test_and_set(std::memory_order_acquire)) {
      std::this_thread::sleep_for(std::chrono::microseconds(backoff_us));
      backoff_us *= 2;  // double the wait each failed attempt
      if (backoff_us > kMaxBackoffUs) {
        backoff_us = kMaxBackoffUs;
      }
    }
  }

  void unlock() { flag.clear(std::memory_order_release); }

 private:
  std::atomic_flag flag = ATOMIC_FLAG_INIT;
};
|
128
csrc/balance_serve/kvc2/src/utils/timer.hpp
Normal file
128
csrc/balance_serve/kvc2/src/utils/timer.hpp
Normal file
|
@ -0,0 +1,128 @@
|
|||
#pragma once
|
||||
#include <cassert>
|
||||
#include <chrono>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include "easy_format.hpp"
|
||||
|
||||
// Format `value` with exactly two digits after the decimal point.
inline std::string doubleToStringR2(double value) {
  std::ostringstream formatted;
  formatted << std::fixed << std::setprecision(2) << value;
  return formatted.str();
}
|
||||
|
||||
/// Wall-clock stopwatch built on high_resolution_clock, with two modes:
///  - Scoped: `Timer t("label");` starts immediately and prints
///    "<label> <ms> ms" on destruction.
///  - Manual: default-construct, then pair start()/stop(); each stop()
///    accumulates the span into a running total (see runningTime()/merge()).
class Timer {
 public:
  std::string name;        // label printed by the scoped-mode destructor
  bool tmp_timer = false;  // true only for the scoped (named) constructor

  Timer() {}
  Timer(std::string name) : name(name), tmp_timer(true) { start(); }
  ~Timer() {
    if (tmp_timer) {
      std::cout << name << " " << elapsedMs() << " ms" << std::endl;
    }
  }

  /// Begin a measurement span; the timer must not already be running.
  void start() {
    m_startTime = std::chrono::high_resolution_clock::now();
    assert(m_isRunning == false);
    m_isRunning = true;
  }

  /// End the current span and add its duration to the running total.
  void stop() {
    m_endTime = std::chrono::high_resolution_clock::now();
    assert(m_isRunning == true);
    m_isRunning = false;
    m_runningNs += elapsedNs();
  }

  /// Nanoseconds of the current span: measured up to now while running,
  /// otherwise the span bounded by the last stop().
  double elapsedNs() {
    std::chrono::time_point<std::chrono::high_resolution_clock> endTime;

    if (m_isRunning) {
      endTime = std::chrono::high_resolution_clock::now();
    } else {
      endTime = m_endTime;
    }

    return std::chrono::duration_cast<std::chrono::nanoseconds>(endTime - m_startTime).count();
  }

  void printElapsedMilliseconds() { std::cout << elapsedNs() / 1e6 << " ms" << std::endl; }

  /// Render a nanosecond duration with an auto-selected unit
  /// (ns / us / ms / sec / min / h), two decimal places.
  static std::string ns_to_string(double duration) {
    auto nano_sec = duration;
    if (nano_sec >= 1000) {
      auto mirco_sec = nano_sec / 1000.0;
      if (mirco_sec >= 1000) {
        auto milli_sec = mirco_sec / 1000.0;
        if (milli_sec >= 1000) {
          auto seconds = milli_sec / 1000.0;

          if (seconds >= 60.0) {
            auto minutes = seconds / 60.0;

            if (minutes >= 60.0) {
              auto hours = minutes / 60.0;
              return doubleToStringR2(hours) + " h";
            } else {
              return doubleToStringR2(minutes) + " min";
            }
          } else {
            return doubleToStringR2(seconds) + " sec";
          }
        } else {
          return doubleToStringR2(milli_sec) + " ms";
        }
      } else {
        return doubleToStringR2(mirco_sec) + " us";
      }
    } else {
      return doubleToStringR2(nano_sec) + " ns";
    }
  }

  /// Accumulated total across all completed start()/stop() spans.
  double runningTimeNs() { return m_runningNs; }

  std::string runningTime() {
    auto duration = m_runningNs;
    return ns_to_string(duration);
  }

  std::string elapsedTime() { return ns_to_string(elapsedNs()); }
  double elapsedMs() { return elapsedNs() / 1e6; }
  /// Ops-per-second summary for `op_cnt` operations over the elapsed span.
  std::string report_throughput(size_t op_cnt) {
    double ops = op_cnt / elapsedMs() * 1000;
    return readable_number(ops) + "op/s";
  }

  /// Fold another (stopped) timer's accumulated total into this one.
  void merge(Timer& other) {
    assert(m_isRunning == false);
    assert(other.m_isRunning == false);
    m_runningNs += other.runningTimeNs();
  }

 private:
  std::chrono::time_point<std::chrono::high_resolution_clock> m_startTime;
  std::chrono::time_point<std::chrono::high_resolution_clock> m_endTime;
  bool m_isRunning = false;  // currently inside a start()/stop() span?
  double m_runningNs = 0.0;  // sum of completed spans, nanoseconds
};
|
||||
|
||||
// Named monotonically-increasing counters for ad-hoc instrumentation.
class Counter {
 public:
  Counter() {}

  std::map<std::string, size_t> counters;  // counter name -> accumulated count

  // Add `num` to the counter called `name` (created at zero on first use).
  void inc(const char* name, size_t num) { counters[name] += num; };
  // Write every "name : value" pair to stdout (map order, i.e. by name).
  void print() {
    for (const auto& entry : counters) {
      std::cout << entry.first << " : " << entry.second << std::endl;
    }
  };
};
|
78
csrc/balance_serve/kvc2/test/CMakeLists.txt
Normal file
78
csrc/balance_serve/kvc2/test/CMakeLists.txt
Normal file
|
@ -0,0 +1,78 @@
|
|||
|
||||
# Build configuration for the kvc2 test executables.
# Debug-friendly flags for everything under test/.
set(CMAKE_CXX_FLAGS "-Og -march=native -Wall -Wextra -g -fopenmp")
# set(CMAKE_CXX_FLAGS "-O3 -march=native -Wall -Wextra -pthread")

add_subdirectory(kvc2test)


include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src)

add_executable(hashmap_test hashmap_test.cpp)
target_link_libraries(hashmap_test PRIVATE TBB::tbb)


add_executable(xxHash_test xxHash_test.cpp)
target_link_libraries(xxHash_test PRIVATE xxhash)

# Helper: declare a test executable linked against async_store + gflags,
# with the project's src and vendored header-only include paths.
function(add_async_store_executable source_file)
  get_filename_component(target_name ${source_file} NAME_WE) # file name without extension becomes the target name
  add_executable(${target_name} ${source_file})
  target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src)
  target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/nlohmann/single_include)
  target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/spdlog/include)
  target_link_libraries(${target_name} PRIVATE async_store gflags)
endfunction()

add_async_store_executable(async_store_test.cpp)


# Helper: like add_async_store_executable, but also links the kvc2 library.
function(add_kvc2_executable source_file)
  get_filename_component(target_name ${source_file} NAME_WE) # file name without extension becomes the target name
  add_executable(${target_name} ${source_file})
  # target_compile_options(${target_name} PRIVATE -fopenmp -fno-strict-aliasing)
  target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src)
  target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/nlohmann/single_include)
  target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/spdlog/include)
  target_link_libraries(${target_name} PRIVATE kvc2 async_store gflags)
endfunction()




add_kvc2_executable(test_lock_free_queue.cpp)
add_kvc2_executable(test_queue_perf.cpp)

# Disable deprecated test
# add_kvc2_executable(prefix_test.cpp)
# add_kvc2_executable(kvcache_disk_insert_read_test.cpp)
# add_kvc2_executable(kvcache_mem_eviction_test.cpp)
# add_kvc2_executable(kvcache_mem_insert_read_test.cpp)
# add_kvc2_executable(kvcache_save_load_test.cpp)
# add_kvc2_executable(kvc2_export_header_test.cpp)
# add_kvc2_executable(kvc2_export_load_test.cpp)





# Extra includes/links for async_store_test beyond the helper defaults.
# NOTE(review): the "..//" below has a doubled slash — harmless but worth tidying.
target_include_directories(async_store_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/..//third_party/nlohmann/single_include)
target_include_directories(async_store_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/..//third_party/spdlog/include)
target_link_libraries(async_store_test PRIVATE xxhash)

add_executable(test_std_list test_std_list.cpp)


add_executable(test_cuda_stream test_cuda_stream.cpp)
target_include_directories(test_cuda_stream PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
target_link_libraries(test_cuda_stream PRIVATE CUDA::cudart)

add_executable(test_cuda_stream_manager test_cuda_stream_manager.cpp)
target_include_directories(test_cuda_stream_manager PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src)
target_link_libraries(test_cuda_stream_manager PRIVATE cuda_stream_manager)

add_executable(test_periodic_task test_periodic_task.cpp)
target_include_directories(test_periodic_task PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src)

add_executable(test_page_pool page_pool_test.cpp)
target_include_directories(test_page_pool PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src)
target_include_directories(test_page_pool PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/spdlog/include)
|
11
csrc/balance_serve/kvc2/test/hashmap_test.cpp
Normal file
11
csrc/balance_serve/kvc2/test/hashmap_test.cpp
Normal file
|
@ -0,0 +1,11 @@
|
|||
#include <tbb/concurrent_hash_map.h>
|
||||
#include <iostream>
|
||||
|
||||
int main() {
|
||||
tbb::concurrent_hash_map<int, int> map;
|
||||
map.insert({1, 2});
|
||||
decltype(map)::accessor a;
|
||||
std::cout << map.find(a, 1) << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
87
csrc/balance_serve/kvc2/test/kvc2_export_header_test.cpp
Normal file
87
csrc/balance_serve/kvc2/test/kvc2_export_header_test.cpp
Normal file
|
@ -0,0 +1,87 @@
|
|||
#include "kvc2.h"
|
||||
#include "kvc2_test_utils.cpp"
|
||||
|
||||
// Exercises the kvc2 public header API end to end: insert a 10-block KV
// cache, then read it back under full-match, prefix-match, partial-overlap
// and no-overlap token sequences, comparing the returned blocks each time.
int main(int argc, char* argv[]) {
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  std::mt19937 gen(123);  // fixed seed: reproducible ids and cache contents

  KVC2Config config = {
      .path = FLAGS_disk_cache_path,
      // NOTE(review): hard-coded home path (spelled "conifg") — confirm intended.
      .config_path = std::string("/home/xwy/conifg"),
      .block_length = BlockLength,
      .memory_pool_size = size_t(10e9),
      .evict_count = 20,
  };
  auto kvcc = create_kvc2(config);

  auto io = kvcc->start_io_thread();

  SPDLOG_INFO("Disk Test");
  auto ids = random_ids(10 * BlockLength, gen);
  auto h1 = random_kvcache(qwen_cache_info, 10, gen);
  kvcc->raw_insert(qwen_cache_info, reinterpret_cast<TokenPtr>(ids.data()), ids.size(), h1);

  // complete same: identical token sequence must return every block.
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids.data()), ids.size(), h2);
    cmp_handle_data(qwen_cache_info, h1, h2);
  }

  // complete prefix: a 3-block prefix of the ids must return those 3 blocks.
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    auto ids2 = std::vector<Token>(ids.begin(), ids.begin() + 3 * BlockLength);
    kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);
    cmp_handle_data(qwen_cache_info, h1, h2, 3);
  }

  // common prefix: 5 matching blocks followed by unrelated tokens — only the
  // 5-block common prefix should be served from the cache.
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    auto ids2 = std::vector<Token>(ids.begin(), ids.begin() + 5 * BlockLength);
    auto rids = random_ids(BlockLength * 2 + BlockLength / 2, gen);
    ids2.insert(ids2.end(), rids.begin(), rids.end());

    kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);

    cmp_handle_data(qwen_cache_info, h1, h2, 5);
  }

  // no prefix: a fully unrelated sequence should match nothing.
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);

    auto ids2 = random_ids(10 * BlockLength, gen);

    kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);
  }

  // insert partly new: first 5 blocks reuse h1's data and ids, last 5 are fresh.
  auto h2 = random_kvcache(qwen_cache_info, 10, gen);
  copy_kvcache(h1, h2, 0, 5);
  auto ids2 = random_ids(10 * BlockLength, gen);
  for (size_t i = 0; i < 5 * BlockLength; i++) {
    ids2[i] = ids[i];
  }

  kvcc->raw_insert(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);

  // read new part: 7 blocks (5 reused + 2 fresh) plus one stray token.
  {
    auto h3 = empty_kvcache(qwen_cache_info, 10);
    auto ids3 = std::vector<Token>(ids2.begin(), ids2.begin() + 7 * BlockLength);
    ids3.push_back(123);

    kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids3.data()), ids3.size(), h3);
    cmp_handle_data(qwen_cache_info, h3, h2, 7);
  }
  // Persist the cache index and shut the IO thread down cleanly.
  kvcc->save();
  kvcc->stop_io_thread();
  io.join();

  SPDLOG_WARN("{} Test Passed", __FILE__);

  return 0;
}
|
87
csrc/balance_serve/kvc2/test/kvc2_export_load_test.cpp
Normal file
87
csrc/balance_serve/kvc2/test/kvc2_export_load_test.cpp
Normal file
|
@ -0,0 +1,87 @@
|
|||
#include "kvc2.h"
|
||||
#include "kvc2_test_utils.cpp"
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
init(argc, argv);
|
||||
spdlog::set_level(spdlog::level::debug);
|
||||
std::mt19937 gen(123);
|
||||
|
||||
KVC2Config config = {
|
||||
.path = FLAGS_disk_cache_path,
|
||||
.block_length = BlockLength,
|
||||
.memory_pool_size = size_t(10e9),
|
||||
.evict_count = 20,
|
||||
};
|
||||
auto kvcc = create_kvc2(config);
|
||||
kvcc->load();
|
||||
|
||||
auto io = kvcc->start_io_thread();
|
||||
|
||||
SPDLOG_INFO("Disk Test");
|
||||
auto ids = random_ids(10 * BlockLength, gen);
|
||||
auto h1 = empty_kvcache(qwen_cache_info, 10);
|
||||
// kvcc->raw_insert(qwen_cache_info, reinterpret_cast<IDptr>(ids.data()), ids.size(), h1);
|
||||
|
||||
// complete same
|
||||
{
|
||||
// auto h2 = empty_kvcache(qwen_cache_info, 10);
|
||||
kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids.data()), ids.size(), h1);
|
||||
// cmp_handle_data(qwen_cache_info, h1, h2);
|
||||
}
|
||||
|
||||
// complete prefix
|
||||
{
|
||||
auto h2 = empty_kvcache(qwen_cache_info, 10);
|
||||
auto ids2 = std::vector<Token>(ids.begin(), ids.begin() + 3 * BlockLength);
|
||||
kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);
|
||||
cmp_handle_data(qwen_cache_info, h1, h2, 3);
|
||||
}
|
||||
|
||||
// common prefix
|
||||
{
|
||||
auto h2 = empty_kvcache(qwen_cache_info, 10);
|
||||
auto ids2 = std::vector<Token>(ids.begin(), ids.begin() + 5 * BlockLength);
|
||||
auto rids = random_ids(BlockLength * 2 + BlockLength / 2, gen);
|
||||
ids2.insert(ids2.end(), rids.begin(), rids.end());
|
||||
|
||||
kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);
|
||||
|
||||
cmp_handle_data(qwen_cache_info, h1, h2, 5);
|
||||
}
|
||||
|
||||
// no prefix
|
||||
{
|
||||
auto h2 = empty_kvcache(qwen_cache_info, 10);
|
||||
|
||||
auto ids2 = random_ids(10 * BlockLength, gen);
|
||||
|
||||
kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);
|
||||
}
|
||||
|
||||
// insert partly new
|
||||
auto h2 = random_kvcache(qwen_cache_info, 10, gen);
|
||||
copy_kvcache(h1, h2, 0, 5);
|
||||
auto ids2 = random_ids(10 * BlockLength, gen);
|
||||
for (size_t i = 0; i < 5 * BlockLength; i++) {
|
||||
ids2[i] = ids[i];
|
||||
}
|
||||
|
||||
kvcc->raw_insert(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);
|
||||
|
||||
// read new part
|
||||
{
|
||||
auto h3 = empty_kvcache(qwen_cache_info, 10);
|
||||
auto ids3 = std::vector<Token>(ids2.begin(), ids2.begin() + 7 * BlockLength);
|
||||
ids3.push_back(123);
|
||||
|
||||
kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids3.data()), ids3.size(), h3);
|
||||
cmp_handle_data(qwen_cache_info, h3, h2, 7);
|
||||
}
|
||||
|
||||
kvcc->stop_io_thread();
|
||||
io.join();
|
||||
|
||||
SPDLOG_WARN("{} Test Passed", __FILE__);
|
||||
|
||||
return 0;
|
||||
}
|
117
csrc/balance_serve/kvc2/test/kvc2_test_utils.cpp
Normal file
117
csrc/balance_serve/kvc2/test/kvc2_test_utils.cpp
Normal file
|
@ -0,0 +1,117 @@
|
|||
#include <optional>
|
||||
#include <random>
|
||||
#include "kvc2.h"
|
||||
#define FMT_HEADER_ONLY
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
const int BlockLength = 256;
|
||||
|
||||
std::string FLAGS_disk_cache_path;
|
||||
|
||||
void init(int argc, char* argv[]) {
|
||||
if (argc != 2) {
|
||||
fmt::print("Usage: {} --disk_cache_path=xxx\n", argv[0]);
|
||||
exit(1);
|
||||
}
|
||||
FLAGS_disk_cache_path = argv[1];
|
||||
if (FLAGS_disk_cache_path.empty()) {
|
||||
fmt::print("disk_cache_path is empty");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
using namespace kvc2;
|
||||
|
||||
data_block_ptr empty_block(CacheInfo info) {
|
||||
auto re = new (std::align_val_t(4096)) std::byte[info.element_size(BlockLength)];
|
||||
return reinterpret_cast<data_block_ptr>(re);
|
||||
}
|
||||
|
||||
data_block_ptr random_block(CacheInfo info, std::mt19937& gen) {
|
||||
auto re = empty_block(info);
|
||||
uint64_t* d = (uint64_t*)re;
|
||||
for (size_t i = 0; i < info.element_size(BlockLength) / 8; i++) {
|
||||
d[i] = gen();
|
||||
}
|
||||
return re;
|
||||
}
|
||||
layer_data random_blocks(CacheInfo info, size_t block_count, size_t seed) {
|
||||
std::mt19937 gen(seed);
|
||||
layer_data re;
|
||||
for (size_t i = 0; i < block_count; i++) {
|
||||
re.push_back(random_block(info, gen));
|
||||
}
|
||||
return re;
|
||||
}
|
||||
|
||||
layer_data empty_blocks(CacheInfo info, size_t block_count) {
|
||||
layer_data re;
|
||||
for (size_t i = 0; i < block_count; i++) {
|
||||
re.push_back(empty_block(info));
|
||||
}
|
||||
return re;
|
||||
}
|
||||
|
||||
void copy_kvcache(std::vector<layer_data>& from, std::vector<layer_data>& to, size_t block_start, size_t length) {
|
||||
for (size_t i = 0; i < from.size(); i++) {
|
||||
for (size_t j = 0; j < length; j++) {
|
||||
to[i][block_start + j] = from[i][block_start + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<layer_data> random_kvcache(CacheInfo info, size_t block_count, std::mt19937& gen) {
|
||||
std::vector<layer_data> re;
|
||||
re.resize(info.hidden_layer_count());
|
||||
fmt::print("Generating random kvcache, layer {}\n", info.hidden_layer_count());
|
||||
#pragma omp parallel for
|
||||
for (size_t i = 0; i < info.hidden_layer_count(); i++) {
|
||||
re[i] = random_blocks(info, block_count, gen());
|
||||
}
|
||||
return re;
|
||||
}
|
||||
|
||||
std::vector<layer_data> empty_kvcache(CacheInfo info, size_t block_count) {
|
||||
std::vector<layer_data> re;
|
||||
re.resize(info.hidden_layer_count());
|
||||
fmt::print("Generating empty kvcache, layer {}\n", info.hidden_layer_count());
|
||||
#pragma omp parallel for
|
||||
for (size_t i = 0; i < info.hidden_layer_count(); i++) {
|
||||
re[i] = empty_blocks(info, block_count);
|
||||
}
|
||||
return re;
|
||||
}
|
||||
|
||||
std::vector<Token> random_ids(size_t length, std::mt19937& gen) {
|
||||
std::vector<Token> re;
|
||||
for (size_t i = 0; i < length; i++) {
|
||||
re.push_back(gen());
|
||||
}
|
||||
return re;
|
||||
}
|
||||
|
||||
CacheInfo qwen_cache_info = {
|
||||
.model_name = "qwen2-72b-instruct",
|
||||
.is_key_cache = true,
|
||||
.quant_type = "BF16",
|
||||
};
|
||||
|
||||
void cmp_handle_data(CacheInfo info, std::vector<layer_data>& h1, std::vector<layer_data>& h2,
|
||||
std::optional<size_t> blocks = std::nullopt) {
|
||||
assert(h1.size() == h2.size());
|
||||
|
||||
for (size_t i = 0; i < h1.size(); i++) {
|
||||
auto& b1 = h1[i];
|
||||
auto& b2 = h2[i];
|
||||
if (blocks.has_value() == false) {
|
||||
assert(b1.size() == b2.size());
|
||||
}
|
||||
int cmp_to = blocks.has_value() ? blocks.value() : b1.size();
|
||||
for (int j = 0; j < cmp_to; j++) {
|
||||
auto e1 = reinterpret_cast<void*>(b1[j]);
|
||||
auto e2 = reinterpret_cast<void*>(b2[j]);
|
||||
assert(memcmp(e1, e2, info.element_size(BlockLength)) == 0);
|
||||
}
|
||||
}
|
||||
fmt::print("KVCacheHandle cmp ok\n");
|
||||
}
|
26
csrc/balance_serve/kvc2/test/kvc2test/CMakeLists.txt
Normal file
26
csrc/balance_serve/kvc2/test/kvc2test/CMakeLists.txt
Normal file
|
@ -0,0 +1,26 @@
|
|||
|
||||
set(CMAKE_CXX_FLAGS "-Og -march=native -Wall -Wextra -g -fopenmp")
|
||||
|
||||
function(add_kvc2_test source_file)
|
||||
get_filename_component(target_name ${source_file} NAME_WE) # 获取不带扩展名的文件名作为目标名
|
||||
add_executable(${target_name} ${source_file})
|
||||
# target_compile_options(${target_name} PRIVATE -fopenmp -fno-strict-aliasing)
|
||||
target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../src)
|
||||
target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/nlohmann/single_include)
|
||||
target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/spdlog/include)
|
||||
target_link_libraries(${target_name} PRIVATE kvc2 async_store)
|
||||
endfunction()
|
||||
|
||||
add_kvc2_test(raw_insert_read.cpp)
|
||||
add_kvc2_test(lookup.cpp)
|
||||
add_kvc2_test(lookup-alt.cpp)
|
||||
add_kvc2_test(lookup-alt-gpu.cpp)
|
||||
add_kvc2_test(lookup-mt.cpp)
|
||||
add_kvc2_test(lookup-gpu.cpp)
|
||||
add_kvc2_test(lookup-gpu-mt.cpp)
|
||||
add_kvc2_test(lookup-gpu-async.cpp)
|
||||
add_kvc2_test(append-tokens.cpp)
|
||||
add_kvc2_test(flush-back.cpp)
|
||||
add_kvc2_test(check-flush-back.cpp)
|
||||
add_kvc2_test(lookup-without-vcache.cpp)
|
||||
add_kvc2_test(lookup-gpu-mt-without-vcache.cpp)
|
52
csrc/balance_serve/kvc2/test/kvc2test/append-tokens.cpp
Normal file
52
csrc/balance_serve/kvc2/test/kvc2test/append-tokens.cpp
Normal file
|
@ -0,0 +1,52 @@
|
|||
#include <future>
|
||||
#include "common.hpp"
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
init(argc, argv);
|
||||
spdlog::set_level(spdlog::level::debug);
|
||||
auto kvc2 = kvc2::create_kvc2(config);
|
||||
|
||||
#pragma omp parallel for
|
||||
for (size_t ti = 0; ti < 3; ti++) {
|
||||
auto [kcache, vcache] = kvc2->get_kvcache();
|
||||
std::mt19937 gen(ti + 123);
|
||||
size_t total_page = 10;
|
||||
TokenLength total_length = total_page * config.num_token_per_page;
|
||||
auto tokens = random_ids(total_length, gen);
|
||||
TokenLength prompt_length = 3 * config.num_token_per_page;
|
||||
auto k1 = random_kvcache(total_page, gen);
|
||||
auto v1 = random_kvcache(total_page, gen);
|
||||
{
|
||||
std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
|
||||
kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, tokens.data(), prompt_length, total_length,
|
||||
[&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
|
||||
auto fut = p.get_future();
|
||||
fut.wait();
|
||||
auto h = fut.get();
|
||||
assert(h->matched_length() % config.num_token_per_page == 0);
|
||||
size_t matched_block = h->matched_length() / config.num_token_per_page;
|
||||
auto block_idx = h->get_gpu_block_idx();
|
||||
cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, matched_block);
|
||||
for (size_t at = matched_block; at < block_idx.size(); at++) {
|
||||
copy_cpu_gpu(block_idx, kcache, vcache, k1, v1, at);
|
||||
}
|
||||
h->append_tokens(tokens.data(), total_length);
|
||||
cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, total_page);
|
||||
}
|
||||
|
||||
{
|
||||
std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
|
||||
kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, tokens.data(), total_length, total_length,
|
||||
[&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
|
||||
auto fut = p.get_future();
|
||||
fut.wait();
|
||||
auto h = fut.get();
|
||||
assert(h->matched_length() == total_length);
|
||||
size_t matched_block = h->matched_length() / config.num_token_per_page;
|
||||
auto block_idx = h->get_gpu_block_idx();
|
||||
cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, matched_block);
|
||||
}
|
||||
}
|
||||
SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
|
||||
return 0;
|
||||
}
|
36
csrc/balance_serve/kvc2/test/kvc2test/check-flush-back.cpp
Normal file
36
csrc/balance_serve/kvc2/test/kvc2test/check-flush-back.cpp
Normal file
|
@ -0,0 +1,36 @@
|
|||
#include <future>
|
||||
#include "common.hpp"
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
init(argc, argv);
|
||||
spdlog::set_level(spdlog::level::debug);
|
||||
config.gpu_cache_config->total_kvcache_pages = 12;
|
||||
auto kvc2 = kvc2::create_kvc2(config);
|
||||
kvc2->load();
|
||||
// #pragma omp parallel for
|
||||
for (size_t ti = 0; ti < 2; ti++) {
|
||||
SPDLOG_WARN("Test {}", ti);
|
||||
auto [kcache, vcache] = kvc2->get_kvcache();
|
||||
std::mt19937 gen(ti + 123);
|
||||
size_t total_page = 10;
|
||||
TokenLength total_length = total_page * config.num_token_per_page;
|
||||
auto tokens = random_ids(total_length, gen);
|
||||
auto k1 = random_kvcache(total_page, gen);
|
||||
auto v1 = random_kvcache(total_page, gen);
|
||||
|
||||
{
|
||||
std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
|
||||
kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, tokens.data(), total_length, total_length,
|
||||
[&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
|
||||
auto fut = p.get_future();
|
||||
fut.wait();
|
||||
auto h = fut.get();
|
||||
assert(h->matched_length() == total_length);
|
||||
size_t matched_block = h->matched_length() / config.num_token_per_page;
|
||||
auto block_idx = h->get_gpu_block_idx();
|
||||
cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, matched_block);
|
||||
}
|
||||
}
|
||||
SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
|
||||
return 0;
|
||||
}
|
233
csrc/balance_serve/kvc2/test/kvc2test/common.hpp
Normal file
233
csrc/balance_serve/kvc2/test/kvc2test/common.hpp
Normal file
|
@ -0,0 +1,233 @@
|
|||
/**
|
||||
* @Description :
|
||||
* @Author : Xie Weiyu
|
||||
* @Date : 2024-11-22 06:02:41
|
||||
* @Version : 1.0.0
|
||||
* @LastEditors : Xie Weiyu
|
||||
* @LastEditTime : 2024-12-11 07:34:10
|
||||
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
**/
|
||||
#pragma once
|
||||
#include <random>
|
||||
#include <thread>
|
||||
#include "kvc2.h"
|
||||
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
|
||||
#define FMT_HEADER_ONLY
|
||||
#include "spdlog/spdlog.h"
|
||||
|
||||
using namespace kvc2;
|
||||
|
||||
template <typename T>
|
||||
T* offset_by_bytes(T* t, size_t n) {
|
||||
return reinterpret_cast<T*>(reinterpret_cast<size_t>(t) + n);
|
||||
}
|
||||
|
||||
std::string FLAGS_disk_cache_path;
|
||||
|
||||
kvc2::KVC2Config config;
|
||||
kvc2::GPUPageCacheConfig qw25_7B_gpu_config{
|
||||
.gpu_only = false,
|
||||
.gpu_devices_id = {0, 1},
|
||||
.layer_count = 28,
|
||||
.total_kvcache_pages = 40,
|
||||
.num_token_per_page = 256,
|
||||
.num_k_heads = 4,
|
||||
.k_head_dim = 896,
|
||||
.full_kv_cache_on_each_gpu = false,
|
||||
.k_cache_on = true,
|
||||
.v_cache_on = true,
|
||||
.tensor_type = torch::kBFloat16,
|
||||
.num_streams_per_device = 4,
|
||||
};
|
||||
|
||||
ModelName test_model_name = "Qwen2.5-7B-Instruct";
|
||||
QuantType test_quant_type = "FP16";
|
||||
CacheInfo test_cache_info{
|
||||
.model_name = test_model_name,
|
||||
.is_key_cache = true,
|
||||
.quant_type = test_quant_type,
|
||||
};
|
||||
|
||||
void init(int argc, char* argv[]) {
|
||||
if (argc != 2) {
|
||||
fmt::print("Usage: {} <disk_cache_path>\n", argv[0]);
|
||||
exit(1);
|
||||
}
|
||||
load_quant_configs("./config/quant_configs.json");
|
||||
load_model_configs("./config/model_configs.json");
|
||||
|
||||
FLAGS_disk_cache_path = argv[1];
|
||||
if (FLAGS_disk_cache_path.empty()) {
|
||||
fmt::print("disk_cache_path is empty\n");
|
||||
exit(1);
|
||||
}
|
||||
config.path = FLAGS_disk_cache_path;
|
||||
config.config_path = "./config";
|
||||
config.gpu_cache_config = qw25_7B_gpu_config;
|
||||
}
|
||||
|
||||
data_block_ptr empty_block() {
|
||||
auto re = new (std::align_val_t(4096)) std::byte[test_cache_info.element_size(config.num_token_per_page)];
|
||||
memset(re, 0, test_cache_info.element_size(config.num_token_per_page));
|
||||
return reinterpret_cast<data_block_ptr>(re);
|
||||
}
|
||||
|
||||
data_block_ptr random_block(std::mt19937& gen) {
|
||||
auto re = empty_block();
|
||||
uint64_t* d = (uint64_t*)re;
|
||||
for (size_t i = 0; i < test_cache_info.element_size(config.num_token_per_page) / 8; i++) {
|
||||
d[i] = gen();
|
||||
}
|
||||
return re;
|
||||
}
|
||||
layer_data random_blocks(size_t block_count, size_t seed) {
|
||||
std::mt19937 gen(seed);
|
||||
layer_data re;
|
||||
for (size_t i = 0; i < block_count; i++) {
|
||||
re.push_back(random_block(gen));
|
||||
}
|
||||
return re;
|
||||
}
|
||||
|
||||
layer_data empty_blocks(size_t block_count) {
|
||||
layer_data re;
|
||||
for (size_t i = 0; i < block_count; i++) {
|
||||
re.push_back(empty_block());
|
||||
}
|
||||
return re;
|
||||
}
|
||||
|
||||
void copy_kvcache(std::vector<layer_data>& from, std::vector<layer_data>& to, size_t block_start, size_t length) {
|
||||
for (size_t i = 0; i < from.size(); i++) {
|
||||
for (size_t j = 0; j < length; j++) {
|
||||
to[i][block_start + j] = from[i][block_start + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<layer_data> random_kvcache(size_t block_count, std::mt19937& gen) {
|
||||
std::vector<layer_data> re;
|
||||
re.resize(test_cache_info.hidden_layer_count());
|
||||
fmt::print("Generating random kvcache, layer {}\n", test_cache_info.hidden_layer_count());
|
||||
std::vector<std::mt19937> gens;
|
||||
for (size_t i = 0; i < test_cache_info.hidden_layer_count(); i++) {
|
||||
gens.push_back(std::mt19937(gen()));
|
||||
}
|
||||
#pragma omp parallel for
|
||||
for (size_t i = 0; i < test_cache_info.hidden_layer_count(); i++) {
|
||||
re[i] = random_blocks(block_count, gens[i]());
|
||||
}
|
||||
return re;
|
||||
}
|
||||
|
||||
std::vector<layer_data> empty_kvcache(size_t block_count) {
|
||||
std::vector<layer_data> re;
|
||||
re.resize(test_cache_info.hidden_layer_count());
|
||||
fmt::print("Generating empty kvcache, layer {}\n", test_cache_info.hidden_layer_count());
|
||||
#pragma omp parallel for
|
||||
for (size_t i = 0; i < test_cache_info.hidden_layer_count(); i++) {
|
||||
re[i] = empty_blocks(block_count);
|
||||
}
|
||||
return re;
|
||||
}
|
||||
|
||||
std::vector<Token> random_ids(size_t length, std::mt19937& gen) {
|
||||
std::vector<Token> re;
|
||||
for (size_t i = 0; i < length; i++) {
|
||||
re.push_back(gen());
|
||||
}
|
||||
return re;
|
||||
}
|
||||
|
||||
std::vector<layer_data> slice(std::vector<layer_data>& h1, size_t start, size_t end) {
|
||||
std::vector<layer_data> re;
|
||||
for (auto& l : h1) {
|
||||
layer_data new_layer;
|
||||
new_layer.insert(new_layer.end(), l.begin() + start, l.begin() + end);
|
||||
re.push_back(new_layer);
|
||||
}
|
||||
return re;
|
||||
}
|
||||
|
||||
void cmp_handle_data(std::vector<layer_data> h1, std::vector<layer_data> h2,
|
||||
std::optional<size_t> blocks = std::nullopt) {
|
||||
assert(h1.size() == h2.size());
|
||||
|
||||
for (size_t i = 0; i < h1.size(); i++) {
|
||||
auto& b1 = h1[i];
|
||||
auto& b2 = h2[i];
|
||||
if (blocks.has_value() == false) {
|
||||
assert(b1.size() == b2.size());
|
||||
}
|
||||
int cmp_to = blocks.has_value() ? blocks.value() : b1.size();
|
||||
for (int j = 0; j < cmp_to; j++) {
|
||||
auto e1 = reinterpret_cast<void*>(b1[j]);
|
||||
auto e2 = reinterpret_cast<void*>(b2[j]);
|
||||
assert(memcmp(e1, e2, test_cache_info.element_size(config.num_token_per_page)) == 0);
|
||||
}
|
||||
}
|
||||
fmt::print("KVCacheHandle cmp ok\n");
|
||||
}
|
||||
|
||||
void copy_gpu_cpu(std::vector<size_t>& block_idx, std::vector<torch::Tensor>& kcache,
|
||||
std::vector<torch::Tensor>& vcache, std::vector<layer_data>& k_cpu, std::vector<layer_data>& v_cpu,
|
||||
size_t at) {
|
||||
size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
|
||||
size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
|
||||
|
||||
for (size_t layer = 0; layer < test_cache_info.hidden_layer_count(); layer++) {
|
||||
for (size_t gpu_idx = 0; gpu_idx < gpu_count; gpu_idx++) {
|
||||
{
|
||||
auto kt = kcache[gpu_idx][layer][block_idx[at]].to(torch::kCPU);
|
||||
void* src = kt.data_ptr();
|
||||
void* dst = offset_by_bytes(k_cpu[layer][at], gpu_idx * element_size_per_gpu);
|
||||
memcpy(dst, src, element_size_per_gpu);
|
||||
}
|
||||
{
|
||||
auto vt = vcache[gpu_idx][layer][block_idx[at]].to(torch::kCPU);
|
||||
void* src = vt.data_ptr();
|
||||
void* dst = offset_by_bytes(v_cpu[layer][at], gpu_idx * element_size_per_gpu);
|
||||
memcpy(dst, src, element_size_per_gpu);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void copy_cpu_gpu(std::vector<size_t>& block_idx, std::vector<torch::Tensor>& kcache,
|
||||
std::vector<torch::Tensor>& vcache, std::vector<layer_data>& k_cpu, std::vector<layer_data>& v_cpu,
|
||||
size_t at) {
|
||||
size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
|
||||
size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
|
||||
|
||||
for (size_t layer = 0; layer < test_cache_info.hidden_layer_count(); layer++) {
|
||||
for (size_t gpu_idx = 0; gpu_idx < gpu_count; gpu_idx++) {
|
||||
{
|
||||
auto kt = kcache[gpu_idx][layer][block_idx[at]].to(torch::kCPU);
|
||||
void* dst = kt.data_ptr();
|
||||
void* src = offset_by_bytes(k_cpu[layer][at], gpu_idx * element_size_per_gpu);
|
||||
memcpy(dst, src, element_size_per_gpu);
|
||||
kcache[gpu_idx][layer][block_idx[at]].copy_(kt);
|
||||
}
|
||||
{
|
||||
auto vt = vcache[gpu_idx][layer][block_idx[at]].to(torch::kCPU);
|
||||
void* dst = vt.data_ptr();
|
||||
void* src = offset_by_bytes(v_cpu[layer][at], gpu_idx * element_size_per_gpu);
|
||||
memcpy(dst, src, element_size_per_gpu);
|
||||
vcache[gpu_idx][layer][block_idx[at]].copy_(vt);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void cmp_handle_gpu(std::vector<size_t>& block_idx, std::vector<torch::Tensor>& kcache,
|
||||
std::vector<torch::Tensor>& vcache, std::vector<layer_data>& k1, std::vector<layer_data>& v1,
|
||||
size_t num_blocks) {
|
||||
auto k_from_gpu = empty_kvcache(num_blocks);
|
||||
auto v_from_gpu = empty_kvcache(num_blocks);
|
||||
|
||||
for (size_t j = 0; j < std::min(block_idx.size(), num_blocks); j++) {
|
||||
copy_gpu_cpu(block_idx, kcache, vcache, k_from_gpu, v_from_gpu, j);
|
||||
}
|
||||
cmp_handle_data(k1, k_from_gpu, num_blocks);
|
||||
cmp_handle_data(v1, v_from_gpu, num_blocks);
|
||||
}
|
57
csrc/balance_serve/kvc2/test/kvc2test/flush-back.cpp
Normal file
57
csrc/balance_serve/kvc2/test/kvc2test/flush-back.cpp
Normal file
|
@ -0,0 +1,57 @@
|
|||
#include <future>
|
||||
#include "common.hpp"
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
init(argc, argv);
|
||||
spdlog::set_level(spdlog::level::debug);
|
||||
config.gpu_cache_config->total_kvcache_pages = 12;
|
||||
auto kvc2 = kvc2::create_kvc2(config);
|
||||
|
||||
// #pragma omp parallel for
|
||||
for (size_t ti = 0; ti < 2; ti++) {
|
||||
SPDLOG_WARN("Test {}", ti);
|
||||
auto [kcache, vcache] = kvc2->get_kvcache();
|
||||
std::mt19937 gen(ti + 123);
|
||||
size_t total_page = 10;
|
||||
TokenLength total_length = total_page * config.num_token_per_page;
|
||||
auto tokens = random_ids(total_length, gen);
|
||||
TokenLength prompt_length = 3 * config.num_token_per_page;
|
||||
auto k1 = random_kvcache(total_page, gen);
|
||||
auto v1 = random_kvcache(total_page, gen);
|
||||
|
||||
{
|
||||
std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
|
||||
kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, tokens.data(), prompt_length, total_length,
|
||||
[&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
|
||||
auto fut = p.get_future();
|
||||
fut.wait();
|
||||
auto h = fut.get();
|
||||
assert(h->matched_length() % config.num_token_per_page == 0);
|
||||
size_t matched_block = h->matched_length() / config.num_token_per_page;
|
||||
auto block_idx = h->get_gpu_block_idx();
|
||||
cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, matched_block);
|
||||
for (size_t at = matched_block; at < block_idx.size(); at++) {
|
||||
copy_cpu_gpu(block_idx, kcache, vcache, k1, v1, at);
|
||||
}
|
||||
h->append_tokens(tokens.data(), total_length);
|
||||
cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, total_page);
|
||||
}
|
||||
|
||||
{
|
||||
std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
|
||||
kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, tokens.data(), total_length, total_length,
|
||||
[&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
|
||||
auto fut = p.get_future();
|
||||
fut.wait();
|
||||
auto h = fut.get();
|
||||
assert(h->matched_length() == total_length);
|
||||
size_t matched_block = h->matched_length() / config.num_token_per_page;
|
||||
auto block_idx = h->get_gpu_block_idx();
|
||||
cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, matched_block);
|
||||
}
|
||||
}
|
||||
kvc2->save();
|
||||
SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
|
||||
|
||||
return 0;
|
||||
}
|
125
csrc/balance_serve/kvc2/test/kvc2test/lookup-alt-gpu.cpp
Normal file
125
csrc/balance_serve/kvc2/test/kvc2test/lookup-alt-gpu.cpp
Normal file
|
@ -0,0 +1,125 @@
|
|||
/**
|
||||
* @Description :
|
||||
* @Author : Xie Weiyu
|
||||
* @Date : 2024-11-22 08:29:45
|
||||
* @Version : 1.0.0
|
||||
* @LastEditors : Xie Weiyu
|
||||
* @LastEditTime : 2024-11-22 09:56:12
|
||||
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
**/
|
||||
#include <future>
|
||||
#include "common.hpp"
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
init(argc, argv);
|
||||
spdlog::set_level(spdlog::level::trace);
|
||||
auto kvc2 = kvc2::create_kvc2(config);
|
||||
|
||||
std::mt19937 gen(123);
|
||||
|
||||
std::vector<std::vector<Token>> ids;
|
||||
|
||||
std::vector<std::vector<layer_data>> k, v;
|
||||
for (size_t i = 0; i < 10; i++) {
|
||||
ids.push_back(random_ids(1 * config.num_token_per_page, gen));
|
||||
k.push_back(random_kvcache(1, gen));
|
||||
v.push_back(random_kvcache(1, gen));
|
||||
kvc2->raw_insert(test_model_name, test_quant_type, ids[i].data(), ids[i].size(), k[i], v[i]);
|
||||
}
|
||||
|
||||
kvc2->debug();
|
||||
{
|
||||
// all match
|
||||
std::vector<Token*> chunks;
|
||||
std::vector<TokenLength> lengths;
|
||||
for (size_t i = 0; i < 10; i++) {
|
||||
chunks.push_back(ids[i].data());
|
||||
lengths.push_back(ids[i].size());
|
||||
}
|
||||
std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
|
||||
kvc2->lookup_alt_to_gpu_async(test_model_name, test_quant_type, chunks, lengths, 15 * config.num_token_per_page,
|
||||
[&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
|
||||
|
||||
auto fut = p.get_future();
|
||||
fut.wait();
|
||||
auto h = fut.get();
|
||||
auto hk = h->handle_data(true);
|
||||
auto hv = h->handle_data(false);
|
||||
|
||||
for (size_t i = 0; i < 10; i++) {
|
||||
cmp_handle_data(slice(hk, i, i + 1), k[i], 1);
|
||||
cmp_handle_data(slice(hv, i, i + 1), v[i], 1);
|
||||
}
|
||||
|
||||
auto block_idx = h->get_gpu_block_idx();
|
||||
auto [kcache, vcache] = kvc2->get_kvcache();
|
||||
for (size_t i = 0; i < 10; i++) {
|
||||
std::vector<size_t> blocks = {block_idx[i]};
|
||||
cmp_handle_gpu(blocks, kcache, vcache, k[i], v[i], 1);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
// no match in the middle
|
||||
std::vector<Token*> chunks;
|
||||
std::vector<TokenLength> lengths;
|
||||
|
||||
std::vector<std::vector<Token>> new_ids;
|
||||
for (size_t i = 0; i < 10; i++) {
|
||||
new_ids.push_back(random_ids(1 * config.num_token_per_page, gen));
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < 10; i++) {
|
||||
if (i == 1 || i == 5 || i == 6) {
|
||||
chunks.push_back(new_ids[i].data());
|
||||
} else {
|
||||
chunks.push_back(ids[i].data());
|
||||
}
|
||||
lengths.push_back(ids[i].size());
|
||||
}
|
||||
|
||||
std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
|
||||
kvc2->lookup_alt_to_gpu_async(test_model_name, test_quant_type, chunks, lengths, 15 * config.num_token_per_page,
|
||||
[&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
|
||||
|
||||
auto fut = p.get_future();
|
||||
fut.wait();
|
||||
auto h = fut.get();
|
||||
auto statuses = h->matched_status();
|
||||
for (size_t i = 0; i < 10; i++) {
|
||||
if (i == 1) {
|
||||
assert(statuses[i] == MatchStatus::NotMatchExact);
|
||||
} else if (i == 5 || i == 6) {
|
||||
assert(statuses[i] == MatchStatus::NotMatchPartial);
|
||||
} else if (i == 0) {
|
||||
assert(statuses[i] == MatchStatus::Exact);
|
||||
} else {
|
||||
assert(statuses[i] == MatchStatus::Partial);
|
||||
}
|
||||
}
|
||||
|
||||
auto hk = h->handle_data(true);
|
||||
auto hv = h->handle_data(false);
|
||||
|
||||
for (size_t i = 0; i < 10; i++) {
|
||||
if (i == 1 || i == 5 || i == 6) {
|
||||
} else {
|
||||
cmp_handle_data(slice(hk, i, i + 1), k[i], 1);
|
||||
cmp_handle_data(slice(hv, i, i + 1), v[i], 1);
|
||||
}
|
||||
}
|
||||
|
||||
auto block_idx = h->get_gpu_block_idx();
|
||||
auto [kcache, vcache] = kvc2->get_kvcache();
|
||||
for (size_t i = 0; i < 10; i++) {
|
||||
if (i == 1 || i == 5 || i == 6) {
|
||||
} else {
|
||||
std::vector<size_t> blocks = {block_idx[i]};
|
||||
cmp_handle_gpu(blocks, kcache, vcache, k[i], v[i], 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
|
||||
return 0;
|
||||
}
|
97
csrc/balance_serve/kvc2/test/kvc2test/lookup-alt.cpp
Normal file
97
csrc/balance_serve/kvc2/test/kvc2test/lookup-alt.cpp
Normal file
|
@ -0,0 +1,97 @@
|
|||
/**
|
||||
* @Description :
|
||||
* @Author : Xie Weiyu
|
||||
* @Date : 2024-11-22 08:29:45
|
||||
* @Version : 1.0.0
|
||||
* @LastEditors : Xie Weiyu
|
||||
* @LastEditTime : 2024-11-22 09:56:12
|
||||
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
**/
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
init(argc, argv);
|
||||
spdlog::set_level(spdlog::level::trace);
|
||||
auto kvc2 = kvc2::create_kvc2(config);
|
||||
|
||||
std::mt19937 gen(123);
|
||||
|
||||
std::vector<std::vector<Token>> ids;
|
||||
|
||||
std::vector<std::vector<layer_data>> k, v;
|
||||
for (size_t i = 0; i < 10; i++) {
|
||||
ids.push_back(random_ids(1 * config.num_token_per_page, gen));
|
||||
k.push_back(random_kvcache(1, gen));
|
||||
v.push_back(random_kvcache(1, gen));
|
||||
kvc2->raw_insert(test_model_name, test_quant_type, ids[i].data(), ids[i].size(), k[i], v[i]);
|
||||
}
|
||||
|
||||
kvc2->debug();
|
||||
{
|
||||
// all match
|
||||
std::vector<Token*> chunks;
|
||||
std::vector<TokenLength> lengths;
|
||||
for (size_t i = 0; i < 10; i++) {
|
||||
chunks.push_back(ids[i].data());
|
||||
lengths.push_back(ids[i].size());
|
||||
}
|
||||
|
||||
auto h = kvc2->lookup_alt(test_model_name, test_quant_type, chunks, lengths, 15 * config.num_token_per_page);
|
||||
auto hk = h->handle_data(true);
|
||||
auto hv = h->handle_data(false);
|
||||
|
||||
for (size_t i = 0; i < 10; i++) {
|
||||
cmp_handle_data(slice(hk, i, i + 1), k[i], 1);
|
||||
cmp_handle_data(slice(hv, i, i + 1), v[i], 1);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
// no match in the middle
|
||||
std::vector<Token*> chunks;
|
||||
std::vector<TokenLength> lengths;
|
||||
|
||||
std::vector<std::vector<Token>> new_ids;
|
||||
for (size_t i = 0; i < 10; i++) {
|
||||
new_ids.push_back(random_ids(1 * config.num_token_per_page, gen));
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < 10; i++) {
|
||||
if (i == 1 || i == 5 || i == 6) {
|
||||
chunks.push_back(new_ids[i].data());
|
||||
} else {
|
||||
chunks.push_back(ids[i].data());
|
||||
}
|
||||
lengths.push_back(ids[i].size());
|
||||
}
|
||||
|
||||
auto h = kvc2->lookup_alt(test_model_name, test_quant_type, chunks, lengths, 15 * config.num_token_per_page);
|
||||
auto statuses = h->matched_status();
|
||||
for (size_t i = 0; i < 10; i++) {
|
||||
if (i == 1) {
|
||||
assert(statuses[i] == MatchStatus::NotMatchExact);
|
||||
} else if (i == 5 || i == 6) {
|
||||
assert(statuses[i] == MatchStatus::NotMatchPartial);
|
||||
} else if (i == 0) {
|
||||
assert(statuses[i] == MatchStatus::Exact);
|
||||
} else {
|
||||
assert(statuses[i] == MatchStatus::Partial);
|
||||
}
|
||||
}
|
||||
|
||||
auto hk = h->handle_data(true);
|
||||
auto hv = h->handle_data(false);
|
||||
|
||||
for (size_t i = 0; i < 10; i++) {
|
||||
if (i == 1 || i == 5 || i == 6) {
|
||||
} else {
|
||||
cmp_handle_data(slice(hk, i, i + 1), k[i], 1);
|
||||
cmp_handle_data(slice(hv, i, i + 1), v[i], 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
|
||||
return 0;
|
||||
}
|
49
csrc/balance_serve/kvc2/test/kvc2test/lookup-gpu-async.cpp
Normal file
49
csrc/balance_serve/kvc2/test/kvc2test/lookup-gpu-async.cpp
Normal file
|
@ -0,0 +1,49 @@
|
|||
/**
|
||||
* @Description :
|
||||
* @Author : Xie Weiyu
|
||||
* @Date : 2024-11-22 09:52:48
|
||||
* @Version : 1.0.0
|
||||
* @LastEditors : Xie Weiyu
|
||||
* @LastEditTime : 2024-11-25 07:51:09
|
||||
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
**/
|
||||
|
||||
#include <future>
|
||||
#include "common.hpp"
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
init(argc, argv);
|
||||
spdlog::set_level(spdlog::level::debug);
|
||||
auto kvc2 = kvc2::create_kvc2(config);
|
||||
|
||||
std::mt19937 gen(123);
|
||||
auto ids1 = random_ids(10 * config.num_token_per_page, gen);
|
||||
auto k1 = random_kvcache(10, gen);
|
||||
auto v1 = random_kvcache(10, gen);
|
||||
|
||||
kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, v1);
|
||||
|
||||
// complete same
|
||||
#pragma omp parallel for
|
||||
for (size_t ti = 0; ti < 3; ti++) {
|
||||
std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
|
||||
kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, ids1.data(), ids1.size(),
|
||||
ids1.size() + 2 * config.num_token_per_page,
|
||||
[&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
|
||||
auto fut = p.get_future();
|
||||
fut.wait();
|
||||
auto h = fut.get();
|
||||
auto k = h->handle_data(true);
|
||||
auto v = h->handle_data(false);
|
||||
cmp_handle_data(k1, k, 10);
|
||||
cmp_handle_data(v1, v, 10);
|
||||
|
||||
auto block_idx = h->get_gpu_block_idx();
|
||||
auto [kcache, vcache] = kvc2->get_kvcache();
|
||||
|
||||
cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, 10);
|
||||
}
|
||||
|
||||
SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,61 @@
|
|||
/**
|
||||
* @Description :
|
||||
* @Author : Xie Weiyu
|
||||
* @Date : 2024-11-22 09:52:48
|
||||
* @Version : 1.0.0
|
||||
* @LastEditors : Xie Weiyu
|
||||
* @LastEditTime : 2024-11-25 07:51:09
|
||||
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
**/
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
qw25_7B_gpu_config.v_cache_on = false;
|
||||
config.gpu_cache_config = qw25_7B_gpu_config;
|
||||
config.v_cache_on = false;
|
||||
|
||||
init(argc, argv);
|
||||
spdlog::set_level(spdlog::level::debug);
|
||||
auto kvc2 = kvc2::create_kvc2(config);
|
||||
|
||||
std::mt19937 gen(123);
|
||||
auto ids1 = random_ids(10 * config.num_token_per_page, gen);
|
||||
auto k1 = random_kvcache(10, gen);
|
||||
|
||||
kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, {});
|
||||
|
||||
// complete same
|
||||
#pragma omp parallel for
|
||||
for (size_t ti = 0; ti < 3; ti++) {
|
||||
auto h = kvc2->lookup_to_gpu(test_model_name, test_quant_type, ids1.data(), ids1.size(),
|
||||
ids1.size() + 2 * config.num_token_per_page);
|
||||
auto k = h->handle_data(true);
|
||||
cmp_handle_data(k1, k, 10);
|
||||
|
||||
auto block_idx = h->get_gpu_block_idx();
|
||||
auto [kcache, vcache] = kvc2->get_kvcache();
|
||||
|
||||
auto k_from_gpu = empty_kvcache(15);
|
||||
|
||||
size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
|
||||
size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
|
||||
for (size_t i = 0; i < k_from_gpu.size(); i++) {
|
||||
for (size_t j = 0; j < block_idx.size(); j++) {
|
||||
size_t b_idx = block_idx[j];
|
||||
for (size_t gpu_idx = 0; gpu_idx < gpu_count; gpu_idx++) {
|
||||
{
|
||||
auto kt = kcache[gpu_idx][i][b_idx].to(torch::kCPU);
|
||||
void* src = kt.data_ptr();
|
||||
void* dst = offset_by_bytes(k_from_gpu[i][j], gpu_idx * element_size_per_gpu);
|
||||
memcpy(dst, src, element_size_per_gpu);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
cmp_handle_data(k1, k_from_gpu, 10);
|
||||
}
|
||||
|
||||
SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
|
||||
return 0;
|
||||
}
|
68
csrc/balance_serve/kvc2/test/kvc2test/lookup-gpu-mt.cpp
Normal file
68
csrc/balance_serve/kvc2/test/kvc2test/lookup-gpu-mt.cpp
Normal file
|
@ -0,0 +1,68 @@
|
|||
/**
|
||||
* @Description :
|
||||
* @Author : Xie Weiyu
|
||||
* @Date : 2024-11-22 09:52:48
|
||||
* @Version : 1.0.0
|
||||
* @LastEditors : Xie Weiyu
|
||||
* @LastEditTime : 2024-11-25 07:51:09
|
||||
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
**/
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
init(argc, argv);
|
||||
spdlog::set_level(spdlog::level::debug);
|
||||
auto kvc2 = kvc2::create_kvc2(config);
|
||||
|
||||
std::mt19937 gen(123);
|
||||
auto ids1 = random_ids(10 * config.num_token_per_page, gen);
|
||||
auto k1 = random_kvcache(10, gen);
|
||||
auto v1 = random_kvcache(10, gen);
|
||||
|
||||
kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, v1);
|
||||
|
||||
// complete same
|
||||
#pragma omp parallel for
|
||||
for (size_t ti = 0; ti < 3; ti++) {
|
||||
auto h = kvc2->lookup_to_gpu(test_model_name, test_quant_type, ids1.data(), ids1.size(),
|
||||
ids1.size() + 2 * config.num_token_per_page);
|
||||
auto k = h->handle_data(true);
|
||||
auto v = h->handle_data(false);
|
||||
cmp_handle_data(k1, k, 10);
|
||||
cmp_handle_data(v1, v, 10);
|
||||
|
||||
auto block_idx = h->get_gpu_block_idx();
|
||||
auto [kcache, vcache] = kvc2->get_kvcache();
|
||||
|
||||
auto k_from_gpu = empty_kvcache(15);
|
||||
auto v_from_gpu = empty_kvcache(15);
|
||||
|
||||
size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
|
||||
size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
|
||||
for (size_t i = 0; i < k_from_gpu.size(); i++) {
|
||||
for (size_t j = 0; j < block_idx.size(); j++) {
|
||||
size_t b_idx = block_idx[j];
|
||||
for (size_t gpu_idx = 0; gpu_idx < gpu_count; gpu_idx++) {
|
||||
{
|
||||
auto kt = kcache[gpu_idx][i][b_idx].to(torch::kCPU);
|
||||
void* src = kt.data_ptr();
|
||||
void* dst = offset_by_bytes(k_from_gpu[i][j], gpu_idx * element_size_per_gpu);
|
||||
memcpy(dst, src, element_size_per_gpu);
|
||||
}
|
||||
{
|
||||
auto vt = vcache[gpu_idx][i][b_idx].to(torch::kCPU);
|
||||
void* src = vt.data_ptr();
|
||||
void* dst = offset_by_bytes(v_from_gpu[i][j], gpu_idx * element_size_per_gpu);
|
||||
memcpy(dst, src, element_size_per_gpu);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
cmp_handle_data(k1, k_from_gpu, 10);
|
||||
cmp_handle_data(v1, v_from_gpu, 10);
|
||||
}
|
||||
|
||||
SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
|
||||
return 0;
|
||||
}
|
160
csrc/balance_serve/kvc2/test/kvc2test/lookup-gpu.cpp
Normal file
160
csrc/balance_serve/kvc2/test/kvc2test/lookup-gpu.cpp
Normal file
|
@ -0,0 +1,160 @@
|
|||
/**
|
||||
* @Description :
|
||||
* @Author : Xie Weiyu
|
||||
* @Date : 2024-11-22 09:52:48
|
||||
* @Version : 1.0.0
|
||||
* @LastEditors : Xie Weiyu
|
||||
* @LastEditTime : 2024-11-25 08:38:33
|
||||
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
**/
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
init(argc, argv);
|
||||
spdlog::set_level(spdlog::level::debug);
|
||||
auto kvc2 = kvc2::create_kvc2(config);
|
||||
|
||||
std::mt19937 gen(123);
|
||||
auto ids1 = random_ids(10 * config.num_token_per_page, gen);
|
||||
auto k1 = random_kvcache(10, gen);
|
||||
auto v1 = random_kvcache(10, gen);
|
||||
|
||||
kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, v1);
|
||||
|
||||
// complete same
|
||||
{
|
||||
auto h = kvc2->lookup_to_gpu(test_model_name, test_quant_type, ids1.data(), ids1.size(),
|
||||
ids1.size() + 5 * config.num_token_per_page);
|
||||
auto k = h->handle_data(true);
|
||||
auto v = h->handle_data(false);
|
||||
cmp_handle_data(k1, k, 10);
|
||||
cmp_handle_data(v1, v, 10);
|
||||
|
||||
auto block_idx = h->get_gpu_block_idx();
|
||||
auto [kcache, vcache] = kvc2->get_kvcache();
|
||||
|
||||
auto k_from_gpu = empty_kvcache(15);
|
||||
auto v_from_gpu = empty_kvcache(15);
|
||||
|
||||
size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
|
||||
size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
|
||||
for (size_t i = 0; i < k_from_gpu.size(); i++) {
|
||||
for (size_t j = 0; j < block_idx.size(); j++) {
|
||||
size_t b_idx = block_idx[j];
|
||||
for (size_t gpu_idx = 0; gpu_idx < gpu_count; gpu_idx++) {
|
||||
{
|
||||
auto kt = kcache[gpu_idx][i][b_idx].to(torch::kCPU);
|
||||
void* src = kt.data_ptr();
|
||||
void* dst = offset_by_bytes(k_from_gpu[i][j], gpu_idx * element_size_per_gpu);
|
||||
memcpy(dst, src, element_size_per_gpu);
|
||||
}
|
||||
{
|
||||
auto vt = vcache[gpu_idx][i][b_idx].to(torch::kCPU);
|
||||
void* src = vt.data_ptr();
|
||||
void* dst = offset_by_bytes(v_from_gpu[i][j], gpu_idx * element_size_per_gpu);
|
||||
memcpy(dst, src, element_size_per_gpu);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
cmp_handle_data(k1, k_from_gpu, 10);
|
||||
cmp_handle_data(v1, v_from_gpu, 10);
|
||||
}
|
||||
|
||||
// prefix and evict
|
||||
{
|
||||
auto h = kvc2->lookup_to_gpu(test_model_name, test_quant_type, ids1.data(), config.num_token_per_page * 3,
|
||||
config.gpu_cache_config->total_kvcache_pages * config.num_token_per_page);
|
||||
auto k = h->handle_data(true);
|
||||
auto v = h->handle_data(false);
|
||||
cmp_handle_data(k1, k, 3);
|
||||
cmp_handle_data(v1, v, 3);
|
||||
|
||||
auto block_idx = h->get_gpu_block_idx();
|
||||
auto [kcache, vcache] = kvc2->get_kvcache();
|
||||
|
||||
auto k_from_gpu = empty_kvcache(3);
|
||||
auto v_from_gpu = empty_kvcache(3);
|
||||
|
||||
size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
|
||||
size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
|
||||
for (size_t i = 0; i < k_from_gpu.size(); i++) {
|
||||
for (size_t j = 0; j < 3; j++) {
|
||||
size_t b_idx = block_idx[j];
|
||||
for (size_t gpu_idx = 0; gpu_idx < gpu_count; gpu_idx++) {
|
||||
{
|
||||
auto kt = kcache[gpu_idx][i][b_idx].to(torch::kCPU);
|
||||
void* src = kt.data_ptr();
|
||||
void* dst = offset_by_bytes(k_from_gpu[i][j], gpu_idx * element_size_per_gpu);
|
||||
memcpy(dst, src, element_size_per_gpu);
|
||||
}
|
||||
{
|
||||
auto vt = vcache[gpu_idx][i][b_idx].to(torch::kCPU);
|
||||
void* src = vt.data_ptr();
|
||||
void* dst = offset_by_bytes(v_from_gpu[i][j], gpu_idx * element_size_per_gpu);
|
||||
memcpy(dst, src, element_size_per_gpu);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
cmp_handle_data(k1, k_from_gpu, 3);
|
||||
cmp_handle_data(v1, v_from_gpu, 3);
|
||||
}
|
||||
|
||||
// // complete prefix
|
||||
// {
|
||||
// std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
|
||||
// auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(),
|
||||
// ids2.size() + 3 * config.num_token_per_page);
|
||||
// auto k = h->handle_data(true);
|
||||
// auto v = h->handle_data(false);
|
||||
// cmp_handle_data(k1, k, 3);
|
||||
// cmp_handle_data(v1, v, 3);
|
||||
// }
|
||||
|
||||
// // common prefix
|
||||
// {
|
||||
// std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
|
||||
// auto rids = random_ids(config.num_token_per_page * 2 + config.num_token_per_page / 2, gen);
|
||||
// ids2.insert(ids2.end(), rids.begin(), rids.end());
|
||||
|
||||
// auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
|
||||
// auto k = h->handle_data(true);
|
||||
// auto v = h->handle_data(false);
|
||||
// cmp_handle_data(k1, k, 3);
|
||||
// cmp_handle_data(v1, v, 3);
|
||||
// }
|
||||
|
||||
// // no prefix
|
||||
// {
|
||||
// std::vector<Token> ids2 = random_ids(config.num_token_per_page, gen);
|
||||
// auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
|
||||
// assert(h->matched_length() == 0);
|
||||
// }
|
||||
|
||||
// // insert partly new
|
||||
// auto k2 = random_kvcache(10, gen);
|
||||
// auto v2 = random_kvcache(10, gen);
|
||||
// copy_kvcache(k1, k2, 0, 5);
|
||||
// copy_kvcache(v1, v2, 0, 5);
|
||||
// auto ids2 = random_ids(10 * config.num_token_per_page, gen);
|
||||
// for (size_t i = 0; i < 5 * config.num_token_per_page; i++) {
|
||||
// ids2[i] = ids1[i];
|
||||
// }
|
||||
// kvc2->raw_insert(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, v2);
|
||||
|
||||
// // read new part
|
||||
// {
|
||||
// std::vector<Token> ids(ids2.begin(), ids2.begin() + 7 * config.num_token_per_page);
|
||||
// auto h = kvc2->lookup(test_model_name, test_quant_type, ids.data(), ids.size(),
|
||||
// ids.size() + 7 * config.num_token_per_page);
|
||||
// auto k = h->handle_data(true);
|
||||
// auto v = h->handle_data(false);
|
||||
// cmp_handle_data(k, k2, 7);
|
||||
// cmp_handle_data(v, v2, 7);
|
||||
// }
|
||||
|
||||
SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
|
||||
return 0;
|
||||
}
|
103
csrc/balance_serve/kvc2/test/kvc2test/lookup-mt.cpp
Normal file
103
csrc/balance_serve/kvc2/test/kvc2test/lookup-mt.cpp
Normal file
|
@ -0,0 +1,103 @@
|
|||
/**
|
||||
* @Description :
|
||||
* @Author : Xie Weiyu
|
||||
* @Date : 2024-11-22 08:48:40
|
||||
* @Version : 1.0.0
|
||||
* @LastEditors : Xie Weiyu
|
||||
* @LastEditTime : 2024-11-22 09:53:06
|
||||
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
**/
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
template <typename F>
|
||||
void test_multi(F f) {
|
||||
std::vector<std::thread> threads;
|
||||
for (size_t i = 0; i < 10; i++) {
|
||||
threads.push_back([f]() { f(); });
|
||||
}
|
||||
for (auto& t : threads) {
|
||||
t.join();
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
init(argc, argv);
|
||||
spdlog::set_level(spdlog::level::debug);
|
||||
auto kvc2 = kvc2::create_kvc2(config);
|
||||
|
||||
std::mt19937 gen(123);
|
||||
auto ids1 = random_ids(3 * config.num_token_per_page, gen);
|
||||
auto k1 = random_kvcache(3, gen);
|
||||
auto v1 = random_kvcache(3, gen);
|
||||
|
||||
kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, v1);
|
||||
|
||||
// complete same
|
||||
{
|
||||
#pragma omp parallel for
|
||||
for (size_t i = 0; i < 10; i++) {
|
||||
auto h = kvc2->lookup(test_model_name, test_quant_type, ids1.data(), ids1.size(),
|
||||
ids1.size() + 10 * config.num_token_per_page);
|
||||
if (h == nullptr) {
|
||||
SPDLOG_WARN("Thread[{}]: h is nullptr", i);
|
||||
} else {
|
||||
auto k = h->handle_data(true);
|
||||
auto v = h->handle_data(false);
|
||||
cmp_handle_data(k1, k, 3);
|
||||
cmp_handle_data(v1, v, 3);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// // complete prefix
|
||||
// {
|
||||
// std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
|
||||
// auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size() + 3 *
|
||||
// config.num_token_per_page); auto k = h->handle_data(true); auto v = h->handle_data(false); cmp_handle_data(k1,
|
||||
// k, 3); cmp_handle_data(v1, v, 3);
|
||||
// }
|
||||
|
||||
// // common prefix
|
||||
// {
|
||||
// std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
|
||||
// auto rids = random_ids(config.num_token_per_page * 2 + config.num_token_per_page / 2, gen);
|
||||
// ids2.insert(ids2.end(), rids.begin(), rids.end());
|
||||
|
||||
// auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
|
||||
// auto k = h->handle_data(true);
|
||||
// auto v = h->handle_data(false);
|
||||
// cmp_handle_data(k1, k, 3);
|
||||
// cmp_handle_data(v1, v, 3);
|
||||
// }
|
||||
|
||||
// // no prefix
|
||||
// {
|
||||
// std::vector<Token> ids2 = random_ids(config.num_token_per_page, gen);
|
||||
// auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
|
||||
// assert(h->matched_length() == 0);
|
||||
// }
|
||||
|
||||
// // insert partly new
|
||||
// auto k2 = random_kvcache(10, gen);
|
||||
// auto v2 = random_kvcache(10, gen);
|
||||
// copy_kvcache(k1, k2, 0, 5);
|
||||
// copy_kvcache(v1, v2, 0, 5);
|
||||
// auto ids2 = random_ids(10 * config.num_token_per_page, gen);
|
||||
// for (size_t i = 0; i < 5 * config.num_token_per_page; i++) {
|
||||
// ids2[i] = ids1[i];
|
||||
// }
|
||||
// kvc2->raw_insert(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, v2);
|
||||
|
||||
// // read new part
|
||||
// {
|
||||
// std::vector<Token> ids(ids2.begin(), ids2.begin() + 7 * config.num_token_per_page);
|
||||
// auto h = kvc2->lookup(test_model_name, test_quant_type, ids.data(), ids.size(), ids.size() + 7 *
|
||||
// config.num_token_per_page); auto k = h->handle_data(true); auto v = h->handle_data(false); cmp_handle_data(k,
|
||||
// k2, 7); cmp_handle_data(v, v2, 7);
|
||||
// }
|
||||
kvc2->debug();
|
||||
|
||||
SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,84 @@
|
|||
/**
|
||||
* @Description :
|
||||
* @Author : Xie Weiyu
|
||||
* @Date : 2024-11-22 08:29:45
|
||||
* @Version : 1.0.0
|
||||
* @LastEditors : Xie Weiyu
|
||||
* @LastEditTime : 2024-11-22 09:56:12
|
||||
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
**/
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
qw25_7B_gpu_config.v_cache_on = false;
|
||||
config.gpu_cache_config = qw25_7B_gpu_config;
|
||||
config.v_cache_on = false;
|
||||
|
||||
init(argc, argv);
|
||||
spdlog::set_level(spdlog::level::debug);
|
||||
auto kvc2 = kvc2::create_kvc2(config);
|
||||
|
||||
std::mt19937 gen(123);
|
||||
auto ids1 = random_ids(10 * config.num_token_per_page, gen);
|
||||
auto k1 = random_kvcache(10, gen);
|
||||
// auto v1 = random_kvcache(10, gen);
|
||||
|
||||
kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, {});
|
||||
|
||||
// complete same
|
||||
{
|
||||
auto h = kvc2->lookup(test_model_name, test_quant_type, ids1.data(), ids1.size(),
|
||||
ids1.size() + 10 * config.num_token_per_page);
|
||||
auto k = h->handle_data(true);
|
||||
cmp_handle_data(k1, k, 10);
|
||||
}
|
||||
|
||||
// complete prefix
|
||||
{
|
||||
std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
|
||||
auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(),
|
||||
ids2.size() + 3 * config.num_token_per_page);
|
||||
auto k = h->handle_data(true);
|
||||
cmp_handle_data(k1, k, 3);
|
||||
}
|
||||
|
||||
// common prefix
|
||||
{
|
||||
std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
|
||||
auto rids = random_ids(config.num_token_per_page * 2 + config.num_token_per_page / 2, gen);
|
||||
ids2.insert(ids2.end(), rids.begin(), rids.end());
|
||||
|
||||
auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
|
||||
auto k = h->handle_data(true);
|
||||
cmp_handle_data(k1, k, 3);
|
||||
}
|
||||
|
||||
// no prefix
|
||||
{
|
||||
std::vector<Token> ids2 = random_ids(config.num_token_per_page, gen);
|
||||
auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
|
||||
assert(h->matched_length() == 0);
|
||||
}
|
||||
|
||||
// insert partly new
|
||||
auto k2 = random_kvcache(10, gen);
|
||||
copy_kvcache(k1, k2, 0, 5);
|
||||
auto ids2 = random_ids(10 * config.num_token_per_page, gen);
|
||||
for (size_t i = 0; i < 5 * config.num_token_per_page; i++) {
|
||||
ids2[i] = ids1[i];
|
||||
}
|
||||
kvc2->raw_insert(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, {});
|
||||
|
||||
// read new part
|
||||
{
|
||||
std::vector<Token> ids(ids2.begin(), ids2.begin() + 7 * config.num_token_per_page);
|
||||
auto h = kvc2->lookup(test_model_name, test_quant_type, ids.data(), ids.size(),
|
||||
ids.size() + 7 * config.num_token_per_page);
|
||||
auto k = h->handle_data(true);
|
||||
cmp_handle_data(k, k2, 7);
|
||||
}
|
||||
|
||||
SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
|
||||
return 0;
|
||||
}
|
90
csrc/balance_serve/kvc2/test/kvc2test/lookup.cpp
Normal file
90
csrc/balance_serve/kvc2/test/kvc2test/lookup.cpp
Normal file
|
@ -0,0 +1,90 @@
|
|||
/**
|
||||
* @Description :
|
||||
* @Author : Xie Weiyu
|
||||
* @Date : 2024-11-22 08:29:45
|
||||
* @Version : 1.0.0
|
||||
* @LastEditors : Xie Weiyu
|
||||
* @LastEditTime : 2024-11-22 09:56:12
|
||||
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
**/
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
init(argc, argv);
|
||||
spdlog::set_level(spdlog::level::debug);
|
||||
auto kvc2 = kvc2::create_kvc2(config);
|
||||
|
||||
std::mt19937 gen(123);
|
||||
auto ids1 = random_ids(10 * config.num_token_per_page, gen);
|
||||
auto k1 = random_kvcache(10, gen);
|
||||
auto v1 = random_kvcache(10, gen);
|
||||
|
||||
kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, v1);
|
||||
|
||||
// complete same
|
||||
{
|
||||
auto h = kvc2->lookup(test_model_name, test_quant_type, ids1.data(), ids1.size(),
|
||||
ids1.size() + 10 * config.num_token_per_page);
|
||||
auto k = h->handle_data(true);
|
||||
auto v = h->handle_data(false);
|
||||
cmp_handle_data(k1, k, 10);
|
||||
cmp_handle_data(v1, v, 10);
|
||||
}
|
||||
|
||||
// complete prefix
|
||||
{
|
||||
std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
|
||||
auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(),
|
||||
ids2.size() + 3 * config.num_token_per_page);
|
||||
auto k = h->handle_data(true);
|
||||
auto v = h->handle_data(false);
|
||||
cmp_handle_data(k1, k, 3);
|
||||
cmp_handle_data(v1, v, 3);
|
||||
}
|
||||
|
||||
// common prefix
|
||||
{
|
||||
std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
|
||||
auto rids = random_ids(config.num_token_per_page * 2 + config.num_token_per_page / 2, gen);
|
||||
ids2.insert(ids2.end(), rids.begin(), rids.end());
|
||||
|
||||
auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
|
||||
auto k = h->handle_data(true);
|
||||
auto v = h->handle_data(false);
|
||||
cmp_handle_data(k1, k, 3);
|
||||
cmp_handle_data(v1, v, 3);
|
||||
}
|
||||
|
||||
// no prefix
|
||||
{
|
||||
std::vector<Token> ids2 = random_ids(config.num_token_per_page, gen);
|
||||
auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
|
||||
assert(h->matched_length() == 0);
|
||||
}
|
||||
|
||||
// insert partly new
|
||||
auto k2 = random_kvcache(10, gen);
|
||||
auto v2 = random_kvcache(10, gen);
|
||||
copy_kvcache(k1, k2, 0, 5);
|
||||
copy_kvcache(v1, v2, 0, 5);
|
||||
auto ids2 = random_ids(10 * config.num_token_per_page, gen);
|
||||
for (size_t i = 0; i < 5 * config.num_token_per_page; i++) {
|
||||
ids2[i] = ids1[i];
|
||||
}
|
||||
kvc2->raw_insert(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, v2);
|
||||
|
||||
// read new part
|
||||
{
|
||||
std::vector<Token> ids(ids2.begin(), ids2.begin() + 7 * config.num_token_per_page);
|
||||
auto h = kvc2->lookup(test_model_name, test_quant_type, ids.data(), ids.size(),
|
||||
ids.size() + 7 * config.num_token_per_page);
|
||||
auto k = h->handle_data(true);
|
||||
auto v = h->handle_data(false);
|
||||
cmp_handle_data(k, k2, 7);
|
||||
cmp_handle_data(v, v2, 7);
|
||||
}
|
||||
|
||||
SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
|
||||
return 0;
|
||||
}
|
99
csrc/balance_serve/kvc2/test/kvc2test/raw_insert_read.cpp
Normal file
99
csrc/balance_serve/kvc2/test/kvc2test/raw_insert_read.cpp
Normal file
|
@ -0,0 +1,99 @@
|
|||
/**
|
||||
* @Description :
|
||||
* @Author : Xie Weiyu
|
||||
* @Date : 2024-11-22 06:00:16
|
||||
* @Version : 1.0.0
|
||||
* @LastEditors : Xie Weiyu
|
||||
* @LastEditTime : 2024-11-22 07:30:46
|
||||
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
**/
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
init(argc, argv);
|
||||
spdlog::set_level(spdlog::level::debug);
|
||||
auto kvc2 = kvc2::create_kvc2(config);
|
||||
|
||||
std::mt19937 gen(123);
|
||||
auto ids1 = random_ids(10 * config.num_token_per_page, gen);
|
||||
auto k1 = random_kvcache(10, gen);
|
||||
auto v1 = random_kvcache(10, gen);
|
||||
|
||||
kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, v1);
|
||||
|
||||
// complete same
|
||||
{
|
||||
auto k2 = empty_kvcache(10);
|
||||
auto v2 = empty_kvcache(10);
|
||||
auto l2 = kvc2->raw_read(test_model_name, test_quant_type, ids1.data(), ids1.size(), k2, v2);
|
||||
assert(l2 == ids1.size());
|
||||
|
||||
cmp_handle_data(k1, k2);
|
||||
cmp_handle_data(v1, v2);
|
||||
}
|
||||
|
||||
// complete prefix
|
||||
{
|
||||
auto k2 = empty_kvcache(10);
|
||||
auto v2 = empty_kvcache(10);
|
||||
std::vector<Token> ids2 = std::vector<Token>(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
|
||||
auto l2 = kvc2->raw_read(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, v2);
|
||||
assert(l2 == 3 * config.num_token_per_page);
|
||||
|
||||
cmp_handle_data(k1, k2, 3);
|
||||
cmp_handle_data(v1, v2, 3);
|
||||
}
|
||||
|
||||
// common prefix
|
||||
{
|
||||
auto k2 = empty_kvcache(10);
|
||||
auto v2 = empty_kvcache(10);
|
||||
std::vector<Token> ids2 = std::vector<Token>(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
|
||||
auto rids = random_ids(config.num_token_per_page * 2 + config.num_token_per_page / 2, gen);
|
||||
ids2.insert(ids2.end(), rids.begin(), rids.end());
|
||||
|
||||
auto l2 = kvc2->raw_read(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, v2);
|
||||
assert(l2 == 3 * config.num_token_per_page);
|
||||
|
||||
cmp_handle_data(k1, k2, 3);
|
||||
cmp_handle_data(v1, v2, 3);
|
||||
}
|
||||
|
||||
// no prefix
|
||||
{
|
||||
auto k2 = empty_kvcache(1);
|
||||
auto v2 = empty_kvcache(1);
|
||||
std::vector<Token> ids2 = random_ids(config.num_token_per_page, gen);
|
||||
auto l2 = kvc2->raw_read(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, v2);
|
||||
assert(l2 == 0);
|
||||
}
|
||||
|
||||
// insert partly new
|
||||
auto k2 = random_kvcache(10, gen);
|
||||
auto v2 = random_kvcache(10, gen);
|
||||
copy_kvcache(k1, k2, 0, 5);
|
||||
copy_kvcache(v1, v2, 0, 5);
|
||||
auto ids2 = random_ids(10 * config.num_token_per_page, gen);
|
||||
for (size_t i = 0; i < 5 * config.num_token_per_page; i++) {
|
||||
ids2[i] = ids1[i];
|
||||
}
|
||||
kvc2->raw_insert(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, v2);
|
||||
|
||||
// read new part
|
||||
{
|
||||
auto k = empty_kvcache(10);
|
||||
auto v = empty_kvcache(10);
|
||||
std::vector<Token> ids = std::vector<Token>(ids2.begin(), ids2.begin() + 7 * config.num_token_per_page);
|
||||
|
||||
auto l = kvc2->raw_read(test_model_name, test_quant_type, ids.data(), ids.size(), k, v);
|
||||
assert(l == 7 * config.num_token_per_page);
|
||||
|
||||
cmp_handle_data(k, k2, 7);
|
||||
cmp_handle_data(v, v2, 7);
|
||||
}
|
||||
|
||||
SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,87 @@
|
|||
#include "kvcache_test_utils.cpp"
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
parse_and_check(argc, argv);
|
||||
spdlog::set_level(spdlog::level::debug);
|
||||
std::mt19937 gen(123);
|
||||
|
||||
KVC2 kvc2(FLAGS_disk_cache_path);
|
||||
// auto io = kvc2.io_dealer->start_io_thread();
|
||||
kvc2.io_dealer->start_io_thread().detach();
|
||||
|
||||
auto h1 = random_kvcache(qwen_cache_info, 10, gen);
|
||||
h1.ids = random_ids(10 * BlockLength, gen);
|
||||
kvc2.raw_insert(h1);
|
||||
|
||||
// complete same
|
||||
{
|
||||
auto h2 = empty_kvcache(qwen_cache_info, 10);
|
||||
h2.ids = h1.ids;
|
||||
kvc2.raw_read(h2);
|
||||
assert(static_cast<size_t>(h2.match.match_length) == h1.ids.size());
|
||||
|
||||
cmp_handle_data(h1, h2);
|
||||
}
|
||||
|
||||
// complete prefix
|
||||
{
|
||||
auto h2 = empty_kvcache(qwen_cache_info, 10);
|
||||
|
||||
h2.ids = std::vector<ID>(h1.ids.begin(), h1.ids.begin() + 3 * BlockLength);
|
||||
kvc2.raw_read(h2);
|
||||
assert(h2.match.match_length == 3 * BlockLength);
|
||||
|
||||
cmp_handle_data(h1, h2, 3);
|
||||
}
|
||||
|
||||
// common prefix
|
||||
{
|
||||
auto h2 = empty_kvcache(qwen_cache_info, 10);
|
||||
|
||||
h2.ids = std::vector<ID>(h1.ids.begin(), h1.ids.begin() + 5 * BlockLength);
|
||||
auto rids = random_ids(BlockLength * 2 + BlockLength / 2, gen);
|
||||
h2.ids.insert(h2.ids.end(), rids.begin(), rids.end());
|
||||
|
||||
kvc2.raw_read(h2);
|
||||
assert(h2.match.match_length == 5 * BlockLength);
|
||||
|
||||
cmp_handle_data(h1, h2, 5);
|
||||
}
|
||||
|
||||
// no prefix
|
||||
{
|
||||
auto h2 = empty_kvcache(qwen_cache_info, 10);
|
||||
|
||||
h2.ids = random_ids(10 * BlockLength, gen);
|
||||
|
||||
kvc2.raw_read(h2);
|
||||
assert(h2.match.match_length == 0);
|
||||
}
|
||||
|
||||
// insert partly new
|
||||
auto h2 = random_kvcache(qwen_cache_info, 10, gen);
|
||||
copy_kvcache(h1, h2, 0, 5);
|
||||
h2.ids = random_ids(10 * BlockLength, gen);
|
||||
for (size_t i = 0; i < 5 * BlockLength; i++) {
|
||||
h2.ids[i] = h1.ids[i];
|
||||
}
|
||||
kvc2.raw_insert(h2);
|
||||
|
||||
// read new part
|
||||
{
|
||||
auto h = empty_kvcache(qwen_cache_info, 10);
|
||||
h.ids = std::vector<ID>(h2.ids.begin(), h2.ids.begin() + 7 * BlockLength);
|
||||
h.ids.push_back(123);
|
||||
|
||||
kvc2.raw_read(h);
|
||||
assert(h.match.match_length == 7 * BlockLength);
|
||||
cmp_handle_data(h, h2, 7);
|
||||
}
|
||||
|
||||
kvc2.tree->debug();
|
||||
kvc2.io_dealer->stop();
|
||||
// io.join();
|
||||
|
||||
SPDLOG_WARN("{} Test Passed", __FILE__);
|
||||
return 0;
|
||||
}
|
52
csrc/balance_serve/kvc2/test/kvcache_mem_eviction_test.cpp
Normal file
52
csrc/balance_serve/kvc2/test/kvcache_mem_eviction_test.cpp
Normal file
|
@ -0,0 +1,52 @@
|
|||
#include "kvcache_test_utils.cpp"
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
parse_and_check(argc, argv);
|
||||
spdlog::set_level(spdlog::level::debug);
|
||||
std::mt19937 gen(123);
|
||||
|
||||
KVC2 kvc2(FLAGS_disk_cache_path);
|
||||
auto io = kvc2.io_dealer->start_io_thread();
|
||||
|
||||
SPDLOG_WARN("Insert 10 x 10 KVCache");
|
||||
std::vector<KVCacheHandle> handles(10);
|
||||
for (int i = 0; i < 10; i++) {
|
||||
handles[i] = random_kvcache(qwen_cache_info, 10, gen);
|
||||
auto& h1 = handles[i];
|
||||
h1.ids = random_ids(10 * BlockLength, gen);
|
||||
kvc2.raw_insert(h1);
|
||||
}
|
||||
|
||||
SPDLOG_WARN("Cache Eviction Test");
|
||||
{
|
||||
for (int i = 0; i < 10; i++) {
|
||||
auto& h = handles[i];
|
||||
SPDLOG_WARN("Lookup {}", i);
|
||||
auto x = kvc2.lookup(qwen_cache_info, h.ids.data(), h.ids.size());
|
||||
cmp_handle_data(h, *x);
|
||||
}
|
||||
SPDLOG_WARN("Simple Eviction OK");
|
||||
}
|
||||
|
||||
{
|
||||
std::vector<std::shared_ptr<KVCacheHandle>> lookup_handles;
|
||||
for (int i = 0; i < 10; i++) {
|
||||
auto& h = handles[i];
|
||||
SPDLOG_WARN("Lookup {}", i);
|
||||
auto x = kvc2.lookup(qwen_cache_info, h.ids.data(), h.ids.size());
|
||||
if (i >= 5) {
|
||||
assert(x == nullptr);
|
||||
continue;
|
||||
}
|
||||
lookup_handles.push_back(x);
|
||||
cmp_handle_data(h, *x);
|
||||
}
|
||||
SPDLOG_WARN("Cannot Eviction OK");
|
||||
}
|
||||
|
||||
kvc2.io_dealer->stop();
|
||||
io.join();
|
||||
|
||||
SPDLOG_WARN("{} Test Passed", __FILE__);
|
||||
return 0;
|
||||
}
|
104
csrc/balance_serve/kvc2/test/kvcache_mem_insert_read_test.cpp
Normal file
104
csrc/balance_serve/kvc2/test/kvcache_mem_insert_read_test.cpp
Normal file
|
@ -0,0 +1,104 @@
|
|||
#include "kvcache_test_utils.cpp"
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
parse_and_check(argc, argv);
|
||||
spdlog::set_level(spdlog::level::debug);
|
||||
std::mt19937 gen(123);
|
||||
|
||||
KVC2 kvc2(FLAGS_disk_cache_path);
|
||||
auto io = kvc2.io_dealer->start_io_thread();
|
||||
|
||||
SPDLOG_INFO("Disk Test");
|
||||
auto h1 = random_kvcache(qwen_cache_info, 10, gen);
|
||||
h1.ids = random_ids(10 * BlockLength, gen);
|
||||
kvc2.raw_insert(h1);
|
||||
|
||||
// complete same
|
||||
{
|
||||
auto h2 = empty_kvcache(qwen_cache_info, 10);
|
||||
h2.ids = h1.ids;
|
||||
kvc2.raw_read(h2);
|
||||
assert(static_cast<size_t>(h2.match.match_length) == h1.ids.size());
|
||||
|
||||
cmp_handle_data(h1, h2);
|
||||
}
|
||||
|
||||
// complete prefix
|
||||
{
|
||||
auto h2 = empty_kvcache(qwen_cache_info, 10);
|
||||
|
||||
h2.ids = std::vector<ID>(h1.ids.begin(), h1.ids.begin() + 3 * BlockLength);
|
||||
kvc2.raw_read(h2);
|
||||
assert(h2.match.match_length == 3 * BlockLength);
|
||||
|
||||
cmp_handle_data(h1, h2, 3);
|
||||
}
|
||||
|
||||
// common prefix
|
||||
{
|
||||
auto h2 = empty_kvcache(qwen_cache_info, 10);
|
||||
|
||||
h2.ids = std::vector<ID>(h1.ids.begin(), h1.ids.begin() + 5 * BlockLength);
|
||||
auto rids = random_ids(BlockLength * 2 + BlockLength / 2, gen);
|
||||
h2.ids.insert(h2.ids.end(), rids.begin(), rids.end());
|
||||
|
||||
kvc2.raw_read(h2);
|
||||
assert(h2.match.match_length == 5 * BlockLength);
|
||||
|
||||
cmp_handle_data(h1, h2, 5);
|
||||
}
|
||||
|
||||
// no prefix
|
||||
{
|
||||
auto h2 = empty_kvcache(qwen_cache_info, 10);
|
||||
|
||||
h2.ids = random_ids(10 * BlockLength, gen);
|
||||
|
||||
kvc2.raw_read(h2);
|
||||
assert(h2.match.match_length == 0);
|
||||
}
|
||||
|
||||
// insert partly new
|
||||
auto h2 = random_kvcache(qwen_cache_info, 10, gen);
|
||||
copy_kvcache(h1, h2, 0, 5);
|
||||
h2.ids = random_ids(10 * BlockLength, gen);
|
||||
for (size_t i = 0; i < 5 * BlockLength; i++) {
|
||||
h2.ids[i] = h1.ids[i];
|
||||
}
|
||||
kvc2.raw_insert(h2);
|
||||
|
||||
// read new part
|
||||
{
|
||||
auto h = empty_kvcache(qwen_cache_info, 10);
|
||||
h.ids = std::vector<ID>(h2.ids.begin(), h2.ids.begin() + 7 * BlockLength);
|
||||
h.ids.push_back(123);
|
||||
|
||||
kvc2.raw_read(h);
|
||||
assert(h.match.match_length == 7 * BlockLength);
|
||||
cmp_handle_data(h, h2, 7);
|
||||
}
|
||||
|
||||
SPDLOG_WARN("Memory Test");
|
||||
|
||||
{
|
||||
auto h = kvc2.lookup(qwen_cache_info, h1.ids.data(), h1.ids.size());
|
||||
assert(h);
|
||||
cmp_handle_data(h1, *h);
|
||||
kvc2.block_cache->debug();
|
||||
}
|
||||
kvc2.block_cache->debug();
|
||||
|
||||
{
|
||||
auto h = kvc2.lookup(qwen_cache_info, h1.ids.data(), 5 * BlockLength);
|
||||
assert(h);
|
||||
cmp_handle_data(h1, *h, 5);
|
||||
kvc2.block_cache->debug();
|
||||
}
|
||||
kvc2.block_cache->debug();
|
||||
|
||||
kvc2.io_dealer->stop();
|
||||
io.join();
|
||||
|
||||
SPDLOG_WARN("{} Test Passed", __FILE__);
|
||||
return 0;
|
||||
}
|
102
csrc/balance_serve/kvc2/test/kvcache_save_load_test.cpp
Normal file
102
csrc/balance_serve/kvc2/test/kvcache_save_load_test.cpp
Normal file
|
@ -0,0 +1,102 @@
|
|||
#include "kvcache_test_utils.cpp"
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
parse_and_check(argc, argv);
|
||||
spdlog::set_level(spdlog::level::debug);
|
||||
std::mt19937 gen(123);
|
||||
std::vector<KVCacheHandle> handles(10);
|
||||
|
||||
{
|
||||
KVC2 kvc2(FLAGS_disk_cache_path);
|
||||
auto io = kvc2.io_dealer->start_io_thread();
|
||||
SPDLOG_WARN("Insert 10 x 10 KVCache");
|
||||
for (int i = 0; i < 10; i++) {
|
||||
handles[i] = random_kvcache(qwen_cache_info, 10, gen);
|
||||
auto& h1 = handles[i];
|
||||
h1.ids = random_ids(10 * BlockLength, gen);
|
||||
kvc2.raw_insert(h1);
|
||||
}
|
||||
|
||||
kvc2.save();
|
||||
kvc2.tree->debug();
|
||||
|
||||
kvc2.io_dealer->stop();
|
||||
io.join();
|
||||
}
|
||||
{
|
||||
KVC2 kvc2(FLAGS_disk_cache_path);
|
||||
auto io = kvc2.io_dealer->start_io_thread();
|
||||
kvc2.load();
|
||||
kvc2.tree->debug();
|
||||
auto& h1 = handles[0];
|
||||
// complete same
|
||||
{
|
||||
auto h2 = empty_kvcache(qwen_cache_info, 10);
|
||||
h2.ids = h1.ids;
|
||||
kvc2.raw_read(h2);
|
||||
assert(static_cast<size_t>(h2.match.match_length) == h1.ids.size());
|
||||
|
||||
cmp_handle_data(h1, h2);
|
||||
}
|
||||
|
||||
// complete prefix
|
||||
{
|
||||
auto h2 = empty_kvcache(qwen_cache_info, 10);
|
||||
|
||||
h2.ids = std::vector<ID>(h1.ids.begin(), h1.ids.begin() + 3 * BlockLength);
|
||||
kvc2.raw_read(h2);
|
||||
assert(h2.match.match_length == 3 * BlockLength);
|
||||
|
||||
cmp_handle_data(h1, h2, 3);
|
||||
}
|
||||
|
||||
// common prefix
|
||||
{
|
||||
auto h2 = empty_kvcache(qwen_cache_info, 10);
|
||||
|
||||
h2.ids = std::vector<ID>(h1.ids.begin(), h1.ids.begin() + 5 * BlockLength);
|
||||
auto rids = random_ids(BlockLength * 2 + BlockLength / 2, gen);
|
||||
h2.ids.insert(h2.ids.end(), rids.begin(), rids.end());
|
||||
|
||||
kvc2.raw_read(h2);
|
||||
assert(h2.match.match_length == 5 * BlockLength);
|
||||
|
||||
cmp_handle_data(h1, h2, 5);
|
||||
}
|
||||
|
||||
// no prefix
|
||||
{
|
||||
auto h2 = empty_kvcache(qwen_cache_info, 10);
|
||||
|
||||
h2.ids = random_ids(10 * BlockLength, gen);
|
||||
|
||||
kvc2.raw_read(h2);
|
||||
assert(h2.match.match_length == 0);
|
||||
}
|
||||
|
||||
// insert partly new
|
||||
auto h2 = random_kvcache(qwen_cache_info, 10, gen);
|
||||
copy_kvcache(h1, h2, 0, 5);
|
||||
h2.ids = random_ids(10 * BlockLength, gen);
|
||||
for (size_t i = 0; i < 5 * BlockLength; i++) {
|
||||
h2.ids[i] = h1.ids[i];
|
||||
}
|
||||
kvc2.raw_insert(h2);
|
||||
|
||||
// read new part
|
||||
{
|
||||
auto h = empty_kvcache(qwen_cache_info, 10);
|
||||
h.ids = std::vector<ID>(h2.ids.begin(), h2.ids.begin() + 7 * BlockLength);
|
||||
h.ids.push_back(123);
|
||||
|
||||
kvc2.raw_read(h);
|
||||
assert(h.match.match_length == 7 * BlockLength);
|
||||
cmp_handle_data(h, h2, 7);
|
||||
}
|
||||
|
||||
kvc2.io_dealer->stop();
|
||||
io.join();
|
||||
}
|
||||
SPDLOG_WARN("{} Test Passed", __FILE__);
|
||||
return 0;
|
||||
}
|
0
csrc/balance_serve/kvc2/test/kvcache_test_utils.cpp
Normal file
0
csrc/balance_serve/kvc2/test/kvcache_test_utils.cpp
Normal file
57
csrc/balance_serve/kvc2/test/page_pool_test.cpp
Normal file
57
csrc/balance_serve/kvc2/test/page_pool_test.cpp
Normal file
|
@ -0,0 +1,57 @@
|
|||
|
||||
#include <unistd.h>
|
||||
#include <iostream>
|
||||
#include <random>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include "page_aligned_memory_pool.cpp"
|
||||
|
||||
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
|
||||
#define FMT_HEADER_ONLY
|
||||
#include "spdlog/spdlog.h"
|
||||
|
||||
// 每个线程执行的任务
|
||||
void thread_task(PageAlignedMemoryPool& pool) {
|
||||
std::mt19937 gen(123);
|
||||
std::vector<std::pair<void*, size_t>> allocated;
|
||||
size_t cnt = 40000;
|
||||
for (size_t i = 0; i < cnt; ++i) {
|
||||
// 随机分配一个大小
|
||||
size_t size = (gen() % 100 + 1) * 4096 * 4;
|
||||
void* ptr = pool.alloc(size);
|
||||
// SPDLOG_DEBUG(pool.debug());
|
||||
if (ptr) {
|
||||
pool.free(ptr, size);
|
||||
// allocated.push_back({ptr, size});
|
||||
}
|
||||
// sleep((int)(gen() % 1000) / 1000.0);
|
||||
}
|
||||
// free all memory
|
||||
for (auto& p : allocated) {
|
||||
pool.free(p.first, p.second);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
spdlog::set_level(spdlog::level::debug);
|
||||
|
||||
// 创建一个内存池
|
||||
PageAlignedMemoryPool pool(40ll * 1024 * 1024 * 1024); // 40 G
|
||||
|
||||
// 创建线程
|
||||
const int num_threads = 32;
|
||||
std::vector<std::thread> threads;
|
||||
for (int i = 0; i < num_threads; ++i) {
|
||||
threads.emplace_back(thread_task, std::ref(pool));
|
||||
}
|
||||
|
||||
// 等待所有线程完成
|
||||
for (auto& t : threads) {
|
||||
t.join();
|
||||
}
|
||||
|
||||
// 输出调试信息
|
||||
std::cout << pool.debug() << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
0
csrc/balance_serve/kvc2/test/prefix_test.cpp
Normal file
0
csrc/balance_serve/kvc2/test/prefix_test.cpp
Normal file
61
csrc/balance_serve/kvc2/test/pytest_load.py
Normal file
61
csrc/balance_serve/kvc2/test/pytest_load.py
Normal file
|
@ -0,0 +1,61 @@
|
|||
import sys
|
||||
sys.path.append('./build')
|
||||
sys.path.append('./src')
|
||||
import torch
|
||||
import kvc2_ext
|
||||
from kvc2_utils import get_tensor_from_data_ptr
|
||||
|
||||
# Create a kvc2 instance
|
||||
path = "/mnt/data/kvc2"
|
||||
kvc2_instance = kvc2_ext.create_kvc2(path,int(10e9)) # 10 G memory pool
|
||||
kvc2_ext.load(kvc2_instance)
|
||||
|
||||
# Start IO thread
|
||||
print("Start IO thread")
|
||||
kvc2_ext.start_io_thread(kvc2_instance)
|
||||
print("IO thread started")
|
||||
|
||||
# Create CacheInfoInput
|
||||
test_info = kvc2_ext.CacheInfoInput()
|
||||
test_info.model_type = kvc2_ext.ModelType.MT_DeepseekV2
|
||||
test_info.cache_type = kvc2_ext.CacheType.CT_KeyCache
|
||||
test_info.quant_type = kvc2_ext.QuantType.QT_F32
|
||||
|
||||
print("Element size: ", test_info.element_size())
|
||||
|
||||
# Generate random test IDs (length = 2560)
|
||||
torch.manual_seed(123)
|
||||
length = 2560
|
||||
test_id = torch.randint(0, 65536, (length,), dtype=torch.uint16).contiguous()
|
||||
block_count = (length+255) // 256
|
||||
# print("Test ID: ", test_id)
|
||||
|
||||
# Generate test data based on element size and hidden layer count
|
||||
element_size = test_info.element_size()
|
||||
hidden_layer_count = test_info.hidden_layer_count()
|
||||
|
||||
def read_cmp_and_release(kvc2_instance,cache_info,ids,length):
|
||||
handle = kvc2_ext.lookup(kvc2_instance, cache_info, ids, length)
|
||||
if kvc2_ext.is_nullptr(handle):
|
||||
print("Handle is nullptr.")
|
||||
exit()
|
||||
matched_length = kvc2_ext.matched_length(handle)
|
||||
matched_data = kvc2_ext.handle_data(handle)
|
||||
print('Matched length: ', matched_length)
|
||||
if matched_length >0:
|
||||
print(f'First layer address {[hex(x) for x in matched_data[0]]}')
|
||||
read_data = get_tensor_from_data_ptr(matched_data,element_size)
|
||||
|
||||
print("Just read check ok.")
|
||||
kvc2_ext.release(handle)
|
||||
|
||||
|
||||
l = 128
|
||||
while l<=length:
|
||||
read_cmp_and_release(kvc2_instance,test_info,test_id.data_ptr(),l)
|
||||
l+=128
|
||||
|
||||
kvc2_ext.destroy_kvc2(kvc2_instance)
|
||||
|
||||
|
||||
print("Test completed successfully.")
|
83
csrc/balance_serve/kvc2/test/pytest_mem_prefix_test.py
Normal file
83
csrc/balance_serve/kvc2/test/pytest_mem_prefix_test.py
Normal file
|
@ -0,0 +1,83 @@
|
|||
import sys
|
||||
sys.path.append('./build')
|
||||
sys.path.append('./src')
|
||||
import torch
|
||||
import kvc2_ext
|
||||
from kvc2_utils import alloc_aligned_cache,dealloc_aligned_cache,get_tensor_ptr,get_tensor_from_data_ptr
|
||||
|
||||
# Create a kvc2 instance
|
||||
path = "/mnt/data/kvc2"
|
||||
kvc2_instance = kvc2_ext.create_kvc2(path,int(10e9)) # 10 G memory pool
|
||||
|
||||
# Start IO thread
|
||||
print("Start IO thread")
|
||||
kvc2_ext.start_io_thread(kvc2_instance)
|
||||
print("IO thread started")
|
||||
|
||||
# Create CacheInfoInput
|
||||
test_info = kvc2_ext.CacheInfoInput()
|
||||
test_info.model_type = kvc2_ext.ModelType.MT_DeepseekV2
|
||||
test_info.cache_type = kvc2_ext.CacheType.CT_KeyCache
|
||||
test_info.quant_type = kvc2_ext.QuantType.QT_F32
|
||||
|
||||
print("Element size: ", test_info.element_size())
|
||||
|
||||
# Generate random test IDs (length = 2560)
|
||||
torch.manual_seed(123)
|
||||
length = 2560
|
||||
test_id = torch.randint(0, 65536, (length,), dtype=torch.uint16).contiguous()
|
||||
block_count = (length+255) // 256
|
||||
# print("Test ID: ", test_id)
|
||||
|
||||
# Generate test data based on element size and hidden layer count
|
||||
element_size = test_info.element_size()
|
||||
hidden_layer_count = test_info.hidden_layer_count()
|
||||
|
||||
write_data,write_data_mem = alloc_aligned_cache(hidden_layer_count,block_count,element_size)
|
||||
# print(test_data,test_data_mem)
|
||||
print('Generate Insert Data')
|
||||
for layer in write_data:
|
||||
for data in layer:
|
||||
random_values = torch.randint(0, 256, (element_size,), dtype=torch.uint8)
|
||||
data.copy_(random_values)
|
||||
|
||||
print('Insert New data')
|
||||
# Insert raw data
|
||||
kvc2_ext.raw_insert(kvc2_instance, test_info, test_id.data_ptr(), length, get_tensor_ptr(write_data))
|
||||
|
||||
|
||||
def read_cmp_and_release(kvc2_instance,cache_info,ids,length):
|
||||
handle = kvc2_ext.lookup(kvc2_instance, cache_info, ids, length)
|
||||
if kvc2_ext.is_nullptr(handle):
|
||||
print("Handle is nullptr.")
|
||||
exit()
|
||||
matched_length = kvc2_ext.matched_length(handle)
|
||||
matched_data = kvc2_ext.handle_data(handle)
|
||||
print('Matched length: ', matched_length)
|
||||
if matched_length >0:
|
||||
print(f'First layer address {[hex(x) for x in matched_data[0]]}')
|
||||
read_data = get_tensor_from_data_ptr(matched_data,element_size)
|
||||
|
||||
for layer_w,layer_r in zip(write_data,read_data):
|
||||
for data_w,data_r in zip(layer_w,layer_r):
|
||||
# print(data_w,data_r)
|
||||
assert torch.equal(data_w,data_r)
|
||||
print("Lookup read check ok.")
|
||||
kvc2_ext.release(handle)
|
||||
|
||||
|
||||
l = 128
|
||||
while l<=length:
|
||||
read_cmp_and_release(kvc2_instance,test_info,test_id.data_ptr(),l)
|
||||
l+=128
|
||||
|
||||
|
||||
|
||||
dealloc_aligned_cache(write_data_mem)
|
||||
|
||||
|
||||
kvc2_ext.save(kvc2_instance)
|
||||
kvc2_ext.destroy_kvc2(kvc2_instance)
|
||||
|
||||
|
||||
print("Test completed successfully.")
|
72
csrc/balance_serve/kvc2/test/pytest_mem_read.py
Normal file
72
csrc/balance_serve/kvc2/test/pytest_mem_read.py
Normal file
|
@ -0,0 +1,72 @@
|
|||
import sys
|
||||
sys.path.append('./build')
|
||||
sys.path.append('./src')
|
||||
import torch
|
||||
import kvc2_ext
|
||||
from kvc2_utils import alloc_aligned_cache,dealloc_aligned_cache,get_tensor_ptr,get_tensor_from_data_ptr
|
||||
|
||||
# Create a kvc2 instance
|
||||
path = "/mnt/data/kvc2"
|
||||
kvc2_instance = kvc2_ext.create_kvc2(path,int(10e9)) # 10 G memory pool
|
||||
|
||||
# Start IO thread
|
||||
print("Start IO thread")
|
||||
kvc2_ext.start_io_thread(kvc2_instance)
|
||||
print("IO thread started")
|
||||
|
||||
# Create CacheInfoInput
|
||||
test_info = kvc2_ext.CacheInfoInput()
|
||||
test_info.model_type = kvc2_ext.ModelType.MT_DeepseekV2
|
||||
test_info.cache_type = kvc2_ext.CacheType.CT_KeyCache
|
||||
test_info.quant_type = kvc2_ext.QuantType.QT_F32
|
||||
|
||||
print("Element size: ", test_info.element_size())
|
||||
|
||||
# Generate random test IDs (length = 2560)
|
||||
length = 2560
|
||||
test_id = torch.randint(0, 65536, (length,), dtype=torch.uint16).contiguous()
|
||||
block_count = (length+255) // 256
|
||||
# print("Test ID: ", test_id)
|
||||
|
||||
# Generate test data based on element size and hidden layer count
|
||||
element_size = test_info.element_size()
|
||||
hidden_layer_count = test_info.hidden_layer_count()
|
||||
|
||||
write_data,write_data_mem = alloc_aligned_cache(hidden_layer_count,block_count,element_size)
|
||||
# print(test_data,test_data_mem)
|
||||
print('Generate Insert Data')
|
||||
for layer in write_data:
|
||||
for data in layer:
|
||||
random_values = torch.randint(0, 256, (element_size,), dtype=torch.uint8)
|
||||
data.copy_(random_values)
|
||||
|
||||
print('Insert New data')
|
||||
# Insert raw data
|
||||
kvc2_ext.raw_insert(kvc2_instance, test_info, test_id.data_ptr(), length, get_tensor_ptr(write_data))
|
||||
|
||||
|
||||
handle = kvc2_ext.lookup(kvc2_instance, test_info, test_id.data_ptr(), length)
|
||||
matched_length = kvc2_ext.matched_length(handle)
|
||||
matched_data = kvc2_ext.handle_data(handle)
|
||||
|
||||
print('Matched length: ', matched_length)
|
||||
print(f'Match data layer {len(matched_data)}')
|
||||
print(f'Match layer block count {len(matched_data[0])}')
|
||||
read_data = get_tensor_from_data_ptr(matched_data,element_size)
|
||||
|
||||
|
||||
for layer_w,layer_r in zip(write_data,read_data):
|
||||
for data_w,data_r in zip(layer_w,layer_r):
|
||||
# print(data_w,data_r)
|
||||
assert torch.equal(data_w,data_r)
|
||||
print("Lookup read check ok.")
|
||||
|
||||
dealloc_aligned_cache(write_data_mem)
|
||||
|
||||
|
||||
kvc2_ext.save(kvc2_instance)
|
||||
|
||||
|
||||
|
||||
|
||||
print("Test completed successfully.")
|
69
csrc/balance_serve/kvc2/test/pytest_raw_insert_and_read.py
Normal file
69
csrc/balance_serve/kvc2/test/pytest_raw_insert_and_read.py
Normal file
|
@ -0,0 +1,69 @@
|
|||
import sys
|
||||
sys.path.append('./build')
|
||||
sys.path.append('./src')
|
||||
import torch
|
||||
import kvc2_ext
|
||||
from kvc2_utils import alloc_aligned_cache,dealloc_aligned_cache,get_tensor_ptr
|
||||
|
||||
# Create a kvc2 instance
|
||||
path = "/mnt/data/kvc2"
|
||||
kvc2_instance = kvc2_ext.create_kvc2(path,int(10e9)) # 10 G memory pool
|
||||
|
||||
# Start IO thread
|
||||
print("Start IO thread")
|
||||
kvc2_ext.start_io_thread(kvc2_instance)
|
||||
print("IO thread started")
|
||||
|
||||
# Create CacheInfoInput
|
||||
test_info = kvc2_ext.CacheInfoInput()
|
||||
test_info.model_type = kvc2_ext.ModelType.MT_DeepseekV2
|
||||
test_info.cache_type = kvc2_ext.CacheType.CT_KeyCache
|
||||
test_info.quant_type = kvc2_ext.QuantType.QT_F32
|
||||
|
||||
print("Element size: ", test_info.element_size())
|
||||
|
||||
# Generate random test IDs (length = 2560)
|
||||
length = 2560
|
||||
test_id = torch.randint(0, 65536, (length,), dtype=torch.uint16).contiguous()
|
||||
block_count = (length+255) // 256
|
||||
# print("Test ID: ", test_id)
|
||||
|
||||
# Generate test data based on element size and hidden layer count
|
||||
element_size = test_info.element_size()
|
||||
hidden_layer_count = test_info.hidden_layer_count()
|
||||
|
||||
write_data,write_data_mem = alloc_aligned_cache(hidden_layer_count,block_count,element_size)
|
||||
# print(test_data,test_data_mem)
|
||||
print('Generate Insert Data')
|
||||
for layer in write_data:
|
||||
for data in layer:
|
||||
random_values = torch.randint(0, 256, (element_size,), dtype=torch.uint8)
|
||||
data.copy_(random_values)
|
||||
|
||||
print('Insert New data')
|
||||
# Insert raw data
|
||||
kvc2_ext.raw_insert(kvc2_instance, test_info, test_id.data_ptr(), length, get_tensor_ptr(write_data))
|
||||
|
||||
|
||||
read_data,read_data_mem = alloc_aligned_cache(hidden_layer_count,block_count,element_size)
|
||||
|
||||
print('Raw read')
|
||||
matched_length = kvc2_ext.raw_read(kvc2_instance, test_info, test_id.data_ptr(), length,get_tensor_ptr(read_data))
|
||||
|
||||
print('Matched length: ', matched_length)
|
||||
for layer_w,layer_r in zip(write_data,read_data):
|
||||
for data_w,data_r in zip(layer_w,layer_r):
|
||||
# print(data_w,data_r)
|
||||
assert torch.equal(data_w,data_r)
|
||||
print("Raw read check ok.")
|
||||
|
||||
dealloc_aligned_cache(write_data_mem)
|
||||
dealloc_aligned_cache(read_data_mem)
|
||||
|
||||
|
||||
kvc2_ext.save(kvc2_instance)
|
||||
|
||||
|
||||
|
||||
|
||||
print("Test completed successfully.")
|
32
csrc/balance_serve/kvc2/test/test_align.py
Normal file
32
csrc/balance_serve/kvc2/test/test_align.py
Normal file
|
@ -0,0 +1,32 @@
|
|||
import ctypes
|
||||
import torch
|
||||
|
||||
def aligned_tensor(size, alignment=4096):
|
||||
num_bytes = size
|
||||
mem = ctypes.c_void_p()
|
||||
error_code = ctypes.CDLL(None).posix_memalign(
|
||||
ctypes.byref(mem), ctypes.c_size_t(alignment), ctypes.c_size_t(num_bytes)
|
||||
)
|
||||
|
||||
if error_code != 0:
|
||||
raise MemoryError(f"posix_memalign failed with error code {error_code}")
|
||||
|
||||
array_type = (ctypes.c_int8 * size)
|
||||
raw_array = array_type.from_address(mem.value)
|
||||
|
||||
tensor = torch.frombuffer(raw_array, dtype=torch.int8)
|
||||
|
||||
if tensor.data_ptr() % alignment != 0:
|
||||
raise ValueError(f"Tensor data_ptr {tensor.data_ptr()} is not aligned to {alignment} bytes")
|
||||
|
||||
return tensor, mem
|
||||
|
||||
|
||||
size = 5124380
|
||||
tensor, mem_ptr = aligned_tensor(size, alignment=4096)
|
||||
|
||||
print(f"Tensor: {tensor}, size: {tensor.size()}, dataptr: {tensor.data_ptr()}")
|
||||
print(f"Tensor memory alignment: {tensor.data_ptr() % 4096 == 0}")
|
||||
print(f"Allocated memory address: {mem_ptr.value}")
|
||||
|
||||
ctypes.CDLL(None).free(mem_ptr)
|
145
csrc/balance_serve/kvc2/test/test_cuda_stream.cpp
Normal file
145
csrc/balance_serve/kvc2/test/test_cuda_stream.cpp
Normal file
|
@ -0,0 +1,145 @@
|
|||
#include <cuda_runtime.h>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <stdexcept>
|
||||
#include <vector>
|
||||
|
||||
class CudaStreamManager {
|
||||
public:
|
||||
CudaStreamManager(int num_streams);
|
||||
~CudaStreamManager();
|
||||
|
||||
// Request structure
|
||||
struct Request {
|
||||
std::vector<void*> host_mem_addresses;
|
||||
std::vector<void*> device_mem_addresses;
|
||||
std::vector<size_t> sizes;
|
||||
cudaMemcpyKind direction;
|
||||
std::function<void()> callback;
|
||||
};
|
||||
|
||||
void submitRequest(const Request& request);
|
||||
|
||||
private:
|
||||
int num_streams_;
|
||||
std::vector<cudaStream_t> streams_;
|
||||
int next_stream_index_;
|
||||
};
|
||||
|
||||
CudaStreamManager::CudaStreamManager(int num_streams) : num_streams_(num_streams), next_stream_index_(0) {
|
||||
streams_.resize(num_streams_);
|
||||
for (int i = 0; i < num_streams_; ++i) {
|
||||
cudaError_t err = cudaStreamCreate(&streams_[i]);
|
||||
if (err != cudaSuccess) {
|
||||
std::cerr << "Failed to create CUDA stream: " << cudaGetErrorString(err) << std::endl;
|
||||
for (int j = 0; j < i; ++j) {
|
||||
cudaStreamDestroy(streams_[j]);
|
||||
}
|
||||
throw std::runtime_error("Failed to create CUDA stream");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
CudaStreamManager::~CudaStreamManager() {
|
||||
for (int i = 0; i < num_streams_; ++i) {
|
||||
cudaStreamDestroy(streams_[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void CudaStreamManager::submitRequest(const Request& request) {
|
||||
int stream_index = next_stream_index_;
|
||||
cudaStream_t stream = streams_[stream_index];
|
||||
next_stream_index_ = (next_stream_index_ + 1) % num_streams_;
|
||||
|
||||
size_t num_transfers = request.host_mem_addresses.size();
|
||||
for (size_t i = 0; i < num_transfers; ++i) {
|
||||
cudaError_t err = cudaMemcpyAsync(request.device_mem_addresses[i], request.host_mem_addresses[i], request.sizes[i],
|
||||
request.direction, stream);
|
||||
if (err != cudaSuccess) {
|
||||
std::cerr << "cudaMemcpyAsync failed: " << cudaGetErrorString(err) << std::endl;
|
||||
throw std::runtime_error("cudaMemcpyAsync failed");
|
||||
}
|
||||
}
|
||||
|
||||
// Enqueue the callback function
|
||||
struct CallbackData {
|
||||
std::function<void()> callback;
|
||||
};
|
||||
|
||||
CallbackData* cb_data = new CallbackData{request.callback};
|
||||
|
||||
cudaError_t err = cudaLaunchHostFunc(
|
||||
stream,
|
||||
[](void* data) {
|
||||
CallbackData* cb_data = static_cast<CallbackData*>(data);
|
||||
cb_data->callback();
|
||||
delete cb_data;
|
||||
},
|
||||
cb_data);
|
||||
|
||||
if (err != cudaSuccess) {
|
||||
std::cerr << "cudaLaunchHostFunc failed: " << cudaGetErrorString(err) << std::endl;
|
||||
throw std::runtime_error("cudaLaunchHostFunc failed");
|
||||
}
|
||||
}
|
||||
|
||||
// Example usage
|
||||
int main() {
|
||||
try {
|
||||
CudaStreamManager stream_manager(4); // Create a manager with 4 streams
|
||||
|
||||
// Prepare host and device memory
|
||||
const size_t num_pages = 10;
|
||||
std::vector<void*> host_mem_addresses(num_pages);
|
||||
std::vector<void*> device_mem_addresses(num_pages);
|
||||
std::vector<size_t> sizes(num_pages, 4096); // 4KB pages
|
||||
|
||||
// Allocate host memory
|
||||
for (size_t i = 0; i < num_pages; ++i) {
|
||||
host_mem_addresses[i] = malloc(4096);
|
||||
if (!host_mem_addresses[i]) {
|
||||
throw std::runtime_error("Failed to allocate host memory");
|
||||
}
|
||||
// Initialize data if necessary
|
||||
}
|
||||
|
||||
// Allocate device memory
|
||||
for (size_t i = 0; i < num_pages; ++i) {
|
||||
cudaError_t err = cudaMalloc(&device_mem_addresses[i], 4096);
|
||||
if (err != cudaSuccess) {
|
||||
std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << std::endl;
|
||||
throw std::runtime_error("cudaMalloc failed");
|
||||
}
|
||||
}
|
||||
|
||||
// Create a request
|
||||
CudaStreamManager::Request request;
|
||||
request.host_mem_addresses = host_mem_addresses;
|
||||
request.device_mem_addresses = device_mem_addresses;
|
||||
request.sizes = sizes;
|
||||
request.direction = cudaMemcpyHostToDevice;
|
||||
request.callback = []() { std::cout << "Data transfer completed!" << std::endl; };
|
||||
|
||||
// Submit the request
|
||||
stream_manager.submitRequest(request);
|
||||
|
||||
// Wait for all streams to complete
|
||||
cudaError_t err = cudaDeviceSynchronize();
|
||||
if (err != cudaSuccess) {
|
||||
std::cerr << "cudaDeviceSynchronize failed: " << cudaGetErrorString(err) << std::endl;
|
||||
throw std::runtime_error("cudaDeviceSynchronize failed");
|
||||
}
|
||||
|
||||
// Clean up
|
||||
for (size_t i = 0; i < num_pages; ++i) {
|
||||
free(host_mem_addresses[i]);
|
||||
cudaFree(device_mem_addresses[i]);
|
||||
}
|
||||
|
||||
} catch (const std::exception& e) {
|
||||
std::cerr << "Exception: " << e.what() << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
113
csrc/balance_serve/kvc2/test/test_cuda_stream_manager.cpp
Normal file
113
csrc/balance_serve/kvc2/test/test_cuda_stream_manager.cpp
Normal file
|
@ -0,0 +1,113 @@
|
|||
#include "cuda_stream_manager.hh"
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <stdexcept>
|
||||
#include <vector>
|
||||
|
||||
int main() {
|
||||
try {
|
||||
int num_devices = 0;
|
||||
cudaError_t err = cudaGetDeviceCount(&num_devices);
|
||||
if (err != cudaSuccess) {
|
||||
std::cerr << "cudaGetDeviceCount failed: " << cudaGetErrorString(err) << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (num_devices < 1) {
|
||||
std::cerr << "未找到 CUDA 设备。" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::vector<size_t> device_ids;
|
||||
for (int i = 0; i < num_devices; ++i) {
|
||||
device_ids.push_back(i);
|
||||
}
|
||||
|
||||
const size_t num_pages = 10;
|
||||
const size_t page_size = 4096; // 每页 4KB
|
||||
|
||||
// 创建 CudaStreamManager 实例,管理所有设备
|
||||
CudaStreamManager stream_manager(device_ids, 4);
|
||||
|
||||
// 准备主机内存和设备内存映射
|
||||
std::vector<std::vector<void*>> host_mem_addresses(num_devices);
|
||||
std::vector<std::vector<void*>> device_mem_addresses(num_devices);
|
||||
|
||||
// 分配主机内存
|
||||
for (size_t i = 0; i < num_pages; ++i) {
|
||||
void* host_ptr = malloc(page_size);
|
||||
if (!host_ptr) {
|
||||
throw std::runtime_error("Failed to allocate host memory");
|
||||
}
|
||||
// 如果需要,初始化数据
|
||||
|
||||
// 将相同的主机内存添加到每个设备的列表中
|
||||
for (int device_id = 0; device_id < num_devices; ++device_id) {
|
||||
host_mem_addresses[device_id].push_back(host_ptr);
|
||||
}
|
||||
}
|
||||
|
||||
// 为每个设备分配设备内存
|
||||
for (int device_id = 0; device_id < num_devices; ++device_id) {
|
||||
err = cudaSetDevice(device_id);
|
||||
if (err != cudaSuccess) {
|
||||
std::cerr << "cudaSetDevice failed: " << cudaGetErrorString(err) << std::endl;
|
||||
throw std::runtime_error("cudaSetDevice failed");
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < num_pages; ++i) {
|
||||
void* device_ptr;
|
||||
err = cudaMalloc(&device_ptr, page_size);
|
||||
if (err != cudaSuccess) {
|
||||
std::cerr << "cudaMalloc failed on device " << device_id << ": " << cudaGetErrorString(err) << std::endl;
|
||||
throw std::runtime_error("cudaMalloc failed");
|
||||
}
|
||||
device_mem_addresses[device_id].push_back(device_ptr);
|
||||
}
|
||||
}
|
||||
|
||||
// 为每个设备创建并提交请求
|
||||
for (int device_id = 0; device_id < num_devices; ++device_id) {
|
||||
auto request = std::shared_ptr<CudaStreamManager::Request>(new CudaStreamManager::Request);
|
||||
request->device_id = device_id;
|
||||
request->host_mem_addresses = host_mem_addresses[device_id];
|
||||
request->device_mem_addresses = device_mem_addresses[device_id];
|
||||
request->sizes = std::vector<size_t>(num_pages, page_size);
|
||||
request->direction = cudaMemcpyHostToDevice;
|
||||
request->callback = [device_id]() {
|
||||
std::cout << "Device " << device_id << " data transfer completed!" << std::endl;
|
||||
};
|
||||
|
||||
stream_manager.submitRequest(request);
|
||||
}
|
||||
|
||||
// 等待一段时间,确保所有请求都被处理
|
||||
// 在实际应用中,可以使用更好的同步机制
|
||||
std::this_thread::sleep_for(std::chrono::seconds(5));
|
||||
|
||||
// 清理主机内存
|
||||
for (size_t i = 0; i < num_pages; ++i) {
|
||||
free(host_mem_addresses[0][i]); // 所有设备共享相同的主机内存,只需释放一次
|
||||
}
|
||||
|
||||
// 清理设备内存
|
||||
for (int device_id = 0; device_id < num_devices; ++device_id) {
|
||||
err = cudaSetDevice(device_id);
|
||||
if (err != cudaSuccess) {
|
||||
std::cerr << "cudaSetDevice failed during cleanup: " << cudaGetErrorString(err) << std::endl;
|
||||
continue;
|
||||
}
|
||||
for (void* ptr : device_mem_addresses[device_id]) {
|
||||
cudaFree(ptr);
|
||||
}
|
||||
}
|
||||
|
||||
} catch (const std::exception& e) {
|
||||
std::cerr << "异常: " << e.what() << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
56
csrc/balance_serve/kvc2/test/test_lock_free_queue.cpp
Normal file
56
csrc/balance_serve/kvc2/test/test_lock_free_queue.cpp
Normal file
|
@ -0,0 +1,56 @@
|
|||
#include <chrono>
|
||||
#include <iostream>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include "utils/lock_free_queue.hpp"
|
||||
|
||||
struct Item {
|
||||
int value;
|
||||
std::promise<void> promise;
|
||||
};
|
||||
|
||||
int main() {
|
||||
MPSCQueue<Item> queue;
|
||||
|
||||
std::vector<std::thread> producers;
|
||||
const int num_producers = 4;
|
||||
const int items_per_producer = 5;
|
||||
|
||||
// 启动生产者线程
|
||||
for (int i = 0; i < num_producers; ++i) {
|
||||
producers.emplace_back([&queue, i]() {
|
||||
for (int j = 0; j < items_per_producer; ++j) {
|
||||
auto item = std::make_shared<Item>();
|
||||
item->value = i * items_per_producer + j;
|
||||
std::future<void> future = item->promise.get_future();
|
||||
queue.enqueue(item);
|
||||
future.wait(); // 等待消费者处理完成
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// 启动消费者线程
|
||||
std::thread consumer([&queue, num_producers, items_per_producer]() {
|
||||
int total_items = num_producers * items_per_producer;
|
||||
int processed = 0;
|
||||
while (processed < total_items) {
|
||||
std::shared_ptr<Item> item = queue.dequeue();
|
||||
if (item) {
|
||||
std::cout << "Consumed item with value: " << item->value << std::endl;
|
||||
item->promise.set_value(); // 通知生产者
|
||||
++processed;
|
||||
} else {
|
||||
// 如果队列为空,可以选择休眠或让出线程
|
||||
std::this_thread::yield();
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// 等待所有线程完成
|
||||
for (auto& producer : producers) {
|
||||
producer.join();
|
||||
}
|
||||
consumer.join();
|
||||
|
||||
return 0;
|
||||
}
|
163
csrc/balance_serve/kvc2/test/test_periodic_task.cpp
Normal file
163
csrc/balance_serve/kvc2/test/test_periodic_task.cpp
Normal file
|
@ -0,0 +1,163 @@
|
|||
#include <atomic>
|
||||
#include <cassert>
|
||||
#include <chrono>
|
||||
#include <cstdio>
|
||||
#include <future>
|
||||
#include <iostream>
|
||||
#include <thread>
|
||||
#include "utils/periodic_task.hpp"
|
||||
|
||||
// 1. 任务是否按预期执行
|
||||
void testPeriodicTaskExecution() {
|
||||
std::atomic<int> execution_count{0};
|
||||
auto task = [&execution_count]() { execution_count++; };
|
||||
|
||||
periodic::PeriodicTask periodic_task(task, std::chrono::milliseconds(50));
|
||||
|
||||
std::this_thread::sleep_for(std::chrono::seconds(2));
|
||||
|
||||
assert(execution_count >= 20); // 确保任务执行了至少 20 次
|
||||
std::cout << "Test 1 passed: Task executed periodically." << std::endl;
|
||||
std::cout << "Task executed " << execution_count.load() << " times." << std::endl;
|
||||
}
|
||||
|
||||
// 2. 提前唤醒任务的功能
|
||||
void testWakeUpImmediately() {
|
||||
std::atomic<int> execution_count{0};
|
||||
auto task = [&execution_count]() { execution_count++; };
|
||||
|
||||
periodic::PeriodicTask periodic_task(task, std::chrono::milliseconds(200));
|
||||
|
||||
// 提前唤醒任务
|
||||
periodic_task.wakeUp();
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(50)); // 等待任务执行
|
||||
|
||||
std::cout << "Execution count after wakeUp: " << execution_count.load() << std::endl;
|
||||
assert(execution_count == 1); // 确保任务立即执行
|
||||
std::cout << "Test 2 passed: Task woke up immediately." << std::endl;
|
||||
}
|
||||
|
||||
// 3. wakeUpWait() 的等待功能
|
||||
void testWakeUpWait() {
|
||||
std::promise<void> promise;
|
||||
std::future<void> future = promise.get_future();
|
||||
auto task = [&promise]() {
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // 模拟任务执行
|
||||
promise.set_value(); // 任务完成时设置 promise
|
||||
};
|
||||
|
||||
periodic::PeriodicTask periodic_task(task, std::chrono::milliseconds(200));
|
||||
|
||||
// 调用 wakeUpWait 并等待任务完成
|
||||
std::future<void> wakeup_future = periodic_task.wakeUpWait();
|
||||
wakeup_future.wait(); // 等待任务完成
|
||||
|
||||
assert(wakeup_future.valid()); // 确保 future 是有效的
|
||||
std::cout << "Test 3 passed: wakeUpWait() works correctly." << std::endl;
|
||||
std::cout << "wakeUpWait() future is valid." << std::endl;
|
||||
}
|
||||
|
||||
// 4. 任务抛出异常的处理
|
||||
void testTaskExceptionHandling() {
|
||||
auto task = []() { throw std::runtime_error("Test exception"); };
|
||||
|
||||
periodic::PeriodicTask periodic_task(task, std::chrono::milliseconds(200));
|
||||
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(300)); // 等待一段时间
|
||||
|
||||
std::cout << "Test 4 passed: Task exception is handled correctly." << std::endl;
|
||||
std::cout << "Exception handled and task did not crash." << std::endl;
|
||||
}
|
||||
|
||||
// 5. 线程是否能正确停止
|
||||
void testTaskStop() {
|
||||
std::atomic<bool> stopped{false};
|
||||
auto task = [&stopped]() {
|
||||
while (!stopped) {
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(50));
|
||||
}
|
||||
};
|
||||
|
||||
periodic::PeriodicTask periodic_task(task, std::chrono::milliseconds(100));
|
||||
|
||||
std::this_thread::sleep_for(std::chrono::seconds(1)); // 运行一段时间
|
||||
|
||||
stopped = true; // 请求停止
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(50)); // 等待线程停止
|
||||
|
||||
std::cout << "Test 5 passed: Task thread stops correctly." << std::endl;
|
||||
std::cout << "Task has been stopped successfully." << std::endl;
|
||||
}
|
||||
|
||||
// 6. 高频唤醒的情况下任务执行是否正常
|
||||
void testHighFrequencyWakeUp() {
|
||||
std::atomic<int> execution_count{0};
|
||||
auto task = [&execution_count]() { execution_count++; };
|
||||
|
||||
periodic::PeriodicTask periodic_task(task, std::chrono::milliseconds(200));
|
||||
|
||||
for (int i = 0; i < 100; ++i) {
|
||||
periodic_task.wakeUp();
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(10)); // 每 10 毫秒唤醒一次
|
||||
}
|
||||
|
||||
std::this_thread::sleep_for(std::chrono::seconds(1)); // 等待任务执行完成
|
||||
|
||||
assert(execution_count > 50); // 确保任务至少执行了 50 次
|
||||
std::cout << "Test 6 passed: Task handles frequent wake ups correctly." << std::endl;
|
||||
std::cout << "Task executed " << execution_count.load() << " times." << std::endl;
|
||||
}
|
||||
|
||||
// 7. 多个 wakeUpWait() 调用的处理
|
||||
void testMultipleWakeUpWait() {
|
||||
std::atomic<int> execution_count{0};
|
||||
auto task = [&execution_count]() {
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // 模拟任务执行
|
||||
execution_count++;
|
||||
};
|
||||
|
||||
periodic::PeriodicTask periodic_task(task, std::chrono::milliseconds(200));
|
||||
|
||||
// 同时调用两个 wakeUpWait
|
||||
std::future<void> future1 = periodic_task.wakeUpWait();
|
||||
std::future<void> future2 = periodic_task.wakeUpWait();
|
||||
|
||||
future1.wait();
|
||||
future2.wait();
|
||||
|
||||
assert(execution_count == 1); // 确保任务只执行了一次
|
||||
std::cout << "Test 7 passed: Multiple wakeUpWait() calls are handled correctly." << std::endl;
|
||||
std::cout << "Task executed " << execution_count.load() << " times." << std::endl;
|
||||
}
|
||||
|
||||
// 8. 任务函数为空的边界情况
|
||||
void testEmptyTaskFunction() {
|
||||
auto task = []() {
|
||||
// 空任务函数
|
||||
};
|
||||
|
||||
periodic::PeriodicTask periodic_task(task, std::chrono::milliseconds(100));
|
||||
|
||||
std::this_thread::sleep_for(std::chrono::seconds(1)); // 等待一段时间
|
||||
|
||||
std::cout << "Test 8 passed: Empty task function works correctly." << std::endl;
|
||||
std::cout << "Empty task function executed without issues." << std::endl;
|
||||
}
|
||||
|
||||
int main() {
|
||||
std::cout << "Starting tests..." << std::endl;
|
||||
|
||||
// testWakeUpImmediately();
|
||||
testPeriodicTaskExecution();
|
||||
testWakeUpImmediately();
|
||||
testWakeUpWait();
|
||||
testTaskExceptionHandling();
|
||||
testTaskStop();
|
||||
testHighFrequencyWakeUp();
|
||||
testMultipleWakeUpWait();
|
||||
testEmptyTaskFunction();
|
||||
|
||||
std::cout << "All tests passed!" << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
84
csrc/balance_serve/kvc2/test/test_queue_perf.cpp
Normal file
84
csrc/balance_serve/kvc2/test/test_queue_perf.cpp
Normal file
|
@ -0,0 +1,84 @@
|
|||
#include <mutex>
|
||||
#include <queue>
|
||||
#include "utils/lock_free_queue.hpp"
|
||||
|
||||
#define STDQ
|
||||
|
||||
int main() {
|
||||
const int num_producers = 48;
|
||||
const int num_items = 1e6;
|
||||
|
||||
#ifdef STDQ
|
||||
std::mutex lock;
|
||||
std::queue<int> queue;
|
||||
#else
|
||||
MPSCQueue<int> queue;
|
||||
#endif
|
||||
|
||||
auto start_time = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// Launch multiple producer threads
|
||||
std::vector<std::thread> producers;
|
||||
for (int i = 0; i < num_producers; ++i) {
|
||||
producers.emplace_back([&queue, i
|
||||
#ifdef STDQ
|
||||
,
|
||||
&lock
|
||||
#endif
|
||||
]() {
|
||||
for (int j = 0; j < num_items; ++j) {
|
||||
#ifdef STDQ
|
||||
std::lock_guard<std::mutex> guard(lock);
|
||||
queue.push(i * num_items + j);
|
||||
#else
|
||||
queue.enqueue(std::make_shared<int>(i * num_items + j));
|
||||
#endif
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Consumer thread
|
||||
std::thread consumer([&queue, num_producers
|
||||
#ifdef STDQ
|
||||
,
|
||||
&lock
|
||||
#endif
|
||||
]() {
|
||||
int count = 0;
|
||||
while (count < num_producers * num_items) {
|
||||
#ifdef STDQ
|
||||
std::lock_guard<std::mutex> guard(lock);
|
||||
if (!queue.empty()) {
|
||||
queue.pop();
|
||||
count++;
|
||||
}
|
||||
#else
|
||||
if (auto item = queue.dequeue()) {
|
||||
count++;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
});
|
||||
|
||||
// Wait for all producers to finish
|
||||
for (auto& producer : producers) {
|
||||
producer.join();
|
||||
}
|
||||
|
||||
// Wait for the consumer to finish
|
||||
consumer.join();
|
||||
|
||||
auto end_time = std::chrono::high_resolution_clock::now();
|
||||
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count();
|
||||
|
||||
#ifdef STDQ
|
||||
std::cout << "std::queue with mutex ";
|
||||
#else
|
||||
std::cout << "lock free queue ";
|
||||
#endif
|
||||
|
||||
std::cout << "Processed " << num_producers * num_items / 1e6 << "M items in " << duration << " milliseconds "
|
||||
<< num_producers * num_items / 1e3 / duration << " MOps." << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
38
csrc/balance_serve/kvc2/test/test_std_list.cpp
Normal file
38
csrc/balance_serve/kvc2/test/test_std_list.cpp
Normal file
|
@ -0,0 +1,38 @@
|
|||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <vector>
|
||||
|
||||
int main() {
|
||||
std::vector<int> v = {0, 1, 2, 3, 4, 5};
|
||||
|
||||
using RevIt = std::reverse_iterator<std::vector<int>::iterator>;
|
||||
|
||||
const auto it = v.begin() + 3;
|
||||
RevIt r_it{it};
|
||||
|
||||
std::cout << "*it == " << *it << '\n'
|
||||
<< "*r_it == " << *r_it << '\n'
|
||||
<< "*r_it.base() == " << *r_it.base() << '\n'
|
||||
<< "*(r_it.base()-1) == " << *(r_it.base() - 1) << '\n';
|
||||
|
||||
RevIt r_end{v.begin()};
|
||||
RevIt r_begin{v.end()};
|
||||
|
||||
for (auto it = r_end.base(); it != r_begin.base(); ++it)
|
||||
std::cout << *it << ' ';
|
||||
std::cout << '\n';
|
||||
|
||||
for (auto it = r_begin; it != r_end; ++it)
|
||||
std::cout << *it << ' ';
|
||||
std::cout << '\n';
|
||||
|
||||
for (auto it = r_begin; it != r_end; ++it) {
|
||||
if (*it == 3) {
|
||||
v.erase(std::next(it).base());
|
||||
}
|
||||
}
|
||||
|
||||
for (auto it : v)
|
||||
std::cout << it << ' ';
|
||||
std::cout << '\n';
|
||||
}
|
31
csrc/balance_serve/kvc2/test/xxHash_test.cpp
Normal file
31
csrc/balance_serve/kvc2/test/xxHash_test.cpp
Normal file
|
@ -0,0 +1,31 @@
|
|||
#include "xxhash.h"
|
||||
#include <iostream>
|
||||
|
||||
int main() {
|
||||
std::string t = "hello world";
|
||||
XXH64_hash_t hash = XXH64(t.data(), t.size(), 123);
|
||||
std::cout << hash << std::endl;
|
||||
{
|
||||
/* create a hash state */
|
||||
XXH64_state_t* const state = XXH64_createState();
|
||||
if (state == NULL)
|
||||
abort();
|
||||
|
||||
if (XXH64_reset(state, 123) == XXH_ERROR)
|
||||
abort();
|
||||
|
||||
if (XXH64_update(state, t.data(), 5) == XXH_ERROR)
|
||||
abort();
|
||||
|
||||
if (XXH64_update(state, t.data() + 5, t.size() - 5) == XXH_ERROR)
|
||||
abort();
|
||||
/* Produce the final hash value */
|
||||
XXH64_hash_t const hash = XXH64_digest(state);
|
||||
|
||||
/* State could be re-used; but in this example, it is simply freed */
|
||||
XXH64_freeState(state);
|
||||
std::cout << hash << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
36
csrc/balance_serve/kvc2/unit_test.sh
Executable file
36
csrc/balance_serve/kvc2/unit_test.sh
Executable file
|
@ -0,0 +1,36 @@
|
|||
#!/bin/bash
|
||||
|
||||
# 检查是否提供了 disk_cache_path 参数
|
||||
if [ -z "$1" ]; then
|
||||
echo "Usage: $0 <disk_cache_path>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# 将 disk_cache_path 参数赋值给变量
|
||||
disk_cache_path=$1
|
||||
|
||||
# 定义测试命令数组,并使用变量替换 disk_cache_path
|
||||
tests=(
|
||||
"./build/test/kvc2_export_header_test --disk_cache_path=$disk_cache_path"
|
||||
"./build/test/kvcache_disk_insert_read_test --disk_cache_path=$disk_cache_path"
|
||||
"./build/test/kvcache_mem_eviction_test --disk_cache_path=$disk_cache_path"
|
||||
"./build/test/kvcache_mem_insert_read_test --disk_cache_path=$disk_cache_path"
|
||||
"./build/test/kvcache_save_load_test --disk_cache_path=$disk_cache_path"
|
||||
)
|
||||
|
||||
|
||||
# 遍历每个测试命令
|
||||
for test in "${tests[@]}"; do
|
||||
echo "Running: $test"
|
||||
# 运行测试并捕获输出
|
||||
output=$($test)
|
||||
|
||||
# 检查测试输出中是否包含 "Test Passed"
|
||||
if echo "$output" | grep -q "Test Passed"; then
|
||||
echo " Test Passed"
|
||||
else
|
||||
echo " Test Failed"
|
||||
fi
|
||||
|
||||
sleep 1
|
||||
done
|
20
csrc/balance_serve/sched/CMakeLists.txt
Normal file
20
csrc/balance_serve/sched/CMakeLists.txt
Normal file
|
@ -0,0 +1,20 @@
|
|||
set(CMAKE_CXX_FLAGS "-Og -march=native -Wall -Wextra -g -fPIC")
|
||||
# set(CMAKE_CXX_FLAGS "-O3 -march=native -Wall -Wextra -fPIC")
|
||||
add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI})
|
||||
|
||||
set(UTILS_DIR ${CMAKE_CURRENT_SOURCE_DIR}/utils)
|
||||
|
||||
add_library(sched_metrics metrics.cpp)
|
||||
target_include_directories(sched_metrics PRIVATE ${UTILS_DIR})
|
||||
target_link_libraries(sched_metrics PUBLIC prometheus-cpp::pull)
|
||||
|
||||
|
||||
add_library(sched scheduler.cpp)
|
||||
target_include_directories(sched PRIVATE ${SPDLOG_DIR}/include ${FMT_DIR}/include ${UTILS_DIR} ${KVC2_INCLUDE_DIR})
|
||||
target_link_libraries(sched PUBLIC pthread ${TORCH_LIBRARIES} kvc2 async_store sched_metrics)
|
||||
|
||||
pybind11_add_module(sched_ext bind.cpp)
|
||||
target_link_libraries(sched_ext PUBLIC sched ${TORCH_LIBRARIES} ${TORCH_PYTHON_LIBRARY})
|
||||
|
||||
|
||||
|
249
csrc/balance_serve/sched/bind.cpp
Normal file
249
csrc/balance_serve/sched/bind.cpp
Normal file
|
@ -0,0 +1,249 @@
|
|||
#include "scheduler.h"
|
||||
#include <memory>
|
||||
#include <pybind11/numpy.h>
|
||||
#include <pybind11/pybind11.h>
|
||||
#include <pybind11/stl.h>
|
||||
|
||||
#include <torch/extension.h>
|
||||
|
||||
namespace py = pybind11;
|
||||
|
||||
PYBIND11_MODULE(sched_ext, m) {
|
||||
py::class_<scheduler::ModelSettings>(m, "ModelSettings")
|
||||
.def(py::init<>())
|
||||
.def_readwrite("model_path", &scheduler::ModelSettings::model_path)
|
||||
.def_readwrite("params_count", &scheduler::ModelSettings::params_count)
|
||||
.def_readwrite("layer_count", &scheduler::ModelSettings::layer_count)
|
||||
.def_readwrite("num_k_heads", &scheduler::ModelSettings::num_k_heads)
|
||||
.def_readwrite("k_head_dim", &scheduler::ModelSettings::k_head_dim)
|
||||
.def_readwrite("bytes_per_params",
|
||||
&scheduler::ModelSettings::bytes_per_params)
|
||||
.def_readwrite("bytes_per_kv_cache_element",
|
||||
&scheduler::ModelSettings::bytes_per_kv_cache_element)
|
||||
.def("params_size", &scheduler::ModelSettings::params_nbytes)
|
||||
.def("bytes_per_token_kv_cache",
|
||||
&scheduler::ModelSettings::bytes_per_token_kv_cache)
|
||||
// 添加 pickle 支持
|
||||
.def(py::pickle(
|
||||
[](const scheduler::ModelSettings &self) { // __getstate__
|
||||
return py::make_tuple(self.params_count, self.layer_count,
|
||||
self.num_k_heads, self.k_head_dim,
|
||||
self.bytes_per_params,
|
||||
self.bytes_per_kv_cache_element);
|
||||
},
|
||||
[](py::tuple t) { // __setstate__
|
||||
if (t.size() != 6)
|
||||
throw std::runtime_error("Invalid state! t.size() = " +
|
||||
std::to_string(t.size()));
|
||||
scheduler::ModelSettings ms;
|
||||
ms.params_count = t[0].cast<size_t>();
|
||||
ms.layer_count = t[1].cast<size_t>();
|
||||
ms.num_k_heads = t[2].cast<size_t>();
|
||||
ms.k_head_dim = t[3].cast<size_t>();
|
||||
ms.bytes_per_params = t[4].cast<double>();
|
||||
ms.bytes_per_kv_cache_element = t[5].cast<double>();
|
||||
return ms;
|
||||
}));
|
||||
|
||||
py::class_<scheduler::SampleOptions>(m, "SampleOptions")
|
||||
.def(py::init<>())
|
||||
.def_readwrite("temperature", &scheduler::SampleOptions::temperature)
|
||||
.def_readwrite("top_p",
|
||||
&scheduler::SampleOptions::top_p) // 确保 top_p 也能被访问
|
||||
.def(py::pickle(
|
||||
[](const scheduler::SampleOptions &self) {
|
||||
return py::make_tuple(self.temperature,
|
||||
self.top_p); // 序列化 temperature 和 top_p
|
||||
},
|
||||
[](py::tuple t) {
|
||||
if (t.size() != 2) // 确保解包时参数数量匹配
|
||||
throw std::runtime_error("Invalid state! t.size() = " +
|
||||
std::to_string(t.size()));
|
||||
scheduler::SampleOptions so;
|
||||
so.temperature = t[0].cast<double>();
|
||||
so.top_p = t[1].cast<double>(); // 反序列化 top_p
|
||||
return so;
|
||||
}));
|
||||
|
||||
py::class_<scheduler::Settings>(m, "Settings")
|
||||
.def(py::init<>())
|
||||
.def_readwrite("model_name", &scheduler::Settings::model_name)
|
||||
.def_readwrite("quant_type", &scheduler::Settings::quant_type)
|
||||
.def_readwrite("model_settings", &scheduler::Settings::model_settings)
|
||||
.def_readwrite("page_size", &scheduler::Settings::page_size)
|
||||
.def_readwrite("gpu_device_id", &scheduler::Settings::gpu_device_id)
|
||||
.def_readwrite("gpu_memory_size", &scheduler::Settings::gpu_memory_size)
|
||||
.def_readwrite("memory_utilization_percentage",
|
||||
&scheduler::Settings::memory_utilization_percentage)
|
||||
.def_readwrite("max_batch_size", &scheduler::Settings::max_batch_size)
|
||||
.def_readwrite(
|
||||
"recommended_chunk_prefill_token_count",
|
||||
&scheduler::Settings::recommended_chunk_prefill_token_count)
|
||||
.def_readwrite("sample_options", &scheduler::Settings::sample_options)
|
||||
.def_readwrite("sched_metrics_port",
|
||||
&scheduler::Settings::sched_metrics_port)
|
||||
.def_readwrite("gpu_only", &scheduler::Settings::gpu_only)
|
||||
.def_readwrite("use_self_defined_head_dim",
|
||||
&scheduler::Settings::use_self_defined_head_dim)
|
||||
.def_readwrite("self_defined_head_dim",
|
||||
&scheduler::Settings::self_defined_head_dim)
|
||||
.def_readwrite("full_kv_cache_on_each_gpu",
|
||||
&scheduler::Settings::full_kv_cache_on_each_gpu)
|
||||
.def_readwrite("k_cache_on", &scheduler::Settings::k_cache_on)
|
||||
.def_readwrite("v_cache_on", &scheduler::Settings::v_cache_on)
|
||||
.def_readwrite("kvc2_config_path", &scheduler::Settings::kvc2_config_path)
|
||||
.def_readwrite("kvc2_root_path", &scheduler::Settings::kvc2_root_path)
|
||||
.def_readwrite("memory_pool_size_GB",
|
||||
&scheduler::Settings::memory_pool_size_GB)
|
||||
.def_readwrite("evict_count", &scheduler::Settings::evict_count)
|
||||
.def_readwrite("strategy_name", &scheduler::Settings::strategy_name)
|
||||
.def_readwrite("kvc2_metrics_port",
|
||||
&scheduler::Settings::kvc2_metrics_port)
|
||||
.def_readwrite("load_from_disk", &scheduler::Settings::load_from_disk)
|
||||
.def_readwrite("save_to_disk", &scheduler::Settings::save_to_disk)
|
||||
// derived
|
||||
.def_readwrite("gpu_device_count", &scheduler::Settings::gpu_device_count)
|
||||
.def_readwrite("total_kvcache_pages",
|
||||
&scheduler::Settings::total_kvcache_pages)
|
||||
.def_readwrite("devices", &scheduler::Settings::devices)
|
||||
.def("auto_derive", &scheduler::Settings::auto_derive);
|
||||
|
||||
py::class_<scheduler::BatchQueryTodo,
|
||||
std::shared_ptr<scheduler::BatchQueryTodo>>(m, "BatchQueryTodo")
|
||||
.def(py::init<>())
|
||||
.def_readwrite("query_ids", &scheduler::BatchQueryTodo::query_ids)
|
||||
.def_readwrite("query_tokens", &scheduler::BatchQueryTodo::query_tokens)
|
||||
.def_readwrite("query_lengths", &scheduler::BatchQueryTodo::query_lengths)
|
||||
.def_readwrite("block_indexes", &scheduler::BatchQueryTodo::block_indexes)
|
||||
.def_readwrite("attn_masks", &scheduler::BatchQueryTodo::attn_masks)
|
||||
.def_readwrite("rope_ranges", &scheduler::BatchQueryTodo::rope_ranges)
|
||||
.def_readwrite("sample_options",
|
||||
&scheduler::BatchQueryTodo::sample_options)
|
||||
.def_readwrite("prefill_mini_batches",
|
||||
&scheduler::BatchQueryTodo::prefill_mini_batches)
|
||||
.def_readwrite("decode_mini_batches",
|
||||
&scheduler::BatchQueryTodo::decode_mini_batches)
|
||||
.def_readwrite("stop_criteria", &scheduler::BatchQueryTodo::stop_criteria)
|
||||
.def("debug", &scheduler::BatchQueryTodo::debug)
|
||||
.def(py::pickle(
|
||||
[](const scheduler::BatchQueryTodo &self) {
|
||||
return py::make_tuple(
|
||||
self.query_ids, self.query_tokens, self.query_lengths,
|
||||
self.block_indexes, self.attn_masks, self.rope_ranges,
|
||||
self.sample_options, self.prefill_mini_batches,
|
||||
self.decode_mini_batches, self.stop_criteria);
|
||||
},
|
||||
[](py::tuple t) {
|
||||
if (t.size() != 10)
|
||||
throw std::runtime_error("Invalid state! t.size() = " +
|
||||
std::to_string(t.size()));
|
||||
scheduler::BatchQueryTodo bqt;
|
||||
bqt.query_ids = t[0].cast<std::vector<scheduler::QueryID>>();
|
||||
bqt.query_tokens = t[1].cast<std::vector<torch::Tensor>>();
|
||||
bqt.query_lengths =
|
||||
t[2].cast<std::vector<scheduler::TokenLength>>();
|
||||
bqt.block_indexes = t[3].cast<std::vector<torch::Tensor>>();
|
||||
bqt.attn_masks = t[4].cast<std::optional<torch::Tensor>>();
|
||||
bqt.rope_ranges = t[5].cast<std::optional<torch::Tensor>>();
|
||||
bqt.sample_options =
|
||||
t[6].cast<std::vector<scheduler::SampleOptions>>();
|
||||
bqt.prefill_mini_batches =
|
||||
t[7].cast<std::vector<scheduler::PrefillTask>>();
|
||||
bqt.decode_mini_batches =
|
||||
t[8].cast<std::vector<std::vector<scheduler::QueryID>>>();
|
||||
bqt.stop_criteria =
|
||||
t[9].cast<std::vector<std::vector<std::vector<int>>>>();
|
||||
return bqt;
|
||||
}));
|
||||
|
||||
py::class_<scheduler::QueryUpdate>(m, "QueryUpdate")
|
||||
.def(py::init<>())
|
||||
.def_readwrite("id", &scheduler::QueryUpdate::id)
|
||||
.def_readwrite("ok", &scheduler::QueryUpdate::ok)
|
||||
.def_readwrite("is_prefill", &scheduler::QueryUpdate::is_prefill)
|
||||
.def_readwrite("decode_done", &scheduler::QueryUpdate::decode_done)
|
||||
.def_readwrite("active_position",
|
||||
&scheduler::QueryUpdate::active_position)
|
||||
.def_readwrite("generated_token",
|
||||
&scheduler::QueryUpdate::generated_token)
|
||||
.def(py::pickle(
|
||||
[](const scheduler::QueryUpdate &self) {
|
||||
return py::make_tuple(self.id, self.ok, self.is_prefill,
|
||||
self.decode_done, self.active_position,
|
||||
self.generated_token);
|
||||
},
|
||||
[](py::tuple t) {
|
||||
if (t.size() != 6)
|
||||
throw std::runtime_error("Invalid state! t.size() = " +
|
||||
std::to_string(t.size()));
|
||||
scheduler::QueryUpdate qu;
|
||||
qu.id = t[0].cast<scheduler::QueryID>();
|
||||
qu.ok = t[1].cast<bool>();
|
||||
qu.is_prefill = t[2].cast<bool>();
|
||||
qu.decode_done = t[3].cast<bool>();
|
||||
qu.active_position = t[4].cast<scheduler::TokenLength>();
|
||||
qu.generated_token = t[5].cast<scheduler::Token>();
|
||||
return qu;
|
||||
}));
|
||||
|
||||
py::class_<scheduler::InferenceContext>(m, "InferenceContext")
|
||||
.def(py::init<>())
|
||||
.def_readwrite("k_cache", &scheduler::InferenceContext::k_cache)
|
||||
.def_readwrite("v_cache", &scheduler::InferenceContext::v_cache);
|
||||
|
||||
py::class_<scheduler::QueryAdd>(m, "QueryAdd")
|
||||
.def(py::init<>())
|
||||
.def_readwrite("query_token", &scheduler::QueryAdd::query_token)
|
||||
// .def_readwrite("attn_mask", &scheduler::QueryAdd::attn_mask)
|
||||
.def_readwrite("query_length", &scheduler::QueryAdd::query_length)
|
||||
.def_readwrite("estimated_length", &scheduler::QueryAdd::estimated_length)
|
||||
.def_readwrite("sample_options", &scheduler::QueryAdd::sample_options)
|
||||
.def_readwrite("user_id", &scheduler::QueryAdd::user_id)
|
||||
.def_readwrite("SLO_TTFT_ms", &scheduler::QueryAdd::SLO_TTFT_ms)
|
||||
.def_readwrite("SLO_TBT_ms", &scheduler::QueryAdd::SLO_TBT_ms)
|
||||
.def_readwrite("stop_criteria", &scheduler::QueryAdd::stop_criteria)
|
||||
.def("serialize", &scheduler::QueryAdd::serialize)
|
||||
.def_static("deserialize", &scheduler::QueryAdd::deserialize)
|
||||
.def(py::pickle(
|
||||
[](const scheduler::QueryAdd &self) {
|
||||
return py::make_tuple(self.query_token,
|
||||
// self.attn_mask,
|
||||
self.query_length, self.estimated_length,
|
||||
self.sample_options, self.user_id,
|
||||
self.SLO_TTFT_ms, self.SLO_TBT_ms,
|
||||
self.stop_criteria);
|
||||
},
|
||||
[](py::tuple t) {
|
||||
if (t.size() != 8)
|
||||
throw std::runtime_error("Invalid state! t.size() = " +
|
||||
std::to_string(t.size()));
|
||||
scheduler::QueryAdd qa;
|
||||
qa.query_token = t[0].cast<std::vector<scheduler::Token>>();
|
||||
// qa.attn_mask = t[1].cast<torch::Tensor>();
|
||||
qa.query_length = t[1].cast<scheduler::TokenLength>();
|
||||
qa.estimated_length = t[2].cast<scheduler::TokenLength>();
|
||||
qa.sample_options = t[3].cast<scheduler::SampleOptions>();
|
||||
qa.user_id = t[4].cast<scheduler::UserID>();
|
||||
qa.SLO_TTFT_ms = t[5].cast<int>();
|
||||
qa.SLO_TBT_ms = t[6].cast<int>();
|
||||
qa.stop_criteria = t[7].cast<std::vector<std::vector<int>>>();
|
||||
return qa;
|
||||
}));
|
||||
|
||||
py::class_<scheduler::Scheduler, std::shared_ptr<scheduler::Scheduler>>(
|
||||
m, "Scheduler")
|
||||
.def("init", &scheduler::Scheduler::init)
|
||||
.def("run", &scheduler::Scheduler::run)
|
||||
.def("stop", &scheduler::Scheduler::stop)
|
||||
.def("add_query", &scheduler::Scheduler::add_query,
|
||||
py::call_guard<py::gil_scoped_release>())
|
||||
.def("cancel_query", &scheduler::Scheduler::cancel_query,
|
||||
py::call_guard<py::gil_scoped_release>())
|
||||
.def("update_last_batch", &scheduler::Scheduler::update_last_batch,
|
||||
py::call_guard<py::gil_scoped_release>())
|
||||
.def("get_inference_context",
|
||||
&scheduler::Scheduler::get_inference_context);
|
||||
|
||||
m.def("create_scheduler", &scheduler::create_scheduler,
|
||||
"Create a new Scheduler instance");
|
||||
}
|
147
csrc/balance_serve/sched/metrics.cpp
Normal file
147
csrc/balance_serve/sched/metrics.cpp
Normal file
|
@ -0,0 +1,147 @@
|
|||
#include "metrics.h"
|
||||
#include <iostream>
|
||||
|
||||
// 构造函数
|
||||
Metrics::Metrics(const MetricsConfig &config)
|
||||
: registry_(std::make_shared<prometheus::Registry>()),
|
||||
exposer_(config.endpoint), stop_uptime_thread_(false),
|
||||
start_time_(std::chrono::steady_clock::now()) {
|
||||
// 定义统一的桶大小,最大为 10000 ms (10 s)
|
||||
std::vector<double> common_buckets = {
|
||||
0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0,
|
||||
10.0, 50.0, 100.0, 500.0, 1000.0, 5000.0, 10000.0}; // 毫秒
|
||||
|
||||
// 注册 TTFT_ms Histogram
|
||||
auto &TTFT_family = prometheus::BuildHistogram()
|
||||
.Name(std::string(METRIC_PREFIX) + "_TTFT_ms")
|
||||
.Help("Time to first token in milliseconds")
|
||||
.Register(*registry_);
|
||||
TTFT_ms = &TTFT_family.Add({{"model", config.model_name}}, common_buckets);
|
||||
|
||||
// 注册 TBT_ms Histogram
|
||||
auto &TBT_family = prometheus::BuildHistogram()
|
||||
.Name(std::string(METRIC_PREFIX) + "_TBT_ms")
|
||||
.Help("Time between tokens in milliseconds")
|
||||
.Register(*registry_);
|
||||
TBT_ms = &TBT_family.Add({{"model", config.model_name}}, common_buckets);
|
||||
|
||||
// 注册 schedule_time Histogram
|
||||
auto &schedule_time_family =
|
||||
prometheus::BuildHistogram()
|
||||
.Name(std::string(METRIC_PREFIX) + "_schedule_time_ms")
|
||||
.Help("Time to generate schedule in milliseconds")
|
||||
.Register(*registry_);
|
||||
schedule_time =
|
||||
&schedule_time_family.Add({{"model", config.model_name}}, common_buckets);
|
||||
|
||||
// 注册 generated_tokens Counter
|
||||
auto &generated_tokens_family =
|
||||
prometheus::BuildCounter()
|
||||
.Name(std::string(METRIC_PREFIX) + "_generated_tokens_total")
|
||||
.Help("Total generated tokens")
|
||||
.Register(*registry_);
|
||||
generated_tokens =
|
||||
&generated_tokens_family.Add({{"model", config.model_name}});
|
||||
|
||||
// 注册 throughput_query Gauge
|
||||
auto &throughput_query_family =
|
||||
prometheus::BuildGauge()
|
||||
.Name(std::string(METRIC_PREFIX) + "_throughput_query")
|
||||
.Help("Throughput per second based on queries")
|
||||
.Register(*registry_);
|
||||
throughput_query =
|
||||
&throughput_query_family.Add({{"model", config.model_name}});
|
||||
|
||||
// 注册 throughput_generated_tokens Gauge
|
||||
auto &throughput_generated_tokens_family =
|
||||
prometheus::BuildGauge()
|
||||
.Name(std::string(METRIC_PREFIX) + "_throughput_generated_tokens")
|
||||
.Help("Throughput per second based on generated tokens")
|
||||
.Register(*registry_);
|
||||
throughput_generated_tokens =
|
||||
&throughput_generated_tokens_family.Add({{"model", config.model_name}});
|
||||
|
||||
// 注册 event_count Counter family
|
||||
event_count_family_ =
|
||||
&prometheus::BuildCounter()
|
||||
.Name(std::string(METRIC_PREFIX) + "_event_count_total")
|
||||
.Help("Count of various events")
|
||||
.Register(*registry_);
|
||||
|
||||
batch_count_family_ =
|
||||
&prometheus::BuildCounter()
|
||||
.Name(std::string(METRIC_PREFIX) + "_batch_count_total")
|
||||
.Help("Count of various batch by status")
|
||||
.Register(*registry_);
|
||||
|
||||
// 注册 query_count Counter family
|
||||
query_count_family_ =
|
||||
&prometheus::BuildCounter()
|
||||
.Name(std::string(METRIC_PREFIX) + "_query_count_total")
|
||||
.Help("Count of queries by status")
|
||||
.Register(*registry_);
|
||||
|
||||
// 注册 uptime_ms Gauge
|
||||
auto &uptime_family = prometheus::BuildGauge()
|
||||
.Name(std::string(METRIC_PREFIX) + "_uptime_ms")
|
||||
.Help("Uptime of the scheduler in milliseconds")
|
||||
.Register(*registry_);
|
||||
uptime_ms = &uptime_family.Add({{"model", config.model_name}});
|
||||
|
||||
// 注册 GPU 利用率 Gauges
|
||||
auto &gpu_util_family =
|
||||
prometheus::BuildGauge()
|
||||
.Name(std::string(METRIC_PREFIX) + "_gpu_utilization_ratio")
|
||||
.Help("Current GPU utilization ratio (0 to 1)")
|
||||
.Register(*registry_);
|
||||
for (size_t i = 0; i < config.gpu_count; ++i) {
|
||||
gpu_utilization_gauges.push_back(&gpu_util_family.Add(
|
||||
{{"gpu_id", std::to_string(i)}, {"model", config.model_name}}));
|
||||
}
|
||||
|
||||
// 将 Registry 注册到 Exposer 中
|
||||
exposer_.RegisterCollectable(registry_);
|
||||
|
||||
// 启动 uptime 更新线程
|
||||
StartUptimeUpdater();
|
||||
}
|
||||
|
||||
// 析构函数
|
||||
Metrics::~Metrics() { StopUptimeUpdater(); }
|
||||
|
||||
// 启动 uptime 更新线程
|
||||
void Metrics::StartUptimeUpdater() {
|
||||
uptime_thread_ = std::thread([this]() {
|
||||
while (!stop_uptime_thread_) {
|
||||
auto now = std::chrono::steady_clock::now();
|
||||
std::chrono::duration<double, std::milli> uptime_duration =
|
||||
now - start_time_;
|
||||
uptime_ms->Set(uptime_duration.count());
|
||||
// fn_every_sec(this);
|
||||
std::this_thread::sleep_for(std::chrono::seconds(1));
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// 停止 uptime 更新线程
|
||||
void Metrics::StopUptimeUpdater() {
|
||||
stop_uptime_thread_ = true;
|
||||
if (uptime_thread_.joinable()) {
|
||||
uptime_thread_.join();
|
||||
}
|
||||
}
|
||||
|
||||
// 获取 event_count 指标
|
||||
prometheus::Counter *Metrics::event_count(const std::string &type) {
|
||||
return &event_count_family_->Add({{"type", type}}); // 可根据需要添加更多标签
|
||||
}
|
||||
|
||||
// 获取 query_count 指标
|
||||
prometheus::Counter *Metrics::query_count(const std::string &status) {
|
||||
return &query_count_family_->Add(
|
||||
{{"status", status}}); // 可根据需要添加更多标签
|
||||
}
|
||||
|
||||
prometheus::Counter *Metrics::batch_count(const std::string &type) {
|
||||
return &batch_count_family_->Add({{"type", type}});
|
||||
}
|
88
csrc/balance_serve/sched/metrics.h
Normal file
88
csrc/balance_serve/sched/metrics.h
Normal file
|
@ -0,0 +1,88 @@
|
|||
#ifndef Metrics_H
|
||||
#define Metrics_H
|
||||
|
||||
#include <atomic>
|
||||
#include <chrono>
|
||||
#include <memory>
|
||||
#include <prometheus/counter.h>
|
||||
#include <prometheus/exposer.h>
|
||||
#include <prometheus/gauge.h>
|
||||
#include <prometheus/histogram.h>
|
||||
#include <prometheus/registry.h>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
#include "timer.hpp"
|
||||
// 指标前缀宏定义
|
||||
#define METRIC_PREFIX "scheduler"
|
||||
class Metrics;
|
||||
|
||||
// 配置结构体
|
||||
struct MetricsConfig {
|
||||
std::string endpoint;
|
||||
std::string model_name; // 模型名称,如 "gpt-4"
|
||||
size_t gpu_count; // GPU数量
|
||||
};
|
||||
|
||||
// Metrics 类,根据配置初始化 Prometheus 指标
|
||||
class Metrics {
|
||||
public:
|
||||
// 构造函数传入 MetricsConfig
|
||||
Metrics(const MetricsConfig &config);
|
||||
~Metrics();
|
||||
|
||||
// 禁止拷贝和赋值
|
||||
Metrics(const Metrics &) = delete;
|
||||
Metrics &operator=(const Metrics &) = delete;
|
||||
|
||||
std::function<void(Metrics *)> fn_every_sec;
|
||||
|
||||
// 指标指针
|
||||
prometheus::Gauge *uptime_ms;
|
||||
prometheus::Histogram *TTFT_ms;
|
||||
prometheus::Histogram *TBT_ms;
|
||||
prometheus::Histogram *schedule_time;
|
||||
prometheus::Gauge *throughput_query;
|
||||
prometheus::Gauge *throughput_generated_tokens;
|
||||
prometheus::Counter *generated_tokens;
|
||||
std::vector<prometheus::Gauge *> gpu_utilization_gauges;
|
||||
|
||||
// 计数器家族
|
||||
prometheus::Counter *event_count(const std::string &type);
|
||||
prometheus::Counter *query_count(const std::string &status);
|
||||
prometheus::Counter *batch_count(const std::string &type);
|
||||
|
||||
private:
|
||||
std::shared_ptr<prometheus::Registry> registry_;
|
||||
prometheus::Exposer exposer_;
|
||||
|
||||
// 计数器家族
|
||||
prometheus::Family<prometheus::Counter> *event_count_family_;
|
||||
prometheus::Family<prometheus::Counter> *batch_count_family_;
|
||||
prometheus::Family<prometheus::Counter> *query_count_family_;
|
||||
|
||||
// 线程和控制变量用于更新 uptime_ms
|
||||
std::thread uptime_thread_;
|
||||
std::atomic<bool> stop_uptime_thread_;
|
||||
|
||||
// 启动 uptime 更新线程
|
||||
void StartUptimeUpdater();
|
||||
// 停止 uptime 更新线程
|
||||
void StopUptimeUpdater();
|
||||
|
||||
// 记录程序启动时间
|
||||
std::chrono::steady_clock::time_point start_time_;
|
||||
};
|
||||
|
||||
struct HistogramTimerWrapper {
|
||||
prometheus::Histogram *histogram;
|
||||
Timer timer;
|
||||
inline HistogramTimerWrapper(prometheus::Histogram *histogram)
|
||||
: histogram(histogram), timer() {
|
||||
timer.start();
|
||||
}
|
||||
inline ~HistogramTimerWrapper() { histogram->Observe(timer.elapsedMs()); }
|
||||
};
|
||||
|
||||
#endif // Metrics_H
|
119
csrc/balance_serve/sched/model_config.h
Normal file
119
csrc/balance_serve/sched/model_config.h
Normal file
|
@ -0,0 +1,119 @@
|
|||
#ifndef __MODEL_CONFIG_HPP_
|
||||
#define __MODEL_CONFIG_HPP_
|
||||
|
||||
#include "nlohmann/json.hpp"
|
||||
#include <iostream>
|
||||
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
|
||||
using DimSize = size_t;
|
||||
using URL = std::string;
|
||||
using ModelName = std::string;
|
||||
|
||||
// We must assure this can be load by config.json
|
||||
class ModelConfig {
|
||||
public:
|
||||
DimSize hidden_size;
|
||||
DimSize intermediate_size;
|
||||
size_t max_position_embeddings;
|
||||
std::string model_type;
|
||||
size_t num_attention_heads;
|
||||
size_t num_hidden_layers;
|
||||
size_t num_key_value_heads;
|
||||
size_t vocab_size;
|
||||
|
||||
NLOHMANN_DEFINE_TYPE_INTRUSIVE(ModelConfig, hidden_size, intermediate_size,
|
||||
max_position_embeddings, model_type,
|
||||
num_attention_heads, num_hidden_layers,
|
||||
num_key_value_heads, vocab_size);
|
||||
|
||||
void load_from(std::filesystem::path path) {
|
||||
std::cout << "Load from " << path << std::endl;
|
||||
std::ifstream i(path);
|
||||
nlohmann::json j;
|
||||
i >> j;
|
||||
*this = j.get<ModelConfig>();
|
||||
}
|
||||
};
|
||||
|
||||
using QuantType = std::string;
|
||||
static const QuantType NoQuantType = "";
|
||||
|
||||
class QuantConfig {
|
||||
public:
|
||||
QuantType name;
|
||||
|
||||
// For GEMV
|
||||
QuantType type_of_dot_vector = NoQuantType;
|
||||
inline bool can_be_used_as_matrix() {
|
||||
return type_of_dot_vector != NoQuantType;
|
||||
}
|
||||
|
||||
bool can_be_used_as_vector;
|
||||
|
||||
double bytes_per_element;
|
||||
bool has_scale;
|
||||
bool has_min;
|
||||
|
||||
size_t block_element_count;
|
||||
size_t block_element_size;
|
||||
|
||||
URL reference = "";
|
||||
|
||||
NLOHMANN_DEFINE_TYPE_INTRUSIVE_WITH_DEFAULT(QuantConfig, name,
|
||||
type_of_dot_vector,
|
||||
can_be_used_as_vector,
|
||||
bytes_per_element, has_scale,
|
||||
has_min, block_element_count,
|
||||
block_element_size, reference);
|
||||
};
|
||||
|
||||
inline std::map<QuantType, QuantConfig> quant_configs;
|
||||
inline std::map<ModelName, ModelConfig> model_configs;
|
||||
|
||||
inline void load_quant_configs(std::filesystem::path path) {
|
||||
nlohmann::json j;
|
||||
if (std::filesystem::exists(path)) {
|
||||
std::cout << __FUNCTION__ << " from " << path << std::endl;
|
||||
std::ifstream i(path);
|
||||
i >> j;
|
||||
quant_configs = j.get<std::map<QuantType, QuantConfig>>();
|
||||
std::cout << "Loaded Quant Configs" << std::endl;
|
||||
for (auto &[k, v] : quant_configs) {
|
||||
std::cout << " - " << k << std::endl;
|
||||
}
|
||||
} else {
|
||||
std::cout << __FUNCTION__ << " no file at " << path << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
inline void dump_quant_configs(std::filesystem::path path) {
|
||||
std::ofstream o(path);
|
||||
nlohmann::json j = quant_configs;
|
||||
o << j.dump(4);
|
||||
}
|
||||
|
||||
inline void load_model_configs(std::filesystem::path path) {
|
||||
nlohmann::json j;
|
||||
if (std::filesystem::exists(path)) {
|
||||
std::cout << __FUNCTION__ << " from " << path << std::endl;
|
||||
std::ifstream i(path);
|
||||
i >> j;
|
||||
model_configs = j.get<std::map<ModelName, ModelConfig>>();
|
||||
std::cout << "Loaded Model Configs" << std::endl;
|
||||
for (auto &[k, v] : model_configs) {
|
||||
std::cout << " - " << k << std::endl;
|
||||
}
|
||||
} else {
|
||||
std::cout << __FUNCTION__ << " no file at " << path << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
inline void dump_model_configs(std::filesystem::path path) {
|
||||
std::ofstream o(path);
|
||||
nlohmann::json j = model_configs;
|
||||
o << j.dump(4);
|
||||
}
|
||||
|
||||
#endif
|
960
csrc/balance_serve/sched/scheduler.cpp
Normal file
960
csrc/balance_serve/sched/scheduler.cpp
Normal file
|
@ -0,0 +1,960 @@
|
|||
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO
|
||||
#define FMT_HEADER_ONLY
|
||||
#include "nlohmann/json.hpp"
|
||||
#include "spdlog/spdlog.h"
|
||||
|
||||
#include "scheduler.h"
|
||||
#include <optional>
|
||||
|
||||
#include "arithmetic.hpp"
|
||||
#include "atomic_ptr_with_flags.hpp"
|
||||
#include "easy_format.hpp"
|
||||
#include "metrics.h"
|
||||
#include "mpsc.hpp"
|
||||
#include "timer.hpp"
|
||||
#include <atomic>
|
||||
#include <cassert>
|
||||
#include <future>
|
||||
#include <memory>
|
||||
#include <queue>
|
||||
|
||||
#include "kvc2.h"
|
||||
|
||||
using json = nlohmann::json;
|
||||
|
||||
namespace scheduler {
|
||||
|
||||
void Settings::auto_derive() {
|
||||
gpu_device_count = gpu_device_id.size();
|
||||
if (torch::cuda::is_available()) {
|
||||
size_t gpu_count = torch::cuda::device_count();
|
||||
SPDLOG_INFO("Number of available GPUs: {}, want {}", gpu_count,
|
||||
gpu_device_count);
|
||||
if (gpu_count < gpu_device_count) {
|
||||
SPDLOG_ERROR("Not enough GPUs available.");
|
||||
exit(0);
|
||||
}
|
||||
for (size_t i = 0; i < gpu_device_count; i++) {
|
||||
devices.push_back(torch::Device(torch::kCUDA, gpu_device_id[i]));
|
||||
}
|
||||
} else {
|
||||
SPDLOG_ERROR("CUDA is not available on this system.");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
if (model_settings.num_k_heads % gpu_device_count != 0) {
|
||||
SPDLOG_ERROR("num_k_heads {} is not divisible by gpu_device_count {}",
|
||||
model_settings.num_k_heads, gpu_device_count);
|
||||
assert(false);
|
||||
}
|
||||
|
||||
size_t gpu_memory_available = gpu_memory_size * memory_utilization_percentage;
|
||||
if (gpu_memory_available * gpu_device_count <
|
||||
model_settings.params_nbytes()) {
|
||||
SPDLOG_ERROR("GPU memory size {}G is smaller than {}G",
|
||||
gpu_memory_available * gpu_device_count / 1e9,
|
||||
model_settings.params_nbytes() / 1e9);
|
||||
assert(false);
|
||||
}
|
||||
|
||||
assert(model_settings.k_head_dim % model_settings.num_k_heads == 0);
|
||||
size_t head_per_gpu = model_settings.num_k_heads / gpu_device_count;
|
||||
size_t gpu_memory_for_kv_cache =
|
||||
gpu_memory_available /*- model_settings.params_nbytes() /
|
||||
gpu_device_count*/
|
||||
;
|
||||
SPDLOG_INFO(
|
||||
"Each GPU Total: {}MiB, Model Params: {}MiB, KVCache: {}MiB, Left: {}MiB",
|
||||
gpu_memory_size / (1 << 20),
|
||||
model_settings.params_nbytes() / gpu_device_count / (1 << 20),
|
||||
gpu_memory_for_kv_cache / (1 << 20),
|
||||
(gpu_memory_size - gpu_memory_available) / (1 << 20));
|
||||
size_t kv_cache_on_cnt = (size_t)(k_cache_on) + (size_t)(v_cache_on);
|
||||
size_t max_total_kvcache_pages =
|
||||
gpu_memory_for_kv_cache /
|
||||
(kv_cache_on_cnt * head_per_gpu * model_settings.k_head_dim *
|
||||
model_settings.bytes_per_kv_cache_element * page_size *
|
||||
model_settings.layer_count);
|
||||
if (total_kvcache_pages.has_value()) {
|
||||
if (total_kvcache_pages.value() > max_total_kvcache_pages) {
|
||||
SPDLOG_ERROR(
|
||||
"total_kvcache_pages {} is larger than max_total_kvcache_pages {}",
|
||||
total_kvcache_pages.value(), max_total_kvcache_pages);
|
||||
assert(false);
|
||||
}
|
||||
} else {
|
||||
total_kvcache_pages = max_total_kvcache_pages;
|
||||
SPDLOG_INFO("total_kvcache_pages is auto derived as {}",
|
||||
max_total_kvcache_pages);
|
||||
}
|
||||
|
||||
if (page_size % 256 != 0) {
|
||||
SPDLOG_ERROR("page_size {} is not divisible by 256", page_size);
|
||||
assert(false);
|
||||
}
|
||||
if (page_size < 256) {
|
||||
SPDLOG_ERROR("page_size {} is smaller than 256", page_size);
|
||||
assert(false);
|
||||
}
|
||||
}
|
||||
|
||||
std::string BatchQueryTodo::debug() {
|
||||
std::string re = "BatchQueryTodo: ";
|
||||
re += "QueryIDs: ";
|
||||
for (auto &id : query_ids) {
|
||||
re += std::to_string(id) + " ";
|
||||
}
|
||||
return re;
|
||||
}
|
||||
|
||||
bool BatchQueryTodo::empty() {
|
||||
return prefill_mini_batches.empty() && decode_mini_batches.empty();
|
||||
}
|
||||
|
||||
struct QueryMaintainer;
|
||||
|
||||
struct Query {
|
||||
QueryID id;
|
||||
torch::Tensor query_token;
|
||||
TokenLength prompt_length;
|
||||
TokenLength no_kvcache_from;
|
||||
TokenLength estimated_length;
|
||||
|
||||
SampleOptions sample_options;
|
||||
|
||||
UserID user_id;
|
||||
std::optional<int> SLO_TTFT_ms;
|
||||
std::optional<int> SLO_TBT_ms;
|
||||
|
||||
std::vector<std::vector<int>> stop_criteria;
|
||||
|
||||
// status
|
||||
// Query status changed by this order
|
||||
enum Status { Received, Preparing, Ready, Prefill, Decode, Done };
|
||||
Status plan_status = Received;
|
||||
TokenLength active_position; // the position where no kvcache now
|
||||
TokenLength plan_position; // the position where no kvcache now, in plan
|
||||
size_t prepare_try_count = 0;
|
||||
std::shared_ptr<kvc2::DoubleCacheHandleInterface> kvc2_handle = nullptr;
|
||||
|
||||
// derived from kvc2_handle
|
||||
torch::Tensor block_index; // block indexes
|
||||
|
||||
struct QueryContext {
|
||||
ModelName model_name;
|
||||
QuantType quant_type;
|
||||
kvc2::KVC2Interface *kvc2_interface;
|
||||
QueryMaintainer *query_maintainer;
|
||||
Metrics *met;
|
||||
} ctx;
|
||||
|
||||
void after_load(bool ok);
|
||||
|
||||
void to_status(Status to);
|
||||
|
||||
void export_metrics() {
|
||||
ctx.met->query_count(status_to_string(plan_status))->Increment(1);
|
||||
}
|
||||
|
||||
Query(QueryID id, QueryAdd query_add, QueryContext context)
|
||||
: id(id), prompt_length(query_add.query_length), no_kvcache_from(0),
|
||||
estimated_length(query_add.estimated_length),
|
||||
sample_options(query_add.sample_options), user_id(query_add.user_id),
|
||||
SLO_TTFT_ms(query_add.SLO_TTFT_ms), SLO_TBT_ms(query_add.SLO_TBT_ms),
|
||||
stop_criteria(query_add.stop_criteria), ctx(context) {
|
||||
std::vector<int64_t> shape = {int64_t(query_add.estimated_length)};
|
||||
query_token =
|
||||
torch::zeros(shape, torch::TensorOptions().dtype(torch::kInt32));
|
||||
assert(query_token.is_contiguous());
|
||||
if (query_token.is_contiguous() == false) {
|
||||
SPDLOG_ERROR("Query Token must be contiguous!");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
memcpy(query_token.data_ptr(), query_add.query_token.data(),
|
||||
query_add.query_length * sizeof(Token));
|
||||
|
||||
no_kvcache_from = 0; // maybe match prefix later
|
||||
export_metrics();
|
||||
}
|
||||
|
||||
Token &token_at(size_t idx) {
|
||||
return reinterpret_cast<Token *>(query_token.data_ptr())[idx];
|
||||
}
|
||||
|
||||
void absorb_update(const QueryUpdate &update) {
|
||||
SPDLOG_DEBUG("{}", update.debug());
|
||||
active_position = update.active_position;
|
||||
kvc2_handle->append_tokens(&token_at(0),
|
||||
active_position); // active_position is length -1
|
||||
if (update.is_prefill) {
|
||||
if (active_position == prompt_length) {
|
||||
token_at(active_position) = update.generated_token;
|
||||
ctx.met->generated_tokens->Increment(1);
|
||||
}
|
||||
} else {
|
||||
token_at(active_position) = update.generated_token;
|
||||
ctx.met->generated_tokens->Increment(1);
|
||||
}
|
||||
|
||||
if (update.decode_done || active_position == estimated_length - 1) {
|
||||
to_status(Done);
|
||||
}
|
||||
}
|
||||
|
||||
void absorb_prefill_task(const PrefillTask &task) {
|
||||
auto &[id, start, length] = task;
|
||||
this->plan_position = start + length;
|
||||
if (this->plan_position == prompt_length) {
|
||||
to_status(Decode);
|
||||
}
|
||||
}
|
||||
|
||||
void absorb_decode_task([[maybe_unused]] const QueryID &task) {
|
||||
this->plan_position += 1;
|
||||
}
|
||||
|
||||
PrefillTask get_prefill_task(size_t prefill_length) {
|
||||
if (prefill_length + plan_position > prompt_length) {
|
||||
prefill_length = prompt_length - plan_position;
|
||||
}
|
||||
return {id, plan_position, prefill_length};
|
||||
}
|
||||
|
||||
static std::string status_to_string(Status status) {
|
||||
switch (status) {
|
||||
case Received:
|
||||
return "Received";
|
||||
case Preparing:
|
||||
return "Preparing";
|
||||
case Ready:
|
||||
return "Ready";
|
||||
case Prefill:
|
||||
return "Prefill";
|
||||
case Decode:
|
||||
return "Decode";
|
||||
case Done:
|
||||
return "Done";
|
||||
}
|
||||
assert(false);
|
||||
}
|
||||
|
||||
void debug() {
|
||||
std::string status_string = status_to_string(plan_status);
|
||||
|
||||
SPDLOG_DEBUG("Query {}, prompt_length {}, estimated_length {}, plan status "
|
||||
"{}, plan position {} "
|
||||
"active position {}",
|
||||
id, prompt_length, estimated_length, status_string,
|
||||
plan_position, active_position);
|
||||
}
|
||||
};
|
||||
|
||||
std::string QueryUpdate::debug() const {
|
||||
return fmt::format("Query {}, ok {}, is_prefill {}, done {}, active_position "
|
||||
"{}, gen token {}",
|
||||
id, ok, is_prefill, decode_done, active_position,
|
||||
generated_token);
|
||||
}
|
||||
|
||||
// Shared handle to a Query; co-owned by the maintainer's map and the
// strategy's queues/sets.
using Q = std::shared_ptr<Query>;
|
||||
|
||||
struct KVC2_Maintainer {
|
||||
Settings settings;
|
||||
|
||||
std::vector<torch::Tensor> k_cache;
|
||||
std::vector<torch::Tensor> v_cache;
|
||||
std::shared_ptr<kvc2::KVC2Interface> kvc2_interface;
|
||||
|
||||
KVC2_Maintainer(Settings settings) : settings(settings) {
|
||||
// SPDLOG_WARN("Creating KVC2 Instance {}", settings.kvc2_root_path);
|
||||
assert(settings.kvc2_root_path.size() > 0);
|
||||
|
||||
// SPDLOG_WARN("Sizeof KVC2Config {} upper", sizeof(kvc2::KVC2Config));
|
||||
kvc2::GPUPageCacheConfig gpu_cache_config{
|
||||
.gpu_only = settings.gpu_only,
|
||||
.gpu_devices_id = settings.gpu_device_id,
|
||||
.layer_count = settings.model_settings.layer_count,
|
||||
.total_kvcache_pages = settings.total_kvcache_pages.value(),
|
||||
.num_token_per_page = settings.page_size,
|
||||
.num_k_heads = settings.model_settings.num_k_heads,
|
||||
.k_head_dim = settings.use_self_defined_head_dim
|
||||
? settings.self_defined_head_dim
|
||||
: settings.model_settings.k_head_dim,
|
||||
.full_kv_cache_on_each_gpu = settings.full_kv_cache_on_each_gpu,
|
||||
.k_cache_on = settings.k_cache_on,
|
||||
.v_cache_on = settings.v_cache_on,
|
||||
.tensor_type = torch::kBFloat16,
|
||||
};
|
||||
|
||||
auto model_configs_path =
|
||||
std::filesystem::path(settings.kvc2_config_path) / "model_configs.json";
|
||||
load_model_configs(model_configs_path);
|
||||
auto my_model_config = ModelConfig();
|
||||
my_model_config.load_from(
|
||||
std::filesystem::path(settings.model_settings.model_path) /
|
||||
"config.json");
|
||||
model_configs[settings.model_name] = my_model_config;
|
||||
dump_model_configs(model_configs_path);
|
||||
|
||||
kvc2::KVC2Config kvc2_config = {
|
||||
.k_cache_on = settings.k_cache_on,
|
||||
.v_cache_on = settings.v_cache_on,
|
||||
.gpu_only = settings.gpu_only,
|
||||
.load_from_disk = settings.load_from_disk,
|
||||
.save_to_disk = settings.save_to_disk,
|
||||
.path = settings.kvc2_root_path,
|
||||
.config_path = settings.kvc2_config_path,
|
||||
.num_token_per_page = settings.page_size,
|
||||
.memory_pool_size = size_t(settings.memory_pool_size_GB * 1e9),
|
||||
.evict_count = settings.evict_count,
|
||||
.gpu_cache_config = gpu_cache_config,
|
||||
.metrics_port = settings.kvc2_metrics_port,
|
||||
};
|
||||
kvc2_interface = kvc2::create_kvc2(kvc2_config);
|
||||
if (settings.load_from_disk)
|
||||
kvc2_interface->load();
|
||||
|
||||
SPDLOG_DEBUG("KVC2 created ok");
|
||||
|
||||
auto [k_cache, v_cache] = kvc2_interface->get_kvcache();
|
||||
this->k_cache = k_cache;
|
||||
this->v_cache = v_cache;
|
||||
}
|
||||
};
|
||||
|
||||
using EventAddQuery = std::pair<QueryAdd, std::promise<QueryID> *>;
|
||||
using EventUpdateQuery = BatchQueryUpdate;
|
||||
using EventTakenBatch = std::shared_ptr<BatchQueryTodo>;
|
||||
struct EventPrepare {
|
||||
QueryID query_id;
|
||||
bool first_try;
|
||||
};
|
||||
struct EventPrepared {
|
||||
QueryID query_id;
|
||||
bool ok;
|
||||
};
|
||||
|
||||
struct EventQueryStatus {
|
||||
QueryID query_id;
|
||||
Query::Status now_status;
|
||||
};
|
||||
struct EventSchedule {};
|
||||
|
||||
using Event =
|
||||
std::variant<EventAddQuery, EventUpdateQuery, EventTakenBatch, EventPrepare,
|
||||
EventPrepared, EventQueryStatus, EventSchedule>;
|
||||
|
||||
template <typename T> std::string event_name(const T &event);
|
||||
|
||||
template <> std::string event_name(const EventAddQuery &) {
|
||||
return "EventAddQuery";
|
||||
}
|
||||
|
||||
template <> std::string event_name(const EventUpdateQuery &) {
|
||||
return "EventUpdateQuery";
|
||||
}
|
||||
|
||||
template <> std::string event_name(const EventTakenBatch &) {
|
||||
return "EventTakenBatch";
|
||||
}
|
||||
template <> std::string event_name(const EventPrepare &) {
|
||||
return "EventPrepare";
|
||||
}
|
||||
|
||||
template <> std::string event_name(const EventPrepared &) {
|
||||
return "EventPrepared";
|
||||
}
|
||||
|
||||
template <> std::string event_name(const EventQueryStatus &) {
|
||||
return "EventQueryStatus";
|
||||
}
|
||||
|
||||
template <> std::string event_name(const EventSchedule &) {
|
||||
return "EventSchedule";
|
||||
}
|
||||
|
||||
// 用 std::visit 实现对 variant 的 event_name
|
||||
std::string event_name(const Event &event) {
|
||||
return std::visit([](const auto &e) { return event_name(e); }, event);
|
||||
}
|
||||
|
||||
static_assert(std::is_copy_constructible<Event>::value);
|
||||
static_assert(std::is_move_constructible<Event>::value);
|
||||
|
||||
struct QueryMaintainer : public Scheduler {
|
||||
// only get access by event loop
|
||||
Settings settings;
|
||||
QueryID query_id_counter = NoQueryID + 1;
|
||||
std::map<QueryID, Q> query_map;
|
||||
std::shared_ptr<KVC2_Maintainer> kvc2_maintainer;
|
||||
|
||||
std::shared_ptr<Metrics> met;
|
||||
// multi-thread visit
|
||||
std::atomic_bool stop_flag = false;
|
||||
// TODO consider correctness of event loop
|
||||
MPSCQueueConsumerLock<Event> event_loop_queue;
|
||||
|
||||
// std::binary_semaphore batch_ready{0};
|
||||
AtomicPtrWithFlag<BatchQueryTodo> next_batch;
|
||||
|
||||
QueryMaintainer() = default;
|
||||
|
||||
void gen_batch_query_todo(BatchQueryTodo *re, const std::set<Q> &queries) {
|
||||
std::vector<std::vector<QueryID>> d_batch(2);
|
||||
size_t last_decode_batch = 0;
|
||||
size_t prefill_num = 0;
|
||||
size_t decode_num = 0;
|
||||
size_t preill_length = 0;
|
||||
for (auto &q : queries) {
|
||||
if (q->plan_status == Query::Prefill) {
|
||||
prefill_num += 1;
|
||||
}
|
||||
if (q->plan_status == Query::Decode) {
|
||||
decode_num += 1;
|
||||
}
|
||||
}
|
||||
if (prefill_num >= 2 ||
|
||||
(prefill_num == 1 && settings.max_batch_size - 2 < decode_num)) {
|
||||
preill_length = settings.recommended_chunk_prefill_token_count;
|
||||
} else {
|
||||
preill_length = settings.recommended_chunk_prefill_token_count * 2;
|
||||
}
|
||||
for (auto &q : queries) {
|
||||
re->query_ids.push_back(q->id);
|
||||
re->query_tokens.push_back(q->query_token);
|
||||
re->query_lengths.push_back(q->prompt_length);
|
||||
if (q->plan_status == Query::Prefill) {
|
||||
re->prefill_mini_batches.push_back(q->get_prefill_task(preill_length));
|
||||
assert(re->prefill_mini_batches.size() <= 2);
|
||||
}
|
||||
if (q->plan_status == Query::Decode) {
|
||||
d_batch[last_decode_batch].push_back(q->id);
|
||||
// last_decode_batch = 1 - last_decode_batch;
|
||||
if (d_batch[last_decode_batch].size() == settings.max_batch_size - 1) {
|
||||
last_decode_batch += 1;
|
||||
assert(last_decode_batch < 2);
|
||||
}
|
||||
}
|
||||
re->block_indexes.push_back(q->block_index);
|
||||
re->sample_options.push_back(q->sample_options);
|
||||
re->stop_criteria.push_back(q->stop_criteria);
|
||||
}
|
||||
|
||||
re->attn_masks = std::nullopt;
|
||||
re->rope_ranges = std::nullopt;
|
||||
|
||||
for (auto &b : d_batch) {
|
||||
if (b.empty())
|
||||
continue;
|
||||
re->decode_mini_batches.push_back(b);
|
||||
}
|
||||
|
||||
met->batch_count("Generated")->Increment(1);
|
||||
}
|
||||
|
||||
// Interface
|
||||
|
||||
void init(Settings settings) override {
|
||||
SPDLOG_INFO("\nScheduler Settings:\n"
|
||||
" model_name: {}\n"
|
||||
" quant_type: {}\n"
|
||||
" model_path: {}\n"
|
||||
" params_count: {}\n"
|
||||
" layer_count: {}\n"
|
||||
" num_k_heads: {}\n"
|
||||
" k_head_dim: {}\n"
|
||||
" bytes_per_params: {}\n"
|
||||
" bytes_per_kv_cache_element: {}\n"
|
||||
" page_size: {}\n"
|
||||
" gpu_device_id: {}\n"
|
||||
" gpu_memory_size: {}\n"
|
||||
" memory_utilization_percentage: {}\n"
|
||||
" max_batch_size: {}\n"
|
||||
" recommended_chunk_prefill_token_count: {}\n"
|
||||
" sched_metrics_port: {}\n"
|
||||
" kvc2_config_path: {}\n"
|
||||
" kvc2_root_path: {}\n"
|
||||
" memory_pool_size_GB: {}\n"
|
||||
" evict_count: {}\n"
|
||||
" kvc2_metrics_port: {}\n"
|
||||
" load_from_disk: {}\n"
|
||||
" save_to_disk: {}\n"
|
||||
" strategy_name: {}\n"
|
||||
" gpu_device_count: {}\n",
|
||||
settings.model_name, settings.quant_type,
|
||||
settings.model_settings.model_path,
|
||||
settings.model_settings.params_count,
|
||||
settings.model_settings.layer_count,
|
||||
settings.model_settings.num_k_heads,
|
||||
settings.model_settings.k_head_dim,
|
||||
settings.model_settings.bytes_per_params,
|
||||
settings.model_settings.bytes_per_kv_cache_element,
|
||||
|
||||
settings.page_size, format_vector(settings.gpu_device_id),
|
||||
readable_number(settings.gpu_memory_size),
|
||||
settings.memory_utilization_percentage, settings.max_batch_size,
|
||||
settings.recommended_chunk_prefill_token_count,
|
||||
settings.sched_metrics_port, settings.kvc2_config_path,
|
||||
settings.kvc2_root_path, settings.memory_pool_size_GB,
|
||||
settings.evict_count, settings.kvc2_metrics_port,
|
||||
settings.load_from_disk, settings.save_to_disk,
|
||||
settings.strategy_name, settings.gpu_device_count);
|
||||
|
||||
this->settings = settings;
|
||||
kvc2_maintainer =
|
||||
std::shared_ptr<KVC2_Maintainer>(new KVC2_Maintainer(settings));
|
||||
MetricsConfig met_conf = {
|
||||
.endpoint = "0.0.0.0:" + std::to_string(settings.sched_metrics_port),
|
||||
.model_name = settings.model_name,
|
||||
.gpu_count = settings.gpu_device_count,
|
||||
};
|
||||
|
||||
SPDLOG_INFO("Creating scheduler metrics exporter on {}", met_conf.endpoint);
|
||||
met = std::make_shared<Metrics>(met_conf);
|
||||
met->fn_every_sec = [](Metrics *met) {
|
||||
auto generated_tokens = met->generated_tokens->Collect().counter.value;
|
||||
SPDLOG_INFO("Last Sec Generated Tokens {}", generated_tokens);
|
||||
};
|
||||
}
|
||||
Query::QueryContext get_query_context() {
|
||||
return Query::QueryContext{
|
||||
.model_name = settings.model_name,
|
||||
.quant_type = settings.quant_type,
|
||||
.kvc2_interface = kvc2_maintainer->kvc2_interface.get(),
|
||||
.query_maintainer = this,
|
||||
.met = met.get(),
|
||||
};
|
||||
}
|
||||
|
||||
QueryID add_query(QueryAdd query_add) override {
|
||||
std::promise<QueryID> p;
|
||||
event_loop_queue.enqueue(EventAddQuery(query_add, &p));
|
||||
return p.get_future().get();
|
||||
}
|
||||
|
||||
void cancel_query(QueryID id) override {
|
||||
SPDLOG_INFO("Cancel Query");
|
||||
SPDLOG_INFO("sched:{} Cancel Query", fmt::ptr(this));
|
||||
auto it = query_map.find(id);
|
||||
if (it == query_map.end()) {
|
||||
SPDLOG_ERROR("Query {} is not found", id);
|
||||
return;
|
||||
}
|
||||
query_map.erase(it);
|
||||
}
|
||||
|
||||
// Here this function update last batch results and get the next batch
|
||||
// in most cases, the batch is ready,
|
||||
// if not, busy wait to get it
|
||||
std::shared_ptr<BatchQueryTodo>
|
||||
update_last_batch(BatchQueryUpdate updates) override {
|
||||
event_loop_queue.enqueue(updates);
|
||||
|
||||
// Busy Wait
|
||||
while (true) {
|
||||
auto [ptr, is_new] = next_batch.touch_load();
|
||||
// SPDLOG_INFO("ptr {} is_new {}", fmt::ptr(ptr), is_new);
|
||||
if (is_new) {
|
||||
// SPDLOG_DEBUG("New Batch {}", fmt::ptr(ptr));
|
||||
auto re = std::shared_ptr<BatchQueryTodo>(ptr);
|
||||
event_loop_queue.enqueue(re);
|
||||
return re;
|
||||
} else {
|
||||
// // here to busy wait
|
||||
// SPDLOG_INFO("Not New");
|
||||
// using namespace std::chrono_literals;
|
||||
// std::this_thread::sleep_for(1s);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
InferenceContext get_inference_context() override {
|
||||
InferenceContext re;
|
||||
re.k_cache = kvc2_maintainer->k_cache;
|
||||
re.v_cache = kvc2_maintainer->v_cache;
|
||||
// kvc2_maintainer->k_cache[0][0][0][0][0][0] = 42; // test whether we pass
|
||||
// this to inference loop
|
||||
return re;
|
||||
}
|
||||
|
||||
virtual void strategy_add_query(Q new_query) = 0;
|
||||
virtual void strategy_update_query(const EventUpdateQuery &update) = 0;
|
||||
virtual void strategy_taken_batch(const EventTakenBatch &batch) = 0;
|
||||
virtual void strategy_prepare(const EventPrepare &prepare) = 0;
|
||||
virtual void strategy_prepared(const EventPrepared &prepared) = 0;
|
||||
virtual void strategy_query_status(const EventQueryStatus &query_status) = 0;
|
||||
virtual void strategy_schedule(const EventSchedule &event,
|
||||
BatchQueryTodo *new_batch) = 0;
|
||||
|
||||
void tackle_event(EventAddQuery &event) {
|
||||
auto &query_add = event.first;
|
||||
QueryID id = query_id_counter;
|
||||
event.second->set_value(id);
|
||||
query_id_counter += 1;
|
||||
Q new_query(new Query(id, query_add, get_query_context()));
|
||||
query_map[id] = new_query;
|
||||
SPDLOG_INFO("New Query {} is added", id);
|
||||
strategy_add_query(new_query);
|
||||
}
|
||||
|
||||
void tackle_event(const EventUpdateQuery &update) {
|
||||
// SPDLOG_INFO("Tackle Update Query");
|
||||
for (auto &u : update) {
|
||||
if (u.ok == false) {
|
||||
SPDLOG_ERROR("Query {} is not exectued OK", u.id);
|
||||
exit(1);
|
||||
}
|
||||
auto q = query_map[u.id];
|
||||
if (q->plan_status == Query::Status::Prefill ||
|
||||
q->plan_status == Query::Status::Decode) {
|
||||
q->absorb_update(u);
|
||||
} else {
|
||||
SPDLOG_DEBUG(
|
||||
"Query {} is not in Prefill or Decode status, do not update it",
|
||||
u.id);
|
||||
}
|
||||
}
|
||||
strategy_update_query(update);
|
||||
}
|
||||
|
||||
void tackle_event(const EventTakenBatch &batch) {
|
||||
met->batch_count("Taken")->Increment(1);
|
||||
for (auto &task : batch->prefill_mini_batches) {
|
||||
auto [id, s, l] = task;
|
||||
if (l == 0)
|
||||
continue;
|
||||
query_map.at(id)->absorb_prefill_task(task);
|
||||
}
|
||||
for (auto &mini_batch : batch->decode_mini_batches) {
|
||||
for (auto &id : mini_batch) {
|
||||
query_map.at(id)->absorb_decode_task(id);
|
||||
}
|
||||
}
|
||||
|
||||
strategy_taken_batch(batch);
|
||||
}
|
||||
|
||||
void tackle_event(const EventPrepare &event) { strategy_prepare(event); }
|
||||
void tackle_event(const EventPrepared &event) { strategy_prepared(event); }
|
||||
void tackle_event(const EventQueryStatus &event) {
|
||||
strategy_query_status(event);
|
||||
}
|
||||
|
||||
void tackle_event(const EventSchedule &event) {
|
||||
// SPDLOG_INFO("Tackle Schedule Event");
|
||||
|
||||
HistogramTimerWrapper t(met->schedule_time);
|
||||
|
||||
BatchQueryTodo *new_batch = new BatchQueryTodo;
|
||||
strategy_schedule(event, new_batch);
|
||||
// if (new_batch->query_ids.empty()) {
|
||||
// SPDLOG_INFO("Nothing todo");
|
||||
// delete new_batch;
|
||||
// return;
|
||||
// }
|
||||
auto [old_batch, flag] = next_batch.exchange(new_batch, true);
|
||||
if (new_batch->empty() == false) {
|
||||
SPDLOG_DEBUG("set new batch {}", fmt::ptr(new_batch));
|
||||
}
|
||||
if (flag) {
|
||||
SPDLOG_INFO("Batch {} is not consumed", fmt::ptr(old_batch));
|
||||
delete old_batch;
|
||||
}
|
||||
}
|
||||
|
||||
void run() override {
|
||||
std::thread([this]() {
|
||||
SPDLOG_WARN("Starting Scheduler Event Loop");
|
||||
while (stop_flag.load() == false) {
|
||||
auto event = event_loop_queue.dequeue();
|
||||
met->event_count(event_name(event))->Increment(1);
|
||||
std::visit(
|
||||
[this](auto event) {
|
||||
using T = std::decay_t<decltype(event)>;
|
||||
// SPDLOG_INFO("Event Loop: {}", typeid(T).name());
|
||||
if constexpr (std::is_same_v<T, EventAddQuery>) {
|
||||
tackle_event(event);
|
||||
} else if constexpr (std::is_same_v<T, EventUpdateQuery>) {
|
||||
tackle_event(event);
|
||||
} else if constexpr (std::is_same_v<T, EventTakenBatch>) {
|
||||
tackle_event(event);
|
||||
} else if constexpr (std::is_same_v<T, EventPrepare>) {
|
||||
tackle_event(event);
|
||||
} else if constexpr (std::is_same_v<T, EventPrepared>) {
|
||||
tackle_event(event);
|
||||
} else if constexpr (std::is_same_v<T, EventQueryStatus>) {
|
||||
tackle_event(event);
|
||||
} else if constexpr (std::is_same_v<T, EventSchedule>) {
|
||||
tackle_event(event);
|
||||
} else {
|
||||
SPDLOG_ERROR("Should not be here");
|
||||
assert(false);
|
||||
}
|
||||
},
|
||||
event);
|
||||
if (event_loop_queue.size() == 0 &&
|
||||
std::holds_alternative<EventSchedule>(event) == false) {
|
||||
// if this is not a schedule event, we need to schedule one
|
||||
event_loop_queue.enqueue(EventSchedule());
|
||||
}
|
||||
}
|
||||
}).detach();
|
||||
}
|
||||
|
||||
void stop() override { stop_flag.store(true); }
|
||||
|
||||
~QueryMaintainer() {
|
||||
kvc2_maintainer->kvc2_interface->save();
|
||||
stop();
|
||||
}
|
||||
};
|
||||
|
||||
void Query::to_status(Status to) {
|
||||
SPDLOG_DEBUG("Calling to status query {}, to {}", id, status_to_string(to));
|
||||
switch (to) {
|
||||
case Received:
|
||||
assert(false);
|
||||
break;
|
||||
case Preparing:
|
||||
SPDLOG_INFO("Preparing Query {} {}", id,
|
||||
prepare_try_count > 0
|
||||
? (std::to_string(prepare_try_count) + " Try")
|
||||
: "");
|
||||
prepare_try_count += 1;
|
||||
|
||||
ctx.kvc2_interface->lookup_to_gpu_async(
|
||||
ctx.model_name, ctx.quant_type,
|
||||
static_cast<kvc2::Token *>(query_token.data_ptr()), prompt_length,
|
||||
estimated_length,
|
||||
[this](std::shared_ptr<kvc2::DoubleCacheHandleInterface> handle) {
|
||||
if (handle == nullptr) {
|
||||
SPDLOG_INFO("Get handle from kvc2 Failed.");
|
||||
this->after_load(false);
|
||||
} else {
|
||||
SPDLOG_INFO("Get handle from kvc2 Success.");
|
||||
this->kvc2_handle = handle;
|
||||
this->to_status(Ready);
|
||||
this->after_load(true);
|
||||
}
|
||||
});
|
||||
break;
|
||||
case Ready:
|
||||
SPDLOG_INFO("Ready Query {}", id);
|
||||
break;
|
||||
case Prefill:
|
||||
SPDLOG_INFO("Prefilling Query {}", id);
|
||||
// assert(plan_status == Received);
|
||||
plan_position = kvc2_handle->matched_length();
|
||||
|
||||
if (prompt_length - plan_position == 0) {
|
||||
assert(prompt_length > 0);
|
||||
plan_position -= 1;
|
||||
}
|
||||
break;
|
||||
case Decode:
|
||||
SPDLOG_INFO("Decoding Query {}", id);
|
||||
// assert(plan_status == Prefill);
|
||||
break;
|
||||
case Done:
|
||||
SPDLOG_INFO("Finish Query {}", id);
|
||||
kvc2_handle = nullptr;
|
||||
ctx.query_maintainer->event_loop_queue.enqueue(EventQueryStatus{
|
||||
.query_id = id,
|
||||
.now_status = to,
|
||||
});
|
||||
// assert(plan_status == Decode);
|
||||
break;
|
||||
}
|
||||
plan_status = to;
|
||||
export_metrics();
|
||||
}
|
||||
|
||||
void Query::after_load(bool ok) {
|
||||
if (ok) {
|
||||
size_t page_count =
|
||||
div_up(estimated_length, ctx.query_maintainer->settings.page_size);
|
||||
std::vector<int64_t> shape;
|
||||
shape.push_back(page_count);
|
||||
block_index =
|
||||
torch::zeros(shape, torch::TensorOptions().dtype(torch::kInt32))
|
||||
.contiguous();
|
||||
auto ptr = reinterpret_cast<int32_t *>(block_index.data_ptr());
|
||||
auto vec_idx = kvc2_handle->get_gpu_block_idx();
|
||||
for (size_t i = 0; i < vec_idx.size(); i++) {
|
||||
ptr[i] = vec_idx[i];
|
||||
}
|
||||
no_kvcache_from = kvc2_handle->matched_length();
|
||||
}
|
||||
if (ok) {
|
||||
ctx.query_maintainer->event_loop_queue.enqueue(EventPrepared{
|
||||
.query_id = id,
|
||||
.ok = ok,
|
||||
});
|
||||
} else {
|
||||
ctx.query_maintainer->event_loop_queue.enqueue(EventPrepare{
|
||||
.query_id = id,
|
||||
.first_try = false,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
struct FCFS_single_prefill : public QueryMaintainer {
|
||||
std::queue<Q> queue;
|
||||
std::queue<Q> ready_queue;
|
||||
|
||||
bool has_query_preparing = false;
|
||||
std::optional<EventPrepare> wait_done_prepare = std::nullopt;
|
||||
|
||||
std::set<Q> active_query; // on going queries for LLMs
|
||||
|
||||
// interface all these are executed in a single thread
|
||||
void strategy_add_query(Q new_query) override {
|
||||
queue.push(new_query);
|
||||
if (has_query_preparing == false) {
|
||||
has_query_preparing = true;
|
||||
auto next_q = queue.front();
|
||||
queue.pop();
|
||||
event_loop_queue.enqueue(EventPrepare{next_q->id, true});
|
||||
}
|
||||
}
|
||||
|
||||
void strategy_update_query(const EventUpdateQuery &update) override {
|
||||
for (auto u : update) {
|
||||
auto &q = query_map[u.id];
|
||||
if (q->plan_status == Query::Done) {
|
||||
active_query.erase(q);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void strategy_taken_batch(const EventTakenBatch &batch) override {
|
||||
for (auto &q : batch->query_ids) {
|
||||
if (query_map[q]->plan_status != Query::Done) {
|
||||
active_query.insert(query_map[q]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void strategy_prepare(const EventPrepare &prepare) override {
|
||||
if (prepare.first_try) {
|
||||
auto &q = query_map[prepare.query_id];
|
||||
q->to_status(Query::Preparing);
|
||||
} else {
|
||||
assert(wait_done_prepare.has_value() == false);
|
||||
wait_done_prepare = prepare;
|
||||
wait_done_prepare->first_try = true;
|
||||
}
|
||||
}
|
||||
|
||||
void strategy_prepared(const EventPrepared &prepared) override {
|
||||
assert(prepared.ok);
|
||||
ready_queue.push(query_map[prepared.query_id]);
|
||||
if (queue.empty() == false) {
|
||||
auto next_q_prepare = queue.front();
|
||||
queue.pop();
|
||||
event_loop_queue.enqueue(EventPrepare{next_q_prepare->id, true});
|
||||
|
||||
} else {
|
||||
has_query_preparing = false;
|
||||
}
|
||||
}
|
||||
|
||||
void strategy_query_status(const EventQueryStatus &query_status) override {
|
||||
if (query_status.now_status == Query::Done) {
|
||||
if (wait_done_prepare.has_value()) {
|
||||
event_loop_queue.enqueue(wait_done_prepare.value());
|
||||
wait_done_prepare = std::nullopt;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void strategy_schedule([[maybe_unused]] const EventSchedule &event,
|
||||
BatchQueryTodo *new_batch) override {
|
||||
bool have_prefill = false;
|
||||
for (auto &q : active_query) {
|
||||
if (q->plan_status == Query::Prefill) {
|
||||
have_prefill = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (have_prefill == false && ready_queue.empty() == false &&
|
||||
active_query.size() < settings.max_batch_size) {
|
||||
auto &next_q = ready_queue.front();
|
||||
ready_queue.pop();
|
||||
|
||||
SPDLOG_INFO("Active query {}", next_q->id);
|
||||
active_query.insert(next_q);
|
||||
next_q->to_status(Query::Prefill);
|
||||
}
|
||||
if (active_query.empty() == false)
|
||||
SPDLOG_INFO("Active Query Size {}", active_query.size());
|
||||
for (auto &q : active_query) {
|
||||
q->debug();
|
||||
}
|
||||
gen_batch_query_todo(new_batch, active_query);
|
||||
}
|
||||
};
|
||||
|
||||
struct FCFS : public FCFS_single_prefill {
|
||||
void strategy_schedule([[maybe_unused]] const EventSchedule &event,
|
||||
BatchQueryTodo *new_batch) override {
|
||||
int prefill_count = 0;
|
||||
const int max_prefill_count = 2;
|
||||
for (auto &q : active_query) {
|
||||
if (q->plan_status == Query::Prefill) {
|
||||
prefill_count += 1;
|
||||
}
|
||||
}
|
||||
|
||||
while (prefill_count < max_prefill_count && ready_queue.empty() == false &&
|
||||
active_query.size() < settings.max_batch_size) {
|
||||
auto next_q = ready_queue.front();
|
||||
ready_queue.pop();
|
||||
|
||||
SPDLOG_INFO("Active query {}", next_q->id);
|
||||
active_query.insert(next_q);
|
||||
next_q->to_status(Query::Prefill);
|
||||
prefill_count += 1;
|
||||
}
|
||||
if (active_query.empty() == false) {
|
||||
SPDLOG_DEBUG("Active Query Size {}", active_query.size());
|
||||
}
|
||||
for (auto &q : active_query) {
|
||||
q->debug();
|
||||
}
|
||||
gen_batch_query_todo(new_batch, active_query);
|
||||
}
|
||||
};
|
||||
|
||||
// Factory: build and initialize the scheduler selected by
// settings.strategy_name ("FCFS-single-prefill" or "FCFS").
// BUGFIX: an unknown strategy previously only logged an error and then
// dereferenced the still-null pointer in re->init() — undefined behavior.
// Now it throws instead.
std::shared_ptr<Scheduler> create_scheduler(Settings settings) {
  spdlog::set_level(spdlog::level::debug);
  std::shared_ptr<Scheduler> re;
  SPDLOG_INFO("Using Strategy {}", settings.strategy_name);
  if (settings.strategy_name == "FCFS-single-prefill") {
    re = std::make_shared<FCFS_single_prefill>();
  } else if (settings.strategy_name == "FCFS") {
    re = std::make_shared<FCFS>();
  } else {
    SPDLOG_ERROR("Unknown strategy {}", settings.strategy_name);
    throw std::runtime_error("create_scheduler: unknown strategy " +
                             settings.strategy_name);
  }
  re->init(settings);
  return re;
}
|
||||
|
||||
// JSON (de)serialization bindings used by QueryAdd::serialize/deserialize.
NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(SampleOptions, temperature, top_p);
// NOTE(review): QueryAdd::stop_criteria is NOT listed here, so it is dropped
// on a serialize/deserialize round trip — confirm this is intentional (e.g.
// set separately by the caller) and not a lost field.
NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(QueryAdd, query_token, query_length,
                                   estimated_length, sample_options, user_id,
                                   SLO_TTFT_ms, SLO_TBT_ms);
|
||||
|
||||
std::string QueryAdd::serialize() {
|
||||
json j = *this;
|
||||
return j.dump();
|
||||
}
|
||||
|
||||
// Parse a JSON string (as produced by serialize()) back into a QueryAdd.
QueryAdd QueryAdd::deserialize(const std::string &input) {
  return json::parse(input).get<QueryAdd>();
}
|
||||
|
||||
}; // namespace scheduler
|
175
csrc/balance_serve/sched/scheduler.h
Normal file
175
csrc/balance_serve/sched/scheduler.h
Normal file
|
@ -0,0 +1,175 @@
|
|||
#pragma once
|
||||
#include "model_config.h"
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <torch/torch.h>
|
||||
#include <vector>
|
||||
|
||||
namespace scheduler {
|
||||
|
||||
using Token = uint32_t;
|
||||
using QueryID = uint64_t;
|
||||
constexpr QueryID NoQueryID = 0;
|
||||
|
||||
using TokenLength = size_t;
|
||||
using BatchID = uint64_t;
|
||||
|
||||
using PageCount = size_t;
|
||||
|
||||
// Static description of the served model, used for memory budgeting.
struct ModelSettings {
  std::string model_path; // filesystem path to the model weights
  size_t params_count;    // total number of parameters
  size_t layer_count;     // number of transformer layers
  size_t num_k_heads;     // key heads per layer
  size_t k_head_dim;      // dimension of each key head

  double bytes_per_params;           // avg bytes per parameter (may be fractional when quantized)
  double bytes_per_kv_cache_element; // bytes per kv-cache scalar

  // Total weight footprint in bytes (double product truncated to size_t).
  inline size_t params_nbytes() { return params_count * bytes_per_params; }
  // KV-cache bytes per token for the key heads of ONE layer only;
  // presumably layer_count (and the v-cache) are applied at the call
  // site — TODO(review) confirm.
  inline size_t bytes_per_token_kv_cache() {
    return bytes_per_kv_cache_element * num_k_heads * k_head_dim;
  }
};
|
||||
|
||||
// Per-query token sampling parameters. The defaults (1.0 / 1.0) leave the
// model's output distribution unmodified.
struct SampleOptions {
  double temperature = 1.0; // logit temperature scaling
  double top_p = 1.0;       // nucleus (top-p) truncation threshold
};
|
||||
|
||||
// Global scheduler configuration. The "derived" fields at the bottom are
// presumably filled by auto_derive() from the explicit ones — confirm.
struct Settings {
  // Something is awkward here: kvc2 only uses model_name and quant_type to
  // look up model infos.
  ModelName model_name;
  QuantType quant_type;
  // model_settings is ignored by kvc2
  ModelSettings model_settings;

  size_t page_size = 256; // how many tokens in a page
  std::vector<size_t> gpu_device_id; // GPU device ids to serve on
  size_t gpu_memory_size; // memory size in bytes of each GPU
  double memory_utilization_percentage; // fraction of GPU memory usable

  size_t max_batch_size = 256; // max concurrently active queries

  size_t recommended_chunk_prefill_token_count;
  SampleOptions sample_options; // default sampling options
  size_t sched_metrics_port;    // metrics endpoint port for the scheduler

  // for kvc2
  bool gpu_only;
  bool use_self_defined_head_dim = false;
  size_t self_defined_head_dim; // only meaningful when use_self_defined_head_dim
  bool full_kv_cache_on_each_gpu = false;
  bool k_cache_on = true;
  bool v_cache_on = true;
  std::string kvc2_config_path;
  std::string kvc2_root_path;
  double memory_pool_size_GB = 100;
  size_t evict_count = 20;
  size_t kvc2_metrics_port; // metrics endpoint port for kvc2
  bool load_from_disk = false;
  bool save_to_disk = false;

  // for strategy
  std::string strategy_name; // "FCFS" or "FCFS-single-prefill"

  // derived
  size_t gpu_device_count;
  std::optional<size_t> total_kvcache_pages;
  std::vector<torch::Device> devices;
  void auto_derive(); // compute the derived fields above
};
|
||||
|
||||
using PrefillTask =
|
||||
std::tuple<QueryID, TokenLength, TokenLength>; // id, start, length
|
||||
|
||||
// One batch of work handed to the inference loop: per-query tensors plus
// the prefill/decode mini-batch composition. The per-query vectors below
// are parallel (index i describes query_ids[i]).
struct BatchQueryTodo {
  // query
  std::vector<QueryID> query_ids;
  std::vector<torch::Tensor> query_tokens;
  std::vector<TokenLength> query_lengths;
  std::vector<torch::Tensor>
      block_indexes; // (max_num_blocks_per_seq), dtype torch.int32.
  std::optional<torch::Tensor> attn_masks;
  std::optional<torch::Tensor> rope_ranges;
  std::vector<SampleOptions> sample_options;
  std::vector<std::vector<std::vector<int>>> stop_criteria;

  // mini batches, adjacent two mini batches are executed together
  // tasks count must be <=2, because of flash infer attention
  std::vector<PrefillTask>
      prefill_mini_batches; // prefill minibatch only has 1 prefill
  std::vector<std::vector<QueryID>>
      decode_mini_batches; // decode minibatch has multiple decode

  std::string debug(); // human-readable dump for logging
  bool empty();        // presumably true when the batch holds no work — confirm
};
|
||||
|
||||
// Per-query feedback reported by the inference loop after running a batch.
struct QueryUpdate {
  QueryID id;
  bool ok;
  bool is_prefill;  // this update came from a prefill (vs decode) step
  bool decode_done; // no use for now
  TokenLength active_position; // the position where no kvcache now,
                               // kvcache[active_position] == None

  Token generated_token; // token produced by this step

  std::string debug() const; // human-readable dump for logging
};
|
||||
|
||||
using BatchQueryUpdate = std::vector<QueryUpdate>;
|
||||
|
||||
// KV-cache tensors shared between the scheduler (via kvc2) and the
// inference loop.
struct InferenceContext {
  std::vector<torch::Tensor> k_cache; // [gpu num] (layer_count, num blocks,
                                      // page size, kheadnum, head_dim)
  std::vector<torch::Tensor> v_cache; // same layout as k_cache
};
|
||||
|
||||
using UserID = int64_t;
|
||||
constexpr UserID NoUser = -1;
|
||||
const int MAX_SLO_TIME = 1e9;
|
||||
|
||||
// A new query as submitted by the web server.
struct QueryAdd {
  std::vector<Token> query_token; // prompt token ids (int here)
  // torch::Tensor attn_mask;
  TokenLength query_length;     // presumably the valid prompt length — confirm
  TokenLength estimated_length; // estimated total (prompt + generated) length

  std::vector<std::vector<int>> stop_criteria; // stop token-id sequences

  SampleOptions sample_options;

  UserID user_id; // NoUser (-1) when anonymous
  int SLO_TTFT_ms = MAX_SLO_TIME; // time-to-first-token SLO, milliseconds
  int SLO_TBT_ms = MAX_SLO_TIME;  // time-between-tokens SLO, milliseconds

  // JSON round trip. NOTE(review): stop_criteria is not part of the JSON
  // binding in scheduler.cpp, so it does not survive serialization — confirm
  // that this is intentional.
  std::string serialize();
  static QueryAdd deserialize(const std::string &input);
};
|
||||
|
||||
// Abstract scheduler interface implemented by the strategies in
// scheduler.cpp. Created via create_scheduler(); the web server and the
// inference loop call into it from their own threads.
class Scheduler {
public:
  virtual void init(Settings settings) = 0;

  virtual void run() = 0;  // start the scheduler
  virtual void stop() = 0; // shut it down

  // webserver call this
  virtual QueryID add_query(QueryAdd query) = 0;
  virtual void cancel_query(QueryID id) = 0;

  // inference loop call this
  // Report the results of the previous batch and receive the next one.
  virtual std::shared_ptr<BatchQueryTodo>
  update_last_batch(BatchQueryUpdate updates) = 0;
  virtual InferenceContext get_inference_context() = 0;

  virtual ~Scheduler() = default;
};
|
||||
|
||||
std::shared_ptr<Scheduler> create_scheduler(Settings settings);
|
||||
|
||||
}; // namespace scheduler
|
3
csrc/balance_serve/sched/utils/all.hpp
Normal file
3
csrc/balance_serve/sched/utils/all.hpp
Normal file
|
@ -0,0 +1,3 @@
|
|||
#pragma once
|
||||
#include "readable_number.hpp"
|
||||
#include "timer.hpp"
|
7
csrc/balance_serve/sched/utils/arithmetic.hpp
Normal file
7
csrc/balance_serve/sched/utils/arithmetic.hpp
Normal file
|
@ -0,0 +1,7 @@
|
|||
#include <type_traits>
|
||||
|
||||
// Integer ceiling division: smallest q such that q * by >= x for
// non-negative inputs. Both operands must be integral.
template <typename T, typename U> T div_up(T x, U by) {
  static_assert(std::is_integral_v<T>);
  static_assert(std::is_integral_v<U>);
  auto bumped = x + by - 1; // usual arithmetic conversions, as before
  return bumped / by;
}
|
35
csrc/balance_serve/sched/utils/atomic_ptr_with_flags.hpp
Normal file
35
csrc/balance_serve/sched/utils/atomic_ptr_with_flags.hpp
Normal file
|
@ -0,0 +1,35 @@
|
|||
#include <atomic>
|
||||
|
||||
// Packs a T* and one boolean flag into a single 64-bit atomic word so both
// can be read/updated in one atomic operation. The flag lives in bit 63,
// which is assumed unused by addresses — TODO(review) confirm for every
// target platform/ABI this is built for.
template <typename T> struct AtomicPtrWithFlag {
  constexpr static uint64_t mask = 1ull << 63; // flag bit
  std::atomic_uint64_t ptr = 0; // packed (pointer | flag) word

  // Read pointer and flag together in one atomic load.
  std::pair<T *, bool>
  load(std::memory_order order = std::memory_order_seq_cst) {
    uint64_t val = ptr.load(order);
    return {reinterpret_cast<T *>(val & (~mask)), val & mask};
  }

  // Overwrite both pointer and flag in one atomic store.
  void store(T *p, bool flag,
             std::memory_order order = std::memory_order_seq_cst) {
    ptr.store(reinterpret_cast<uint64_t>(p) | (flag ? mask : 0), order);
  }

  // Swap in a new (pointer, flag) pair; returns the previous pair.
  std::pair<T *, bool>
  exchange(T *p, bool flag,
           std::memory_order order = std::memory_order_seq_cst) {
    uint64_t val =
        ptr.exchange(reinterpret_cast<uint64_t>(p) | (flag ? mask : 0), order);
    return {reinterpret_cast<T *>(val & (~mask)), val & mask};
  }

  // Atomically clear the flag while returning the previous (pointer, flag)
  // pair — i.e. consume the flag notification in a single RMW (fetch_and).
  std::pair<T *, bool>
  touch_load(std::memory_order order = std::memory_order_seq_cst) {
    uint64_t val = ptr.fetch_and(~mask, order);
    return {reinterpret_cast<T *>(val & (~mask)), val & mask};
  }

  // Peek at the flag without modifying the word.
  bool check_flag(std::memory_order order = std::memory_order_seq_cst) {
    return ptr.load(order) & mask;
  }
};
|
229
csrc/balance_serve/sched/utils/csv.hpp
Normal file
229
csrc/balance_serve/sched/utils/csv.hpp
Normal file
|
@ -0,0 +1,229 @@
|
|||
#ifndef CSV_READER_HPP
|
||||
#define CSV_READER_HPP
|
||||
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <mutex>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
namespace csv {
|
||||
|
||||
/**
|
||||
* @brief Parses a CSV line into individual fields, handling quoted fields with
|
||||
* commas and newlines.
|
||||
*
|
||||
* @param line The CSV line to parse.
|
||||
* @return A vector of strings, each representing a field in the CSV line.
|
||||
*/
|
||||
// Split one CSV record into its fields. Quoted fields may contain commas;
// an embedded quote inside a quoted field is written as "" (doubled).
// Always returns at least one field (an empty line yields {""}).
inline std::vector<std::string> parse_csv_line(const std::string &line) {
  std::vector<std::string> fields;
  std::string current;
  bool quoted = false;

  for (size_t pos = 0; pos < line.length(); ++pos) {
    const char ch = line[pos];

    if (ch == '"') {
      const bool doubled_quote =
          quoted && pos + 1 < line.length() && line[pos + 1] == '"';
      if (doubled_quote) {
        current += '"';
        ++pos; // consume the second quote of the "" pair
      } else {
        quoted = !quoted; // toggle quoted state
      }
    } else if (ch == ',' && !quoted) {
      fields.push_back(current);
      current.clear();
    } else {
      current += ch;
    }
  }
  fields.push_back(current);
  return fields;
}
|
||||
|
||||
/**
|
||||
* @brief Reads a CSV file and returns a vector of pairs containing column names
|
||||
* and their corresponding data vectors.
|
||||
*
|
||||
* This function reads the header to obtain column names and uses multithreading
|
||||
* to read and parse the CSV file in chunks.
|
||||
*
|
||||
* @param filename The path to the CSV file.
|
||||
* @return A vector of pairs, each containing a column name and a vector of data
|
||||
* for that column.
|
||||
*/
|
||||
// NOTE(review): because the file body is split on raw '\n' characters (both
// for chunking and per-line parsing), quoted fields that CONTAIN newlines
// are not supported by this reader, despite parse_csv_line's quoting rules.
// Rows with fewer fields than the header leave the trailing columns ragged.
inline std::vector<std::pair<std::string, std::vector<std::string>>>
read_csv(const std::string &filename) {
  std::cout << "Reading CSV file: " << filename << std::endl;
  // Open the file
  std::ifstream file(filename);
  if (!file) {
    throw std::runtime_error("Cannot open file");
  }

  // Read the header line and parse column names
  std::string header_line;
  std::getline(file, header_line);
  std::vector<std::string> column_names = parse_csv_line(header_line);

  // Prepare the result vector with column names
  std::vector<std::pair<std::string, std::vector<std::string>>> result;
  for (const auto &name : column_names) {
    result.emplace_back(name, std::vector<std::string>());
  }

  // Read the rest of the file into a string buffer
  std::stringstream buffer;
  buffer << file.rdbuf();
  std::string content = buffer.str();

  // Determine the number of threads to use
  unsigned int num_threads = std::thread::hardware_concurrency();
  if (num_threads == 0)
    num_threads = 4; // Default to 4 threads if hardware_concurrency returns 0

  // Calculate chunk start positions based on content size
  std::vector<size_t> chunk_starts;
  size_t content_size = content.size();
  size_t chunk_size = content_size / num_threads;

  // Each chunk start is advanced to the next line boundary, so no record
  // straddles two threads' ranges.
  chunk_starts.push_back(0);
  for (unsigned int i = 1; i < num_threads; ++i) {
    size_t pos = i * chunk_size;
    // Adjust position to the next newline character to ensure we start at the
    // beginning of a line
    while (pos < content_size && content[pos] != '\n') {
      ++pos;
    }
    if (pos < content_size) {
      ++pos; // Skip the newline character
    }
    chunk_starts.push_back(pos);
  }
  chunk_starts.push_back(content_size);

  // Create threads to parse each chunk
  std::vector<std::vector<std::vector<std::string>>> thread_results(
      num_threads);
  std::vector<std::thread> threads;

  for (unsigned int i = 0; i < num_threads; ++i) {
    size_t start = chunk_starts[i];
    size_t end = chunk_starts[i + 1];

    // Each worker writes only to its own thread_results[i] slot, so no
    // synchronization is needed beyond the join below.
    threads.emplace_back([&content, start, end, &thread_results, i]() {
      std::vector<std::vector<std::string>> local_result;
      size_t pos = start;
      while (pos < end) {
        size_t next_pos = content.find('\n', pos);
        if (next_pos == std::string::npos || next_pos > end) {
          next_pos = end;
        }
        std::string line = content.substr(pos, next_pos - pos);
        if (!line.empty()) {
          local_result.push_back(parse_csv_line(line));
        }
        pos = next_pos + 1;
      }
      thread_results[i] = std::move(local_result);
    });
  }

  // Wait for all threads to finish
  for (auto &t : threads) {
    t.join();
  }

  // Combine the results from all threads into the final result
  // (chunk order preserves row order within and across threads).
  for (const auto &local_result : thread_results) {
    for (const auto &row : local_result) {
      for (size_t i = 0; i < row.size(); ++i) {
        if (i < result.size()) {
          result[i].second.push_back(row[i]);
        }
      }
    }
  }

  return result;
}
|
||||
|
||||
/**
|
||||
* @brief Writes the CSV data into a file.
|
||||
*
|
||||
* @param filename The path to the output CSV file.
|
||||
* @param data A vector of pairs, each containing a column name and a vector of
|
||||
* data for that column.
|
||||
*/
|
||||
// Write column-oriented CSV data to `filename`.
// `data` holds (column name, column values) pairs; every column must have
// the same number of rows. Fields containing quotes, commas or newlines are
// quoted, with embedded quotes doubled. Throws std::runtime_error if the
// file cannot be opened or the columns are ragged.
inline void write_csv(
    const std::string &filename,
    const std::vector<std::pair<std::string, std::vector<std::string>>> &data) {
  std::cout << "Writing CSV file: " << filename << std::endl;

  std::ofstream file(filename);
  if (!file) {
    throw std::runtime_error("Cannot open file for writing");
  }

  if (data.empty()) {
    return; // Nothing to write
  }
  const size_t num_rows = data[0].second.size();
  for (const auto &column : data) {
    if (column.second.size() != num_rows) {
      throw std::runtime_error("All columns must have the same number of rows");
    }
  }

  // Render one field with CSV quoting/escaping applied.
  auto render_field = [](const std::string &field) -> std::string {
    std::string escaped = field;
    bool quote_it = false;
    if (escaped.find('"') != std::string::npos) {
      quote_it = true;
      // Double every embedded quote.
      size_t at = 0;
      while ((at = escaped.find('"', at)) != std::string::npos) {
        escaped.insert(at, "\"");
        at += 2;
      }
    }
    if (escaped.find(',') != std::string::npos ||
        escaped.find('\n') != std::string::npos) {
      quote_it = true;
    }
    return quote_it ? '"' + escaped + '"' : escaped;
  };

  // Header row: raw column names, comma-separated (as in the original).
  for (size_t col = 0; col < data.size(); ++col) {
    file << data[col].first;
    if (col + 1 != data.size()) {
      file << ',';
    }
  }
  file << '\n';

  // Data rows.
  for (size_t row = 0; row < num_rows; ++row) {
    for (size_t col = 0; col < data.size(); ++col) {
      file << render_field(data[col].second[row]);
      if (col + 1 != data.size()) {
        file << ',';
      }
    }
    file << '\n';
  }
}
|
||||
|
||||
} // namespace csv
|
||||
|
||||
#endif // CSV_READER_HPP
|
15
csrc/balance_serve/sched/utils/easy_format.hpp
Normal file
15
csrc/balance_serve/sched/utils/easy_format.hpp
Normal file
|
@ -0,0 +1,15 @@
|
|||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// Join the elements of `v` with ", ". An empty vector renders as "[]";
// note that non-empty vectors are rendered WITHOUT brackets (preserved
// behavior of the original implementation).
template <typename T> std::string format_vector(const std::vector<T> &v) {
  if (v.empty())
    return "[]";
  std::ostringstream out;
  bool first = true;
  for (const auto &item : v) {
    if (!first)
      out << ", "; // comma separator between elements
    out << item;
    first = false;
  }
  return out.str();
}
|
112
csrc/balance_serve/sched/utils/mpsc.hpp
Normal file
112
csrc/balance_serve/sched/utils/mpsc.hpp
Normal file
|
@ -0,0 +1,112 @@
|
|||
#include <atomic>
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
#include <optional>
|
||||
#include <semaphore>
|
||||
|
||||
// Linked-list multi-producer / single-consumer FIFO queue.
// enqueue() may be called from any number of threads concurrently;
// dequeue() and the dequeue side of size() must only be called from the
// single consumer thread.
template <typename T> class MPSCQueue {
  struct Node {
    T data;
    std::atomic<Node *> next;

    Node() : next(nullptr) {}
    Node(T data_) : data(std::move(data_)), next(nullptr) {}
  };

  std::atomic<Node *> head; // producers publish new nodes here
  Node *tail;               // consumer side; always points at a consumed/dummy node

public:
  std::atomic_size_t enqueue_count = 0;
  size_t dequeue_count = 0; // consumer-thread only

  MPSCQueue() {
    Node *dummy = new Node();
    head.store(dummy, std::memory_order_seq_cst);
    tail = dummy;
  }

  // BUGFIX (rule of five): the queue owns raw Node pointers, so the
  // implicitly-generated copy/move operations would double-delete them.
  // Forbid them explicitly.
  MPSCQueue(const MPSCQueue &) = delete;
  MPSCQueue &operator=(const MPSCQueue &) = delete;
  MPSCQueue(MPSCQueue &&) = delete;
  MPSCQueue &operator=(MPSCQueue &&) = delete;

  ~MPSCQueue() {
    // Walk and free the remaining chain, including the dummy node.
    Node *node = tail;
    while (node) {
      Node *next = node->next.load(std::memory_order_seq_cst);
      delete node;
      node = next;
    }
  }

  // Called by producers (thread-safe).
  void enqueue(T data) {
    enqueue_count.fetch_add(1);
    Node *node = new Node(std::move(data));
    // Publish: swing head to the new node, then link the old head to it.
    Node *prev_head = head.exchange(node, std::memory_order_seq_cst);
    prev_head->next.store(node, std::memory_order_seq_cst);
  }

  // Called by the single consumer; returns nullopt when the queue is empty
  // (or a producer has exchanged head but not yet linked `next`).
  std::optional<T> dequeue() {
    Node *next = tail->next.load(std::memory_order_seq_cst);
    if (next) {
      T res = std::move(next->data);
      delete tail;
      tail = next;
      dequeue_count += 1;
      return res;
    }
    return std::nullopt;
  }

  // Approximate under concurrency; exact when producers are quiescent.
  size_t size() { return enqueue_count.load() - dequeue_count; }
};
|
||||
|
||||
// Blocking wrapper around MPSCQueue: a counting semaphore mirrors the number
// of enqueued-but-not-dequeued items so the consumer can sleep when the
// queue is empty. Requires C++20 (<semaphore>). Single consumer only.
template <typename T> class MPSCQueueConsumerLock {
  MPSCQueue<T> queue;
  std::counting_semaphore<> sema{0};

public:
  // Producer side: push the item, then signal one waiting consumer.
  void enqueue(T data) {
    queue.enqueue(std::move(data));
    // std::atomic_thread_fence(std::memory_order_seq_cst);// Inserting this
    // because the memory order might be wrong, I am also not that sure about
    // this.
    sema.release();
  }

  // Consumer side: block until an item is available.
  T dequeue() {
    auto re = queue.dequeue();
    if (re.has_value()) {
      // An item was already visible: consume its semaphore ticket. The
      // matching release() may not have happened yet, hence the spin/retry.
      while (sema.try_acquire() == false) {
        std::cerr
            << __FILE__ << ":" << __FUNCTION__
            << " sema try acquire should be success, retrying, please check"
            << std::endl;
        // assert(false);
      }
      return re.value();
    }
    // Queue looked empty: sleep until a producer releases, then the item
    // must be linked and dequeue() must succeed.
    sema.acquire();
    return queue.dequeue().value();
  }

  // Consumer side: like dequeue(), but wait at most `dur`; returns nullopt
  // on timeout.
  template <typename Rep, typename Period>
  std::optional<T> try_dequeue_for(std::chrono::duration<Rep, Period> dur) {
    auto re = queue.dequeue();
    if (re.has_value()) {
      while (sema.try_acquire() == false) {
        std::cerr
            << __FILE__ << ":" << __FUNCTION__
            << " sema try acquire should be success, retrying, please check"
            << std::endl;
        // assert(false);
      }
      return re.value();
    }

    if (sema.try_acquire_for(dur)) {
      return queue.dequeue().value();
    } else {
      return std::nullopt;
    }
  }

  // Approximate size; see MPSCQueue::size().
  size_t size() { return queue.size(); }
};
|
20
csrc/balance_serve/sched/utils/readable_number.hpp
Normal file
20
csrc/balance_serve/sched/utils/readable_number.hpp
Normal file
|
@ -0,0 +1,20 @@
|
|||
#pragma once
|
||||
#include <array>
|
||||
#include <iomanip>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
// Decimal (power-of-1000) unit suffixes used by readable_number.
inline std::array<std::string, 7> units = {"", "K", "M", "G", "T", "P", "E"};

// Render a count as a short human-readable string with two decimals and a
// decimal SI suffix, e.g. 1500 -> "1.50K", 2000000 -> "2.00M".
inline std::string readable_number(size_t size) {
  size_t unit_index = 0;
  double readable_size = size;
  // Scale down by 1000 until the value fits the current unit (capped at "E").
  while (readable_size >= 1000 && unit_index < units.size() - 1) {
    readable_size /= 1000;
    unit_index++;
  }
  std::ostringstream ss;
  // fixed + setprecision(2) gives exactly two fractional digits.
  ss << std::fixed << std::setprecision(2) << readable_size;
  // (Removed the redundant `+ ""` concatenation from the original.)
  return ss.str() + units[unit_index];
}
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Reference in a new issue