Merge branch 'kvcache-ai:main' into main

This commit is contained in:
Yuhao Tsui 2025-04-09 11:46:39 +08:00 committed by GitHub
commit 877aec858e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
251 changed files with 47224 additions and 749 deletions

View file

@ -163,6 +163,8 @@ jobs:
- name: build for cuda
if: matrix.cuda != ''
env:
USE_BALANCE_SERVE: "1"
run: |
git submodule init
git submodule update

13
.gitmodules vendored
View file

@ -4,3 +4,16 @@
[submodule "third_party/pybind11"]
path = third_party/pybind11
url = https://github.com/pybind/pybind11.git
[submodule "third_party/spdlog"]
path = third_party/spdlog
url = https://github.com/gabime/spdlog.git
[submodule "third_party/custom_flashinfer"]
path = third_party/custom_flashinfer
url = https://github.com/kvcache-ai/custom_flashinfer.git
branch = fix-precision-mla-merge-main
[submodule "third_party/xxHash"]
path = third_party/xxHash
url = https://github.com/Cyan4973/xxHash.git
[submodule "third_party/prometheus-cpp"]
path = third_party/prometheus-cpp
url = https://github.com/jupp0r/prometheus-cpp

View file

@ -1,38 +1,64 @@
FROM node:20.16.0 as web_compile
WORKDIR /home
RUN <<EOF
git clone https://github.com/kvcache-ai/ktransformers.git &&
cd ktransformers/ktransformers/website/ &&
npm install @vue/cli &&
npm run build &&
rm -rf node_modules
EOF
FROM pytorch/pytorch:2.5.1-cuda12.1-cudnn9-devel as compile_server
ARG CPU_INSTRUCT=NATIVE
# Set the working directory and CUDA path
WORKDIR /workspace
ENV CUDA_HOME /usr/local/cuda
COPY --from=web_compile /home/ktransformers /workspace/ktransformers
RUN <<EOF
apt update -y && apt install -y --no-install-recommends \
ENV CUDA_HOME=/usr/local/cuda
# Install dependencies
RUN apt update -y
RUN apt install -y --no-install-recommends \
libtbb-dev \
libssl-dev \
libcurl4-openssl-dev \
libaio1 \
libaio-dev \
libfmt-dev \
libgflags-dev \
zlib1g-dev \
patchelf \
git \
wget \
vim \
gcc \
g++ \
cmake &&
rm -rf /var/lib/apt/lists/* &&
cd ktransformers &&
git submodule init &&
git submodule update &&
pip install --upgrade pip &&
pip install ninja pyproject numpy cpufeature &&
pip install flash-attn &&
CPU_INSTRUCT=${CPU_INSTRUCT} KTRANSFORMERS_FORCE_BUILD=TRUE TORCH_CUDA_ARCH_LIST="8.0;8.6;8.7;8.9;9.0+PTX" pip install . --no-build-isolation --verbose &&
pip cache purge &&
cp /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /opt/conda/lib/
EOF
cmake
# Clone the source code
RUN git clone https://github.com/kvcache-ai/ktransformers.git
# Clean the apt cache
RUN rm -rf /var/lib/apt/lists/*
ENTRYPOINT ["tail", "-f", "/dev/null"]
# Enter the project directory
WORKDIR /workspace/ktransformers
# Initialize submodules
RUN git submodule update --init --recursive
# Upgrade pip
RUN pip install --upgrade pip
# Install build dependencies
RUN pip install ninja pyproject numpy cpufeature aiohttp zmq openai
# Install flash-attn (installing it early avoids some compilation dependency errors later)
RUN pip install flash-attn
# Install ktransformers itself (includes compilation)
RUN CPU_INSTRUCT=${CPU_INSTRUCT} \
USE_BALANCE_SERVE=1 \
KTRANSFORMERS_FORCE_BUILD=TRUE \
TORCH_CUDA_ARCH_LIST="8.0;8.6;8.7;8.9;9.0+PTX" \
pip install . --no-build-isolation --verbose
RUN pip install third_party/custom_flashinfer/
# Clean the pip cache
RUN pip cache purge
# Copy the C++ runtime library
RUN cp /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /opt/conda/lib/
# Keep the container running (for debugging)
ENTRYPOINT ["tail", "-f", "/dev/null"]

View file

@ -1,6 +1,7 @@
graft third_party
graft ktransformers
graft local_chat.py
graft csrc
include LICENSE README.md
prune ktransformers/website
prune ktransformers/logs
@ -9,3 +10,4 @@ prune third_party/llama.cpp/models
graft ktransformers/website/dist
global-exclude __pycache__
include KTransformersOps.*.so
include cpuinfer_ext.*.so

View file

@ -29,4 +29,4 @@ clean:
install_numa:
USE_NUMA=1 make dev_install
install_no_numa:
env -u USE_NUMA make dev_install

View file

@ -23,17 +23,23 @@ Our vision for KTransformers is to serve as a flexible platform for experimentin
<h2 id="Updates">🔥 Updates</h2>
* **Apr 2, 2025**: Support Multi-concurrency. ([Tutorial](./doc/en/balance-serve.md)).
https://github.com/user-attachments/assets/faa3bda2-928b-45a7-b44f-21e12ec84b8a
* **Mar 15, 2025**: Support ROCm on AMD GPU ([Tutorial](./doc/en/ROCm.md)).
* **Mar 5, 2025**: Support unsloth 1.58/2.51 bits weights and [IQ1_S/FP8 hybrid](./doc/en/fp8_kernel.md) weights. Support 139K [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022--v023-longer-context--fp8-kernel) for DeepSeek-V3 and R1 in 24GB VRAM.
* **Feb 25, 2025**: Support [FP8 GPU kernel](./doc/en/fp8_kernel.md) for DeepSeek-V3 and R1; [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context).
* **Feb 15, 2025**: Longer Context (from 4K to 8K for 24GB VRAM) & Slightly Faster Speed (+15%, up to 16 Tokens/s), update [docs](./doc/en/DeepseekR1_V3_tutorial.md) and [online books](https://kvcache-ai.github.io/ktransformers/).
* **Feb 10, 2025**: Support Deepseek-R1 and V3 on single (24GB VRAM)/multi gpu and 382G DRAM, up to 3~28x speedup. For detailed show case and reproduction tutorial, see [here](./doc/en/DeepseekR1_V3_tutorial.md).
* **Aug 28, 2024**: Decrease DeepseekV2's required VRAM from 21G to 11G.
* **Aug 15, 2024**: Update detailed [tutorial](doc/en/injection_tutorial.md) for injection and multi-GPU.
* **Aug 14, 2024**: Support llamafile as linear backend.
* **Aug 12, 2024**: Support multiple GPU; Support new model: mixtral 8\*7B and 8\*22B; Support q2k, q3k, q5k dequant on gpu.
* **Aug 9, 2024**: Support windows native.
<!-- * **Aug 28, 2024**: Support 1M context under the InternLM2.5-7B-Chat-1M model, utilizing 24GB of VRAM and 150GB of DRAM. The detailed tutorial is [here](./doc/en/long_context_tutorial.md). -->
<h2 id="show-cases">🌟 Show Cases</h2>
<div>
@ -45,16 +51,16 @@ https://github.com/user-attachments/assets/ebd70bfa-b2c1-4abb-ae3b-296ed38aa285
</p>
- **[NEW!!!] Local 671B DeepSeek-Coder-V3/R1:** Running its Q4_K_M version using only 14GB VRAM and 382GB DRAM([Tutorial](./doc/en/DeepseekR1_V3_tutorial.md)).
- Prefill Speed (tokens/s):
- KTransformers: 54.21 (32 cores) → 74.362 (dual-socket, 2×32 cores) → 255.26 (optimized AMX-based MoE kernel, V0.3 only) → 286.55 (selectively using 6 experts, V0.3 only)
- Compared to 10.31 tokens/s in llama.cpp with 2×32 cores, achieving up to **27.79× speedup**.
- Decode Speed (tokens/s):
- KTransformers: 8.73 (32 cores) → 11.26 (dual-socket, 2×32 cores) → 13.69 (selectively using 6 experts, V0.3 only)
- Compared to 4.51 tokens/s in llama.cpp with 2×32 cores, achieving up to **3.03× speedup**.
- Upcoming Open Source Release:
- AMX optimizations and selective expert activation will be open-sourced in V0.3.
- Currently available only in preview binary distribution, which can be downloaded [here](./doc/en/DeepseekR1_V3_tutorial.md).
- **Local 236B DeepSeek-Coder-V2:** Running its Q4_K_M version using only 21GB VRAM and 136GB DRAM, attainable on a local desktop machine, which scores even better than GPT4-0613 in [BigCodeBench](https://huggingface.co/blog/leaderboard-bigcodebench).
<p align="center">
@ -96,19 +102,16 @@ https://github.com/user-attachments/assets/a865e5e4-bca3-401e-94b8-af3c080e6c12
* **Flexible Sparse Attention Framework**: Offers a flexible block sparse attention framework for CPU offloaded decoding. Compatible with SnapKV, Quest, and InfLLm. Further information is available [here](./doc/en/long_context_introduction.md).
-->
<strong>More advanced features are coming soon, so stay tuned!</strong>
<h2 id="quick-start">🚀 Quick Start</h2>
Getting started with KTransformers is simple! Follow the steps below to set up and start using it.
### 📥 Installation
To install KTransformers, follow the official [Installation Guide](https://kvcache-ai.github.io/ktransformers/en/install.html).
<h2 id="tutorial">📃 Brief Injection Tutorial</h2>
At the heart of KTransformers is a user-friendly, template-based injection framework.
This allows researchers to easily replace original torch modules with optimized variants. It also simplifies the process of combining multiple optimizations, allowing the exploration of their synergistic effects.
@ -167,7 +170,6 @@ The development of KTransformers is based on the flexible and versatile framewor
KTransformers is actively maintained and developed by contributors from the <a href="https://madsys.cs.tsinghua.edu.cn/">MADSys group</a> at Tsinghua University and members from <a href="http://approaching.ai/">Approaching.AI</a>. We welcome new contributors to join us in making KTransformers faster and easier to use.
<h2 id="ack">Discussion</h2>
If you have any questions, feel free to open an issue. Alternatively, you can join our WeChat group for further discussion. QR Code: [WeChat Group](WeChatGroup.png)

View file

@ -0,0 +1,67 @@
cmake_minimum_required(VERSION 3.21)
find_program(GCC_COMPILER NAMES g++-13 g++-12 g++-11 REQUIRED)
set(CMAKE_CXX_COMPILER ${GCC_COMPILER})
# Show the selected compiler
message(STATUS "Using compiler: ${CMAKE_CXX_COMPILER}")
project(balance_serve VERSION 0.1.0)
set(CMAKE_CXX_STANDARD 20)
# set(CMAKE_CXX_FLAGS "-Og -march=native -Wall -Wextra -g -fPIC")
# set(CMAKE_BUILD_TYPE "Debug")
set(CMAKE_CXX_FLAGS "-O3 -march=native -Wall -Wextra -fPIC")
add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI})
set(CMAKE_BUILD_TYPE "Release")
file(GLOB_RECURSE FMT_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/*.hpp" "${CMAKE_CURRENT_SOURCE_DIR}/*.h")
add_custom_target(
format
COMMAND clang-format
-i
-style=file
${FMT_SOURCES}
COMMENT "Running clang-format on all source files"
)
set(BUILD_SHARED_LIBS ON)
set(ENABLE_PUSH OFF)
set(ENABLE_COMPRESSION OFF)
# set(CMAKE_BUILD_TYPE "Release")
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(THIRD_PARTY_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../third_party)
set(THIRD_PARTY_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/third_party)
add_subdirectory(${THIRD_PARTY_DIR}/prometheus-cpp ${THIRD_PARTY_BUILD_DIR}/prometheus-cpp EXCLUDE_FROM_ALL)
add_subdirectory(${THIRD_PARTY_DIR}/xxHash/cmake_unofficial ${THIRD_PARTY_BUILD_DIR}/xxHash EXCLUDE_FROM_ALL)
# add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third_party/prometheus-cpp ${CMAKE_CURRENT_BINARY_DIR}/third_party/prometheus-cpp)
set(SPDLOG_DIR ${THIRD_PARTY_DIR}/spdlog)
set(FMT_DIR ${THIRD_PARTY_DIR}/fmt)
set(KVC2_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/kvc2/src)
include_directories(${THIRD_PARTY_DIR})
add_subdirectory(${THIRD_PARTY_DIR}/pybind11 ${THIRD_PARTY_BUILD_DIR}/pybind11)
execute_process(
COMMAND python3 -c "import torch; print(torch.__path__[0])"
OUTPUT_VARIABLE TORCH_INSTALL_PREFIX
OUTPUT_STRIP_TRAILING_WHITESPACE
)
message(STATUS "Found PyTorch at: ${TORCH_INSTALL_PREFIX}")
# set(TORCH_INSTALL_PREFIX "/home/xwy/.conda/envs/kvc/lib/python3.12/site-packages/torch")
find_library(TORCH_PYTHON_LIBRARY torch_python PATH "${TORCH_INSTALL_PREFIX}/lib")
find_package(Torch REQUIRED PATHS "${TORCH_INSTALL_PREFIX}/share/cmake/Torch" NO_DEFAULT_PATH)
add_subdirectory(kvc2)
add_subdirectory(sched)
# add_subdirectory(test)

View file

@ -0,0 +1,25 @@
Language: Cpp
# The base style can be LLVM, Google, Chromium, Mozilla, WebKit, etc., or a custom one
BasedOnStyle: Google
# Indentation settings
IndentWidth: 2
TabWidth: 2
UseTab: Never
# Line-breaking settings
BreakBeforeBraces: Attach
AllowShortIfStatementsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
# Classes and structs
DerivePointerAlignment: false
PointerAlignment: Left
# Include sorting and grouping
IncludeBlocks: Preserve
SortIncludes: true
# Maximum line width
ColumnLimit: 120

View file

@ -0,0 +1,103 @@
cmake_minimum_required(VERSION 3.21)
find_program(GCC_COMPILER NAMES g++-13 g++-12 g++-11 REQUIRED)
set(CMAKE_CXX_COMPILER ${GCC_COMPILER})
project(kvcache-manager VERSION 0.1.0)
set(CMAKE_CXX_STANDARD 20)
# set(CMAKE_CXX_FLAGS "-fPIC -O3 -ffast-math -march=native -Wall -Wextra -Wpedantic -fvisibility=hidden -s")
# set(CMAKE_CXX_FLAGS "-Og -march=native -Wall -Wextra -Wpedantic -g -fsanitize=address")
# set(CMAKE_CXX_FLAGS "-march=native -Wall -Wextra -Wpedantic -g")
# set(CMAKE_CXX_FLAGS "-fPIC -O3 -ffast-math -march=native -Wall -Wextra -g")
set(CMAKE_BUILD_TYPE "Release")
add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI})
# set(CMAKE_BUILD_TYPE "Debug")
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(BUILD_TEST OFF)
set(BUILD_PYTHON_EXT OFF)
# set(USE_IO_URING ON)
if(USE_IO_URING)
message(STATUS "Using io_uring")
add_compile_definitions(USE_IO_URING)
else()
message(STATUS "Using aio")
endif()
file(GLOB_RECURSE ALL_SOURCE_FILES src/*.cpp src/*.h test/*.cpp test/*.h test/*.hpp)
# Add a custom target to format all source files
if(NOT TARGET format)
add_custom_target(
format
COMMAND clang-format
-i
-style=file
${ALL_SOURCE_FILES}
COMMENT "Running clang-format on all source files"
)
endif()
execute_process(
COMMAND python3 -c "import torch; print(torch.__path__[0])"
OUTPUT_VARIABLE TORCH_INSTALL_PREFIX
OUTPUT_STRIP_TRAILING_WHITESPACE
)
message(STATUS "Found PyTorch at: ${TORCH_INSTALL_PREFIX}")
# set(TORCH_INSTALL_PREFIX "/home/xwy/.conda/envs/kvc/lib/python3.12/site-packages/torch")
find_library(TORCH_PYTHON_LIBRARY torch_python PATH "${TORCH_INSTALL_PREFIX}/lib")
find_package(Torch REQUIRED PATHS "${TORCH_INSTALL_PREFIX}/share/cmake/Torch" NO_DEFAULT_PATH)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party)
find_package(TBB REQUIRED)
find_package(CUDA REQUIRED)
# find_package(prometheus-cpp CONFIG REQUIRED)
if(NOT TARGET prometheus-cpp::pull)
message(FATAL_ERROR "prometheus-cpp::pull not found")
else()
message(STATUS "prometheus Found!")
endif()
if(CUDA_FOUND)
message(STATUS "CUDA Found!")
message(STATUS "CUDA Version: ${CUDA_VERSION_STRING}")
message(STATUS "CUDA Toolkit Root: ${CUDA_TOOLKIT_ROOT_DIR}")
else()
message(FATAL_ERROR "CUDA not found!")
endif()
add_subdirectory(src)
if(BUILD_TEST)
add_subdirectory(test)
endif()
message(STATUS "BUILD_PYTHON_EXT: ${BUILD_PYTHON_EXT}")
if(BUILD_PYTHON_EXT)
if(NOT TARGET pybind11::pybind11)
add_subdirectory(${THIRD_PARTY_DIR}/pybind11 ${THIRD_PARTY_BUILD_DIR}/pybind11)
endif()
pybind11_add_module(kvc2_ext src/bind.cpp)
# EXAMPLE_VERSION_INFO is defined by setup.py and passed into the C++ code as a
# define (VERSION_INFO) here.
target_compile_definitions(kvc2_ext PRIVATE VERSION_INFO=${EXAMPLE_VERSION_INFO})
message(STATUS "CMAKE_CURRENT_SOURCE_DIR: ${CMAKE_CURRENT_SOURCE_DIR}")
target_include_directories(kvc2_ext PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/spdlog/include)
target_link_libraries(kvc2_ext PUBLIC kvc2 async_store)
install(TARGETS kvc2_ext LIBRARY
DESTINATION ${CMAKE_BINARY_DIR}/output)
install(FILES src/kvc2_utils.py
DESTINATION ${CMAKE_BINARY_DIR}/output)
endif()

View file

@ -0,0 +1,38 @@
# KVC2
# Build
Run the following commands to build kvc2 (note: sudo privileges may be required to install some dependencies):
```shell
git clone https://github.com/kvcache-ai/kvc2
cd kvc2
./install_deps.sh
mkdir build
cd build
cmake ..
make -j && make install
```
After the build completes, `build/output` is generated, containing `kvc2_ext.cpython-312-x86_64-linux-gnu.so` and `kvc2_utils.py` for convenient use.
<!-- # Test
Run the following command to test kvc2; a disk path must be specified as the test directory.
```shell
./unit_test.sh ${DISK_PATH}
```
Or run the Python test file:
```shell
python test/pytest_mem_read.py
``` -->
# Troubleshooting
When running in a Python environment, you may need to install the related dependencies via conda:
```shell
conda install -c conda-forge gcc_linux-64 gxx_linux-64
```
You can also try setting the following environment variables before running:
```shell
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libffi.so.7
```

View file

@ -0,0 +1,42 @@
{
"DeepSeek-Coder-V2-Instruct": {
"hidden_size": 5120,
"intermediate_size": 12288,
"max_position_embeddings": 163840,
"model_type": "deepseek_v2",
"num_attention_heads": 128,
"num_hidden_layers": 60,
"num_key_value_heads": 128,
"vocab_size": 102400
},
"LLaMA-2-7B-32K": {
"hidden_size": 4096,
"intermediate_size": 11008,
"max_position_embeddings": 32768,
"model_type": "llama",
"num_attention_heads": 32,
"num_hidden_layers": 32,
"num_key_value_heads": 32,
"vocab_size": 32000
},
"Qwen2.5-7B-Instruct": {
"hidden_size": 3584,
"intermediate_size": 18944,
"max_position_embeddings": 32768,
"model_type": "qwen2",
"num_attention_heads": 28,
"num_hidden_layers": 28,
"num_key_value_heads": 4,
"vocab_size": 152064
},
"qwen2-72b-instruct": {
"hidden_size": 8192,
"intermediate_size": 29568,
"max_position_embeddings": 32768,
"model_type": "qwen2",
"num_attention_heads": 64,
"num_hidden_layers": 80,
"num_key_value_heads": 8,
"vocab_size": 152064
}
}
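For orientation, the per-token KV-cache footprint implied by these entries follows the usual formula 2 (K and V) × num_hidden_layers × num_key_value_heads × head_dim × bytes per element, assuming head_dim = hidden_size / num_attention_heads. A minimal sketch (not part of this diff; the BF16 element size is an assumption) for the Qwen2.5-7B-Instruct entry:

```cpp
// Hypothetical helper, not from the repo: per-token KV-cache bytes for the
// Qwen2.5-7B-Instruct entry above, assuming head_dim = hidden_size / num_attention_heads
// and 2-byte (BF16) elements.
#include <cstdio>

int main() {
  const long hidden_size = 3584, num_attention_heads = 28;
  const long num_hidden_layers = 28, num_key_value_heads = 4;
  const long head_dim = hidden_size / num_attention_heads;  // 128
  const long bytes_per_element = 2;                         // BF16 (assumption)
  const long kv_bytes_per_token =                           // K and V, across all layers
      2 * num_hidden_layers * num_key_value_heads * head_dim * bytes_per_element;
  std::printf("%ld bytes (~%ld KiB) per token\n", kv_bytes_per_token, kv_bytes_per_token / 1024);
  return 0;  // 57344 bytes, i.e. 56 KiB per cached token
}
```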

View file

@ -0,0 +1,57 @@
{
"BF16": {
"block_element_count": 1,
"block_element_size": 2,
"bytes_per_element": 2.0,
"can_be_used_as_vector": true,
"has_min": false,
"has_scale": false,
"name": "BF16",
"reference": "",
"type_of_dot_vector": "BF16"
},
"FP16": {
"block_element_count": 1,
"block_element_size": 2,
"bytes_per_element": 2.0,
"can_be_used_as_vector": true,
"has_min": false,
"has_scale": false,
"name": "FP16",
"reference": "",
"type_of_dot_vector": "FP16"
},
"FP32": {
"block_element_count": 1,
"block_element_size": 4,
"bytes_per_element": 4.0,
"can_be_used_as_vector": true,
"has_min": false,
"has_scale": false,
"name": "FP32",
"reference": "",
"type_of_dot_vector": "FP32"
},
"Q4_0": {
"block_element_count": 32,
"block_element_size": 18,
"bytes_per_element": 0.5625,
"can_be_used_as_vector": false,
"has_min": false,
"has_scale": true,
"name": "Q4_0",
"reference": "https://huggingface.co/docs/hub/gguf",
"type_of_dot_vector": "Q8_0"
},
"Q8_0": {
"block_element_count": 32,
"block_element_size": 34,
"bytes_per_element": 1.0625,
"can_be_used_as_vector": true,
"has_min": false,
"has_scale": true,
"name": "Q8_0",
"reference": "https://huggingface.co/docs/hub/gguf",
"type_of_dot_vector": "Q8_0"
}
}
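For reference, `bytes_per_element` here appears to be `block_element_size / block_element_count`: Q4_0 packs 32 elements into an 18-byte block (a 2-byte FP16 scale plus 32 four-bit values), giving 18 / 32 = 0.5625, while Q8_0 packs 32 elements into 34 bytes (a 2-byte scale plus 32 one-byte values), giving 34 / 32 = 1.0625.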

View file

@ -0,0 +1,2 @@
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libffi.so.7

View file

@ -0,0 +1,15 @@
#!/bin/bash
cd "${0%/*}"
git submodule update --init --recursive
sudo apt update
sudo apt install libtbb-dev
sudo apt install libcurl4-openssl-dev
sudo apt install libaio-dev
cd third_party/xxHash/
make -j
sudo make install
cd ../..

View file

@ -0,0 +1,4 @@
sudo umount /mnt/xwy
sudo mkfs.xfs /dev/nvme0n1 -f
sudo mount /dev/nvme0n1 /mnt/xwy
sudo chown -R xwy /mnt/xwy/

View file

@ -0,0 +1,45 @@
include_directories(${THIRD_PARTY_DIR}/asyncio/include)
add_library(kvc2_metrics STATIC metrics.cpp)
target_link_libraries(kvc2_metrics PUBLIC prometheus-cpp::pull)
add_library(page_aligned_memory_pool page_aligned_memory_pool.cpp)
target_include_directories(page_aligned_memory_pool PRIVATE ${THIRD_PARTY_DIR}/spdlog/include)
function(add_third_party_includes TARGET_NAME)
target_include_directories(${TARGET_NAME} PRIVATE
${THIRD_PARTY_BUILD_DIR}/prometheus-cpp/core/include
${THIRD_PARTY_BUILD_DIR}/prometheus-cpp/pull/include
${THIRD_PARTY_DIR}/prometheus-cpp/core/include
${THIRD_PARTY_DIR}/prometheus-cpp/pull/include
${THIRD_PARTY_DIR}/spdlog/include
)
endfunction()
add_library(cache_entry cache_entry.cpp)
add_third_party_includes(cache_entry)
target_link_libraries(cache_entry PUBLIC gpu_cache)
add_library(gpu_cache gpu_cache.cpp)
add_third_party_includes(gpu_cache)
target_link_libraries(gpu_cache PUBLIC xxHash::xxhash ${TORCH_LIBRARIES} cuda_stream_manager)
add_library(kvc2 prefix.cpp)
target_include_directories(kvc2 PRIVATE ${THIRD_PARTY_DIR}/nlohmann/single_include)
add_third_party_includes(kvc2)
target_link_libraries(kvc2 PUBLIC TBB::tbb xxHash::xxhash cache_entry cuda_stream_manager page_aligned_memory_pool ${TORCH_LIBRARIES} prometheus-cpp::pull kvc2_metrics)
message(STATUS "CMAKE_SOURCE_DIR: " ${CMAKE_SOURCE_DIR})
add_library(async_store async_store.cpp)
target_include_directories(async_store PRIVATE ${THIRD_PARTY_DIR}/nlohmann/single_include)
target_include_directories(async_store PRIVATE ${THIRD_PARTY_DIR}/spdlog/include)
target_link_libraries(async_store PUBLIC pthread)
add_library(cuda_stream_manager cuda_stream_manager.cpp)
target_include_directories(cuda_stream_manager PUBLIC ${THIRD_PARTY_DIR}/nlohmann/single_include)
target_include_directories(cuda_stream_manager PUBLIC ${THIRD_PARTY_DIR}/spdlog/include)
target_include_directories(cuda_stream_manager PUBLIC ${CUDAToolkit_INCLUDE_DIRS})
target_link_libraries(cuda_stream_manager PUBLIC CUDA::cudart)

View file

@ -0,0 +1,137 @@
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <filesystem>
#include <future>
#include <iostream>
#include <nlohmann/json.hpp>
#include <optional>
#include <queue>
#include <thread>
#include <unordered_map>
#include "utils/lock_free_queue.hpp"
#include "async_store.hh"
namespace async_store {
struct ArrayStore {
static const size_t DeviceBlockSize = 512;
const size_t element_size;
const size_t element_size_aligned;
size_t size;
size_t size_in_bytes() { return size * element_size_aligned; }
std::filesystem::path data_path;
void extend(size_t to) {
if (to <= size) {
return;
}
// TODO: extend file
size = to;
// LOG_INFO("Extend file to `, size `", to, size_in_bytes());
}
ArrayStore(size_t element_size, size_t size, std::filesystem::path data_path)
    : element_size(element_size),
      // round each element up to a whole number of 512-byte device blocks
      element_size_aligned((element_size + DeviceBlockSize - 1) / DeviceBlockSize * DeviceBlockSize),
      size(size),
      data_path(data_path) {
// TODO: prefix cache
}
void read(size_t index, void* buffer) {
// TODO: read from file
}
void write(size_t index, void* buffer) {
// TODO: write to file
}
};
ArrayStore* create_or_open_store(size_t element_size, size_t size, std::filesystem::path data_path) {
return new ArrayStore(element_size, size, data_path);
}
void close_store(ArrayStore* store) {
delete store;
}
size_t capacity(ArrayStore* store) {
return store->size;
}
void extend(ArrayStore* store, size_t to) {
store->extend(to);
}
template <typename T>
struct ArrayStoreT {
ArrayStore store;
ArrayStoreT(size_t element_count, std::filesystem::path data_path) : store(sizeof(T), element_count, data_path) {}
void read(size_t index, void* output) { store.read(index, output); }
void write(size_t index, T& value) { store.write(index, &value); }
void write(size_t index, void* value) { store.write(index, value); }
};
std::string request_to_string(IORequest* req) {
return fmt::format("IOReqeust {} {} to {}[{}]", req->write ? "Write" : "Read ", req->data,
req->store->data_path.c_str(), req->index);
}
struct IODealerImpl {
MPSCQueue<IORequest> ioQueue;
uint64_t io_cnt = 0;
size_t io_amount = 0;
bool use_io_uring;
int IO_DEPTH;
bool stop = false;
IODealerImpl(bool use_io_uring, int IO_DEPTH) : use_io_uring(use_io_uring), IO_DEPTH(IO_DEPTH) {}
void queue_consumer() {
// TODO:
}
void io_perf() {
// TODO:
}
void io_dealer() {
// TODO:
}
};
IODealer::IODealer(bool use_io_uring, int IO_DEPTH) {
io_impl = new IODealerImpl(use_io_uring, IO_DEPTH);
}
IODealer::~IODealer() {
stop();
delete io_impl;
}
void IODealer::enqueue(std::shared_ptr<IORequest> req) {
io_impl->ioQueue.enqueue(req);
}
std::thread IODealer::start_io_thread() {
return std::thread([this]() { io_impl->io_dealer(); });
}
void IODealer::stop() {
if (io_impl->stop) {
return;
}
// LOG_INFO("Stopping IO Dealer");
io_impl->stop = true;
}
} // namespace async_store

View file

@ -0,0 +1,51 @@
#pragma once
#include <cstddef>
#include <filesystem>
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
#include "io_helper.hpp"
namespace async_store {
struct ArrayStore;
ArrayStore* create_or_open_store(size_t element_size, size_t size, std::filesystem::path data_path);
void close_store(ArrayStore* store);
size_t capacity(ArrayStore* store);
void extend(ArrayStore* store, size_t to);
struct IORequest {
ArrayStore* store;
bool write;
void* data;
size_t index;
// for sync
bool need_promise = false;
BatchPromise* promise;
};
std::string request_to_string(IORequest* req);
struct IODealerImpl;
struct IODealer {
IODealerImpl* io_impl;
IODealer(bool use_io_uring = false, int IO_DEPTH = 128);
~IODealer();
IODealer(const IODealer&) = delete;
IODealer& operator=(const IODealer&) = delete;
IODealer(IODealer&&) = default;
IODealer& operator=(IODealer&&) = default;
void enqueue(std::shared_ptr<IORequest> req);
std::thread start_io_thread();
void stop();
};
} // namespace async_store
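For orientation, a minimal usage sketch of the interface declared above (not part of this diff). It assumes the repo headers are on the include path and a complete `IODealer` implementation behind them; the file path and sizes are illustrative.

```cpp
// Sketch only: write one 4 KiB element through the async_store API and wait for it
// to complete via a BatchPromise. Path and sizes are illustrative assumptions.
#include <cstdlib>
#include <memory>
#include <thread>
#include "async_store.hh"

int main() {
  using namespace async_store;
  const size_t element_size = 4096, element_count = 1024;
  ArrayStore* store = create_or_open_store(element_size, element_count, "/tmp/kvc2_demo.bin");

  IODealer dealer(/*use_io_uring=*/false, /*IO_DEPTH=*/128);
  std::thread io_thread = dealer.start_io_thread();

  void* buf = std::aligned_alloc(512, element_size);  // match the 512-byte device block size

  BatchPromise done(1);                               // one outstanding request
  auto req = std::make_shared<IORequest>();
  req->store = store;
  req->write = true;
  req->data = buf;
  req->index = 0;
  req->need_promise = true;
  req->promise = &done;
  dealer.enqueue(req);
  done.get_shared_fut().wait();                       // blocks until the dealer completes the write

  dealer.stop();
  io_thread.join();
  close_store(store);
  std::free(buf);
}
```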

View file

@ -0,0 +1,53 @@
// #include <pybind11/functional.h>
// #include <pybind11/pybind11.h>
// #include <pybind11/stl.h>
// #include <memory>
// #include <thread>
// #include <vector>
// #include "kvc2.h"
// #define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
// #define FMT_HEADER_ONLY
// #include "spdlog/spdlog.h"
// #include "utils/arithmetic.hpp"
// namespace py = pybind11;
// PYBIND11_MODULE(kvc2_ext, m) {
// // Bind KVC2Config struct
// py::class_<kvc2::KVC2Config>(m, "KVC2Config")
// .def(py::init<>())
// .def_readwrite("path", &kvc2::KVC2Config::path)
// .def_readwrite("block_length", &kvc2::KVC2Config::num_token_per_page)
// .def_readwrite("memory_pool_size", &kvc2::KVC2Config::memory_pool_size)
// .def_readwrite("evict_count", &kvc2::KVC2Config::evict_count);
// // Bind CacheInfo struct
// py::class_<kvc2::CacheInfo>(m, "CacheInfo")
// .def(py::init<>())
// .def_readwrite("model_name", &kvc2::CacheInfo::model_name)
// .def_readwrite("is_key_cache", &kvc2::CacheInfo::is_key_cache)
// .def_readwrite("quant_type", &kvc2::CacheInfo::quant_type)
// .def("hidden_layer_count", &kvc2::CacheInfo::hidden_layer_count)
// .def("path", &kvc2::CacheInfo::path, py::arg("which_layer") = std::nullopt)
// .def("__eq__", &kvc2::CacheInfo::operator==)
// .def("element_size", &kvc2::CacheInfo::element_size)
// .def("hash_value", &kvc2::CacheInfo::hash_value);
// // Bind KVC2HandleInterface class
// py::class_<kvc2::KVC2HandleInterface, std::shared_ptr<kvc2::KVC2HandleInterface>>(m, "KVC2HandleInterface")
// .def("matched_length", &kvc2::SingleCacheHandleInterface::matched_length)
// .def("handle_data", &kvc2::KVC2HandleInterface::handle_data);
// // Bind KVC2Interface class
// py::class_<kvc2::KVC2Interface, std::shared_ptr<kvc2::KVC2Interface>>(m, "KVC2Interface")
// .def("start_io_thread", [](kvc2::KVC2Interface& self) { self.start_io_thread(); })
// .def("stop_io_thread", &kvc2::KVC2Interface::stop_io_thread)
// .def("load", &kvc2::KVC2Interface::load)
// .def("save", &kvc2::KVC2Interface::save)
// .def("raw_insert", &kvc2::KVC2Interface::raw_insert)
// .def("raw_read", &kvc2::KVC2Interface::raw_read)
// .def("lookup", &kvc2::KVC2Interface::lookup);
// // Bind create_kvc2 function
// m.def("create_kvc2", &kvc2::create_kvc2, py::arg("config"));
// }

View file

@ -0,0 +1,263 @@
#include "cache_entry.hh"
#include <mutex>
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
#include "gpu_cache.hh"
namespace kvc2 {
bool ConcurrentControlUnit::can_desert() {
if (ref_count.load() == 0 && dirty.load() == false) {
tc.reset();
return true;
} else {
return false;
}
}
void ConcurrentControlUnit::debug() {
SPDLOG_DEBUG("ref count {}, dirty {}, {}", ref_count.load(), dirty.load(), tc.debug());
}
CacheBlockEntry::~CacheBlockEntry() {
if (data != nullptr && manager && manager->pool) {
SPDLOG_WARN("Free {} when destruct", data);
free_on_cpu();
}
}
bool CacheBlockEntry::alloc_on_cpu() {
assert(data == nullptr);
data = manager->pool->alloc(size);
if (data == nullptr) {
manager->evict_for_cpu_cache();
data = manager->pool->alloc(size);
if (data == nullptr) {
SPDLOG_ERROR("Not enough memory for Block Cache");
return false;
}
}
return true;
}
void CacheBlockEntry::free_on_cpu() {
manager->pool->free(data, size);
data = nullptr;
}
bool CacheBlockEntry::alloc_on_cpu_no_lock() {
if (data == nullptr) {
if (alloc_on_cpu() == false) {
return false;
}
}
return true;
}
bool CacheBlockEntry::inc_ref_or_alloc_on_cpu() {
std::lock_guard<CacheBlockEntry::MutexT> lg(lock);
if (data == nullptr) {
if (alloc_on_cpu()) {
cpu_cc.ref_count.fetch_add(1);
return true;
} else {
return false;
}
} else {
cpu_cc.ref_count.fetch_add(1);
return true;
}
}
std::unique_lock<CacheBlockEntry::MutexT> CacheBlockEntry::try_lock() {
return std::unique_lock<CacheBlockEntry::MutexT>(lock, std::try_to_lock);
}
std::lock_guard<CacheBlockEntry::MutexT> CacheBlockEntry::lock_guard() {
return std::lock_guard<CacheBlockEntry::MutexT>(lock);
}
void CacheBlockEntry::debug() {
SPDLOG_DEBUG(
"CacheBlockEntry: disk[{:4},{:7}], with key {}, hash {:016x}, data: {}, ref_count: {}, size: {}, cpu tc: {}, "
"in page cache: {}, gpu ref count:{}, gpu tc: {}",
layer, idx, with_key, hash, data, cpu_cc.ref_count.load(), size, cpu_cc.tc.debug(), manager != nullptr,
gpu_cc.ref_count.load(), gpu_cc.tc.debug());
}
CacheBlockEntryCollector::CacheBlockEntryCollector(std::function<void(CacheBlockEntry*)> exit_fn) : exit_fn(exit_fn) {}
CacheBlockEntryCollector::~CacheBlockEntryCollector() {
// SPDLOG_DEBUG("Collector Destruct");
for (auto& e : entries) {
exit_fn(e);
}
}
void CacheBlockEntry::io_with(async_store::IODealer* dealer, IO_Helper<CacheBlockEntry>& io_helper,
async_store::ArrayStore* store, size_t layer, size_t index, IOOption option) {
bool write;
auto& batch_promise = io_helper.batch_promise;
switch (option) {
case IO_Read: {
write = false;
if (io_helper.absorb_tc(this, cpu_cc.tc)) {
// need read
} else {
return;
}
break;
}
case IO_ForceRead: {
// Not change
write = false;
break;
}
case IO_ForceWrite: {
// Not change
write = true;
break;
}
case IO_Write: {
write = true;
break;
}
default: {
assert(0);
}
}
io_helper.new_task();
this->layer = layer;
this->idx = index;
auto req = std::make_shared<async_store::IORequest>();
req->store = store;
req->data = data;
req->index = index;
req->write = write;
req->need_promise = true;
req->promise = &batch_promise;
SPDLOG_TRACE("Submitting {}", async_store::request_to_string(req.get()));
dealer->enqueue(std::move(req));
}
CacheEntryManager::CacheEntryManager(CacheEntryManagerConfig config) : config(config) {}
void CacheEntryManager::evict_for_cpu_cache() {
size_t count = 0;
evict(
[&count](const BlockPtr& block) {
// here we assume each with gpu must resides on cpu
if (block->data != nullptr && block->cpu_cc.can_desert() &&
block->gpu_cc.can_desert() /*For now If A Cache Entry Block is on GPU, it must on cpu. */) {
block->free_on_cpu();
count += 1;
return true;
} else {
return false;
}
},
[&count, this]() {
return false;
// return count == this->config.evict_count;
});
}
void CacheEntryManager::insert(BlockPtr entry) {
assert(entry->with_key);
assert(key_entry_map.count(entry->hash) == 0);
usage_list.push_front(entry);
key_entry_map[entry->hash] = usage_list.begin();
}
CacheEntryManager::BlockPtr CacheEntryManager::access(const Key& key) {
auto it = key_entry_map.at(key);
auto entry = *it;
usage_list.erase(it);
usage_list.push_front(entry);
key_entry_map[key] = usage_list.begin();
return entry;
}
// void CacheEntryManager::remove(const Key& key) {
// auto it = key_entry_map[key];
// usage_list.erase(it);
// key_entry_map.erase(key);
// }
void CacheEntryManager::evict(std::function<bool(const BlockPtr&)> filter, std::function<bool()> stop_condition) {
auto evict_count = 0;
auto inspect_count = 0;
std::lock_guard<std::mutex> lg(lock);
for (auto it = usage_list.rbegin(); it != usage_list.rend();) {
inspect_count += 1;
// SPDLOG_DEBUG("Map Size {}, List Size {}, Evicted {} blocks, Inspected {}, {}", key_entry_map.size(),
// usage_list.size(), evict_count, inspect_count, pool->debug());
// (*it)->debug();
if (stop_condition())
break;
auto entry_ul = (*it)->try_lock();
if (entry_ul.owns_lock() == false) {
++it; // Ensure iterator advances when locking fails
continue;
}
if (filter(*it)) {
// SPDLOG_DEBUG("Evicting {}", fmt::ptr(it->get()));
evict_count++;
if ((*it)->with_key)
key_entry_map.erase((*it)->hash);
it = decltype(it)(usage_list.erase(std::next(it).base())); // Use base() to adjust for reverse iterator
} else {
++it; // Ensure iterator advances when filter fails
}
}
if (evict_count > 0) {
SPDLOG_DEBUG("Map Size {}, List Size {}, Evicted {} blocks, Inspected {}, {}", key_entry_map.size(),
usage_list.size(), evict_count, inspect_count, pool->debug());
}
}
CacheEntryManager::BlockPtr CacheEntryManager::get(bool& is_new, size_t size, std::optional<Key> key) {
std::unique_lock<std::mutex> ul(lock);
if (key.has_value()) {
if (key_entry_map.count(key.value())) {
is_new = false;
return access(key.value());
} else {
auto entry = std::make_shared<CacheBlockEntry>();
entry->with_key = true;
entry->hash = key.value();
entry->size = size;
entry->manager = this;
insert(entry);
is_new = true;
return entry;
}
} else {
auto entry = std::make_shared<CacheBlockEntry>();
entry->with_key = false;
entry->size = size;
entry->manager = this;
is_new = true;
return entry;
}
}
void CacheEntryManager::debug() {
fmt::print("Cache Manager: {} entries\n", key_entry_map.size());
pool->debug();
fmt::print("Layer 0 Entries in Order\n", key_entry_map.size());
for (auto& it : usage_list) {
if (it->layer == 0)
it->debug();
}
}
}; // namespace kvc2
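The insert/access/evict trio above implements a keyed LRU: the map stores list iterators so promotion to the front is O(1) and eviction walks from the least-recently-used tail. A self-contained sketch of the same discipline (illustrative types, not from the repo):

```cpp
// Standalone sketch of the LRU index used by CacheEntryManager: an intrusive usage
// list plus a key -> list-iterator map, evicting from the tail with a predicate.
#include <cassert>
#include <list>
#include <memory>
#include <unordered_map>

struct Block { size_t key; };

class LruIndex {
  std::list<std::shared_ptr<Block>> usage;  // front = most recently used
  std::unordered_map<size_t, std::list<std::shared_ptr<Block>>::iterator> map;

 public:
  void insert(std::shared_ptr<Block> b) {
    usage.push_front(b);
    map[b->key] = usage.begin();
  }
  std::shared_ptr<Block> access(size_t key) {
    auto it = map.at(key);
    auto entry = *it;
    usage.erase(it);            // splice to the front, keep the map pointing at the new node
    usage.push_front(entry);
    map[key] = usage.begin();
    return entry;
  }
  // Walk from the least-recently-used tail, evicting entries the predicate accepts.
  size_t evict_if(bool (*filter)(const Block&)) {
    size_t evicted = 0;
    for (auto it = usage.rbegin(); it != usage.rend();) {
      if (filter(**it)) {
        map.erase((*it)->key);
        it = decltype(it)(usage.erase(std::next(it).base()));  // reverse-iterator erase idiom
        ++evicted;
      } else {
        ++it;
      }
    }
    return evicted;
  }
};

int main() {
  LruIndex lru;
  lru.insert(std::make_shared<Block>(Block{1}));
  lru.insert(std::make_shared<Block>(Block{2}));
  lru.access(1);  // key 2 is now least recently used
  assert(lru.evict_if([](const Block& b) { return b.key == 2; }) == 1);
}
```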

View file

@ -0,0 +1,182 @@
#ifndef __CACHE_ENTRY_HH_
#define __CACHE_ENTRY_HH_
#include "async_store.hh"
#include "cuda_stream_manager.hh"
#include "defs.h"
#include "hasher.hpp"
#include "io_helper.hpp"
#include "page_aligned_memory_pool.h"
#include "utils/periodic_task.hpp"
#include <atomic>
#include <list>
#include <memory>
#include "utils/mutex_extend.hpp"
namespace kvc2 {
using CacheBlockKey = TokensHash;
class CacheEntryManager;
struct DoubleVerticalBlocksHandle;
class GPUPageCache;
struct ConcurrentControlUnit {
std::atomic_size_t ref_count = 0;
std::atomic_bool dirty = false;
TransferControl<std::mutex> tc;
bool can_desert();
void debug();
};
enum IOOption {
IO_ForceRead,
IO_ForceWrite,
IO_Read,
IO_Write,
};
inline std::string to_string(IOOption op) {
switch (op) {
case IO_ForceRead:
return "IO_ForceRead";
case IO_ForceWrite:
return "IO_ForceWrite";
case IO_Read:
return "IO_Read";
case IO_Write:
return "IO_Write";
default:
return "Unknown";
}
}
struct CacheBlockEntry {
friend CacheEntryManager;
using MutexT = non_recursive_mutex;
// using MutexT = std::mutex;
MutexT lock;
// for cache
bool with_key = true;
CacheBlockKey hash = 0;
CacheBlockKey hash_check = 0;
CacheInfo cache_info;
CacheEntryManager* manager = nullptr;
// for memory pool
void* data = nullptr;
size_t size = 0;
ConcurrentControlUnit cpu_cc;
// for disk
size_t layer = -1;
size_t idx = -1;
// for gpu
std::optional<size_t> gpu_block_idx = std::nullopt;
ConcurrentControlUnit gpu_cc;
CacheBlockEntry() =default;
CacheBlockEntry(const CacheBlockEntry& other) = delete;
CacheBlockEntry& operator=(const CacheBlockEntry& other) = delete;
CacheBlockEntry(CacheBlockEntry&& other) = delete;
CacheBlockEntry& operator=(CacheBlockEntry&& other) = delete;
~CacheBlockEntry();
private:
bool alloc_on_cpu();
public:
void free_on_cpu();
bool alloc_on_cpu_no_lock();
bool inc_ref_or_alloc_on_cpu();
void set_key(TokensHash key, std::shared_ptr<CacheBlockEntry> me);
std::unique_lock<MutexT> try_lock();
std::lock_guard<MutexT> lock_guard();
// will not get lock
void io_with(async_store::IODealer* dealer, IO_Helper<CacheBlockEntry>& io_helper, async_store::ArrayStore* store,
size_t layer, size_t index, IOOption option);
void flush_back_async(IO_Helper<CacheBlockEntry>& helper, std::vector<std::atomic_bool*>& dirty_flags);
void debug();
};
struct CacheBlockEntryCollector{
std::vector<CacheBlockEntry*> entries;
std::function<void(CacheBlockEntry*)> exit_fn;
CacheBlockEntryCollector(std::function<void(CacheBlockEntry*)> exit_fn);
~CacheBlockEntryCollector();
CacheBlockEntryCollector(const CacheBlockEntryCollector& other) = delete;
CacheBlockEntryCollector(CacheBlockEntryCollector&& other) = delete;
CacheBlockEntryCollector& operator=(const CacheBlockEntryCollector& other) = delete;
CacheBlockEntryCollector& operator=(CacheBlockEntryCollector&& other) = delete;
};
struct KVC2;
struct CacheEntryManagerConfig {
size_t evict_count = 100;
KVC2* kvc2_top = nullptr;
};
class CacheEntryManager {
public:
using Key = CacheBlockKey;
using BlockPtr = std::shared_ptr<CacheBlockEntry>;
private:
friend CacheBlockEntry;
CacheEntryManagerConfig config;
std::mutex lock;
std::list<BlockPtr> usage_list;
std::unordered_map<Key, std::list<BlockPtr>::iterator> key_entry_map;
void insert(BlockPtr entry);
BlockPtr access(const Key& key);
// void remove(const Key& key);
void evict(std::function<bool(const BlockPtr&)> filter, std::function<bool()> stop_condition);
public:
std::unique_ptr<periodic::PeriodicTask> background_flush_back=nullptr;
std::shared_ptr<PageAlignedMemoryPool> pool;
std::shared_ptr<GPUPageCache> gpu_cache;
CacheEntryManager(CacheEntryManagerConfig config);
// disable all move and copy
CacheEntryManager(const CacheEntryManager& other) = delete;
CacheEntryManager& operator=(const CacheEntryManager& other) = delete;
CacheEntryManager(CacheEntryManager&& other) = delete;
CacheEntryManager& operator=(CacheEntryManager&& other) = delete;
void cpu_background_flush();
void evict_for_cpu_cache();
// just get block pointers, not allocate them, will not return nullptr
BlockPtr get(bool& is_new,size_t size, std::optional<Key> key = std::nullopt);
void debug();
};
} // namespace kvc2
#endif
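Note on the header above: `CacheBlockEntry` carries two independent `ConcurrentControlUnit`s (`cpu_cc` and `gpu_cc`), so an entry can only be evicted from host memory once both its CPU and GPU reference counts drop to zero and it is not dirty, which is exactly the predicate `evict_for_cpu_cache()` applies in the source file.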

View file

@ -0,0 +1,135 @@
#include "cuda_stream_manager.hh"
#include <cuda_runtime.h>
#include <functional>
#include <iostream>
#include <stdexcept>
#include <vector>
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO
// #define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
CudaStreamManager::CudaStreamManager(const std::vector<size_t>& device_ids, int num_streams_per_device) {
for (int device_id : device_ids) {
auto x = std::unique_ptr<DeviceInfo>(new DeviceInfo);
DeviceInfo& device_info = *x;
device_info.device_id = device_id;
device_info.next_stream_index = 0;
device_info.stop_flag = false;
// Set the device
cudaError_t err = cudaSetDevice(device_id);
if (err != cudaSuccess) {
SPDLOG_WARN("cudaSetDevice failed on device {}: {}", device_id, cudaGetErrorString(err));
throw std::runtime_error("cudaSetDevice failed");
}
// Create the CUDA streams
device_info.streams.resize(num_streams_per_device);
for (int i = 0; i < num_streams_per_device; ++i) {
err = cudaStreamCreate(&device_info.streams[i]);
if (err != cudaSuccess) {
SPDLOG_WARN("Failed to create CUDA stream on device {}: {}", device_id, cudaGetErrorString(err));
throw std::runtime_error("Failed to create CUDA stream");
}
}
// Start the device worker thread
device_info.worker_thread = std::thread(&CudaStreamManager::deviceWorker, this, std::ref(device_info));
devices_.push_back(std::move(x));
}
}
CudaStreamManager::~CudaStreamManager() {
// Notify all device threads to stop
for (auto& device_info : devices_) {
device_info->stop_flag.store(true);
auto request = std::shared_ptr<Request>(new Request);
request->should_exit = true;
device_info->request_queue.enqueue(std::move(request));
}
// Wait for all threads to finish
for (auto& device_info : devices_) {
if (device_info->worker_thread.joinable()) {
device_info->worker_thread.join();
}
// Destroy the CUDA streams
cudaSetDevice(device_info->device_id);
for (auto& stream : device_info->streams) {
cudaStreamDestroy(stream);
}
}
}
void CudaStreamManager::submitRequest(std::shared_ptr<Request> request) {
// Find the matching device
for (auto& device_info : devices_) {
if (device_info->device_id == request->device_id) {
device_info->request_queue.enqueue(request);
return;
}
}
throw std::runtime_error("Invalid device ID in request");
}
void CudaStreamManager::deviceWorker(DeviceInfo& device_info) {
// Set the device
cudaError_t err = cudaSetDevice(device_info.device_id);
if (err != cudaSuccess) {
SPDLOG_WARN("cudaSetDevice failed in worker thread for device {}: {}", device_info.device_id,
cudaGetErrorString(err));
return;
}
while (device_info.stop_flag.load() == false) {
auto request = device_info.request_queue.dequeue();
if (request->should_exit) {
return;
}
// Process the request
SPDLOG_DEBUG("Getting request on device {}, count {}", device_info.device_id, request->host_mem_addresses.size());
int stream_index = device_info.next_stream_index;
cudaStream_t stream = device_info.streams[stream_index];
device_info.next_stream_index = (device_info.next_stream_index + 1) % device_info.streams.size();
size_t num_transfers = request->host_mem_addresses.size();
for (size_t i = 0; i < num_transfers; ++i) {
void* dst = request->device_mem_addresses[i];
void* src = request->host_mem_addresses[i];
if (request->direction == cudaMemcpyDeviceToHost) {
std::swap(dst, src);
}
cudaError_t err = cudaMemcpyAsync(dst, src, request->sizes[i], request->direction, stream);
if (err != cudaSuccess) {
SPDLOG_WARN("cudaMemcpyAsync failed on device {}: {}", device_info.device_id, cudaGetErrorString(err));
// Errors could be handled as needed; here we simply continue
continue;
}
}
// Register the callback; since the copies are asynchronous, it has to be wrapped for the host function
struct CallbackData {
std::function<void()> callback;
};
CallbackData* cb_data = new CallbackData{request->callback};
err = cudaLaunchHostFunc(
stream,
[](void* data) {
// SPDLOG_DEBUG("Callback function called");
CallbackData* cb_data = static_cast<CallbackData*>(data);
cb_data->callback();
delete cb_data;
},
cb_data);
if (err != cudaSuccess) {
SPDLOG_WARN("cudaLaunchHostFunc failed on device {}: {}", device_info.device_id, cudaGetErrorString(err));
// Handle the error as needed
}
}
}

View file

@ -0,0 +1,54 @@
/*
* @Author: Xie Weiyu ervinxie@qq.com
* @Date: 2024-11-19 09:24:47
* @LastEditors: Xie Weiyu ervinxie@qq.com
* @LastEditTime: 2024-11-20 02:55:49
* @FilePath: /kvc2/src/cuda_stream_manager.hh
 * @Description: auto-generated file header (koroFileHeader; configuration: https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE)
*/
#pragma once
#include <cuda_runtime.h>
#include <atomic>
#include <functional>
#include <memory>
#include <thread>
#include <vector>
#include "utils/mpsc.hpp"
class CudaStreamManager {
public:
// Constructor: takes the list of device IDs to use and the number of streams per device
CudaStreamManager(const std::vector<size_t>& device_ids, int num_streams_per_device);
~CudaStreamManager();
// Request struct
struct Request {
bool should_exit = false;
int device_id;
std::vector<void*> host_mem_addresses;
std::vector<void*> device_mem_addresses;
std::vector<size_t> sizes;
cudaMemcpyKind direction;
std::function<void()> callback;
};
void submitRequest(std::shared_ptr<Request> request);
private:
// Per-device information
struct DeviceInfo {
int device_id;
std::thread worker_thread;
std::vector<cudaStream_t> streams;
int next_stream_index;
MPSCQueueConsumerLock<std::shared_ptr<Request>> request_queue;
std::atomic_bool stop_flag;
};
// Mapping from device ID to DeviceInfo
std::vector<std::unique_ptr<DeviceInfo>> devices_;
// Private methods
void deviceWorker(DeviceInfo& device_info);
};
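A minimal usage sketch for the manager declared above (not part of this diff): it assumes a CUDA-capable device and the repo headers; the buffer size and device id are illustrative.

```cpp
// Sketch only: submit one host-to-device copy batch and wait for the callback,
// which the manager invokes via cudaLaunchHostFunc once the async copies finish.
#include <cuda_runtime.h>
#include <future>
#include <memory>
#include "cuda_stream_manager.hh"

int main() {
  CudaStreamManager mgr({0}, /*num_streams_per_device=*/4);  // one GPU, 4 streams

  const size_t bytes = 1 << 20;
  void *host = nullptr, *device = nullptr;
  cudaMallocHost(&host, bytes);                              // pinned host memory
  cudaMalloc(&device, bytes);

  std::promise<void> done;
  auto req = std::make_shared<CudaStreamManager::Request>();
  req->device_id = 0;
  req->host_mem_addresses = {host};
  req->device_mem_addresses = {device};
  req->sizes = {bytes};
  req->direction = cudaMemcpyHostToDevice;
  req->callback = [&done] { done.set_value(); };             // runs after the copy batch

  mgr.submitRequest(req);
  done.get_future().wait();                                  // block until the transfer completes

  cudaFree(device);
  cudaFreeHost(host);
}
```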

View file

@ -0,0 +1,35 @@
#ifndef __DEFS_H_
#define __DEFS_H_
#include <cstdint>
#include <optional>
#include <vector>
#include "model_config.h"
namespace kvc2 {
using kvc2_ptr = void*;
// using data_block_ptr = std::intptr_t;
using data_block_ptr = void*;
using layer_data = std::vector<data_block_ptr>;
using kvc2_handle = void*;
using Token = uint32_t;
using Tokens = std::vector<Token>;
using TokenPtr = std::intptr_t;
using TokenLength = size_t;
using BlockLength = size_t;
struct CacheInfo {
ModelName model_name;
bool is_key_cache;
QuantType quant_type;
size_t hidden_layer_count();
std::filesystem::path path(std::optional<size_t> which_layer = std::nullopt);
bool operator==(const CacheInfo& other) const;
size_t element_size(size_t block_length);
size_t hash_value() const;
};
}; // namespace kvc2
#endif

View file

@ -0,0 +1,282 @@
#include "gpu_cache.hh"
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
#include "cache_entry.hh"
#include "utils/arithmetic.hpp"
namespace kvc2 {
GPUPageCache::GPUPageCache(GPUPageCacheConfig& config) : config(config) {
if (torch::cuda::is_available()) {
size_t gpu_count = torch::cuda::device_count();
SPDLOG_INFO("Number of available GPUs: {}, want {}", gpu_count, config.gpu_devices_id.size());
if (gpu_count < config.gpu_devices_id.size()) {
SPDLOG_ERROR("Not enough GPUs available.");
exit(0);
}
for (auto x : config.gpu_devices_id) {
gpu_devices.push_back(torch::Device(torch::kCUDA, x));
}
} else {
SPDLOG_ERROR("CUDA is not available on this system.");
exit(0);
}
SPDLOG_WARN("Creating GPU Cache");
shape.push_back(config.layer_count);
shape.push_back(config.total_kvcache_pages);
shape.push_back(config.num_token_per_page);
if (config.full_kv_cache_on_each_gpu) {
if (config.gpu_devices_id.size() > 1) {
SPDLOG_WARN("Replicated KVCache on multiple gpu");
}
shape.push_back(config.num_k_heads);
} else {
shape.push_back(config.num_k_heads / config.gpu_devices_id.size());
}
shape.push_back(config.k_head_dim);
tensor_size = torch::elementSize(config.tensor_type);
for (auto& s : shape) {
tensor_size *= s;
}
SPDLOG_INFO("Creating KV Page Cache, Shape ({},{},{},{},{}), Size {} MiB", shape[0], shape[1], shape[2], shape[3],
shape[4], tensor_size / (1 << 20));
if (config.k_cache_on) {
for (size_t i = 0; i < config.gpu_devices_id.size(); i++) {
auto k = torch::zeros(shape, torch::TensorOptions().dtype(config.tensor_type));
k = k.to(gpu_devices[i]);
k_cache.push_back(k);
SPDLOG_INFO("K Page Cache of GPU {} is created", config.gpu_devices_id[i]);
}
occupations.resize(config.layer_count);
} else {
SPDLOG_WARN("Disalbe K Cache");
assert(config.gpu_only);
}
if (config.v_cache_on) {
for (size_t i = 0; i < config.gpu_devices_id.size(); i++) {
auto v = torch::zeros(shape, torch::TensorOptions().dtype(config.tensor_type));
v = v.to(gpu_devices[i]);
v_cache.push_back(v);
SPDLOG_INFO("V Page Cache of GPU {} is created", config.gpu_devices_id[i]);
}
v_occupations.resize(config.layer_count);
} else {
SPDLOG_WARN("Disalbe V Cache");
// assert(config.gpu_only); // should not assert
}
if (config.gpu_only) {
gpu_only_occupations.resize(config.total_kvcache_pages, false);
}
num_free_pages = config.total_kvcache_pages;
for (size_t i = 0; i < config.layer_count; i++) {
if (config.k_cache_on)
occupations[i].resize(config.total_kvcache_pages, nullptr);
if (config.v_cache_on)
v_occupations[i].resize(config.total_kvcache_pages, nullptr);
}
tp_size.resize(config.gpu_devices_id.size(), shape[2] * shape[3] * shape[4] * c10::elementSize(config.tensor_type));
tp_offset.resize(config.gpu_devices_id.size(), 0);
for (size_t i = 1; i < tp_offset.size(); i++) {
tp_offset[i] = tp_offset[i - 1] + tp_size[i - 1];
}
stream_manager =
std::unique_ptr<CudaStreamManager>(new CudaStreamManager(config.gpu_devices_id, config.num_streams_per_device));
}
bool GPUPageCache::alloc_col(std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& k_entries,
std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& v_entries, size_t at) {
std::lock_guard<std::mutex> lg(lock);
auto idx = next_empty_col();
if (idx.has_value()) {
// must have entry lock
auto& k0_entry = k_entries[0][at];
k0_entry->gpu_block_idx = idx;
for (size_t l = 0; l < config.layer_count; l++) {
if (config.k_cache_on) {
assert(k_entries[l][at]->data != nullptr);
occupations[l][idx.value()] = k_entries[l][at];
}
if (config.v_cache_on) {
assert(v_entries[l][at]->data != nullptr);
v_occupations[l][idx.value()] = v_entries[l][at];
}
}
return true;
} else {
return false;
}
}
std::vector<size_t> GPUPageCache::gpu_only_alloc_col(size_t count) {
assert(config.gpu_only);
std::lock_guard<std::mutex> lg(lock);
std::vector<size_t> re;
for (size_t i = 0; i < config.total_kvcache_pages; i++) {
if (gpu_only_occupations[i] == false) {
re.push_back(i);
if (re.size() == count) {
break;
}
}
}
if (re.size() == count) {
for (auto at : re) {
gpu_only_occupations[at] = true;
}
} else {
SPDLOG_WARN("GPU ONLY: Cannot allocate {} cols", count);
re.clear();
}
return re;
}
void GPUPageCache::gpu_only_free_cols(std::vector<size_t> cols) {
assert(config.gpu_only);
std::lock_guard<std::mutex> lg(lock);
for (auto at : cols) {
assert(gpu_only_occupations[at]);
gpu_only_occupations[at] = false;
}
}
std::optional<size_t> GPUPageCache::next_empty_col() {
if (num_free_pages == 0) {
evict_cols();
if (num_free_pages == 0) {
return std::nullopt;
}
}
while (occupations[0][_col_idx] != nullptr) {
_col_idx = (_col_idx + 1) % config.total_kvcache_pages;
}
num_free_pages -= 1;
return _col_idx;
}
void GPUPageCache::evict_cols() {
auto evicted_count = 0;
for (size_t i = 0; i < config.total_kvcache_pages; i++) {
auto& h = occupations[0][i];
if (h == nullptr) {
continue;
}
auto lg = h->lock_guard();
if (h->gpu_cc.can_desert()) {
h->gpu_cc.tc.reset();
h = nullptr;
num_free_pages += 1;
evicted_count += 1;
}
}
if (evicted_count > 0)
SPDLOG_INFO("GPU: Evicted {} GPU pages", evicted_count);
}
std::vector<std::unique_lock<CacheBlockEntry::MutexT>> GPUPageCache::try_lock_col(size_t at) {
std::vector<std::unique_lock<CacheBlockEntry::MutexT>> re;
if (config.k_cache_on) {
for (size_t l = 0; l < config.layer_count; l++) {
if (occupations[l][at] == nullptr) {
return {};
}
auto ul = occupations[l][at]->try_lock();
if (ul.owns_lock()) {
re.push_back(std::move(ul));
} else {
return {};
}
}
}
if (config.v_cache_on) {
for (size_t l = 0; l < config.layer_count; l++) {
if (v_occupations[l][at] == nullptr) {
return {};
}
auto ul = v_occupations[l][at]->try_lock();
if (ul.owns_lock()) {
re.push_back(std::move(ul));
} else {
return {};
}
}
}
return re;
}
std::vector<std::shared_ptr<CudaStreamManager::Request>> GPUPageCache::basic_request(cudaMemcpyKind direction,
std::function<void()> callback) {
std::vector<std::shared_ptr<CudaStreamManager::Request>> re;
re.resize(config.gpu_devices_id.size(), nullptr);
for (size_t i = 0; i < re.size(); i++) {
re[i] = std::shared_ptr<CudaStreamManager::Request>(new CudaStreamManager::Request);
re[i]->direction = direction;
re[i]->device_id = config.gpu_devices_id[i];
re[i]->callback = callback;
}
return re;
}
void GPUPageCache::submit_requests(std::vector<std::shared_ptr<CudaStreamManager::Request>> reqs) {
for (auto& r : reqs) {
stream_manager->submitRequest(r);
}
}
void GPUPageCache::append_col_to_request(std::vector<std::shared_ptr<CudaStreamManager::Request>>& reqs,
std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& k_handles,
std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& v_handles,
size_t at) {
if (config.k_cache_on == false && config.v_cache_on == false) {
return;
}
auto gpu_block_idx = k_handles[0][at]->gpu_block_idx.value();
for (size_t layer = 0; layer < config.layer_count; layer++) {
for (size_t which_gpu = 0; which_gpu < config.gpu_devices_id.size(); which_gpu++) {
if (config.k_cache_on) {
assert(k_handles[layer][at]->data != nullptr);
reqs[which_gpu]->sizes.push_back(tp_size[which_gpu]);
reqs[which_gpu]->host_mem_addresses.push_back(
offset_by_bytes(k_handles[layer][at]->data, tp_offset[which_gpu]));
reqs[which_gpu]->device_mem_addresses.push_back(k_cache[which_gpu][layer][gpu_block_idx].data_ptr());
}
if (config.v_cache_on) {
assert(v_handles[layer][at]->data != nullptr);
reqs[which_gpu]->sizes.push_back(tp_size[which_gpu]);
reqs[which_gpu]->host_mem_addresses.push_back(
offset_by_bytes(v_handles[layer][at]->data, tp_offset[which_gpu]));
reqs[which_gpu]->device_mem_addresses.push_back(v_cache[which_gpu][layer][gpu_block_idx].data_ptr());
}
}
}
// SPDLOG_DEBUG("GPU: Appended Vertical Handle to Request, count {}", reqs[0]->sizes.size());
}
void GPUPageCache::debug() {
size_t count = 0;
for (size_t i = 0; i < config.total_kvcache_pages; i++) {
if (occupations[0][i] == nullptr) {
count += 1;
} else {
// occupations[0][i]->gpu_cc.debug();
}
}
SPDLOG_DEBUG("Free Page: {}/{}", count, config.total_kvcache_pages);
}
} // namespace kvc2
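For a sense of scale, the constructor above sizes each per-GPU cache tensor as element size × layer_count × total_kvcache_pages × num_token_per_page × (K heads per GPU) × k_head_dim. A back-of-the-envelope sketch with purely illustrative numbers (none taken from this diff):

```cpp
// Illustrative only: estimate the per-GPU K-cache allocation made by GPUPageCache.
// All values below are placeholders, not repo defaults.
#include <cstdio>

int main() {
  const double element_size = 2;           // BF16
  const double layer_count = 60;
  const double total_kvcache_pages = 256;
  const double num_token_per_page = 256;
  const double k_heads_per_gpu = 1;        // placeholder
  const double k_head_dim = 576;           // placeholder
  const double bytes = element_size * layer_count * total_kvcache_pages *
                       num_token_per_page * k_heads_per_gpu * k_head_dim;
  std::printf("~%.2f GiB per GPU for the K cache\n", bytes / (1ull << 30));
  return 0;  // ~4.22 GiB with these placeholder values
}
```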

View file

@ -0,0 +1,74 @@
#ifndef __GPU_CACHE_HH_
#define __GPU_CACHE_HH_
#include <torch/torch.h>
#include "cache_entry.hh"
#include "cuda_stream_manager.hh"
#include "defs.h"
#include "kvc2.h"
#include "metrics.h"
#include "utils/periodic_task.hpp"
namespace kvc2 {
class GPUPageCache {
std::vector<torch::Device> gpu_devices;
std::vector<int64_t> shape;
size_t tensor_size;
std::vector<size_t> tp_offset;
std::vector<size_t> tp_size;
// met
std::shared_ptr<Metrics> met;
// states
std::mutex lock;
size_t num_free_pages;
std::vector<bool> gpu_only_occupations;
std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>> occupations,v_occupations;
size_t _col_idx = 0;
// cuda stream manager
std::optional<size_t> next_empty_col();
public:
GPUPageCacheConfig config;
std::unique_ptr<CudaStreamManager> stream_manager;
std::vector<torch::Tensor> k_cache;
std::vector<torch::Tensor> v_cache;
std::unique_ptr<periodic::PeriodicTask> background_flush_back =nullptr;
GPUPageCache(GPUPageCacheConfig& config);
std::vector<size_t> gpu_only_alloc_col(size_t count);
void gpu_only_free_cols(std::vector<size_t> cols);
void gpu_background_flush();
bool alloc_col(std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& k_entries,
std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& v_entries, size_t at);
void evict_cols();
void flush_col(size_t at);
std::vector<std::unique_lock<CacheBlockEntry::MutexT>> try_lock_col(size_t at);
void free_col(size_t at);
std::vector<std::shared_ptr<CudaStreamManager::Request>> basic_request(cudaMemcpyKind direction,
std::function<void()> callback);
void submit_requests(std::vector<std::shared_ptr<CudaStreamManager::Request>> reqs);
void append_col_to_request(std::vector<std::shared_ptr<CudaStreamManager::Request>>& reqs,
std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& k_handles,
std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& v_handles, size_t at);
void debug();
};
} // namespace kvc2
#endif

View file

@ -0,0 +1,40 @@
#ifndef __HASHER_HPP_
#define __HASHER_HPP_
#include "defs.h"
#include "xxhash.h"
namespace kvc2 {
const uint64_t hash_seed = 4123512;
const uint64_t check_hash_seed = 1025753;
using TokensHash = XXH64_hash_t;
struct TokensHasher {
XXH64_state_t* state;
TokensHasher() {
state = XXH64_createState();
reset();
}
~TokensHasher() { XXH64_freeState(state); }
TokensHasher(TokensHasher& other) = delete;
TokensHasher& operator=(TokensHasher& other) = delete;
TokensHasher(TokensHasher&& other) = delete;
TokensHasher& operator=(TokensHasher&& other) = delete;
TokensHash get() { return XXH64_digest(state); }
void reset(size_t seed = hash_seed) { XXH64_reset(state, seed); }
TokensHash update(Token* data, TokenLength length) {
XXH64_update(state, data, length * sizeof(Token));
return get();
}
TokensHash update_raw(void* data, size_t size) {
XXH64_update(state, data, size);
return get();
}
static TokensHash hash(Token* data, TokenLength length) { return XXH64(data, length * sizeof(Token), hash_seed); }
};
} // namespace kvc2
#endif
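A sketch (not from the repo) of how the incremental hasher above can produce one key per page of tokens: the key of page i covers tokens [0, (i+1)·page_size), so two sequences with a shared prefix yield identical leading keys. The page size is illustrative.

```cpp
// Sketch only: per-page prefix hashes via TokensHasher::update(), which keeps
// accumulating state so each returned digest covers all tokens seen so far.
#include <cstdio>
#include <vector>
#include "hasher.hpp"

int main() {
  using namespace kvc2;
  std::vector<Token> tokens(1024, 42);
  const TokenLength page = 256;            // illustrative page size

  TokensHasher h;
  std::vector<TokensHash> keys;
  for (TokenLength off = 0; off + page <= tokens.size(); off += page) {
    keys.push_back(h.update(tokens.data() + off, page));  // hash of tokens[0 .. off + page)
  }
  for (auto k : keys) std::printf("%016llx\n", (unsigned long long)k);
}
```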

View file

@ -0,0 +1,155 @@
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-12-11 06:35:31
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-12-11 06:50:55
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#pragma once
#include <atomic>
#include <future>
#include <iostream>
#include <mutex>
#include <optional>
#include <string>
#include <vector>
struct BatchPromise {
std::promise<void> promise;
std::shared_future<void> fut;
std::atomic_size_t count;
inline BatchPromise(size_t count) : count(count) { fut = promise.get_future().share(); }
inline void inc(size_t count = 1) { this->count.fetch_add(count, std::memory_order_seq_cst); }
inline void set() {
if (count.fetch_sub(1, std::memory_order_seq_cst) == 1) {
promise.set_value();
}
}
inline std::shared_future<void> get_shared_fut() { return fut; }
};
template <typename Lock>
struct TransferControl {
Lock lock;
std::optional<std::shared_future<void>> transfer_ok = std::nullopt;
bool has_data = false;
TransferControl() {}
/*
true, std::nullopt : Already has data
false, shared_future : Transfer already started, should wait for the future
false, std::nullopt : should transfer by you
true, shared_future: Should not appear
*/
std::pair<bool, std::optional<std::shared_future<void>>> has_data_or_transfer(std::shared_future<void> shared_fut) {
std::lock_guard<Lock> lg(lock);
if (has_data) {
return {true, std::nullopt};
} else {
if (transfer_ok.has_value()) {
return {false, transfer_ok};
} else {
transfer_ok = shared_fut;
return {false, std::nullopt};
}
}
}
void set_has_data() {
std::lock_guard<Lock> lg(lock);
has_data = true;
transfer_ok = std::nullopt;
}
bool get_has_data() {
std::lock_guard<Lock> lg(lock);
if (has_data) {
return true;
} else {
return false;
}
}
void reset() {
std::lock_guard<Lock> lg(lock);
transfer_ok = std::nullopt;
has_data = false;
}
std::string debug() {
std::lock_guard<Lock> lg(lock);
return std::string("") + (has_data ? "has data" : "no data") + " " +
(transfer_ok.has_value() ? "transfer " : "no transfer");
}
};
struct ConcurrentController {
std::atomic_bool dirty = false;
std::atomic_size_t ref_count = 0;
TransferControl<std::mutex> tc;
};
template <typename Unit>
struct IO_Helper {
BatchPromise batch_promise;
std::function<void(Unit*)> call_back_on_unit = nullptr;
std::function<void()> call_back = nullptr;
std::vector<std::shared_future<void>> futs;
std::vector<Unit*> units_by_myself;
IO_Helper(std::function<void(Unit*)> call_back_on_unit, std::function<void()> call_back = nullptr)
: batch_promise(1), call_back_on_unit(call_back_on_unit), call_back(call_back) {}
IO_Helper(const IO_Helper& other) = delete;
IO_Helper& operator=(const IO_Helper& other) = delete;
IO_Helper(IO_Helper&& other) = delete;
IO_Helper& operator=(IO_Helper&& other) = delete;
~IO_Helper() {
// std::cout << "Destroy IO helper" << std::endl;
}
size_t total_task_count = 0;
void new_task(size_t count = 1) {
total_task_count += 1;
batch_promise.inc(count);
}
void finish_add_taks() { batch_promise.set(); }
bool absorb_tc(Unit* unit, TransferControl<std::mutex>& tc) {
auto [ok, fut] = tc.has_data_or_transfer(batch_promise.get_shared_fut());
if (ok) {
return false;
} else {
if (fut.has_value()) {
futs.push_back(fut.value());
// printf("Transfer started\n");
return false;
} else {
units_by_myself.push_back(unit);
// printf("Not Transfer\n");
return true;
}
}
}
void wait() {
for (auto& fut : futs) {
fut.wait();
}
batch_promise.get_shared_fut().wait();
for (auto& b : units_by_myself) {
call_back_on_unit(b);
}
if (call_back)
call_back();
}
};
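// Usage sketch (illustrative only; the transfer body is hypothetical): drive a
// batch of transfers over units guarded by a ConcurrentController. absorb_tc()
// returns true only for units this helper must transfer itself; units already
// being transferred elsewhere are waited on through their shared future.
inline void example_batch_io(std::vector<ConcurrentController*>& ctrls,
                             std::function<void(ConcurrentController*)> do_transfer) {
  IO_Helper<ConcurrentController> helper([](ConcurrentController* c) { c->tc.set_has_data(); });
  for (auto* c : ctrls) {
    if (helper.absorb_tc(c, c->tc)) {  // true: this helper owns the transfer
      helper.new_task();
      do_transfer(c);                  // hypothetical synchronous transfer
      helper.batch_promise.set();      // mark this task as done
    }
  }
  helper.finish_add_taks();  // release the initial count (name as declared above)
  helper.wait();             // also waits for transfers owned by others, then runs callbacks
}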

View file

@ -0,0 +1,138 @@
#pragma once
#include <torch/torch.h>
#include <cstdint>
#include <optional>
#include <vector>
#include "defs.h"
#include "model_config.h"
namespace kvc2 {
struct GPUPageCacheConfig {
bool gpu_only;
std::vector<size_t> gpu_devices_id;
size_t layer_count;
size_t total_kvcache_pages;
size_t num_token_per_page;
size_t num_k_heads;
size_t k_head_dim;
bool full_kv_cache_on_each_gpu = false;
bool k_cache_on = true;
bool v_cache_on = true;
torch::ScalarType tensor_type;
// for cuda stream manager
size_t num_streams_per_device = 4;
};
struct KVC2Config {
bool k_cache_on = true;
bool v_cache_on = true;
bool gpu_only = false;
bool load_from_disk = true;
bool save_to_disk = true;
std::string path;
std::string config_path;
TokenLength num_token_per_page = 256;
size_t memory_pool_size = 10e9;
size_t evict_count = 20;
std::optional<GPUPageCacheConfig> gpu_cache_config = std::nullopt;
size_t metrics_port;
double recompute_ratio = 0.2;
};
class DoubleCacheHandleInterface;
class KVC2Interface {
public:
virtual ~KVC2Interface() = default;
virtual void load() = 0;
virtual void save() = 0;
/*
Raw Insert
Insert kvcache from user buffers to disk.
model_name, quant_type: identify the cache
id: pointer to the start of the token array
length: length of the token array, in tokens
k_cache, v_cache: per-layer kvcache data to insert
This first matches the token array against the existing kvcache, then inserts the unmatched part to disk.
*/
virtual void raw_insert(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
const std::vector<layer_data>& k_cache, const std::vector<layer_data>& v_cache) = 0;
/*
Raw Read
Read kvcache from disk into user-specified buffers.
model_name, quant_type: identify the cache
id: pointer to the start of the token array
length: length of the token array, in tokens
k_cache, v_cache: per-layer destination buffers
Return: matched prefix length, in tokens
This bypasses the memory pool and reads directly from disk.
*/
virtual TokenLength raw_read(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
const std::vector<layer_data>& k_cache, const std::vector<layer_data>& v_cache) = 0;
/*
Lookup
Look up kvcache and load it from disk into the memory pool if needed.
model_name, quant_type: identify the cache
id: pointer to the start of the token array
length: length of the token array, in tokens
Return: a handle that pins the kvcache until it is released.
If nothing matches, matched_length() returns 0.
If the memory pool is full, returns nullptr.
*/
virtual std::shared_ptr<DoubleCacheHandleInterface> lookup(ModelName model_name, QuantType quant_type, Token* id,
TokenLength length, TokenLength estimated_length) = 0;
/*
Lookup and allocate onto the GPU.
info.is_k_cache does not matter here.
*/
virtual std::shared_ptr<DoubleCacheHandleInterface> lookup_to_gpu(ModelName model_name, QuantType quant_type,
Token* id, TokenLength length,
TokenLength estimated_length) = 0;
virtual void lookup_to_gpu_async(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
TokenLength estimated_length,
std::function<void(std::shared_ptr<DoubleCacheHandleInterface>)> call_back) = 0;
virtual std::pair<std::vector<torch::Tensor>, std::vector<torch::Tensor>> get_kvcache() = 0;
virtual void debug() = 0;
};
std::shared_ptr<KVC2Interface> create_kvc2(KVC2Config config);
enum MatchStatus {
Exact,
Partial,
NotMatchExact,
NotMatchPartial,
};
class DoubleCacheHandleInterface {
public:
virtual ~DoubleCacheHandleInterface() = default;
virtual TokenLength matched_length() = 0;
virtual std::vector<MatchStatus> matched_status() = 0;
virtual std::vector<layer_data> handle_data(bool is_key_cache) = 0;
virtual bool to_gpu() = 0;
virtual void to_gpu_async(std::function<void(bool)> call_back) = 0;
virtual std::vector<size_t> get_gpu_block_idx() = 0;
virtual std::vector<size_t> get_gpu_attached_block_idx() = 0;
virtual void append_tokens(Token* tokens, TokenLength length) = 0; // update generated tokens
virtual void debug() = 0;
};
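// Usage sketch (illustrative; the model name, quant type and length estimate are
// hypothetical): look up the longest cached prefix of a prompt, then publish the
// newly generated tokens back through the handle.
inline void example_lookup(KVC2Interface& kvc2, std::vector<Token>& prompt) {
  auto handle = kvc2.lookup("some-model", "FP16", prompt.data(), prompt.size(),
                            prompt.size() + 128);  // reserve room for generation
  if (handle == nullptr) {
    return;  // memory pool is full
  }
  TokenLength matched = handle->matched_length();  // 0 if nothing matched
  (void)matched;
  // Compute KV for the unmatched suffix, generate new tokens, then call
  // handle->append_tokens(...) to record them before releasing the handle.
}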
}; // namespace kvc2

View file

@ -0,0 +1,64 @@
import torch
import ctypes
def aligned_tensor(size, alignment=4096):
num_bytes = size
mem = ctypes.c_void_p()
error_code = ctypes.CDLL(None).posix_memalign(
ctypes.byref(mem), ctypes.c_size_t(alignment), ctypes.c_size_t(num_bytes)
)
if error_code != 0:
raise MemoryError(f"posix_memalign failed with error code {error_code}")
array_type = (ctypes.c_int8 * size)
raw_array = array_type.from_address(mem.value)
tensor = torch.frombuffer(raw_array, dtype=torch.int8)
if tensor.data_ptr() % alignment != 0:
raise ValueError(f"Tensor data_ptr {tensor.data_ptr()} is not aligned to {alignment} bytes")
return tensor, mem
def alloc_aligned_cache(layer_count,block_count,element_size):
cache = []
cache_mem = []
for i in range(layer_count):
layer_data = []
layer_mem = []
for j in range(block_count):
tensor, mem_ptr = aligned_tensor(element_size, alignment=4096)
layer_data.append(tensor)
layer_mem.append(mem_ptr)
cache.append(layer_data)
cache_mem.append(layer_mem)
return cache,cache_mem
def dealloc_aligned_cache(cache_mem):
for layer_mem in cache_mem:
for mem_ptr in layer_mem:
ctypes.CDLL(None).free(mem_ptr)
def get_tensor_ptr(tensors):
tensor_ptr = []
for layer in tensors:
layer_ptr = []
for data in layer:
layer_ptr.append(data.data_ptr())
tensor_ptr.append(layer_ptr)
return tensor_ptr
def get_tensor_from_data_ptr(matched_data,element_size):
re = []
for layer in matched_data:
re_layer = []
for data_ptr in layer:
array_type = (ctypes.c_int8 * element_size)
raw_array = array_type.from_address(data_ptr)
tensor = torch.frombuffer(raw_array, dtype=torch.int8)
re_layer.append(tensor)
re.append(re_layer)
return re
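# Usage sketch (illustrative sizes): allocate a small page-aligned cache, hand the
# raw pointers to native code via get_tensor_ptr, then release the buffers.
def _example_aligned_cache():
    layer_count, block_count, element_size = 2, 4, 4096  # hypothetical sizes
    cache, cache_mem = alloc_aligned_cache(layer_count, block_count, element_size)
    ptrs = get_tensor_ptr(cache)            # nested lists of data_ptr() integers
    assert ptrs[0][0] % 4096 == 0           # every buffer is page aligned
    dealloc_aligned_cache(cache_mem)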
if __name__ == "__main__":
pass

View file

@ -0,0 +1,141 @@
#include "metrics.h"
namespace kvc2 {
Metrics::Metrics(const MetricsConfig& config)
: registry_(std::make_shared<prometheus::Registry>()), exposer_(config.endpoint) {
// Register the prefix_nodes Counter
auto& prefix_nodes_family = prometheus::BuildCounter()
.Name(std::string(METRIC_PREFIX) + "_prefix_nodes")
.Help("Number of prefix nodes")
.Register(*registry_);
prefix_nodes = &prefix_nodes_family.Add({});
// Register the prefix_block_count Counter
auto& prefix_block_count_family = prometheus::BuildCounter()
.Name(std::string(METRIC_PREFIX) + "_prefix_block_count")
.Help("Number of prefix blocks")
.Register(*registry_);
prefix_block_count = &prefix_block_count_family.Add({});
// Shared histogram buckets, capped at 10000 ms (10 s)
std::vector<double> common_buckets = {1.0, 5.0, 10.0, 50.0, 100.0, 500.0, 1000.0, 5000.0, 10000.0};
// Register the raw_insert_time_ms Histogram
auto& raw_insert_time_ms_family = prometheus::BuildHistogram()
.Name(std::string(METRIC_PREFIX) + "_raw_insert_time_ms")
.Help("function raw insert's time in milliseconds")
.Register(*registry_);
raw_insert_time_ms = &raw_insert_time_ms_family.Add({}, common_buckets);
// Register the lookup_time_ms Histogram
auto& lookup_time_ms_family = prometheus::BuildHistogram()
.Name(std::string(METRIC_PREFIX) + "_lookup_time_ms")
.Help("function lookup's time in milliseconds")
.Register(*registry_);
lookup_time_ms = &lookup_time_ms_family.Add({}, common_buckets);
// Register the lookup_prefixmatch_length Histogram
auto& lookup_prefixmatch_length_family = prometheus::BuildHistogram()
.Name(std::string(METRIC_PREFIX) + "_lookup_prefixmatch_length")
.Help("function lookup's prefix match length")
.Register(*registry_);
lookup_prefixmatch_length = &lookup_prefixmatch_length_family.Add({}, common_buckets);
// Register the matched_length_percentage Histogram
auto& matched_length_percentage_family = prometheus::BuildHistogram()
.Name(std::string(METRIC_PREFIX) + "_matched_length_percentage")
.Help("function matched length percentage")
.Register(*registry_);
matched_length_percentage = &matched_length_percentage_family.Add({}, common_buckets);
// Register the disk_usage Gauge
auto& disk_usage_family =
prometheus::BuildGauge().Name(std::string(METRIC_PREFIX) + "_disk_usage").Help("disk usage").Register(*registry_);
disk_usage = &disk_usage_family.Add({});
// Register the memory_pool_size Gauge
memory_pool_size_family_ = &prometheus::BuildGauge()
.Name(std::string(METRIC_PREFIX) + "_memory_pool_size")
.Help("memory pool size")
.Register(*registry_);
// Register the memory_pool_node_count Gauge
memory_pool_node_count_family_ = &prometheus::BuildGauge()
.Name(std::string(METRIC_PREFIX) + "_memory_pool_node_count")
.Help("memory pool node count")
.Register(*registry_);
// Register the lru_entry_count Gauge
lru_entry_count_family_ = &prometheus::BuildGauge()
.Name(std::string(METRIC_PREFIX) + "_lru_entry_count")
.Help("lru entry count")
.Register(*registry_);
// Register the gpu_page_count Gauge
gpu_page_count_family_ = &prometheus::BuildGauge()
.Name(std::string(METRIC_PREFIX) + "_gpu_page_count")
.Help("gpu page count")
.Register(*registry_);
// Register the append_tokens_time_ms Histogram
auto& append_tokens_time_ms_family = prometheus::BuildHistogram()
.Name(std::string(METRIC_PREFIX) + "_append_tokens_time_ms")
.Help("append tokens time in milliseconds")
.Register(*registry_);
append_tokens_time_ms = &append_tokens_time_ms_family.Add({}, common_buckets);
// Register the gpu_flush_back_time_ms Histogram
auto& gpu_flush_back_time_ms_family = prometheus::BuildHistogram()
.Name(std::string(METRIC_PREFIX) + "_gpu_flush_back_time_ms")
.Help("gpu flush back time in milliseconds")
.Register(*registry_);
gpu_flush_back_time_ms = &gpu_flush_back_time_ms_family.Add({}, common_buckets);
// Register the cpu_flush_back_time_ms Histogram
auto& cpu_flush_back_time_ms_family = prometheus::BuildHistogram()
.Name(std::string(METRIC_PREFIX) + "_cpu_flush_back_time_ms")
.Help("cpu flush back time in milliseconds")
.Register(*registry_);
cpu_flush_back_time_ms = &cpu_flush_back_time_ms_family.Add({}, common_buckets);
exposer_.RegisterCollectable(registry_);
}
// Destructor
Metrics::~Metrics() {
// Stop exposing metrics
// exposer_.Stop();
}
// Get the memory_pool_size metric
prometheus::Gauge* Metrics::memory_pool_size(const std::string& type) {
return &memory_pool_size_family_->Add({{"type", type}});
}
// Get the memory_pool_node_count metric
prometheus::Gauge* Metrics::memory_pool_node_count(const std::string& type) {
return &memory_pool_node_count_family_->Add({{"type", type}});
}
// Get the lru_entry_count metric
prometheus::Gauge* Metrics::lru_entry_count(const std::string& type) {
return &lru_entry_count_family_->Add({{"type", type}});
}
// Get the gpu_page_count metric
prometheus::Gauge* Metrics::gpu_page_count(std::string type) {
return &gpu_page_count_family_->Add({{"type", type}});
}
TimeObserver::TimeObserver(prometheus::Histogram* h) {
histogram_ = h;
timer_.start();
}
TimeObserver::~TimeObserver() {
timer_.stop();
histogram_->Observe(timer_.elapsedNs() / 1e6); // ns -> ms
}
} // namespace kvc2

View file

@ -0,0 +1,77 @@
#pragma once
#include <atomic>
#include <chrono>
#include <memory>
#include <string>
#include <thread>
#include <vector>
#include "prometheus/counter.h"
#include "prometheus/exposer.h"
#include "prometheus/gauge.h"
#include "prometheus/histogram.h"
#include "prometheus/registry.h"
#include "utils/timer.hpp"
namespace kvc2 {
// Metric name prefix macro
#define METRIC_PREFIX "kvc2"
struct MetricsConfig {
std::string endpoint; // listen endpoint, e.g. "0.0.0.0:8080"
};
class Metrics {
public:
// Construct from a MetricsConfig
Metrics(const MetricsConfig& config);
~Metrics();
// Copy and assignment are disabled
Metrics(const Metrics&) = delete;
Metrics& operator=(const Metrics&) = delete;
// Metric pointers
prometheus::Counter* prefix_nodes;
prometheus::Counter* prefix_block_count;
prometheus::Histogram* raw_insert_time_ms;
prometheus::Histogram* lookup_time_ms;
prometheus::Histogram* lookup_prefixmatch_length;
prometheus::Histogram* matched_length_percentage;
prometheus::Gauge* disk_usage;
prometheus::Gauge* memory_pool_size(const std::string& type);
prometheus::Gauge* memory_pool_node_count(const std::string& type);
prometheus::Gauge* lru_entry_count(const std::string& type);
prometheus::Gauge* gpu_page_count(std::string type);
prometheus::Histogram* append_tokens_time_ms;
prometheus::Histogram* gpu_flush_back_time_ms;
prometheus::Histogram* cpu_flush_back_time_ms;
private:
std::shared_ptr<prometheus::Registry> registry_;
prometheus::Exposer exposer_;
prometheus::Family<prometheus::Gauge>* memory_pool_size_family_;
prometheus::Family<prometheus::Gauge>* memory_pool_node_count_family_;
prometheus::Family<prometheus::Gauge>* lru_entry_count_family_;
prometheus::Family<prometheus::Gauge>* gpu_page_count_family_;
};
class TimeObserver {
public:
TimeObserver(prometheus::Histogram* h);
~TimeObserver();
private:
Timer timer_;
prometheus::Histogram* histogram_;
};
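// Usage sketch (illustrative; the endpoint is hypothetical): counters and gauges
// are updated directly, histograms are most easily fed through a scoped TimeObserver.
inline void example_metrics_usage() {
  Metrics metrics(MetricsConfig{"0.0.0.0:8081"});
  metrics.prefix_nodes->Increment();            // one more prefix node
  metrics.memory_pool_size("cpu")->Set(10e9);   // labelled gauge, in bytes
  {
    TimeObserver t(metrics.lookup_time_ms);     // observes elapsed ms when destroyed
    // ... the lookup being measured ...
  }
}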
} // namespace kvc2

View file

@ -0,0 +1,119 @@
#ifndef __MODEL_CONFIG_HPP_
#define __MODEL_CONFIG_HPP_
#include "nlohmann/json.hpp"
#include <iostream>
#include <filesystem>
#include <fstream>
using DimSize = size_t;
using URL = std::string;
using ModelName = std::string;
// We must ensure this can be loaded from config.json
class ModelConfig {
public:
DimSize hidden_size;
DimSize intermediate_size;
size_t max_position_embeddings;
std::string model_type;
size_t num_attention_heads;
size_t num_hidden_layers;
size_t num_key_value_heads;
size_t vocab_size;
NLOHMANN_DEFINE_TYPE_INTRUSIVE(ModelConfig, hidden_size, intermediate_size,
max_position_embeddings, model_type,
num_attention_heads, num_hidden_layers,
num_key_value_heads, vocab_size);
void load_from(std::filesystem::path path) {
std::cout << "Load from " << path << std::endl;
std::ifstream i(path);
nlohmann::json j;
i >> j;
*this = j.get<ModelConfig>();
}
};
using QuantType = std::string;
static const QuantType NoQuantType = "";
class QuantConfig {
public:
QuantType name;
// For GEMV
QuantType type_of_dot_vector = NoQuantType;
inline bool can_be_used_as_matrix() {
return type_of_dot_vector != NoQuantType;
}
bool can_be_used_as_vector;
double bytes_per_element;
bool has_scale;
bool has_min;
size_t block_element_count;
size_t block_element_size;
URL reference = "";
NLOHMANN_DEFINE_TYPE_INTRUSIVE_WITH_DEFAULT(QuantConfig, name,
type_of_dot_vector,
can_be_used_as_vector,
bytes_per_element, has_scale,
has_min, block_element_count,
block_element_size, reference);
};
inline std::map<QuantType, QuantConfig> quant_configs;
inline std::map<ModelName, ModelConfig> model_configs;
inline void load_quant_configs(std::filesystem::path path) {
nlohmann::json j;
if (std::filesystem::exists(path)) {
std::cout << __FUNCTION__ << " from " << path << std::endl;
std::ifstream i(path);
i >> j;
quant_configs = j.get<std::map<QuantType, QuantConfig>>();
std::cout << "Loaded Quant Configs" << std::endl;
for (auto &[k, v] : quant_configs) {
std::cout << " - " << k << std::endl;
}
} else {
std::cout << __FUNCTION__ << " no file at " << path << std::endl;
}
}
inline void dump_quant_configs(std::filesystem::path path) {
std::ofstream o(path);
nlohmann::json j = quant_configs;
o << j.dump(4);
}
inline void load_model_configs(std::filesystem::path path) {
nlohmann::json j;
if (std::filesystem::exists(path)) {
std::cout << __FUNCTION__ << " from " << path << std::endl;
std::ifstream i(path);
i >> j;
model_configs = j.get<std::map<ModelName, ModelConfig>>();
std::cout << "Loaded Model Configs" << std::endl;
for (auto &[k, v] : model_configs) {
std::cout << " - " << k << std::endl;
}
} else {
std::cout << __FUNCTION__ << " no file at " << path << std::endl;
}
}
inline void dump_model_configs(std::filesystem::path path) {
std::ofstream o(path);
nlohmann::json j = model_configs;
o << j.dump(4);
}
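// Usage sketch (illustrative; the paths and model name are hypothetical): the two
// JSON files map names to QuantConfig / ModelConfig entries and populate the
// global maps above.
inline void example_load_configs() {
  load_quant_configs("./config/quant_configs.json");
  load_model_configs("./config/model_configs.json");
  if (model_configs.count("Qwen2.5-7B-Instruct")) {
    const ModelConfig& m = model_configs["Qwen2.5-7B-Instruct"];
    std::cout << "hidden layers: " << m.num_hidden_layers << std::endl;
  }
}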
#endif

View file

@ -0,0 +1,125 @@
#include "page_aligned_memory_pool.h"
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
#include "utils/arithmetic.hpp"
#include "utils/easy_format.hpp"
/// Constructor
PageAlignedMemoryPool::PageAlignedMemoryPool(size_t size_in_bytes) {
total_size = (size_in_bytes / PageSize) * PageSize;
// Aligned allocation using C++17 aligned operator new; switch to another method if the compiler lacks support.
data = ::operator new[](total_size, std::align_val_t(PageSize));
total_pages = total_size / PageSize;
assert(total_pages >= Blocks);
page_per_block = total_pages / Blocks;
for (size_t block_index = 0; block_index < Blocks; block_index++) {
first_page[block_index] = reinterpret_cast<void*>(reinterpret_cast<intptr_t>(data) +
static_cast<intptr_t>(block_index) * page_per_block * PageSize);
count_page[block_index] =
block_index == Blocks - 1 ? (total_pages - page_per_block * (Blocks - 1)) : page_per_block;
SPDLOG_DEBUG("first_page[{}] = {}, count_page[{}] = {}", block_index,
reinterpret_cast<intptr_t>(first_page[block_index]) - reinterpret_cast<intptr_t>(data), block_index,
count_page[block_index]);
bitmap[block_index].resize(count_page[block_index], 0);
}
SPDLOG_INFO("PageAlignedMemoryPool with size {} Mbytes, {} pages", total_size / (1 << 20), page_count());
}
/// Destructor
PageAlignedMemoryPool::~PageAlignedMemoryPool() {
if (data) {
// Note: must match the alignment used at allocation time
::operator delete[](data, std::align_val_t(PageSize));
data = nullptr;
}
}
/// Return the total number of pages
size_t PageAlignedMemoryPool::page_count() {
return total_size / PageSize;
}
/// Return the size in bytes rounded up to whole pages
size_t PageAlignedMemoryPool::page_padded_size(size_t size) {
return div_up(size, PageSize) * PageSize;
}
void* PageAlignedMemoryPool::alloc_in_block(size_t block_index, size_t alloc_size) {
std::lock_guard<std::mutex> guard(lock[block_index]);
size_t free_pages = 0;
for (size_t i = 0; i < count_page[block_index]; i++) {
if (bitmap[block_index][i] == 0) {
free_pages++;
if (free_pages == alloc_size) {
size_t page_index = i + 1 - free_pages;
for (size_t page = page_index; page < page_index + alloc_size; page++) {
bitmap[block_index][page] = 1;
// SPDLOG_DEBUG("alloc page {} in block {}", page, block_index);
}
return reinterpret_cast<void*>(reinterpret_cast<intptr_t>(first_page[block_index]) + page_index * PageSize);
}
} else {
free_pages = 0;
}
}
return nullptr;
}
/// Allocation
void* PageAlignedMemoryPool::alloc(size_t size) {
size_t alloc_size = div_up(size, PageSize);
auto cnt = now_block.fetch_add(1, std::memory_order_relaxed);
for (size_t i = 0; i < Blocks; i++) {
auto result = alloc_in_block((i + cnt) % Blocks, alloc_size);
if (result != nullptr) {
allocated.fetch_add(alloc_size * PageSize, std::memory_order_relaxed);
alloc_count.fetch_add(1, std::memory_order_relaxed);
return result;
}
}
return nullptr;
}
/// Deallocation
void PageAlignedMemoryPool::free(void* p, size_t size) {
auto alloc_size = div_up(size, PageSize);
size_t block_index = (reinterpret_cast<intptr_t>(p) - reinterpret_cast<intptr_t>(data)) / page_per_block / PageSize;
size_t page_index = (reinterpret_cast<intptr_t>(p) - reinterpret_cast<intptr_t>(first_page[block_index])) / PageSize;
std::lock_guard<std::mutex> guard(lock[block_index]);
for (size_t page = page_index; page < page_index + alloc_size; page++)
bitmap[block_index][page] = 0;
allocated.fetch_sub(alloc_size * PageSize, std::memory_order_relaxed);
free_count.fetch_add(1, std::memory_order_relaxed);
}
// TODO: too slow
std::vector<void*> PageAlignedMemoryPool::alloc_multiple(size_t size, size_t count) {
std::vector<void*> result;
for (size_t i = 0; i < count; i++) {
auto p = alloc(size);
if (p == nullptr) {
for (auto ptr : result) {
free(ptr, size);
}
return {};
}
result.push_back(p);
}
return result;
}
void PageAlignedMemoryPool::defragment() {}
/// Debug printout
std::string PageAlignedMemoryPool::debug() {
return fmt::format("PageAlignedMemoryPool: total_size: {}MB, allocated: {}, alloc/free count: {}/{}\n",
readable_number(total_size), readable_number(size_t(allocated)), size_t(alloc_count),
size_t(free_count));
}

View file

@ -0,0 +1,54 @@
#pragma once
#include <assert.h>
#include <algorithm> // std::sort
#include <atomic>
#include <bitset>
#include <cstddef> // size_t
#include <mutex> // std::mutex
#include <vector>
constexpr size_t PageSize = 4096;
/// Declaration of the PageAlignedMemoryPool class
struct PageAlignedMemoryPool {
private:
constexpr static size_t Blocks = 16;
void* data = nullptr;
size_t total_size = 0, total_pages = 0;
std::atomic_size_t now_block = 0;
std::atomic_size_t allocated = 0; // allocated_size
std::atomic_size_t alloc_count = 0;
std::atomic_size_t free_count = 0;
std::mutex lock[Blocks];
size_t page_per_block = 0;
void* first_page[Blocks];
size_t count_page[Blocks];
std::vector<int8_t> bitmap[Blocks];
void* alloc_in_block(size_t block_index, size_t alloc_size);
public:
/// Constructor and destructor
explicit PageAlignedMemoryPool(size_t size_in_bytes);
~PageAlignedMemoryPool();
/// Copy and move are disabled
PageAlignedMemoryPool(PageAlignedMemoryPool&& other) = delete;
PageAlignedMemoryPool& operator=(PageAlignedMemoryPool&& other) = delete;
PageAlignedMemoryPool(const PageAlignedMemoryPool& other) = delete;
PageAlignedMemoryPool& operator=(const PageAlignedMemoryPool& other) = delete;
/// Member functions
size_t page_count();
size_t page_padded_size(size_t size);
void* alloc(size_t size);
std::vector<void*> alloc_multiple(size_t size, size_t count);
void free(void* data, size_t size);
void defragment();
std::string debug();
};
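/// Usage sketch (illustrative sizes): requests are rounded up to whole 4 KiB pages
/// and must be freed with the same size they were allocated with.
inline void example_pool_usage() {
  PageAlignedMemoryPool pool(size_t(64) << 20);  // 64 MiB pool
  void* p = pool.alloc(3 * PageSize + 1);        // occupies 4 pages
  if (p != nullptr) {
    pool.free(p, 3 * PageSize + 1);
  }
}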

File diff suppressed because it is too large

View file

@ -0,0 +1,3 @@
#pragma once
#include "easy_format.hpp"
#include "timer.hpp"

View file

@ -0,0 +1,14 @@
#include <memory>
#include <type_traits>
template <typename T, typename U>
T div_up(T x, U by) {
static_assert(std::is_integral_v<T>);
static_assert(std::is_integral_v<U>);
return (x + by - 1) / by;
}
template <typename T>
T* offset_by_bytes(T* t, size_t n) {
return reinterpret_cast<T*>(reinterpret_cast<size_t>(t) + n);
}

View file

@ -0,0 +1,37 @@
#ifndef __EASY_FORMAT_HPP_
#define __EASY_FORMAT_HPP_
#include <array>
#include <iomanip>
#include <sstream>
#include <string>
#include <vector>
template <typename T>
inline std::string format_vector(const std::vector<T>& v) {
std::ostringstream oss;
if (v.empty())
return "[]";
for (size_t i = 0; i < v.size(); ++i) {
oss << v[i];
if (i < v.size() - 1)
oss << ", "; // 逗号分隔
}
return oss.str();
}
inline std::array<std::string, 7> units = {"", "K", "M", "G", "T", "P", "E"};
inline std::string readable_number(size_t size) {
size_t unit_index = 0;
double readable_size = size;
while (readable_size >= 1000 && unit_index < units.size() - 1) {
readable_size /= 1000;
unit_index++;
}
std::ostringstream ss;
ss << std::fixed << std::setprecision(2) << readable_size;
std::string str = ss.str();
return str + "" + units[unit_index];
}
#endif

View file

@ -0,0 +1,60 @@
#include <atomic>
#include <future>
#include <iostream>
#include <memory>
#include <thread>
#include <vector>
template <typename T>
class MPSCQueue {
struct Node {
std::shared_ptr<T> data;
std::atomic<Node*> next;
Node() : next(nullptr) {}
Node(std::shared_ptr<T> data_) : data(std::move(data_)), next(nullptr) {}
};
std::atomic<Node*> head;
Node* tail;
public:
std::atomic_size_t enqueue_count = 0;
size_t dequeue_count = 0;
MPSCQueue() {
Node* dummy = new Node();
head.store(dummy, std::memory_order_relaxed);
tail = dummy;
}
~MPSCQueue() {
// Clean up the remaining nodes
Node* node = tail;
while (node) {
Node* next = node->next.load(std::memory_order_relaxed);
delete node;
node = next;
}
}
// Called by producers
void enqueue(std::shared_ptr<T> data) {
enqueue_count.fetch_add(1);
Node* node = new Node(std::move(data));
Node* prev_head = head.exchange(node, std::memory_order_acq_rel);
prev_head->next.store(node, std::memory_order_release);
}
// Called by the single consumer
std::shared_ptr<T> dequeue() {
Node* next = tail->next.load(std::memory_order_acquire);
if (next) {
std::shared_ptr<T> res = std::move(next->data);
delete tail;
tail = next;
dequeue_count += 1;
return res;
}
return nullptr;
}
};

View file

@ -0,0 +1,90 @@
#include <atomic>
#include <cassert>
#include <iostream>
#include <optional>
#include <semaphore>
template <typename T>
class MPSCQueue {
struct Node {
T data;
std::atomic<Node*> next;
Node() : next(nullptr) {}
Node(T data_) : data(std::move(data_)), next(nullptr) {}
};
std::atomic<Node*> head;
Node* tail;
public:
std::atomic_size_t enqueue_count = 0;
size_t dequeue_count = 0;
MPSCQueue() {
Node* dummy = new Node();
head.store(dummy, std::memory_order_seq_cst);
tail = dummy;
}
~MPSCQueue() {
Node* node = tail;
while (node) {
Node* next = node->next.load(std::memory_order_seq_cst);
delete node;
node = next;
}
}
// Called by producers
void enqueue(T data) {
enqueue_count.fetch_add(1);
Node* node = new Node(std::move(data));
Node* prev_head = head.exchange(node, std::memory_order_seq_cst);
prev_head->next.store(node, std::memory_order_seq_cst);
}
// Called by the single consumer
std::optional<T> dequeue() {
Node* next = tail->next.load(std::memory_order_seq_cst);
if (next) {
T res = std::move(next->data);
delete tail;
tail = next;
dequeue_count += 1;
return res;
}
return std::nullopt;
}
size_t size() { return enqueue_count.load() - dequeue_count; }
};
template <typename T>
class MPSCQueueConsumerLock {
MPSCQueue<T> queue;
std::counting_semaphore<> sema{0};
public:
void enqueue(T data) {
queue.enqueue(std::move(data));
// std::atomic_thread_fence(std::memory_order_seq_cst);  // Possibly needed if the ordering above is too weak; not fully verified.
sema.release();
}
T dequeue() {
auto re = queue.dequeue();
if (re.has_value()) {
while (sema.try_acquire() == false) {
std::cerr << __FILE__ << ":" << __FUNCTION__ << " sema try_acquire should have succeeded, retrying, please check"
<< std::endl;
// assert(false);
}
return re.value();
}
sema.acquire();
return queue.dequeue().value();
}
size_t size() { return queue.size(); }
};
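// Usage sketch (illustrative, single-threaded here): in the real setup producers
// call enqueue() from many threads while one consumer blocks in dequeue().
inline void example_mpsc_queue() {
  MPSCQueueConsumerLock<int> q;
  q.enqueue(1);
  q.enqueue(2);
  int first = q.dequeue();   // 1 (FIFO order)
  int second = q.dequeue();  // 2
  (void)first;
  (void)second;
}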

View file

@ -0,0 +1,70 @@
#ifndef __MUTEX_EXTEND_HPP_
#define __MUTEX_EXTEND_HPP_
#include <atomic>
#include <chrono>
#include <iostream>
#include <thread>
class non_recursive_mutex {
public:
non_recursive_mutex() = default;
// Non-recursive locking built on try_lock
bool try_lock() {
std::thread::id this_id = std::this_thread::get_id();
// Check whether the current thread already holds the lock
if (owner.load(std::memory_order_acquire) == this_id) {
return false; // already held by this thread, fail
}
// Try to acquire the lock
if (mtx.try_lock()) {
owner.store(this_id, std::memory_order_release); // record the owner
return true;
}
return false;
}
// lock() blocks until the lock is acquired
void lock() {
std::thread::id this_id = std::this_thread::get_id();
while (true) {
// Check whether the current thread already holds the lock
if (owner.load(std::memory_order_acquire) == this_id) {
throw std::runtime_error("Thread is trying to lock a mutex it already holds");
}
// Try to acquire the lock
if (mtx.try_lock()) {
owner.store(this_id, std::memory_order_release); // record the owner
return;
}
// Lock not acquired; yield briefly to avoid busy-waiting
std::this_thread::yield();
}
}
// Unlock
void unlock() {
std::thread::id this_id = std::this_thread::get_id();
// Only the thread holding the lock may unlock it
if (owner.load(std::memory_order_acquire) == this_id) {
owner.store(std::thread::id(), std::memory_order_release); // clear the owner
mtx.unlock();
} else {
throw std::runtime_error("Thread attempting to unlock a mutex it doesn't own");
}
}
private:
std::mutex mtx; // the underlying mutex
std::atomic<std::thread::id> owner; // atomically tracks the current owner thread
};
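// Usage sketch (illustrative): unlike a recursive mutex, locking again from the
// same thread is treated as a bug and throws instead of silently succeeding.
inline void example_non_recursive_mutex() {
  non_recursive_mutex m;
  m.lock();
  try {
    m.lock();  // same thread: throws std::runtime_error
  } catch (const std::runtime_error&) {
    // expected
  }
  m.unlock();
}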
#endif

View file

@ -0,0 +1,102 @@
#ifndef PERIODIC_TASK_HPP
#define PERIODIC_TASK_HPP
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstdio>
#include <functional>
#include <future>
#include <iostream>
#include <mutex>
#include <stop_token>
#include <thread>
#include <utility>
#include <vector>
namespace periodic {
class PeriodicTask {
public:
explicit PeriodicTask(std::function<void()> func,
std::chrono::milliseconds interval_ms = std::chrono::milliseconds(100))
: func_(std::move(func)), interval_(interval_ms), worker_([this](std::stop_token stoken) { this->run(stoken); }) {
// std::cout << "PeriodicTask created with interval: " << interval_.count() << " ms" << std::endl;
}
~PeriodicTask() {
worker_.request_stop();
cv_.notify_one(); // Ensure worker wakes up when destroyed
// std::cout << "PeriodicTask destructor called, stopping worker." << std::endl;
}
void wakeUp() {
{
std::lock_guard<std::mutex> lock(wakeup_mutex_);
wake_up_requested_ = true;
}
cv_.notify_one(); // Notify worker thread to wake up immediately
// std::cout << "wakeUp() called: worker thread will wake up." << std::endl;
}
std::future<void> wakeUpWait() {
std::promise<void> promise;
std::future<void> future = promise.get_future();
{
std::lock_guard<std::mutex> lock(promise_mutex_);
wakeup_promises_.push_back(std::move(promise));
}
wakeUp();
return future;
}
private:
void run(std::stop_token stoken) {
while (!stoken.stop_requested()) {
std::unique_lock lock(mutex_);
// Wait for either the time interval or a wake-up signal
cv_.wait_for(lock, interval_, [this] { return wake_up_requested_.load(); });
if (stoken.stop_requested())
break;
// If the wake-up was triggered, reset the flag and process the task
{
std::lock_guard<std::mutex> lock(wakeup_mutex_);
wake_up_requested_ = false;
}
try {
// std::cout << "Running task function." << std::endl;
func_();
} catch (...) {
std::cerr << "Error in task function." << std::endl;
}
notifyPromises();
}
}
void notifyPromises() {
std::lock_guard<std::mutex> lock(promise_mutex_);
// std::cout << "Notifying all waiting promises." << std::endl;
for (auto& promise : wakeup_promises_) {
promise.set_value();
}
wakeup_promises_.clear();
}
std::function<void()> func_;
std::chrono::milliseconds interval_;
std::mutex mutex_;
std::condition_variable cv_;
std::vector<std::promise<void>> wakeup_promises_;
std::mutex promise_mutex_;
std::mutex wakeup_mutex_;
std::atomic<bool> wake_up_requested_ = false;
std::jthread worker_;
};
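// Usage sketch (illustrative): run a background task every 50 ms and force one
// immediate, awaited run. The jthread stops automatically when the task is destroyed.
inline void example_periodic_task() {
  PeriodicTask task([] { /* hypothetical flush work */ },
                    std::chrono::milliseconds(50));
  auto done = task.wakeUpWait();  // request an immediate run
  done.wait();                    // returns once that run has finished
}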
} // namespace periodic
#endif // PERIODIC_TASK_HPP

View file

@ -0,0 +1,36 @@
/*
* @Author: Xie Weiyu ervinxie@qq.com
* @Date: 2024-11-21 06:35:47
* @LastEditors: Xie Weiyu ervinxie@qq.com
* @LastEditTime: 2024-11-21 06:35:50
* @FilePath: /kvc2/src/utils/spin_lock.hpp
* @Description: Spin lock with exponential backoff.
*/
#include <atomic>
#include <chrono>
#include <thread>
class SpinLock {
public:
SpinLock() { flag.clear(); }
void lock() {
const int max_delay = 1024; // Maximum delay in microseconds
int delay = 1; // Initial delay in microseconds
while (flag.test_and_set(std::memory_order_acquire)) {
std::this_thread::sleep_for(std::chrono::microseconds(delay));
delay *= 2;
if (delay > max_delay) {
delay = max_delay;
}
}
}
void unlock() { flag.clear(std::memory_order_release); }
private:
std::atomic_flag flag = ATOMIC_FLAG_INIT;
};

View file

@ -0,0 +1,128 @@
#pragma once
#include <cassert>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include "easy_format.hpp"
inline std::string doubleToStringR2(double value) {
std::stringstream stream;
stream << std::fixed << std::setprecision(2) << value;
return stream.str();
}
class Timer {
public:
std::string name;
bool tmp_timer = false;
Timer() {}
Timer(std::string name) : name(name), tmp_timer(true) { start(); }
~Timer() {
if (tmp_timer) {
std::cout << name << " " << elapsedMs() << " ms" << std::endl;
}
}
void start() {
m_startTime = std::chrono::high_resolution_clock::now();
assert(m_isRunning == false);
m_isRunning = true;
}
void stop() {
m_endTime = std::chrono::high_resolution_clock::now();
assert(m_isRunning == true);
m_isRunning = false;
m_runningNs += elapsedNs();
}
double elapsedNs() {
std::chrono::time_point<std::chrono::high_resolution_clock> endTime;
if (m_isRunning) {
endTime = std::chrono::high_resolution_clock::now();
} else {
endTime = m_endTime;
}
return std::chrono::duration_cast<std::chrono::nanoseconds>(endTime - m_startTime).count();
}
void printElapsedMilliseconds() { std::cout << elapsedNs() / 1e6 << " ms" << std::endl; }
static std::string ns_to_string(double duration) {
auto nano_sec = duration;
if (nano_sec >= 1000) {
auto mirco_sec = nano_sec / 1000.0;
if (mirco_sec >= 1000) {
auto milli_sec = mirco_sec / 1000.0;
if (milli_sec >= 1000) {
auto seconds = milli_sec / 1000.0;
if (seconds >= 60.0) {
auto minutes = seconds / 60.0;
if (minutes >= 60.0) {
auto hours = minutes / 60.0;
return doubleToStringR2(hours) + " h";
} else {
return doubleToStringR2(minutes) + " min";
}
} else {
return doubleToStringR2(seconds) + " sec";
}
} else {
return doubleToStringR2(milli_sec) + " ms";
}
} else {
return doubleToStringR2(mirco_sec) + " us";
}
} else {
return doubleToStringR2(nano_sec) + " ns";
}
}
double runningTimeNs() { return m_runningNs; }
std::string runningTime() {
auto duration = m_runningNs;
return ns_to_string(duration);
}
std::string elapsedTime() { return ns_to_string(elapsedNs()); }
double elapsedMs() { return elapsedNs() / 1e6; }
std::string report_throughput(size_t op_cnt) {
double ops = op_cnt / elapsedMs() * 1000;
return readable_number(ops) + "op/s";
}
void merge(Timer& other) {
assert(m_isRunning == false);
assert(other.m_isRunning == false);
m_runningNs += other.runningTimeNs();
}
private:
std::chrono::time_point<std::chrono::high_resolution_clock> m_startTime;
std::chrono::time_point<std::chrono::high_resolution_clock> m_endTime;
bool m_isRunning = false;
double m_runningNs = 0.0;
};
class Counter {
public:
Counter() {}
std::map<std::string, size_t> counters;
void inc(const char* name, size_t num) { counters[name] += num; };
void print() {
for (auto& p : counters) {
std::cout << p.first << " : " << p.second << std::endl;
}
};
};
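// Usage sketch (illustrative): scoped timing plus a throughput report; op_cnt is
// a hypothetical number of operations completed inside the timed region.
inline void example_timer(size_t op_cnt) {
  Timer t;
  t.start();
  // ... timed work ...
  t.stop();
  std::cout << "took " << t.elapsedTime() << ", " << t.report_throughput(op_cnt) << std::endl;
}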

View file

@ -0,0 +1,78 @@
set(CMAKE_CXX_FLAGS "-Og -march=native -Wall -Wextra -g -fopenmp")
# set(CMAKE_CXX_FLAGS "-O3 -march=native -Wall -Wextra -pthread")
add_subdirectory(kvc2test)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src)
add_executable(hashmap_test hashmap_test.cpp)
target_link_libraries(hashmap_test PRIVATE TBB::tbb)
add_executable(xxHash_test xxHash_test.cpp)
target_link_libraries(xxHash_test PRIVATE xxhash)
function(add_async_store_executable source_file)
get_filename_component(target_name ${source_file} NAME_WE) # use the file name without its extension as the target name
add_executable(${target_name} ${source_file})
target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src)
target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/nlohmann/single_include)
target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/spdlog/include)
target_link_libraries(${target_name} PRIVATE async_store gflags)
endfunction()
add_async_store_executable(async_store_test.cpp)
function(add_kvc2_executable source_file)
get_filename_component(target_name ${source_file} NAME_WE) # use the file name without its extension as the target name
add_executable(${target_name} ${source_file})
# target_compile_options(${target_name} PRIVATE -fopenmp -fno-strict-aliasing)
target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src)
target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/nlohmann/single_include)
target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/spdlog/include)
target_link_libraries(${target_name} PRIVATE kvc2 async_store gflags)
endfunction()
add_kvc2_executable(test_lock_free_queue.cpp)
add_kvc2_executable(test_queue_perf.cpp)
# Disable deprecated test
# add_kvc2_executable(prefix_test.cpp)
# add_kvc2_executable(kvcache_disk_insert_read_test.cpp)
# add_kvc2_executable(kvcache_mem_eviction_test.cpp)
# add_kvc2_executable(kvcache_mem_insert_read_test.cpp)
# add_kvc2_executable(kvcache_save_load_test.cpp)
# add_kvc2_executable(kvc2_export_header_test.cpp)
# add_kvc2_executable(kvc2_export_load_test.cpp)
target_include_directories(async_store_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/nlohmann/single_include)
target_include_directories(async_store_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/spdlog/include)
target_link_libraries(async_store_test PRIVATE xxhash)
add_executable(test_std_list test_std_list.cpp)
add_executable(test_cuda_stream test_cuda_stream.cpp)
target_include_directories(test_cuda_stream PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
target_link_libraries(test_cuda_stream PRIVATE CUDA::cudart)
add_executable(test_cuda_stream_manager test_cuda_stream_manager.cpp)
target_include_directories(test_cuda_stream_manager PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src)
target_link_libraries(test_cuda_stream_manager PRIVATE cuda_stream_manager)
add_executable(test_periodic_task test_periodic_task.cpp)
target_include_directories(test_periodic_task PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src)
add_executable(test_page_pool page_pool_test.cpp)
target_include_directories(test_page_pool PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src)
target_include_directories(test_page_pool PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/spdlog/include)

View file

@ -0,0 +1,11 @@
#include <tbb/concurrent_hash_map.h>
#include <iostream>
int main() {
tbb::concurrent_hash_map<int, int> map;
map.insert({1, 2});
decltype(map)::accessor a;
std::cout << map.find(a, 1) << std::endl;
return 0;
}

View file

@ -0,0 +1,87 @@
#include "kvc2.h"
#include "kvc2_test_utils.cpp"
int main(int argc, char* argv[]) {
init(argc, argv);
spdlog::set_level(spdlog::level::debug);
std::mt19937 gen(123);
KVC2Config config = {
.path = FLAGS_disk_cache_path,
.config_path = std::string("/home/xwy/conifg"),
.block_length = BlockLength,
.memory_pool_size = size_t(10e9),
.evict_count = 20,
};
auto kvcc = create_kvc2(config);
auto io = kvcc->start_io_thread();
SPDLOG_INFO("Disk Test");
auto ids = random_ids(10 * BlockLength, gen);
auto h1 = random_kvcache(qwen_cache_info, 10, gen);
kvcc->raw_insert(qwen_cache_info, reinterpret_cast<TokenPtr>(ids.data()), ids.size(), h1);
// complete same
{
auto h2 = empty_kvcache(qwen_cache_info, 10);
kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids.data()), ids.size(), h2);
cmp_handle_data(qwen_cache_info, h1, h2);
}
// complete prefix
{
auto h2 = empty_kvcache(qwen_cache_info, 10);
auto ids2 = std::vector<Token>(ids.begin(), ids.begin() + 3 * BlockLength);
kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);
cmp_handle_data(qwen_cache_info, h1, h2, 3);
}
// common prefix
{
auto h2 = empty_kvcache(qwen_cache_info, 10);
auto ids2 = std::vector<Token>(ids.begin(), ids.begin() + 5 * BlockLength);
auto rids = random_ids(BlockLength * 2 + BlockLength / 2, gen);
ids2.insert(ids2.end(), rids.begin(), rids.end());
kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);
cmp_handle_data(qwen_cache_info, h1, h2, 5);
}
// no prefix
{
auto h2 = empty_kvcache(qwen_cache_info, 10);
auto ids2 = random_ids(10 * BlockLength, gen);
kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);
}
// insert partly new
auto h2 = random_kvcache(qwen_cache_info, 10, gen);
copy_kvcache(h1, h2, 0, 5);
auto ids2 = random_ids(10 * BlockLength, gen);
for (size_t i = 0; i < 5 * BlockLength; i++) {
ids2[i] = ids[i];
}
kvcc->raw_insert(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);
// read new part
{
auto h3 = empty_kvcache(qwen_cache_info, 10);
auto ids3 = std::vector<Token>(ids2.begin(), ids2.begin() + 7 * BlockLength);
ids3.push_back(123);
kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids3.data()), ids3.size(), h3);
cmp_handle_data(qwen_cache_info, h3, h2, 7);
}
kvcc->save();
kvcc->stop_io_thread();
io.join();
SPDLOG_WARN("{} Test Passed", __FILE__);
return 0;
}

View file

@ -0,0 +1,87 @@
#include "kvc2.h"
#include "kvc2_test_utils.cpp"
int main(int argc, char* argv[]) {
init(argc, argv);
spdlog::set_level(spdlog::level::debug);
std::mt19937 gen(123);
KVC2Config config = {
.path = FLAGS_disk_cache_path,
.block_length = BlockLength,
.memory_pool_size = size_t(10e9),
.evict_count = 20,
};
auto kvcc = create_kvc2(config);
kvcc->load();
auto io = kvcc->start_io_thread();
SPDLOG_INFO("Disk Test");
auto ids = random_ids(10 * BlockLength, gen);
auto h1 = empty_kvcache(qwen_cache_info, 10);
// kvcc->raw_insert(qwen_cache_info, reinterpret_cast<IDptr>(ids.data()), ids.size(), h1);
// complete same
{
// auto h2 = empty_kvcache(qwen_cache_info, 10);
kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids.data()), ids.size(), h1);
// cmp_handle_data(qwen_cache_info, h1, h2);
}
// complete prefix
{
auto h2 = empty_kvcache(qwen_cache_info, 10);
auto ids2 = std::vector<Token>(ids.begin(), ids.begin() + 3 * BlockLength);
kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);
cmp_handle_data(qwen_cache_info, h1, h2, 3);
}
// common prefix
{
auto h2 = empty_kvcache(qwen_cache_info, 10);
auto ids2 = std::vector<Token>(ids.begin(), ids.begin() + 5 * BlockLength);
auto rids = random_ids(BlockLength * 2 + BlockLength / 2, gen);
ids2.insert(ids2.end(), rids.begin(), rids.end());
kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);
cmp_handle_data(qwen_cache_info, h1, h2, 5);
}
// no prefix
{
auto h2 = empty_kvcache(qwen_cache_info, 10);
auto ids2 = random_ids(10 * BlockLength, gen);
kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);
}
// insert partly new
auto h2 = random_kvcache(qwen_cache_info, 10, gen);
copy_kvcache(h1, h2, 0, 5);
auto ids2 = random_ids(10 * BlockLength, gen);
for (size_t i = 0; i < 5 * BlockLength; i++) {
ids2[i] = ids[i];
}
kvcc->raw_insert(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);
// read new part
{
auto h3 = empty_kvcache(qwen_cache_info, 10);
auto ids3 = std::vector<Token>(ids2.begin(), ids2.begin() + 7 * BlockLength);
ids3.push_back(123);
kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids3.data()), ids3.size(), h3);
cmp_handle_data(qwen_cache_info, h3, h2, 7);
}
kvcc->stop_io_thread();
io.join();
SPDLOG_WARN("{} Test Passed", __FILE__);
return 0;
}

View file

@ -0,0 +1,117 @@
#include <optional>
#include <random>
#include "kvc2.h"
#define FMT_HEADER_ONLY
#include <spdlog/spdlog.h>
const int BlockLength = 256;
std::string FLAGS_disk_cache_path;
void init(int argc, char* argv[]) {
if (argc != 2) {
fmt::print("Usage: {} --disk_cache_path=xxx\n", argv[0]);
exit(1);
}
FLAGS_disk_cache_path = argv[1];
if (FLAGS_disk_cache_path.empty()) {
fmt::print("disk_cache_path is empty");
exit(1);
}
}
using namespace kvc2;
data_block_ptr empty_block(CacheInfo info) {
auto re = new (std::align_val_t(4096)) std::byte[info.element_size(BlockLength)];
return reinterpret_cast<data_block_ptr>(re);
}
data_block_ptr random_block(CacheInfo info, std::mt19937& gen) {
auto re = empty_block(info);
uint64_t* d = (uint64_t*)re;
for (size_t i = 0; i < info.element_size(BlockLength) / 8; i++) {
d[i] = gen();
}
return re;
}
layer_data random_blocks(CacheInfo info, size_t block_count, size_t seed) {
std::mt19937 gen(seed);
layer_data re;
for (size_t i = 0; i < block_count; i++) {
re.push_back(random_block(info, gen));
}
return re;
}
layer_data empty_blocks(CacheInfo info, size_t block_count) {
layer_data re;
for (size_t i = 0; i < block_count; i++) {
re.push_back(empty_block(info));
}
return re;
}
void copy_kvcache(std::vector<layer_data>& from, std::vector<layer_data>& to, size_t block_start, size_t length) {
for (size_t i = 0; i < from.size(); i++) {
for (size_t j = 0; j < length; j++) {
to[i][block_start + j] = from[i][block_start + j];
}
}
}
std::vector<layer_data> random_kvcache(CacheInfo info, size_t block_count, std::mt19937& gen) {
std::vector<layer_data> re;
re.resize(info.hidden_layer_count());
fmt::print("Generating random kvcache, layer {}\n", info.hidden_layer_count());
#pragma omp parallel for
for (size_t i = 0; i < info.hidden_layer_count(); i++) {
re[i] = random_blocks(info, block_count, gen());
}
return re;
}
std::vector<layer_data> empty_kvcache(CacheInfo info, size_t block_count) {
std::vector<layer_data> re;
re.resize(info.hidden_layer_count());
fmt::print("Generating empty kvcache, layer {}\n", info.hidden_layer_count());
#pragma omp parallel for
for (size_t i = 0; i < info.hidden_layer_count(); i++) {
re[i] = empty_blocks(info, block_count);
}
return re;
}
std::vector<Token> random_ids(size_t length, std::mt19937& gen) {
std::vector<Token> re;
for (size_t i = 0; i < length; i++) {
re.push_back(gen());
}
return re;
}
CacheInfo qwen_cache_info = {
.model_name = "qwen2-72b-instruct",
.is_key_cache = true,
.quant_type = "BF16",
};
void cmp_handle_data(CacheInfo info, std::vector<layer_data>& h1, std::vector<layer_data>& h2,
std::optional<size_t> blocks = std::nullopt) {
assert(h1.size() == h2.size());
for (size_t i = 0; i < h1.size(); i++) {
auto& b1 = h1[i];
auto& b2 = h2[i];
if (blocks.has_value() == false) {
assert(b1.size() == b2.size());
}
int cmp_to = blocks.has_value() ? blocks.value() : b1.size();
for (int j = 0; j < cmp_to; j++) {
auto e1 = reinterpret_cast<void*>(b1[j]);
auto e2 = reinterpret_cast<void*>(b2[j]);
assert(memcmp(e1, e2, info.element_size(BlockLength)) == 0);
}
}
fmt::print("KVCacheHandle cmp ok\n");
}

View file

@ -0,0 +1,26 @@
set(CMAKE_CXX_FLAGS "-Og -march=native -Wall -Wextra -g -fopenmp")
function(add_kvc2_test source_file)
get_filename_component(target_name ${source_file} NAME_WE) # use the file name without its extension as the target name
add_executable(${target_name} ${source_file})
# target_compile_options(${target_name} PRIVATE -fopenmp -fno-strict-aliasing)
target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../src)
target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/nlohmann/single_include)
target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/spdlog/include)
target_link_libraries(${target_name} PRIVATE kvc2 async_store)
endfunction()
add_kvc2_test(raw_insert_read.cpp)
add_kvc2_test(lookup.cpp)
add_kvc2_test(lookup-alt.cpp)
add_kvc2_test(lookup-alt-gpu.cpp)
add_kvc2_test(lookup-mt.cpp)
add_kvc2_test(lookup-gpu.cpp)
add_kvc2_test(lookup-gpu-mt.cpp)
add_kvc2_test(lookup-gpu-async.cpp)
add_kvc2_test(append-tokens.cpp)
add_kvc2_test(flush-back.cpp)
add_kvc2_test(check-flush-back.cpp)
add_kvc2_test(lookup-without-vcache.cpp)
add_kvc2_test(lookup-gpu-mt-without-vcache.cpp)

View file

@ -0,0 +1,52 @@
#include <future>
#include "common.hpp"
int main(int argc, char* argv[]) {
init(argc, argv);
spdlog::set_level(spdlog::level::debug);
auto kvc2 = kvc2::create_kvc2(config);
#pragma omp parallel for
for (size_t ti = 0; ti < 3; ti++) {
auto [kcache, vcache] = kvc2->get_kvcache();
std::mt19937 gen(ti + 123);
size_t total_page = 10;
TokenLength total_length = total_page * config.num_token_per_page;
auto tokens = random_ids(total_length, gen);
TokenLength prompt_length = 3 * config.num_token_per_page;
auto k1 = random_kvcache(total_page, gen);
auto v1 = random_kvcache(total_page, gen);
{
std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, tokens.data(), prompt_length, total_length,
[&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
auto fut = p.get_future();
fut.wait();
auto h = fut.get();
assert(h->matched_length() % config.num_token_per_page == 0);
size_t matched_block = h->matched_length() / config.num_token_per_page;
auto block_idx = h->get_gpu_block_idx();
cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, matched_block);
for (size_t at = matched_block; at < block_idx.size(); at++) {
copy_cpu_gpu(block_idx, kcache, vcache, k1, v1, at);
}
h->append_tokens(tokens.data(), total_length);
cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, total_page);
}
{
std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, tokens.data(), total_length, total_length,
[&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
auto fut = p.get_future();
fut.wait();
auto h = fut.get();
assert(h->matched_length() == total_length);
size_t matched_block = h->matched_length() / config.num_token_per_page;
auto block_idx = h->get_gpu_block_idx();
cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, matched_block);
}
}
SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
return 0;
}

View file

@ -0,0 +1,36 @@
#include <future>
#include "common.hpp"
int main(int argc, char* argv[]) {
init(argc, argv);
spdlog::set_level(spdlog::level::debug);
config.gpu_cache_config->total_kvcache_pages = 12;
auto kvc2 = kvc2::create_kvc2(config);
kvc2->load();
// #pragma omp parallel for
for (size_t ti = 0; ti < 2; ti++) {
SPDLOG_WARN("Test {}", ti);
auto [kcache, vcache] = kvc2->get_kvcache();
std::mt19937 gen(ti + 123);
size_t total_page = 10;
TokenLength total_length = total_page * config.num_token_per_page;
auto tokens = random_ids(total_length, gen);
auto k1 = random_kvcache(total_page, gen);
auto v1 = random_kvcache(total_page, gen);
{
std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, tokens.data(), total_length, total_length,
[&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
auto fut = p.get_future();
fut.wait();
auto h = fut.get();
assert(h->matched_length() == total_length);
size_t matched_block = h->matched_length() / config.num_token_per_page;
auto block_idx = h->get_gpu_block_idx();
cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, matched_block);
}
}
SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
return 0;
}

View file

@ -0,0 +1,233 @@
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 06:02:41
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-12-11 07:34:10
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#pragma once
#include <random>
#include <thread>
#include "kvc2.h"
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
using namespace kvc2;
template <typename T>
T* offset_by_bytes(T* t, size_t n) {
return reinterpret_cast<T*>(reinterpret_cast<size_t>(t) + n);
}
std::string FLAGS_disk_cache_path;
kvc2::KVC2Config config;
kvc2::GPUPageCacheConfig qw25_7B_gpu_config{
.gpu_only = false,
.gpu_devices_id = {0, 1},
.layer_count = 28,
.total_kvcache_pages = 40,
.num_token_per_page = 256,
.num_k_heads = 4,
.k_head_dim = 896,
.full_kv_cache_on_each_gpu = false,
.k_cache_on = true,
.v_cache_on = true,
.tensor_type = torch::kBFloat16,
.num_streams_per_device = 4,
};
ModelName test_model_name = "Qwen2.5-7B-Instruct";
QuantType test_quant_type = "FP16";
CacheInfo test_cache_info{
.model_name = test_model_name,
.is_key_cache = true,
.quant_type = test_quant_type,
};
void init(int argc, char* argv[]) {
if (argc != 2) {
fmt::print("Usage: {} <disk_cache_path>\n", argv[0]);
exit(1);
}
load_quant_configs("./config/quant_configs.json");
load_model_configs("./config/model_configs.json");
FLAGS_disk_cache_path = argv[1];
if (FLAGS_disk_cache_path.empty()) {
fmt::print("disk_cache_path is empty\n");
exit(1);
}
config.path = FLAGS_disk_cache_path;
config.config_path = "./config";
config.gpu_cache_config = qw25_7B_gpu_config;
}
data_block_ptr empty_block() {
auto re = new (std::align_val_t(4096)) std::byte[test_cache_info.element_size(config.num_token_per_page)];
memset(re, 0, test_cache_info.element_size(config.num_token_per_page));
return reinterpret_cast<data_block_ptr>(re);
}
data_block_ptr random_block(std::mt19937& gen) {
auto re = empty_block();
uint64_t* d = (uint64_t*)re;
for (size_t i = 0; i < test_cache_info.element_size(config.num_token_per_page) / 8; i++) {
d[i] = gen();
}
return re;
}
layer_data random_blocks(size_t block_count, size_t seed) {
std::mt19937 gen(seed);
layer_data re;
for (size_t i = 0; i < block_count; i++) {
re.push_back(random_block(gen));
}
return re;
}
layer_data empty_blocks(size_t block_count) {
layer_data re;
for (size_t i = 0; i < block_count; i++) {
re.push_back(empty_block());
}
return re;
}
void copy_kvcache(std::vector<layer_data>& from, std::vector<layer_data>& to, size_t block_start, size_t length) {
for (size_t i = 0; i < from.size(); i++) {
for (size_t j = 0; j < length; j++) {
to[i][block_start + j] = from[i][block_start + j];
}
}
}
std::vector<layer_data> random_kvcache(size_t block_count, std::mt19937& gen) {
std::vector<layer_data> re;
re.resize(test_cache_info.hidden_layer_count());
fmt::print("Generating random kvcache, layer {}\n", test_cache_info.hidden_layer_count());
std::vector<std::mt19937> gens;
for (size_t i = 0; i < test_cache_info.hidden_layer_count(); i++) {
gens.push_back(std::mt19937(gen()));
}
#pragma omp parallel for
for (size_t i = 0; i < test_cache_info.hidden_layer_count(); i++) {
re[i] = random_blocks(block_count, gens[i]());
}
return re;
}
std::vector<layer_data> empty_kvcache(size_t block_count) {
std::vector<layer_data> re;
re.resize(test_cache_info.hidden_layer_count());
fmt::print("Generating empty kvcache, layer {}\n", test_cache_info.hidden_layer_count());
#pragma omp parallel for
for (size_t i = 0; i < test_cache_info.hidden_layer_count(); i++) {
re[i] = empty_blocks(block_count);
}
return re;
}
std::vector<Token> random_ids(size_t length, std::mt19937& gen) {
std::vector<Token> re;
for (size_t i = 0; i < length; i++) {
re.push_back(gen());
}
return re;
}
std::vector<layer_data> slice(std::vector<layer_data>& h1, size_t start, size_t end) {
std::vector<layer_data> re;
for (auto& l : h1) {
layer_data new_layer;
new_layer.insert(new_layer.end(), l.begin() + start, l.begin() + end);
re.push_back(new_layer);
}
return re;
}
void cmp_handle_data(std::vector<layer_data> h1, std::vector<layer_data> h2,
std::optional<size_t> blocks = std::nullopt) {
assert(h1.size() == h2.size());
for (size_t i = 0; i < h1.size(); i++) {
auto& b1 = h1[i];
auto& b2 = h2[i];
if (blocks.has_value() == false) {
assert(b1.size() == b2.size());
}
int cmp_to = blocks.has_value() ? blocks.value() : b1.size();
for (int j = 0; j < cmp_to; j++) {
auto e1 = reinterpret_cast<void*>(b1[j]);
auto e2 = reinterpret_cast<void*>(b2[j]);
assert(memcmp(e1, e2, test_cache_info.element_size(config.num_token_per_page)) == 0);
}
}
fmt::print("KVCacheHandle cmp ok\n");
}
void copy_gpu_cpu(std::vector<size_t>& block_idx, std::vector<torch::Tensor>& kcache,
std::vector<torch::Tensor>& vcache, std::vector<layer_data>& k_cpu, std::vector<layer_data>& v_cpu,
size_t at) {
size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
for (size_t layer = 0; layer < test_cache_info.hidden_layer_count(); layer++) {
for (size_t gpu_idx = 0; gpu_idx < gpu_count; gpu_idx++) {
{
auto kt = kcache[gpu_idx][layer][block_idx[at]].to(torch::kCPU);
void* src = kt.data_ptr();
void* dst = offset_by_bytes(k_cpu[layer][at], gpu_idx * element_size_per_gpu);
memcpy(dst, src, element_size_per_gpu);
}
{
auto vt = vcache[gpu_idx][layer][block_idx[at]].to(torch::kCPU);
void* src = vt.data_ptr();
void* dst = offset_by_bytes(v_cpu[layer][at], gpu_idx * element_size_per_gpu);
memcpy(dst, src, element_size_per_gpu);
}
}
}
}
void copy_cpu_gpu(std::vector<size_t>& block_idx, std::vector<torch::Tensor>& kcache,
std::vector<torch::Tensor>& vcache, std::vector<layer_data>& k_cpu, std::vector<layer_data>& v_cpu,
size_t at) {
size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
for (size_t layer = 0; layer < test_cache_info.hidden_layer_count(); layer++) {
for (size_t gpu_idx = 0; gpu_idx < gpu_count; gpu_idx++) {
{
auto kt = kcache[gpu_idx][layer][block_idx[at]].to(torch::kCPU);
void* dst = kt.data_ptr();
void* src = offset_by_bytes(k_cpu[layer][at], gpu_idx * element_size_per_gpu);
memcpy(dst, src, element_size_per_gpu);
kcache[gpu_idx][layer][block_idx[at]].copy_(kt);
}
{
auto vt = vcache[gpu_idx][layer][block_idx[at]].to(torch::kCPU);
void* dst = vt.data_ptr();
void* src = offset_by_bytes(v_cpu[layer][at], gpu_idx * element_size_per_gpu);
memcpy(dst, src, element_size_per_gpu);
vcache[gpu_idx][layer][block_idx[at]].copy_(vt);
}
}
}
}
void cmp_handle_gpu(std::vector<size_t>& block_idx, std::vector<torch::Tensor>& kcache,
std::vector<torch::Tensor>& vcache, std::vector<layer_data>& k1, std::vector<layer_data>& v1,
size_t num_blocks) {
auto k_from_gpu = empty_kvcache(num_blocks);
auto v_from_gpu = empty_kvcache(num_blocks);
for (size_t j = 0; j < std::min(block_idx.size(), num_blocks); j++) {
copy_gpu_cpu(block_idx, kcache, vcache, k_from_gpu, v_from_gpu, j);
}
cmp_handle_data(k1, k_from_gpu, num_blocks);
cmp_handle_data(v1, v_from_gpu, num_blocks);
}

View file

@ -0,0 +1,57 @@
#include <future>
#include "common.hpp"
int main(int argc, char* argv[]) {
init(argc, argv);
spdlog::set_level(spdlog::level::debug);
config.gpu_cache_config->total_kvcache_pages = 12;
auto kvc2 = kvc2::create_kvc2(config);
// #pragma omp parallel for
for (size_t ti = 0; ti < 2; ti++) {
SPDLOG_WARN("Test {}", ti);
auto [kcache, vcache] = kvc2->get_kvcache();
std::mt19937 gen(ti + 123);
size_t total_page = 10;
TokenLength total_length = total_page * config.num_token_per_page;
auto tokens = random_ids(total_length, gen);
TokenLength prompt_length = 3 * config.num_token_per_page;
auto k1 = random_kvcache(total_page, gen);
auto v1 = random_kvcache(total_page, gen);
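// First pass: async lookup of the prompt prefix, verify the matched GPU blocks, fill the remaining blocks from the reference data, then append the full token sequence.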
{
std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, tokens.data(), prompt_length, total_length,
[&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
auto fut = p.get_future();
fut.wait();
auto h = fut.get();
assert(h->matched_length() % config.num_token_per_page == 0);
size_t matched_block = h->matched_length() / config.num_token_per_page;
auto block_idx = h->get_gpu_block_idx();
cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, matched_block);
for (size_t at = matched_block; at < block_idx.size(); at++) {
copy_cpu_gpu(block_idx, kcache, vcache, k1, v1, at);
}
h->append_tokens(tokens.data(), total_length);
cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, total_page);
}
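// Second pass: the full sequence should now match completely and the GPU blocks should hold the reference data.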
{
std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, tokens.data(), total_length, total_length,
[&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
auto fut = p.get_future();
fut.wait();
auto h = fut.get();
assert(h->matched_length() == total_length);
size_t matched_block = h->matched_length() / config.num_token_per_page;
auto block_idx = h->get_gpu_block_idx();
cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, matched_block);
}
}
kvc2->save();
SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
return 0;
}

View file

@ -0,0 +1,125 @@
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 08:29:45
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-22 09:56:12
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include <future>
#include "common.hpp"
int main(int argc, char* argv[]) {
init(argc, argv);
spdlog::set_level(spdlog::level::trace);
auto kvc2 = kvc2::create_kvc2(config);
std::mt19937 gen(123);
std::vector<std::vector<Token>> ids;
std::vector<std::vector<layer_data>> k, v;
for (size_t i = 0; i < 10; i++) {
ids.push_back(random_ids(1 * config.num_token_per_page, gen));
k.push_back(random_kvcache(1, gen));
v.push_back(random_kvcache(1, gen));
kvc2->raw_insert(test_model_name, test_quant_type, ids[i].data(), ids[i].size(), k[i], v[i]);
}
kvc2->debug();
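// Each inserted sequence above occupies exactly one page, so the chunked lookups below match page by page.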
{
// all match
std::vector<Token*> chunks;
std::vector<TokenLength> lengths;
for (size_t i = 0; i < 10; i++) {
chunks.push_back(ids[i].data());
lengths.push_back(ids[i].size());
}
std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
kvc2->lookup_alt_to_gpu_async(test_model_name, test_quant_type, chunks, lengths, 15 * config.num_token_per_page,
[&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
auto fut = p.get_future();
fut.wait();
auto h = fut.get();
auto hk = h->handle_data(true);
auto hv = h->handle_data(false);
for (size_t i = 0; i < 10; i++) {
cmp_handle_data(slice(hk, i, i + 1), k[i], 1);
cmp_handle_data(slice(hv, i, i + 1), v[i], 1);
}
auto block_idx = h->get_gpu_block_idx();
auto [kcache, vcache] = kvc2->get_kvcache();
for (size_t i = 0; i < 10; i++) {
std::vector<size_t> blocks = {block_idx[i]};
cmp_handle_gpu(blocks, kcache, vcache, k[i], v[i], 1);
}
}
{
// no match in the middle
std::vector<Token*> chunks;
std::vector<TokenLength> lengths;
std::vector<std::vector<Token>> new_ids;
for (size_t i = 0; i < 10; i++) {
new_ids.push_back(random_ids(1 * config.num_token_per_page, gen));
}
for (size_t i = 0; i < 10; i++) {
if (i == 1 || i == 5 || i == 6) {
chunks.push_back(new_ids[i].data());
} else {
chunks.push_back(ids[i].data());
}
lengths.push_back(ids[i].size());
}
std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
kvc2->lookup_alt_to_gpu_async(test_model_name, test_quant_type, chunks, lengths, 15 * config.num_token_per_page,
[&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
auto fut = p.get_future();
fut.wait();
auto h = fut.get();
auto statuses = h->matched_status();
for (size_t i = 0; i < 10; i++) {
if (i == 1) {
assert(statuses[i] == MatchStatus::NotMatchExact);
} else if (i == 5 || i == 6) {
assert(statuses[i] == MatchStatus::NotMatchPartial);
} else if (i == 0) {
assert(statuses[i] == MatchStatus::Exact);
} else {
assert(statuses[i] == MatchStatus::Partial);
}
}
auto hk = h->handle_data(true);
auto hv = h->handle_data(false);
for (size_t i = 0; i < 10; i++) {
if (i != 1 && i != 5 && i != 6) {
cmp_handle_data(slice(hk, i, i + 1), k[i], 1);
cmp_handle_data(slice(hv, i, i + 1), v[i], 1);
}
}
auto block_idx = h->get_gpu_block_idx();
auto [kcache, vcache] = kvc2->get_kvcache();
for (size_t i = 0; i < 10; i++) {
if (i != 1 && i != 5 && i != 6) {
std::vector<size_t> blocks = {block_idx[i]};
cmp_handle_gpu(blocks, kcache, vcache, k[i], v[i], 1);
}
}
}
SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
return 0;
}

View file

@ -0,0 +1,97 @@
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 08:29:45
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-22 09:56:12
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "common.hpp"
int main(int argc, char* argv[]) {
init(argc, argv);
spdlog::set_level(spdlog::level::trace);
auto kvc2 = kvc2::create_kvc2(config);
std::mt19937 gen(123);
std::vector<std::vector<Token>> ids;
std::vector<std::vector<layer_data>> k, v;
for (size_t i = 0; i < 10; i++) {
ids.push_back(random_ids(1 * config.num_token_per_page, gen));
k.push_back(random_kvcache(1, gen));
v.push_back(random_kvcache(1, gen));
kvc2->raw_insert(test_model_name, test_quant_type, ids[i].data(), ids[i].size(), k[i], v[i]);
}
kvc2->debug();
{
// all match
std::vector<Token*> chunks;
std::vector<TokenLength> lengths;
for (size_t i = 0; i < 10; i++) {
chunks.push_back(ids[i].data());
lengths.push_back(ids[i].size());
}
auto h = kvc2->lookup_alt(test_model_name, test_quant_type, chunks, lengths, 15 * config.num_token_per_page);
auto hk = h->handle_data(true);
auto hv = h->handle_data(false);
for (size_t i = 0; i < 10; i++) {
cmp_handle_data(slice(hk, i, i + 1), k[i], 1);
cmp_handle_data(slice(hv, i, i + 1), v[i], 1);
}
}
{
// no match in the middle
std::vector<Token*> chunks;
std::vector<TokenLength> lengths;
std::vector<std::vector<Token>> new_ids;
for (size_t i = 0; i < 10; i++) {
new_ids.push_back(random_ids(1 * config.num_token_per_page, gen));
}
for (size_t i = 0; i < 10; i++) {
if (i == 1 || i == 5 || i == 6) {
chunks.push_back(new_ids[i].data());
} else {
chunks.push_back(ids[i].data());
}
lengths.push_back(ids[i].size());
}
auto h = kvc2->lookup_alt(test_model_name, test_quant_type, chunks, lengths, 15 * config.num_token_per_page);
auto statuses = h->matched_status();
for (size_t i = 0; i < 10; i++) {
if (i == 1) {
assert(statuses[i] == MatchStatus::NotMatchExact);
} else if (i == 5 || i == 6) {
assert(statuses[i] == MatchStatus::NotMatchPartial);
} else if (i == 0) {
assert(statuses[i] == MatchStatus::Exact);
} else {
assert(statuses[i] == MatchStatus::Partial);
}
}
auto hk = h->handle_data(true);
auto hv = h->handle_data(false);
for (size_t i = 0; i < 10; i++) {
if (i != 1 && i != 5 && i != 6) {
cmp_handle_data(slice(hk, i, i + 1), k[i], 1);
cmp_handle_data(slice(hv, i, i + 1), v[i], 1);
}
}
}
SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
return 0;
}

View file

@ -0,0 +1,49 @@
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 09:52:48
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-25 07:51:09
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include <future>
#include "common.hpp"
int main(int argc, char* argv[]) {
init(argc, argv);
spdlog::set_level(spdlog::level::debug);
auto kvc2 = kvc2::create_kvc2(config);
std::mt19937 gen(123);
auto ids1 = random_ids(10 * config.num_token_per_page, gen);
auto k1 = random_kvcache(10, gen);
auto v1 = random_kvcache(10, gen);
kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, v1);
// complete same
#pragma omp parallel for
for (size_t ti = 0; ti < 3; ti++) {
std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, ids1.data(), ids1.size(),
ids1.size() + 2 * config.num_token_per_page,
[&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
auto fut = p.get_future();
fut.wait();
auto h = fut.get();
auto k = h->handle_data(true);
auto v = h->handle_data(false);
cmp_handle_data(k1, k, 10);
cmp_handle_data(v1, v, 10);
auto block_idx = h->get_gpu_block_idx();
auto [kcache, vcache] = kvc2->get_kvcache();
cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, 10);
}
SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
return 0;
}

View file

@ -0,0 +1,61 @@
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 09:52:48
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-25 07:51:09
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "common.hpp"
int main(int argc, char* argv[]) {
qw25_7B_gpu_config.v_cache_on = false;
config.gpu_cache_config = qw25_7B_gpu_config;
config.v_cache_on = false;
init(argc, argv);
spdlog::set_level(spdlog::level::debug);
auto kvc2 = kvc2::create_kvc2(config);
std::mt19937 gen(123);
auto ids1 = random_ids(10 * config.num_token_per_page, gen);
auto k1 = random_kvcache(10, gen);
kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, {});
// complete same
#pragma omp parallel for
for (size_t ti = 0; ti < 3; ti++) {
auto h = kvc2->lookup_to_gpu(test_model_name, test_quant_type, ids1.data(), ids1.size(),
ids1.size() + 2 * config.num_token_per_page);
auto k = h->handle_data(true);
cmp_handle_data(k1, k, 10);
auto block_idx = h->get_gpu_block_idx();
auto [kcache, vcache] = kvc2->get_kvcache();
auto k_from_gpu = empty_kvcache(15);
size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
for (size_t i = 0; i < k_from_gpu.size(); i++) {
for (size_t j = 0; j < block_idx.size(); j++) {
size_t b_idx = block_idx[j];
for (size_t gpu_idx = 0; gpu_idx < gpu_count; gpu_idx++) {
{
auto kt = kcache[gpu_idx][i][b_idx].to(torch::kCPU);
void* src = kt.data_ptr();
void* dst = offset_by_bytes(k_from_gpu[i][j], gpu_idx * element_size_per_gpu);
memcpy(dst, src, element_size_per_gpu);
}
}
}
}
cmp_handle_data(k1, k_from_gpu, 10);
}
SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
return 0;
}

View file

@ -0,0 +1,68 @@
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 09:52:48
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-25 07:51:09
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "common.hpp"
int main(int argc, char* argv[]) {
init(argc, argv);
spdlog::set_level(spdlog::level::debug);
auto kvc2 = kvc2::create_kvc2(config);
std::mt19937 gen(123);
auto ids1 = random_ids(10 * config.num_token_per_page, gen);
auto k1 = random_kvcache(10, gen);
auto v1 = random_kvcache(10, gen);
kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, v1);
// complete same
#pragma omp parallel for
for (size_t ti = 0; ti < 3; ti++) {
auto h = kvc2->lookup_to_gpu(test_model_name, test_quant_type, ids1.data(), ids1.size(),
ids1.size() + 2 * config.num_token_per_page);
auto k = h->handle_data(true);
auto v = h->handle_data(false);
cmp_handle_data(k1, k, 10);
cmp_handle_data(v1, v, 10);
auto block_idx = h->get_gpu_block_idx();
auto [kcache, vcache] = kvc2->get_kvcache();
auto k_from_gpu = empty_kvcache(15);
auto v_from_gpu = empty_kvcache(15);
size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
for (size_t i = 0; i < k_from_gpu.size(); i++) {
for (size_t j = 0; j < block_idx.size(); j++) {
size_t b_idx = block_idx[j];
for (size_t gpu_idx = 0; gpu_idx < gpu_count; gpu_idx++) {
{
auto kt = kcache[gpu_idx][i][b_idx].to(torch::kCPU);
void* src = kt.data_ptr();
void* dst = offset_by_bytes(k_from_gpu[i][j], gpu_idx * element_size_per_gpu);
memcpy(dst, src, element_size_per_gpu);
}
{
auto vt = vcache[gpu_idx][i][b_idx].to(torch::kCPU);
void* src = vt.data_ptr();
void* dst = offset_by_bytes(v_from_gpu[i][j], gpu_idx * element_size_per_gpu);
memcpy(dst, src, element_size_per_gpu);
}
}
}
}
cmp_handle_data(k1, k_from_gpu, 10);
cmp_handle_data(v1, v_from_gpu, 10);
}
SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
return 0;
}

View file

@ -0,0 +1,160 @@
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 09:52:48
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-25 08:38:33
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "common.hpp"
int main(int argc, char* argv[]) {
init(argc, argv);
spdlog::set_level(spdlog::level::debug);
auto kvc2 = kvc2::create_kvc2(config);
std::mt19937 gen(123);
auto ids1 = random_ids(10 * config.num_token_per_page, gen);
auto k1 = random_kvcache(10, gen);
auto v1 = random_kvcache(10, gen);
kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, v1);
// complete same
{
auto h = kvc2->lookup_to_gpu(test_model_name, test_quant_type, ids1.data(), ids1.size(),
ids1.size() + 5 * config.num_token_per_page);
auto k = h->handle_data(true);
auto v = h->handle_data(false);
cmp_handle_data(k1, k, 10);
cmp_handle_data(v1, v, 10);
auto block_idx = h->get_gpu_block_idx();
auto [kcache, vcache] = kvc2->get_kvcache();
auto k_from_gpu = empty_kvcache(15);
auto v_from_gpu = empty_kvcache(15);
size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
for (size_t i = 0; i < k_from_gpu.size(); i++) {
for (size_t j = 0; j < block_idx.size(); j++) {
size_t b_idx = block_idx[j];
for (size_t gpu_idx = 0; gpu_idx < gpu_count; gpu_idx++) {
{
auto kt = kcache[gpu_idx][i][b_idx].to(torch::kCPU);
void* src = kt.data_ptr();
void* dst = offset_by_bytes(k_from_gpu[i][j], gpu_idx * element_size_per_gpu);
memcpy(dst, src, element_size_per_gpu);
}
{
auto vt = vcache[gpu_idx][i][b_idx].to(torch::kCPU);
void* src = vt.data_ptr();
void* dst = offset_by_bytes(v_from_gpu[i][j], gpu_idx * element_size_per_gpu);
memcpy(dst, src, element_size_per_gpu);
}
}
}
}
cmp_handle_data(k1, k_from_gpu, 10);
cmp_handle_data(v1, v_from_gpu, 10);
}
// prefix and evict
{
auto h = kvc2->lookup_to_gpu(test_model_name, test_quant_type, ids1.data(), config.num_token_per_page * 3,
config.gpu_cache_config->total_kvcache_pages * config.num_token_per_page);
auto k = h->handle_data(true);
auto v = h->handle_data(false);
cmp_handle_data(k1, k, 3);
cmp_handle_data(v1, v, 3);
auto block_idx = h->get_gpu_block_idx();
auto [kcache, vcache] = kvc2->get_kvcache();
auto k_from_gpu = empty_kvcache(3);
auto v_from_gpu = empty_kvcache(3);
size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
for (size_t i = 0; i < k_from_gpu.size(); i++) {
for (size_t j = 0; j < 3; j++) {
size_t b_idx = block_idx[j];
for (size_t gpu_idx = 0; gpu_idx < gpu_count; gpu_idx++) {
{
auto kt = kcache[gpu_idx][i][b_idx].to(torch::kCPU);
void* src = kt.data_ptr();
void* dst = offset_by_bytes(k_from_gpu[i][j], gpu_idx * element_size_per_gpu);
memcpy(dst, src, element_size_per_gpu);
}
{
auto vt = vcache[gpu_idx][i][b_idx].to(torch::kCPU);
void* src = vt.data_ptr();
void* dst = offset_by_bytes(v_from_gpu[i][j], gpu_idx * element_size_per_gpu);
memcpy(dst, src, element_size_per_gpu);
}
}
}
}
cmp_handle_data(k1, k_from_gpu, 3);
cmp_handle_data(v1, v_from_gpu, 3);
}
// // complete prefix
// {
// std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
// auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(),
// ids2.size() + 3 * config.num_token_per_page);
// auto k = h->handle_data(true);
// auto v = h->handle_data(false);
// cmp_handle_data(k1, k, 3);
// cmp_handle_data(v1, v, 3);
// }
// // common prefix
// {
// std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
// auto rids = random_ids(config.num_token_per_page * 2 + config.num_token_per_page / 2, gen);
// ids2.insert(ids2.end(), rids.begin(), rids.end());
// auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
// auto k = h->handle_data(true);
// auto v = h->handle_data(false);
// cmp_handle_data(k1, k, 3);
// cmp_handle_data(v1, v, 3);
// }
// // no prefix
// {
// std::vector<Token> ids2 = random_ids(config.num_token_per_page, gen);
// auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
// assert(h->matched_length() == 0);
// }
// // insert partly new
// auto k2 = random_kvcache(10, gen);
// auto v2 = random_kvcache(10, gen);
// copy_kvcache(k1, k2, 0, 5);
// copy_kvcache(v1, v2, 0, 5);
// auto ids2 = random_ids(10 * config.num_token_per_page, gen);
// for (size_t i = 0; i < 5 * config.num_token_per_page; i++) {
// ids2[i] = ids1[i];
// }
// kvc2->raw_insert(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, v2);
// // read new part
// {
// std::vector<Token> ids(ids2.begin(), ids2.begin() + 7 * config.num_token_per_page);
// auto h = kvc2->lookup(test_model_name, test_quant_type, ids.data(), ids.size(),
// ids.size() + 7 * config.num_token_per_page);
// auto k = h->handle_data(true);
// auto v = h->handle_data(false);
// cmp_handle_data(k, k2, 7);
// cmp_handle_data(v, v2, 7);
// }
SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
return 0;
}

View file

@ -0,0 +1,103 @@
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 08:48:40
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-22 09:53:06
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "common.hpp"
template <typename F>
void test_multi(F f) {
std::vector<std::thread> threads;
for (size_t i = 0; i < 10; i++) {
threads.emplace_back([f]() { f(); });
}
for (auto& t : threads) {
t.join();
}
}
int main(int argc, char* argv[]) {
init(argc, argv);
spdlog::set_level(spdlog::level::debug);
auto kvc2 = kvc2::create_kvc2(config);
std::mt19937 gen(123);
auto ids1 = random_ids(3 * config.num_token_per_page, gen);
auto k1 = random_kvcache(3, gen);
auto v1 = random_kvcache(3, gen);
kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, v1);
// complete same
{
#pragma omp parallel for
for (size_t i = 0; i < 10; i++) {
auto h = kvc2->lookup(test_model_name, test_quant_type, ids1.data(), ids1.size(),
ids1.size() + 10 * config.num_token_per_page);
if (h == nullptr) {
SPDLOG_WARN("Thread[{}]: h is nullptr", i);
} else {
auto k = h->handle_data(true);
auto v = h->handle_data(false);
cmp_handle_data(k1, k, 3);
cmp_handle_data(v1, v, 3);
}
}
}
// // complete prefix
// {
// std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
// auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size() + 3 *
// config.num_token_per_page); auto k = h->handle_data(true); auto v = h->handle_data(false); cmp_handle_data(k1,
// k, 3); cmp_handle_data(v1, v, 3);
// }
// // common prefix
// {
// std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
// auto rids = random_ids(config.num_token_per_page * 2 + config.num_token_per_page / 2, gen);
// ids2.insert(ids2.end(), rids.begin(), rids.end());
// auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
// auto k = h->handle_data(true);
// auto v = h->handle_data(false);
// cmp_handle_data(k1, k, 3);
// cmp_handle_data(v1, v, 3);
// }
// // no prefix
// {
// std::vector<Token> ids2 = random_ids(config.num_token_per_page, gen);
// auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
// assert(h->matched_length() == 0);
// }
// // insert partly new
// auto k2 = random_kvcache(10, gen);
// auto v2 = random_kvcache(10, gen);
// copy_kvcache(k1, k2, 0, 5);
// copy_kvcache(v1, v2, 0, 5);
// auto ids2 = random_ids(10 * config.num_token_per_page, gen);
// for (size_t i = 0; i < 5 * config.num_token_per_page; i++) {
// ids2[i] = ids1[i];
// }
// kvc2->raw_insert(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, v2);
// // read new part
// {
// std::vector<Token> ids(ids2.begin(), ids2.begin() + 7 * config.num_token_per_page);
// auto h = kvc2->lookup(test_model_name, test_quant_type, ids.data(), ids.size(), ids.size() + 7 *
// config.num_token_per_page); auto k = h->handle_data(true); auto v = h->handle_data(false); cmp_handle_data(k,
// k2, 7); cmp_handle_data(v, v2, 7);
// }
kvc2->debug();
SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
return 0;
}

View file

@ -0,0 +1,84 @@
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 08:29:45
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-22 09:56:12
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "common.hpp"
int main(int argc, char* argv[]) {
qw25_7B_gpu_config.v_cache_on = false;
config.gpu_cache_config = qw25_7B_gpu_config;
config.v_cache_on = false;
init(argc, argv);
spdlog::set_level(spdlog::level::debug);
auto kvc2 = kvc2::create_kvc2(config);
std::mt19937 gen(123);
auto ids1 = random_ids(10 * config.num_token_per_page, gen);
auto k1 = random_kvcache(10, gen);
// auto v1 = random_kvcache(10, gen);
kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, {});
// complete same
{
auto h = kvc2->lookup(test_model_name, test_quant_type, ids1.data(), ids1.size(),
ids1.size() + 10 * config.num_token_per_page);
auto k = h->handle_data(true);
cmp_handle_data(k1, k, 10);
}
// complete prefix
{
std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(),
ids2.size() + 3 * config.num_token_per_page);
auto k = h->handle_data(true);
cmp_handle_data(k1, k, 3);
}
// common prefix
{
std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
auto rids = random_ids(config.num_token_per_page * 2 + config.num_token_per_page / 2, gen);
ids2.insert(ids2.end(), rids.begin(), rids.end());
auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
auto k = h->handle_data(true);
cmp_handle_data(k1, k, 3);
}
// no prefix
{
std::vector<Token> ids2 = random_ids(config.num_token_per_page, gen);
auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
assert(h->matched_length() == 0);
}
// insert partly new
auto k2 = random_kvcache(10, gen);
copy_kvcache(k1, k2, 0, 5);
auto ids2 = random_ids(10 * config.num_token_per_page, gen);
for (size_t i = 0; i < 5 * config.num_token_per_page; i++) {
ids2[i] = ids1[i];
}
kvc2->raw_insert(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, {});
// read new part
{
std::vector<Token> ids(ids2.begin(), ids2.begin() + 7 * config.num_token_per_page);
auto h = kvc2->lookup(test_model_name, test_quant_type, ids.data(), ids.size(),
ids.size() + 7 * config.num_token_per_page);
auto k = h->handle_data(true);
cmp_handle_data(k, k2, 7);
}
SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
return 0;
}

View file

@ -0,0 +1,90 @@
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 08:29:45
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-22 09:56:12
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "common.hpp"
int main(int argc, char* argv[]) {
init(argc, argv);
spdlog::set_level(spdlog::level::debug);
auto kvc2 = kvc2::create_kvc2(config);
std::mt19937 gen(123);
auto ids1 = random_ids(10 * config.num_token_per_page, gen);
auto k1 = random_kvcache(10, gen);
auto v1 = random_kvcache(10, gen);
kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, v1);
// complete same
{
auto h = kvc2->lookup(test_model_name, test_quant_type, ids1.data(), ids1.size(),
ids1.size() + 10 * config.num_token_per_page);
auto k = h->handle_data(true);
auto v = h->handle_data(false);
cmp_handle_data(k1, k, 10);
cmp_handle_data(v1, v, 10);
}
// complete prefix
{
std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(),
ids2.size() + 3 * config.num_token_per_page);
auto k = h->handle_data(true);
auto v = h->handle_data(false);
cmp_handle_data(k1, k, 3);
cmp_handle_data(v1, v, 3);
}
// common prefix
{
std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
auto rids = random_ids(config.num_token_per_page * 2 + config.num_token_per_page / 2, gen);
ids2.insert(ids2.end(), rids.begin(), rids.end());
auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
auto k = h->handle_data(true);
auto v = h->handle_data(false);
cmp_handle_data(k1, k, 3);
cmp_handle_data(v1, v, 3);
}
// no prefix
{
std::vector<Token> ids2 = random_ids(config.num_token_per_page, gen);
auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
assert(h->matched_length() == 0);
}
// insert partly new
auto k2 = random_kvcache(10, gen);
auto v2 = random_kvcache(10, gen);
copy_kvcache(k1, k2, 0, 5);
copy_kvcache(v1, v2, 0, 5);
auto ids2 = random_ids(10 * config.num_token_per_page, gen);
for (size_t i = 0; i < 5 * config.num_token_per_page; i++) {
ids2[i] = ids1[i];
}
kvc2->raw_insert(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, v2);
// read new part
{
std::vector<Token> ids(ids2.begin(), ids2.begin() + 7 * config.num_token_per_page);
auto h = kvc2->lookup(test_model_name, test_quant_type, ids.data(), ids.size(),
ids.size() + 7 * config.num_token_per_page);
auto k = h->handle_data(true);
auto v = h->handle_data(false);
cmp_handle_data(k, k2, 7);
cmp_handle_data(v, v2, 7);
}
SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
return 0;
}

View file

@ -0,0 +1,99 @@
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 06:00:16
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-22 07:30:46
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "common.hpp"
int main(int argc, char* argv[]) {
init(argc, argv);
spdlog::set_level(spdlog::level::debug);
auto kvc2 = kvc2::create_kvc2(config);
std::mt19937 gen(123);
auto ids1 = random_ids(10 * config.num_token_per_page, gen);
auto k1 = random_kvcache(10, gen);
auto v1 = random_kvcache(10, gen);
kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, v1);
// complete same
{
auto k2 = empty_kvcache(10);
auto v2 = empty_kvcache(10);
auto l2 = kvc2->raw_read(test_model_name, test_quant_type, ids1.data(), ids1.size(), k2, v2);
assert(l2 == ids1.size());
cmp_handle_data(k1, k2);
cmp_handle_data(v1, v2);
}
// complete prefix
{
auto k2 = empty_kvcache(10);
auto v2 = empty_kvcache(10);
std::vector<Token> ids2 = std::vector<Token>(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
auto l2 = kvc2->raw_read(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, v2);
assert(l2 == 3 * config.num_token_per_page);
cmp_handle_data(k1, k2, 3);
cmp_handle_data(v1, v2, 3);
}
// common prefix
{
auto k2 = empty_kvcache(10);
auto v2 = empty_kvcache(10);
std::vector<Token> ids2 = std::vector<Token>(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
auto rids = random_ids(config.num_token_per_page * 2 + config.num_token_per_page / 2, gen);
ids2.insert(ids2.end(), rids.begin(), rids.end());
auto l2 = kvc2->raw_read(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, v2);
assert(l2 == 3 * config.num_token_per_page);
cmp_handle_data(k1, k2, 3);
cmp_handle_data(v1, v2, 3);
}
// no prefix
{
auto k2 = empty_kvcache(1);
auto v2 = empty_kvcache(1);
std::vector<Token> ids2 = random_ids(config.num_token_per_page, gen);
auto l2 = kvc2->raw_read(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, v2);
assert(l2 == 0);
}
// insert partly new
auto k2 = random_kvcache(10, gen);
auto v2 = random_kvcache(10, gen);
copy_kvcache(k1, k2, 0, 5);
copy_kvcache(v1, v2, 0, 5);
auto ids2 = random_ids(10 * config.num_token_per_page, gen);
for (size_t i = 0; i < 5 * config.num_token_per_page; i++) {
ids2[i] = ids1[i];
}
kvc2->raw_insert(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, v2);
// read new part
{
auto k = empty_kvcache(10);
auto v = empty_kvcache(10);
std::vector<Token> ids = std::vector<Token>(ids2.begin(), ids2.begin() + 7 * config.num_token_per_page);
auto l = kvc2->raw_read(test_model_name, test_quant_type, ids.data(), ids.size(), k, v);
assert(l == 7 * config.num_token_per_page);
cmp_handle_data(k, k2, 7);
cmp_handle_data(v, v2, 7);
}
SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
return 0;
}

View file

@ -0,0 +1,87 @@
#include "kvcache_test_utils.cpp"
int main(int argc, char* argv[]) {
parse_and_check(argc, argv);
spdlog::set_level(spdlog::level::debug);
std::mt19937 gen(123);
KVC2 kvc2(FLAGS_disk_cache_path);
// auto io = kvc2.io_dealer->start_io_thread();
kvc2.io_dealer->start_io_thread().detach();
auto h1 = random_kvcache(qwen_cache_info, 10, gen);
h1.ids = random_ids(10 * BlockLength, gen);
kvc2.raw_insert(h1);
// complete same
{
auto h2 = empty_kvcache(qwen_cache_info, 10);
h2.ids = h1.ids;
kvc2.raw_read(h2);
assert(static_cast<size_t>(h2.match.match_length) == h1.ids.size());
cmp_handle_data(h1, h2);
}
// complete prefix
{
auto h2 = empty_kvcache(qwen_cache_info, 10);
h2.ids = std::vector<ID>(h1.ids.begin(), h1.ids.begin() + 3 * BlockLength);
kvc2.raw_read(h2);
assert(h2.match.match_length == 3 * BlockLength);
cmp_handle_data(h1, h2, 3);
}
// common prefix
{
auto h2 = empty_kvcache(qwen_cache_info, 10);
h2.ids = std::vector<ID>(h1.ids.begin(), h1.ids.begin() + 5 * BlockLength);
auto rids = random_ids(BlockLength * 2 + BlockLength / 2, gen);
h2.ids.insert(h2.ids.end(), rids.begin(), rids.end());
kvc2.raw_read(h2);
assert(h2.match.match_length == 5 * BlockLength);
cmp_handle_data(h1, h2, 5);
}
// no prefix
{
auto h2 = empty_kvcache(qwen_cache_info, 10);
h2.ids = random_ids(10 * BlockLength, gen);
kvc2.raw_read(h2);
assert(h2.match.match_length == 0);
}
// insert partly new
auto h2 = random_kvcache(qwen_cache_info, 10, gen);
copy_kvcache(h1, h2, 0, 5);
h2.ids = random_ids(10 * BlockLength, gen);
for (size_t i = 0; i < 5 * BlockLength; i++) {
h2.ids[i] = h1.ids[i];
}
kvc2.raw_insert(h2);
// read new part
{
auto h = empty_kvcache(qwen_cache_info, 10);
h.ids = std::vector<ID>(h2.ids.begin(), h2.ids.begin() + 7 * BlockLength);
h.ids.push_back(123);
kvc2.raw_read(h);
assert(h.match.match_length == 7 * BlockLength);
cmp_handle_data(h, h2, 7);
}
kvc2.tree->debug();
kvc2.io_dealer->stop();
// io.join();
SPDLOG_WARN("{} Test Passed", __FILE__);
return 0;
}

View file

@ -0,0 +1,52 @@
#include "kvcache_test_utils.cpp"
int main(int argc, char* argv[]) {
parse_and_check(argc, argv);
spdlog::set_level(spdlog::level::debug);
std::mt19937 gen(123);
KVC2 kvc2(FLAGS_disk_cache_path);
auto io = kvc2.io_dealer->start_io_thread();
SPDLOG_WARN("Insert 10 x 10 KVCache");
std::vector<KVCacheHandle> handles(10);
for (int i = 0; i < 10; i++) {
handles[i] = random_kvcache(qwen_cache_info, 10, gen);
auto& h1 = handles[i];
h1.ids = random_ids(10 * BlockLength, gen);
kvc2.raw_insert(h1);
}
SPDLOG_WARN("Cache Eviction Test");
{
for (int i = 0; i < 10; i++) {
auto& h = handles[i];
SPDLOG_WARN("Lookup {}", i);
auto x = kvc2.lookup(qwen_cache_info, h.ids.data(), h.ids.size());
cmp_handle_data(h, *x);
}
SPDLOG_WARN("Simple Eviction OK");
}
{
std::vector<std::shared_ptr<KVCacheHandle>> lookup_handles;
for (int i = 0; i < 10; i++) {
auto& h = handles[i];
SPDLOG_WARN("Lookup {}", i);
auto x = kvc2.lookup(qwen_cache_info, h.ids.data(), h.ids.size());
if (i >= 5) {
assert(x == nullptr);
continue;
}
lookup_handles.push_back(x);
cmp_handle_data(h, *x);
}
SPDLOG_WARN("Cannot Eviction OK");
}
kvc2.io_dealer->stop();
io.join();
SPDLOG_WARN("{} Test Passed", __FILE__);
return 0;
}

View file

@ -0,0 +1,104 @@
#include "kvcache_test_utils.cpp"
int main(int argc, char* argv[]) {
parse_and_check(argc, argv);
spdlog::set_level(spdlog::level::debug);
std::mt19937 gen(123);
KVC2 kvc2(FLAGS_disk_cache_path);
auto io = kvc2.io_dealer->start_io_thread();
SPDLOG_INFO("Disk Test");
auto h1 = random_kvcache(qwen_cache_info, 10, gen);
h1.ids = random_ids(10 * BlockLength, gen);
kvc2.raw_insert(h1);
// complete same
{
auto h2 = empty_kvcache(qwen_cache_info, 10);
h2.ids = h1.ids;
kvc2.raw_read(h2);
assert(static_cast<size_t>(h2.match.match_length) == h1.ids.size());
cmp_handle_data(h1, h2);
}
// complete prefix
{
auto h2 = empty_kvcache(qwen_cache_info, 10);
h2.ids = std::vector<ID>(h1.ids.begin(), h1.ids.begin() + 3 * BlockLength);
kvc2.raw_read(h2);
assert(h2.match.match_length == 3 * BlockLength);
cmp_handle_data(h1, h2, 3);
}
// common prefix
{
auto h2 = empty_kvcache(qwen_cache_info, 10);
h2.ids = std::vector<ID>(h1.ids.begin(), h1.ids.begin() + 5 * BlockLength);
auto rids = random_ids(BlockLength * 2 + BlockLength / 2, gen);
h2.ids.insert(h2.ids.end(), rids.begin(), rids.end());
kvc2.raw_read(h2);
assert(h2.match.match_length == 5 * BlockLength);
cmp_handle_data(h1, h2, 5);
}
// no prefix
{
auto h2 = empty_kvcache(qwen_cache_info, 10);
h2.ids = random_ids(10 * BlockLength, gen);
kvc2.raw_read(h2);
assert(h2.match.match_length == 0);
}
// insert partly new
auto h2 = random_kvcache(qwen_cache_info, 10, gen);
copy_kvcache(h1, h2, 0, 5);
h2.ids = random_ids(10 * BlockLength, gen);
for (size_t i = 0; i < 5 * BlockLength; i++) {
h2.ids[i] = h1.ids[i];
}
kvc2.raw_insert(h2);
// read new part
{
auto h = empty_kvcache(qwen_cache_info, 10);
h.ids = std::vector<ID>(h2.ids.begin(), h2.ids.begin() + 7 * BlockLength);
h.ids.push_back(123);
kvc2.raw_read(h);
assert(h.match.match_length == 7 * BlockLength);
cmp_handle_data(h, h2, 7);
}
SPDLOG_WARN("Memory Test");
{
auto h = kvc2.lookup(qwen_cache_info, h1.ids.data(), h1.ids.size());
assert(h);
cmp_handle_data(h1, *h);
kvc2.block_cache->debug();
}
kvc2.block_cache->debug();
{
auto h = kvc2.lookup(qwen_cache_info, h1.ids.data(), 5 * BlockLength);
assert(h);
cmp_handle_data(h1, *h, 5);
kvc2.block_cache->debug();
}
kvc2.block_cache->debug();
kvc2.io_dealer->stop();
io.join();
SPDLOG_WARN("{} Test Passed", __FILE__);
return 0;
}

View file

@ -0,0 +1,102 @@
#include "kvcache_test_utils.cpp"
int main(int argc, char* argv[]) {
parse_and_check(argc, argv);
spdlog::set_level(spdlog::level::debug);
std::mt19937 gen(123);
std::vector<KVCacheHandle> handles(10);
{
KVC2 kvc2(FLAGS_disk_cache_path);
auto io = kvc2.io_dealer->start_io_thread();
SPDLOG_WARN("Insert 10 x 10 KVCache");
for (int i = 0; i < 10; i++) {
handles[i] = random_kvcache(qwen_cache_info, 10, gen);
auto& h1 = handles[i];
h1.ids = random_ids(10 * BlockLength, gen);
kvc2.raw_insert(h1);
}
kvc2.save();
kvc2.tree->debug();
kvc2.io_dealer->stop();
io.join();
}
{
KVC2 kvc2(FLAGS_disk_cache_path);
auto io = kvc2.io_dealer->start_io_thread();
kvc2.load();
kvc2.tree->debug();
auto& h1 = handles[0];
// complete same
{
auto h2 = empty_kvcache(qwen_cache_info, 10);
h2.ids = h1.ids;
kvc2.raw_read(h2);
assert(static_cast<size_t>(h2.match.match_length) == h1.ids.size());
cmp_handle_data(h1, h2);
}
// complete prefix
{
auto h2 = empty_kvcache(qwen_cache_info, 10);
h2.ids = std::vector<ID>(h1.ids.begin(), h1.ids.begin() + 3 * BlockLength);
kvc2.raw_read(h2);
assert(h2.match.match_length == 3 * BlockLength);
cmp_handle_data(h1, h2, 3);
}
// common prefix
{
auto h2 = empty_kvcache(qwen_cache_info, 10);
h2.ids = std::vector<ID>(h1.ids.begin(), h1.ids.begin() + 5 * BlockLength);
auto rids = random_ids(BlockLength * 2 + BlockLength / 2, gen);
h2.ids.insert(h2.ids.end(), rids.begin(), rids.end());
kvc2.raw_read(h2);
assert(h2.match.match_length == 5 * BlockLength);
cmp_handle_data(h1, h2, 5);
}
// no prefix
{
auto h2 = empty_kvcache(qwen_cache_info, 10);
h2.ids = random_ids(10 * BlockLength, gen);
kvc2.raw_read(h2);
assert(h2.match.match_length == 0);
}
// insert partly new
auto h2 = random_kvcache(qwen_cache_info, 10, gen);
copy_kvcache(h1, h2, 0, 5);
h2.ids = random_ids(10 * BlockLength, gen);
for (size_t i = 0; i < 5 * BlockLength; i++) {
h2.ids[i] = h1.ids[i];
}
kvc2.raw_insert(h2);
// read new part
{
auto h = empty_kvcache(qwen_cache_info, 10);
h.ids = std::vector<ID>(h2.ids.begin(), h2.ids.begin() + 7 * BlockLength);
h.ids.push_back(123);
kvc2.raw_read(h);
assert(h.match.match_length == 7 * BlockLength);
cmp_handle_data(h, h2, 7);
}
kvc2.io_dealer->stop();
io.join();
}
SPDLOG_WARN("{} Test Passed", __FILE__);
return 0;
}

View file

@ -0,0 +1,57 @@
#include <unistd.h>
#include <iostream>
#include <random>
#include <thread>
#include <vector>
#include "page_aligned_memory_pool.cpp"
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
// Task executed by each thread
void thread_task(PageAlignedMemoryPool& pool) {
std::mt19937 gen(123);
std::vector<std::pair<void*, size_t>> allocated;
size_t cnt = 40000;
for (size_t i = 0; i < cnt; ++i) {
// Pick a random allocation size
size_t size = (gen() % 100 + 1) * 4096 * 4;
void* ptr = pool.alloc(size);
// SPDLOG_DEBUG(pool.debug());
if (ptr) {
pool.free(ptr, size);
// allocated.push_back({ptr, size});
}
// sleep((int)(gen() % 1000) / 1000.0);
}
// free all memory
for (auto& p : allocated) {
pool.free(p.first, p.second);
}
}
int main(int argc, char* argv[]) {
spdlog::set_level(spdlog::level::debug);
// Create a memory pool
PageAlignedMemoryPool pool(40ll * 1024 * 1024 * 1024); // 40 G
// Create worker threads
const int num_threads = 32;
std::vector<std::thread> threads;
for (int i = 0; i < num_threads; ++i) {
threads.emplace_back(thread_task, std::ref(pool));
}
// Wait for all threads to finish
for (auto& t : threads) {
t.join();
}
// Print debug information
std::cout << pool.debug() << std::endl;
return 0;
}

View file

@ -0,0 +1,61 @@
import sys
sys.path.append('./build')
sys.path.append('./src')
import torch
import kvc2_ext
from kvc2_utils import get_tensor_from_data_ptr
# Create a kvc2 instance
path = "/mnt/data/kvc2"
kvc2_instance = kvc2_ext.create_kvc2(path,int(10e9)) # 10 G memory pool
kvc2_ext.load(kvc2_instance)
# Start IO thread
print("Start IO thread")
kvc2_ext.start_io_thread(kvc2_instance)
print("IO thread started")
# Create CacheInfoInput
test_info = kvc2_ext.CacheInfoInput()
test_info.model_type = kvc2_ext.ModelType.MT_DeepseekV2
test_info.cache_type = kvc2_ext.CacheType.CT_KeyCache
test_info.quant_type = kvc2_ext.QuantType.QT_F32
print("Element size: ", test_info.element_size())
# Generate random test IDs (length = 2560)
torch.manual_seed(123)
length = 2560
test_id = torch.randint(0, 65536, (length,), dtype=torch.uint16).contiguous()
block_count = (length+255) // 256
# print("Test ID: ", test_id)
# Generate test data based on element size and hidden layer count
element_size = test_info.element_size()
hidden_layer_count = test_info.hidden_layer_count()
def read_cmp_and_release(kvc2_instance,cache_info,ids,length):
handle = kvc2_ext.lookup(kvc2_instance, cache_info, ids, length)
if kvc2_ext.is_nullptr(handle):
print("Handle is nullptr.")
exit()
matched_length = kvc2_ext.matched_length(handle)
matched_data = kvc2_ext.handle_data(handle)
print('Matched length: ', matched_length)
if matched_length >0:
print(f'First layer address {[hex(x) for x in matched_data[0]]}')
read_data = get_tensor_from_data_ptr(matched_data,element_size)
print("Just read check ok.")
kvc2_ext.release(handle)
l = 128
while l<=length:
read_cmp_and_release(kvc2_instance,test_info,test_id.data_ptr(),l)
l+=128
kvc2_ext.destroy_kvc2(kvc2_instance)
print("Test completed successfully.")

View file

@ -0,0 +1,83 @@
import sys
sys.path.append('./build')
sys.path.append('./src')
import torch
import kvc2_ext
from kvc2_utils import alloc_aligned_cache,dealloc_aligned_cache,get_tensor_ptr,get_tensor_from_data_ptr
# Create a kvc2 instance
path = "/mnt/data/kvc2"
kvc2_instance = kvc2_ext.create_kvc2(path,int(10e9)) # 10 G memory pool
# Start IO thread
print("Start IO thread")
kvc2_ext.start_io_thread(kvc2_instance)
print("IO thread started")
# Create CacheInfoInput
test_info = kvc2_ext.CacheInfoInput()
test_info.model_type = kvc2_ext.ModelType.MT_DeepseekV2
test_info.cache_type = kvc2_ext.CacheType.CT_KeyCache
test_info.quant_type = kvc2_ext.QuantType.QT_F32
print("Element size: ", test_info.element_size())
# Generate random test IDs (length = 2560)
torch.manual_seed(123)
length = 2560
test_id = torch.randint(0, 65536, (length,), dtype=torch.uint16).contiguous()
block_count = (length+255) // 256
# print("Test ID: ", test_id)
# Generate test data based on element size and hidden layer count
element_size = test_info.element_size()
hidden_layer_count = test_info.hidden_layer_count()
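# write_data: per-layer lists of tensors backed by page-aligned buffers; write_data_mem keeps the raw allocations alive for dealloc_aligned_cache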
write_data,write_data_mem = alloc_aligned_cache(hidden_layer_count,block_count,element_size)
# print(test_data,test_data_mem)
print('Generate Insert Data')
for layer in write_data:
for data in layer:
random_values = torch.randint(0, 256, (element_size,), dtype=torch.uint8)
data.copy_(random_values)
print('Insert New data')
# Insert raw data
kvc2_ext.raw_insert(kvc2_instance, test_info, test_id.data_ptr(), length, get_tensor_ptr(write_data))
def read_cmp_and_release(kvc2_instance,cache_info,ids,length):
handle = kvc2_ext.lookup(kvc2_instance, cache_info, ids, length)
if kvc2_ext.is_nullptr(handle):
print("Handle is nullptr.")
exit()
matched_length = kvc2_ext.matched_length(handle)
matched_data = kvc2_ext.handle_data(handle)
print('Matched length: ', matched_length)
if matched_length >0:
print(f'First layer address {[hex(x) for x in matched_data[0]]}')
read_data = get_tensor_from_data_ptr(matched_data,element_size)
for layer_w,layer_r in zip(write_data,read_data):
for data_w,data_r in zip(layer_w,layer_r):
# print(data_w,data_r)
assert torch.equal(data_w,data_r)
print("Lookup read check ok.")
kvc2_ext.release(handle)
l = 128
while l<=length:
read_cmp_and_release(kvc2_instance,test_info,test_id.data_ptr(),l)
l+=128
dealloc_aligned_cache(write_data_mem)
kvc2_ext.save(kvc2_instance)
kvc2_ext.destroy_kvc2(kvc2_instance)
print("Test completed successfully.")

View file

@ -0,0 +1,72 @@
import sys
sys.path.append('./build')
sys.path.append('./src')
import torch
import kvc2_ext
from kvc2_utils import alloc_aligned_cache,dealloc_aligned_cache,get_tensor_ptr,get_tensor_from_data_ptr
# Create a kvc2 instance
path = "/mnt/data/kvc2"
kvc2_instance = kvc2_ext.create_kvc2(path,int(10e9)) # 10 G memory pool
# Start IO thread
print("Start IO thread")
kvc2_ext.start_io_thread(kvc2_instance)
print("IO thread started")
# Create CacheInfoInput
test_info = kvc2_ext.CacheInfoInput()
test_info.model_type = kvc2_ext.ModelType.MT_DeepseekV2
test_info.cache_type = kvc2_ext.CacheType.CT_KeyCache
test_info.quant_type = kvc2_ext.QuantType.QT_F32
print("Element size: ", test_info.element_size())
# Generate random test IDs (length = 2560)
length = 2560
test_id = torch.randint(0, 65536, (length,), dtype=torch.uint16).contiguous()
block_count = (length+255) // 256
# print("Test ID: ", test_id)
# Generate test data based on element size and hidden layer count
element_size = test_info.element_size()
hidden_layer_count = test_info.hidden_layer_count()
write_data,write_data_mem = alloc_aligned_cache(hidden_layer_count,block_count,element_size)
# print(test_data,test_data_mem)
print('Generate Insert Data')
for layer in write_data:
for data in layer:
random_values = torch.randint(0, 256, (element_size,), dtype=torch.uint8)
data.copy_(random_values)
print('Insert New data')
# Insert raw data
kvc2_ext.raw_insert(kvc2_instance, test_info, test_id.data_ptr(), length, get_tensor_ptr(write_data))
handle = kvc2_ext.lookup(kvc2_instance, test_info, test_id.data_ptr(), length)
matched_length = kvc2_ext.matched_length(handle)
matched_data = kvc2_ext.handle_data(handle)
print('Matched length: ', matched_length)
print(f'Match data layer {len(matched_data)}')
print(f'Match layer block count {len(matched_data[0])}')
read_data = get_tensor_from_data_ptr(matched_data,element_size)
for layer_w,layer_r in zip(write_data,read_data):
for data_w,data_r in zip(layer_w,layer_r):
# print(data_w,data_r)
assert torch.equal(data_w,data_r)
print("Lookup read check ok.")
dealloc_aligned_cache(write_data_mem)
kvc2_ext.save(kvc2_instance)
print("Test completed successfully.")

View file

@ -0,0 +1,69 @@
import sys
sys.path.append('./build')
sys.path.append('./src')
import torch
import kvc2_ext
from kvc2_utils import alloc_aligned_cache,dealloc_aligned_cache,get_tensor_ptr
# Create a kvc2 instance
path = "/mnt/data/kvc2"
kvc2_instance = kvc2_ext.create_kvc2(path,int(10e9)) # 10 G memory pool
# Start IO thread
print("Start IO thread")
kvc2_ext.start_io_thread(kvc2_instance)
print("IO thread started")
# Create CacheInfoInput
test_info = kvc2_ext.CacheInfoInput()
test_info.model_type = kvc2_ext.ModelType.MT_DeepseekV2
test_info.cache_type = kvc2_ext.CacheType.CT_KeyCache
test_info.quant_type = kvc2_ext.QuantType.QT_F32
print("Element size: ", test_info.element_size())
# Generate random test IDs (length = 2560)
length = 2560
test_id = torch.randint(0, 65536, (length,), dtype=torch.uint16).contiguous()
block_count = (length+255) // 256
# print("Test ID: ", test_id)
# Generate test data based on element size and hidden layer count
element_size = test_info.element_size()
hidden_layer_count = test_info.hidden_layer_count()
write_data,write_data_mem = alloc_aligned_cache(hidden_layer_count,block_count,element_size)
# print(test_data,test_data_mem)
print('Generate Insert Data')
for layer in write_data:
for data in layer:
random_values = torch.randint(0, 256, (element_size,), dtype=torch.uint8)
data.copy_(random_values)
print('Insert New data')
# Insert raw data
kvc2_ext.raw_insert(kvc2_instance, test_info, test_id.data_ptr(), length, get_tensor_ptr(write_data))
read_data,read_data_mem = alloc_aligned_cache(hidden_layer_count,block_count,element_size)
print('Raw read')
matched_length = kvc2_ext.raw_read(kvc2_instance, test_info, test_id.data_ptr(), length,get_tensor_ptr(read_data))
print('Matched length: ', matched_length)
for layer_w,layer_r in zip(write_data,read_data):
for data_w,data_r in zip(layer_w,layer_r):
# print(data_w,data_r)
assert torch.equal(data_w,data_r)
print("Raw read check ok.")
dealloc_aligned_cache(write_data_mem)
dealloc_aligned_cache(read_data_mem)
kvc2_ext.save(kvc2_instance)
print("Test completed successfully.")

View file

@ -0,0 +1,32 @@
import ctypes
import torch
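# Allocate `size` bytes with posix_memalign and wrap them in an int8 tensor; the caller keeps `mem` and must free it manually.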
def aligned_tensor(size, alignment=4096):
num_bytes = size
mem = ctypes.c_void_p()
error_code = ctypes.CDLL(None).posix_memalign(
ctypes.byref(mem), ctypes.c_size_t(alignment), ctypes.c_size_t(num_bytes)
)
if error_code != 0:
raise MemoryError(f"posix_memalign failed with error code {error_code}")
array_type = (ctypes.c_int8 * size)
raw_array = array_type.from_address(mem.value)
tensor = torch.frombuffer(raw_array, dtype=torch.int8)
if tensor.data_ptr() % alignment != 0:
raise ValueError(f"Tensor data_ptr {tensor.data_ptr()} is not aligned to {alignment} bytes")
return tensor, mem
size = 5124380
tensor, mem_ptr = aligned_tensor(size, alignment=4096)
print(f"Tensor: {tensor}, size: {tensor.size()}, dataptr: {tensor.data_ptr()}")
print(f"Tensor memory alignment: {tensor.data_ptr() % 4096 == 0}")
print(f"Allocated memory address: {mem_ptr.value}")
ctypes.CDLL(None).free(mem_ptr)

View file

@ -0,0 +1,145 @@
#include <cuda_runtime.h>
#include <functional>
#include <iostream>
#include <stdexcept>
#include <vector>
class CudaStreamManager {
public:
CudaStreamManager(int num_streams);
~CudaStreamManager();
// Request structure
struct Request {
std::vector<void*> host_mem_addresses;
std::vector<void*> device_mem_addresses;
std::vector<size_t> sizes;
cudaMemcpyKind direction;
std::function<void()> callback;
};
void submitRequest(const Request& request);
private:
int num_streams_;
std::vector<cudaStream_t> streams_;
int next_stream_index_;
};
CudaStreamManager::CudaStreamManager(int num_streams) : num_streams_(num_streams), next_stream_index_(0) {
streams_.resize(num_streams_);
for (int i = 0; i < num_streams_; ++i) {
cudaError_t err = cudaStreamCreate(&streams_[i]);
if (err != cudaSuccess) {
std::cerr << "Failed to create CUDA stream: " << cudaGetErrorString(err) << std::endl;
for (int j = 0; j < i; ++j) {
cudaStreamDestroy(streams_[j]);
}
throw std::runtime_error("Failed to create CUDA stream");
}
}
}
CudaStreamManager::~CudaStreamManager() {
for (int i = 0; i < num_streams_; ++i) {
cudaStreamDestroy(streams_[i]);
}
}
void CudaStreamManager::submitRequest(const Request& request) {
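// Pick a stream in round-robin order so successive requests can overlap on different streams.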
int stream_index = next_stream_index_;
cudaStream_t stream = streams_[stream_index];
next_stream_index_ = (next_stream_index_ + 1) % num_streams_;
size_t num_transfers = request.host_mem_addresses.size();
for (size_t i = 0; i < num_transfers; ++i) {
cudaError_t err = cudaMemcpyAsync(request.device_mem_addresses[i], request.host_mem_addresses[i], request.sizes[i],
request.direction, stream);
if (err != cudaSuccess) {
std::cerr << "cudaMemcpyAsync failed: " << cudaGetErrorString(err) << std::endl;
throw std::runtime_error("cudaMemcpyAsync failed");
}
}
// Enqueue the callback function
struct CallbackData {
std::function<void()> callback;
};
CallbackData* cb_data = new CallbackData{request.callback};
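// cudaLaunchHostFunc runs the callback on a CUDA-internal thread once all prior work in the stream completes; cb_data is deleted inside that host function.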
cudaError_t err = cudaLaunchHostFunc(
stream,
[](void* data) {
CallbackData* cb_data = static_cast<CallbackData*>(data);
cb_data->callback();
delete cb_data;
},
cb_data);
if (err != cudaSuccess) {
std::cerr << "cudaLaunchHostFunc failed: " << cudaGetErrorString(err) << std::endl;
throw std::runtime_error("cudaLaunchHostFunc failed");
}
}
// Example usage
int main() {
try {
CudaStreamManager stream_manager(4); // Create a manager with 4 streams
// Prepare host and device memory
const size_t num_pages = 10;
std::vector<void*> host_mem_addresses(num_pages);
std::vector<void*> device_mem_addresses(num_pages);
std::vector<size_t> sizes(num_pages, 4096); // 4KB pages
// Allocate host memory
for (size_t i = 0; i < num_pages; ++i) {
host_mem_addresses[i] = malloc(4096);
if (!host_mem_addresses[i]) {
throw std::runtime_error("Failed to allocate host memory");
}
// Initialize data if necessary
}
// Allocate device memory
for (size_t i = 0; i < num_pages; ++i) {
cudaError_t err = cudaMalloc(&device_mem_addresses[i], 4096);
if (err != cudaSuccess) {
std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << std::endl;
throw std::runtime_error("cudaMalloc failed");
}
}
// Create a request
CudaStreamManager::Request request;
request.host_mem_addresses = host_mem_addresses;
request.device_mem_addresses = device_mem_addresses;
request.sizes = sizes;
request.direction = cudaMemcpyHostToDevice;
request.callback = []() { std::cout << "Data transfer completed!" << std::endl; };
// Submit the request
stream_manager.submitRequest(request);
// Wait for all streams to complete
cudaError_t err = cudaDeviceSynchronize();
if (err != cudaSuccess) {
std::cerr << "cudaDeviceSynchronize failed: " << cudaGetErrorString(err) << std::endl;
throw std::runtime_error("cudaDeviceSynchronize failed");
}
// Clean up
for (size_t i = 0; i < num_pages; ++i) {
free(host_mem_addresses[i]);
cudaFree(device_mem_addresses[i]);
}
} catch (const std::exception& e) {
std::cerr << "Exception: " << e.what() << std::endl;
return 1;
}
return 0;
}

View file

@ -0,0 +1,113 @@
#include "cuda_stream_manager.hh"
#include <cuda_runtime.h>
#include <chrono>
#include <functional>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <thread>
#include <vector>
int main() {
try {
int num_devices = 0;
cudaError_t err = cudaGetDeviceCount(&num_devices);
if (err != cudaSuccess) {
std::cerr << "cudaGetDeviceCount failed: " << cudaGetErrorString(err) << std::endl;
return 1;
}
if (num_devices < 1) {
std::cerr << "未找到 CUDA 设备。" << std::endl;
return 1;
}
std::vector<size_t> device_ids;
for (int i = 0; i < num_devices; ++i) {
device_ids.push_back(i);
}
const size_t num_pages = 10;
const size_t page_size = 4096; // 4 KB per page
// Create a CudaStreamManager instance that manages all devices
CudaStreamManager stream_manager(device_ids, 4);
// Prepare host memory and the per-device device memory mappings
std::vector<std::vector<void*>> host_mem_addresses(num_devices);
std::vector<std::vector<void*>> device_mem_addresses(num_devices);
// Allocate host memory
for (size_t i = 0; i < num_pages; ++i) {
void* host_ptr = malloc(page_size);
if (!host_ptr) {
throw std::runtime_error("Failed to allocate host memory");
}
// Initialize the data here if needed
// Register the same host memory in every device's list
for (int device_id = 0; device_id < num_devices; ++device_id) {
host_mem_addresses[device_id].push_back(host_ptr);
}
}
// Allocate device memory on each device
for (int device_id = 0; device_id < num_devices; ++device_id) {
err = cudaSetDevice(device_id);
if (err != cudaSuccess) {
std::cerr << "cudaSetDevice failed: " << cudaGetErrorString(err) << std::endl;
throw std::runtime_error("cudaSetDevice failed");
}
for (size_t i = 0; i < num_pages; ++i) {
void* device_ptr;
err = cudaMalloc(&device_ptr, page_size);
if (err != cudaSuccess) {
std::cerr << "cudaMalloc failed on device " << device_id << ": " << cudaGetErrorString(err) << std::endl;
throw std::runtime_error("cudaMalloc failed");
}
device_mem_addresses[device_id].push_back(device_ptr);
}
}
// Create and submit one request per device
for (int device_id = 0; device_id < num_devices; ++device_id) {
auto request = std::shared_ptr<CudaStreamManager::Request>(new CudaStreamManager::Request);
request->device_id = device_id;
request->host_mem_addresses = host_mem_addresses[device_id];
request->device_mem_addresses = device_mem_addresses[device_id];
request->sizes = std::vector<size_t>(num_pages, page_size);
request->direction = cudaMemcpyHostToDevice;
request->callback = [device_id]() {
std::cout << "Device " << device_id << " data transfer completed!" << std::endl;
};
stream_manager.submitRequest(request);
}
// Wait a while so that all requests get processed
// A real application should use a proper synchronization mechanism instead
std::this_thread::sleep_for(std::chrono::seconds(5));
// Free host memory
for (size_t i = 0; i < num_pages; ++i) {
free(host_mem_addresses[0][i]); // all devices share the same host memory, so free it only once
}
// Free device memory
for (int device_id = 0; device_id < num_devices; ++device_id) {
err = cudaSetDevice(device_id);
if (err != cudaSuccess) {
std::cerr << "cudaSetDevice failed during cleanup: " << cudaGetErrorString(err) << std::endl;
continue;
}
for (void* ptr : device_mem_addresses[device_id]) {
cudaFree(ptr);
}
}
} catch (const std::exception& e) {
std::cerr << "异常: " << e.what() << std::endl;
return 1;
}
return 0;
}

View file

@ -0,0 +1,56 @@
#include <chrono>
#include <future>
#include <iostream>
#include <memory>
#include <thread>
#include <vector>
#include "utils/lock_free_queue.hpp"
struct Item {
int value;
std::promise<void> promise;
};
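// Each producer enqueues items and waits on the item's promise until the single consumer marks it processed.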
int main() {
MPSCQueue<Item> queue;
std::vector<std::thread> producers;
const int num_producers = 4;
const int items_per_producer = 5;
// Start producer threads
for (int i = 0; i < num_producers; ++i) {
producers.emplace_back([&queue, i]() {
for (int j = 0; j < items_per_producer; ++j) {
auto item = std::make_shared<Item>();
item->value = i * items_per_producer + j;
std::future<void> future = item->promise.get_future();
queue.enqueue(item);
future.wait(); // wait for the consumer to finish processing this item
}
});
}
// Start the consumer thread
std::thread consumer([&queue, num_producers, items_per_producer]() {
int total_items = num_producers * items_per_producer;
int processed = 0;
while (processed < total_items) {
std::shared_ptr<Item> item = queue.dequeue();
if (item) {
std::cout << "Consumed item with value: " << item->value << std::endl;
item->promise.set_value(); // notify the producer
++processed;
} else {
// If the queue is empty, sleep or yield the thread
std::this_thread::yield();
}
}
});
// Wait for all threads to finish
for (auto& producer : producers) {
producer.join();
}
consumer.join();
return 0;
}

View file

@ -0,0 +1,163 @@
#include <atomic>
#include <cassert>
#include <chrono>
#include <cstdio>
#include <future>
#include <iostream>
#include <thread>
#include "utils/periodic_task.hpp"
// 1. Does the task run as expected?
void testPeriodicTaskExecution() {
std::atomic<int> execution_count{0};
auto task = [&execution_count]() { execution_count++; };
periodic::PeriodicTask periodic_task(task, std::chrono::milliseconds(50));
std::this_thread::sleep_for(std::chrono::seconds(2));
assert(execution_count >= 20); // make sure the task ran at least 20 times
std::cout << "Test 1 passed: Task executed periodically." << std::endl;
std::cout << "Task executed " << execution_count.load() << " times." << std::endl;
}
// 2. Waking the task up ahead of schedule
void testWakeUpImmediately() {
std::atomic<int> execution_count{0};
auto task = [&execution_count]() { execution_count++; };
periodic::PeriodicTask periodic_task(task, std::chrono::milliseconds(200));
// Wake the task up early
periodic_task.wakeUp();
std::this_thread::sleep_for(std::chrono::milliseconds(50)); // wait for the task to run
std::cout << "Execution count after wakeUp: " << execution_count.load() << std::endl;
assert(execution_count == 1); // make sure the task ran immediately
std::cout << "Test 2 passed: Task woke up immediately." << std::endl;
}
// 3. The waiting behavior of wakeUpWait()
void testWakeUpWait() {
std::promise<void> promise;
std::future<void> future = promise.get_future();
auto task = [&promise]() {
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // simulate task work
promise.set_value(); // set the promise when the task completes
};
periodic::PeriodicTask periodic_task(task, std::chrono::milliseconds(200));
// Call wakeUpWait and wait for the task to finish
std::future<void> wakeup_future = periodic_task.wakeUpWait();
wakeup_future.wait(); // wait for the task to finish
assert(wakeup_future.valid()); // make sure the future is valid
std::cout << "Test 3 passed: wakeUpWait() works correctly." << std::endl;
std::cout << "wakeUpWait() future is valid." << std::endl;
}
// 4. Handling of exceptions thrown by the task
void testTaskExceptionHandling() {
auto task = []() { throw std::runtime_error("Test exception"); };
periodic::PeriodicTask periodic_task(task, std::chrono::milliseconds(200));
std::this_thread::sleep_for(std::chrono::milliseconds(300)); // wait for a while
std::cout << "Test 4 passed: Task exception is handled correctly." << std::endl;
std::cout << "Exception handled and task did not crash." << std::endl;
}
// 5. Whether the task thread stops correctly
void testTaskStop() {
std::atomic<bool> stopped{false};
auto task = [&stopped]() {
while (!stopped) {
std::this_thread::sleep_for(std::chrono::milliseconds(50));
}
};
periodic::PeriodicTask periodic_task(task, std::chrono::milliseconds(100));
std::this_thread::sleep_for(std::chrono::seconds(1)); // run for a while
stopped = true; // request stop
std::this_thread::sleep_for(std::chrono::milliseconds(50)); // wait for the thread to stop
std::cout << "Test 5 passed: Task thread stops correctly." << std::endl;
std::cout << "Task has been stopped successfully." << std::endl;
}
// 6. Whether the task behaves correctly under high-frequency wake-ups
void testHighFrequencyWakeUp() {
std::atomic<int> execution_count{0};
auto task = [&execution_count]() { execution_count++; };
periodic::PeriodicTask periodic_task(task, std::chrono::milliseconds(200));
for (int i = 0; i < 100; ++i) {
periodic_task.wakeUp();
std::this_thread::sleep_for(std::chrono::milliseconds(10)); // wake up every 10 ms
}
std::this_thread::sleep_for(std::chrono::seconds(1)); // wait for pending executions to finish
assert(execution_count > 50); // make sure the task ran more than 50 times
std::cout << "Test 6 passed: Task handles frequent wake ups correctly." << std::endl;
std::cout << "Task executed " << execution_count.load() << " times." << std::endl;
}
// 7. Handling of multiple wakeUpWait() calls
void testMultipleWakeUpWait() {
std::atomic<int> execution_count{0};
auto task = [&execution_count]() {
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // simulate task work
execution_count++;
};
periodic::PeriodicTask periodic_task(task, std::chrono::milliseconds(200));
// Call wakeUpWait twice at the same time
std::future<void> future1 = periodic_task.wakeUpWait();
std::future<void> future2 = periodic_task.wakeUpWait();
future1.wait();
future2.wait();
assert(execution_count == 1); // make sure the task executed only once
std::cout << "Test 7 passed: Multiple wakeUpWait() calls are handled correctly." << std::endl;
std::cout << "Task executed " << execution_count.load() << " times." << std::endl;
}
// 8. Edge case where the task function is empty
void testEmptyTaskFunction() {
auto task = []() {
// Empty task function
};
periodic::PeriodicTask periodic_task(task, std::chrono::milliseconds(100));
std::this_thread::sleep_for(std::chrono::seconds(1)); // wait for a while
std::cout << "Test 8 passed: Empty task function works correctly." << std::endl;
std::cout << "Empty task function executed without issues." << std::endl;
}
int main() {
std::cout << "Starting tests..." << std::endl;
// testWakeUpImmediately();
testPeriodicTaskExecution();
testWakeUpImmediately();
testWakeUpWait();
testTaskExceptionHandling();
testTaskStop();
testHighFrequencyWakeUp();
testMultipleWakeUpWait();
testEmptyTaskFunction();
std::cout << "All tests passed!" << std::endl;
return 0;
}
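
These tests pin down the surface of periodic::PeriodicTask: construction from a callable plus an interval, wakeUp() to trigger an early run, wakeUpWait() returning a std::future fulfilled after the next run, and a destructor that stops the worker. The class itself lives in utils/periodic_task.hpp and is not shown here; the sketch below is one plausible shape for it, written only against those assumptions.

#include <chrono>
#include <condition_variable>
#include <functional>
#include <future>
#include <mutex>
#include <thread>
#include <vector>

// Sketch of a periodic task runner with the interface the tests assume.
class PeriodicTaskSketch {
 public:
  PeriodicTaskSketch(std::function<void()> task, std::chrono::milliseconds interval)
      : task_(std::move(task)), interval_(interval), worker_([this] { run(); }) {}

  ~PeriodicTaskSketch() {
    {
      std::lock_guard<std::mutex> lock(mutex_);
      stop_ = true;
    }
    cv_.notify_all();
    worker_.join();
  }

  // Trigger a run as soon as possible instead of waiting out the interval.
  void wakeUp() {
    {
      std::lock_guard<std::mutex> lock(mutex_);
      wake_requested_ = true;
    }
    cv_.notify_all();
  }

  // Like wakeUp(), but also hands back a future completed after the next run.
  std::future<void> wakeUpWait() {
    std::future<void> f;
    {
      std::lock_guard<std::mutex> lock(mutex_);
      waiters_.emplace_back();
      f = waiters_.back().get_future();
      wake_requested_ = true;
    }
    cv_.notify_all();
    return f;
  }

 private:
  void run() {
    std::unique_lock<std::mutex> lock(mutex_);
    while (!stop_) {
      cv_.wait_for(lock, interval_, [this] { return stop_ || wake_requested_; });
      if (stop_) break;
      wake_requested_ = false;
      std::vector<std::promise<void>> waiters = std::move(waiters_);
      waiters_.clear();
      lock.unlock();
      try {
        task_();
      } catch (...) {
        // Swallow exceptions so a failing run does not kill the worker thread.
      }
      for (auto& p : waiters) p.set_value();
      lock.lock();
    }
  }

  std::function<void()> task_;
  std::chrono::milliseconds interval_;
  std::mutex mutex_;
  std::condition_variable cv_;
  bool stop_ = false;
  bool wake_requested_ = false;
  std::vector<std::promise<void>> waiters_;
  std::thread worker_;  // declared last so run() only sees initialized members
};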

View file

@ -0,0 +1,84 @@
#include <chrono>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>
#include "utils/lock_free_queue.hpp"
#define STDQ
int main() {
const int num_producers = 48;
const int num_items = 1e6;
#ifdef STDQ
std::mutex lock;
std::queue<int> queue;
#else
MPSCQueue<int> queue;
#endif
auto start_time = std::chrono::high_resolution_clock::now();
// Launch multiple producer threads
std::vector<std::thread> producers;
for (int i = 0; i < num_producers; ++i) {
producers.emplace_back([&queue, i
#ifdef STDQ
,
&lock
#endif
]() {
for (int j = 0; j < num_items; ++j) {
#ifdef STDQ
std::lock_guard<std::mutex> guard(lock);
queue.push(i * num_items + j);
#else
queue.enqueue(std::make_shared<int>(i * num_items + j));
#endif
}
});
}
// Consumer thread
std::thread consumer([&queue, num_producers
#ifdef STDQ
,
&lock
#endif
]() {
int count = 0;
while (count < num_producers * num_items) {
#ifdef STDQ
std::lock_guard<std::mutex> guard(lock);
if (!queue.empty()) {
queue.pop();
count++;
}
#else
if (auto item = queue.dequeue()) {
count++;
}
#endif
}
});
// Wait for all producers to finish
for (auto& producer : producers) {
producer.join();
}
// Wait for the consumer to finish
consumer.join();
auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count();
#ifdef STDQ
std::cout << "std::queue with mutex ";
#else
std::cout << "lock free queue ";
#endif
std::cout << "Processed " << num_producers * num_items / 1e6 << "M items in " << duration << " milliseconds "
<< num_producers * num_items / 1e3 / duration << " MOps." << std::endl;
return 0;
}

View file

@ -0,0 +1,38 @@
#include <iostream>
#include <iterator>
#include <vector>
int main() {
std::vector<int> v = {0, 1, 2, 3, 4, 5};
using RevIt = std::reverse_iterator<std::vector<int>::iterator>;
const auto it = v.begin() + 3;
RevIt r_it{it};
std::cout << "*it == " << *it << '\n'
<< "*r_it == " << *r_it << '\n'
<< "*r_it.base() == " << *r_it.base() << '\n'
<< "*(r_it.base()-1) == " << *(r_it.base() - 1) << '\n';
RevIt r_end{v.begin()};
RevIt r_begin{v.end()};
for (auto it = r_end.base(); it != r_begin.base(); ++it)
std::cout << *it << ' ';
std::cout << '\n';
for (auto it = r_begin; it != r_end; ++it)
std::cout << *it << ' ';
std::cout << '\n';
for (auto it = r_begin; it != r_end; ++it) {
if (*it == 3) {
v.erase(std::next(it).base());
break; // the erase invalidates the iterators, so stop the traversal here
}
}
for (auto it : v)
std::cout << it << ' ';
std::cout << '\n';
}

View file

@ -0,0 +1,31 @@
#include "xxhash.h"
#include <iostream>
int main() {
std::string t = "hello world";
XXH64_hash_t hash = XXH64(t.data(), t.size(), 123);
std::cout << hash << std::endl;
{
/* create a hash state */
XXH64_state_t* const state = XXH64_createState();
if (state == NULL)
abort();
if (XXH64_reset(state, 123) == XXH_ERROR)
abort();
if (XXH64_update(state, t.data(), 5) == XXH_ERROR)
abort();
if (XXH64_update(state, t.data() + 5, t.size() - 5) == XXH_ERROR)
abort();
/* Produce the final hash value */
XXH64_hash_t const hash = XXH64_digest(state);
/* State could be re-used; but in this example, it is simply freed */
XXH64_freeState(state);
std::cout << hash << std::endl;
}
return 0;
}

View file

@ -0,0 +1,36 @@
#!/bin/bash
# Check that the disk_cache_path argument was provided
if [ -z "$1" ]; then
echo "Usage: $0 <disk_cache_path>"
exit 1
fi
# Assign the disk_cache_path argument to a variable
disk_cache_path=$1
# Define the array of test commands, substituting in disk_cache_path
tests=(
"./build/test/kvc2_export_header_test --disk_cache_path=$disk_cache_path"
"./build/test/kvcache_disk_insert_read_test --disk_cache_path=$disk_cache_path"
"./build/test/kvcache_mem_eviction_test --disk_cache_path=$disk_cache_path"
"./build/test/kvcache_mem_insert_read_test --disk_cache_path=$disk_cache_path"
"./build/test/kvcache_save_load_test --disk_cache_path=$disk_cache_path"
)
# Iterate over each test command
for test in "${tests[@]}"; do
echo "Running: $test"
# Run the test and capture its output
output=$($test)
# Check whether the test output contains "Test Passed"
if echo "$output" | grep -q "Test Passed"; then
echo " Test Passed"
else
echo " Test Failed"
fi
sleep 1
done

View file

@ -0,0 +1,20 @@
set(CMAKE_CXX_FLAGS "-Og -march=native -Wall -Wextra -g -fPIC")
# set(CMAKE_CXX_FLAGS "-O3 -march=native -Wall -Wextra -fPIC")
add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI})
set(UTILS_DIR ${CMAKE_CURRENT_SOURCE_DIR}/utils)
add_library(sched_metrics metrics.cpp)
target_include_directories(sched_metrics PRIVATE ${UTILS_DIR})
target_link_libraries(sched_metrics PUBLIC prometheus-cpp::pull)
add_library(sched scheduler.cpp)
target_include_directories(sched PRIVATE ${SPDLOG_DIR}/include ${FMT_DIR}/include ${UTILS_DIR} ${KVC2_INCLUDE_DIR})
target_link_libraries(sched PUBLIC pthread ${TORCH_LIBRARIES} kvc2 async_store sched_metrics)
pybind11_add_module(sched_ext bind.cpp)
target_link_libraries(sched_ext PUBLIC sched ${TORCH_LIBRARIES} ${TORCH_PYTHON_LIBRARY})

View file

@ -0,0 +1,249 @@
#include "scheduler.h"
#include <memory>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <torch/extension.h>
namespace py = pybind11;
PYBIND11_MODULE(sched_ext, m) {
py::class_<scheduler::ModelSettings>(m, "ModelSettings")
.def(py::init<>())
.def_readwrite("model_path", &scheduler::ModelSettings::model_path)
.def_readwrite("params_count", &scheduler::ModelSettings::params_count)
.def_readwrite("layer_count", &scheduler::ModelSettings::layer_count)
.def_readwrite("num_k_heads", &scheduler::ModelSettings::num_k_heads)
.def_readwrite("k_head_dim", &scheduler::ModelSettings::k_head_dim)
.def_readwrite("bytes_per_params",
&scheduler::ModelSettings::bytes_per_params)
.def_readwrite("bytes_per_kv_cache_element",
&scheduler::ModelSettings::bytes_per_kv_cache_element)
.def("params_size", &scheduler::ModelSettings::params_nbytes)
.def("bytes_per_token_kv_cache",
&scheduler::ModelSettings::bytes_per_token_kv_cache)
// Add pickle support
.def(py::pickle(
[](const scheduler::ModelSettings &self) { // __getstate__
return py::make_tuple(self.params_count, self.layer_count,
self.num_k_heads, self.k_head_dim,
self.bytes_per_params,
self.bytes_per_kv_cache_element);
},
[](py::tuple t) { // __setstate__
if (t.size() != 6)
throw std::runtime_error("Invalid state! t.size() = " +
std::to_string(t.size()));
scheduler::ModelSettings ms;
ms.params_count = t[0].cast<size_t>();
ms.layer_count = t[1].cast<size_t>();
ms.num_k_heads = t[2].cast<size_t>();
ms.k_head_dim = t[3].cast<size_t>();
ms.bytes_per_params = t[4].cast<double>();
ms.bytes_per_kv_cache_element = t[5].cast<double>();
return ms;
}));
py::class_<scheduler::SampleOptions>(m, "SampleOptions")
.def(py::init<>())
.def_readwrite("temperature", &scheduler::SampleOptions::temperature)
.def_readwrite("top_p",
&scheduler::SampleOptions::top_p) // 确保 top_p 也能被访问
.def(py::pickle(
[](const scheduler::SampleOptions &self) {
return py::make_tuple(self.temperature,
self.top_p); // serialize temperature and top_p
},
[](py::tuple t) {
if (t.size() != 2) // make sure the unpacked tuple has the expected size
throw std::runtime_error("Invalid state! t.size() = " +
std::to_string(t.size()));
scheduler::SampleOptions so;
so.temperature = t[0].cast<double>();
so.top_p = t[1].cast<double>(); // deserialize top_p
return so;
}));
py::class_<scheduler::Settings>(m, "Settings")
.def(py::init<>())
.def_readwrite("model_name", &scheduler::Settings::model_name)
.def_readwrite("quant_type", &scheduler::Settings::quant_type)
.def_readwrite("model_settings", &scheduler::Settings::model_settings)
.def_readwrite("page_size", &scheduler::Settings::page_size)
.def_readwrite("gpu_device_id", &scheduler::Settings::gpu_device_id)
.def_readwrite("gpu_memory_size", &scheduler::Settings::gpu_memory_size)
.def_readwrite("memory_utilization_percentage",
&scheduler::Settings::memory_utilization_percentage)
.def_readwrite("max_batch_size", &scheduler::Settings::max_batch_size)
.def_readwrite(
"recommended_chunk_prefill_token_count",
&scheduler::Settings::recommended_chunk_prefill_token_count)
.def_readwrite("sample_options", &scheduler::Settings::sample_options)
.def_readwrite("sched_metrics_port",
&scheduler::Settings::sched_metrics_port)
.def_readwrite("gpu_only", &scheduler::Settings::gpu_only)
.def_readwrite("use_self_defined_head_dim",
&scheduler::Settings::use_self_defined_head_dim)
.def_readwrite("self_defined_head_dim",
&scheduler::Settings::self_defined_head_dim)
.def_readwrite("full_kv_cache_on_each_gpu",
&scheduler::Settings::full_kv_cache_on_each_gpu)
.def_readwrite("k_cache_on", &scheduler::Settings::k_cache_on)
.def_readwrite("v_cache_on", &scheduler::Settings::v_cache_on)
.def_readwrite("kvc2_config_path", &scheduler::Settings::kvc2_config_path)
.def_readwrite("kvc2_root_path", &scheduler::Settings::kvc2_root_path)
.def_readwrite("memory_pool_size_GB",
&scheduler::Settings::memory_pool_size_GB)
.def_readwrite("evict_count", &scheduler::Settings::evict_count)
.def_readwrite("strategy_name", &scheduler::Settings::strategy_name)
.def_readwrite("kvc2_metrics_port",
&scheduler::Settings::kvc2_metrics_port)
.def_readwrite("load_from_disk", &scheduler::Settings::load_from_disk)
.def_readwrite("save_to_disk", &scheduler::Settings::save_to_disk)
// derived
.def_readwrite("gpu_device_count", &scheduler::Settings::gpu_device_count)
.def_readwrite("total_kvcache_pages",
&scheduler::Settings::total_kvcache_pages)
.def_readwrite("devices", &scheduler::Settings::devices)
.def("auto_derive", &scheduler::Settings::auto_derive);
py::class_<scheduler::BatchQueryTodo,
std::shared_ptr<scheduler::BatchQueryTodo>>(m, "BatchQueryTodo")
.def(py::init<>())
.def_readwrite("query_ids", &scheduler::BatchQueryTodo::query_ids)
.def_readwrite("query_tokens", &scheduler::BatchQueryTodo::query_tokens)
.def_readwrite("query_lengths", &scheduler::BatchQueryTodo::query_lengths)
.def_readwrite("block_indexes", &scheduler::BatchQueryTodo::block_indexes)
.def_readwrite("attn_masks", &scheduler::BatchQueryTodo::attn_masks)
.def_readwrite("rope_ranges", &scheduler::BatchQueryTodo::rope_ranges)
.def_readwrite("sample_options",
&scheduler::BatchQueryTodo::sample_options)
.def_readwrite("prefill_mini_batches",
&scheduler::BatchQueryTodo::prefill_mini_batches)
.def_readwrite("decode_mini_batches",
&scheduler::BatchQueryTodo::decode_mini_batches)
.def_readwrite("stop_criteria", &scheduler::BatchQueryTodo::stop_criteria)
.def("debug", &scheduler::BatchQueryTodo::debug)
.def(py::pickle(
[](const scheduler::BatchQueryTodo &self) {
return py::make_tuple(
self.query_ids, self.query_tokens, self.query_lengths,
self.block_indexes, self.attn_masks, self.rope_ranges,
self.sample_options, self.prefill_mini_batches,
self.decode_mini_batches, self.stop_criteria);
},
[](py::tuple t) {
if (t.size() != 10)
throw std::runtime_error("Invalid state! t.size() = " +
std::to_string(t.size()));
scheduler::BatchQueryTodo bqt;
bqt.query_ids = t[0].cast<std::vector<scheduler::QueryID>>();
bqt.query_tokens = t[1].cast<std::vector<torch::Tensor>>();
bqt.query_lengths =
t[2].cast<std::vector<scheduler::TokenLength>>();
bqt.block_indexes = t[3].cast<std::vector<torch::Tensor>>();
bqt.attn_masks = t[4].cast<std::optional<torch::Tensor>>();
bqt.rope_ranges = t[5].cast<std::optional<torch::Tensor>>();
bqt.sample_options =
t[6].cast<std::vector<scheduler::SampleOptions>>();
bqt.prefill_mini_batches =
t[7].cast<std::vector<scheduler::PrefillTask>>();
bqt.decode_mini_batches =
t[8].cast<std::vector<std::vector<scheduler::QueryID>>>();
bqt.stop_criteria =
t[9].cast<std::vector<std::vector<std::vector<int>>>>();
return bqt;
}));
py::class_<scheduler::QueryUpdate>(m, "QueryUpdate")
.def(py::init<>())
.def_readwrite("id", &scheduler::QueryUpdate::id)
.def_readwrite("ok", &scheduler::QueryUpdate::ok)
.def_readwrite("is_prefill", &scheduler::QueryUpdate::is_prefill)
.def_readwrite("decode_done", &scheduler::QueryUpdate::decode_done)
.def_readwrite("active_position",
&scheduler::QueryUpdate::active_position)
.def_readwrite("generated_token",
&scheduler::QueryUpdate::generated_token)
.def(py::pickle(
[](const scheduler::QueryUpdate &self) {
return py::make_tuple(self.id, self.ok, self.is_prefill,
self.decode_done, self.active_position,
self.generated_token);
},
[](py::tuple t) {
if (t.size() != 6)
throw std::runtime_error("Invalid state! t.size() = " +
std::to_string(t.size()));
scheduler::QueryUpdate qu;
qu.id = t[0].cast<scheduler::QueryID>();
qu.ok = t[1].cast<bool>();
qu.is_prefill = t[2].cast<bool>();
qu.decode_done = t[3].cast<bool>();
qu.active_position = t[4].cast<scheduler::TokenLength>();
qu.generated_token = t[5].cast<scheduler::Token>();
return qu;
}));
py::class_<scheduler::InferenceContext>(m, "InferenceContext")
.def(py::init<>())
.def_readwrite("k_cache", &scheduler::InferenceContext::k_cache)
.def_readwrite("v_cache", &scheduler::InferenceContext::v_cache);
py::class_<scheduler::QueryAdd>(m, "QueryAdd")
.def(py::init<>())
.def_readwrite("query_token", &scheduler::QueryAdd::query_token)
// .def_readwrite("attn_mask", &scheduler::QueryAdd::attn_mask)
.def_readwrite("query_length", &scheduler::QueryAdd::query_length)
.def_readwrite("estimated_length", &scheduler::QueryAdd::estimated_length)
.def_readwrite("sample_options", &scheduler::QueryAdd::sample_options)
.def_readwrite("user_id", &scheduler::QueryAdd::user_id)
.def_readwrite("SLO_TTFT_ms", &scheduler::QueryAdd::SLO_TTFT_ms)
.def_readwrite("SLO_TBT_ms", &scheduler::QueryAdd::SLO_TBT_ms)
.def_readwrite("stop_criteria", &scheduler::QueryAdd::stop_criteria)
.def("serialize", &scheduler::QueryAdd::serialize)
.def_static("deserialize", &scheduler::QueryAdd::deserialize)
.def(py::pickle(
[](const scheduler::QueryAdd &self) {
return py::make_tuple(self.query_token,
// self.attn_mask,
self.query_length, self.estimated_length,
self.sample_options, self.user_id,
self.SLO_TTFT_ms, self.SLO_TBT_ms,
self.stop_criteria);
},
[](py::tuple t) {
if (t.size() != 8)
throw std::runtime_error("Invalid state! t.size() = " +
std::to_string(t.size()));
scheduler::QueryAdd qa;
qa.query_token = t[0].cast<std::vector<scheduler::Token>>();
// qa.attn_mask = t[1].cast<torch::Tensor>();
qa.query_length = t[1].cast<scheduler::TokenLength>();
qa.estimated_length = t[2].cast<scheduler::TokenLength>();
qa.sample_options = t[3].cast<scheduler::SampleOptions>();
qa.user_id = t[4].cast<scheduler::UserID>();
qa.SLO_TTFT_ms = t[5].cast<int>();
qa.SLO_TBT_ms = t[6].cast<int>();
qa.stop_criteria = t[7].cast<std::vector<std::vector<int>>>();
return qa;
}));
py::class_<scheduler::Scheduler, std::shared_ptr<scheduler::Scheduler>>(
m, "Scheduler")
.def("init", &scheduler::Scheduler::init)
.def("run", &scheduler::Scheduler::run)
.def("stop", &scheduler::Scheduler::stop)
.def("add_query", &scheduler::Scheduler::add_query,
py::call_guard<py::gil_scoped_release>())
.def("cancel_query", &scheduler::Scheduler::cancel_query,
py::call_guard<py::gil_scoped_release>())
.def("update_last_batch", &scheduler::Scheduler::update_last_batch,
py::call_guard<py::gil_scoped_release>())
.def("get_inference_context",
&scheduler::Scheduler::get_inference_context);
m.def("create_scheduler", &scheduler::create_scheduler,
"Create a new Scheduler instance");
}

View file

@ -0,0 +1,147 @@
#include "metrics.h"
#include <iostream>
// Constructor
Metrics::Metrics(const MetricsConfig &config)
: registry_(std::make_shared<prometheus::Registry>()),
exposer_(config.endpoint), stop_uptime_thread_(false),
start_time_(std::chrono::steady_clock::now()) {
// Use one common set of buckets, capped at 10000 ms (10 s)
std::vector<double> common_buckets = {
0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0,
10.0, 50.0, 100.0, 500.0, 1000.0, 5000.0, 10000.0}; // milliseconds
// Register the TTFT_ms histogram
auto &TTFT_family = prometheus::BuildHistogram()
.Name(std::string(METRIC_PREFIX) + "_TTFT_ms")
.Help("Time to first token in milliseconds")
.Register(*registry_);
TTFT_ms = &TTFT_family.Add({{"model", config.model_name}}, common_buckets);
// Register the TBT_ms histogram
auto &TBT_family = prometheus::BuildHistogram()
.Name(std::string(METRIC_PREFIX) + "_TBT_ms")
.Help("Time between tokens in milliseconds")
.Register(*registry_);
TBT_ms = &TBT_family.Add({{"model", config.model_name}}, common_buckets);
// Register the schedule_time histogram
auto &schedule_time_family =
prometheus::BuildHistogram()
.Name(std::string(METRIC_PREFIX) + "_schedule_time_ms")
.Help("Time to generate schedule in milliseconds")
.Register(*registry_);
schedule_time =
&schedule_time_family.Add({{"model", config.model_name}}, common_buckets);
// Register the generated_tokens counter
auto &generated_tokens_family =
prometheus::BuildCounter()
.Name(std::string(METRIC_PREFIX) + "_generated_tokens_total")
.Help("Total generated tokens")
.Register(*registry_);
generated_tokens =
&generated_tokens_family.Add({{"model", config.model_name}});
// Register the throughput_query gauge
auto &throughput_query_family =
prometheus::BuildGauge()
.Name(std::string(METRIC_PREFIX) + "_throughput_query")
.Help("Throughput per second based on queries")
.Register(*registry_);
throughput_query =
&throughput_query_family.Add({{"model", config.model_name}});
// Register the throughput_generated_tokens gauge
auto &throughput_generated_tokens_family =
prometheus::BuildGauge()
.Name(std::string(METRIC_PREFIX) + "_throughput_generated_tokens")
.Help("Throughput per second based on generated tokens")
.Register(*registry_);
throughput_generated_tokens =
&throughput_generated_tokens_family.Add({{"model", config.model_name}});
// Register the event_count counter family
event_count_family_ =
&prometheus::BuildCounter()
.Name(std::string(METRIC_PREFIX) + "_event_count_total")
.Help("Count of various events")
.Register(*registry_);
batch_count_family_ =
&prometheus::BuildCounter()
.Name(std::string(METRIC_PREFIX) + "_batch_count_total")
.Help("Count of various batch by status")
.Register(*registry_);
// Register the query_count counter family
query_count_family_ =
&prometheus::BuildCounter()
.Name(std::string(METRIC_PREFIX) + "_query_count_total")
.Help("Count of queries by status")
.Register(*registry_);
// Register the uptime_ms gauge
auto &uptime_family = prometheus::BuildGauge()
.Name(std::string(METRIC_PREFIX) + "_uptime_ms")
.Help("Uptime of the scheduler in milliseconds")
.Register(*registry_);
uptime_ms = &uptime_family.Add({{"model", config.model_name}});
// Register the GPU utilization gauges
auto &gpu_util_family =
prometheus::BuildGauge()
.Name(std::string(METRIC_PREFIX) + "_gpu_utilization_ratio")
.Help("Current GPU utilization ratio (0 to 1)")
.Register(*registry_);
for (size_t i = 0; i < config.gpu_count; ++i) {
gpu_utilization_gauges.push_back(&gpu_util_family.Add(
{{"gpu_id", std::to_string(i)}, {"model", config.model_name}}));
}
// Register the registry with the exposer
exposer_.RegisterCollectable(registry_);
// Start the uptime updater thread
StartUptimeUpdater();
}
// Destructor
Metrics::~Metrics() { StopUptimeUpdater(); }
// Start the uptime updater thread
void Metrics::StartUptimeUpdater() {
uptime_thread_ = std::thread([this]() {
while (!stop_uptime_thread_) {
auto now = std::chrono::steady_clock::now();
std::chrono::duration<double, std::milli> uptime_duration =
now - start_time_;
uptime_ms->Set(uptime_duration.count());
// fn_every_sec(this);
std::this_thread::sleep_for(std::chrono::seconds(1));
}
});
}
// Stop the uptime updater thread
void Metrics::StopUptimeUpdater() {
stop_uptime_thread_ = true;
if (uptime_thread_.joinable()) {
uptime_thread_.join();
}
}
// Get an event_count counter for a given event type
prometheus::Counter *Metrics::event_count(const std::string &type) {
return &event_count_family_->Add({{"type", type}}); // more labels can be added as needed
}
// Get a query_count counter for a given status
prometheus::Counter *Metrics::query_count(const std::string &status) {
return &query_count_family_->Add(
{{"status", status}}); // 可根据需要添加更多标签
}
prometheus::Counter *Metrics::batch_count(const std::string &type) {
return &batch_count_family_->Add({{"type", type}});
}

View file

@ -0,0 +1,88 @@
#ifndef Metrics_H
#define Metrics_H
#include <atomic>
#include <chrono>
#include <functional>
#include <memory>
#include <prometheus/counter.h>
#include <prometheus/exposer.h>
#include <prometheus/gauge.h>
#include <prometheus/histogram.h>
#include <prometheus/registry.h>
#include <string>
#include <thread>
#include <vector>
#include "timer.hpp"
// Metric name prefix
#define METRIC_PREFIX "scheduler"
class Metrics;
// Configuration struct
struct MetricsConfig {
std::string endpoint;
std::string model_name; // model name, e.g. "gpt-4"
size_t gpu_count; // number of GPUs
};
// Metrics class: initializes the Prometheus metrics from the config
class Metrics {
public:
// Constructor takes a MetricsConfig
Metrics(const MetricsConfig &config);
~Metrics();
// Non-copyable and non-assignable
Metrics(const Metrics &) = delete;
Metrics &operator=(const Metrics &) = delete;
std::function<void(Metrics *)> fn_every_sec;
// Metric pointers
prometheus::Gauge *uptime_ms;
prometheus::Histogram *TTFT_ms;
prometheus::Histogram *TBT_ms;
prometheus::Histogram *schedule_time;
prometheus::Gauge *throughput_query;
prometheus::Gauge *throughput_generated_tokens;
prometheus::Counter *generated_tokens;
std::vector<prometheus::Gauge *> gpu_utilization_gauges;
// Counter families
prometheus::Counter *event_count(const std::string &type);
prometheus::Counter *query_count(const std::string &status);
prometheus::Counter *batch_count(const std::string &type);
private:
std::shared_ptr<prometheus::Registry> registry_;
prometheus::Exposer exposer_;
// Counter families
prometheus::Family<prometheus::Counter> *event_count_family_;
prometheus::Family<prometheus::Counter> *batch_count_family_;
prometheus::Family<prometheus::Counter> *query_count_family_;
// Thread and control flag used to update uptime_ms
std::thread uptime_thread_;
std::atomic<bool> stop_uptime_thread_;
// Start the uptime updater thread
void StartUptimeUpdater();
// Stop the uptime updater thread
void StopUptimeUpdater();
// Program start time
std::chrono::steady_clock::time_point start_time_;
};
struct HistogramTimerWrapper {
prometheus::Histogram *histogram;
Timer timer;
inline HistogramTimerWrapper(prometheus::Histogram *histogram)
: histogram(histogram), timer() {
timer.start();
}
inline ~HistogramTimerWrapper() { histogram->Observe(timer.elapsedMs()); }
};
#endif // Metrics_H
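
For reference, a minimal usage sketch of this class as seen from scheduler code; the endpoint, port, and model name below are placeholders, not values taken from the repository.

#include "metrics.h"

int metrics_usage_example() {
  MetricsConfig config;
  config.endpoint = "0.0.0.0:9090";     // placeholder scrape endpoint
  config.model_name = "example-model";  // placeholder model name
  config.gpu_count = 1;

  Metrics metrics(config);

  // Record a first-token latency of 123 ms and count one generated token.
  metrics.TTFT_ms->Observe(123.0);
  metrics.generated_tokens->Increment(1);

  // Time a block with the RAII wrapper; the elapsed milliseconds are
  // observed into the schedule_time histogram when the scope exits.
  {
    HistogramTimerWrapper timer(metrics.schedule_time);
    // ... work being measured ...
  }
  return 0;
}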

View file

@ -0,0 +1,119 @@
#ifndef __MODEL_CONFIG_HPP_
#define __MODEL_CONFIG_HPP_
#include "nlohmann/json.hpp"
#include <iostream>
#include <filesystem>
#include <fstream>
using DimSize = size_t;
using URL = std::string;
using ModelName = std::string;
// We must assure this can be load by config.json
class ModelConfig {
public:
DimSize hidden_size;
DimSize intermediate_size;
size_t max_position_embeddings;
std::string model_type;
size_t num_attention_heads;
size_t num_hidden_layers;
size_t num_key_value_heads;
size_t vocab_size;
NLOHMANN_DEFINE_TYPE_INTRUSIVE(ModelConfig, hidden_size, intermediate_size,
max_position_embeddings, model_type,
num_attention_heads, num_hidden_layers,
num_key_value_heads, vocab_size);
void load_from(std::filesystem::path path) {
std::cout << "Load from " << path << std::endl;
std::ifstream i(path);
nlohmann::json j;
i >> j;
*this = j.get<ModelConfig>();
}
};
using QuantType = std::string;
static const QuantType NoQuantType = "";
class QuantConfig {
public:
QuantType name;
// For GEMV
QuantType type_of_dot_vector = NoQuantType;
inline bool can_be_used_as_matrix() {
return type_of_dot_vector != NoQuantType;
}
bool can_be_used_as_vector;
double bytes_per_element;
bool has_scale;
bool has_min;
size_t block_element_count;
size_t block_element_size;
URL reference = "";
NLOHMANN_DEFINE_TYPE_INTRUSIVE_WITH_DEFAULT(QuantConfig, name,
type_of_dot_vector,
can_be_used_as_vector,
bytes_per_element, has_scale,
has_min, block_element_count,
block_element_size, reference);
};
inline std::map<QuantType, QuantConfig> quant_configs;
inline std::map<ModelName, ModelConfig> model_configs;
inline void load_quant_configs(std::filesystem::path path) {
nlohmann::json j;
if (std::filesystem::exists(path)) {
std::cout << __FUNCTION__ << " from " << path << std::endl;
std::ifstream i(path);
i >> j;
quant_configs = j.get<std::map<QuantType, QuantConfig>>();
std::cout << "Loaded Quant Configs" << std::endl;
for (auto &[k, v] : quant_configs) {
std::cout << " - " << k << std::endl;
}
} else {
std::cout << __FUNCTION__ << " no file at " << path << std::endl;
}
}
inline void dump_quant_configs(std::filesystem::path path) {
std::ofstream o(path);
nlohmann::json j = quant_configs;
o << j.dump(4);
}
inline void load_model_configs(std::filesystem::path path) {
nlohmann::json j;
if (std::filesystem::exists(path)) {
std::cout << __FUNCTION__ << " from " << path << std::endl;
std::ifstream i(path);
i >> j;
model_configs = j.get<std::map<ModelName, ModelConfig>>();
std::cout << "Loaded Model Configs" << std::endl;
for (auto &[k, v] : model_configs) {
std::cout << " - " << k << std::endl;
}
} else {
std::cout << __FUNCTION__ << " no file at " << path << std::endl;
}
}
inline void dump_model_configs(std::filesystem::path path) {
std::ofstream o(path);
nlohmann::json j = model_configs;
o << j.dump(4);
}
#endif
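
A short usage sketch of these helpers, showing the intended call order (load the registry, register a model's config, dump it back); the header name and the paths are hypothetical and only serve as an illustration.

#include "model_config.hpp"  // assumed header name for this file

int model_config_usage_example() {
  // Load the registry if it exists (no-op otherwise).
  load_model_configs("/tmp/model_configs.json");       // hypothetical path

  // Read a single model's config.json and register it under a name.
  ModelConfig config;
  config.load_from("/tmp/example-model/config.json");  // hypothetical path
  model_configs["example-model"] = config;

  // Persist the registry for the next run.
  dump_model_configs("/tmp/model_configs.json");
  return 0;
}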

View file

@ -0,0 +1,960 @@
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO
#define FMT_HEADER_ONLY
#include "nlohmann/json.hpp"
#include "spdlog/spdlog.h"
#include "scheduler.h"
#include <optional>
#include "arithmetic.hpp"
#include "atomic_ptr_with_flags.hpp"
#include "easy_format.hpp"
#include "metrics.h"
#include "mpsc.hpp"
#include "timer.hpp"
#include <atomic>
#include <cassert>
#include <future>
#include <map>
#include <memory>
#include <queue>
#include <set>
#include <variant>
#include "kvc2.h"
using json = nlohmann::json;
namespace scheduler {
void Settings::auto_derive() {
gpu_device_count = gpu_device_id.size();
if (torch::cuda::is_available()) {
size_t gpu_count = torch::cuda::device_count();
SPDLOG_INFO("Number of available GPUs: {}, want {}", gpu_count,
gpu_device_count);
if (gpu_count < gpu_device_count) {
SPDLOG_ERROR("Not enough GPUs available.");
exit(0);
}
for (size_t i = 0; i < gpu_device_count; i++) {
devices.push_back(torch::Device(torch::kCUDA, gpu_device_id[i]));
}
} else {
SPDLOG_ERROR("CUDA is not available on this system.");
exit(0);
}
if (model_settings.num_k_heads % gpu_device_count != 0) {
SPDLOG_ERROR("num_k_heads {} is not divisible by gpu_device_count {}",
model_settings.num_k_heads, gpu_device_count);
assert(false);
}
size_t gpu_memory_available = gpu_memory_size * memory_utilization_percentage;
if (gpu_memory_available * gpu_device_count <
model_settings.params_nbytes()) {
SPDLOG_ERROR("GPU memory size {}G is smaller than {}G",
gpu_memory_available * gpu_device_count / 1e9,
model_settings.params_nbytes() / 1e9);
assert(false);
}
assert(model_settings.k_head_dim % model_settings.num_k_heads == 0);
size_t head_per_gpu = model_settings.num_k_heads / gpu_device_count;
size_t gpu_memory_for_kv_cache =
gpu_memory_available /*- model_settings.params_nbytes() /
gpu_device_count*/
;
SPDLOG_INFO(
"Each GPU Total: {}MiB, Model Params: {}MiB, KVCache: {}MiB, Left: {}MiB",
gpu_memory_size / (1 << 20),
model_settings.params_nbytes() / gpu_device_count / (1 << 20),
gpu_memory_for_kv_cache / (1 << 20),
(gpu_memory_size - gpu_memory_available) / (1 << 20));
size_t kv_cache_on_cnt = (size_t)(k_cache_on) + (size_t)(v_cache_on);
size_t max_total_kvcache_pages =
gpu_memory_for_kv_cache /
(kv_cache_on_cnt * head_per_gpu * model_settings.k_head_dim *
model_settings.bytes_per_kv_cache_element * page_size *
model_settings.layer_count);
if (total_kvcache_pages.has_value()) {
if (total_kvcache_pages.value() > max_total_kvcache_pages) {
SPDLOG_ERROR(
"total_kvcache_pages {} is larger than max_total_kvcache_pages {}",
total_kvcache_pages.value(), max_total_kvcache_pages);
assert(false);
}
} else {
total_kvcache_pages = max_total_kvcache_pages;
SPDLOG_INFO("total_kvcache_pages is auto derived as {}",
max_total_kvcache_pages);
}
if (page_size % 256 != 0) {
SPDLOG_ERROR("page_size {} is not divisible by 256", page_size);
assert(false);
}
if (page_size < 256) {
SPDLOG_ERROR("page_size {} is smaller than 256", page_size);
assert(false);
}
}
std::string BatchQueryTodo::debug() {
std::string re = "BatchQueryTodo: ";
re += "QueryIDs: ";
for (auto &id : query_ids) {
re += std::to_string(id) + " ";
}
return re;
}
bool BatchQueryTodo::empty() {
return prefill_mini_batches.empty() && decode_mini_batches.empty();
}
struct QueryMaintainer;
struct Query {
QueryID id;
torch::Tensor query_token;
TokenLength prompt_length;
TokenLength no_kvcache_from;
TokenLength estimated_length;
SampleOptions sample_options;
UserID user_id;
std::optional<int> SLO_TTFT_ms;
std::optional<int> SLO_TBT_ms;
std::vector<std::vector<int>> stop_criteria;
// status
// Query status changes in this order
enum Status { Received, Preparing, Ready, Prefill, Decode, Done };
Status plan_status = Received;
TokenLength active_position; // the position where no kvcache now
TokenLength plan_position; // the position where no kvcache now, in plan
size_t prepare_try_count = 0;
std::shared_ptr<kvc2::DoubleCacheHandleInterface> kvc2_handle = nullptr;
// derived from kvc2_handle
torch::Tensor block_index; // block indexes
struct QueryContext {
ModelName model_name;
QuantType quant_type;
kvc2::KVC2Interface *kvc2_interface;
QueryMaintainer *query_maintainer;
Metrics *met;
} ctx;
void after_load(bool ok);
void to_status(Status to);
void export_metrics() {
ctx.met->query_count(status_to_string(plan_status))->Increment(1);
}
Query(QueryID id, QueryAdd query_add, QueryContext context)
: id(id), prompt_length(query_add.query_length), no_kvcache_from(0),
estimated_length(query_add.estimated_length),
sample_options(query_add.sample_options), user_id(query_add.user_id),
SLO_TTFT_ms(query_add.SLO_TTFT_ms), SLO_TBT_ms(query_add.SLO_TBT_ms),
stop_criteria(query_add.stop_criteria), ctx(context) {
std::vector<int64_t> shape = {int64_t(query_add.estimated_length)};
query_token =
torch::zeros(shape, torch::TensorOptions().dtype(torch::kInt32));
assert(query_token.is_contiguous());
if (query_token.is_contiguous() == false) {
SPDLOG_ERROR("Query Token must be contiguous!");
exit(1);
}
memcpy(query_token.data_ptr(), query_add.query_token.data(),
query_add.query_length * sizeof(Token));
no_kvcache_from = 0; // maybe match prefix later
export_metrics();
}
Token &token_at(size_t idx) {
return reinterpret_cast<Token *>(query_token.data_ptr())[idx];
}
void absorb_update(const QueryUpdate &update) {
SPDLOG_DEBUG("{}", update.debug());
active_position = update.active_position;
kvc2_handle->append_tokens(&token_at(0),
active_position); // active_position is length -1
if (update.is_prefill) {
if (active_position == prompt_length) {
token_at(active_position) = update.generated_token;
ctx.met->generated_tokens->Increment(1);
}
} else {
token_at(active_position) = update.generated_token;
ctx.met->generated_tokens->Increment(1);
}
if (update.decode_done || active_position == estimated_length - 1) {
to_status(Done);
}
}
void absorb_prefill_task(const PrefillTask &task) {
auto &[id, start, length] = task;
this->plan_position = start + length;
if (this->plan_position == prompt_length) {
to_status(Decode);
}
}
void absorb_decode_task([[maybe_unused]] const QueryID &task) {
this->plan_position += 1;
}
PrefillTask get_prefill_task(size_t prefill_length) {
if (prefill_length + plan_position > prompt_length) {
prefill_length = prompt_length - plan_position;
}
return {id, plan_position, prefill_length};
}
static std::string status_to_string(Status status) {
switch (status) {
case Received:
return "Received";
case Preparing:
return "Preparing";
case Ready:
return "Ready";
case Prefill:
return "Prefill";
case Decode:
return "Decode";
case Done:
return "Done";
}
assert(false);
return "Unknown";
}
void debug() {
std::string status_string = status_to_string(plan_status);
SPDLOG_DEBUG("Query {}, prompt_length {}, estimated_length {}, plan status "
"{}, plan position {} "
"active position {}",
id, prompt_length, estimated_length, status_string,
plan_position, active_position);
}
};
std::string QueryUpdate::debug() const {
return fmt::format("Query {}, ok {}, is_prefill {}, done {}, active_position "
"{}, gen token {}",
id, ok, is_prefill, decode_done, active_position,
generated_token);
}
using Q = std::shared_ptr<Query>;
struct KVC2_Maintainer {
Settings settings;
std::vector<torch::Tensor> k_cache;
std::vector<torch::Tensor> v_cache;
std::shared_ptr<kvc2::KVC2Interface> kvc2_interface;
KVC2_Maintainer(Settings settings) : settings(settings) {
// SPDLOG_WARN("Creating KVC2 Instance {}", settings.kvc2_root_path);
assert(settings.kvc2_root_path.size() > 0);
// SPDLOG_WARN("Sizeof KVC2Config {} upper", sizeof(kvc2::KVC2Config));
kvc2::GPUPageCacheConfig gpu_cache_config{
.gpu_only = settings.gpu_only,
.gpu_devices_id = settings.gpu_device_id,
.layer_count = settings.model_settings.layer_count,
.total_kvcache_pages = settings.total_kvcache_pages.value(),
.num_token_per_page = settings.page_size,
.num_k_heads = settings.model_settings.num_k_heads,
.k_head_dim = settings.use_self_defined_head_dim
? settings.self_defined_head_dim
: settings.model_settings.k_head_dim,
.full_kv_cache_on_each_gpu = settings.full_kv_cache_on_each_gpu,
.k_cache_on = settings.k_cache_on,
.v_cache_on = settings.v_cache_on,
.tensor_type = torch::kBFloat16,
};
auto model_configs_path =
std::filesystem::path(settings.kvc2_config_path) / "model_configs.json";
load_model_configs(model_configs_path);
auto my_model_config = ModelConfig();
my_model_config.load_from(
std::filesystem::path(settings.model_settings.model_path) /
"config.json");
model_configs[settings.model_name] = my_model_config;
dump_model_configs(model_configs_path);
kvc2::KVC2Config kvc2_config = {
.k_cache_on = settings.k_cache_on,
.v_cache_on = settings.v_cache_on,
.gpu_only = settings.gpu_only,
.load_from_disk = settings.load_from_disk,
.save_to_disk = settings.save_to_disk,
.path = settings.kvc2_root_path,
.config_path = settings.kvc2_config_path,
.num_token_per_page = settings.page_size,
.memory_pool_size = size_t(settings.memory_pool_size_GB * 1e9),
.evict_count = settings.evict_count,
.gpu_cache_config = gpu_cache_config,
.metrics_port = settings.kvc2_metrics_port,
};
kvc2_interface = kvc2::create_kvc2(kvc2_config);
if (settings.load_from_disk)
kvc2_interface->load();
SPDLOG_DEBUG("KVC2 created ok");
auto [k_cache, v_cache] = kvc2_interface->get_kvcache();
this->k_cache = k_cache;
this->v_cache = v_cache;
}
};
using EventAddQuery = std::pair<QueryAdd, std::promise<QueryID> *>;
using EventUpdateQuery = BatchQueryUpdate;
using EventTakenBatch = std::shared_ptr<BatchQueryTodo>;
struct EventPrepare {
QueryID query_id;
bool first_try;
};
struct EventPrepared {
QueryID query_id;
bool ok;
};
struct EventQueryStatus {
QueryID query_id;
Query::Status now_status;
};
struct EventSchedule {};
using Event =
std::variant<EventAddQuery, EventUpdateQuery, EventTakenBatch, EventPrepare,
EventPrepared, EventQueryStatus, EventSchedule>;
template <typename T> std::string event_name(const T &event);
template <> std::string event_name(const EventAddQuery &) {
return "EventAddQuery";
}
template <> std::string event_name(const EventUpdateQuery &) {
return "EventUpdateQuery";
}
template <> std::string event_name(const EventTakenBatch &) {
return "EventTakenBatch";
}
template <> std::string event_name(const EventPrepare &) {
return "EventPrepare";
}
template <> std::string event_name(const EventPrepared &) {
return "EventPrepared";
}
template <> std::string event_name(const EventQueryStatus &) {
return "EventQueryStatus";
}
template <> std::string event_name(const EventSchedule &) {
return "EventSchedule";
}
// event_name for the variant type, implemented with std::visit
std::string event_name(const Event &event) {
return std::visit([](const auto &e) { return event_name(e); }, event);
}
static_assert(std::is_copy_constructible<Event>::value);
static_assert(std::is_move_constructible<Event>::value);
struct QueryMaintainer : public Scheduler {
// only get access by event loop
Settings settings;
QueryID query_id_counter = NoQueryID + 1;
std::map<QueryID, Q> query_map;
std::shared_ptr<KVC2_Maintainer> kvc2_maintainer;
std::shared_ptr<Metrics> met;
// multi-thread visit
std::atomic_bool stop_flag = false;
// TODO consider correctness of event loop
MPSCQueueConsumerLock<Event> event_loop_queue;
// std::binary_semaphore batch_ready{0};
AtomicPtrWithFlag<BatchQueryTodo> next_batch;
QueryMaintainer() = default;
void gen_batch_query_todo(BatchQueryTodo *re, const std::set<Q> &queries) {
std::vector<std::vector<QueryID>> d_batch(2);
size_t last_decode_batch = 0;
size_t prefill_num = 0;
size_t decode_num = 0;
size_t prefill_length = 0;
for (auto &q : queries) {
if (q->plan_status == Query::Prefill) {
prefill_num += 1;
}
if (q->plan_status == Query::Decode) {
decode_num += 1;
}
}
if (prefill_num >= 2 ||
(prefill_num == 1 && settings.max_batch_size - 2 < decode_num)) {
prefill_length = settings.recommended_chunk_prefill_token_count;
} else {
prefill_length = settings.recommended_chunk_prefill_token_count * 2;
}
for (auto &q : queries) {
re->query_ids.push_back(q->id);
re->query_tokens.push_back(q->query_token);
re->query_lengths.push_back(q->prompt_length);
if (q->plan_status == Query::Prefill) {
re->prefill_mini_batches.push_back(q->get_prefill_task(prefill_length));
assert(re->prefill_mini_batches.size() <= 2);
}
if (q->plan_status == Query::Decode) {
d_batch[last_decode_batch].push_back(q->id);
// last_decode_batch = 1 - last_decode_batch;
if (d_batch[last_decode_batch].size() == settings.max_batch_size - 1) {
last_decode_batch += 1;
assert(last_decode_batch < 2);
}
}
re->block_indexes.push_back(q->block_index);
re->sample_options.push_back(q->sample_options);
re->stop_criteria.push_back(q->stop_criteria);
}
re->attn_masks = std::nullopt;
re->rope_ranges = std::nullopt;
for (auto &b : d_batch) {
if (b.empty())
continue;
re->decode_mini_batches.push_back(b);
}
met->batch_count("Generated")->Increment(1);
}
// Interface
void init(Settings settings) override {
SPDLOG_INFO("\nScheduler Settings:\n"
" model_name: {}\n"
" quant_type: {}\n"
" model_path: {}\n"
" params_count: {}\n"
" layer_count: {}\n"
" num_k_heads: {}\n"
" k_head_dim: {}\n"
" bytes_per_params: {}\n"
" bytes_per_kv_cache_element: {}\n"
" page_size: {}\n"
" gpu_device_id: {}\n"
" gpu_memory_size: {}\n"
" memory_utilization_percentage: {}\n"
" max_batch_size: {}\n"
" recommended_chunk_prefill_token_count: {}\n"
" sched_metrics_port: {}\n"
" kvc2_config_path: {}\n"
" kvc2_root_path: {}\n"
" memory_pool_size_GB: {}\n"
" evict_count: {}\n"
" kvc2_metrics_port: {}\n"
" load_from_disk: {}\n"
" save_to_disk: {}\n"
" strategy_name: {}\n"
" gpu_device_count: {}\n",
settings.model_name, settings.quant_type,
settings.model_settings.model_path,
settings.model_settings.params_count,
settings.model_settings.layer_count,
settings.model_settings.num_k_heads,
settings.model_settings.k_head_dim,
settings.model_settings.bytes_per_params,
settings.model_settings.bytes_per_kv_cache_element,
settings.page_size, format_vector(settings.gpu_device_id),
readable_number(settings.gpu_memory_size),
settings.memory_utilization_percentage, settings.max_batch_size,
settings.recommended_chunk_prefill_token_count,
settings.sched_metrics_port, settings.kvc2_config_path,
settings.kvc2_root_path, settings.memory_pool_size_GB,
settings.evict_count, settings.kvc2_metrics_port,
settings.load_from_disk, settings.save_to_disk,
settings.strategy_name, settings.gpu_device_count);
this->settings = settings;
kvc2_maintainer =
std::shared_ptr<KVC2_Maintainer>(new KVC2_Maintainer(settings));
MetricsConfig met_conf = {
.endpoint = "0.0.0.0:" + std::to_string(settings.sched_metrics_port),
.model_name = settings.model_name,
.gpu_count = settings.gpu_device_count,
};
SPDLOG_INFO("Creating scheduler metrics exporter on {}", met_conf.endpoint);
met = std::make_shared<Metrics>(met_conf);
met->fn_every_sec = [](Metrics *met) {
auto generated_tokens = met->generated_tokens->Collect().counter.value;
SPDLOG_INFO("Last Sec Generated Tokens {}", generated_tokens);
};
}
Query::QueryContext get_query_context() {
return Query::QueryContext{
.model_name = settings.model_name,
.quant_type = settings.quant_type,
.kvc2_interface = kvc2_maintainer->kvc2_interface.get(),
.query_maintainer = this,
.met = met.get(),
};
}
QueryID add_query(QueryAdd query_add) override {
std::promise<QueryID> p;
event_loop_queue.enqueue(EventAddQuery(query_add, &p));
return p.get_future().get();
}
void cancel_query(QueryID id) override {
SPDLOG_INFO("Cancel Query");
SPDLOG_INFO("sched:{} Cancel Query", fmt::ptr(this));
auto it = query_map.find(id);
if (it == query_map.end()) {
SPDLOG_ERROR("Query {} is not found", id);
return;
}
query_map.erase(it);
}
// Here this function update last batch results and get the next batch
// in most cases, the batch is ready,
// if not, busy wait to get it
std::shared_ptr<BatchQueryTodo>
update_last_batch(BatchQueryUpdate updates) override {
event_loop_queue.enqueue(updates);
// Busy Wait
while (true) {
auto [ptr, is_new] = next_batch.touch_load();
// SPDLOG_INFO("ptr {} is_new {}", fmt::ptr(ptr), is_new);
if (is_new) {
// SPDLOG_DEBUG("New Batch {}", fmt::ptr(ptr));
auto re = std::shared_ptr<BatchQueryTodo>(ptr);
event_loop_queue.enqueue(re);
return re;
} else {
// // here to busy wait
// SPDLOG_INFO("Not New");
// using namespace std::chrono_literals;
// std::this_thread::sleep_for(1s);
}
}
}
InferenceContext get_inference_context() override {
InferenceContext re;
re.k_cache = kvc2_maintainer->k_cache;
re.v_cache = kvc2_maintainer->v_cache;
// kvc2_maintainer->k_cache[0][0][0][0][0][0] = 42; // test whether we pass
// this to inference loop
return re;
}
virtual void strategy_add_query(Q new_query) = 0;
virtual void strategy_update_query(const EventUpdateQuery &update) = 0;
virtual void strategy_taken_batch(const EventTakenBatch &batch) = 0;
virtual void strategy_prepare(const EventPrepare &prepare) = 0;
virtual void strategy_prepared(const EventPrepared &prepared) = 0;
virtual void strategy_query_status(const EventQueryStatus &query_status) = 0;
virtual void strategy_schedule(const EventSchedule &event,
BatchQueryTodo *new_batch) = 0;
void tackle_event(EventAddQuery &event) {
auto &query_add = event.first;
QueryID id = query_id_counter;
event.second->set_value(id);
query_id_counter += 1;
Q new_query(new Query(id, query_add, get_query_context()));
query_map[id] = new_query;
SPDLOG_INFO("New Query {} is added", id);
strategy_add_query(new_query);
}
void tackle_event(const EventUpdateQuery &update) {
// SPDLOG_INFO("Tackle Update Query");
for (auto &u : update) {
if (u.ok == false) {
SPDLOG_ERROR("Query {} is not exectued OK", u.id);
exit(1);
}
auto q = query_map[u.id];
if (q->plan_status == Query::Status::Prefill ||
q->plan_status == Query::Status::Decode) {
q->absorb_update(u);
} else {
SPDLOG_DEBUG(
"Query {} is not in Prefill or Decode status, do not update it",
u.id);
}
}
strategy_update_query(update);
}
void tackle_event(const EventTakenBatch &batch) {
met->batch_count("Taken")->Increment(1);
for (auto &task : batch->prefill_mini_batches) {
auto [id, s, l] = task;
if (l == 0)
continue;
query_map.at(id)->absorb_prefill_task(task);
}
for (auto &mini_batch : batch->decode_mini_batches) {
for (auto &id : mini_batch) {
query_map.at(id)->absorb_decode_task(id);
}
}
strategy_taken_batch(batch);
}
void tackle_event(const EventPrepare &event) { strategy_prepare(event); }
void tackle_event(const EventPrepared &event) { strategy_prepared(event); }
void tackle_event(const EventQueryStatus &event) {
strategy_query_status(event);
}
void tackle_event(const EventSchedule &event) {
// SPDLOG_INFO("Tackle Schedule Event");
HistogramTimerWrapper t(met->schedule_time);
BatchQueryTodo *new_batch = new BatchQueryTodo;
strategy_schedule(event, new_batch);
// if (new_batch->query_ids.empty()) {
// SPDLOG_INFO("Nothing todo");
// delete new_batch;
// return;
// }
auto [old_batch, flag] = next_batch.exchange(new_batch, true);
if (new_batch->empty() == false) {
SPDLOG_DEBUG("set new batch {}", fmt::ptr(new_batch));
}
if (flag) {
SPDLOG_INFO("Batch {} is not consumed", fmt::ptr(old_batch));
delete old_batch;
}
}
void run() override {
std::thread([this]() {
SPDLOG_WARN("Starting Scheduler Event Loop");
while (stop_flag.load() == false) {
auto event = event_loop_queue.dequeue();
met->event_count(event_name(event))->Increment(1);
std::visit(
[this](auto event) {
using T = std::decay_t<decltype(event)>;
// SPDLOG_INFO("Event Loop: {}", typeid(T).name());
if constexpr (std::is_same_v<T, EventAddQuery>) {
tackle_event(event);
} else if constexpr (std::is_same_v<T, EventUpdateQuery>) {
tackle_event(event);
} else if constexpr (std::is_same_v<T, EventTakenBatch>) {
tackle_event(event);
} else if constexpr (std::is_same_v<T, EventPrepare>) {
tackle_event(event);
} else if constexpr (std::is_same_v<T, EventPrepared>) {
tackle_event(event);
} else if constexpr (std::is_same_v<T, EventQueryStatus>) {
tackle_event(event);
} else if constexpr (std::is_same_v<T, EventSchedule>) {
tackle_event(event);
} else {
SPDLOG_ERROR("Should not be here");
assert(false);
}
},
event);
if (event_loop_queue.size() == 0 &&
std::holds_alternative<EventSchedule>(event) == false) {
// if this is not a schedule event, we need to schedule one
event_loop_queue.enqueue(EventSchedule());
}
}
}).detach();
}
void stop() override { stop_flag.store(true); }
~QueryMaintainer() {
kvc2_maintainer->kvc2_interface->save();
stop();
}
};
void Query::to_status(Status to) {
SPDLOG_DEBUG("Calling to status query {}, to {}", id, status_to_string(to));
switch (to) {
case Received:
assert(false);
break;
case Preparing:
SPDLOG_INFO("Preparing Query {} {}", id,
prepare_try_count > 0
? (std::to_string(prepare_try_count) + " Try")
: "");
prepare_try_count += 1;
ctx.kvc2_interface->lookup_to_gpu_async(
ctx.model_name, ctx.quant_type,
static_cast<kvc2::Token *>(query_token.data_ptr()), prompt_length,
estimated_length,
[this](std::shared_ptr<kvc2::DoubleCacheHandleInterface> handle) {
if (handle == nullptr) {
SPDLOG_INFO("Get handle from kvc2 Failed.");
this->after_load(false);
} else {
SPDLOG_INFO("Get handle from kvc2 Success.");
this->kvc2_handle = handle;
this->to_status(Ready);
this->after_load(true);
}
});
break;
case Ready:
SPDLOG_INFO("Ready Query {}", id);
break;
case Prefill:
SPDLOG_INFO("Prefilling Query {}", id);
// assert(plan_status == Received);
plan_position = kvc2_handle->matched_length();
if (prompt_length - plan_position == 0) {
assert(prompt_length > 0);
plan_position -= 1;
}
break;
case Decode:
SPDLOG_INFO("Decoding Query {}", id);
// assert(plan_status == Prefill);
break;
case Done:
SPDLOG_INFO("Finish Query {}", id);
kvc2_handle = nullptr;
ctx.query_maintainer->event_loop_queue.enqueue(EventQueryStatus{
.query_id = id,
.now_status = to,
});
// assert(plan_status == Decode);
break;
}
plan_status = to;
export_metrics();
}
void Query::after_load(bool ok) {
if (ok) {
size_t page_count =
div_up(estimated_length, ctx.query_maintainer->settings.page_size);
std::vector<int64_t> shape;
shape.push_back(page_count);
block_index =
torch::zeros(shape, torch::TensorOptions().dtype(torch::kInt32))
.contiguous();
auto ptr = reinterpret_cast<int32_t *>(block_index.data_ptr());
auto vec_idx = kvc2_handle->get_gpu_block_idx();
for (size_t i = 0; i < vec_idx.size(); i++) {
ptr[i] = vec_idx[i];
}
no_kvcache_from = kvc2_handle->matched_length();
}
if (ok) {
ctx.query_maintainer->event_loop_queue.enqueue(EventPrepared{
.query_id = id,
.ok = ok,
});
} else {
ctx.query_maintainer->event_loop_queue.enqueue(EventPrepare{
.query_id = id,
.first_try = false,
});
}
}
struct FCFS_single_prefill : public QueryMaintainer {
std::queue<Q> queue;
std::queue<Q> ready_queue;
bool has_query_preparing = false;
std::optional<EventPrepare> wait_done_prepare = std::nullopt;
std::set<Q> active_query; // on going queries for LLMs
// interface all these are executed in a single thread
void strategy_add_query(Q new_query) override {
queue.push(new_query);
if (has_query_preparing == false) {
has_query_preparing = true;
auto next_q = queue.front();
queue.pop();
event_loop_queue.enqueue(EventPrepare{next_q->id, true});
}
}
void strategy_update_query(const EventUpdateQuery &update) override {
for (auto u : update) {
auto &q = query_map[u.id];
if (q->plan_status == Query::Done) {
active_query.erase(q);
}
}
}
void strategy_taken_batch(const EventTakenBatch &batch) override {
for (auto &q : batch->query_ids) {
if (query_map[q]->plan_status != Query::Done) {
active_query.insert(query_map[q]);
}
}
}
void strategy_prepare(const EventPrepare &prepare) override {
if (prepare.first_try) {
auto &q = query_map[prepare.query_id];
q->to_status(Query::Preparing);
} else {
assert(wait_done_prepare.has_value() == false);
wait_done_prepare = prepare;
wait_done_prepare->first_try = true;
}
}
void strategy_prepared(const EventPrepared &prepared) override {
assert(prepared.ok);
ready_queue.push(query_map[prepared.query_id]);
if (queue.empty() == false) {
auto next_q_prepare = queue.front();
queue.pop();
event_loop_queue.enqueue(EventPrepare{next_q_prepare->id, true});
} else {
has_query_preparing = false;
}
}
void strategy_query_status(const EventQueryStatus &query_status) override {
if (query_status.now_status == Query::Done) {
if (wait_done_prepare.has_value()) {
event_loop_queue.enqueue(wait_done_prepare.value());
wait_done_prepare = std::nullopt;
}
}
}
void strategy_schedule([[maybe_unused]] const EventSchedule &event,
BatchQueryTodo *new_batch) override {
bool have_prefill = false;
for (auto &q : active_query) {
if (q->plan_status == Query::Prefill) {
have_prefill = true;
}
}
if (have_prefill == false && ready_queue.empty() == false &&
active_query.size() < settings.max_batch_size) {
auto &next_q = ready_queue.front();
ready_queue.pop();
SPDLOG_INFO("Active query {}", next_q->id);
active_query.insert(next_q);
next_q->to_status(Query::Prefill);
}
if (active_query.empty() == false)
SPDLOG_INFO("Active Query Size {}", active_query.size());
for (auto &q : active_query) {
q->debug();
}
gen_batch_query_todo(new_batch, active_query);
}
};
struct FCFS : public FCFS_single_prefill {
void strategy_schedule([[maybe_unused]] const EventSchedule &event,
BatchQueryTodo *new_batch) override {
int prefill_count = 0;
const int max_prefill_count = 2;
for (auto &q : active_query) {
if (q->plan_status == Query::Prefill) {
prefill_count += 1;
}
}
while (prefill_count < max_prefill_count && ready_queue.empty() == false &&
active_query.size() < settings.max_batch_size) {
auto next_q = ready_queue.front();
ready_queue.pop();
SPDLOG_INFO("Active query {}", next_q->id);
active_query.insert(next_q);
next_q->to_status(Query::Prefill);
prefill_count += 1;
}
if (active_query.empty() == false) {
SPDLOG_DEBUG("Active Query Size {}", active_query.size());
}
for (auto &q : active_query) {
q->debug();
}
gen_batch_query_todo(new_batch, active_query);
}
};
std::shared_ptr<Scheduler> create_scheduler(Settings settings) {
spdlog::set_level(spdlog::level::debug);
std::shared_ptr<Scheduler> re;
SPDLOG_INFO("Using Strategy {}", settings.strategy_name);
if (settings.strategy_name == "FCFS-single-prefill") {
re = std::shared_ptr<Scheduler>(new FCFS_single_prefill());
} else if (settings.strategy_name == "FCFS") {
re = std::shared_ptr<Scheduler>(new FCFS());
} else {
SPDLOG_ERROR("Unknown strategy {}", settings.strategy_name);
exit(1);
}
re->init(settings);
return re;
}
NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(SampleOptions, temperature, top_p);
NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(QueryAdd, query_token, query_length,
estimated_length, sample_options, user_id,
SLO_TTFT_ms, SLO_TBT_ms);
std::string QueryAdd::serialize() {
json j = *this;
return j.dump();
}
QueryAdd QueryAdd::deserialize(const std::string &input) {
json j = json::parse(input);
return j.get<QueryAdd>();
}
}; // namespace scheduler

View file

@ -0,0 +1,175 @@
#pragma once
#include "model_config.h"
#include <cstdint>
#include <memory>
#include <optional>
#include <string>
#include <torch/torch.h>
#include <tuple>
#include <vector>
namespace scheduler {
using Token = uint32_t;
using QueryID = uint64_t;
constexpr QueryID NoQueryID = 0;
using TokenLength = size_t;
using BatchID = uint64_t;
using PageCount = size_t;
struct ModelSettings {
std::string model_path;
size_t params_count;
size_t layer_count;
size_t num_k_heads;
size_t k_head_dim;
double bytes_per_params;
double bytes_per_kv_cache_element;
inline size_t params_nbytes() { return params_count * bytes_per_params; }
inline size_t bytes_per_token_kv_cache() {
return bytes_per_kv_cache_element * num_k_heads * k_head_dim;
}
};
struct SampleOptions {
double temperature = 1.0;
double top_p = 1.0;
};
struct Settings {
// something is awkward here: kvc2 only uses model_name and quant_type to get
// model info.
ModelName model_name;
QuantType quant_type;
// model_settings is ignored by kvc2
ModelSettings model_settings;
size_t page_size = 256; // how many token in a page
std::vector<size_t> gpu_device_id; //
size_t gpu_memory_size; // memory size in bytes of each GPU
double memory_utilization_percentage;
size_t max_batch_size = 256;
size_t recommended_chunk_prefill_token_count;
SampleOptions sample_options;
size_t sched_metrics_port;
// for kvc2
bool gpu_only;
bool use_self_defined_head_dim = false;
size_t self_defined_head_dim;
bool full_kv_cache_on_each_gpu = false;
bool k_cache_on = true;
bool v_cache_on = true;
std::string kvc2_config_path;
std::string kvc2_root_path;
double memory_pool_size_GB = 100;
size_t evict_count = 20;
size_t kvc2_metrics_port;
bool load_from_disk = false;
bool save_to_disk = false;
// for strategy
std::string strategy_name;
// derived
size_t gpu_device_count;
std::optional<size_t> total_kvcache_pages;
std::vector<torch::Device> devices;
void auto_derive();
};
using PrefillTask =
std::tuple<QueryID, TokenLength, TokenLength>; // id, start, length
struct BatchQueryTodo {
// query
std::vector<QueryID> query_ids;
std::vector<torch::Tensor> query_tokens;
std::vector<TokenLength> query_lengths;
std::vector<torch::Tensor>
block_indexes; // (max_num_blocks_per_seq), dtype torch.int32.
std::optional<torch::Tensor> attn_masks;
std::optional<torch::Tensor> rope_ranges;
std::vector<SampleOptions> sample_options;
std::vector<std::vector<std::vector<int>>> stop_criteria;
  // mini batches; two adjacent mini batches are executed together
  // task count must be <= 2, because of flashinfer attention
  std::vector<PrefillTask>
      prefill_mini_batches; // a prefill minibatch has only 1 prefill
std::vector<std::vector<QueryID>>
decode_mini_batches; // decode minibatch has multiple decode
std::string debug();
bool empty();
};
struct QueryUpdate {
QueryID id;
bool ok;
bool is_prefill;
bool decode_done; // no use for now
  TokenLength active_position; // the first position without kvcache yet,
                               // i.e. kvcache[active_position] == None
Token generated_token;
std::string debug() const;
};
using BatchQueryUpdate = std::vector<QueryUpdate>;
struct InferenceContext {
std::vector<torch::Tensor> k_cache; // [gpu num] (layer_count, num blocks,
// page size, kheadnum, head_dim)
std::vector<torch::Tensor> v_cache;
};
using UserID = int64_t;
constexpr UserID NoUser = -1;
const int MAX_SLO_TIME = 1e9;
struct QueryAdd {
std::vector<Token> query_token; // int here
// torch::Tensor attn_mask;
TokenLength query_length;
TokenLength estimated_length;
std::vector<std::vector<int>> stop_criteria;
SampleOptions sample_options;
UserID user_id;
int SLO_TTFT_ms = MAX_SLO_TIME;
int SLO_TBT_ms = MAX_SLO_TIME;
std::string serialize();
static QueryAdd deserialize(const std::string &input);
};
class Scheduler {
public:
virtual void init(Settings settings) = 0;
virtual void run() = 0;
virtual void stop() = 0;
// webserver call this
virtual QueryID add_query(QueryAdd query) = 0;
virtual void cancel_query(QueryID id) = 0;
// inference loop call this
virtual std::shared_ptr<BatchQueryTodo>
update_last_batch(BatchQueryUpdate updates) = 0;
virtual InferenceContext get_inference_context() = 0;
virtual ~Scheduler() = default;
};
std::shared_ptr<Scheduler> create_scheduler(Settings settings);
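// Typical driver loop (an illustrative sketch; most Settings fields, error
// handling and threading are omitted, and the inference-side calls are
// placeholders):
//
//   Settings s;
//   s.strategy_name = "FCFS";          // or "FCFS-single-prefill"
//   s.auto_derive();
//   auto sched = create_scheduler(s);
//   sched->run();
//   QueryID id = sched->add_query(my_query_add);
//   BatchQueryUpdate updates;          // filled in by the inference loop
//   auto todo = sched->update_last_batch(updates);
//   // ...run the prefill/decode minibatches from *todo, report back in the
//   // next update...
//   sched->stop();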
}; // namespace scheduler

View file

@ -0,0 +1,3 @@
#pragma once
#include "readable_number.hpp"
#include "timer.hpp"

View file

@ -0,0 +1,7 @@
#include <type_traits>
template <typename T, typename U> T div_up(T x, U by) {
static_assert(std::is_integral_v<T>);
static_assert(std::is_integral_v<U>);
return (x + by - 1) / by;
}
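// Example: div_up(1000, 256) == 4, i.e. 1000 tokens need 4 pages of 256 tokens.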

View file

@ -0,0 +1,35 @@
#include <atomic>
template <typename T> struct AtomicPtrWithFlag {
constexpr static uint64_t mask = 1ull << 63;
std::atomic_uint64_t ptr = 0;
std::pair<T *, bool>
load(std::memory_order order = std::memory_order_seq_cst) {
uint64_t val = ptr.load(order);
return {reinterpret_cast<T *>(val & (~mask)), val & mask};
}
void store(T *p, bool flag,
std::memory_order order = std::memory_order_seq_cst) {
ptr.store(reinterpret_cast<uint64_t>(p) | (flag ? mask : 0), order);
}
std::pair<T *, bool>
exchange(T *p, bool flag,
std::memory_order order = std::memory_order_seq_cst) {
uint64_t val =
ptr.exchange(reinterpret_cast<uint64_t>(p) | (flag ? mask : 0), order);
return {reinterpret_cast<T *>(val & (~mask)), val & mask};
}
std::pair<T *, bool>
touch_load(std::memory_order order = std::memory_order_seq_cst) {
uint64_t val = ptr.fetch_and(~mask, order);
return {reinterpret_cast<T *>(val & (~mask)), val & mask};
}
bool check_flag(std::memory_order order = std::memory_order_seq_cst) {
return ptr.load(order) & mask;
}
};
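// Illustrative usage (names are hypothetical): the low 63 bits hold the
// pointer and the top bit is a user-defined flag, so a pointer and a "dirty"
// marker can be published in a single atomic word.
//
//   struct Batch { int n; };
//   AtomicPtrWithFlag<Batch> slot;
//   Batch b{42};
//   slot.store(&b, /*flag=*/true);          // publish pointer + flag
//   auto [p, dirty] = slot.load();          // p == &b, dirty == true
//   auto [q, was]   = slot.touch_load();    // returns the old flag, clears it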

View file

@ -0,0 +1,229 @@
#ifndef CSV_READER_HPP
#define CSV_READER_HPP
#include <fstream>
#include <iostream>
#include <mutex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <thread>
#include <vector>
namespace csv {
/**
* @brief Parses a CSV line into individual fields, handling quoted fields with
* commas and newlines.
*
* @param line The CSV line to parse.
* @return A vector of strings, each representing a field in the CSV line.
*/
inline std::vector<std::string> parse_csv_line(const std::string &line) {
std::vector<std::string> result;
std::string field;
bool in_quotes = false;
for (size_t i = 0; i < line.length(); ++i) {
char c = line[i];
if (c == '"') {
// Handle double quotes inside quoted fields
if (in_quotes && i + 1 < line.length() && line[i + 1] == '"') {
field += '"';
++i;
} else {
in_quotes = !in_quotes;
}
} else if (c == ',' && !in_quotes) {
result.push_back(field);
field.clear();
} else {
field += c;
}
}
result.push_back(field);
return result;
}
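// Example: the raw line  a,"hello, ""world""",c  parses into three fields:
//   {"a", "hello, \"world\"", "c"}; the quoted comma is kept and doubled
//   quotes collapse to a single quote.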
/**
* @brief Reads a CSV file and returns a vector of pairs containing column names
* and their corresponding data vectors.
*
* This function reads the header to obtain column names and uses multithreading
* to read and parse the CSV file in chunks.
*
* @param filename The path to the CSV file.
* @return A vector of pairs, each containing a column name and a vector of data
* for that column.
*/
inline std::vector<std::pair<std::string, std::vector<std::string>>>
read_csv(const std::string &filename) {
std::cout << "Reading CSV file: " << filename << std::endl;
// Open the file
std::ifstream file(filename);
if (!file) {
throw std::runtime_error("Cannot open file");
}
// Read the header line and parse column names
std::string header_line;
std::getline(file, header_line);
std::vector<std::string> column_names = parse_csv_line(header_line);
// Prepare the result vector with column names
std::vector<std::pair<std::string, std::vector<std::string>>> result;
for (const auto &name : column_names) {
result.emplace_back(name, std::vector<std::string>());
}
// Read the rest of the file into a string buffer
std::stringstream buffer;
buffer << file.rdbuf();
std::string content = buffer.str();
// Determine the number of threads to use
unsigned int num_threads = std::thread::hardware_concurrency();
if (num_threads == 0)
num_threads = 4; // Default to 4 threads if hardware_concurrency returns 0
// Calculate chunk start positions based on content size
std::vector<size_t> chunk_starts;
size_t content_size = content.size();
size_t chunk_size = content_size / num_threads;
chunk_starts.push_back(0);
for (unsigned int i = 1; i < num_threads; ++i) {
size_t pos = i * chunk_size;
// Adjust position to the next newline character to ensure we start at the
// beginning of a line
while (pos < content_size && content[pos] != '\n') {
++pos;
}
if (pos < content_size) {
++pos; // Skip the newline character
}
chunk_starts.push_back(pos);
}
chunk_starts.push_back(content_size);
// Create threads to parse each chunk
std::vector<std::vector<std::vector<std::string>>> thread_results(
num_threads);
std::vector<std::thread> threads;
for (unsigned int i = 0; i < num_threads; ++i) {
size_t start = chunk_starts[i];
size_t end = chunk_starts[i + 1];
threads.emplace_back([&content, start, end, &thread_results, i]() {
std::vector<std::vector<std::string>> local_result;
size_t pos = start;
while (pos < end) {
size_t next_pos = content.find('\n', pos);
if (next_pos == std::string::npos || next_pos > end) {
next_pos = end;
}
std::string line = content.substr(pos, next_pos - pos);
if (!line.empty()) {
local_result.push_back(parse_csv_line(line));
}
pos = next_pos + 1;
}
thread_results[i] = std::move(local_result);
});
}
// Wait for all threads to finish
for (auto &t : threads) {
t.join();
}
// Combine the results from all threads into the final result
for (const auto &local_result : thread_results) {
for (const auto &row : local_result) {
for (size_t i = 0; i < row.size(); ++i) {
if (i < result.size()) {
result[i].second.push_back(row[i]);
}
}
}
}
return result;
}
/**
* @brief Writes the CSV data into a file.
*
* @param filename The path to the output CSV file.
* @param data A vector of pairs, each containing a column name and a vector of
* data for that column.
*/
inline void write_csv(
const std::string &filename,
const std::vector<std::pair<std::string, std::vector<std::string>>> &data) {
std::cout << "Writing CSV file: " << filename << std::endl;
// Open the file for writing
std::ofstream file(filename);
if (!file) {
throw std::runtime_error("Cannot open file for writing");
}
// Check that all columns have the same number of rows
if (data.empty()) {
return; // Nothing to write
}
size_t num_rows = data[0].second.size();
for (const auto &column : data) {
if (column.second.size() != num_rows) {
throw std::runtime_error("All columns must have the same number of rows");
}
}
// Write the header
for (size_t i = 0; i < data.size(); ++i) {
file << data[i].first;
if (i != data.size() - 1) {
file << ',';
}
}
file << '\n';
// Write the data rows
for (size_t row = 0; row < num_rows; ++row) {
for (size_t col = 0; col < data.size(); ++col) {
const std::string &field = data[col].second[row];
// Handle CSV escaping
std::string escaped_field = field;
bool needs_quotes = false;
if (escaped_field.find('"') != std::string::npos) {
needs_quotes = true;
// Escape double quotes
size_t pos = 0;
while ((pos = escaped_field.find('"', pos)) != std::string::npos) {
escaped_field.insert(pos, "\"");
pos += 2;
}
}
if (escaped_field.find(',') != std::string::npos ||
escaped_field.find('\n') != std::string::npos) {
needs_quotes = true;
}
if (needs_quotes) {
file << '"' << escaped_field << '"';
} else {
file << escaped_field;
}
if (col != data.size() - 1) {
file << ',';
}
}
file << '\n';
}
}
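// Illustrative round trip (file names are hypothetical): read_csv returns the
// data column-major as {column name, values}, which write_csv accepts back
// unchanged, so copying a file can look like this:
//
//   auto table = csv::read_csv("metrics.csv");
//   // table[0].first  == name of the first column
//   // table[0].second == all values in that column, as strings
//   csv::write_csv("metrics_copy.csv", table);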
} // namespace csv
#endif // CSV_READER_HPP

View file

@ -0,0 +1,15 @@
#include <sstream>
#include <string>
#include <vector>
template <typename T> std::string format_vector(const std::vector<T> &v) {
std::ostringstream oss;
if (v.empty())
return "[]";
for (size_t i = 0; i < v.size(); ++i) {
oss << v[i];
if (i < v.size() - 1)
oss << ", "; // 逗号分隔
}
return oss.str();
}
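// Example: format_vector(std::vector<int>{1, 2, 3}) returns "1, 2, 3";
// an empty vector returns "[]".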

View file

@ -0,0 +1,112 @@
#include <atomic>
#include <cassert>
#include <iostream>
#include <optional>
#include <semaphore>
template <typename T> class MPSCQueue {
struct Node {
T data;
std::atomic<Node *> next;
Node() : next(nullptr) {}
Node(T data_) : data(std::move(data_)), next(nullptr) {}
};
std::atomic<Node *> head;
Node *tail;
public:
std::atomic_size_t enqueue_count = 0;
size_t dequeue_count = 0;
MPSCQueue() {
Node *dummy = new Node();
head.store(dummy, std::memory_order_seq_cst);
tail = dummy;
}
~MPSCQueue() {
Node *node = tail;
while (node) {
Node *next = node->next.load(std::memory_order_seq_cst);
delete node;
node = next;
}
}
  // called by producers
void enqueue(T data) {
enqueue_count.fetch_add(1);
Node *node = new Node(std::move(data));
Node *prev_head = head.exchange(node, std::memory_order_seq_cst);
prev_head->next.store(node, std::memory_order_seq_cst);
}
  // called by the single consumer
std::optional<T> dequeue() {
Node *next = tail->next.load(std::memory_order_seq_cst);
if (next) {
T res = std::move(next->data);
delete tail;
tail = next;
dequeue_count += 1;
return res;
}
return std::nullopt;
}
size_t size() { return enqueue_count.load() - dequeue_count; }
};
template <typename T> class MPSCQueueConsumerLock {
MPSCQueue<T> queue;
std::counting_semaphore<> sema{0};
public:
void enqueue(T data) {
queue.enqueue(std::move(data));
    // std::atomic_thread_fence(std::memory_order_seq_cst); // kept commented
    // out in case the memory ordering turns out to be wrong; not entirely
    // sure whether the fence is needed.
sema.release();
}
T dequeue() {
auto re = queue.dequeue();
if (re.has_value()) {
while (sema.try_acquire() == false) {
std::cerr
<< __FILE__ << ":" << __FUNCTION__
<< " sema try acquire should be success, retrying, please check"
<< std::endl;
// assert(false);
}
return re.value();
}
sema.acquire();
return queue.dequeue().value();
}
template <typename Rep, typename Period>
std::optional<T> try_dequeue_for(std::chrono::duration<Rep, Period> dur) {
auto re = queue.dequeue();
if (re.has_value()) {
while (sema.try_acquire() == false) {
std::cerr
<< __FILE__ << ":" << __FUNCTION__
<< " sema try acquire should be success, retrying, please check"
<< std::endl;
// assert(false);
}
return re.value();
}
if (sema.try_acquire_for(dur)) {
return queue.dequeue().value();
} else {
return std::nullopt;
}
}
size_t size() { return queue.size(); }
};
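// Illustrative usage (a sketch; thread lifetimes, shutdown and the <thread>
// and <chrono> includes are omitted): many producer threads enqueue, a single
// consumer thread blocks on dequeue.
//
//   MPSCQueueConsumerLock<int> q;
//   std::thread producer([&] { q.enqueue(1); });
//   int v = q.dequeue();                    // blocks until an item arrives
//   auto maybe = q.try_dequeue_for(std::chrono::milliseconds(10));
//   producer.join();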

View file

@ -0,0 +1,20 @@
#pragma once
#include <array>
#include <iomanip>
#include <sstream>
#include <string>
inline std::array<std::string, 7> units = {"", "K", "M", "G", "T", "P", "E"};
inline std::string readable_number(size_t size) {
size_t unit_index = 0;
double readable_size = size;
while (readable_size >= 1000 && unit_index < units.size() - 1) {
readable_size /= 1000;
unit_index++;
}
std::ostringstream ss;
ss << std::fixed << std::setprecision(2) << readable_size;
std::string str = ss.str();
return str + "" + units[unit_index];
}
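// Example: readable_number(1234567) returns "1.23M"; readable_number(512)
// returns "512.00" (decimal units, not binary).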

Some files were not shown because too many files have changed in this diff