mirror of
https://github.com/kvcache-ai/ktransformers.git
synced 2026-05-03 06:01:35 +00:00
* refactor: move legacy code to archive/ directory - Moved ktransformers, csrc, third_party, merge_tensors to archive/ - Moved build scripts and configurations to archive/ - Kept kt-kernel, KT-SFT, doc, and README files in root - Preserved complete git history for all moved files * refactor: restructure repository to focus on kt-kernel and KT-SFT modules * fix README * fix README * fix README * fix README * docs: add performance benchmarks to kt-kernel section Add comprehensive performance data for kt-kernel to match KT-SFT's presentation: - AMX kernel optimization: 21.3 TFLOPS (3.9× faster than PyTorch) - Prefill phase: up to 20× speedup vs baseline - Decode phase: up to 4× speedup - NUMA optimization: up to 63% throughput improvement - Multi-GPU (8×L20): 227.85 tokens/s total throughput with DeepSeek-R1 FP8 Source: https://lmsys.org/blog/2025-10-22-KTransformers/ This provides users with concrete performance metrics for both core modules, making it easier to understand the capabilities of each component. * refactor: improve kt-kernel performance data with specific hardware and models Replace generic performance descriptions with concrete benchmarks: - Specify exact hardware: 8×L20 GPU + Xeon Gold 6454S, Single/Dual-socket Xeon + AMX - Include specific models: DeepSeek-R1-0528 (FP8), DeepSeek-V3 (671B) - Show detailed metrics: total throughput, output throughput, concurrency details - Match KT-SFT presentation style for consistency This provides users with actionable performance data they can use to evaluate hardware requirements and expected performance for their use cases. 
* fix README * docs: clean up performance table and improve formatting * add pic for README * refactor: simplify .gitmodules and backup legacy submodules - Remove 7 legacy submodules from root .gitmodules (archive/third_party/*) - Keep only 2 active submodules for kt-kernel (llama.cpp, pybind11) - Backup complete .gitmodules to archive/.gitmodules - Add documentation in archive/README.md for researchers who need legacy submodules This reduces initial clone size by ~500MB and avoids downloading unused dependencies. * refactor: move doc/ back to root directory Keep documentation in root for easier access and maintenance. * refactor: consolidate all images to doc/assets/ - Move kt-kernel/assets/heterogeneous_computing.png to doc/assets/ - Remove KT-SFT/assets/ (images already in doc/assets/) - Update KT-SFT/README.md image references to ../doc/assets/ - Eliminates ~7.9MB image duplication - Centralizes all documentation assets in one location * fix pic path for README
154 lines
No EOL
5.1 KiB
C++
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-22 02:03:05
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-07-25 10:33:34
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

#include "backend.h"

#ifdef USE_NUMA
#include <numa.h>
#include <numaif.h>

// NUMA node the current thread is bound to; -1 until the first call to
// process_tasks() on this thread picks a node and binds to it.
thread_local int Backend::numa_node = -1;
#endif

// Index of the current thread's slot in thread_state_; -1 until the thread
// is registered (worker_thread() for pool threads, do_work_stealing_job()
// for the calling thread, which always takes slot 0).
thread_local int Backend::thread_local_id = -1;
||
/// Builds the thread pool: allocates one task-state slot per thread and
/// spawns max_thread_num - 1 workers. Slot 0 is reserved for the thread
/// that later calls do_work_stealing_job(), so no worker is created for it.
Backend::Backend(int max_thread_num) {
    max_thread_num_ = max_thread_num;

    // Atomics are held through unique_ptr so the state vector stays movable.
    thread_state_.resize(max_thread_num_);
    for (auto& state : thread_state_) {
        state.curr = std::make_unique<std::atomic<int>>();
        state.status =
            std::make_unique<std::atomic<ThreadStatus>>(ThreadStatus::WAITING);
    }

    workers_.resize(max_thread_num_);
    for (int id = 1; id < max_thread_num_; id++) {
        workers_[id] = std::thread(&Backend::worker_thread, this, id);
    }
}
|
||
/// Asks every thread to exit, then joins the spawned workers.
/// Slot 0 never owns a std::thread (it is the caller's thread), so
/// joining starts at index 1.
Backend::~Backend() {
    for (auto& state : thread_state_) {
        state.status->store(ThreadStatus::EXIT, std::memory_order_release);
    }
    for (size_t i = 1; i < workers_.size(); i++) {
        if (workers_[i].joinable()) {
            workers_[i].join();
        }
    }
}
||
/// Returns the pool's capacity (the max_thread_num passed to the constructor).
int Backend::get_thread_num() {
    return max_thread_num_;
}
||
void Backend::do_work_stealing_job(int task_num,
|
||
std::function<void(int)> init_func,
|
||
std::function<void(int)> compute_func,
|
||
std::function<void(int)> finalize_func) {
|
||
init_func_ = init_func;
|
||
compute_func_ = compute_func;
|
||
finalize_func_ = finalize_func;
|
||
#ifdef USE_NUMA
|
||
// numa node location will be calculated based on the number of threads
|
||
thread_num_ = max_thread_num_;
|
||
#else
|
||
thread_num_ = std::min(max_thread_num_, task_num);
|
||
#endif
|
||
int base = task_num / thread_num_;
|
||
int remain = task_num % thread_num_;
|
||
thread_state_[0].end = base + (0 < remain);
|
||
|
||
// 为主线程设置 thread_local_id
|
||
thread_local_id = 0;
|
||
|
||
for (int i = 1; i < thread_num_; i++) {
|
||
thread_state_[i].curr->store(thread_state_[i - 1].end,
|
||
std::memory_order_relaxed);
|
||
thread_state_[i].end = thread_state_[i - 1].end + base + (i < remain);
|
||
thread_state_[i].status->store(ThreadStatus::WORKING,
|
||
std::memory_order_release);
|
||
}
|
||
thread_state_[0].curr->store(0, std::memory_order_relaxed);
|
||
thread_state_[0].status->store(ThreadStatus::WORKING,
|
||
std::memory_order_release);
|
||
process_tasks(0);
|
||
for (int i = 1; i < thread_num_; i++) {
|
||
while (thread_state_[i].status->load(std::memory_order_acquire) ==
|
||
ThreadStatus::WORKING) {
|
||
}
|
||
}
|
||
}
|
||
|
||
void Backend::process_tasks(int thread_id) {
|
||
|
||
#ifdef USE_NUMA
|
||
if(numa_node == -1){
|
||
numa_node = thread_id * numa_num_configured_nodes() / thread_num_;
|
||
struct bitmask* mask = numa_bitmask_alloc(numa_num_configured_nodes());
|
||
numa_bitmask_setbit(mask, numa_node);
|
||
numa_bind(mask);
|
||
}
|
||
#endif
|
||
|
||
if (init_func_ != nullptr) {
|
||
init_func_(thread_id);
|
||
}
|
||
while (true) {
|
||
int task_id = thread_state_[thread_id].curr->fetch_add(
|
||
1, std::memory_order_acq_rel);
|
||
if (task_id >= thread_state_[thread_id].end) {
|
||
break;
|
||
}
|
||
compute_func_(task_id);
|
||
}
|
||
for (int t_offset = 1; t_offset < thread_num_; t_offset++) {
|
||
int t_i = (thread_id + t_offset) % thread_num_;
|
||
if (thread_state_[t_i].status->load(std::memory_order_acquire) !=
|
||
ThreadStatus::WORKING) {
|
||
continue;
|
||
}
|
||
while (true) {
|
||
int task_id = thread_state_[t_i].curr->fetch_add(
|
||
1, std::memory_order_acq_rel);
|
||
if (task_id >= thread_state_[t_i].end) {
|
||
break;
|
||
}
|
||
compute_func_(task_id);
|
||
}
|
||
}
|
||
if (finalize_func_ != nullptr) {
|
||
finalize_func_(thread_id);
|
||
}
|
||
thread_state_[thread_id].status->store(ThreadStatus::WAITING,
|
||
std::memory_order_release);
|
||
}
|
||
|
||
void Backend::worker_thread(int thread_id) {
|
||
auto start = std::chrono::steady_clock::now();
|
||
thread_local_id = thread_id; // 设置线程本地å<C2B0>˜é‡<C3A9>
|
||
while (true) {
|
||
ThreadStatus status =
|
||
thread_state_[thread_id].status->load(std::memory_order_acquire);
|
||
if (status == ThreadStatus::WORKING) {
|
||
process_tasks(thread_id);
|
||
start = std::chrono::steady_clock::now();
|
||
} else if (status == ThreadStatus::WAITING) {
|
||
auto now = std::chrono::steady_clock::now();
|
||
auto duration =
|
||
std::chrono::duration_cast<std::chrono::milliseconds>(now -
|
||
start)
|
||
.count();
|
||
if (duration > 50) {
|
||
std::this_thread::sleep_for(std::chrono::milliseconds(1));
|
||
}
|
||
} else if (status == ThreadStatus::EXIT) {
|
||
return;
|
||
}
|
||
}
|
||
} |