Mirror of https://github.com/kvcache-ai/ktransformers.git (synced 2026-05-05 23:50:14 +00:00)
* refactor: move legacy code to archive/ directory
  - Moved ktransformers, csrc, third_party, merge_tensors to archive/
  - Moved build scripts and configurations to archive/
  - Kept kt-kernel, KT-SFT, doc, and README files in root
  - Preserved complete git history for all moved files
* refactor: restructure repository to focus on kt-kernel and KT-SFT modules
* fix README
* fix README
* fix README
* fix README
* docs: add performance benchmarks to kt-kernel section
  Add comprehensive performance data for kt-kernel to match KT-SFT's presentation:
  - AMX kernel optimization: 21.3 TFLOPS (3.9× faster than PyTorch)
  - Prefill phase: up to 20× speedup vs baseline
  - Decode phase: up to 4× speedup
  - NUMA optimization: up to 63% throughput improvement
  - Multi-GPU (8×L20): 227.85 tokens/s total throughput with DeepSeek-R1 FP8
  Source: https://lmsys.org/blog/2025-10-22-KTransformers/
  This provides users with concrete performance metrics for both core modules, making it easier to understand the capabilities of each component.
* refactor: improve kt-kernel performance data with specific hardware and models
  Replace generic performance descriptions with concrete benchmarks:
  - Specify exact hardware: 8×L20 GPU + Xeon Gold 6454S, single/dual-socket Xeon + AMX
  - Include specific models: DeepSeek-R1-0528 (FP8), DeepSeek-V3 (671B)
  - Show detailed metrics: total throughput, output throughput, concurrency details
  - Match KT-SFT presentation style for consistency
  This provides users with actionable performance data they can use to evaluate hardware requirements and expected performance for their use cases.
* fix README
* docs: clean up performance table and improve formatting
* add pic for README
* refactor: simplify .gitmodules and backup legacy submodules
  - Remove 7 legacy submodules from root .gitmodules (archive/third_party/*)
  - Keep only 2 active submodules for kt-kernel (llama.cpp, pybind11)
  - Back up complete .gitmodules to archive/.gitmodules
  - Add documentation in archive/README.md for researchers who need legacy submodules
  This reduces initial clone size by ~500 MB and avoids downloading unused dependencies.
* refactor: move doc/ back to root directory
  Keep documentation in root for easier access and maintenance.
* refactor: consolidate all images to doc/assets/
  - Move kt-kernel/assets/heterogeneous_computing.png to doc/assets/
  - Remove KT-SFT/assets/ (images already in doc/assets/)
  - Update KT-SFT/README.md image references to ../doc/assets/
  - Eliminates ~7.9 MB image duplication
  - Centralizes all documentation assets in one location
* fix pic path for README
46 lines · No EOL · 1.4 KiB · Python
import torch


# Define a floating-point model containing a single linear layer
class LinearModel(torch.nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.linear = torch.nn.Linear(in_features, out_features)

    def forward(self, x):
        return self.linear(x)


# Create the floating-point model instance
in_features = 64
out_features = 128
model_fp32 = LinearModel(in_features, out_features)

# Create the quantized model instance. Dynamic quantization converts the
# weights of the listed layer types to int8 ahead of time; activations are
# quantized on the fly at inference time, and outputs are returned as fp32.
model_int8 = torch.ao.quantization.quantize_dynamic(
    model_fp32,          # the original floating-point model
    {torch.nn.Linear},   # set of layer types to quantize
    dtype=torch.qint8    # target dtype for the quantized weights
)

# Test the model on random input data
batch_size = 32
input_fp32 = torch.randn(1, batch_size, in_features)  # random input of shape (1, batch_size, in_features)
output_int8 = model_int8(input_fp32)  # run the data through the quantized model

# Print the shapes to verify the forward pass
print(f"Input shape: {input_fp32.shape}")
print(f"Output shape: {output_int8.shape}")

# Compare the outputs of the original and the quantized model
with torch.no_grad():
    output_fp32 = model_fp32(input_fp32)

print(f"First few FP32 output values: {output_fp32[0, 0, :5]}")
print(f"First few INT8 output values: {output_int8[0, 0, :5]}")

# Compute the mean absolute error introduced by quantization
error = torch.abs(output_fp32 - output_int8).mean().item()
print(f"Mean absolute error: {error}")

# Print the layer types before and after quantization
print(f"Layer type before quantization: {type(model_fp32.linear)}")
print(f"Layer type after quantization: {type(model_int8.linear)}")