From 70739cefbad43019bc91667a8a39986f539293cb Mon Sep 17 00:00:00 2001
From: xwy-amd8
Date: Wed, 17 Dec 2025 10:00:13 +0000
Subject: [PATCH] [fix](test): fix kt-kernel imports

---
 kt-kernel/bench/bench_attention.py | 37 +--
 kt-kernel/bench/bench_attention_torch.py | 23 +-
 kt-kernel/bench/bench_k2_moe_amx.py | 60 +---
 kt-kernel/bench/bench_k2_write_buffer.py | 40 +--
 kt-kernel/bench/bench_linear.py | 81 +++---
 kt-kernel/bench/bench_mla.py | 163 ++++++-----
 kt-kernel/bench/bench_mlp.py | 146 +++++-----
 kt-kernel/bench/bench_moe.py | 131 +++++----
 kt-kernel/bench/bench_moe_amx.py | 2 +-
 kt-kernel/bench/bench_moe_amx_k.py | 131 +++++----
 kt-kernel/bench/bench_moe_kernel.py | 2 +-
 kt-kernel/bench/bench_moe_kernel_tiling.py | 2 +-
 kt-kernel/bench/bench_moe_kml.py | 2 +-
 kt-kernel/examples/test-debug.py | 25 +-
 kt-kernel/examples/test_attention.py | 38 +--
 kt-kernel/examples/test_awq_moe_amx.py | 2 +-
 kt-kernel/examples/test_deepseekv3.py | 83 +++---
 kt-kernel/examples/test_deepseekv3_prefill.py | 78 ++----
 .../examples/test_deepseekv3_prefill_speed.py | 93 +++---
 kt-kernel/examples/test_gate.py | 160 +++++------
 kt-kernel/examples/test_k2_moe_amx.py | 9 +-
 kt-kernel/examples/test_k2_write_buffer.py | 19 +-
 kt-kernel/examples/test_linear.py | 39 ++-
 kt-kernel/examples/test_mla.py | 264 +++++++++---------
 kt-kernel/examples/test_mla_qlen.py | 84 +++---
 kt-kernel/examples/test_mla_quant.py | 108 ++++---
 kt-kernel/examples/test_mla_torch.py | 213 +++++++-------
 kt-kernel/examples/test_mlp.py | 70 +++--
 kt-kernel/examples/test_moe.py | 101 ++++---
 kt-kernel/examples/test_moe_amx.py | 2 +-
 kt-kernel/examples/test_moe_kernel.py | 2 +-
 kt-kernel/examples/test_moe_kml.py | 2 +-
 kt-kernel/python/experts_base.py | 2 +-
 33 files changed, 1063 insertions(+), 1151 deletions(-)

diff --git a/kt-kernel/bench/bench_attention.py b/kt-kernel/bench/bench_attention.py
index edbc8e26..f30e29dc 100644
--- a/kt-kernel/bench/bench_attention.py
+++ b/kt-kernel/bench/bench_attention.py
@@ -1,19 +1,19 @@ #!/usr/bin/env python # coding=utf-8 """ -Description : +Description : Author : Jianwei Dong Date : 2024-08-28 10:32:05 Version : 1.0.0 -LastEditors : Jianwei Dong +LastEditors : Jianwei Dong LastEditTime : 2024-08-28 10:32:05 -Copyright (c) 2024 by KVCache.AI, All Rights Reserved. +Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
""" import os, sys import time sys.path.append(os.path.dirname(__file__) + "/../build") -import kt_kernel_ext +from kt_kernel import kt_kernel_ext import torch layer_num = 10 @@ -61,11 +61,7 @@ def bench_linear(cache_seqlen: int): max_thread_num, ) local_kvcache = kt_kernel_ext.kvcache.KVCache(config) - block_table = ( - torch.arange(max_block_num, dtype=torch.int32, device="cpu") - .contiguous() - .view(1, -1) - ) + block_table = torch.arange(max_block_num, dtype=torch.int32, device="cpu").contiguous().view(1, -1) for layer_idx in range(layer_num): k_cache = torch.randn( @@ -93,17 +89,11 @@ def bench_linear(cache_seqlen: int): ) CPUInfer.sync() - input = torch.randn( - (1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu" - ).contiguous() - output = torch.empty( - (1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu" - ).contiguous() + input = torch.randn((1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu").contiguous() + output = torch.empty((1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu").contiguous() # attn_lse: (bsz, q_len, q_head_num) - attn_lse = torch.empty( - (1, 1, q_head_num), dtype=torch.float32, device="cpu" - ).contiguous() + attn_lse = torch.empty((1, 1, q_head_num), dtype=torch.float32, device="cpu").contiguous() input = input / 100 # warm up @@ -156,16 +146,7 @@ def bench_linear(cache_seqlen: int): print("Time(us) per iteration: ", total_time / test_iter * 1000000) print( "Bandwidth: ", - cache_seqlen - * kv_head_num - * head_dim - * 2 - * 2 - * test_iter - / total_time - / 1000 - / 1000 - / 1000, + cache_seqlen * kv_head_num * head_dim * 2 * 2 * test_iter / total_time / 1000 / 1000 / 1000, "GB/s", ) print("") diff --git a/kt-kernel/bench/bench_attention_torch.py b/kt-kernel/bench/bench_attention_torch.py index 8331d096..1e8af316 100644 --- a/kt-kernel/bench/bench_attention_torch.py +++ b/kt-kernel/bench/bench_attention_torch.py @@ -1,19 +1,19 @@ #!/usr/bin/env python # coding=utf-8 """ -Description : +Description : Author : Jianwei Dong Date : 2024-08-28 10:32:05 Version : 1.0.0 -LastEditors : Jianwei Dong +LastEditors : Jianwei Dong LastEditTime : 2024-08-28 10:32:05 -Copyright (c) 2024 by KVCache.AI, All Rights Reserved. +Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
""" import os, sys import time sys.path.append(os.path.dirname(__file__) + "/../build") -import kt_kernel_ext +from kt_kernel import kt_kernel_ext import torch layer_num = 10 @@ -45,9 +45,7 @@ def bench_linear(cache_seqlen: int, device): kvcaches.append((k_cache, v_cache)) - input = torch.randn( - (1, q_head_num, 1, head_dim), dtype=torch.float16, device=device - ).contiguous() + input = torch.randn((1, q_head_num, 1, head_dim), dtype=torch.float16, device=device).contiguous() input = input / 100 # warm up @@ -70,16 +68,7 @@ def bench_linear(cache_seqlen: int, device): print("Time(us) per iteration: ", total_time / test_iter * 1000000) print( "Bandwidth: ", - cache_seqlen - * q_head_num - * head_dim - * 2 - * 2 - * test_iter - / total_time - / 1000 - / 1000 - / 1000, + cache_seqlen * q_head_num * head_dim * 2 * 2 * test_iter / total_time / 1000 / 1000 / 1000, "GB/s", ) print("") diff --git a/kt-kernel/bench/bench_k2_moe_amx.py b/kt-kernel/bench/bench_k2_moe_amx.py index 50f5837e..f8880b7a 100644 --- a/kt-kernel/bench/bench_k2_moe_amx.py +++ b/kt-kernel/bench/bench_k2_moe_amx.py @@ -15,7 +15,7 @@ from tqdm import tqdm sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build")) -import kt_kernel_ext +from kt_kernel import kt_kernel_ext import torch # Benchmark parameters (single MoE, no layer loop) @@ -29,9 +29,7 @@ warm_up_iter = 1000 test_iter = 5000 k_group_size = 32 -physical_to_logical_map = ( - torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous() -) +physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous() worker_config = kt_kernel_ext.WorkerPoolConfig() worker_config.subpool_count = 2 @@ -43,24 +41,12 @@ CPUInfer = kt_kernel_ext.CPUInfer(worker_config) def get_git_commit(): result = {} try: - commit = ( - subprocess.check_output(["git", "rev-parse", "HEAD"]) - .decode("utf-8") - .strip() - ) - commit_msg = ( - subprocess.check_output(["git", "log", "-1", "--pretty=%B"]) - .decode("utf-8") - .strip() - ) + commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip() + commit_msg = subprocess.check_output(["git", "log", "-1", "--pretty=%B"]).decode("utf-8").strip() result["commit"] = commit result["commit_message"] = commit_msg - dirty_output = ( - subprocess.check_output(["git", "status", "--porcelain"]) - .decode("utf-8") - .strip() - ) + dirty_output = subprocess.check_output(["git", "status", "--porcelain"]).decode("utf-8").strip() if dirty_output: result["dirty"] = True result["dirty_files"] = dirty_output.splitlines() @@ -132,9 +118,7 @@ def record_results(result, filename=json_path): f.write(json.dumps(result) + "\n") -def pack_to_int32( - value: torch.Tensor, num_bits: int, packed_dim: int = 1 -) -> torch.Tensor: +def pack_to_int32(value: torch.Tensor, num_bits: int, packed_dim: int = 1) -> torch.Tensor: if value.dtype is not torch.int8: raise ValueError("Tensor must be torch.int8 before packing") if not (1 <= num_bits <= 8): @@ -181,9 +165,7 @@ def quantize_k2_tensor(weights: torch.Tensor, group_size: int): weights_f32 = weights.to(torch.float32) e, rows, cols = weights_f32.shape if cols % group_size != 0 or cols % 2 != 0: - raise ValueError( - f"cols ({cols}) must be divisible by group_size ({group_size}) and 2" - ) + raise ValueError(f"cols ({cols}) must be divisible by group_size ({group_size}) and 2") reshaped = weights_f32.view(e, rows, cols // group_size, group_size) max_abs = reshaped.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) @@ -191,9 +173,7 
@@ def quantize_k2_tensor(weights: torch.Tensor, group_size: int): q = torch.round(reshaped / scales.unsqueeze(-1)).clamp(-8, 7).to(torch.int8) q = q.view(e, rows, cols) packed = pack_tensor_per_row(q, num_bits=4).view(e, rows, cols // 8).contiguous() - scales = scales.to(torch.bfloat16).contiguous().view( - e, rows, cols // group_size - ).contiguous() + scales = scales.to(torch.bfloat16).contiguous().view(e, rows, cols // group_size).contiguous() return packed, scales @@ -233,9 +213,7 @@ def bench_k2_moe(): bytes_per_elem = 0.5 + 2.0 / k_group_size quant_data = build_quantized_layer_weights() - config = kt_kernel_ext.moe.MOEConfig( - expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0 - ) + config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0) config.max_len = max_len config.quant_config.bits = 4 config.quant_config.group_size = k_group_size @@ -261,12 +239,8 @@ def bench_k2_moe(): .reshape(gen_iter, qlen * num_experts_per_tok) .contiguous() ) - weights = torch.rand( - (gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu" - ).contiguous() - input_tensor = torch.randn( - (qlen, hidden_size), dtype=torch.bfloat16, device="cpu" - ).contiguous() + weights = torch.rand((gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").contiguous() + input_tensor = torch.randn((qlen, hidden_size), dtype=torch.bfloat16, device="cpu").contiguous() output_tensor = torch.empty_like(input_tensor) bsz_tensor = torch.tensor([qlen], device="cpu") @@ -313,17 +287,7 @@ def bench_k2_moe(): / total_time / 1e9 ) - flops = ( - hidden_size - * intermediate_size - * qlen - * 3 - * num_experts_per_tok - * 2 - * test_iter - / total_time - / 1e12 - ) + flops = hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12 print("Quant mode: int4_k2") print("Time(s): ", total_time) diff --git a/kt-kernel/bench/bench_k2_write_buffer.py b/kt-kernel/bench/bench_k2_write_buffer.py index 940a0247..30e042c9 100644 --- a/kt-kernel/bench/bench_k2_write_buffer.py +++ b/kt-kernel/bench/bench_k2_write_buffer.py @@ -14,7 +14,7 @@ from tqdm import tqdm sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build")) -import kt_kernel_ext +from kt_kernel import kt_kernel_ext import torch # Benchmark parameters (single MoE, mirror examples/test_k2_write_buffer.py) @@ -39,20 +39,12 @@ CPUInfer = kt_kernel_ext.CPUInfer(96) def get_git_commit(): result = {} try: - commit = ( - subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip() - ) - commit_msg = ( - subprocess.check_output(["git", "log", "-1", "--pretty=%B"]) - .decode("utf-8") - .strip() - ) + commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip() + commit_msg = subprocess.check_output(["git", "log", "-1", "--pretty=%B"]).decode("utf-8").strip() result["commit"] = commit result["commit_message"] = commit_msg - dirty_output = ( - subprocess.check_output(["git", "status", "--porcelain"]).decode("utf-8").strip() - ) + dirty_output = subprocess.check_output(["git", "status", "--porcelain"]).decode("utf-8").strip() if dirty_output: result["dirty"] = True result["dirty_files"] = dirty_output.splitlines() @@ -160,9 +152,7 @@ def build_moe(): per_mat_scale_elems, ) = allocate_weights() - config = kt_kernel_ext.moe.MOEConfig( - expert_num, num_experts_per_tok, hidden_size, intermediate_size - ) + config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, 
intermediate_size) config.max_len = max_len config.quant_config.bits = 4 config.quant_config.group_size = group_size @@ -186,18 +176,10 @@ def build_moe(): total_weight_bytes_per_tp = gpu_experts_num * weight_bytes_per_expert_per_tp total_scale_elems_per_tp = gpu_experts_num * scale_elems_per_expert_per_tp - w13_weight_bufs = [ - torch.empty(2 * total_weight_bytes_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count) - ] - w13_scale_bufs = [ - torch.empty(2 * total_scale_elems_per_tp, dtype=torch.bfloat16) for _ in range(gpu_tp_count) - ] - w2_weight_bufs = [ - torch.empty(total_weight_bytes_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count) - ] - w2_scale_bufs = [ - torch.empty(total_scale_elems_per_tp, dtype=torch.bfloat16) for _ in range(gpu_tp_count) - ] + w13_weight_bufs = [torch.empty(2 * total_weight_bytes_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count)] + w13_scale_bufs = [torch.empty(2 * total_scale_elems_per_tp, dtype=torch.bfloat16) for _ in range(gpu_tp_count)] + w2_weight_bufs = [torch.empty(total_weight_bytes_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count)] + w2_scale_bufs = [torch.empty(total_scale_elems_per_tp, dtype=torch.bfloat16) for _ in range(gpu_tp_count)] buffer_ptrs = { "w13_weight_ptrs": [buf.data_ptr() for buf in w13_weight_bufs], @@ -248,7 +230,7 @@ def bench_write_buffer(): ) ) CPUInfer.sync() - + total_time = 0 for _ in tqdm(range(test_iter), desc="Testing"): start = time.perf_counter() @@ -265,8 +247,6 @@ def bench_write_buffer(): time.sleep(0.6) print(end - start) - - time_per_iter_us = total_time / test_iter * 1e6 bandwidth_gbs = bytes_per_call * test_iter / total_time / 1e9 diff --git a/kt-kernel/bench/bench_linear.py b/kt-kernel/bench/bench_linear.py index 42f3526d..8c1fb842 100644 --- a/kt-kernel/bench/bench_linear.py +++ b/kt-kernel/bench/bench_linear.py @@ -1,18 +1,19 @@ #!/usr/bin/env python # coding=utf-8 -''' -Description : +""" +Description : Author : chenht2022 Date : 2024-07-25 10:31:59 Version : 1.0.0 -LastEditors : chenht2022 +LastEditors : chenht2022 LastEditTime : 2024-08-06 10:35:35 -Copyright (c) 2024 by KVCache.AI, All Rights Reserved. -''' +Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
+""" import os, sys import time -sys.path.append(os.path.dirname(__file__) + '/../build') -import kt_kernel_ext + +sys.path.append(os.path.dirname(__file__) + "/../build") +from kt_kernel import kt_kernel_ext import torch input_size = 16384 @@ -25,65 +26,64 @@ CPUInfer = kt_kernel_ext.CPUInfer(64) warm_up_iter = 1000 test_iter = 10000 + def bench_linear(quant_mode: str): with torch.inference_mode(mode=True): - hidden_type = 30 # ggml_type::GGML_TYPE_BF16 + hidden_type = 30 # ggml_type::GGML_TYPE_BF16 if quant_mode == "fp32": - proj_type = 0 # ggml_type::GGML_TYPE_F32 + proj_type = 0 # ggml_type::GGML_TYPE_F32 bytes_per_elem = 4.000000 elif quant_mode == "fp16": - proj_type = 1 # ggml_type::GGML_TYPE_F16 + proj_type = 1 # ggml_type::GGML_TYPE_F16 bytes_per_elem = 2.000000 elif quant_mode == "bf16": - proj_type = 30 # ggml_type::GGML_TYPE_BF16 + proj_type = 30 # ggml_type::GGML_TYPE_BF16 bytes_per_elem = 2.000000 elif quant_mode == "q8_0": - proj_type = 8 # ggml_type::GGML_TYPE_Q8_0 + proj_type = 8 # ggml_type::GGML_TYPE_Q8_0 bytes_per_elem = 1.062500 elif quant_mode == "q6_k": - proj_type = 14 # ggml_type::GGML_TYPE_Q6_K + proj_type = 14 # ggml_type::GGML_TYPE_Q6_K bytes_per_elem = 0.820312 elif quant_mode == "q5_k_m": - proj_type = 13 # ggml_type::GGML_TYPE_Q5_K + proj_type = 13 # ggml_type::GGML_TYPE_Q5_K bytes_per_elem = 0.687500 elif quant_mode == "q4_k_m": - proj_type = 12 # ggml_type::GGML_TYPE_Q4_K + proj_type = 12 # ggml_type::GGML_TYPE_Q4_K bytes_per_elem = 0.562500 elif quant_mode == "q3_k_m": - proj_type = 11 # ggml_type::GGML_TYPE_Q3_K + proj_type = 11 # ggml_type::GGML_TYPE_Q3_K bytes_per_elem = 0.429688 elif quant_mode == "q2_k": - proj_type = 10 # ggml_type::GGML_TYPE_Q2_K + proj_type = 10 # ggml_type::GGML_TYPE_Q2_K bytes_per_elem = 0.328125 elif quant_mode == "iq3_xs": - proj_type = 21 # ggml_type::GGML_TYPE_IQ3_S + proj_type = 21 # ggml_type::GGML_TYPE_IQ3_S bytes_per_elem = 0.429688 elif quant_mode == "iq2_xxs": - proj_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS + proj_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS bytes_per_elem = 0.257812 else: - assert(False) + assert False linears = [] projs = [] for _ in range(layer_num): - proj = torch.randn((output_size, input_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous() - config = kt_kernel_ext.linear.LinearConfig(input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type) + proj = torch.randn((output_size, input_size), dtype=torch.float32, device="cuda").to("cpu").contiguous() + config = kt_kernel_ext.linear.LinearConfig( + input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type + ) linear = kt_kernel_ext.linear.Linear(config) projs.append(proj) linears.append(linear) - input = torch.randn((layer_num, qlen, input_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous() - output = torch.empty((layer_num, qlen, output_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous() + input = torch.randn((layer_num, qlen, input_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous() + output = torch.empty((layer_num, qlen, output_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous() # warm up for i in range(warm_up_iter): CPUInfer.submit( - linears[i % layer_num].forward( - qlen, - input[i % layer_num].data_ptr(), - output[i % layer_num].data_ptr() - ) + linears[i % layer_num].forward(qlen, input[i % layer_num].data_ptr(), output[i % layer_num].data_ptr()) ) CPUInfer.sync() @@ -91,21 +91,22 @@ def 
bench_linear(quant_mode: str): start = time.perf_counter() for i in range(test_iter): CPUInfer.submit( - linears[i % layer_num].forward( - qlen, - input[i % layer_num].data_ptr(), - output[i % layer_num].data_ptr() - ) + linears[i % layer_num].forward(qlen, input[i % layer_num].data_ptr(), output[i % layer_num].data_ptr()) ) CPUInfer.sync() end = time.perf_counter() total_time = end - start - print('Quant mode: ', quant_mode) - print('Time(s): ', total_time) - print('Iteration: ', test_iter) - print('Time(us) per iteration: ', total_time / test_iter * 1000000) - print('Bandwidth: ', input_size * output_size * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s') - print('') + print("Quant mode: ", quant_mode) + print("Time(s): ", total_time) + print("Iteration: ", test_iter) + print("Time(us) per iteration: ", total_time / test_iter * 1000000) + print( + "Bandwidth: ", + input_size * output_size * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, + "GB/s", + ) + print("") + bench_linear("fp32") bench_linear("fp16")
diff --git a/kt-kernel/bench/bench_mla.py b/kt-kernel/bench/bench_mla.py
index 40976283..5218a363 100644
--- a/kt-kernel/bench/bench_mla.py
+++ b/kt-kernel/bench/bench_mla.py
@@ -3,9 +3,10 @@ import time import subprocess import platform import json + os.environ["BLAS_NUM_THREADS"] = "1" -sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'build')) -import kt_kernel_ext +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build")) +from kt_kernel import kt_kernel_ext from kt_kernel_ext.kvcache import ggml_type import torch from torch import inf, nn
@@ -31,9 +32,9 @@ layer_num = 10 rope_theta = 10000 -max_qlen = qlen+kvlen +max_qlen = qlen + kvlen max_kvlen = 4096 -max_position_embeddings = 163840 +max_position_embeddings = 163840 rope_scaling = { "beta_fast": 32,
@@ -42,7 +43,7 @@ rope_scaling = { "mscale": 1.0, "mscale_all_dim": 1.0, "original_max_position_embeddings": 4096, - "type": "yarn" + "type": "yarn", } CPUINFER_PARAM = 304
@@ -54,13 +55,12 @@ warm_up_iter = 20 test_iter = 100 - - # Script metadata, used to build the results filename script_path = os.path.abspath(__file__) script_dir = os.path.dirname(script_path) script_name = os.path.splitext(os.path.basename(script_path))[0] -json_path = os.path.join(script_dir, "bench_results "+ ".jsonl") +json_path = os.path.join(script_dir, "bench_results " + ".jsonl") + def get_git_commit(): """
@@ -100,9 +100,9 @@ def get_system_info(): # Get the CPU model (Linux only) cpu_model = None - if os.path.exists('/proc/cpuinfo'): + if os.path.exists("/proc/cpuinfo"): try: - with open('/proc/cpuinfo', 'r') as f: + with open("/proc/cpuinfo", "r") as f: for line in f: if "model name" in line: cpu_model = line.split(":", 1)[1].strip()
@@ -113,9 +113,9 @@ # Get total memory size in GB (Linux only) mem_total_gb = None - if os.path.exists('/proc/meminfo'): + if os.path.exists("/proc/meminfo"): try: - with open('/proc/meminfo', 'r') as f: + with open("/proc/meminfo", "r") as f: for line in f: if "MemTotal" in line: mem_kb = float(line.split(":", 1)[1].split()[0])
@@ -149,6 +149,7 @@ with open(filename, "a") as f: f.write(json.dumps(result) + "\n") + def bench_mla(quant_mode: str): """ Benchmark the performance of the MLA model
@@ -171,22 +172,22 @@ w_o_type = 1 bytes_per_elem = 2.000000 elif quant_mode == "q4_k_m": - q_a_proj_type = 12 # ggml_type::GGML_TYPE_Q4_K + q_a_proj_type = 12 # ggml_type::GGML_TYPE_Q4_K q_b_proj_type = 12 - kv_a_proj_with_mqa_type = 12 # ggml_type::GGML_TYPE_Q6_K + kv_a_proj_with_mqa_type = 12 # ggml_type::GGML_TYPE_Q4_K kv_b_proj_type = 12 w_o_type = 12 bytes_per_elem = 0.5625 else: raise ValueError("Unsupported quantization mode") - - # Build the inputs for each layer's MLA model + + # Build the inputs for each layer's MLA model mlas = [] for i in tqdm(range(layer_num)): q_a_proj = nn.Linear(hidden_size, q_lora_rank, bias=False, dtype=torch.float16) - q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size+rope_size) , bias=False, dtype=torch.float16) + q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size + rope_size), bias=False, dtype=torch.float16) kv_a_proj_with_mqa = nn.Linear(hidden_size, kv_lora_rank + rope_size, bias=False, dtype=torch.float16) - kv_b_proj = nn.Linear( num_heads * (nope_size + nope_size),kv_lora_rank, bias=False, dtype=torch.float16) + kv_b_proj = nn.Linear(num_heads * (nope_size + nope_size), kv_lora_rank, bias=False, dtype=torch.float16) o_proj = nn.Linear(num_heads * nope_size, hidden_size, bias=False, dtype=torch.float16) init.normal_(q_a_proj.weight, mean=0.0, std=0.02)
@@ -194,11 +195,11 @@ init.normal_(kv_a_proj_with_mqa.weight, mean=0.0, std=0.02) init.normal_(kv_b_proj.weight, mean=0.0, std=0.02) init.normal_(o_proj.weight, mean=0.0, std=0.02) - q_a_proj_weight = q_a_proj.weight.to(torch.float16).to('cpu').contiguous() - q_b_proj_weight = q_b_proj.weight.to(torch.float16).to('cpu').contiguous() - kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to('cpu').to(torch.float16).contiguous() - kv_b_proj_weight = kv_b_proj.weight.to(torch.float16).to('cpu').contiguous() - o_proj_weight = o_proj.weight.to(torch.float16).to('cpu').contiguous() + q_a_proj_weight = q_a_proj.weight.to(torch.float16).to("cpu").contiguous() + q_b_proj_weight = q_b_proj.weight.to(torch.float16).to("cpu").contiguous() + kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to("cpu").to(torch.float16).contiguous() + kv_b_proj_weight = kv_b_proj.weight.to(torch.float16).to("cpu").contiguous() + o_proj_weight = o_proj.weight.to(torch.float16).to("cpu").contiguous() config = kt_kernel_ext.mla.MLAConfig( hidden_size,
@@ -210,7 +211,7 @@ ) config.max_qlen = max_qlen config.max_kvlen = max_kvlen - config.max_position_embeddings = max_position_embeddings + config.max_position_embeddings = max_position_embeddings config.rope_scaling_factor = rope_scaling["factor"] config.rope_theta = rope_theta config.rope_scaling_beta_fast = rope_scaling["beta_fast"]
@@ -231,64 +232,85 @@ config.kv_b_proj_type = ggml_type.FP16 config.w_o_type = ggml_type.FP16 - config.pool = CPUInfer.backend_ - - mla = kt_kernel_ext.mla.MLA(config) mla.load_weights() mla.set_local_pages(pages_count) mlas.append(mla) - print('Generating data...') - input_tensor = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").to("cpu").contiguous() - output_tensor = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").to("cpu").contiguous() - - print('Warming up...') + print("Generating data...") + input_tensor = ( + torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").to("cpu").contiguous() + ) + output_tensor = ( + torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").to("cpu").contiguous() + ) + + print("Warming up...") for i in tqdm(range(warm_up_iter)): - mlas[i%layer_num].forward([qlen],[page_table],[kvlen], - input_tensor[i%layer_num].data_ptr(),output_tensor[i%layer_num].data_ptr()) + mlas[i % layer_num].forward( + [qlen], + [page_table], + [kvlen], + input_tensor[i % layer_num].data_ptr(), + output_tensor[i % layer_num].data_ptr(), + ) - - print('Start testing...') + print("Start testing...") start = time.perf_counter() for i in tqdm(range(test_iter)): - mlas[i%layer_num].forward([qlen],[page_table],[kvlen], - input_tensor[i%layer_num].data_ptr(),output_tensor[i%layer_num].data_ptr()) + mlas[i % layer_num].forward( + [qlen], + [page_table], + [kvlen], + input_tensor[i % layer_num].data_ptr(), + output_tensor[i % layer_num].data_ptr(), + ) end = time.perf_counter() total_time = end - start time_per_iter_us = (total_time * 1e6) / test_iter - bandwidth = bytes_per_elem * (q_lora_rank * hidden_size - + (kv_lora_rank+rope_size) * hidden_size - + (nope_size+rope_size) * q_lora_rank * num_heads - + (nope_size+nope_size)*kv_lora_rank * num_heads - + hidden_size * nope_size * num_heads - + hidden_size * qlen) * test_iter / (total_time * 1e9) - flops = 2*( - q_lora_rank*hidden_size*qlen - + kv_lora_rank * hidden_size * qlen - +num_heads* (nope_size+rope_size)*q_lora_rank*qlen - + num_heads * qlen * nope_size * kv_lora_rank - + num_heads * (kvlen+qlen) * kv_lora_rank * qlen - + num_heads * rope_size * qlen * (qlen+kvlen) - + num_heads * kv_lora_rank * (qlen + kvlen) * qlen - + num_heads * nope_size * kv_lora_rank * qlen - + hidden_size * num_heads* nope_size * qlen - ) * test_iter / (total_time * 1e12) + bandwidth = ( + bytes_per_elem + * ( + q_lora_rank * hidden_size + + (kv_lora_rank + rope_size) * hidden_size + + (nope_size + rope_size) * q_lora_rank * num_heads + + (nope_size + nope_size) * kv_lora_rank * num_heads + + hidden_size * nope_size * num_heads + + hidden_size * qlen + ) + * test_iter + / (total_time * 1e9) + ) + flops = ( + 2 + * ( + q_lora_rank * hidden_size * qlen + + kv_lora_rank * hidden_size * qlen + + num_heads * (nope_size + rope_size) * q_lora_rank * qlen + + num_heads * qlen * nope_size * kv_lora_rank + + num_heads * (kvlen + qlen) * kv_lora_rank * qlen + + num_heads * rope_size * qlen * (qlen + kvlen) + + num_heads * kv_lora_rank * (qlen + kvlen) * qlen + + num_heads * nope_size * kv_lora_rank * qlen + + hidden_size * num_heads * nope_size * qlen + ) + * test_iter + / (total_time * 1e12) + ) - - print('Quant mode:', quant_mode) - print('Time(s):', total_time) - print('Iteration:', test_iter) - print('Time(us) per iteration:', time_per_iter_us) - print('Bandwidth:', bandwidth, 'GB/s') - print('TFLOPS:', flops) - print('') + print("Quant mode:", quant_mode) + print("Time(s):", total_time) + print("Iteration:", test_iter) + print("Time(us) per iteration:", time_per_iter_us) + print("Bandwidth:", bandwidth, "GB/s") + print("TFLOPS:", flops) + print("") # Collect the test results result = {
@@ -301,7 +323,7 @@ "flops_TFLOPS": flops, "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), "test_parameters": { - "qlen": qlen, + "qlen": qlen, "kvlen": kvlen, "page_table": page_table, "page_size": page_size,
@@ -312,21 +334,16 @@ "q_lora_rank": q_lora_rank, "nope_size": nope_size, "rope_size": rope_size, - - "layer_num": layer_num, - "rope_theta": rope_theta, "max_qlen": max_qlen, "max_kvlen": max_kvlen, "max_position_embeddings": max_position_embeddings, - "rope_scaling": rope_scaling, - "warm_up_iter": warm_up_iter, "test_iter": test_iter, - "CPUInfer_parameter": CPUINFER_PARAM - } + "CPUInfer_parameter": CPUINFER_PARAM, + }, } # Attach git and system info result.update(get_git_commit())
@@ -334,6 +351,6 @@ # Record the results to the JSON file
print(result) record_results(result) - -bench_mla("fp16") \ No newline at end of file + +bench_mla("fp16") diff --git a/kt-kernel/bench/bench_mlp.py b/kt-kernel/bench/bench_mlp.py index 1e80cf94..175e0bf3 100644 --- a/kt-kernel/bench/bench_mlp.py +++ b/kt-kernel/bench/bench_mlp.py @@ -1,18 +1,19 @@ #!/usr/bin/env python # coding=utf-8 -''' -Description : +""" +Description : Author : chenht2022 Date : 2024-07-16 10:43:18 Version : 1.0.0 -LastEditors : chenht2022 +LastEditors : chenht2022 LastEditTime : 2024-08-06 10:36:04 -Copyright (c) 2024 by KVCache.AI, All Rights Reserved. -''' +Copyright (c) 2024 by KVCache.AI, All Rights Reserved. +""" import os, sys import time -sys.path.append(os.path.dirname(__file__) + '/../build') -import kt_kernel_ext + +sys.path.append(os.path.dirname(__file__) + "/../build") +from kt_kernel import kt_kernel_ext import torch hidden_size = 5120 @@ -25,94 +26,108 @@ CPUInfer = kt_kernel_ext.CPUInfer(64) warm_up_iter = 1000 test_iter = 10000 + def bench_mlp(quant_mode: str): with torch.inference_mode(mode=True): - hidden_type = 30 # ggml_type::GGML_TYPE_BF16 + hidden_type = 30 # ggml_type::GGML_TYPE_BF16 if quant_mode == "fp32": - gate_type = 0 # ggml_type::GGML_TYPE_F32 - up_type = 0 # ggml_type::GGML_TYPE_F32 - down_type = 0 # ggml_type::GGML_TYPE_F32 + gate_type = 0 # ggml_type::GGML_TYPE_F32 + up_type = 0 # ggml_type::GGML_TYPE_F32 + down_type = 0 # ggml_type::GGML_TYPE_F32 bytes_per_elem = 4.000000 elif quant_mode == "fp16": - gate_type = 1 # ggml_type::GGML_TYPE_F16 - up_type = 1 # ggml_type::GGML_TYPE_F16 - down_type = 1 # ggml_type::GGML_TYPE_F16 + gate_type = 1 # ggml_type::GGML_TYPE_F16 + up_type = 1 # ggml_type::GGML_TYPE_F16 + down_type = 1 # ggml_type::GGML_TYPE_F16 bytes_per_elem = 2.000000 elif quant_mode == "bf16": - gate_type = 30 # ggml_type::GGML_TYPE_BF16 - up_type = 30 # ggml_type::GGML_TYPE_BF16 - down_type = 30 # ggml_type::GGML_TYPE_BF16 + gate_type = 30 # ggml_type::GGML_TYPE_BF16 + up_type = 30 # ggml_type::GGML_TYPE_BF16 + down_type = 30 # ggml_type::GGML_TYPE_BF16 bytes_per_elem = 2.000000 elif quant_mode == "q8_0": - gate_type = 8 # ggml_type::GGML_TYPE_Q8_0 - up_type = 8 # ggml_type::GGML_TYPE_Q8_0 - down_type = 8 # ggml_type::GGML_TYPE_Q8_0 + gate_type = 8 # ggml_type::GGML_TYPE_Q8_0 + up_type = 8 # ggml_type::GGML_TYPE_Q8_0 + down_type = 8 # ggml_type::GGML_TYPE_Q8_0 bytes_per_elem = 1.062500 elif quant_mode == "q6_k": - gate_type = 14 # ggml_type::GGML_TYPE_Q6_K - up_type = 14 # ggml_type::GGML_TYPE_Q6_K - down_type = 14 # ggml_type::GGML_TYPE_Q6_K + gate_type = 14 # ggml_type::GGML_TYPE_Q6_K + up_type = 14 # ggml_type::GGML_TYPE_Q6_K + down_type = 14 # ggml_type::GGML_TYPE_Q6_K bytes_per_elem = 0.820312 elif quant_mode == "q5_k_m": - gate_type = 13 # ggml_type::GGML_TYPE_Q5_K - up_type = 13 # ggml_type::GGML_TYPE_Q5_K - down_type = 14 # ggml_type::GGML_TYPE_Q6_K + gate_type = 13 # ggml_type::GGML_TYPE_Q5_K + up_type = 13 # ggml_type::GGML_TYPE_Q5_K + down_type = 14 # ggml_type::GGML_TYPE_Q6_K bytes_per_elem = 0.731771 elif quant_mode == "q4_k_m": - gate_type = 12 # ggml_type::GGML_TYPE_Q4_K - up_type = 12 # ggml_type::GGML_TYPE_Q4_K - down_type = 14 # ggml_type::GGML_TYPE_Q6_K + gate_type = 12 # ggml_type::GGML_TYPE_Q4_K + up_type = 12 # ggml_type::GGML_TYPE_Q4_K + down_type = 14 # ggml_type::GGML_TYPE_Q6_K bytes_per_elem = 0.648437 elif quant_mode == "q3_k_m": - gate_type = 11 # ggml_type::GGML_TYPE_Q3_K - up_type = 11 # ggml_type::GGML_TYPE_Q3_K - down_type = 13 # ggml_type::GGML_TYPE_Q5_K + gate_type = 11 # 
ggml_type::GGML_TYPE_Q3_K + up_type = 11 # ggml_type::GGML_TYPE_Q3_K + down_type = 13 # ggml_type::GGML_TYPE_Q5_K bytes_per_elem = 0.515625 elif quant_mode == "q2_k": - gate_type = 10 # ggml_type::GGML_TYPE_Q2_K - up_type = 10 # ggml_type::GGML_TYPE_Q2_K - down_type = 11 # ggml_type::GGML_TYPE_Q3_K + gate_type = 10 # ggml_type::GGML_TYPE_Q2_K + up_type = 10 # ggml_type::GGML_TYPE_Q2_K + down_type = 11 # ggml_type::GGML_TYPE_Q3_K bytes_per_elem = 0.328125 elif quant_mode == "iq3_xs": - gate_type = 21 # ggml_type::GGML_TYPE_IQ3_S - up_type = 21 # ggml_type::GGML_TYPE_IQ3_S - down_type = 21 # ggml_type::GGML_TYPE_IQ3_S + gate_type = 21 # ggml_type::GGML_TYPE_IQ3_S + up_type = 21 # ggml_type::GGML_TYPE_IQ3_S + down_type = 21 # ggml_type::GGML_TYPE_IQ3_S bytes_per_elem = 0.429688 elif quant_mode == "iq2_xxs": - gate_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS - up_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS - down_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS + gate_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS + up_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS + down_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS bytes_per_elem = 0.257812 else: - assert(False) - + assert False mlps = [] gate_projs = [] up_projs = [] down_projs = [] for _ in range(layer_num): - gate_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous() - up_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous() - down_proj = torch.randn((hidden_size, intermediate_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous() - config = kt_kernel_ext.mlp.MLPConfig(hidden_size, intermediate_size, stride, group_max_len, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type) + gate_proj = ( + torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous() + ) + up_proj = ( + torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous() + ) + down_proj = ( + torch.randn((hidden_size, intermediate_size), dtype=torch.float32, device="cuda").to("cpu").contiguous() + ) + config = kt_kernel_ext.mlp.MLPConfig( + hidden_size, + intermediate_size, + stride, + group_max_len, + gate_proj.data_ptr(), + up_proj.data_ptr(), + down_proj.data_ptr(), + gate_type, + up_type, + down_type, + hidden_type, + ) mlp = kt_kernel_ext.mlp.MLP(config) gate_projs.append(gate_proj) up_projs.append(up_proj) down_projs.append(down_proj) mlps.append(mlp) - input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous() - output = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous() + input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous() + output = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous() # warm up for i in range(warm_up_iter): CPUInfer.submit( - mlps[i % layer_num].forward( - qlen, - input[i % layer_num].data_ptr(), - output[i % layer_num].data_ptr() - ) + mlps[i % layer_num].forward(qlen, input[i % layer_num].data_ptr(), output[i % layer_num].data_ptr()) ) CPUInfer.sync() @@ -120,21 +135,22 @@ def bench_mlp(quant_mode: str): start = time.perf_counter() for i in range(test_iter): CPUInfer.submit( - mlps[i % layer_num].forward( - qlen, - input[i % layer_num].data_ptr(), - output[i % layer_num].data_ptr() - ) + 
mlps[i % layer_num].forward(qlen, input[i % layer_num].data_ptr(), output[i % layer_num].data_ptr()) ) CPUInfer.sync() end = time.perf_counter() total_time = end - start - print('Quant mode: ', quant_mode) - print('Time(s): ', total_time) - print('Iteration: ', test_iter) - print('Time(us) per iteration: ', total_time / test_iter * 1000000) - print('Bandwidth: ', hidden_size * intermediate_size * 3 * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s') - print('') + print("Quant mode: ", quant_mode) + print("Time(s): ", total_time) + print("Iteration: ", test_iter) + print("Time(us) per iteration: ", total_time / test_iter * 1000000) + print( + "Bandwidth: ", + hidden_size * intermediate_size * 3 * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, + "GB/s", + ) + print("") + bench_mlp("fp32") bench_mlp("fp16")
diff --git a/kt-kernel/bench/bench_moe.py b/kt-kernel/bench/bench_moe.py
index b75da127..019cc950 100644
--- a/kt-kernel/bench/bench_moe.py
+++ b/kt-kernel/bench/bench_moe.py
@@ -5,8 +5,8 @@ import json import subprocess import platform -sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'build')) -import kt_kernel_ext +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build")) +from kt_kernel import kt_kernel_ext import torch from tqdm import tqdm
@@ -35,7 +35,7 @@ CPUInfer = kt_kernel_ext.CPUInfer(CPUINFER_PARAM) script_path = os.path.abspath(__file__) script_dir = os.path.dirname(script_path) script_name = os.path.splitext(os.path.basename(script_path))[0] -json_path = os.path.join(script_dir, "bench_results "+ ".jsonl") +json_path = os.path.join(script_dir, "bench_results " + ".jsonl") def get_git_commit():
@@ -76,9 +76,9 @@ def get_system_info(): # Get the CPU model (Linux only) cpu_model = None - if os.path.exists('/proc/cpuinfo'): + if os.path.exists("/proc/cpuinfo"): try: - with open('/proc/cpuinfo', 'r') as f: + with open("/proc/cpuinfo", "r") as f: for line in f: if "model name" in line: cpu_model = line.split(":", 1)[1].strip()
@@ -89,9 +89,9 @@ # Get total memory size in GB (Linux only) mem_total_gb = None - if os.path.exists('/proc/meminfo'): + if os.path.exists("/proc/meminfo"): try: - with open('/proc/meminfo', 'r') as f: + with open("/proc/meminfo", "r") as f: for line in f: if "MemTotal" in line: mem_kb = float(line.split(":", 1)[1].split()[0])
@@ -134,57 +134,57 @@ def bench_moe(quant_mode: str): # Set data types and bytes_per_elem based on the quantization mode hidden_type = 30 # ggml_type::GGML_TYPE_BF16 (fixed) if quant_mode == "fp32": - gate_type = 0 # ggml_type::GGML_TYPE_F32 + gate_type = 0 # ggml_type::GGML_TYPE_F32 up_type = 0 down_type = 0 bytes_per_elem = 4.0 elif quant_mode == "fp16": - gate_type = 1 # ggml_type::GGML_TYPE_F16 + gate_type = 1 # ggml_type::GGML_TYPE_F16 up_type = 1 down_type = 1 bytes_per_elem = 2.0 elif quant_mode == "bf16": - gate_type = 30 # ggml_type::GGML_TYPE_BF16 + gate_type = 30 # ggml_type::GGML_TYPE_BF16 up_type = 30 down_type = 30 bytes_per_elem = 2.0 elif quant_mode == "q8_0": - gate_type = 8 # ggml_type::GGML_TYPE_Q8_0 + gate_type = 8 # ggml_type::GGML_TYPE_Q8_0 up_type = 8 down_type = 8 bytes_per_elem = 1.062500 elif quant_mode == "q6_k": - gate_type = 14 # ggml_type::GGML_TYPE_Q6_K + gate_type = 14 # ggml_type::GGML_TYPE_Q6_K up_type = 14 down_type = 14 bytes_per_elem = 0.820312 elif quant_mode == "q5_k_m": - gate_type = 13 # ggml_type::GGML_TYPE_Q5_K + gate_type = 13 # ggml_type::GGML_TYPE_Q5_K up_type = 13 - down_type = 14 # ggml_type::GGML_TYPE_Q6_K + down_type = 14 # ggml_type::GGML_TYPE_Q6_K bytes_per_elem = 0.731771 elif quant_mode == "q4_k_m": - gate_type = 12 # ggml_type::GGML_TYPE_Q4_K + gate_type = 12 # ggml_type::GGML_TYPE_Q4_K up_type = 12 - down_type = 14 # ggml_type::GGML_TYPE_Q6_K + down_type = 14 # ggml_type::GGML_TYPE_Q6_K bytes_per_elem = 0.648437 elif quant_mode == "q3_k_m": - gate_type = 11 # ggml_type::GGML_TYPE_Q3_K + gate_type = 11 # ggml_type::GGML_TYPE_Q3_K up_type = 11 - down_type = 13 # ggml_type::GGML_TYPE_Q5_K + down_type = 13 # ggml_type::GGML_TYPE_Q5_K bytes_per_elem = 0.515625 elif quant_mode == "q2_k": - gate_type = 10 # ggml_type::GGML_TYPE_Q2_K + gate_type = 10 # ggml_type::GGML_TYPE_Q2_K up_type = 10 - down_type = 11 # ggml_type::GGML_TYPE_Q3_K + down_type = 11 # ggml_type::GGML_TYPE_Q3_K bytes_per_elem = 0.328125 elif quant_mode == "iq3_xs": - gate_type = 21 # ggml_type::GGML_TYPE_IQ3_S + gate_type = 21 # ggml_type::GGML_TYPE_IQ3_S up_type = 21 down_type = 21 bytes_per_elem = 0.429688 elif quant_mode == "iq2_xxs": - gate_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS + gate_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS up_type = 16 down_type = 16 bytes_per_elem = 0.257812
@@ -194,13 +194,25 @@ # Build the per-layer MoE models moes = [] for _ in tqdm(range(layer_num), desc="Initializing MOEs"): - gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float16, device="cpu").to("cpu").contiguous() - up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float16, device="cpu").to("cpu").contiguous() - down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float16, device="cpu").to("cpu").contiguous() - + gate_proj = ( + torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float16, device="cpu") + .to("cpu") + .contiguous() + ) + up_proj = ( + torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float16, device="cpu") + .to("cpu") + .contiguous() + ) + down_proj = ( + torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float16, device="cpu") + .to("cpu") + .contiguous() + ) + config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size) config.pool = CPUInfer.backend_ - config.m_block = m_block + config.m_block = m_block config.group_min_len = group_min_len config.group_max_len = group_max_len config.gate_proj = gate_proj.data_ptr()
@@ -215,47 +227,52 @@ CPUInfer.submit(moe.load_weights_task()) CPUInfer.sync() moes.append(moe) - + # Generate input data - print('Generating data...') + print("Generating data...") # Expert routing indices and weights, one set per layer gen_iter = 1000 - expert_ids = torch.rand(gen_iter * qlen , expert_num, device="cpu").argsort(dim=-1)[:, :num_experts_per_tok].reshape(gen_iter, qlen * num_experts_per_tok).contiguous() - weights = torch.rand((gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").contiguous() + expert_ids = ( + torch.rand(gen_iter * qlen, expert_num, device="cpu") + .argsort(dim=-1)[:, :num_experts_per_tok] + .reshape(gen_iter, qlen * num_experts_per_tok) + .contiguous() + ) + weights = torch.rand((gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").contiguous() input_tensor = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").contiguous() output_tensor = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").contiguous() # Wrap qlen in a tensor for the forward call qlen_tensor = torch.tensor([qlen], dtype=torch.int32) # Warm-up phase - print('Warming up...') + print("Warming up...") for i in tqdm(range(warm_up_iter), desc="Warm-up"): CPUInfer.submit( moes[i % layer_num].forward_task( qlen_tensor.data_ptr(), num_experts_per_tok, - expert_ids[i%gen_iter].data_ptr(), - weights[i%gen_iter].data_ptr(), + expert_ids[i % gen_iter].data_ptr(), + weights[i % gen_iter].data_ptr(), input_tensor[i % layer_num].data_ptr(), output_tensor[i % layer_num].data_ptr(), - False + False, ) ) CPUInfer.sync() # Test phase - print('Start testing...') + print("Start testing...") start = time.perf_counter() for i in tqdm(range(test_iter), desc="Testing"): CPUInfer.submit( moes[i % layer_num].forward_task( qlen_tensor.data_ptr(), num_experts_per_tok, - expert_ids[i%gen_iter].data_ptr(), - weights[i%gen_iter].data_ptr(), + expert_ids[i % gen_iter].data_ptr(), + weights[i % gen_iter].data_ptr(), input_tensor[i % layer_num].data_ptr(), output_tensor[i % layer_num].data_ptr(), - False + False, ) ) CPUInfer.sync()
@@ -264,17 +281,29 @@ # Compute performance metrics time_per_iter_us = total_time / test_iter * 1e6 - bandwidth = hidden_size * intermediate_size * 3 * num_experts_per_tok * (1/8 * 256 * (1-(31/32)**qlen)) * bytes_per_elem * test_iter / total_time / 1e9 # unit: GB/s - flops = hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12 # unit: TFLOPS + bandwidth = ( + hidden_size + * intermediate_size + * 3 + * num_experts_per_tok + * (1 / 8 * 256 * (1 - (31 / 32) ** qlen)) + * bytes_per_elem + * test_iter + / total_time + / 1e9 + ) # unit: GB/s + flops = ( + hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12 + ) # unit: TFLOPS # Print the results - print('Quant mode:', quant_mode) - print('Time(s):', total_time) - print('Iteration:', test_iter) - print('Time(us) per iteration:', time_per_iter_us) - print('Bandwidth:', bandwidth, 'GB/s') - print('TFLOPS:', flops) - print('') + print("Quant mode:", quant_mode) + print("Time(s):", total_time) + print("Iteration:", test_iter) + print("Time(us) per iteration:", time_per_iter_us) + print("Bandwidth:", bandwidth, "GB/s") + print("TFLOPS:", flops) + print("") # Collect the test results result = {
@@ -298,8 +327,8 @@ "qlen": qlen, "warm_up_iter": warm_up_iter, "test_iter": test_iter, - "CPUInfer_parameter": CPUINFER_PARAM - } + "CPUInfer_parameter": CPUINFER_PARAM, + }, } # Attach git and system info result.update(get_git_commit())
@@ -321,4 +350,4 @@ if __name__ == "__main__": # bench_moe("q3_k_m", layer_num) # bench_moe("q2_k", layer_num) # bench_moe("iq3_xs", layer_num) - # bench_moe("iq2_xxs", layer_num) \ No newline at end of file + # bench_moe("iq2_xxs", layer_num)
diff --git a/kt-kernel/bench/bench_moe_amx.py b/kt-kernel/bench/bench_moe_amx.py
index 77a1f144..5b5c12b5 100644
--- a/kt-kernel/bench/bench_moe_amx.py
+++ b/kt-kernel/bench/bench_moe_amx.py
@@ -15,7 +15,7 @@ from tqdm import tqdm sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build")) import torch -import kt_kernel_ext +from kt_kernel import kt_kernel_ext import numpy as np # Benchmark parameter settings
diff --git a/kt-kernel/bench/bench_moe_amx_k.py b/kt-kernel/bench/bench_moe_amx_k.py
index 843c82db..f632f6bc 100644
--- a/kt-kernel/bench/bench_moe_amx_k.py
+++ b/kt-kernel/bench/bench_moe_amx_k.py
@@ -1,19 +1,20 @@ #!/usr/bin/env python # coding=utf-8 -''' -Description : +""" +Description : Author : chenht2022 Date : 2024-07-25 10:32:05 Version : 1.0.0 -LastEditors : chenht2022 +LastEditors : chenht2022 LastEditTime : 2024-08-06 10:41:28 -Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
-''' +Copyright (c) 2024 by KVCache.AI, All Rights Reserved. +""" import os, sys, time, json, subprocess, platform from tqdm import tqdm -sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'build')) -import kt_kernel_ext + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build")) +from kt_kernel import kt_kernel_ext import torch import numpy as np
@@ -21,33 +22,28 @@ import numpy as np expert_num = 256 hidden_size = 7168 intermediate_size = 2048 -max_len = 25600 +max_len = 25600 num_experts_per_tok = 8 layer_num = 4 qlen = 1024 -# qlen = 1 +# qlen = 1 warm_up_iter = 1000 test_iter = 5000 k_group_size = 128 -physical_to_logical_map = torch.tensor( - data=range(expert_num), - device="cpu", - dtype=torch.int64 -).contiguous() +physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous() # Make the CPUInfer parameter a variable # CPUINFER_PARAM = 257 # CPUInfer = kt_kernel_ext.CPUInfer(CPUINFER_PARAM) worker_config = kt_kernel_ext.WorkerPoolConfig() worker_config.subpool_count = 2 -worker_config.subpool_numa_map= [0,1] -worker_config.subpool_thread_count = [40,40] +worker_config.subpool_numa_map = [0, 1] +worker_config.subpool_thread_count = [40, 40] CPUINFER_PARAM = 80 CPUInfer = kt_kernel_ext.CPUInfer(worker_config) - def get_git_commit(): """ Get the current git commit record (commit hash and message),
@@ -82,14 +78,14 @@ def get_system_info(): info = {} # System name and host name uname = platform.uname() - info["system_name"] = uname.system # e.g. Linux, Windows - info["node_name"] = uname.node # host name + info["system_name"] = uname.system # e.g. Linux, Windows + info["node_name"] = uname.node # host name # Get the CPU model (Linux only) cpu_model = None - if os.path.exists('/proc/cpuinfo'): + if os.path.exists("/proc/cpuinfo"): try: - with open('/proc/cpuinfo', 'r') as f: + with open("/proc/cpuinfo", "r") as f: for line in f: if "model name" in line: cpu_model = line.split(":", 1)[1].strip()
@@ -100,9 +96,9 @@ # Get total memory size in GB (Linux only) mem_total_gb = None - if os.path.exists('/proc/meminfo'): + if os.path.exists("/proc/meminfo"): try: - with open('/proc/meminfo', 'r') as f: + with open("/proc/meminfo", "r") as f: for line in f: if "MemTotal" in line: mem_kb = float(line.split(":", 1)[1].split()[0])
@@ -130,11 +126,13 @@ return info + script_path = os.path.abspath(__file__) script_dir = os.path.dirname(script_path) script_name = os.path.splitext(os.path.basename(script_path))[0] json_path = os.path.join(script_dir, script_name + ".jsonl") + def record_results(result, filename=json_path): """ Append the results to the file in JSON format
@@ -142,6 +140,7 @@ with open(filename, "a") as f: f.write(json.dumps(result) + "\n") + def bench_moe(quant_mode: str): with torch.inference_mode(): if quant_mode == "bf16":
@@ -160,11 +159,22 @@ up_projs = [] down_projs = [] for layer_index in range(layer_num): - gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous() - up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous() - down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device="cuda").to("cpu").contiguous() - config = kt_kernel_ext.moe.MOEConfig( - expert_num, num_experts_per_tok, hidden_size, intermediate_size,0) + gate_proj = ( + torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda") + .to("cpu") + .contiguous() + ) + up_proj = ( + torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda") + .to("cpu") + .contiguous() + ) + down_proj = ( + torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device="cuda") + .to("cpu") + .contiguous() + ) + config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0) config.max_len = max_len config.gate_proj = gate_proj.data_ptr() config.up_proj = up_proj.data_ptr()
@@ -189,10 +199,22 @@ down_projs.append(down_proj) moes.append(moe) gen_iter = 3000 - expert_ids = torch.rand(gen_iter * qlen , expert_num, device="cpu").argsort(dim=-1)[:, :num_experts_per_tok].reshape(gen_iter, qlen * num_experts_per_tok).to("cpu").contiguous() - weights = torch.rand((gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").to("cpu").contiguous() - input_tensor = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous() - output_tensor = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous() + expert_ids = ( + torch.rand(gen_iter * qlen, expert_num, device="cpu") + .argsort(dim=-1)[:, :num_experts_per_tok] + .reshape(gen_iter, qlen * num_experts_per_tok) + .to("cpu") + .contiguous() + ) + weights = ( + torch.rand((gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").to("cpu").contiguous() + ) + input_tensor = ( + torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous() + ) + output_tensor = ( + torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous() + ) bsz_tensor = torch.tensor([qlen], device="cpu") # Warm-up iterations
@@ -203,8 +225,8 @@ moes[i % layer_num].forward_task( bsz_tensor.data_ptr(), num_experts_per_tok, - expert_ids[i%gen_iter].data_ptr(), - weights[i%gen_iter].data_ptr(), + expert_ids[i % gen_iter].data_ptr(), + weights[i % gen_iter].data_ptr(), input_tensor[i % layer_num].data_ptr(), output_tensor[i % layer_num].data_ptr(), False,
@@ -224,8 +246,8 @@ moes[i % layer_num].forward_task( bsz_tensor.data_ptr(), num_experts_per_tok, - expert_ids[i%gen_iter].data_ptr(), - weights[i%gen_iter].data_ptr(), + expert_ids[i % gen_iter].data_ptr(), + weights[i % gen_iter].data_ptr(), input_tensor[i % layer_num].data_ptr(), output_tensor[i % layer_num].data_ptr(), False,
@@ -239,16 +261,28 @@ # Compute performance metrics time_per_iter_us = total_time / test_iter * 1e6 - bandwidth = hidden_size * intermediate_size * 3 * num_experts_per_tok * (1/8 * 256 * (1-(31/32)**qlen)) * bytes_per_elem * test_iter / total_time / 1e9 # unit: GB/s - flops = hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12 # unit: TFLOPS + bandwidth = ( + hidden_size + * intermediate_size + * 3 + * num_experts_per_tok + * (1 / 8 * 256 * (1 - (31 / 32) ** qlen)) + * bytes_per_elem + * test_iter + / total_time + / 1e9 + ) # unit: GB/s + flops = ( + hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12 + ) # unit: TFLOPS - print('Quant mode: ', quant_mode) - print('Time(s): ', total_time) - print('Iteration: ', test_iter) - print('Time(us) per iteration: ', time_per_iter_us) - print('Bandwidth: ', bandwidth, 'GB/s') - print('Flops: ', flops, 'TFLOPS') - print('') + print("Quant mode: ", quant_mode) + print("Time(s): ", total_time) + print("Iteration: ", test_iter) + print("Time(us) per iteration: ", time_per_iter_us) + print("Bandwidth: ", bandwidth, "GB/s") + print("Flops: ", flops, "TFLOPS") + print("") # Collect the result record, including the test parameters result = {
@@ -270,8 +304,8 @@ "warm_up_iter": warm_up_iter, "test_iter": test_iter, "CPUInfer_parameter": CPUINFER_PARAM, - "k_group_size": k_group_size - } + "k_group_size": k_group_size, + }, } # Attach git commit info result.update(get_git_commit())
@@ -280,9 +314,10 @@ # Append the results to the file as JSON record_results(result) + if __name__ == "__main__": # Select the quantization modes to test # bench_moe("bf16") # bench_moe("int8") # bench_moe("int4") - bench_moe("int4_1k") \ No newline at end of file + bench_moe("int4_1k")
diff --git a/kt-kernel/bench/bench_moe_kernel.py b/kt-kernel/bench/bench_moe_kernel.py
index a4583a0e..01f109dd 100644
--- a/kt-kernel/bench/bench_moe_kernel.py
+++ b/kt-kernel/bench/bench_moe_kernel.py
@@ -14,7 +14,7 @@ import os, sys, time, json, subprocess, platform os.environ["BLAS_NUM_THREADS"] = "1" sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build")) import torch -import kt_kernel_ext +from kt_kernel import kt_kernel_ext import numpy as np from tqdm import tqdm
diff --git a/kt-kernel/bench/bench_moe_kernel_tiling.py b/kt-kernel/bench/bench_moe_kernel_tiling.py
index de3d0d59..41b2e2cd 100644
--- a/kt-kernel/bench/bench_moe_kernel_tiling.py
+++ b/kt-kernel/bench/bench_moe_kernel_tiling.py
@@ -26,7 +26,7 @@ os.environ.setdefault("BLAS_NUM_THREADS", "1") sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build")) import torch # noqa: E402 -import kt_kernel_ext as ce # noqa: E402 +from kt_kernel import kt_kernel_ext as ce # noqa: E402 from tqdm import tqdm # noqa: E402
diff --git a/kt-kernel/bench/bench_moe_kml.py b/kt-kernel/bench/bench_moe_kml.py
index b4772dd0..1aa1011e 100644
--- a/kt-kernel/bench/bench_moe_kml.py
+++ b/kt-kernel/bench/bench_moe_kml.py
@@ -13,7 +13,7 @@ import os, sys, time, json, subprocess, platform os.environ["BLAS_NUM_THREADS"] = "1" sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build")) -import kt_kernel_ext +from kt_kernel import kt_kernel_ext import torch import numpy as np from tqdm import tqdm
diff --git a/kt-kernel/examples/test-debug.py b/kt-kernel/examples/test-debug.py
index a90aa1b8..9ea461b5 100644
--- a/kt-kernel/examples/test-debug.py
+++ b/kt-kernel/examples/test-debug.py
@@ -1,9 +1,10 @@ import os import sys -sys.path.insert(0, os.path.dirname(__file__) + '/../build') + +sys.path.insert(0, os.path.dirname(__file__) + "/../build") import torch import ctypes -import kt_kernel_ext +from kt_kernel import kt_kernel_ext from kt_kernel_ext.moe import MOEConfig, MOE, AMXBF16_MOE, AMXInt8_MOE, AMXInt4_MOE, AMXInt4_1_MOE intermediate_size_full = 2048
@@ -14,20 +15,14 @@ num_experts_per_tok = 8 cpu_infer = kt_kernel_ext.CPUInfer(97) up = torch.empty(experts_num, intermediate_size_full, hidden_size, dtype=torch.bfloat16, device="cpu") - + gate = torch.empty(experts_num, intermediate_size_full, hidden_size, dtype=torch.bfloat16, device="cpu") - + down = torch.empty(experts_num, hidden_size, intermediate_size_full, dtype=torch.bfloat16, device="cpu") -gate_ptr = ctypes.addressof( -
ctypes.cast(down.data_ptr(), ctypes.POINTER(ctypes.c_uint64)).contents -) +gate_ptr = ctypes.addressof(ctypes.cast(gate.data_ptr(), ctypes.POINTER(ctypes.c_uint64)).contents) +up_ptr = ctypes.addressof(ctypes.cast(up.data_ptr(), ctypes.POINTER(ctypes.c_uint64)).contents) +down_ptr = ctypes.addressof(ctypes.cast(down.data_ptr(), ctypes.POINTER(ctypes.c_uint64)).contents) moe_config = MOEConfig( experts_num, num_experts_per_tok, @@ -36,9 +31,9 @@ moe_config = MOEConfig( ) moe_config.layer_idx = 45 moe_config.pool = cpu_infer.backend_ -moe_config.max_len = 1024 #TODO(zbx): multi cuda graph +moe_config.max_len = 1024 # TODO(zbx): multi cuda graph moe_config.gate_proj = gate_ptr moe_config.up_proj = up_ptr moe_config.down_proj = down_ptr moe_config.path = "" -moe = AMXInt4_MOE(moe_config) \ No newline at end of file +moe = AMXInt4_MOE(moe_config) diff --git a/kt-kernel/examples/test_attention.py b/kt-kernel/examples/test_attention.py index 56aaf318..442a15f6 100644 --- a/kt-kernel/examples/test_attention.py +++ b/kt-kernel/examples/test_attention.py @@ -1,19 +1,19 @@ #!/usr/bin/env python # coding=utf-8 """ -Description : +Description : Author : Jianwei Dong Date : 2024-08-28 10:32:05 Version : 1.0.0 -LastEditors : chenht2022 +LastEditors : chenht2022 LastEditTime : 2024-08-28 10:32:05 -Copyright (c) 2024 by KVCache.AI, All Rights Reserved. +Copyright (c) 2024 by KVCache.AI, All Rights Reserved. """ import os, sys import time sys.path.append(os.path.dirname(__file__) + "/../build") -import kt_kernel_ext +from kt_kernel import kt_kernel_ext from flash_attn import flash_attn_with_kvcache import torch @@ -59,19 +59,11 @@ with torch.inference_mode(mode=True): local_kvcache = kt_kernel_ext.kvcache.KVCache(config) kvcaches = [] - block_table = ( - torch.arange(max_block_num, dtype=torch.int32, device="cpu") - .contiguous() - .view(1, -1) - ) + block_table = torch.arange(max_block_num, dtype=torch.int32, device="cpu").contiguous().view(1, -1) for layer_idx in range(layer_num): - k_cache = torch.randn( - (1, cache_seqlen, kv_head_num, head_dim), dtype=torch.float16, device="cpu" - ).contiguous() - v_cache = torch.randn( - (1, cache_seqlen, kv_head_num, head_dim), dtype=torch.float16, device="cpu" - ).contiguous() + k_cache = torch.randn((1, cache_seqlen, kv_head_num, head_dim), dtype=torch.float16, device="cpu").contiguous() + v_cache = torch.randn((1, cache_seqlen, kv_head_num, head_dim), dtype=torch.float16, device="cpu").contiguous() CPUInfer.submit( local_kvcache.update_kvcache_fp16( @@ -94,17 +86,11 @@ with torch.inference_mode(mode=True): k_cache = kvcaches[i % layer_num][0] v_cache = kvcaches[i % layer_num][1] - input = torch.randn( - (1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu" - ).contiguous() - output = torch.empty( - (1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu" - ).contiguous() + input = torch.randn((1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu").contiguous() + output = torch.empty((1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu").contiguous() # attn_lse: (bsz, q_len, q_head_num) - attn_lse = torch.empty( - (1, 1, q_head_num), dtype=torch.float32, device="cpu" - ).contiguous() + attn_lse = torch.empty((1, 1, q_head_num), dtype=torch.float32, device="cpu").contiguous() input = input / 100 CPUInfer.submit( @@ -135,8 +121,6 @@ with torch.inference_mode(mode=True): ) # print("torch output", t_output) - diff = torch.mean(torch.abs(output.to("cuda") - t_output)) / torch.mean( - torch.abs(t_output) - ) + diff = 
torch.mean(torch.abs(output.to("cuda") - t_output)) / torch.mean(torch.abs(t_output)) print("diff = ", diff) assert diff < 0.001 diff --git a/kt-kernel/examples/test_awq_moe_amx.py b/kt-kernel/examples/test_awq_moe_amx.py index 366b10d6..afc59369 100644 --- a/kt-kernel/examples/test_awq_moe_amx.py +++ b/kt-kernel/examples/test_awq_moe_amx.py @@ -2,7 +2,7 @@ import os, sys sys.path.insert(0, os.path.dirname(__file__) + "/../build") -import kt_kernel_ext +from kt_kernel import kt_kernel_ext import torch # Set fixed seed for reproducible results diff --git a/kt-kernel/examples/test_deepseekv3.py b/kt-kernel/examples/test_deepseekv3.py index 76a7b287..9b5adb9b 100644 --- a/kt-kernel/examples/test_deepseekv3.py +++ b/kt-kernel/examples/test_deepseekv3.py @@ -1,8 +1,9 @@ import os, sys import time + os.environ["BLAS_NUM_THREADS"] = "1" sys.path.insert(0, os.path.dirname(__file__) + "/../build") -import kt_kernel_ext +from kt_kernel import kt_kernel_ext from kt_kernel_ext.kvcache import ggml_type import torch import logging @@ -20,6 +21,7 @@ from transformers import ( logger = logging.getLogger("reader") from gguf.gguf_reader import GGUFReader + # load_layers = 6 load_layers = None CPUInfer = kt_kernel_ext.CPUInfer(304) @@ -284,22 +286,21 @@ def build_moegate(layer_idx, json_config, gguf_weights): json_config["topk_group"], ) - config.routed_scaling_factor = json_config['routed_scaling_factor'] + config.routed_scaling_factor = json_config["routed_scaling_factor"] config.pool = CPUInfer.backend_ - weight,weight_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_inp.weight") + weight, weight_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_inp.weight") config.weight = weight.data_ptr() config.weight_type = type_to_ggml_type(weight_type) - bias,bias_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.exp_probs_b.bias") + bias, bias_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.exp_probs_b.bias") config.e_score_correction_bias = bias.data_ptr() config.e_score_correction_bias_type = type_to_ggml_type(bias_type) gate = kt_kernel_ext.gate.MoEGate(config) - + return gate - def build_llm(json_config, gguf_weights): @@ -312,15 +313,15 @@ def build_llm(json_config, gguf_weights): general_config.n_shared_experts = json_config["n_shared_experts"] general_config.max_qlen = max_qlen - lm_heads,lm_heads_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output.weight") + lm_heads, lm_heads_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output.weight") general_config.lm_heads_ptr = lm_heads.data_ptr() general_config.lm_heads_type = type_to_ggml_type(lm_heads_type) output_norm, output_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output_norm.weight") general_config.norm_weights_ptr = output_norm.data_ptr() - general_config.norm_weights_type = type_to_ggml_type(output_norm_type) + general_config.norm_weights_type = type_to_ggml_type(output_norm_type) - token_embd,token_embd_type = get_torch_tensor_and_type_from_gguf(weights, "token_embd.weight") + token_embd, token_embd_type = get_torch_tensor_and_type_from_gguf(weights, "token_embd.weight") general_config.token_embd_ptr = token_embd.data_ptr() general_config.token_embd_type = type_to_ggml_type(token_embd_type) @@ -330,12 +331,11 @@ def build_llm(json_config, gguf_weights): model = kt_kernel_ext.DeepseekV3Model(general_config) llm.model = model - decoder_layers = [] real_load_layers = 
json_config["num_hidden_layers"] if load_layers is None else load_layers for i in range(real_load_layers): - layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config,i) + layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config, i) attn_norm, attn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.attn_norm.weight") ffn_norm, ffn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.ffn_norm.weight") @@ -351,11 +351,11 @@ def build_llm(json_config, gguf_weights): layer.ffn = build_ffn(i, json_config, gguf_weights) decoder_layers.append(layer) - model.layers = decoder_layers + model.layers = decoder_layers return llm -safetensor_path = '/home/bd/models/DeepSeek-R1' +safetensor_path = "/home/bd/models/DeepSeek-R1" json_path = os.path.join(safetensor_path, "config.json") json_config = json.load(open(json_path, "r")) print(json_config) @@ -368,11 +368,11 @@ weights = dict(sorted(weights.items())) for name, t in weights.items(): # if not name.startswith("blk"): # if name.startswith("blk.10."): - # if "ffn_gate." in name: - # print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}") + # if "ffn_gate." in name: + # print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}") print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}") - -print("Building LLM ...") + +print("Building LLM ...") load_start_time = time.perf_counter() llm = build_llm(json_config, weights) load_end_time = time.perf_counter() @@ -389,22 +389,20 @@ config = AutoConfig.from_pretrained(safetensor_path, trust_remote_code=True) force_think = False -output_logits = torch.zeros((max_qlen, json_config['vocab_size']), dtype=torch.float32) +output_logits = torch.zeros((max_qlen, json_config["vocab_size"]), dtype=torch.float32) def start_chat(content=None): if content is None: content = input("Chat: ") - + messages = [{"role": "user", "content": content}] - input_tensor = tokenizer.apply_chat_template( - messages, add_generation_prompt=True, return_tensors="pt" - ) + input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt") if force_think: - token_thinks = torch.tensor([tokenizer.encode("\\n",add_special_tokens=False)],device=input_tensor.device) - input_tensor = torch.cat( - [input_tensor, token_thinks], dim=1 + token_thinks = torch.tensor( + [tokenizer.encode("\\n", add_special_tokens=False)], device=input_tensor.device ) + input_tensor = torch.cat([input_tensor, token_thinks], dim=1) input_tensor = input_tensor.squeeze(0) # Add batch dimension print(f"Input tensor: {input_tensor}, type {input_tensor.dtype}, shape {input_tensor.shape}") @@ -415,34 +413,36 @@ def start_chat(content=None): stream = TextStreamer(tokenizer) qlen = input_tensor.shape[0] - qlens = [qlen-kvlen] + qlens = [qlen - kvlen] kvlens = [kvlen] page_tables = [list(range(pages_count))] start_time = time.perf_counter() - llm.forward(qlens,page_tables, kvlens, input_tensor[kvlen:].data_ptr(), output_logits.data_ptr()) + llm.forward(qlens, page_tables, kvlens, input_tensor[kvlen:].data_ptr(), output_logits.data_ptr()) end_time = time.perf_counter() - print(f"Forward time: {end_time - start_time:.4f} seconds, tps: {qlens[0] / (end_time - start_time)} tokens/sec") - + print( + f"Forward time: {end_time - start_time:.4f} seconds, tps: {qlens[0] / (end_time - start_time)} tokens/sec" + ) + logits = output_logits[0] # print(logits) - # sample + # sample 
next_token = torch.argmax(logits).item() # print(f"Next token: {next_token}, {tokenizer.decode(next_token)}") kvlen = input_tensor.shape[0] input_tensor = torch.cat((input_tensor, torch.tensor([next_token])), dim=-1) - - if next_token == tokenizer.eos_token_id or tokenizer.decode(next_token) == '<|im_end|>': + + if next_token == tokenizer.eos_token_id or tokenizer.decode(next_token) == "<|im_end|>": stream.end() break else: stream.put(torch.tensor([next_token])) + + job_id = 0 while True: try: # ---------- 让用户决定是否继续 ---------- - choice = input( - "\n【回车】开始对话 | 输入 1 读取文件 | 输入 q/quit/exit 退出程序: " - ).strip().lower() + choice = input("\n【回车】开始对话 | 输入 1 读取文件 | 输入 q/quit/exit 退出程序: ").strip().lower() if choice in {"q", "quit", "exit"}: print("收到退出指令,程序结束。") break @@ -466,15 +466,4 @@ while True: print(f"\n发生错误:{e}\n已终止对话 #{job_id},马上重启…") logger.error(f"Error in job {job_id}: {e}", exc_info=True) finally: - job_id += 1 # 不管中断与否,都给下一任务换编号 - - - - - - - - - - - + job_id += 1 # 不管中断与否,都给下一任务换编号 diff --git a/kt-kernel/examples/test_deepseekv3_prefill.py b/kt-kernel/examples/test_deepseekv3_prefill.py index 00a98975..4bccf735 100644 --- a/kt-kernel/examples/test_deepseekv3_prefill.py +++ b/kt-kernel/examples/test_deepseekv3_prefill.py @@ -1,8 +1,9 @@ import os, sys import time + os.environ["BLAS_NUM_THREADS"] = "1" sys.path.insert(0, os.path.dirname(__file__) + "/../build") -import kt_kernel_ext +from kt_kernel import kt_kernel_ext from kt_kernel_ext.kvcache import ggml_type import torch import logging @@ -188,7 +189,6 @@ def build_mla(layer_idx, json_config, gguf_weights): config.layer_idx = layer_idx config.pool = CPUInfer.backend_ config.page_count = pages_count - if q_a_type == "F32": mla = kt_kernel_ext.mla.MLA_F32(config) @@ -284,22 +284,21 @@ def build_moegate(layer_idx, json_config, gguf_weights): json_config["topk_group"], ) - config.routed_scaling_factor = json_config['routed_scaling_factor'] + config.routed_scaling_factor = json_config["routed_scaling_factor"] config.pool = CPUInfer.backend_ - weight,weight_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_inp.weight") + weight, weight_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_inp.weight") config.weight = weight.data_ptr() config.weight_type = type_to_ggml_type(weight_type) - bias,bias_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.exp_probs_b.bias") + bias, bias_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.exp_probs_b.bias") config.e_score_correction_bias = bias.data_ptr() config.e_score_correction_bias_type = type_to_ggml_type(bias_type) gate = kt_kernel_ext.gate.MoEGate(config) - + return gate - def build_llm(json_config, gguf_weights): @@ -312,15 +311,15 @@ def build_llm(json_config, gguf_weights): general_config.n_shared_experts = json_config["n_shared_experts"] general_config.max_qlen = max_qlen - lm_heads,lm_heads_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output.weight") + lm_heads, lm_heads_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output.weight") general_config.lm_heads_ptr = lm_heads.data_ptr() general_config.lm_heads_type = type_to_ggml_type(lm_heads_type) output_norm, output_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output_norm.weight") general_config.norm_weights_ptr = output_norm.data_ptr() - general_config.norm_weights_type = type_to_ggml_type(output_norm_type) + general_config.norm_weights_type = type_to_ggml_type(output_norm_type) - 
token_embd,token_embd_type = get_torch_tensor_and_type_from_gguf(weights, "token_embd.weight") + token_embd, token_embd_type = get_torch_tensor_and_type_from_gguf(weights, "token_embd.weight") general_config.token_embd_ptr = token_embd.data_ptr() general_config.token_embd_type = type_to_ggml_type(token_embd_type) @@ -330,12 +329,11 @@ def build_llm(json_config, gguf_weights): model = kt_kernel_ext.DeepseekV3Model(general_config) llm.model = model - decoder_layers = [] for i in range(json_config["num_hidden_layers"]): - # for i in range(6): - # for i in [0,1,2,3,4,5,6,7,8,9,10]: - layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config,i) + # for i in range(6): + # for i in [0,1,2,3,4,5,6,7,8,9,10]: + layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config, i) attn_norm, attn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.attn_norm.weight") ffn_norm, ffn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.ffn_norm.weight") @@ -351,11 +349,11 @@ def build_llm(json_config, gguf_weights): layer.ffn = build_ffn(i, json_config, gguf_weights) decoder_layers.append(layer) - model.layers = decoder_layers + model.layers = decoder_layers return llm -safetensor_path = '/home/bd/models/DeepSeek-R1' +safetensor_path = "/home/bd/models/DeepSeek-R1" json_path = os.path.join(safetensor_path, "config.json") json_config = json.load(open(json_path, "r")) print(json_config) @@ -368,8 +366,8 @@ weights = dict(sorted(weights.items())) for name, t in weights.items(): # if not name.startswith("blk"): # if name.startswith("blk.10."): - # if "ffn_gate." in name: - # print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}") + # if "ffn_gate." in name: + # print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}") print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}") print("Building LLM ...") llm = build_llm(json_config, weights) @@ -384,7 +382,7 @@ prompt_file = None force_think = False -output_logits = torch.zeros((max_qlen, json_config['vocab_size']), dtype=torch.float32) +output_logits = torch.zeros((max_qlen, json_config["vocab_size"]), dtype=torch.float32) def start_chat(): @@ -411,16 +409,14 @@ def start_chat(): content = "Please write a piece of quicksort code in C++." 
elif os.path.isfile(content): content = open(content, "r").read() - + messages = [{"role": "user", "content": content}] - input_tensor = tokenizer.apply_chat_template( - messages, add_generation_prompt=True, return_tensors="pt" - ) + input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt") if force_think: - token_thinks = torch.tensor([tokenizer.encode("\\n",add_special_tokens=False)],device=input_tensor.device) - input_tensor = torch.cat( - [input_tensor, token_thinks], dim=1 + token_thinks = torch.tensor( + [tokenizer.encode("\\n", add_special_tokens=False)], device=input_tensor.device ) + input_tensor = torch.cat([input_tensor, token_thinks], dim=1) input_tensor = input_tensor.squeeze(0) # Add batch dimension print(f"Input tensor: {input_tensor}, type {input_tensor.dtype}, shape {input_tensor.shape}") @@ -431,28 +427,27 @@ def start_chat(): qlens = [qlen] kvlens = [0] page_tables = [list(range(pages_count))] - llm.forward(qlens,page_tables, kvlens, input_tensor.data_ptr(), output_logits.data_ptr()) - + llm.forward(qlens, page_tables, kvlens, input_tensor.data_ptr(), output_logits.data_ptr()) + logits = output_logits[0] # print(logits) - # sample + # sample next_token = torch.argmax(logits).item() # print(f"Next token: {next_token}, {tokenizer.decode(next_token)}") input_tensor = torch.cat((input_tensor, torch.tensor([next_token])), dim=-1) - - - if next_token == tokenizer.eos_token_id or tokenizer.decode(next_token) == '<|im_end|>': + + if next_token == tokenizer.eos_token_id or tokenizer.decode(next_token) == "<|im_end|>": print(stream.end(), end="", flush=True) break else: print(stream.put(torch.tensor([next_token])), end="", flush=True) + + job_id = 0 while True: try: # ---------- 让用户决定是否继续 ---------- - choice = input( - "\n【回车】开始对话 | 输入 q/quit/exit 退出程序: " - ).strip().lower() + choice = input("\n【回车】开始对话 | 输入 q/quit/exit 退出程序: ").strip().lower() if choice in {"q", "quit", "exit"}: print("收到退出指令,程序结束。") break @@ -464,15 +459,4 @@ while True: # 随时 Ctrl-C:放弃当前任务并重启 print(f"\n检测到 Ctrl-C,已终止对话 #{job_id},马上重启…") finally: - job_id += 1 # 不管中断与否,都给下一任务换编号 - - - - - - - - - - - + job_id += 1 # 不管中断与否,都给下一任务换编号 diff --git a/kt-kernel/examples/test_deepseekv3_prefill_speed.py b/kt-kernel/examples/test_deepseekv3_prefill_speed.py index 156b6482..ce04b668 100644 --- a/kt-kernel/examples/test_deepseekv3_prefill_speed.py +++ b/kt-kernel/examples/test_deepseekv3_prefill_speed.py @@ -1,8 +1,9 @@ import os, sys import time + os.environ["BLAS_NUM_THREADS"] = "1" sys.path.insert(0, os.path.dirname(__file__) + "/../build") -import kt_kernel_ext +from kt_kernel import kt_kernel_ext from kt_kernel_ext.kvcache import ggml_type import torch import logging @@ -20,12 +21,13 @@ from transformers import ( logger = logging.getLogger("reader") from gguf.gguf_reader import GGUFReader + # load_layers = 3 load_layers = None worker_config = kt_kernel_ext.WorkerPoolConfig() worker_config.subpool_count = 2 -worker_config.subpool_numa_map= [0,1] -worker_config.subpool_thread_count = [72,72] +worker_config.subpool_numa_map = [0, 1] +worker_config.subpool_thread_count = [72, 72] CPUInfer = kt_kernel_ext.CPUInfer(worker_config) max_qlen = 4096 @@ -289,22 +291,21 @@ def build_moegate(layer_idx, json_config, gguf_weights): json_config["topk_group"], ) - config.routed_scaling_factor = json_config['routed_scaling_factor'] + config.routed_scaling_factor = json_config["routed_scaling_factor"] config.pool = CPUInfer.backend_ - weight,weight_type = 
get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_inp.weight") + weight, weight_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_inp.weight") config.weight = weight.data_ptr() config.weight_type = type_to_ggml_type(weight_type) - bias,bias_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.exp_probs_b.bias") + bias, bias_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.exp_probs_b.bias") config.e_score_correction_bias = bias.data_ptr() config.e_score_correction_bias_type = type_to_ggml_type(bias_type) gate = kt_kernel_ext.gate.MoEGate(config) - + return gate - def build_llm(json_config, gguf_weights): @@ -317,15 +318,15 @@ def build_llm(json_config, gguf_weights): general_config.n_shared_experts = json_config["n_shared_experts"] general_config.max_qlen = max_qlen - lm_heads,lm_heads_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output.weight") + lm_heads, lm_heads_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output.weight") general_config.lm_heads_ptr = lm_heads.data_ptr() general_config.lm_heads_type = type_to_ggml_type(lm_heads_type) output_norm, output_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output_norm.weight") general_config.norm_weights_ptr = output_norm.data_ptr() - general_config.norm_weights_type = type_to_ggml_type(output_norm_type) + general_config.norm_weights_type = type_to_ggml_type(output_norm_type) - token_embd,token_embd_type = get_torch_tensor_and_type_from_gguf(weights, "token_embd.weight") + token_embd, token_embd_type = get_torch_tensor_and_type_from_gguf(weights, "token_embd.weight") general_config.token_embd_ptr = token_embd.data_ptr() general_config.token_embd_type = type_to_ggml_type(token_embd_type) @@ -335,13 +336,12 @@ def build_llm(json_config, gguf_weights): model = kt_kernel_ext.DeepseekV3Model(general_config) llm.model = model - decoder_layers = [] real_load_layers = json_config["num_hidden_layers"] if load_layers is None else load_layers for i in range(real_load_layers): - # for i in [2,3]: - layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config,i) + # for i in [2,3]: + layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config, i) attn_norm, attn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.attn_norm.weight") ffn_norm, ffn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.ffn_norm.weight") @@ -357,11 +357,11 @@ def build_llm(json_config, gguf_weights): layer.ffn = build_ffn(i, json_config, gguf_weights) decoder_layers.append(layer) - model.layers = decoder_layers + model.layers = decoder_layers return llm -safetensor_path = '/home/bd/models/DeepSeek-R1' +safetensor_path = "/home/bd/models/DeepSeek-R1" json_path = os.path.join(safetensor_path, "config.json") json_config = json.load(open(json_path, "r")) print(json_config) @@ -372,13 +372,13 @@ weights = dict(sorted(weights.items())) # for name, t in weights.items(): - # if not name.startswith("blk"): - # if name.startswith("blk.10."): - # if "ffn_gate." in name: - # print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}") - # print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}") - -print("Building LLM ...") +# if not name.startswith("blk"): +# if name.startswith("blk.10."): +# if "ffn_gate." 
in name: +# print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}") +# print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}") + +print("Building LLM ...") load_start_time = time.perf_counter() llm = build_llm(json_config, weights) load_end_time = time.perf_counter() @@ -395,22 +395,20 @@ config = AutoConfig.from_pretrained(safetensor_path, trust_remote_code=True) force_think = False -output_logits = torch.zeros((max_qlen, json_config['vocab_size']), dtype=torch.float32) +output_logits = torch.zeros((max_qlen, json_config["vocab_size"]), dtype=torch.float32) def start_chat(content=None): if content is None: content = input("Chat: ") - + messages = [{"role": "user", "content": content}] - input_tensor = tokenizer.apply_chat_template( - messages, add_generation_prompt=True, return_tensors="pt" - ) + input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt") if force_think: - token_thinks = torch.tensor([tokenizer.encode("\\n",add_special_tokens=False)],device=input_tensor.device) - input_tensor = torch.cat( - [input_tensor, token_thinks], dim=1 + token_thinks = torch.tensor( + [tokenizer.encode("\\n", add_special_tokens=False)], device=input_tensor.device ) + input_tensor = torch.cat([input_tensor, token_thinks], dim=1) input_tensor = input_tensor.squeeze(0) # Add batch dimension print(f"Input tensor: {input_tensor}, type {input_tensor.dtype}, shape {input_tensor.shape}") @@ -425,30 +423,32 @@ def start_chat(content=None): kvlens = [0] page_tables = [list(range(pages_count))] start_time = time.perf_counter() - llm.forward(qlens,page_tables, kvlens, input_tensor.data_ptr(), output_logits.data_ptr()) + llm.forward(qlens, page_tables, kvlens, input_tensor.data_ptr(), output_logits.data_ptr()) end_time = time.perf_counter() - print(f"Forward time: {end_time - start_time:.4f} seconds, tps: {qlens[0] / (end_time - start_time)} tokens/sec") - + print( + f"Forward time: {end_time - start_time:.4f} seconds, tps: {qlens[0] / (end_time - start_time)} tokens/sec" + ) + logits = output_logits[0] # print(logits) - # sample + # sample next_token = torch.argmax(logits).item() # print(f"Next token: {next_token}, {tokenizer.decode(next_token)}") # kvlen = input_tensor.shape[0] input_tensor = torch.cat((input_tensor, torch.tensor([next_token])), dim=-1) - - if next_token == tokenizer.eos_token_id or tokenizer.decode(next_token) == '<|im_end|>': + + if next_token == tokenizer.eos_token_id or tokenizer.decode(next_token) == "<|im_end|>": stream.end() break else: stream.put(torch.tensor([next_token])) + + job_id = 0 while True: try: # ---------- 让用户决定是否继续 ---------- - choice = input( - "\n【回车】开始对话 | 输入 1 读取文件 | 输入 q/quit/exit 退出程序: " - ).strip().lower() + choice = input("\n【回车】开始对话 | 输入 1 读取文件 | 输入 q/quit/exit 退出程序: ").strip().lower() if choice in {"q", "quit", "exit"}: print("收到退出指令,程序结束。") break @@ -472,15 +472,4 @@ while True: print(f"\n发生错误:{e}\n已终止对话 #{job_id},马上重启…") logger.error(f"Error in job {job_id}: {e}", exc_info=True) finally: - job_id += 1 # 不管中断与否,都给下一任务换编号 - - - - - - - - - - - + job_id += 1 # 不管中断与否,都给下一任务换编号 diff --git a/kt-kernel/examples/test_gate.py b/kt-kernel/examples/test_gate.py index d3c05f54..504c13bf 100644 --- a/kt-kernel/examples/test_gate.py +++ b/kt-kernel/examples/test_gate.py @@ -1,15 +1,17 @@ import math -import os,sys +import os, sys import time from typing import Optional + os.environ["BLAS_NUM_THREADS"] = "1" -sys.path.insert(0, 
os.path.dirname(__file__) + '/../build') -import kt_kernel_ext +sys.path.insert(0, os.path.dirname(__file__) + "/../build") +from kt_kernel import kt_kernel_ext from kt_kernel_ext.kvcache import ggml_type import torch from torch import nn import torch.nn.functional as F + # from modeling_deepseek_v3 import MoEGate from configuration_deepseek_v3 import DeepseekV3Config @@ -28,17 +30,20 @@ n_group = config.n_group topk_group = config.topk_group routed_scaling_factor = config.routed_scaling_factor -weights = torch.randn((n_routed_experts, hidden_size), dtype=torch.float32).to('cpu').contiguous() -bias = torch.randn((n_routed_experts,), dtype=torch.float32).to('cpu').contiguous() +weights = torch.randn((n_routed_experts, hidden_size), dtype=torch.float32).to("cpu").contiguous() +bias = torch.randn((n_routed_experts,), dtype=torch.float32).to("cpu").contiguous() + + # weights = torch.randn((n_routed_experts, hidden_size), dtype=torch.float16).to('cpu').contiguous () def load_fp32_tensor(file_path, shape): - return torch.zeros(shape, dtype=torch.float32).to('cpu').contiguous() - with open(file_path, 'rb') as f: + return torch.zeros(shape, dtype=torch.float32).to("cpu").contiguous() + with open(file_path, "rb") as f: raw_data = f.read() tensor = torch.frombuffer(raw_data, dtype=torch.float32) tensor = tensor.view(shape) # 根据你的 shape reshape return tensor + class MoEGate(nn.Module): def __init__(self, config): super().__init__() @@ -54,13 +59,9 @@ class MoEGate(nn.Module): # topk selection algorithm self.norm_topk_prob = config.norm_topk_prob self.gating_dim = config.hidden_size - self.weight = nn.Parameter( - torch.empty((self.n_routed_experts, self.gating_dim)) - ) + self.weight = nn.Parameter(torch.empty((self.n_routed_experts, self.gating_dim))) if self.topk_method == "noaux_tc": - self.e_score_correction_bias = nn.Parameter( - torch.empty((self.n_routed_experts)) - ) + self.e_score_correction_bias = nn.Parameter(torch.empty((self.n_routed_experts))) self.reset_parameters() def reset_parameters(self) -> None: @@ -73,93 +74,88 @@ class MoEGate(nn.Module): ### compute gating score hidden_states = hidden_states.view(-1, h) - h_to_check = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/gate_input',(seq_len,h)) + h_to_check = load_fp32_tensor( + "/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/gate_input", (seq_len, h) + ) diff = (h_to_check - hidden_states).abs().max() # print("hidden_states diff:", diff) # assert diff<0.02 - - bias_to_check = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/bias',(n_routed_experts)) + bias_to_check = load_fp32_tensor( + "/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/bias", (n_routed_experts) + ) diff = (bias - bias_to_check).abs().max() # print('bias diff:',diff) # assert diff < 0.02 + logits = F.linear(hidden_states.type(torch.float32), self.weight.type(torch.float32), None) - logits = F.linear( - hidden_states.type(torch.float32), self.weight.type(torch.float32), None + logits_to_check = load_fp32_tensor( + "/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/gate_logits", + (seq_len, n_routed_experts), ) - - logits_to_check = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/gate_logits',(seq_len,n_routed_experts)) diff = (logits_to_check - logits).abs().max() # print("logits diff:", diff) # assert diff < 0.02 - if 
self.scoring_func == "sigmoid": scores = logits.sigmoid() else: - raise NotImplementedError( - f"insupportable scoring function for MoE gating: {self.scoring_func}" - ) + raise NotImplementedError(f"insupportable scoring function for MoE gating: {self.scoring_func}") ### select top-k experts if self.topk_method == "noaux_tc": # assert not self.training scores_for_choice = scores.view(bsz * seq_len, -1) + self.e_score_correction_bias.unsqueeze(0) - scores_to_check = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/scores_to_choice',(seq_len,n_routed_experts)) + scores_to_check = load_fp32_tensor( + "/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/scores_to_choice", + (seq_len, n_routed_experts), + ) diff = (scores_for_choice - scores_to_check).abs().max() - print(f'score for choice diff = {diff}') - + print(f"score for choice diff = {diff}") group_scores = ( - scores_for_choice.view(bsz * seq_len, self.n_group, -1).topk(2, dim=-1)[0].sum(dim = -1) + scores_for_choice.view(bsz * seq_len, self.n_group, -1).topk(2, dim=-1)[0].sum(dim=-1) ) # [n, n_group] - group_scores_to_check = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/group_scores',(seq_len,n_group)) + group_scores_to_check = load_fp32_tensor( + "/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/group_scores", + (seq_len, n_group), + ) diff = (group_scores - group_scores_to_check).abs().max() - print(f'group scores diff = {diff}') + print(f"group scores diff = {diff}") - - group_idx = torch.topk( - group_scores, k=self.topk_group, dim=-1, sorted=False - )[ - 1 - ] # [n, top_k_group] + group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1] # [n, top_k_group] group_mask = torch.zeros_like(group_scores) # [n, n_group] group_mask.scatter_(1, group_idx, 1) # [n, n_group] score_mask = ( group_mask.unsqueeze(-1) - .expand( - bsz * seq_len, self.n_group, self.n_routed_experts // self.n_group - ) + .expand(bsz * seq_len, self.n_group, self.n_routed_experts // self.n_group) .reshape(bsz * seq_len, -1) ) # [n, e] tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), float("-inf")) # [n, e] - tmp_scores_to_check = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/gate_logits_toped',(seq_len,n_routed_experts)) - is_close = torch.isclose(tmp_scores, tmp_scores_to_check, rtol=1e-2, atol=1e-2, equal_nan=True) - print(f'tmp_score ok {is_close.all()}') - - - _, topk_idx = torch.topk( - tmp_scores, k=self.top_k, dim=-1, sorted=False + tmp_scores_to_check = load_fp32_tensor( + "/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/gate_logits_toped", + (seq_len, n_routed_experts), ) + is_close = torch.isclose(tmp_scores, tmp_scores_to_check, rtol=1e-2, atol=1e-2, equal_nan=True) + print(f"tmp_score ok {is_close.all()}") + + _, topk_idx = torch.topk(tmp_scores, k=self.top_k, dim=-1, sorted=False) topk_weight = scores.gather(1, topk_idx) else: - raise NotImplementedError( - f"insupportable TopK function for MoE gating: {self.topk_method}" - ) + raise NotImplementedError(f"insupportable TopK function for MoE gating: {self.topk_method}") ### norm gate to sum 1 if self.top_k > 1 and self.norm_topk_prob: denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20 topk_weight = topk_weight / denominator - topk_weight = topk_weight * self.routed_scaling_factor # must multiply the scaling factor + 
topk_weight = topk_weight * self.routed_scaling_factor # must multiply the scaling factor return topk_idx, topk_weight - def torch_gate(hidden_states): hidden_states.unsqueeze_(0) gate = MoEGate(config) @@ -172,11 +168,11 @@ def torch_gate(hidden_states): def cpuinfer_gate(hidden_states): config = kt_kernel_ext.gate.GateConfig( - hidden_size, - num_experts_per_token, - n_routed_experts, - n_group, - topk_group, + hidden_size, + num_experts_per_token, + n_routed_experts, + n_group, + topk_group, ) CPUInfer = kt_kernel_ext.CPUInfer(64) @@ -188,32 +184,29 @@ def cpuinfer_gate(hidden_states): config.e_score_correction_bias = bias.data_ptr() config.e_score_correction_bias_type = ggml_type.FP32 - gate = kt_kernel_ext.gate.MoEGate(config) + gate = kt_kernel_ext.gate.MoEGate(config) + expert_ids = torch.zeros((seqlen, num_experts_per_token), dtype=torch.int64).to("cpu").contiguous() + expert_weights = torch.zeros((seqlen, num_experts_per_token), dtype=torch.float32).to("cpu").contiguous() - - expert_ids = torch.zeros((seqlen, num_experts_per_token), dtype=torch.int64).to('cpu').contiguous() - expert_weights = torch.zeros((seqlen, num_experts_per_token), dtype=torch.float32).to('cpu').contiguous() - - gate.forward(seqlen,hidden_states.data_ptr(),expert_ids.data_ptr(), expert_weights.data_ptr()) + gate.forward(seqlen, hidden_states.data_ptr(), expert_ids.data_ptr(), expert_weights.data_ptr()) # print(expert_ids,expert_weights) return expert_ids, expert_weights -input = torch.randn(seqlen, hidden_size, dtype=torch.float32).to('cpu').contiguous() + +input = torch.randn(seqlen, hidden_size, dtype=torch.float32).to("cpu").contiguous() # print(input) -ids,we = cpuinfer_gate(input) +ids, we = cpuinfer_gate(input) idx = torch.argsort(ids, dim=-1, descending=True) -ids = torch.gather(ids,dim=-1,index=idx) -we = torch.gather(we,dim=-1,index=idx) +ids = torch.gather(ids, dim=-1, index=idx) +we = torch.gather(we, dim=-1, index=idx) - -std_ids,std_we= torch_gate(input) +std_ids, std_we = torch_gate(input) idx = torch.argsort(std_ids, dim=-1, descending=True) -std_we = torch.gather(std_we,dim=-1,index=idx) -std_ids = torch.gather(std_ids,dim=-1,index=idx) - +std_we = torch.gather(std_we, dim=-1, index=idx) +std_ids = torch.gather(std_ids, dim=-1, index=idx) # print("ids diff:", torch.abs(std_ids - ids).max()) @@ -221,28 +214,3 @@ std_ids = torch.gather(std_ids,dim=-1,index=idx) assert torch.abs(std_ids - ids).max() == 0, "Expert IDs do not match!" assert torch.abs(std_we - we).max() < 1e-2, "Expert Weights do not match!" 
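# --- Editorial sketch, not part of the patch: top-k expert routing has no defined
# order, so the checks above first canonicalize both results by sorting on expert
# id and gathering the weights with the same permutation. As a reusable helper:
def _canonicalize_topk_sketch(ids, weights):
    # Sort each row's expert ids (descending) and reorder the weights to match.
    idx = torch.argsort(ids, dim=-1, descending=True)
    return torch.gather(ids, dim=-1, index=idx), torch.gather(weights, dim=-1, index=idx)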
print("Expert IDs and Weights match successfully!") - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/kt-kernel/examples/test_k2_moe_amx.py b/kt-kernel/examples/test_k2_moe_amx.py index 903f9896..bca81efe 100644 --- a/kt-kernel/examples/test_k2_moe_amx.py +++ b/kt-kernel/examples/test_k2_moe_amx.py @@ -6,7 +6,7 @@ from typing import Dict, Literal sys.path.insert(0, os.path.dirname(__file__) + "/../build") import torch -import kt_kernel_ext +from kt_kernel import kt_kernel_ext torch.manual_seed(42) @@ -132,6 +132,7 @@ def pack_to_int32(value: torch.Tensor, num_bits: int, packed_dim: Literal[0, 1] return packed + def pack_tensor_per_row(q: torch.Tensor, num_bits: int) -> torch.Tensor: e, rows, cols = q.shape flat = q.view(e * rows, cols) @@ -283,9 +284,9 @@ def run_case(pattern: str) -> Dict[str, float]: CPUInfer.sync() input_tensor_fp16 = input_tensor.to(torch.float16) - t_output = moe_torch( - input_tensor_fp16, expert_ids, weights, gate_fp16, up_fp16, down_fp16 - ).to(torch.bfloat16) + t_output = moe_torch(input_tensor_fp16, expert_ids, weights, gate_fp16, up_fp16, down_fp16).to( + torch.bfloat16 + ) t_output = t_output.flatten() output = output.flatten() diff --git a/kt-kernel/examples/test_k2_write_buffer.py b/kt-kernel/examples/test_k2_write_buffer.py index 210a4a3d..4b156ed6 100644 --- a/kt-kernel/examples/test_k2_write_buffer.py +++ b/kt-kernel/examples/test_k2_write_buffer.py @@ -11,7 +11,7 @@ import numpy as np # if REPO_ROOT not in sys.path: # sys.path.insert(0, REPO_ROOT) -import kt_kernel_ext +from kt_kernel import kt_kernel_ext from kt_kernel_ext import CPUInfer @@ -57,10 +57,10 @@ def allocate_weights(expert_num, hidden_size, intermediate_size, group_size): def main(): torch.manual_seed(123) - expert_num = 256 # Total experts + expert_num = 256 # Total experts gpu_experts = expert_num # Number of experts on GPU gpu_tp_count = 2 # Number of TP parts - + num_experts_per_tok = 8 hidden_size = 7168 intermediate_size = 2048 @@ -89,9 +89,7 @@ def main(): moe = kt_kernel_ext.moe.AMXInt4_KGroup_MOE(cfg) - physical_to_logical_map = ( - torch.arange(expert_num, dtype=torch.int64, device="cpu").contiguous() - ) + physical_to_logical_map = torch.arange(expert_num, dtype=torch.int64, device="cpu").contiguous() cpuinfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr())) cpuinfer.sync() @@ -169,6 +167,7 @@ def main(): total_bytes = total_weights // group_size + total_weights // 2 print(f"write_weight_scale_to_buffer time: {elapsed_ms:.2f} ms") print(f"Throughput: {total_bytes / (elapsed_ms * 1e6):.2f} GB/s") + def split_expert_tensor(tensor, chunk): """Split tensor by experts""" return [tensor[i * chunk : (i + 1) * chunk] for i in range(expert_num)] @@ -229,10 +228,10 @@ def main(): tp_scale_offset = col_scale_start + tp_idx * tp_slice_scale_size down_weight_tp_parts.append( - down_q_experts[expert_idx][tp_weight_offset:tp_weight_offset + tp_slice_weight_size] + down_q_experts[expert_idx][tp_weight_offset : tp_weight_offset + tp_slice_weight_size] ) down_scale_tp_parts.append( - down_scale_experts[expert_idx][tp_scale_offset:tp_scale_offset + tp_slice_scale_size] + down_scale_experts[expert_idx][tp_scale_offset : tp_scale_offset + tp_slice_scale_size] ) # Concatenate all column slices for this TP @@ -260,7 +259,9 @@ def main(): assert torch.equal(w2_weight_bufs[tp_idx], expected_w2_weight), f"w2 weight bytes mismatch for TP {tp_idx}" assert torch.allclose(w2_scale_bufs[tp_idx], expected_w2_scale), f"w2 scale values mismatch for TP {tp_idx}" - print(f"\n✓ 
write_weight_scale_to_buffer passed: extracted {gpu_experts} GPU experts across {gpu_tp_count} TP parts from total {expert_num} experts") + print( + f"\n✓ write_weight_scale_to_buffer passed: extracted {gpu_experts} GPU experts across {gpu_tp_count} TP parts from total {expert_num} experts" + ) if __name__ == "__main__": diff --git a/kt-kernel/examples/test_linear.py b/kt-kernel/examples/test_linear.py index f2fa6cb3..d90930f1 100644 --- a/kt-kernel/examples/test_linear.py +++ b/kt-kernel/examples/test_linear.py @@ -1,26 +1,27 @@ #!/usr/bin/env python # coding=utf-8 -''' -Description : +""" +Description : Author : chenht2022 Date : 2024-07-25 10:32:05 Version : 1.0.0 -LastEditors : chenht2022 +LastEditors : chenht2022 LastEditTime : 2024-08-06 10:36:59 -Copyright (c) 2024 by KVCache.AI, All Rights Reserved. -''' +Copyright (c) 2024 by KVCache.AI, All Rights Reserved. +""" import os, sys import time -sys.path.append(os.path.dirname(__file__) + '/../build') -import kt_kernel_ext + +sys.path.append(os.path.dirname(__file__) + "/../build") +from kt_kernel import kt_kernel_ext import torch input_size = 16384 output_size = 5120 stride = 32 group_max_len = 1024 -proj_type = 1 # ggml_type::GGML_TYPE_F16 -hidden_type = 1 # ggml_type::GGML_TYPE_F16 +proj_type = 1 # ggml_type::GGML_TYPE_F16 +hidden_type = 1 # ggml_type::GGML_TYPE_F16 qlen = 30 layer_num = 10 CPUInfer = kt_kernel_ext.CPUInfer(48) @@ -30,8 +31,10 @@ with torch.inference_mode(mode=True): linears = [] projs = [] for _ in range(layer_num): - proj = torch.randn((output_size, input_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous() - config = kt_kernel_ext.linear.LinearConfig(input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type) + proj = torch.randn((output_size, input_size), dtype=torch.float16, device="cuda").to("cpu").contiguous() + config = kt_kernel_ext.linear.LinearConfig( + input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type + ) linear = kt_kernel_ext.linear.Linear(config) projs.append(proj) linears.append(linear) @@ -43,20 +46,14 @@ with torch.inference_mode(mode=True): output = torch.empty((qlen, output_size), dtype=torch.float16).contiguous() input = input / 100 - CPUInfer.submit( - linear.forward( - qlen, - input.data_ptr(), - output.data_ptr() - ) - ) + CPUInfer.submit(linear.forward(qlen, input.data_ptr(), output.data_ptr())) CPUInfer.sync() # print('cpuinfer output', output) - proj = projs[i%layer_num] + proj = projs[i % layer_num] t_output = torch.mm(input, proj.t()) # print('torch output', t_output) diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output)) - print('diff = ', diff) - assert(diff < 0.001) + print("diff = ", diff) + assert diff < 0.001 diff --git a/kt-kernel/examples/test_mla.py b/kt-kernel/examples/test_mla.py index c9256189..bb9b512d 100644 --- a/kt-kernel/examples/test_mla.py +++ b/kt-kernel/examples/test_mla.py @@ -1,19 +1,22 @@ import logging -import os,sys +import os, sys import time from typing import Optional + os.environ["BLAS_NUM_THREADS"] = "1" -sys.path.insert(0, os.path.dirname(__file__) + '/../build') -import kt_kernel_ext +sys.path.insert(0, os.path.dirname(__file__) + "/../build") +from kt_kernel import kt_kernel_ext from kt_kernel_ext.kvcache import ggml_type import torch from torch import inf, nn from torch.nn import init -from torch_attention import apply_rotary_pos_emb,DeepseekV2RMSNorm,KDeepSeekV3Cache,DeepseekV3YarnRotaryEmbedding +from torch_attention import 
apply_rotary_pos_emb, DeepseekV2RMSNorm, KDeepSeekV3Cache, DeepseekV3YarnRotaryEmbedding + logger = logging.getLogger("reader") from gguf.gguf_reader import GGUFReader + def read_gguf_file(gguf_file_path): """ Reads and prints key-value pairs and tensor information from a GGUF file in an improved format. @@ -46,12 +49,15 @@ def read_gguf_file(gguf_file_path): re.append(tensor) return re + def get_torch_tensor_from_gguf(gguf_weights, name): return torch.from_numpy(gguf_weights[name].data).contiguous() + def get_torch_tensor_and_type_from_gguf(gguf_weights, name): return torch.from_numpy(gguf_weights[name].data).contiguous(), gguf_weights[name].tensor_type.name + def type_to_ggml_type(type): if type == "F32": return ggml_type.FP32 @@ -70,12 +76,12 @@ seed = 42 # 你可以选择任何整数作为种子 torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) -qlen = 3212 +qlen = 3212 kvlen = 0 page_table = range(20) -bsz_tensors=torch.tensor([1]) +bsz_tensors = torch.tensor([1]) page_size = 256 @@ -94,8 +100,7 @@ rope_theta = 10000 max_qlen = 4096 max_kvlen = 4096 -max_position_embeddings = 163840 - +max_position_embeddings = 163840 rope_scaling = { @@ -105,11 +110,10 @@ rope_scaling = { "mscale": 1.0, "mscale_all_dim": 1.0, "original_max_position_embeddings": 4096, - "type": "yarn" + "type": "yarn", } - CPUInfer = kt_kernel_ext.CPUInfer(30) validation_iter = 100 @@ -119,15 +123,16 @@ weight_type = torch.bfloat16 # weight_type = torch.float16 -input_type = {torch.float32:torch.float32, - torch.float16:torch.float16, - torch.bfloat16:torch.float32, - }[weight_type] +input_type = { + torch.float32: torch.float32, + torch.float16: torch.float16, + torch.bfloat16: torch.float32, +}[weight_type] q_a_proj = nn.Linear(hidden_size, q_lora_rank, bias=False, dtype=weight_type) -q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size+rope_size) , bias=False, dtype=weight_type) +q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size + rope_size), bias=False, dtype=weight_type) kv_a_proj_with_mqa = nn.Linear(hidden_size, kv_lora_rank + rope_size, bias=False, dtype=weight_type) -kv_b_proj = nn.Linear( num_heads * (nope_size + nope_size),kv_lora_rank, bias=False, dtype=weight_type) +kv_b_proj = nn.Linear(num_heads * (nope_size + nope_size), kv_lora_rank, bias=False, dtype=weight_type) o_proj = nn.Linear(num_heads * nope_size, hidden_size, bias=False, dtype=weight_type) q_a_norm = torch.ones(hidden_size, dtype=torch.float32) kv_a_norm = torch.ones(hidden_size, dtype=torch.float32) @@ -190,7 +195,7 @@ if use_real_weights := True: o_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_output.weight") o_proj.weight = nn.Parameter(o_proj_weight.view(torch.bfloat16), requires_grad=False) - + else: init.normal_(q_a_proj.weight, mean=0.0, std=0.02) init.normal_(q_b_proj.weight, mean=0.0, std=0.02) @@ -203,16 +208,16 @@ q_absorb = x_reshaped[:, 0] out_absorb = x_reshaped[:, 1] -hidden_states = torch.randn((qlen, hidden_size), dtype=input_type).to('cpu').contiguous() +hidden_states = torch.randn((qlen, hidden_size), dtype=input_type).to("cpu").contiguous() def test_cpu_mla(): os.environ["BLAS_NUM_THREADS"] = "1" - q_a_proj_weight = q_a_proj.weight.to(weight_type).to('cpu').contiguous() - q_b_proj_weight = q_b_proj.weight.to(weight_type).to('cpu').contiguous() - kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to('cpu').to(weight_type).contiguous() - kv_b_proj_weight = kv_b_proj.weight.to(weight_type).to('cpu').contiguous() - o_proj_weight = o_proj.weight.to(weight_type).to('cpu').contiguous() + 
q_a_proj_weight = q_a_proj.weight.to(weight_type).to("cpu").contiguous() + q_b_proj_weight = q_b_proj.weight.to(weight_type).to("cpu").contiguous() + kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to("cpu").to(weight_type).contiguous() + kv_b_proj_weight = kv_b_proj.weight.to(weight_type).to("cpu").contiguous() + o_proj_weight = o_proj.weight.to(weight_type).to("cpu").contiguous() config = kt_kernel_ext.mla.MLAConfig( hidden_size, @@ -224,7 +229,7 @@ def test_cpu_mla(): ) config.max_qlen = max_qlen config.max_kvlen = max_kvlen - config.max_position_embeddings = max_position_embeddings + config.max_position_embeddings = max_position_embeddings config.rope_scaling_factor = rope_scaling["factor"] config.rope_theta = rope_theta config.rope_scaling_beta_fast = rope_scaling["beta_fast"] @@ -245,7 +250,6 @@ def test_cpu_mla(): config.kv_a_norm_type = ggml_type.FP32 config.page_count = pages_count - if weight_type == torch.float32: config.q_a_proj_type = ggml_type.FP32 config.q_b_proj_type = ggml_type.FP32 @@ -267,10 +271,8 @@ def test_cpu_mla(): else: raise ValueError(f"Unsupported data type: {weight_type}") - config.pool = CPUInfer.backend_ - if weight_type == torch.float32: mla = kt_kernel_ext.mla.MLA_F32(config) elif weight_type == torch.float16: @@ -280,54 +282,53 @@ def test_cpu_mla(): mla = kt_kernel_ext.mla.MLA_QUAN_F32(config) else: raise ValueError(f"Unsupported data type: {weight_type}") - + mla.load_weights() mla.set_local_pages(pages_count) - output = torch.zeros((qlen, hidden_size), dtype=input_type).to('cpu').contiguous() - mla.forward([qlen],[page_table],[kvlen],hidden_states.data_ptr(),output.data_ptr()) - print("CPU MLA Output: ",output) + output = torch.zeros((qlen, hidden_size), dtype=input_type).to("cpu").contiguous() + mla.forward([qlen], [page_table], [kvlen], hidden_states.data_ptr(), output.data_ptr()) + print("CPU MLA Output: ", output) return output - - def load_fp16_tensor(file_path, shape): # return load_fp32_tensor(file_path, shape) return torch.zeros(shape) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: raw_data = f.read() tensor = torch.frombuffer(raw_data, dtype=weight_type) tensor = tensor.view(shape) # 根据你的 shape reshape return tensor + def load_fp32_tensor(file_path, shape): return torch.zeros(shape) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: raw_data = f.read() tensor = torch.frombuffer(raw_data, dtype=torch.float32) tensor = tensor.view(shape) # 根据你的 shape reshape return tensor + def test_torch(): torch.set_grad_enabled(False) softmax_scale = (nope_size + rope_size) ** -0.5 # 1代表的是压缩的kv的头数 - k_caches = torch.randn(1,pages_count, page_size, 1, kv_lora_rank + rope_size).to(weight_type) + k_caches = torch.randn(1, pages_count, page_size, 1, kv_lora_rank + rope_size).to(weight_type) kv_cache = KDeepSeekV3Cache(page_size=page_size, kv_lora_rank=kv_lora_rank, k_caches=k_caches) q_a_layernorm = DeepseekV2RMSNorm(q_lora_rank) - q_a_layernorm.weight = nn.Parameter( q_a_norm,requires_grad=False) + q_a_layernorm.weight = nn.Parameter(q_a_norm, requires_grad=False) - x = torch.randn(q_lora_rank, dtype=weight_type)*100 + x = torch.randn(q_lora_rank, dtype=weight_type) * 100 print(x) print(q_a_layernorm(x)) kv_a_layernorm = DeepseekV2RMSNorm(kv_lora_rank) kv_a_layernorm.weight = nn.Parameter(kv_a_norm, requires_grad=False) - # 第三步:拆分成两个 tensor # q_absorb, out_absorb = x_permuted[:, 0], x_permuted[:, 1] # 都是 (num_heads, nope_size, kv_lora_rank # q_absorb = kv_b_proj[:, ] # torch.randn(num_heads, nope_size, 
kv_lora_rank, dtype=data_type) @@ -348,65 +349,64 @@ def test_torch(): # kv_indices 是[0:bsz],page_idx=[0:bsz], page_offset=[kvlen:qlen+kvlen] # last_page_len = [qlen+kvlen,...] layer_idx = 1 # position_ids = [kvlen:qlen+kvlen] - q_indptr = torch.tensor([0,qlen]).to(torch.int32) + q_indptr = torch.tensor([0, qlen]).to(torch.int32) - kv_indptr = torch.tensor([0,(qlen+kvlen+page_size-1)//page_size]).to(torch.int32) + kv_indptr = torch.tensor([0, (qlen + kvlen + page_size - 1) // page_size]).to(torch.int32) kv_indices = torch.tensor(range(pages_count)).to(torch.int32) - page_idx = torch.tensor([i//page_size for i in range(kvlen,kvlen+qlen)] ).to(torch.int32) - page_offset = torch.tensor( [i%page_size for i in range(kvlen, kvlen + qlen)]).to(torch.int32) + page_idx = torch.tensor([i // page_size for i in range(kvlen, kvlen + qlen)]).to(torch.int32) + page_offset = torch.tensor([i % page_size for i in range(kvlen, kvlen + qlen)]).to(torch.int32) last_page_len = torch.tensor([256], device=hidden_states.device) position_ids = torch.tensor(range(kvlen, kvlen + qlen)).to(torch.int32) - # 按照行创建 mask [qlen,kvlen+qlen] attention_masks = torch.zeros((max_qlen, max_kvlen), dtype=weight_type) for i in range(max_qlen): - attention_masks[i, i + kvlen + 1:] = -inf + attention_masks[i, i + kvlen + 1 :] = -inf - - def torch_attn(hidden_states_i: torch.Tensor, - kv_cache: KDeepSeekV3Cache, - position_ids: torch.Tensor, - page_idx: torch.Tensor, - page_offset: torch.Tensor, - attention_masks: Optional[list[torch.Tensor]] = None, - q_indptr: Optional[torch.Tensor] = None, - kv_indices: Optional[torch.Tensor] = None, - kv_indptr: Optional[torch.Tensor] = None, - bsz_tensors: Optional[torch.Tensor] = None, - last_page_len: Optional[torch.Tensor] = None, - layer_idx: Optional[int] = None, - ): + def torch_attn( + hidden_states_i: torch.Tensor, + kv_cache: KDeepSeekV3Cache, + position_ids: torch.Tensor, + page_idx: torch.Tensor, + page_offset: torch.Tensor, + attention_masks: Optional[list[torch.Tensor]] = None, + q_indptr: Optional[torch.Tensor] = None, + kv_indices: Optional[torch.Tensor] = None, + kv_indptr: Optional[torch.Tensor] = None, + bsz_tensors: Optional[torch.Tensor] = None, + last_page_len: Optional[torch.Tensor] = None, + layer_idx: Optional[int] = None, + ): global out_absorb global q_absorb hidden_states = hidden_states_i.to(weight_type) # range bsz_tensors final_attention_output = torch.tensor([], device=hidden_states.device) for i in range(bsz_tensors[0]): - batch_num_tokens_tensors = q_indptr[i+1] - q_indptr[i] + batch_num_tokens_tensors = q_indptr[i + 1] - q_indptr[i] batch_last_page_len = last_page_len[i] # kv_total_len is kv_len, batch_compressed_kv is compressed_kv, batch_k_pe is k_pe - batch_page_idx = page_idx[q_indptr[i]:q_indptr[i+1]] - batch_page_offset = page_offset[q_indptr[i]:q_indptr[i+1]] + batch_page_idx = page_idx[q_indptr[i] : q_indptr[i + 1]] + batch_page_offset = page_offset[q_indptr[i] : q_indptr[i + 1]] # kv_page_nums is the number of pages for the current batch - kv_page_nums = kv_indptr[i+1] - kv_indptr[i] + kv_page_nums = kv_indptr[i + 1] - kv_indptr[i] # kv_total_len is the total length of the kv cache for the current batch (kv_len for algorithm) kv_total_len = kv_page_nums * page_size if batch_last_page_len is not None: kv_total_len = kv_total_len - (page_size - batch_last_page_len) # print(f"kv_total_len's shape {kv_total_len.shape}") # kv_index is the index of the kv cache pages for the current batch - kv_index = kv_indices[kv_indptr[i]:kv_indptr[i+1]] + kv_index = 
kv_indices[kv_indptr[i] : kv_indptr[i + 1]] # we can index [kv_index, page_offset_indices] to get the kv cache for the current batch # from q_indptr[i] to q_indptr[i+1] is the range of the current batch - batch_hidden_states = hidden_states[q_indptr[i]:q_indptr[i+1]] - batch_position_ids = position_ids[q_indptr[i]:q_indptr[i+1]] + batch_hidden_states = hidden_states[q_indptr[i] : q_indptr[i + 1]] + batch_position_ids = position_ids[q_indptr[i] : q_indptr[i + 1]] qlen, _ = batch_hidden_states.size() # print("qlen -> ", qlen) - hidden_states_to_check = load_fp16_tensor('./debug/query_0_tp_0_input.bin',batch_hidden_states.shape) + hidden_states_to_check = load_fp16_tensor("./debug/query_0_tp_0_input.bin", batch_hidden_states.shape) diff = torch.abs(batch_hidden_states - hidden_states_to_check).max() print("hidden_states diff -> ", diff) @@ -422,8 +422,6 @@ def test_torch(): # print("q_lora mae -> ", mae) # print("q_lora mae test -> ", mae_test) - - q_lora_norm = q_a_layernorm(q_lora) # q_lora_norm_to_check = load_fp16_tensor('./debug/query_0_tp_0_qlora_norm.bin', q_lora_norm.shape) # q_lora_norm_to_check_test = load_fp16_tensor('./debug/query_0_tp_0_qlora_norm_test.bin', q_lora_norm.shape) @@ -435,30 +433,25 @@ def test_torch(): # print("q_lora_norm mae -> ", mae) # print("q_lora_norm diff test -> ", diff_test) # print("q_lora_norm mae test -> ", mae_test) - + q = q_b_proj(q_lora_norm) # for v3, bsz, qlen, num_heads(128), qk_head_dim(192=128(nope)+64(rope)) - q = q.view(qlen, num_heads, nope_size+rope_size) + q = q.view(qlen, num_heads, nope_size + rope_size) # q_nope is [qlen, num_heads(128), qk_nope_head_dim(128)] # q_pe is [qlen, num_heads(128), qk_rope_head_dim(64)] - q_nope, q_pe = torch.split( - q, [nope_size, rope_size], dim=-1 - ) - + q_nope, q_pe = torch.split(q, [nope_size, rope_size], dim=-1) + # compressed_kv is [qlen, kv_lora_rank(512) + rope(64)] compressed_kv = kv_a_proj_with_mqa(batch_hidden_states) # compressed_kv is [qlen, kv_lora_rank(512)], k_pe is [qlen, rope(64)] - compressed_kv, k_pe = torch.split( - compressed_kv, [kv_lora_rank, rope_size], dim=-1 - ) + compressed_kv, k_pe = torch.split(compressed_kv, [kv_lora_rank, rope_size], dim=-1) compressed_kv = compressed_kv.contiguous() - # compressed_kv_page_0 = compressed_kv[0:page_size, :] # compressed_kv_to_check = load_fp16_tensor('./debug/query_0_tp_0_page_0_kv_lora_rank', # compressed_kv_page_0.shape) # diff = torch.abs(compressed_kv_page_0 - compressed_kv_to_check).max() - # mae = torch.mean(torch.abs(compressed_kv_page_0 - compressed_kv_to_check)) + # mae = torch.mean(torch.abs(compressed_kv_page_0 - compressed_kv_to_check)) # print("compressed_kv diff -> ", diff) # print("compressed_kv mae -> ", mae) @@ -472,14 +465,11 @@ def test_torch(): # mae = torch.mean(torch.abs(compressed_kv_page_0 - compressed_kv_to_check)) # print("compressed_kv diff norm -> ", diff) # print("compressed_kv mae norm -> ", mae) - - - k_pe = k_pe.view(qlen, 1, rope_size) # compressed_kv is [qlen, 1, kv_lora_rank(512)] compressed_kv = compressed_kv.view(qlen, 1, kv_lora_rank) - + cos, sin = rotary_emb(q_pe, batch_position_ids) # q_nope_check = q_nope.transpose(0, 1) # qlen is 1, no GPU overhead, same below @@ -494,8 +484,8 @@ def test_torch(): # print("q_nope[0] mae -> ", mae) # print("q_nope[0] diff test -> ", diff_test) # print("q_nope[0] mae test -> ", mae_test) - - q_pe_nope = q_pe.transpose(0,1) + + q_pe_nope = q_pe.transpose(0, 1) # q_pe_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_rope', q_pe_nope[0].shape) # q_pe_0_to_check = 
load_fp16_tensor('./debug/query_0_tp_0_q_rope_no_rope', q_pe_nope[0].shape) # q_pe_0_to_check_test = load_fp16_tensor('./debug/query_0_tp_0_q_rope_no_rope_test', q_pe_nope[0].shape) @@ -534,12 +524,11 @@ def test_torch(): q_pe, k_pe = apply_rotary_pos_emb(q_pe.unsqueeze(0), k_pe.unsqueeze(0), cos, sin, unsqueeze_dim=1) q_pe = q_pe.squeeze(0) # q_pe is [num_heads(128), qlen, qk_rope_head_dim(64)] - q_pe.transpose_(0, 1) + q_pe.transpose_(0, 1) # diff = torch.abs(q_pe - q_new).max() # print("q_pe diff -> ", diff) - # q_pe_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_rope', q_pe[0].shape) # diff = torch.abs(q_pe[0] - q_pe_0_to_check).max() # mae = torch.mean(torch.abs(q_pe[0] - q_pe_0_to_check)) @@ -552,15 +541,22 @@ def test_torch(): # print("q_pe[0] 2 mae -> ", mae) if kv_cache is not None: - cache_kwargs = {"sin": sin, "cos": cos, "page_idx": batch_page_idx, "page_offset": batch_page_offset} # Specific to RoPE models - compressed_kv_with_k_pe = kv_cache.update(compressed_kv.unsqueeze(0), k_pe, layer_idx, batch_page_idx, batch_page_offset, cache_kwargs) - compressed_kv = compressed_kv_with_k_pe [:, :, :, :kv_lora_rank].view(-1, page_size, kv_lora_rank) - k_pe = compressed_kv_with_k_pe [:, :, :, kv_lora_rank:].view(-1, page_size, rope_size) + cache_kwargs = { + "sin": sin, + "cos": cos, + "page_idx": batch_page_idx, + "page_offset": batch_page_offset, + } # Specific to RoPE models + compressed_kv_with_k_pe = kv_cache.update( + compressed_kv.unsqueeze(0), k_pe, layer_idx, batch_page_idx, batch_page_offset, cache_kwargs + ) + compressed_kv = compressed_kv_with_k_pe[:, :, :, :kv_lora_rank].view(-1, page_size, kv_lora_rank) + k_pe = compressed_kv_with_k_pe[:, :, :, kv_lora_rank:].view(-1, page_size, rope_size) # q_absorb is [num_heads(128), qk_nope_head_dim(128), kv_lora_rank(512)] # out_absorb is [num_heads(128), kv_lora_rank(512), v_head_dim(128)] v_head_dim is also the nope dim # q_absorb, out_absorb = get_absorbed() # q_nope is [num_heads(128), qlen, qk_nope_head_dim(128)] - q_nope = q_nope.transpose(0, 1) # qlen is 1, no GPU overhead, same below + q_nope = q_nope.transpose(0, 1) # qlen is 1, no GPU overhead, same below # q_nope_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_nope', q_nope[0].shape) # diff = torch.abs(q_nope[0] - q_nope_0_to_check).max() @@ -568,7 +564,7 @@ def test_torch(): # print("q_nope[0] diff -> ", diff) # q_nope is [num_heads(128), qlen, kv_lora_rank(512)] - q_nope = torch.matmul(q_nope, q_absorb) # batched MM + q_nope = torch.matmul(q_nope, q_absorb) # batched MM # k_b_proj_check = load_fp16_tensor('./debug/query_0_tp_0_k_b_lora', (nope_size,kv_lora_rank)) # diff = torch.abs(q_absorb[0] - k_b_proj_check).max() @@ -594,7 +590,7 @@ def test_torch(): if batch_compressed_kv is None or batch_k_pe is None: batch_compressed_kv = tmp_compressed_kv batch_k_pe = tmp_k_pe - else: + else: batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0) batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0) kv_total_len -= page_size @@ -604,28 +600,27 @@ def test_torch(): if batch_compressed_kv is None or batch_k_pe is None: batch_compressed_kv = tmp_compressed_kv batch_k_pe = tmp_k_pe - else: + else: batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0) batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0) break # batch_compressed_kv is [kv_total_len(k_len), kv_lora_rank(512)] # batch_k_pe is [kv_total_len(k_len), qk_rope_head_dim(64)] - # k_pe_to_check = load_fp16_tensor('./debug/query_0_tp_0_page_0_k_rope', (256,64)) # 
diff = torch.abs(batch_k_pe[:256] - k_pe_to_check).max() # mae = torch.mean(torch.abs(batch_k_pe[:256] - k_pe_to_check)) # print("k_pe diff -> ", diff) # print("k_pe mae -> ", mae) - pe_weights = torch.matmul(q_pe,batch_k_pe.mT) + pe_weights = torch.matmul(q_pe, batch_k_pe.mT) kv_total_len = kv_page_nums * page_size # pe_weights_0 = load_fp16_tensor('./debug/query_0_tp_0_pe_attention_weights', (1024,4096)) # pe_weights_0 = pe_weights_0[0:qlen, 0:kv_total_len] # diff = torch.abs(pe_weights[0] - pe_weights_0).max() # print("pe_weights[0] diff -> ", diff) - attention_weights = (pe_weights + torch.matmul(q_nope, batch_compressed_kv.mT)) + attention_weights = pe_weights + torch.matmul(q_nope, batch_compressed_kv.mT) # raw_weights = load_fp16_tensor('./debug/query_0_tp_0_raw_attention_weights', (1024, 4096)) # raw_weights = raw_weights[0:qlen, 0:kv_total_len] @@ -634,47 +629,47 @@ def test_torch(): attention_weights = attention_weights * softmax_scale # attention_weights is [num_heads(128), qlen, k_len] - + # attention_weights = attention_weights.transpose(0,1).unsqueeze(0).squeeze(-1).expand(qlen,-1,-1).transpose(0,1) - + # attention_masks[i] is [qlen, k_len] - + print(attention_weights.shape) print(attention_masks.shape) - attention_weights = (attention_weights + attention_masks[ :attention_weights.shape[1],:attention_weights.shape[2]]) + attention_weights = ( + attention_weights + attention_masks[: attention_weights.shape[1], : attention_weights.shape[2]] + ) # attention_weights shape is [num_heads(128), qlen, k_len] - - attention_weights = nn.functional.softmax(attention_weights,dim=-1,dtype=weight_type).to(q_pe.dtype) + attention_weights = nn.functional.softmax(attention_weights, dim=-1, dtype=weight_type).to(q_pe.dtype) # attention_weights_0 = load_fp16_tensor('./debug/query_0_tp_0_attention_weights', (1024, 4096)) # attention_weights_0 = attention_weights_0[0:qlen, 0:kv_total_len] # diff = torch.abs(attention_weights[0] - attention_weights_0).max() # print("attention_weights[0] diff -> ", diff) - - attn_output = torch.matmul(attention_weights, batch_compressed_kv) # [num_heads(128),qlen, lora_rank(512)] + attn_output = torch.matmul(attention_weights, batch_compressed_kv) # [num_heads(128),qlen, lora_rank(512)] # out_absorb shape is [num_heads(128), kv_lora_rank(512), v_head_dim(128)] # o_absorb_check = load_fp16_tensor('./debug/query_0_tp_0_o_absorb', (qlen,kv_lora_rank)) # diff = torch.abs(attn_output[0] - o_absorb_check).max() # print("o absorb[0] diff -> ", diff) - out_absorb = out_absorb.transpose(1, 2) # [qlen, num_heads(128), v_head_dim(128)] + out_absorb = out_absorb.transpose(1, 2) # [qlen, num_heads(128), v_head_dim(128)] # q for qlen, n for num_heads, h for v_head_dim, v for kv_lora_rank - attn_output = torch.matmul(attn_output, out_absorb) # [num_heads(128), qlen, v_head_dim(128)] + attn_output = torch.matmul(attn_output, out_absorb) # [num_heads(128), qlen, v_head_dim(128)] # attn_output_check_0 = load_fp16_tensor('./debug/query_0_tp_0_attention_output', (qlen, nope_size)) # diff = torch.abs(attn_output[0] - attn_output_check_0).max() # print("attn_output[0] diff -> ", diff) - attn_output = attn_output.transpose(0, 1) # [qlen, num_heads(128), v_head_dim(128)] + attn_output = attn_output.transpose(0, 1) # [qlen, num_heads(128), v_head_dim(128)] attn_output = attn_output.reshape(qlen, num_heads * nope_size) - w_o = o_proj.weight.view([hidden_size,num_heads * nope_size]) - output = torch.matmul(attn_output,w_o.transpose(0,1)) + w_o = o_proj.weight.view([hidden_size, num_heads * 
nope_size])
+ output = torch.matmul(attn_output, w_o.transpose(0, 1))
output = output.view(qlen, hidden_size)
-
+
# output_0_check = load_fp16_tensor('./debug/query_0_tp_0_qlen_output', (qlen, hidden_size))
# h1_o = w_o[:,:128]
# local_o_check = load_fp16_tensor('./debug/query_0_tp_0_local_w_o', (hidden_size, 128))
@@ -685,35 +680,32 @@ def test_torch():
# diff = torch.abs(h1_output - output_0_check).max()
# print("h1_output diff -> ", diff)
-
# output_check = load_fp16_tensor('./debug/output.bin', output.shape)
# diff = torch.abs(output - output_check).max()
# mae = torch.mean(torch.abs(output - output_check))
# print("output diff -> ", diff)
-
final_attention_output = torch.cat((final_attention_output, output), dim=0)
return final_attention_output
-
torch_output = torch_attn(
- hidden_states,
- kv_cache,
- position_ids,
- page_idx,
- page_offset,
- attention_masks=attention_masks,
- q_indptr=q_indptr,
- kv_indices=kv_indices,
- kv_indptr=kv_indptr,
- bsz_tensors=bsz_tensors,
- last_page_len=last_page_len,
- layer_idx=0
- )
- print("Torch Output: ",torch_output)
+ hidden_states,
+ kv_cache,
+ position_ids,
+ page_idx,
+ page_offset,
+ attention_masks=attention_masks,
+ q_indptr=q_indptr,
+ kv_indices=kv_indices,
+ kv_indptr=kv_indptr,
+ bsz_tensors=bsz_tensors,
+ last_page_len=last_page_len,
+ layer_idx=0,
+ )
+ print("Torch Output: ", torch_output)
return torch_output
+
torch.set_printoptions(sci_mode=False, precision=5)
output_cpu = test_cpu_mla()
output_torch = test_torch()
@@ -724,11 +716,9 @@ diff = (output_cpu - output_torch).abs()
diff_relative = diff / (output_cpu.abs())
# replace NaN entries in diff_relative with 0
diff_relative = torch.where(torch.isnan(diff_relative), torch.zeros_like(diff_relative), diff_relative)
-diff_relative_mean = torch.mean(torch.abs(output_cpu-output_torch)) / torch.mean(torch.abs(output_torch))
+diff_relative_mean = torch.mean(torch.abs(output_cpu - output_torch)) / torch.mean(torch.abs(output_torch))
-print(f'Diff: ave:{diff.mean()}, max:{diff.max()}, min:{diff.min()}, relative_mean:{diff_relative_mean}, relative_max:{diff_relative.max()}, relative_min:{diff_relative.min()}')
+print(
+ f"Diff: ave:{diff.mean()}, max:{diff.max()}, min:{diff.min()}, relative_mean:{diff_relative_mean}, relative_max:{diff_relative.max()}, relative_min:{diff_relative.min()}"
+)
assert diff_relative_mean < 2e-1, "CPU and Torch outputs are not close enough!"
-
-
-
-
diff --git a/kt-kernel/examples/test_mla_qlen.py b/kt-kernel/examples/test_mla_qlen.py
index 037c4992..6ff1d06a 100644
--- a/kt-kernel/examples/test_mla_qlen.py
+++ b/kt-kernel/examples/test_mla_qlen.py
@@ -1,19 +1,22 @@
import logging
-import os,sys
+import os, sys
import time
from typing import Optional
+
os.environ["BLAS_NUM_THREADS"] = "1"
-sys.path.insert(0, os.path.dirname(__file__) + '/../build')
-import kt_kernel_ext
+sys.path.insert(0, os.path.dirname(__file__) + "/../build")
+from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
from torch import inf, nn
from torch.nn import init
-from torch_attention import apply_rotary_pos_emb,DeepseekV2RMSNorm,KDeepSeekV3Cache,DeepseekV3YarnRotaryEmbedding
+from torch_attention import apply_rotary_pos_emb, DeepseekV2RMSNorm, KDeepSeekV3Cache, DeepseekV3YarnRotaryEmbedding
+
logger = logging.getLogger("reader")
from gguf.gguf_reader import GGUFReader
+
def read_gguf_file(gguf_file_path):
"""
Reads and prints key-value pairs and tensor information from a GGUF file in an improved format.
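For context on the weight loading these qlen tests rely on: read_gguf_file enumerates tensors through the gguf-py reader. A minimal standalone sketch of that enumeration, assuming the gguf package is installed; the path is a placeholder:

    from gguf.gguf_reader import GGUFReader

    def list_gguf_tensors(path="model.gguf"):  # placeholder path
        reader = GGUFReader(path)
        for t in reader.tensors:
            # each ReaderTensor exposes name, tensor_type, shape and raw data
            print(t.name, t.tensor_type.name, list(t.shape))
        return reader.tensors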
@@ -46,12 +49,15 @@ def read_gguf_file(gguf_file_path): re.append(tensor) return re + def get_torch_tensor_from_gguf(gguf_weights, name): return torch.from_numpy(gguf_weights[name].data).contiguous() + def get_torch_tensor_and_type_from_gguf(gguf_weights, name): return torch.from_numpy(gguf_weights[name].data).contiguous(), gguf_weights[name].tensor_type.name + def type_to_ggml_type(type): if type == "F32": return ggml_type.FP32 @@ -75,7 +81,7 @@ kvlen = 0 page_table = range(20) -bsz_tensors=torch.tensor([1]) +bsz_tensors = torch.tensor([1]) page_size = 256 @@ -94,8 +100,7 @@ rope_theta = 10000 max_qlen = 1024 max_kvlen = 4096 -max_position_embeddings = 163840 - +max_position_embeddings = 163840 rope_scaling = { @@ -105,11 +110,10 @@ rope_scaling = { "mscale": 1.0, "mscale_all_dim": 1.0, "original_max_position_embeddings": 4096, - "type": "yarn" + "type": "yarn", } - CPUInfer = kt_kernel_ext.CPUInfer(64) validation_iter = 100 @@ -119,15 +123,16 @@ weight_type = torch.bfloat16 # weight_type = torch.float16 -input_type = {torch.float32:torch.float32, - torch.float16:torch.float16, - torch.bfloat16:torch.float32, - }[weight_type] +input_type = { + torch.float32: torch.float32, + torch.float16: torch.float16, + torch.bfloat16: torch.float32, +}[weight_type] q_a_proj = nn.Linear(hidden_size, q_lora_rank, bias=False, dtype=weight_type) -q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size+rope_size) , bias=False, dtype=weight_type) +q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size + rope_size), bias=False, dtype=weight_type) kv_a_proj_with_mqa = nn.Linear(hidden_size, kv_lora_rank + rope_size, bias=False, dtype=weight_type) -kv_b_proj = nn.Linear( num_heads * (nope_size + nope_size),kv_lora_rank, bias=False, dtype=weight_type) +kv_b_proj = nn.Linear(num_heads * (nope_size + nope_size), kv_lora_rank, bias=False, dtype=weight_type) o_proj = nn.Linear(num_heads * nope_size, hidden_size, bias=False, dtype=weight_type) q_a_norm = torch.ones(hidden_size, dtype=torch.float32) kv_a_norm = torch.ones(hidden_size, dtype=torch.float32) @@ -190,7 +195,7 @@ if use_real_weights := True: o_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_output.weight") o_proj.weight = nn.Parameter(o_proj_weight.view(torch.bfloat16), requires_grad=False) - + else: init.normal_(q_a_proj.weight, mean=0.0, std=0.02) init.normal_(q_b_proj.weight, mean=0.0, std=0.02) @@ -203,16 +208,16 @@ q_absorb = x_reshaped[:, 0] out_absorb = x_reshaped[:, 1] -hidden_states = torch.randn((qlen, hidden_size), dtype=input_type).to('cpu').contiguous() +hidden_states = torch.randn((qlen, hidden_size), dtype=input_type).to("cpu").contiguous() def build_mla(): os.environ["BLAS_NUM_THREADS"] = "1" - q_a_proj_weight = q_a_proj.weight.to(weight_type).to('cpu').contiguous() - q_b_proj_weight = q_b_proj.weight.to(weight_type).to('cpu').contiguous() - kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to('cpu').to(weight_type).contiguous() - kv_b_proj_weight = kv_b_proj.weight.to(weight_type).to('cpu').contiguous() - o_proj_weight = o_proj.weight.to(weight_type).to('cpu').contiguous() + q_a_proj_weight = q_a_proj.weight.to(weight_type).to("cpu").contiguous() + q_b_proj_weight = q_b_proj.weight.to(weight_type).to("cpu").contiguous() + kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to("cpu").to(weight_type).contiguous() + kv_b_proj_weight = kv_b_proj.weight.to(weight_type).to("cpu").contiguous() + o_proj_weight = o_proj.weight.to(weight_type).to("cpu").contiguous() config = 
kt_kernel_ext.mla.MLAConfig(
hidden_size,
@@ -224,7 +229,7 @@ def build_mla():
)
config.max_qlen = max_qlen
config.max_kvlen = max_kvlen
- config.max_position_embeddings = max_position_embeddings
+ config.max_position_embeddings = max_position_embeddings
config.rope_scaling_factor = rope_scaling["factor"]
config.rope_theta = rope_theta
config.rope_scaling_beta_fast = rope_scaling["beta_fast"]
@@ -244,7 +249,6 @@ def build_mla():
config.kv_a_norm = kv_a_norm.data_ptr()
config.kv_a_norm_type = ggml_type.FP32
-
if weight_type == torch.float32:
config.q_a_proj_type = ggml_type.FP32
config.q_b_proj_type = ggml_type.FP32
@@ -266,10 +270,8 @@ def build_mla():
else:
raise ValueError(f"Unsupported data type: {weight_type}")
-
config.pool = CPUInfer.backend_
-
if weight_type == torch.float32:
mla = kt_kernel_ext.mla.MLA_F32(config)
elif weight_type == torch.float16:
@@ -278,25 +280,20 @@ def build_mla():
mla = kt_kernel_ext.mla.MLA_F32(config)
else:
raise ValueError(f"Unsupported data type: {weight_type}")
-
+
mla.load_weights()
mla.set_local_pages(pages_count)
return mla
-
-
def load_fp32_tensor(file_path, shape):
- with open(file_path, 'rb') as f:
+ with open(file_path, "rb") as f:
raw_data = f.read()
tensor = torch.frombuffer(raw_data, dtype=torch.float32)
tensor = tensor.view(shape) # reshape to the expected shape
return tensor
-
-
-
# page3 = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug1/query_0_tp_0_page_3_kv_lora_rank_norm.f32',(page_size,kv_lora_rank))
# page3_2 = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug2/query_0_tp_0_page_3_kv_lora_rank_norm.f32',(page_size,kv_lora_rank))
@@ -320,7 +317,6 @@ def load_fp32_tensor(file_path, shape):
# print(f'PE Attention Weights Diff: ave:{diff.mean()}, max:{diff.max()}')
-
# raw_attn_w_1 = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug1/query_0_tp_0_raw_attention_weights.f32',(1,max_kvlen))
# raw_attn_w_2 = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug2/query_0_tp_0_raw_attention_weights.f32',(qlen,max_kvlen))
# diff = torch.abs(raw_attn_w_1 - raw_attn_w_2[-1])
@@ -334,22 +330,16 @@ def load_fp32_tensor(file_path, shape):
# print(f'Output Diff: ave:{diff.mean()}, max:{diff.max()}')
-
-
mla = build_mla()
-output = torch.zeros((qlen, hidden_size), dtype=input_type).to('cpu').contiguous()
-mla.forward([qlen],[page_table],[kvlen],hidden_states.data_ptr(),output.data_ptr())
-print("CPU MLA Output: ",output[-1])
+output = torch.zeros((qlen, hidden_size), dtype=input_type).to("cpu").contiguous()
+mla.forward([qlen], [page_table], [kvlen], hidden_states.data_ptr(), output.data_ptr())
+print("CPU MLA Output: ", output[-1])
-output_2 = torch.zeros((1, hidden_size), dtype=input_type).to('cpu').contiguous()
-mla.forward([1],[page_table],[qlen-1],hidden_states[-1].data_ptr(),output_2.data_ptr())
-print("CPU MLA Output 2: ",output_2[-1])
+output_2 = torch.zeros((1, hidden_size), dtype=input_type).to("cpu").contiguous()
+mla.forward([1], [page_table], [qlen - 1], hidden_states[-1].data_ptr(), output_2.data_ptr())
+print("CPU MLA Output 2: ", output_2[-1])
diff = torch.abs(output[-1] - output_2[-1])
-print(f'Diff: ave:{diff.mean()}, max:{diff.max()}')
+print(f"Diff: ave:{diff.mean()}, max:{diff.max()}")
assert diff.max() < 1e-1, "CPU and Torch outputs are not close enough!"
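The two mla.forward calls above amount to a prefill-versus-decode consistency check: the last row of a full-qlen forward should match a one-token forward issued at kv position qlen - 1. A minimal sketch of that comparison (helper name and tolerance handling are illustrative):

    import torch

    def assert_close_last_token(full_out: torch.Tensor, step_out: torch.Tensor, tol: float = 1e-1) -> None:
        # compare the final prefill row against the single decode row
        diff = torch.abs(full_out[-1] - step_out[-1])
        assert diff.max() < tol, f"max diff {diff.max():.4f} exceeds {tol}"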
-
-
-
-
diff --git a/kt-kernel/examples/test_mla_quant.py b/kt-kernel/examples/test_mla_quant.py
index 471dcd1e..cdac0ecb 100644
--- a/kt-kernel/examples/test_mla_quant.py
+++ b/kt-kernel/examples/test_mla_quant.py
@@ -1,59 +1,62 @@
import logging
-import os,sys
+import os, sys
import time
from typing import Optional
+
os.environ["BLAS_NUM_THREADS"] = "1"
-sys.path.insert(0, os.path.dirname(__file__) + '/../build')
-import kt_kernel_ext
+sys.path.insert(0, os.path.dirname(__file__) + "/../build")
+from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
from torch import inf, nn
from torch.nn import init
-from torch_attention import apply_rotary_pos_emb,DeepseekV2RMSNorm,KDeepSeekV3Cache,DeepseekV3YarnRotaryEmbedding
+from torch_attention import apply_rotary_pos_emb, DeepseekV2RMSNorm, KDeepSeekV3Cache, DeepseekV3YarnRotaryEmbedding
+
logger = logging.getLogger("reader")
from gguf.gguf_reader import GGUFReader
-
def load_fp32_tensor_raw(file_path):
# return torch.zeros(shape)
- with open(file_path, 'rb') as f:
+ with open(file_path, "rb") as f:
raw_data = f.read()
tensor = torch.frombuffer(raw_data, dtype=torch.float32)
return tensor
+
def load_fp16_tensor(file_path, shape=None):
# return load_fp32_tensor(file_path, shape)
return load_fp32_tensor_raw(file_path)
# return torch.zeros(shape)
- with open(file_path, 'rb') as f:
+ with open(file_path, "rb") as f:
raw_data = f.read()
tensor = torch.frombuffer(raw_data, dtype=weight_type)
tensor = tensor.view(shape) # reshape to the expected shape
return tensor
+
def load_fp32_tensor(file_path, shape):
# return torch.zeros(shape)
- with open(file_path, 'rb') as f:
+ with open(file_path, "rb") as f:
raw_data = f.read()
tensor = torch.frombuffer(raw_data, dtype=torch.float32)
tensor = tensor.view(shape) # reshape to the expected shape
return tensor
+
def test_torch():
torch.set_grad_enabled(False)
-
- hidden_states_to_check_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_input.bin')
- hidden_states_to_check_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_input.bin')
+ hidden_states_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_input.bin")
+ hidden_states_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_input.bin")
# diff = torch.abs(hidden_states_to_check_prefill - hidden_states_to_check_decode).max()
# print("hidden_states diff -> ", diff)
- q_lora_to_check_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_qlora.bin')
- q_lora_to_check_test_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_qlora_test.bin')
- q_lora_to_check_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_qlora.bin')
- q_lora_to_check_test_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_qlora_test.bin')
+ q_lora_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_qlora.bin")
+ q_lora_to_check_test_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_qlora_test.bin")
+ q_lora_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_qlora.bin")
+ q_lora_to_check_test_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_qlora_test.bin")
# diff = torch.abs(q_lora_to_check_prefill - q_lora_to_check_decode).max()
# diff_test = torch.abs(q_lora_to_check_prefill - q_lora_to_check_decode).max()
# print("q_lora max diff -> ", diff)
@@ -63,8 +66,6 @@ def test_torch():
# print("q_lora mae -> ", mae)
# print("q_lora mae test -> ", mae_test)
-
-
# q_lora_norm = q_a_layernorm(q_lora)
# q_lora_norm_to_check = load_fp16_tensor('./debug/query_0_tp_0_qlora_norm.bin',
q_lora_norm.shape) # q_lora_norm_to_check_test = load_fp16_tensor('./debug/query_0_tp_0_qlora_norm_test.bin', q_lora_norm.shape) @@ -76,7 +77,7 @@ def test_torch(): # print("q_lora_norm mae -> ", mae) # print("q_lora_norm diff test -> ", diff_test) # print("q_lora_norm mae test -> ", mae_test) - + # q = q_b_proj(q_lora_norm) # for v3, bsz, qlen, num_heads(128), qk_head_dim(192=128(nope)+64(rope)) # q = q.view(qlen, num_heads, nope_size+rope_size) @@ -85,7 +86,7 @@ def test_torch(): # q_nope, q_pe = torch.split( # q, [nope_size, rope_size], dim=-1 # ) - + # compressed_kv is [qlen, kv_lora_rank(512) + rope(64)] # compressed_kv = kv_a_proj_with_mqa(batch_hidden_states) # compressed_kv is [qlen, kv_lora_rank(512)], k_pe is [qlen, rope(64)] @@ -94,12 +95,11 @@ def test_torch(): # ) # compressed_kv = compressed_kv.contiguous() - # compressed_kv_page_0 = compressed_kv[0:page_size, :] - compressed_kv_to_check_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_page_0_kv_lora_rank') - compressed_kv_to_check_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_page_0_kv_lora_rank') + compressed_kv_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_page_0_kv_lora_rank") + compressed_kv_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_page_0_kv_lora_rank") # diff = torch.abs(compressed_kv_to_check_prefill - compressed_kv_to_check_decode).max() - # mae = torch.mean(torch.abs(compressed_kv_to_check_prefill - compressed_kv_to_check_decode)) + # mae = torch.mean(torch.abs(compressed_kv_to_check_prefill - compressed_kv_to_check_decode)) # print("compressed_kv diff -> ", diff) # print("compressed_kv mae -> ", mae) @@ -107,20 +107,17 @@ def test_torch(): # k_pe is [qlen, 1, qk_rope_head_dim(64)] # compressed_kv_page_0 = compressed_kv[0:page_size, :] - compressed_kv_to_check_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_page_0_kv_lora_rank_norm') - compressed_kv_to_check_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_page_0_kv_lora_rank_norm') + compressed_kv_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_page_0_kv_lora_rank_norm") + compressed_kv_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_page_0_kv_lora_rank_norm") # diff = torch.abs(compressed_kv_page_0 - compressed_kv_to_check).max() # mae = torch.mean(torch.abs(compressed_kv_page_0 - compressed_kv_to_check)) # print("compressed_kv diff norm -> ", diff) # print("compressed_kv mae norm -> ", mae) - - - # k_pe = k_pe.view(qlen, 1, rope_size) # compressed_kv is [qlen, 1, kv_lora_rank(512)] # compressed_kv = compressed_kv.view(qlen, 1, kv_lora_rank) - + # cos, sin = rotary_emb(q_pe, batch_position_ids) # q_nope_check = q_nope.transpose(0, 1) # qlen is 1, no GPU overhead, same below @@ -135,11 +132,11 @@ def test_torch(): # print("q_nope[0] mae -> ", mae) # print("q_nope[0] diff test -> ", diff_test) # print("q_nope[0] mae test -> ", mae_test) - + # q_pe_nope = q_pe.transpose(0,1) - q_pe_0_to_check_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_q_rope') - q_pe_0_to_check_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_q_rope') - + q_pe_0_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_q_rope") + q_pe_0_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_q_rope") + # q_pe_0_to_check_decode_test = load_fp16_tensor('./debug_decode/query_0_tp_0_q_rope_test') # q_pe_0_to_check_prefill_test = load_fp16_tensor('./debug_prefill/query_0_tp_0_q_rope_test') @@ -180,12 +177,11 @@ def test_torch(): # q_pe, k_pe = 
apply_rotary_pos_emb(q_pe.unsqueeze(0), k_pe.unsqueeze(0), cos, sin, unsqueeze_dim=1) # q_pe = q_pe.squeeze(0) # q_pe is [num_heads(128), qlen, qk_rope_head_dim(64)] - # q_pe.transpose_(0, 1) + # q_pe.transpose_(0, 1) # diff = torch.abs(q_pe - q_new).max() # print("q_pe diff -> ", diff) - # q_pe_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_rope', q_pe[0].shape) # diff = torch.abs(q_pe[0] - q_pe_0_to_check).max() # mae = torch.mean(torch.abs(q_pe[0] - q_pe_0_to_check)) @@ -240,7 +236,7 @@ def test_torch(): # if batch_compressed_kv is None or batch_k_pe is None: # batch_compressed_kv = tmp_compressed_kv # batch_k_pe = tmp_k_pe - # else: + # else: # batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0) # batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0) # kv_total_len -= page_size @@ -250,16 +246,15 @@ def test_torch(): # if batch_compressed_kv is None or batch_k_pe is None: # batch_compressed_kv = tmp_compressed_kv # batch_k_pe = tmp_k_pe - # else: + # else: # batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0) # batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0) # break # batch_compressed_kv is [kv_total_len(k_len), kv_lora_rank(512)] # batch_k_pe is [kv_total_len(k_len), qk_rope_head_dim(64)] - - k_pe_to_check_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_page_0_k_rope', (256,64)) - k_pe_to_check_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_page_0_k_rope', (256,64)) + k_pe_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_page_0_k_rope", (256, 64)) + k_pe_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_page_0_k_rope", (256, 64)) # diff = torch.abs(k_pe_to_check_prefill - k_pe_to_check_decode).max() # mae = torch.mean(k_pe_to_check_prefill - k_pe_to_check_decode) # print("k_pe diff -> ", diff) @@ -267,13 +262,13 @@ def test_torch(): # pe_weights = torch.matmul(q_pe,batch_k_pe.mT) # kv_total_len = kv_page_nums * page_size - pe_weights_0_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_pe_attention_weights', (1024,4096)) - pe_weights_0_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_pe_attention_weights', (1024,4096)) + pe_weights_0_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_pe_attention_weights", (1024, 4096)) + pe_weights_0_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_pe_attention_weights", (1024, 4096)) # diff = torch.abs(pe_weights[0] - pe_weights_0).max() # print("pe_weights[0] diff -> ", diff) - # attention_weights = (pe_weights + torch.matmul(q_nope, batch_compressed_kv.mT)) + # attention_weights = (pe_weights + torch.matmul(q_nope, batch_compressed_kv.mT)) # raw_weights = load_fp16_tensor('./debug/query_0_tp_0_raw_attention_weights', (1024, 4096)) # raw_weights = raw_weights[0:qlen, 0:kv_total_len] @@ -282,25 +277,23 @@ def test_torch(): # attention_weights = attention_weights * softmax_scale # attention_weights is [num_heads(128), qlen, k_len] - + # attention_weights = attention_weights.transpose(0,1).unsqueeze(0).squeeze(-1).expand(qlen,-1,-1).transpose(0,1) - + # attention_masks[i] is [qlen, k_len] - + # attention_weights = (attention_weights + attention_masks) # attention_weights shape is [num_heads(128), qlen, k_len] - # attention_weights = nn.functional.softmax(attention_weights,dim=-1,dtype=weight_type).to(q_pe.dtype) - attention_weights_0_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_attention_weights', (1024, 4096)) - attention_weights_0_prefill = 
load_fp16_tensor('./debug_prefill/query_0_tp_0_attention_weights', (1024, 4096)) + attention_weights_0_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_attention_weights", (1024, 4096)) + attention_weights_0_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_attention_weights", (1024, 4096)) # attention_weights_0 = attention_weights_0[0:qlen, 0:kv_total_len] # diff = torch.abs(attention_weights[0] - attention_weights_0).max() # print("attention_weights[0] diff -> ", diff) - # attn_output = torch.matmul(attention_weights, batch_compressed_kv) # [num_heads(128),qlen, lora_rank(512)] # out_absorb shape is [num_heads(128), kv_lora_rank(512), v_head_dim(128)] @@ -322,7 +315,7 @@ def test_torch(): # w_o = o_proj.weight.view([hidden_size,num_heads * nope_size]) # output = torch.matmul(attn_output,w_o.transpose(0,1)) # output = output.view(qlen, hidden_size) - + # output_0_check = load_fp16_tensor('./debug/query_0_tp_0_qlen_output', (qlen, hidden_size)) # h1_o = w_o[:,:128] # local_o_check = load_fp16_tensor('./debug/query_0_tp_0_local_w_o', (hidden_size, 128)) @@ -333,18 +326,15 @@ def test_torch(): # diff = torch.abs(h1_output - output_0_check).max() # print("h1_output diff -> ", diff) - - output_check_decode = load_fp16_tensor('./debug_decode/output.bin') - output_check_prefill = load_fp16_tensor('./debug_prefill/output.bin') + output_check_decode = load_fp16_tensor("./debug_decode/output.bin") + output_check_prefill = load_fp16_tensor("./debug_prefill/output.bin") # diff = torch.abs(output - output_check).max() # mae = torch.mean(torch.abs(output - output_check)) # print("output diff -> ", diff) - - - return None + torch.set_printoptions(sci_mode=False, precision=5) # output_cpu = test_cpu_mla() # output_cpu_quant = test_cpu_mla_quant() @@ -361,7 +351,3 @@ output_torch = test_torch() # print(f'Diff: ave:{diff.mean()}, max:{diff.max()}, min:{diff.min()}, relative_mean:{diff_relative_mean}, relative_max:{diff_relative.max()}, relative_min:{diff_relative.min()}') # assert diff_relative_mean < 2e-1, "CPU and Torch outputs are not close enough!" 
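The active and commented-out checks in test_mla_quant.py all follow one pattern: load a decode-path debug dump and its prefill-path counterpart, then compare them. A hypothetical helper capturing that pattern (the name and report format are illustrative, not part of the patch):

    import torch

    def compare_dumps(decode: torch.Tensor, prefill: torch.Tensor, name: str) -> None:
        # report max and mean absolute difference between the two paths
        diff = torch.abs(decode - prefill)
        print(f"{name}: max diff {diff.max():.6f}, mae {diff.mean():.6f}")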
-
-
-
-
diff --git a/kt-kernel/examples/test_mla_torch.py b/kt-kernel/examples/test_mla_torch.py
index bf2164d5..2280de75 100644
--- a/kt-kernel/examples/test_mla_torch.py
+++ b/kt-kernel/examples/test_mla_torch.py
@@ -1,13 +1,14 @@
-import os,sys
+import os, sys
import time
from typing import Optional
-
-sys.path.insert(0, os.path.dirname(__file__) + '/../build')
-import kt_kernel_ext
+
+sys.path.insert(0, os.path.dirname(__file__) + "/../build")
+from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
from torch import nn
from torch.nn import init
-from torch_attention import apply_rotary_pos_emb,DeepseekV2RMSNorm,KDeepSeekV3Cache,DeepseekV3YarnRotaryEmbedding
+from torch_attention import apply_rotary_pos_emb, DeepseekV2RMSNorm, KDeepSeekV3Cache, DeepseekV3YarnRotaryEmbedding
seed = 42 # any integer works as the seed
@@ -19,7 +20,7 @@ kvlen = 0
page_table = range(20)
-bsz_tensors=torch.tensor([1])
+bsz_tensors = torch.tensor([1])
page_size = 256
@@ -38,8 +39,7 @@ rope_theta = 10000
max_qlen = 1024
max_kvlen = 4096
-max_position_embeddings = 163840
-
+max_position_embeddings = 163840
rope_scaling = {
@@ -49,17 +49,16 @@ rope_scaling = {
"mscale": 1.0,
"mscale_all_dim": 1.0,
"original_max_position_embeddings": 4096,
- "type": "yarn"
+ "type": "yarn",
}
-
CPUInfer = kt_kernel_ext.CPUInfer(64)
validation_iter = 100
q_a_proj = nn.Linear(hidden_size, q_lora_rank, bias=False, dtype=torch.float16)
-q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size+rope_size) , bias=False, dtype=torch.float16)
+q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size + rope_size), bias=False, dtype=torch.float16)
kv_a_proj_with_mqa = nn.Linear(hidden_size, kv_lora_rank + rope_size, bias=False, dtype=torch.float16)
kv_b_proj = nn.Linear(kv_lora_rank, num_heads * (nope_size + nope_size), bias=False, dtype=torch.float16)
o_proj = nn.Linear(num_heads * nope_size, hidden_size, bias=False, dtype=torch.float16)
@@ -70,13 +69,11 @@ init.normal_(kv_a_proj_with_mqa.weight, mean=0.0, std=0.02)
init.normal_(kv_b_proj.weight, mean=0.0, std=0.02)
init.normal_(o_proj.weight, mean=0.0, std=0.02)
-q_a_proj_weight = q_a_proj.weight.to(torch.float16).to('cpu').contiguous()
-q_b_proj_weight = q_b_proj.weight.to(torch.float16).to('cpu').contiguous()
-kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to('cpu').to(torch.float16).contiguous()
-kv_b_proj_weight = kv_b_proj.weight.to(torch.float16).to('cpu').contiguous()
-o_proj_weight = o_proj.weight.to(torch.float16).to('cpu').contiguous()
-
-
+q_a_proj_weight = q_a_proj.weight.to(torch.float16).to("cpu").contiguous()
+q_b_proj_weight = q_b_proj.weight.to(torch.float16).to("cpu").contiguous()
+kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to("cpu").to(torch.float16).contiguous()
+kv_b_proj_weight = kv_b_proj.weight.to(torch.float16).to("cpu").contiguous()
+o_proj_weight = o_proj.weight.to(torch.float16).to("cpu").contiguous()
config = kt_kernel_ext.mla.MLAConfig(
@@ -89,7 +86,7 @@ config = kt_kernel_ext.mla.MLAConfig(
)
config.max_qlen = max_qlen
config.max_kvlen = max_kvlen
-config.max_position_embeddings = max_position_embeddings
+config.max_position_embeddings = max_position_embeddings
config.rope_scaling_factor = rope_scaling["factor"]
config.rope_theta = rope_theta
config.rope_scaling_beta_fast = rope_scaling["beta_fast"]
@@ -114,30 +111,27 @@ config.w_o_type = ggml_type.FP16
config.pool = CPUInfer.backend_
-
mla = kt_kernel_ext.mla.MLA(config)
mla.load_weights()
mla.set_local_pages(pages_count)
-
-input = torch.randn((qlen, hidden_size),
dtype=torch.float16).to('cpu').contiguous() +input = torch.randn((qlen, hidden_size), dtype=torch.float16).to("cpu").contiguous() -output = torch.zeros((qlen, hidden_size), dtype=torch.float16).to('cpu').contiguous() -mla.forward([qlen],[page_table],[kvlen],input.data_ptr(),output.data_ptr()) -print("CPU MLA Output: ",output) - +output = torch.zeros((qlen, hidden_size), dtype=torch.float16).to("cpu").contiguous() +mla.forward([qlen], [page_table], [kvlen], input.data_ptr(), output.data_ptr()) +print("CPU MLA Output: ", output) softmax_scale = (nope_size + rope_size) ** -0.5 # 1代表的是压缩的kv的头数 -k_caches = torch.randn(1,pages_count, page_size, 1, kv_lora_rank + rope_size).to(torch.float16) +k_caches = torch.randn(1, pages_count, page_size, 1, kv_lora_rank + rope_size).to(torch.float16) kv_cache = KDeepSeekV3Cache(page_size=page_size, kv_lora_rank=kv_lora_rank, k_caches=k_caches) q_a_layernorm = DeepseekV2RMSNorm(q_lora_rank) -x = torch.randn(q_lora_rank, dtype=torch.float16)*100 +x = torch.randn(q_lora_rank, dtype=torch.float16) * 100 print(x) print(q_a_layernorm(x)) @@ -163,110 +157,114 @@ rotary_emb = DeepseekV3YarnRotaryEmbedding( # last_page_len = [qlen+kvlen,...] layer_idx = 1 # position_ids = [kvlen:qlen+kvlen] hidden_states = torch.randn(qlen, hidden_size, dtype=torch.float16) -q_indptr = torch.tensor([0,qlen]).to(torch.int32) +q_indptr = torch.tensor([0, qlen]).to(torch.int32) -kv_indptr = torch.tensor([0,(qlen+kvlen+page_size-1)//page_size]).to(torch.int32) +kv_indptr = torch.tensor([0, (qlen + kvlen + page_size - 1) // page_size]).to(torch.int32) kv_indices = torch.tensor(range(pages_count)).to(torch.int32) -page_idx = torch.tensor([i//page_size for i in range(kvlen,kvlen+qlen)] ).to(torch.int32) -page_offset = torch.tensor( [i%page_size for i in range(kvlen, kvlen + qlen)]).to(torch.int32) +page_idx = torch.tensor([i // page_size for i in range(kvlen, kvlen + qlen)]).to(torch.int32) +page_offset = torch.tensor([i % page_size for i in range(kvlen, kvlen + qlen)]).to(torch.int32) -last_page_len = torch.tensor([(qlen+kvlen)%page_size], device=hidden_states.device) +last_page_len = torch.tensor([(qlen + kvlen) % page_size], device=hidden_states.device) position_ids = torch.tensor(range(kvlen, kvlen + qlen)).to(torch.int32) # 按照行创建 mask [qlen,kvlen+qlen] attention_masks = torch.zeros((qlen, kvlen + qlen), dtype=torch.float16) for i in range(qlen): - attention_masks[i, i + kvlen + 1: i + kvlen + qlen] = -65504.0 + attention_masks[i, i + kvlen + 1 : i + kvlen + qlen] = -65504.0 -def torch_attn(hidden_states: torch.Tensor, - kv_cache: KDeepSeekV3Cache, - position_ids: torch.Tensor, - page_idx: torch.Tensor, - page_offset: torch.Tensor, - attention_masks: Optional[list[torch.Tensor]] = None, - q_indptr: Optional[torch.Tensor] = None, - kv_indices: Optional[torch.Tensor] = None, - kv_indptr: Optional[torch.Tensor] = None, - bsz_tensors: Optional[torch.Tensor] = None, - last_page_len: Optional[torch.Tensor] = None, - layer_idx: Optional[int] = None, - ): +def torch_attn( + hidden_states: torch.Tensor, + kv_cache: KDeepSeekV3Cache, + position_ids: torch.Tensor, + page_idx: torch.Tensor, + page_offset: torch.Tensor, + attention_masks: Optional[list[torch.Tensor]] = None, + q_indptr: Optional[torch.Tensor] = None, + kv_indices: Optional[torch.Tensor] = None, + kv_indptr: Optional[torch.Tensor] = None, + bsz_tensors: Optional[torch.Tensor] = None, + last_page_len: Optional[torch.Tensor] = None, + layer_idx: Optional[int] = None, +): global out_absorb global q_absorb # range bsz_tensors 
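The page_idx and page_offset lists built above map each new token at absolute position p into page p // page_size at slot p % page_size. A vectorized equivalent, as a sketch only (the function name is illustrative):

    import torch

    def paged_indices(kvlen: int, qlen: int, page_size: int):
        # absolute positions of the qlen new tokens appended after kvlen cached ones
        pos = torch.arange(kvlen, kvlen + qlen, dtype=torch.int32)
        return pos // page_size, pos % page_size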
final_attention_output = torch.tensor([], device=hidden_states.device) for i in range(bsz_tensors[0]): - batch_num_tokens_tensors = q_indptr[i+1] - q_indptr[i] + batch_num_tokens_tensors = q_indptr[i + 1] - q_indptr[i] batch_last_page_len = last_page_len[i] # kv_total_len is kv_len, batch_compressed_kv is compressed_kv, batch_k_pe is k_pe - batch_page_idx = page_idx[q_indptr[i]:q_indptr[i+1]] - batch_page_offset = page_offset[q_indptr[i]:q_indptr[i+1]] + batch_page_idx = page_idx[q_indptr[i] : q_indptr[i + 1]] + batch_page_offset = page_offset[q_indptr[i] : q_indptr[i + 1]] # kv_page_nums is the number of pages for the current batch - kv_page_nums = kv_indptr[i+1] - kv_indptr[i] + kv_page_nums = kv_indptr[i + 1] - kv_indptr[i] # kv_total_len is the total length of the kv cache for the current batch (kv_len for algorithm) kv_total_len = kv_page_nums * page_size if batch_last_page_len is not None: kv_total_len = kv_total_len - (page_size - batch_last_page_len) # print(f"kv_total_len's shape {kv_total_len.shape}") # kv_index is the index of the kv cache pages for the current batch - kv_index = kv_indices[kv_indptr[i]:kv_indptr[i+1]] + kv_index = kv_indices[kv_indptr[i] : kv_indptr[i + 1]] # we can index [kv_index, page_offset_indices] to get the kv cache for the current batch # from q_indptr[i] to q_indptr[i+1] is the range of the current batch - batch_hidden_states = hidden_states[q_indptr[i]:q_indptr[i+1]] - batch_position_ids = position_ids[q_indptr[i]:q_indptr[i+1]] + batch_hidden_states = hidden_states[q_indptr[i] : q_indptr[i + 1]] + batch_position_ids = position_ids[q_indptr[i] : q_indptr[i + 1]] qlen, _ = batch_hidden_states.size() # print("qlen -> ", qlen) q_lora = q_a_proj(batch_hidden_states) - print('q_a_proj',q_a_proj.weight) - print('q_lora',q_lora) - + print("q_a_proj", q_a_proj.weight) + print("q_lora", q_lora) + q = q_b_proj(q_a_layernorm(q_lora)) - print('q_b_proj',q_b_proj.weight) + print("q_b_proj", q_b_proj.weight) # for v3, bsz, qlen, num_heads(128), qk_head_dim(192=128(nope)+64(rope)) - q = q.view(qlen, num_heads, nope_size+rope_size) + q = q.view(qlen, num_heads, nope_size + rope_size) # q_nope is [qlen, num_heads(128), qk_nope_head_dim(128)] # q_pe is [qlen, num_heads(128), qk_rope_head_dim(64)] - q_nope, q_pe = torch.split( - q, [nope_size, rope_size], dim=-1 - ) - print('q_nope',q_nope) - print('q_pe',q_pe) + q_nope, q_pe = torch.split(q, [nope_size, rope_size], dim=-1) + print("q_nope", q_nope) + print("q_pe", q_pe) # compressed_kv is [qlen, kv_lora_rank(512) + rope(64)] compressed_kv = kv_a_proj_with_mqa(batch_hidden_states) # compressed_kv is [qlen, kv_lora_rank(512)], k_pe is [qlen, rope(64)] - compressed_kv, k_pe = torch.split( - compressed_kv, [kv_lora_rank, rope_size], dim=-1 - ) + compressed_kv, k_pe = torch.split(compressed_kv, [kv_lora_rank, rope_size], dim=-1) compressed_kv = compressed_kv.contiguous() compressed_kv = kv_a_layernorm(compressed_kv) # k_pe is [qlen, 1, qk_rope_head_dim(64)] - print('compressed_kv ',compressed_kv) - print('k_pe ',k_pe) + print("compressed_kv ", compressed_kv) + print("k_pe ", k_pe) k_pe = k_pe.view(qlen, 1, rope_size) # compressed_kv is [qlen, 1, kv_lora_rank(512)] compressed_kv = compressed_kv.view(qlen, 1, kv_lora_rank) - + cos, sin = rotary_emb(q_pe, batch_position_ids) # print(f"q_pe shape{q_pe.shape}, k_pe shape {k_pe.shape}") q_pe, k_pe = apply_rotary_pos_emb(q_pe.unsqueeze(0), k_pe.unsqueeze(0), cos, sin, unsqueeze_dim=1) q_pe = q_pe.squeeze(0) # q_pe is [num_heads(128), qlen, qk_rope_head_dim(64)] - 
q_pe.transpose_(0, 1) + q_pe.transpose_(0, 1) if kv_cache is not None: - cache_kwargs = {"sin": sin, "cos": cos, "page_idx": batch_page_idx, "page_offset": batch_page_offset} # Specific to RoPE models - compressed_kv_with_k_pe = kv_cache.update(compressed_kv.unsqueeze(0), k_pe, layer_idx, batch_page_idx, batch_page_offset, cache_kwargs) - compressed_kv = compressed_kv_with_k_pe [:, :, :, :kv_lora_rank].view(-1, page_size, kv_lora_rank) - k_pe = compressed_kv_with_k_pe [:, :, :, kv_lora_rank:].view(-1, page_size, rope_size) + cache_kwargs = { + "sin": sin, + "cos": cos, + "page_idx": batch_page_idx, + "page_offset": batch_page_offset, + } # Specific to RoPE models + compressed_kv_with_k_pe = kv_cache.update( + compressed_kv.unsqueeze(0), k_pe, layer_idx, batch_page_idx, batch_page_offset, cache_kwargs + ) + compressed_kv = compressed_kv_with_k_pe[:, :, :, :kv_lora_rank].view(-1, page_size, kv_lora_rank) + k_pe = compressed_kv_with_k_pe[:, :, :, kv_lora_rank:].view(-1, page_size, rope_size) # q_absorb is [num_heads(128), qk_nope_head_dim(128), kv_lora_rank(512)] # out_absorb is [num_heads(128), kv_lora_rank(512), v_head_dim(128)] v_head_dim is also the nope dim # q_absorb, out_absorb = get_absorbed() # q_nope is [num_heads(128), qlen, qk_nope_head_dim(128)] - q_nope = q_nope.transpose(0, 1) # qlen is 1, no GPU overhead, same below + q_nope = q_nope.transpose(0, 1) # qlen is 1, no GPU overhead, same below # q_nope is [num_heads(128), qlen, kv_lora_rank(512)] - q_nope = torch.matmul(q_nope, q_absorb) # batched MM + q_nope = torch.matmul(q_nope, q_absorb) # batched MM # # q_nope is [qlen, num_heads(128), kv_lora_rank(512)] # q_nope = q_nope.transpose(0, 1) @@ -281,7 +279,7 @@ def torch_attn(hidden_states: torch.Tensor, if batch_compressed_kv is None or batch_k_pe is None: batch_compressed_kv = tmp_compressed_kv batch_k_pe = tmp_k_pe - else: + else: batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0) batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0) kv_total_len -= page_size @@ -291,57 +289,48 @@ def torch_attn(hidden_states: torch.Tensor, if batch_compressed_kv is None or batch_k_pe is None: batch_compressed_kv = tmp_compressed_kv batch_k_pe = tmp_k_pe - else: + else: batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0) batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0) break # batch_compressed_kv is [kv_total_len(k_len), kv_lora_rank(512)] # batch_k_pe is [kv_total_len(k_len), qk_rope_head_dim(64)] - pe_weights = torch.matmul(q_pe,batch_k_pe.mT) - print('pe_weights',pe_weights) + pe_weights = torch.matmul(q_pe, batch_k_pe.mT) + print("pe_weights", pe_weights) attention_weights = (pe_weights + torch.matmul(q_nope, batch_compressed_kv.mT)) * softmax_scale # attention_weights is [num_heads(128), qlen, k_len] - + # attention_weights = attention_weights.transpose(0,1).unsqueeze(0).squeeze(-1).expand(qlen,-1,-1).transpose(0,1) - + # attention_masks[i] is [qlen, k_len] - - attention_weights = (attention_weights + attention_masks[i]) + + attention_weights = attention_weights + attention_masks[i] # attention_weights shape is [num_heads(128), qlen, k_len] - attention_weights = nn.functional.softmax(attention_weights,dim=-1,dtype=torch.float16).to(q_pe.dtype) - attn_output = torch.matmul(attention_weights, batch_compressed_kv) # [num_heads(128),qlen, lora_rank(512)] + attention_weights = nn.functional.softmax(attention_weights, dim=-1, dtype=torch.float16).to(q_pe.dtype) + attn_output = torch.matmul(attention_weights, batch_compressed_kv) # 
[num_heads(128),qlen, lora_rank(512)] # out_absorb shape is [num_heads(128), kv_lora_rank(512), v_head_dim(128)] - out_absorb = out_absorb.transpose(1,2) + out_absorb = out_absorb.transpose(1, 2) # q for qlen, n for num_heads, h for v_head_dim, v for kv_lora_rank - attn_output = torch.matmul(attn_output, out_absorb) # [num_heads(128), qlen, v_head_dim(128)] - attn_output = attn_output.transpose(0, 1) # [qlen, num_heads(128), v_head_dim(128)] + attn_output = torch.matmul(attn_output, out_absorb) # [num_heads(128), qlen, v_head_dim(128)] + attn_output = attn_output.transpose(0, 1) # [qlen, num_heads(128), v_head_dim(128)] attn_output = attn_output.reshape(qlen, num_heads * nope_size) attn_output = o_proj(attn_output) final_attention_output = torch.cat((final_attention_output, attn_output), dim=0) return final_attention_output - torch_output = torch_attn( - input, - kv_cache, - position_ids, - page_idx, - page_offset, - attention_masks=attention_masks, - q_indptr=q_indptr, - kv_indices=kv_indices, - kv_indptr=kv_indptr, - bsz_tensors=bsz_tensors, - last_page_len=last_page_len, - layer_idx=0 - ) -print("Torch Output: ",torch_output) - - - - - - - - + input, + kv_cache, + position_ids, + page_idx, + page_offset, + attention_masks=attention_masks, + q_indptr=q_indptr, + kv_indices=kv_indices, + kv_indptr=kv_indptr, + bsz_tensors=bsz_tensors, + last_page_len=last_page_len, + layer_idx=0, +) +print("Torch Output: ", torch_output) diff --git a/kt-kernel/examples/test_mlp.py b/kt-kernel/examples/test_mlp.py index dfe3c979..832053bf 100644 --- a/kt-kernel/examples/test_mlp.py +++ b/kt-kernel/examples/test_mlp.py @@ -1,36 +1,39 @@ #!/usr/bin/env python # coding=utf-8 -''' -Description : +""" +Description : Author : chenht2022 Date : 2024-07-25 10:32:05 Version : 1.0.0 -LastEditors : chenht2022 +LastEditors : chenht2022 LastEditTime : 2024-08-06 10:37:28 -Copyright (c) 2024 by KVCache.AI, All Rights Reserved. -''' +Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
+""" import os, sys import time -sys.path.append(os.path.dirname(__file__) + '/../build') -import kt_kernel_ext + +sys.path.append(os.path.dirname(__file__) + "/../build") +from kt_kernel import kt_kernel_ext import torch hidden_size = 5120 intermediate_size = 3072 stride = 32 group_max_len = 1024 -gate_type = 1 # ggml_type::GGML_TYPE_F16 -up_type = 1 # ggml_type::GGML_TYPE_F16 -down_type = 1 # ggml_type::GGML_TYPE_F16 -hidden_type = 1 # ggml_type::GGML_TYPE_F16 +gate_type = 1 # ggml_type::GGML_TYPE_F16 +up_type = 1 # ggml_type::GGML_TYPE_F16 +down_type = 1 # ggml_type::GGML_TYPE_F16 +hidden_type = 1 # ggml_type::GGML_TYPE_F16 qlen = 30 layer_num = 10 CPUInfer = kt_kernel_ext.CPUInfer(48) validation_iter = 100 + def act_fn(x): return x / (1.0 + torch.exp(-x)) + def mlp_torch(input, gate_proj, up_proj, down_proj): gate_buf = torch.mm(input, gate_proj.t()) up_buf = torch.mm(input, up_proj.t()) @@ -38,16 +41,35 @@ def mlp_torch(input, gate_proj, up_proj, down_proj): ret = torch.mm(intermediate, down_proj.t()) return ret + with torch.inference_mode(mode=True): mlps = [] gate_projs = [] up_projs = [] down_projs = [] for _ in range(layer_num): - gate_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous() - up_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous() - down_proj = torch.randn((hidden_size, intermediate_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous() - config = kt_kernel_ext.mlp.MLPConfig(hidden_size, intermediate_size, stride, group_max_len, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type) + gate_proj = ( + torch.randn((intermediate_size, hidden_size), dtype=torch.float16, device="cuda").to("cpu").contiguous() + ) + up_proj = ( + torch.randn((intermediate_size, hidden_size), dtype=torch.float16, device="cuda").to("cpu").contiguous() + ) + down_proj = ( + torch.randn((hidden_size, intermediate_size), dtype=torch.float16, device="cuda").to("cpu").contiguous() + ) + config = kt_kernel_ext.mlp.MLPConfig( + hidden_size, + intermediate_size, + stride, + group_max_len, + gate_proj.data_ptr(), + up_proj.data_ptr(), + down_proj.data_ptr(), + gate_type, + up_type, + down_type, + hidden_type, + ) mlp = kt_kernel_ext.mlp.MLP(config) gate_projs.append(gate_proj) up_projs.append(up_proj) @@ -61,22 +83,16 @@ with torch.inference_mode(mode=True): output = torch.empty((qlen, hidden_size), dtype=torch.float16).contiguous() input = input / 100 - CPUInfer.submit( - mlp.forward( - qlen, - input.data_ptr(), - output.data_ptr() - ) - ) + CPUInfer.submit(mlp.forward(qlen, input.data_ptr(), output.data_ptr())) CPUInfer.sync() # print('cpuinfer output', output) - gate_proj = gate_projs[i%layer_num] - up_proj = up_projs[i%layer_num] - down_proj = down_projs[i%layer_num] + gate_proj = gate_projs[i % layer_num] + up_proj = up_projs[i % layer_num] + down_proj = down_projs[i % layer_num] t_output = mlp_torch(input, gate_proj, up_proj, down_proj) # print('torch output', t_output) diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output)) - print('diff = ', diff) - assert(diff < 0.001) + print("diff = ", diff) + assert diff < 0.001 diff --git a/kt-kernel/examples/test_moe.py b/kt-kernel/examples/test_moe.py index 3487d322..9411ba54 100644 --- a/kt-kernel/examples/test_moe.py +++ b/kt-kernel/examples/test_moe.py @@ -1,18 +1,19 @@ #!/usr/bin/env python # coding=utf-8 -''' -Description : +""" 
+Description : Author : chenht2022 Date : 2024-07-25 10:32:05 Version : 1.0.0 -LastEditors : SkqLiao +LastEditors : SkqLiao LastEditTime : 2025-03-13 11:38:05 -Copyright (c) 2024 by KVCache.AI, All Rights Reserved. -''' +Copyright (c) 2024 by KVCache.AI, All Rights Reserved. +""" import os, sys import time -sys.path.insert(0, os.path.dirname(__file__) + '/../build') -import kt_kernel_ext + +sys.path.insert(0, os.path.dirname(__file__) + "/../build") +from kt_kernel import kt_kernel_ext import torch from tqdm import tqdm from kt_kernel_ext.kvcache import ggml_type @@ -20,7 +21,7 @@ from kt_kernel_ext.kvcache import ggml_type torch.manual_seed(0) expert_num = 8 -hidden_size = 2048 #7168 +hidden_size = 2048 # 7168 intermediate_size = 2048 stride = 32 group_min_len = 10 @@ -39,9 +40,11 @@ layer_num = 1 CPUInfer = kt_kernel_ext.CPUInfer(64) validation_iter = 10 + def act_fn(x): return x / (1.0 + torch.exp(-x)) + def mlp_torch(input, gate_proj, up_proj, down_proj): gate_buf = torch.mm(input, gate_proj.t()) up_buf = torch.mm(input, up_proj.t()) @@ -49,6 +52,7 @@ def mlp_torch(input, gate_proj, up_proj, down_proj): ret = torch.mm(intermediate, down_proj.t()) return ret + def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj): cnts = expert_ids.new_zeros((expert_ids.shape[0], expert_num)) cnts.scatter_(1, expert_ids, 1) @@ -85,10 +89,12 @@ def to_cpuinfer_tensor(tensor, type): size = torch.prod(torch.tensor(tensor.shape, dtype=torch.int32)).item() return kt_kernel_ext.utils.from_float(tensor.data_ptr(), size, type) + def from_cpuinfer_tensor(tensor, size, type): return kt_kernel_ext.utils.to_float(tensor.data_ptr(), size, type) -qlens = [1,64] #[64, 512, 2048, 8192, 16384] + +qlens = [1, 64] # [64, 512, 2048, 8192, 16384] # gate_types = [ggml_type.FP32, ggml_type.FP16, ggml_type.Q8_0, ggml_type.Q6_K, ggml_type.Q5_K, ggml_type.Q4_K, ggml_type.Q3_K] # up_types = [ggml_type.FP32, ggml_type.FP16, ggml_type.Q8_0, ggml_type.Q6_K, ggml_type.Q5_K, ggml_type.Q4_K, ggml_type.Q3_K] # down_types = [ggml_type.FP32, ggml_type.FP16, ggml_type.Q8_0, ggml_type.Q6_K, ggml_type.Q6_K, ggml_type.Q6_K, ggml_type.Q5_K] @@ -96,8 +102,8 @@ gate_types = [ggml_type.Q4_K] up_types = [ggml_type.Q4_K] down_types = [ggml_type.Q6_K] hidden_type = ggml_type.BF16 -print(f'Parameters: expert_num: {expert_num} hidden_size: {hidden_size} intermediate_size: {intermediate_size}') -print(f'group_max_len: ', group_max_len) +print(f"Parameters: expert_num: {expert_num} hidden_size: {hidden_size} intermediate_size: {intermediate_size}") +print(f"group_max_len: ", group_max_len) for qlen in qlens: for gate_type, up_type, down_type in zip(gate_types, up_types, down_types): @@ -106,18 +112,30 @@ for qlen in qlens: gate_projs = [] up_projs = [] down_projs = [] - print('Preparing data...') + print("Preparing data...") converted_tensors = [] for _ in range(layer_num): size = expert_num * intermediate_size * hidden_size - gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous() - up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous() - down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous() - + gate_proj = ( + torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda") + .to("cpu") + .contiguous() + ) + up_proj = ( + torch.randn((expert_num, intermediate_size, hidden_size), 
dtype=torch.float32, device="cuda") + .to("cpu") + .contiguous() + ) + down_proj = ( + torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device="cuda") + .to("cpu") + .contiguous() + ) + gate_tensor = to_cpuinfer_tensor(gate_proj, gate_type) up_tensor = to_cpuinfer_tensor(up_proj, up_type) down_tensor = to_cpuinfer_tensor(down_proj, down_type) - + config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size) config.pool = CPUInfer.backend_ config.stride = stride @@ -131,59 +149,62 @@ for qlen in qlens: config.down_type = down_type config.hidden_type = hidden_type - moe = kt_kernel_ext.moe.MOE(config) gate_projs.append(gate_proj) up_projs.append(up_proj) - down_projs.append(down_proj) + down_projs.append(down_proj) CPUInfer.submit(moe.load_weights_task()) CPUInfer.sync() moes.append(moe) converted_tensors.append((gate_tensor, up_tensor, down_tensor)) - print('Finished initialization!') + print("Finished initialization!") CPUInfer.submit(moes[0].warm_up_task()) CPUInfer.sync() - print('Warm up finished!') + print("Warm up finished!") # validation progress_bar = tqdm(range(validation_iter), desc="Starting") total_diff = 0 - + for i in tqdm(progress_bar): - progress_bar.set_description('Round: {}/{}'.format(i + 1, validation_iter)) - expert_ids = torch.stack([torch.randperm(expert_num)[:num_experts_per_tok] for _ in range(qlen)]).contiguous() + progress_bar.set_description("Round: {}/{}".format(i + 1, validation_iter)) + expert_ids = torch.stack( + [torch.randperm(expert_num)[:num_experts_per_tok] for _ in range(qlen)] + ).contiguous() weights = torch.rand((qlen, num_experts_per_tok), dtype=torch.float32).contiguous() input_proj = torch.randn((qlen, hidden_size), dtype=torch.float32).contiguous() / 100 output_proj = torch.empty((qlen, hidden_size), dtype=torch.float32).contiguous() - + input_tensor = to_cpuinfer_tensor(input_proj, hidden_type) output_tensor = to_cpuinfer_tensor(output_proj, hidden_type) - + qlen_tensor = torch.tensor([qlen], dtype=torch.int32) moe = moes[i % layer_num] CPUInfer.submit( - moe.forward_task( + moe.forward_task( qlen_tensor.data_ptr(), - num_experts_per_tok, - expert_ids.data_ptr(), - weights.data_ptr(), - input_tensor.data_ptr(), + num_experts_per_tok, + expert_ids.data_ptr(), + weights.data_ptr(), + input_tensor.data_ptr(), output_tensor.data_ptr(), ) ) CPUInfer.sync() cpu_output = from_cpuinfer_tensor(output_tensor, qlen * hidden_size, hidden_type) - gate_proj = gate_projs[i%layer_num] - up_proj = up_projs[i%layer_num] - down_proj = down_projs[i%layer_num] + gate_proj = gate_projs[i % layer_num] + up_proj = up_projs[i % layer_num] + down_proj = down_projs[i % layer_num] t_output = moe_torch(input_proj, expert_ids, weights, gate_proj, up_proj, down_proj) - print('cpuinfer output', cpu_output) - print('torch output', t_output) - diff = torch.mean(torch.abs(cpu_output.flatten() - t_output.flatten())) / torch.mean(torch.abs(t_output.flatten())) + print("cpuinfer output", cpu_output) + print("torch output", t_output) + diff = torch.mean(torch.abs(cpu_output.flatten() - t_output.flatten())) / torch.mean( + torch.abs(t_output.flatten()) + ) assert diff < 0.5 total_diff += diff - - print(f'gate_type: {gate_type}, up_type: {up_type}, down_type: {down_type}') - print(f'Average diff: {total_diff / validation_iter:.4f}') + + print(f"gate_type: {gate_type}, up_type: {up_type}, down_type: {down_type}") + print(f"Average diff: {total_diff / validation_iter:.4f}") diff --git 
a/kt-kernel/examples/test_moe_amx.py b/kt-kernel/examples/test_moe_amx.py index ef270ff5..280ae5f7 100644 --- a/kt-kernel/examples/test_moe_amx.py +++ b/kt-kernel/examples/test_moe_amx.py @@ -4,7 +4,7 @@ sys.path.insert(0, os.path.dirname(__file__) + "/../build") print("sys.path:", sys.path) import torch -import kt_kernel_ext +from kt_kernel import kt_kernel_ext expert_num = 256 hidden_size = 7168 diff --git a/kt-kernel/examples/test_moe_kernel.py b/kt-kernel/examples/test_moe_kernel.py index 207b9a6d..ff7efd19 100644 --- a/kt-kernel/examples/test_moe_kernel.py +++ b/kt-kernel/examples/test_moe_kernel.py @@ -15,7 +15,7 @@ import time sys.path.insert(0, os.path.dirname(__file__) + "/../build") os.environ["BLAS_NUM_THREADS"] = "1" import torch -import kt_kernel_ext +from kt_kernel import kt_kernel_ext expert_num = 16 diff --git a/kt-kernel/examples/test_moe_kml.py b/kt-kernel/examples/test_moe_kml.py index e2819184..ca8a7c54 100644 --- a/kt-kernel/examples/test_moe_kml.py +++ b/kt-kernel/examples/test_moe_kml.py @@ -14,7 +14,7 @@ import time sys.path.insert(0, os.path.dirname(__file__) + "/../build") os.environ["BLAS_NUM_THREADS"] = "1" -import kt_kernel_ext +from kt_kernel import kt_kernel_ext import torch expert_num = 16 diff --git a/kt-kernel/python/experts_base.py b/kt-kernel/python/experts_base.py index 3da70e0c..365fe206 100644 --- a/kt-kernel/python/experts_base.py +++ b/kt-kernel/python/experts_base.py @@ -15,7 +15,7 @@ from abc import ABC, abstractmethod import os import ctypes -import kt_kernel_ext +from kt_kernel import kt_kernel_ext class KExpertsCPUBuffer:
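Taken together, the changes in this patch swap every bare `import kt_kernel_ext` for `from kt_kernel import kt_kernel_ext` while leaving the sys.path inserts for in-tree builds in place. A sketch of a tolerant import that would accept either layout (illustrative only; the patch itself does not add a fallback):

    import os
    import sys

    try:
        from kt_kernel import kt_kernel_ext  # packaged layout used by this patch
    except ImportError:
        # fall back to an in-tree build directory, mirroring the sys.path inserts above
        sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
        import kt_kernel_ext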