diff --git a/ktransformers/ktransformers_ext/CMakeLists.txt b/ktransformers/ktransformers_ext/CMakeLists.txt index 02e6a04..c3d4f5b 100644 --- a/ktransformers/ktransformers_ext/CMakeLists.txt +++ b/ktransformers/ktransformers_ext/CMakeLists.txt @@ -22,14 +22,13 @@ option(LLAMA_AVX2 "llama: enable AVX2" option(LLAMA_AVX512 "llama: enable AVX512" OFF) option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF) option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF) +option(LLAMA_AVX512_BF16 "llama: enable AVX512-BF16" OFF) option(LLAMA_FMA "llama: enable FMA" OFF) # in MSVC F16C is implied with AVX2/AVX512 if (NOT MSVC) option(LLAMA_F16C "llama: enable F16C" OFF) endif() option(LLAMA_AVX512_FANCY_SIMD "llama: enable AVX512-VL, AVX512-BW, AVX512-DQ, AVX512-VNNI" OFF) -option(LLAMA_AVX512_BF16 "llama: enable AVX512-BF16" OFF) - # Architecture specific # TODO: probably these flags need to be tweaked on some architectures diff --git a/ktransformers/ktransformers_ext/bench/bench_linear.py b/ktransformers/ktransformers_ext/bench/bench_linear.py index 0a4de3a..3189afd 100644 --- a/ktransformers/ktransformers_ext/bench/bench_linear.py +++ b/ktransformers/ktransformers_ext/bench/bench_linear.py @@ -6,7 +6,7 @@ Author : chenht2022 Date : 2024-07-25 10:31:59 Version : 1.0.0 LastEditors : chenht2022 -LastEditTime : 2024-07-25 10:32:51 +LastEditTime : 2024-08-06 10:35:35 Copyright (c) 2024 by KVCache.AI, All Rights Reserved. ''' import os, sys @@ -15,15 +15,18 @@ sys.path.append(os.path.dirname(__file__) + '/../build') import cpuinfer_ext import torch +input_size = 16384 +output_size = 5120 +stride = 16 +group_max_len = 1024 +layer_num = 10 +qlen = 1 +CPUInfer = cpuinfer_ext.CPUInfer(64) +warm_up_iter = 1000 +test_iter = 10000 + def bench_linear(quant_mode: str): with torch.inference_mode(mode=True): - input_size = 16384 - output_size = 5120 - stride = 16 - layer_num = 10 - CPUInfer = cpuinfer_ext.CPUInfer(64) - warm_up_iter = 1000 - test_iter = 10000 hidden_type = 30 # ggml_type::GGML_TYPE_BF16 if quant_mode == "fp32": @@ -66,30 +69,37 @@ def bench_linear(quant_mode: str): projs = [] for _ in range(layer_num): proj = torch.randn((output_size, input_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous() - config = cpuinfer_ext.linear.LinearConfig(input_size, output_size, stride, proj.data_ptr(), proj_type, hidden_type) + config = cpuinfer_ext.linear.LinearConfig(input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type) linear = cpuinfer_ext.linear.Linear(config) projs.append(proj) linears.append(linear) + input = torch.randn((layer_num, qlen, input_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous() + output = torch.empty((layer_num, qlen, output_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous() # warm up for i in range(warm_up_iter): - linear = linears[i % layer_num] - input = torch.randn((1, input_size), dtype=torch.bfloat16).contiguous() - output = torch.empty((1, output_size), dtype=torch.bfloat16).contiguous() - CPUInfer.submit(linear.forward, input.data_ptr(), output.data_ptr()) + CPUInfer.submit( + linears[i % layer_num].forward( + qlen, + input[i % layer_num].data_ptr(), + output[i % layer_num].data_ptr() + ) + ) CPUInfer.sync() # test - total_time = 0 + start = time.perf_counter() for i in range(test_iter): - linear = linears[i % layer_num] - input = torch.randn((1, input_size), dtype=torch.bfloat16).contiguous() - output = torch.empty((1, output_size), dtype=torch.bfloat16).contiguous() - start = time.perf_counter() 
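The benchmark changes above switch to a new calling convention: `linear.forward(...)` no longer runs the kernel itself; under the rewritten bindings it returns a packed (function pointer, argument struct) pair that `CPUInfer.submit` dispatches onto the CPU backend. A minimal usage sketch of that convention follows, assuming `cpuinfer_ext` is importable (built under `../build` as in these benchmarks) and using the ggml type ids referenced in this patch (0 = GGML_TYPE_F32, 30 = GGML_TYPE_BF16); the sizes are illustrative, not prescriptive.

```python
# Sketch only: the new submit() API consumes the task descriptor returned by
# linear.forward() instead of a bound method plus raw pointer arguments.
import torch
import cpuinfer_ext  # assumed importable, e.g. from the ../build directory

input_size, output_size, qlen = 16384, 5120, 1
cpu_infer = cpuinfer_ext.CPUInfer(64)

proj = torch.randn((output_size, input_size), dtype=torch.float32).contiguous()
config = cpuinfer_ext.linear.LinearConfig(
    input_size, output_size,
    16, 1024,            # stride, group_max_len (new config field in this patch)
    proj.data_ptr(),
    0, 30)               # proj_type = GGML_TYPE_F32, hidden_type = GGML_TYPE_BF16
linear = cpuinfer_ext.linear.Linear(config)

x = torch.randn((qlen, input_size), dtype=torch.bfloat16).contiguous()
y = torch.empty((qlen, output_size), dtype=torch.bfloat16).contiguous()

# forward() only builds the (func, args) pair; submit() enqueues it, sync() waits.
cpu_infer.submit(linear.forward(qlen, x.data_ptr(), y.data_ptr()))
cpu_infer.sync()
```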
- CPUInfer.submit(linear.forward, input.data_ptr(), output.data_ptr()) + CPUInfer.submit( + linears[i % layer_num].forward( + qlen, + input[i % layer_num].data_ptr(), + output[i % layer_num].data_ptr() + ) + ) CPUInfer.sync() - end = time.perf_counter() - total_time += end - start + end = time.perf_counter() + total_time = end - start print('Quant mode: ', quant_mode) print('Time(s): ', total_time) print('Iteration: ', test_iter) diff --git a/ktransformers/ktransformers_ext/bench/bench_linear_torch.py b/ktransformers/ktransformers_ext/bench/bench_linear_torch.py index cb3e4ef..72e0e75 100644 --- a/ktransformers/ktransformers_ext/bench/bench_linear_torch.py +++ b/ktransformers/ktransformers_ext/bench/bench_linear_torch.py @@ -14,14 +14,17 @@ import time import torch import torch.nn.quantized as nnq +scale, zero_point = 0.1, 0 # Adjust scale and zero_point based on your dataset + +input_size = 16384 +output_size = 5120 +layer_num = 10 +qlen = 1 +warm_up_iter = 1000 +test_iter = 10000 + def bench_linear(quant_mode: str): with torch.inference_mode(mode=True): - input_size = 16384 - output_size = 5120 - layer_num = 10 - warm_up_iter = 1000 - test_iter = 10000 - if quant_mode == "fp32": proj_type = torch.float32 bytes_per_elem = 4.000000 @@ -41,37 +44,32 @@ def bench_linear(quant_mode: str): for _ in range(layer_num): proj = torch.randn((output_size, input_size), dtype = torch.float32, device = "cuda").to("cpu").contiguous() if quant_mode == "qint8": - scale, zero_point = 0.1, 0 # Adjust scale and zero_point based on your dataset proj_q = torch.quantize_per_tensor(proj, scale, zero_point, torch.qint8) quantized_layer = nnq.Linear(input_size, output_size) quantized_layer.set_weight_bias(proj_q, None) projs.append(quantized_layer) else: projs.append(proj.to(proj_type)) + input = torch.randn((layer_num, qlen, input_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous() # warm up for i in range(warm_up_iter): - input = torch.randn((1, input_size), dtype=torch.float32).contiguous() - if quant_mode == "qint8": - input_q = torch.quantize_per_tensor(input, scale, zero_point, torch.quint8) - quantized_layer = projs[i % layer_num] - t_output = quantized_layer(input_q) + if isinstance(projs[i % layer_num], nnq.Linear): + input_q = torch.quantize_per_tensor(input[i % layer_num].to(torch.float32), scale, zero_point, torch.quint8) + t_output = projs[i % layer_num](input_q) else: - t_output = torch.mm(input.to(proj_type), projs[i % layer_num].t()) + t_output = torch.mm(input[i % layer_num].to(proj_type), projs[i % layer_num].t()) # test - total_time = 0 + start = time.perf_counter() for i in range(test_iter): - input = torch.randn((1, input_size), dtype=torch.float32).contiguous() - start = time.perf_counter() - if quant_mode == "qint8": - input_q = torch.quantize_per_tensor(input, scale, zero_point, torch.quint8) - quantized_layer = projs[i % layer_num] - t_output = quantized_layer(input_q) + if isinstance(projs[i % layer_num], nnq.Linear): + input_q = torch.quantize_per_tensor(input[i % layer_num].to(torch.float32), scale, zero_point, torch.quint8) + t_output = projs[i % layer_num](input_q) else: - t_output = torch.mm(input.to(proj_type), projs[i % layer_num].t()) - end = time.perf_counter() - total_time += end - start + t_output = torch.mm(input[i % layer_num].to(proj_type), projs[i % layer_num].t()) + end = time.perf_counter() + total_time = end - start print('Quant mode: ', quant_mode) print('Time(s): ', total_time) print('Iteration: ', test_iter) diff --git 
a/ktransformers/ktransformers_ext/bench/bench_mlp.py b/ktransformers/ktransformers_ext/bench/bench_mlp.py index 5680a9b..690f9d9 100644 --- a/ktransformers/ktransformers_ext/bench/bench_mlp.py +++ b/ktransformers/ktransformers_ext/bench/bench_mlp.py @@ -6,7 +6,7 @@ Author : chenht2022 Date : 2024-07-16 10:43:18 Version : 1.0.0 LastEditors : chenht2022 -LastEditTime : 2024-07-25 10:32:55 +LastEditTime : 2024-08-06 10:36:04 Copyright (c) 2024 by KVCache.AI, All Rights Reserved. ''' import os, sys @@ -15,15 +15,18 @@ sys.path.append(os.path.dirname(__file__) + '/../build') import cpuinfer_ext import torch +hidden_size = 5120 +intermediate_size = 3072 +stride = 16 +group_max_len = 1024 +layer_num = 10 +qlen = 1 +CPUInfer = cpuinfer_ext.CPUInfer(64) +warm_up_iter = 1000 +test_iter = 10000 + def bench_mlp(quant_mode: str): with torch.inference_mode(mode=True): - hidden_size = 5120 - intermediate_size = 3072 - stride = 16 - layer_num = 10 - CPUInfer = cpuinfer_ext.CPUInfer(64) - warm_up_iter = 1000 - test_iter = 10000 hidden_type = 30 # ggml_type::GGML_TYPE_BF16 if quant_mode == "fp32": @@ -93,32 +96,39 @@ def bench_mlp(quant_mode: str): gate_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous() up_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous() down_proj = torch.randn((hidden_size, intermediate_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous() - config = cpuinfer_ext.mlp.MLPConfig(hidden_size, intermediate_size, stride, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type) + config = cpuinfer_ext.mlp.MLPConfig(hidden_size, intermediate_size, stride, group_max_len, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type) mlp = cpuinfer_ext.mlp.MLP(config) gate_projs.append(gate_proj) up_projs.append(up_proj) down_projs.append(down_proj) mlps.append(mlp) + input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous() + output = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous() # warm up for i in range(warm_up_iter): - mlp = mlps[i % layer_num] - input = torch.randn((1, hidden_size), dtype=torch.bfloat16).contiguous() - output = torch.empty((1, hidden_size), dtype=torch.bfloat16).contiguous() - CPUInfer.submit(mlp.forward, input.data_ptr(), output.data_ptr()) + CPUInfer.submit( + mlps[i % layer_num].forward( + qlen, + input[i % layer_num].data_ptr(), + output[i % layer_num].data_ptr() + ) + ) CPUInfer.sync() # test - total_time = 0 + start = time.perf_counter() for i in range(test_iter): - mlp = mlps[i % layer_num] - input = torch.randn((1, hidden_size), dtype=torch.bfloat16).contiguous() - output = torch.empty((1, hidden_size), dtype=torch.bfloat16).contiguous() - start = time.perf_counter() - CPUInfer.submit(mlp.forward, input.data_ptr(), output.data_ptr()) + CPUInfer.submit( + mlps[i % layer_num].forward( + qlen, + input[i % layer_num].data_ptr(), + output[i % layer_num].data_ptr() + ) + ) CPUInfer.sync() - end = time.perf_counter() - total_time += end - start + end = time.perf_counter() + total_time = end - start print('Quant mode: ', quant_mode) print('Time(s): ', total_time) print('Iteration: ', test_iter) diff --git a/ktransformers/ktransformers_ext/bench/bench_mlp_torch.py b/ktransformers/ktransformers_ext/bench/bench_mlp_torch.py index 
3aad58c..7b811d8 100644 --- a/ktransformers/ktransformers_ext/bench/bench_mlp_torch.py +++ b/ktransformers/ktransformers_ext/bench/bench_mlp_torch.py @@ -14,17 +14,38 @@ import time import torch import torch.nn.quantized as nnq +scale, zero_point = 0.1, 0 # Adjust scale and zero_point based on your dataset + +hidden_size = 5120 +intermediate_size = 3072 +layer_num = 10 +qlen = 1 +warm_up_iter = 1000 +test_iter = 10000 + def act_fn(x): return x / (1.0 + torch.exp(-x)) +def mlp_torch(input, gate_proj, up_proj, down_proj): + if isinstance(gate_proj, nnq.Linear): + input_q = torch.quantize_per_tensor(input.to(torch.float32), scale, zero_point, torch.quint8) + gate_buf = gate_proj(input_q) + up_buf = up_proj(input_q) + gate_buf = gate_buf.dequantize() + up_buf = up_buf.dequantize() + intermediate = act_fn(gate_buf) * up_buf + intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8) + expert_output = down_proj(intermediate_q) + ret = expert_output.dequantize() + else: + gate_buf = torch.mm(input.to(gate_proj.dtype), gate_proj.t()) + up_buf = torch.mm(input.to(up_proj.dtype), up_proj.t()) + intermediate = act_fn(gate_buf) * up_buf + ret = torch.mm(intermediate.to(down_proj.dtype), down_proj.t()) + return ret + def bench_mlp(quant_mode: str): with torch.inference_mode(mode=True): - hidden_size = 5120 - intermediate_size = 3072 - layer_num = 10 - warm_up_iter = 1000 - test_iter = 10000 - if quant_mode == "fp32": proj_type = torch.float32 bytes_per_elem = 4.000000 @@ -48,7 +69,6 @@ def bench_mlp(quant_mode: str): up_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous() down_proj = torch.randn((hidden_size, intermediate_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous() if quant_mode == "qint8": - scale, zero_point = 0.1, 0 # Adjust scale and zero_point based on your dataset gate_proj_q = torch.quantize_per_tensor(gate_proj, scale, zero_point, torch.qint8) quantized_gate = nnq.Linear(hidden_size, intermediate_size) quantized_gate.set_weight_bias(gate_proj_q, None) @@ -65,58 +85,18 @@ def bench_mlp(quant_mode: str): gate_projs.append(gate_proj.to(proj_type)) up_projs.append(up_proj.to(proj_type)) down_projs.append(down_proj.to(proj_type)) + input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous() # warm up for i in range(warm_up_iter): - input = torch.randn((1, hidden_size), dtype=torch.float32).contiguous() - if quant_mode == "qint8": - input_q = torch.quantize_per_tensor(input, scale, zero_point, torch.quint8) - quantized_gate = gate_projs[i % layer_num] - gate_buf = quantized_gate(input_q) - quantized_up = up_projs[i % layer_num] - up_buf = quantized_gate(input_q) - gate_buf = gate_buf.dequantize() - up_buf = up_buf.dequantize() - intermediate = act_fn(gate_buf) * up_buf - intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8) - quantized_down = down_projs[i % layer_num] - t_output = quantized_down(intermediate_q) - else: - gate_proj = gate_projs[i%layer_num] - up_proj = up_projs[i%layer_num] - down_proj = down_projs[i%layer_num] - gate_buf = torch.mm(input.to(proj_type), gate_proj.t()) - up_buf = torch.mm(input.to(proj_type), up_proj.t()) - intermediate = act_fn(gate_buf) * up_buf - t_output = torch.mm(intermediate.to(proj_type), down_proj.t()) + mlp_torch(input[i % layer_num], gate_projs[i % layer_num], up_projs[i % layer_num], down_projs[i % layer_num]) # test - total_time = 0 + start = 
time.perf_counter() for i in range(test_iter): - input = torch.randn((1, hidden_size), dtype=torch.float32).contiguous() - start = time.perf_counter() - if quant_mode == "qint8": - input_q = torch.quantize_per_tensor(input, scale, zero_point, torch.quint8) - quantized_gate = gate_projs[i % layer_num] - gate_buf = quantized_gate(input_q) - quantized_up = up_projs[i % layer_num] - up_buf = quantized_gate(input_q) - gate_buf = gate_buf.dequantize() - up_buf = up_buf.dequantize() - intermediate = act_fn(gate_buf) * up_buf - intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8) - quantized_down = down_projs[i % layer_num] - t_output = quantized_down(intermediate_q) - else: - gate_proj = gate_projs[i%layer_num] - up_proj = up_projs[i%layer_num] - down_proj = down_projs[i%layer_num] - gate_buf = torch.mm(input.to(proj_type), gate_proj.t()) - up_buf = torch.mm(input.to(proj_type), up_proj.t()) - intermediate = act_fn(gate_buf) * up_buf - t_output = torch.mm(intermediate.to(proj_type), down_proj.t()) - end = time.perf_counter() - total_time += end - start + mlp_torch(input[i % layer_num], gate_projs[i % layer_num], up_projs[i % layer_num], down_projs[i % layer_num]) + end = time.perf_counter() + total_time = end - start print('Quant mode: ', quant_mode) print('Time(s): ', total_time) print('Iteration: ', test_iter) diff --git a/ktransformers/ktransformers_ext/bench/bench_moe.py b/ktransformers/ktransformers_ext/bench/bench_moe.py index 909f029..6d617b7 100644 --- a/ktransformers/ktransformers_ext/bench/bench_moe.py +++ b/ktransformers/ktransformers_ext/bench/bench_moe.py @@ -6,7 +6,7 @@ Author : chenht2022 Date : 2024-07-25 10:32:05 Version : 1.0.0 LastEditors : chenht2022 -LastEditTime : 2024-07-25 10:33:00 +LastEditTime : 2024-08-06 10:41:28 Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
''' import os, sys @@ -15,21 +15,21 @@ sys.path.append(os.path.dirname(__file__) + '/../build') import cpuinfer_ext import torch +expert_num = 160 +hidden_size = 5120 +intermediate_size = 1536 +stride = 16 +group_min_len = 10 +group_max_len = 1024 +n_routed_experts = 6 +layer_num = 10 +qlen = 1 +CPUInfer = cpuinfer_ext.CPUInfer(64) +warm_up_iter = 1000 +test_iter = 10000 + def bench_moe(quant_mode: str): with torch.inference_mode(mode=True): - expert_num = 10 - hidden_size = 5120 - intermediate_size = 1536 - stride = 16 - group_min_len = 10 - group_max_len = 1024 - n_routed_experts = 6 - layer_num = 10 - qlen = 1 - CPUInfer = cpuinfer_ext.CPUInfer(64) - warm_up_iter = 1000 - test_iter = 10000 - hidden_type = 30 # ggml_type::GGML_TYPE_BF16 if quant_mode == "fp32": gate_type = 0 # ggml_type::GGML_TYPE_F32 @@ -104,32 +104,38 @@ def bench_moe(quant_mode: str): up_projs.append(up_proj) down_projs.append(down_proj) moes.append(moe) - expert_ids = torch.randint(0, expert_num, (layer_num, qlen, n_routed_experts), dtype=torch.int64, device = "cuda").to("cpu").contiguous() + expert_ids = torch.stack([torch.stack([torch.randperm(expert_num, dtype=torch.int64, device = "cuda")[:n_routed_experts] for _ in range(qlen)]) for _ in range(layer_num)]).to("cpu").contiguous() weights = torch.rand((layer_num, qlen, n_routed_experts), dtype=torch.float32, device = "cuda").to("cpu").contiguous() input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous() output = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous() # warm up for i in range(warm_up_iter): - CPUInfer.submit(moes[i % layer_num].forward, - qlen, - n_routed_experts, - expert_ids[i % layer_num].data_ptr(), - weights[i % layer_num].data_ptr(), - input[i % layer_num].data_ptr(), - output[i % layer_num].data_ptr()) + CPUInfer.submit( + moes[i % layer_num].forward( + qlen, + n_routed_experts, + expert_ids[i % layer_num].data_ptr(), + weights[i % layer_num].data_ptr(), + input[i % layer_num].data_ptr(), + output[i % layer_num].data_ptr() + ) + ) CPUInfer.sync() # test start = time.perf_counter() for i in range(test_iter): - CPUInfer.submit(moes[i % layer_num].forward, - qlen, - n_routed_experts, - expert_ids[i % layer_num].data_ptr(), - weights[i % layer_num].data_ptr(), - input[i % layer_num].data_ptr(), - output[i % layer_num].data_ptr()) + CPUInfer.submit( + moes[i % layer_num].forward( + qlen, + n_routed_experts, + expert_ids[i % layer_num].data_ptr(), + weights[i % layer_num].data_ptr(), + input[i % layer_num].data_ptr(), + output[i % layer_num].data_ptr() + ) + ) CPUInfer.sync() end = time.perf_counter() total_time = end - start diff --git a/ktransformers/ktransformers_ext/bench/bench_moe_torch.py b/ktransformers/ktransformers_ext/bench/bench_moe_torch.py index 5075636..1aecf40 100644 --- a/ktransformers/ktransformers_ext/bench/bench_moe_torch.py +++ b/ktransformers/ktransformers_ext/bench/bench_moe_torch.py @@ -14,19 +14,71 @@ import time import torch import torch.nn.quantized as nnq +scale, zero_point = 0.1, 0 # Adjust scale and zero_point based on your dataset + +expert_num = 160 +hidden_size = 5120 +intermediate_size = 1536 +n_routed_experts = 6 +layer_num = 10 +qlen = 1 +warm_up_iter = 1000 +test_iter = 10000 + def act_fn(x): return x / (1.0 + torch.exp(-x)) +def mlp_torch(input, gate_proj, up_proj, down_proj): + if isinstance(gate_proj, nnq.Linear): + input_q = torch.quantize_per_tensor(input.to(torch.float32), scale, zero_point, 
torch.quint8) + gate_buf = gate_proj(input_q) + up_buf = up_proj(input_q) + gate_buf = gate_buf.dequantize() + up_buf = up_buf.dequantize() + intermediate = act_fn(gate_buf) * up_buf + intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8) + expert_output = down_proj(intermediate_q) + ret = expert_output.dequantize() + else: + gate_buf = torch.mm(input.to(gate_proj.dtype), gate_proj.t()) + up_buf = torch.mm(input.to(up_proj.dtype), up_proj.t()) + intermediate = act_fn(gate_buf) * up_buf + ret = torch.mm(intermediate.to(down_proj.dtype), down_proj.t()) + return ret + +def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj): + cnts = expert_ids.new_zeros((expert_ids.shape[0], expert_num)) + cnts.scatter_(1, expert_ids, 1) + tokens_per_expert = cnts.sum(dim=0) + idxs = expert_ids.view(-1).argsort() + sorted_tokens = input[idxs // expert_ids.shape[1]] + + outputs = [] + start_idx = 0 + for i, num_tokens in enumerate(tokens_per_expert): + end_idx = start_idx + num_tokens + if num_tokens == 0: + continue + tokens_for_this_expert = sorted_tokens[start_idx:end_idx] + expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i]) + outputs.append(expert_out) + start_idx = end_idx + + outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0) + + new_x = torch.empty_like(outs) + new_x[idxs] = outs + t_output = ( + new_x.view(*expert_ids.shape, -1) + .type(weights.dtype) + .mul_(weights.unsqueeze(dim=-1)) + .sum(dim=1) + .type(new_x.dtype) + ) + return t_output + def bench_moe(quant_mode: str): with torch.inference_mode(mode=True): - expert_num = 10 - hidden_size = 5120 - intermediate_size = 1536 - n_routed_experts = 6 - layer_num = 10 - warm_up_iter = 1000 - test_iter = 10000 - if quant_mode == "fp32": proj_type = torch.float32 bytes_per_elem = 4.000000 @@ -50,7 +102,6 @@ def bench_moe(quant_mode: str): up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous() down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous() if quant_mode == "qint8": - scale, zero_point = 0.1, 0 # Adjust scale and zero_point based on your dataset quantized_gate_proj = [] quantized_up_proj = [] quantized_down_proj = [] @@ -74,82 +125,20 @@ def bench_moe(quant_mode: str): gate_projs.append(gate_proj.to(proj_type)) up_projs.append(up_proj.to(proj_type)) down_projs.append(down_proj.to(proj_type)) + expert_ids = torch.stack([torch.stack([torch.randperm(expert_num, dtype=torch.int64, device = "cuda")[:n_routed_experts] for _ in range(qlen)]) for _ in range(layer_num)]).to("cpu").contiguous() + weights = torch.rand((layer_num, qlen, n_routed_experts), dtype=torch.float32, device = "cuda").to("cpu").contiguous() + input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous() # warm up for i in range(warm_up_iter): - expert_ids = torch.randint(0, expert_num, (n_routed_experts,), dtype=torch.int64).contiguous() - weights = torch.rand((n_routed_experts,), dtype=torch.float32).contiguous() - input = torch.randn((1, hidden_size), dtype=torch.float32).contiguous() - if quant_mode == "qint8": - input_q = torch.quantize_per_tensor(input, scale, zero_point, torch.quint8) - t_output = torch.zeros((1, hidden_size), dtype=torch.float32).contiguous() - gate_proj = gate_projs[i%layer_num] - up_proj = up_projs[i%layer_num] - down_proj = 
down_projs[i%layer_num] - for i, expert_id in enumerate(expert_ids): - quantized_gate = gate_proj[expert_id] - gate_buf = quantized_gate(input_q) - quantized_up = up_proj[expert_id] - up_buf = quantized_up(input_q) - gate_buf = gate_buf.dequantize() - up_buf = up_buf.dequantize() - intermediate = act_fn(gate_buf) * up_buf - intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8) - quantized_down = down_proj[expert_id] - expert_output = quantized_down(intermediate_q) - expert_output = expert_output.dequantize() - t_output += weights[i] * expert_output - else: - t_output = torch.zeros((1, hidden_size), dtype=proj_type).contiguous() - gate_proj = gate_projs[i%layer_num] - up_proj = up_projs[i%layer_num] - down_proj = down_projs[i%layer_num] - for i, expert_id in enumerate(expert_ids): - gate_buf = torch.mm(input.to(proj_type), gate_proj[expert_id].t()) - up_buf = torch.mm(input.to(proj_type), up_proj[expert_id].t()) - intermediate = act_fn(gate_buf) * up_buf - expert_output = torch.mm(intermediate.to(proj_type), down_proj[expert_id].t()) - t_output += weights[i] * expert_output + moe_torch(input[i % layer_num], expert_ids[i % layer_num], weights[i % layer_num], gate_projs[i % layer_num], up_projs[i % layer_num], down_projs[i % layer_num]) # test - total_time = 0 + start = time.perf_counter() for i in range(test_iter): - expert_ids = torch.randint(0, expert_num, (n_routed_experts,), dtype=torch.int64).contiguous() - weights = torch.rand((n_routed_experts,), dtype=torch.float32).contiguous() - input = torch.randn((1, hidden_size), dtype=torch.float32).contiguous() - start = time.perf_counter() - if quant_mode == "qint8": - input_q = torch.quantize_per_tensor(input, scale, zero_point, torch.quint8) - t_output = torch.zeros((1, hidden_size), dtype=torch.float32).contiguous() - gate_proj = gate_projs[i%layer_num] - up_proj = up_projs[i%layer_num] - down_proj = down_projs[i%layer_num] - for i, expert_id in enumerate(expert_ids): - quantized_gate = gate_proj[expert_id] - gate_buf = quantized_gate(input_q) - quantized_up = up_proj[expert_id] - up_buf = quantized_up(input_q) - gate_buf = gate_buf.dequantize() - up_buf = up_buf.dequantize() - intermediate = act_fn(gate_buf) * up_buf - intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8) - quantized_down = down_proj[expert_id] - expert_output = quantized_down(intermediate_q) - expert_output = expert_output.dequantize() - t_output += weights[i] * expert_output - else: - t_output = torch.zeros((1, hidden_size), dtype=proj_type).contiguous() - gate_proj = gate_projs[i%layer_num] - up_proj = up_projs[i%layer_num] - down_proj = down_projs[i%layer_num] - for i, expert_id in enumerate(expert_ids): - gate_buf = torch.mm(input.to(proj_type), gate_proj[expert_id].t()) - up_buf = torch.mm(input.to(proj_type), up_proj[expert_id].t()) - intermediate = act_fn(gate_buf) * up_buf - expert_output = torch.mm(intermediate.to(proj_type), down_proj[expert_id].t()) - t_output += weights[i] * expert_output - end = time.perf_counter() - total_time += end - start + moe_torch(input[i % layer_num], expert_ids[i % layer_num], weights[i % layer_num], gate_projs[i % layer_num], up_projs[i % layer_num], down_projs[i % layer_num]) + end = time.perf_counter() + total_time = end - start print('Quant mode: ', quant_mode) print('Time(s): ', total_time) print('Iteration: ', test_iter) diff --git a/ktransformers/ktransformers_ext/cpu_backend/cpuinfer.h b/ktransformers/ktransformers_ext/cpu_backend/cpuinfer.h index 
eae6f90..9618e6b 100644 --- a/ktransformers/ktransformers_ext/cpu_backend/cpuinfer.h +++ b/ktransformers/ktransformers_ext/cpu_backend/cpuinfer.h @@ -1,12 +1,12 @@ /** - * @Description : + * @Description : * @Author : chenht2022 * @Date : 2024-07-16 10:43:18 * @Version : 1.0.0 - * @LastEditors : chenht2022 - * @LastEditTime : 2024-07-25 10:33:42 - * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. -**/ + * @LastEditors : chenht2022 + * @LastEditTime : 2024-08-07 09:47:43 + * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. + **/ #ifndef CPUINFER_CPUINFER_H #define CPUINFER_CPUINFER_H @@ -17,6 +17,7 @@ #include #include #include +#include "cuda_runtime.h" #include "backend.h" #include "task_queue.h" @@ -39,16 +40,39 @@ class CPUInfer { } template - void submit(Func f, Obj* obj, Args... args) { + void enqueue(Func f, Obj* obj, Args... args) { task_queue_->enqueue([=]() { std::invoke(f, *obj, args..., backend_); }); } + void submit(std::pair params) { + void (*func)(void*) = (void (*)(void*))params.first; + void* args = (void*)params.second; + *((CPUInfer**)args) = this; + func(args); + } + void sync() { task_queue_->sync(); } + void submit_with_cuda_stream(intptr_t user_cuda_stream, std::pair params) { + void (*func)(void*) = (void (*)(void*))params.first; + void* args = (void*)params.second; + *((CPUInfer**)args) = this; + cudaLaunchHostFunc((cudaStream_t)user_cuda_stream, (cudaHostFn_t)func, args); + } + + static void sync_(void* cpu_infer_ptr) { + CPUInfer* cpuinfer = (CPUInfer*)cpu_infer_ptr; + cpuinfer->sync(); + } + + void sync_with_cuda_stream(intptr_t user_cuda_stream) { + cudaLaunchHostFunc((cudaStream_t)user_cuda_stream, (cudaHostFn_t)&sync_, (void*)this); + } + public: Backend* backend_; TaskQueue* task_queue_; diff --git a/ktransformers/ktransformers_ext/examples/test_linear.py b/ktransformers/ktransformers_ext/examples/test_linear.py index 6cb8d0c..7a331db 100644 --- a/ktransformers/ktransformers_ext/examples/test_linear.py +++ b/ktransformers/ktransformers_ext/examples/test_linear.py @@ -6,7 +6,7 @@ Author : chenht2022 Date : 2024-07-25 10:32:05 Version : 1.0.0 LastEditors : chenht2022 -LastEditTime : 2024-07-25 10:34:00 +LastEditTime : 2024-08-06 10:36:59 Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
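The `submit_with_cuda_stream` / `sync_with_cuda_stream` entry points added to cpuinfer.h rely on `cudaLaunchHostFunc`, so CPU-side work can be ordered inside a CUDA stream: the launched host function only pushes the packed task onto the task queue, while the sync callback waits for the queue to drain before later work on that stream proceeds. A hedged sketch of how a caller might drive this from PyTorch, reusing the `cpu_infer`, `linear`, `x`, and `y` objects from the earlier sketch; `stream.cuda_stream` is the raw `cudaStream_t` handle exposed by PyTorch.

```python
# Sketch only: ordering the CPU linear task inside a CUDA stream.
stream = torch.cuda.current_stream()
with torch.cuda.stream(stream):
    # ... GPU work that produces the CPU-visible contents of `x` would go here ...
    cpu_infer.submit_with_cuda_stream(
        stream.cuda_stream,
        linear.forward(qlen, x.data_ptr(), y.data_ptr()))
    # Work enqueued on this stream after the sync callback runs only once the
    # CPU task queue has drained, so `y` is complete by then.
    cpu_infer.sync_with_cuda_stream(stream.cuda_stream)
torch.cuda.synchronize()
```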
''' import os, sys @@ -15,23 +15,23 @@ sys.path.append(os.path.dirname(__file__) + '/../build') import cpuinfer_ext import torch -with torch.inference_mode(mode=True): - input_size = 16384 - output_size = 5120 - stride = 32 - proj_type = 1 # ggml_type::GGML_TYPE_F16 - hidden_type = 1 # ggml_type::GGML_TYPE_F16 - layer_num = 10 - CPUInfer = cpuinfer_ext.CPUInfer(48) - validation_iter = 100 - warm_up_iter = 1000 - test_iter = 10000 +input_size = 16384 +output_size = 5120 +stride = 32 +group_max_len = 1024 +proj_type = 1 # ggml_type::GGML_TYPE_F16 +hidden_type = 1 # ggml_type::GGML_TYPE_F16 +qlen = 30 +layer_num = 10 +CPUInfer = cpuinfer_ext.CPUInfer(48) +validation_iter = 100 +with torch.inference_mode(mode=True): linears = [] projs = [] for _ in range(layer_num): proj = torch.randn((output_size, input_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous() - config = cpuinfer_ext.linear.LinearConfig(input_size, output_size, stride, proj.data_ptr(), proj_type, hidden_type) + config = cpuinfer_ext.linear.LinearConfig(input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type) linear = cpuinfer_ext.linear.Linear(config) projs.append(proj) linears.append(linear) @@ -39,11 +39,17 @@ with torch.inference_mode(mode=True): # validation for i in range(validation_iter): linear = linears[i % layer_num] - input = torch.randn((1, input_size), dtype=torch.float16).contiguous() - output = torch.empty((1, output_size), dtype=torch.float16).contiguous() + input = torch.randn((qlen, input_size), dtype=torch.float16).contiguous() + output = torch.empty((qlen, output_size), dtype=torch.float16).contiguous() input = input / 100 - CPUInfer.submit(linear.forward, input.data_ptr(), output.data_ptr()) + CPUInfer.submit( + linear.forward( + qlen, + input.data_ptr(), + output.data_ptr() + ) + ) CPUInfer.sync() # print('cpuinfer output', output) @@ -54,30 +60,3 @@ with torch.inference_mode(mode=True): diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output)) print('diff = ', diff) assert(diff < 0.001) - - # warm up - for i in range(warm_up_iter): - linear = linears[i % layer_num] - input = torch.randn((1, input_size), dtype=torch.float16).contiguous() - output = torch.empty((1, output_size), dtype=torch.float16).contiguous() - input = input / 100 - CPUInfer.submit(linear.forward, input.data_ptr(), output.data_ptr()) - CPUInfer.sync() - - # test - total_time = 0 - for i in range(test_iter): - linear = linears[i % layer_num] - input = torch.randn((1, input_size), dtype=torch.float16).contiguous() - output = torch.empty((1, output_size), dtype=torch.float16).contiguous() - input = input / 100 - start = time.perf_counter() - CPUInfer.submit(linear.forward, input.data_ptr(), output.data_ptr()) - CPUInfer.sync() - end = time.perf_counter() - total_time += end - start - print('Time: ', total_time) - print('Iteration: ', test_iter) - print('Time per iteration: ', total_time / test_iter) - print('Bandwidth: ', input_size * output_size * 2 * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s') - print("All tasks completed.") \ No newline at end of file diff --git a/ktransformers/ktransformers_ext/examples/test_mlp.py b/ktransformers/ktransformers_ext/examples/test_mlp.py index d965877..9805e72 100644 --- a/ktransformers/ktransformers_ext/examples/test_mlp.py +++ b/ktransformers/ktransformers_ext/examples/test_mlp.py @@ -6,7 +6,7 @@ Author : chenht2022 Date : 2024-07-25 10:32:05 Version : 1.0.0 LastEditors : chenht2022 -LastEditTime : 2024-07-25 10:34:03 
+LastEditTime : 2024-08-06 10:37:28 Copyright (c) 2024 by KVCache.AI, All Rights Reserved. ''' import os, sys @@ -15,20 +15,30 @@ sys.path.append(os.path.dirname(__file__) + '/../build') import cpuinfer_ext import torch -with torch.inference_mode(mode=True): - hidden_size = 5120 - intermediate_size = 3072 - stride = 32 - gate_type = 1 # ggml_type::GGML_TYPE_F16 - up_type = 1 # ggml_type::GGML_TYPE_F16 - down_type = 1 # ggml_type::GGML_TYPE_F16 - hidden_type = 1 # ggml_type::GGML_TYPE_F16 - layer_num = 10 - CPUInfer = cpuinfer_ext.CPUInfer(48) - validation_iter = 100 - warm_up_iter = 1000 - test_iter = 10000 +hidden_size = 5120 +intermediate_size = 3072 +stride = 32 +group_max_len = 1024 +gate_type = 1 # ggml_type::GGML_TYPE_F16 +up_type = 1 # ggml_type::GGML_TYPE_F16 +down_type = 1 # ggml_type::GGML_TYPE_F16 +hidden_type = 1 # ggml_type::GGML_TYPE_F16 +qlen = 30 +layer_num = 10 +CPUInfer = cpuinfer_ext.CPUInfer(48) +validation_iter = 100 +def act_fn(x): + return x / (1.0 + torch.exp(-x)) + +def mlp_torch(input, gate_proj, up_proj, down_proj): + gate_buf = torch.mm(input, gate_proj.t()) + up_buf = torch.mm(input, up_proj.t()) + intermediate = act_fn(gate_buf) * up_buf + ret = torch.mm(intermediate, down_proj.t()) + return ret + +with torch.inference_mode(mode=True): mlps = [] gate_projs = [] up_projs = [] @@ -37,7 +47,7 @@ with torch.inference_mode(mode=True): gate_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous() up_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous() down_proj = torch.randn((hidden_size, intermediate_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous() - config = cpuinfer_ext.mlp.MLPConfig(hidden_size, intermediate_size, stride, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type) + config = cpuinfer_ext.mlp.MLPConfig(hidden_size, intermediate_size, stride, group_max_len, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type) mlp = cpuinfer_ext.mlp.MLP(config) gate_projs.append(gate_proj) up_projs.append(up_proj) @@ -47,52 +57,26 @@ with torch.inference_mode(mode=True): # validation for i in range(validation_iter): mlp = mlps[i % layer_num] - input = torch.randn((1, hidden_size), dtype=torch.float16).contiguous() - output = torch.empty((1, hidden_size), dtype=torch.float16).contiguous() + input = torch.randn((qlen, hidden_size), dtype=torch.float16).contiguous() + output = torch.empty((qlen, hidden_size), dtype=torch.float16).contiguous() input = input / 100 - CPUInfer.submit(mlp.forward, input.data_ptr(), output.data_ptr()) + CPUInfer.submit( + mlp.forward( + qlen, + input.data_ptr(), + output.data_ptr() + ) + ) CPUInfer.sync() # print('cpuinfer output', output) - def act_fn(x): - return x / (1.0 + torch.exp(-x)) gate_proj = gate_projs[i%layer_num] up_proj = up_projs[i%layer_num] down_proj = down_projs[i%layer_num] - gate_buf = torch.mm(input, gate_proj.t()) - up_buf = torch.mm(input, up_proj.t()) - intermediate = act_fn(gate_buf) * up_buf - t_output = torch.mm(intermediate, down_proj.t()) + t_output = mlp_torch(input, gate_proj, up_proj, down_proj) # print('torch output', t_output) diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output)) print('diff = ', diff) assert(diff < 0.001) - - # warm up - for i in range(warm_up_iter): - mlp = mlps[i % layer_num] - input = torch.randn((1, hidden_size), 
dtype=torch.float16).contiguous() - output = torch.empty((1, hidden_size), dtype=torch.float16).contiguous() - input = input / 100 - CPUInfer.submit(mlp.forward, input.data_ptr(), output.data_ptr()) - CPUInfer.sync() - - # test - total_time = 0 - for i in range(test_iter): - mlp = mlps[i % layer_num] - input = torch.randn((1, hidden_size), dtype=torch.float16).contiguous() - output = torch.empty((1, hidden_size), dtype=torch.float16).contiguous() - input = input / 100 - start = time.time() - CPUInfer.submit(mlp.forward, input.data_ptr(), output.data_ptr()) - CPUInfer.sync() - end = time.time() - total_time += end - start - print('Time: ', total_time) - print('Iteration: ', test_iter) - print('Time per iteration: ', total_time / test_iter) - print('Bandwidth: ', hidden_size * intermediate_size * 3 * 2 * test_iter / total_time / 1024 / 1024 / 1024, 'GB/s') - print("All tasks completed.") \ No newline at end of file diff --git a/ktransformers/ktransformers_ext/examples/test_moe.py b/ktransformers/ktransformers_ext/examples/test_moe.py index 0597811..3fa4dbd 100644 --- a/ktransformers/ktransformers_ext/examples/test_moe.py +++ b/ktransformers/ktransformers_ext/examples/test_moe.py @@ -6,7 +6,7 @@ Author : chenht2022 Date : 2024-07-25 10:32:05 Version : 1.0.0 LastEditors : chenht2022 -LastEditTime : 2024-07-25 10:34:06 +LastEditTime : 2024-08-06 10:38:05 Copyright (c) 2024 by KVCache.AI, All Rights Reserved. ''' import os, sys @@ -15,25 +15,64 @@ sys.path.append(os.path.dirname(__file__) + '/../build') import cpuinfer_ext import torch -with torch.inference_mode(mode=True): - expert_num = 10 - hidden_size = 5120 - intermediate_size = 1536 - stride = 32 - group_min_len = 10 - group_max_len = 1024 - gate_type = 1 # ggml_type::GGML_TYPE_F16 - up_type = 1 # ggml_type::GGML_TYPE_F16 - down_type = 1 # ggml_type::GGML_TYPE_F16 - hidden_type = 1 # ggml_type::GGML_TYPE_F16 - n_routed_experts = 6 - qlen = 30 - layer_num = 10 - CPUInfer = cpuinfer_ext.CPUInfer(48) - validation_iter = 100 - warm_up_iter = 1000 - test_iter = 10000 +expert_num = 160 +hidden_size = 5120 +intermediate_size = 1536 +stride = 32 +group_min_len = 10 +group_max_len = 1024 +gate_type = 1 # ggml_type::GGML_TYPE_F16 +up_type = 1 # ggml_type::GGML_TYPE_F16 +down_type = 1 # ggml_type::GGML_TYPE_F16 +hidden_type = 1 # ggml_type::GGML_TYPE_F16 +n_routed_experts = 6 +qlen = 30 +layer_num = 10 +CPUInfer = cpuinfer_ext.CPUInfer(48) +validation_iter = 100 +def act_fn(x): + return x / (1.0 + torch.exp(-x)) + +def mlp_torch(input, gate_proj, up_proj, down_proj): + gate_buf = torch.mm(input, gate_proj.t()) + up_buf = torch.mm(input, up_proj.t()) + intermediate = act_fn(gate_buf) * up_buf + ret = torch.mm(intermediate, down_proj.t()) + return ret + +def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj): + cnts = expert_ids.new_zeros((expert_ids.shape[0], expert_num)) + cnts.scatter_(1, expert_ids, 1) + tokens_per_expert = cnts.sum(dim=0) + idxs = expert_ids.view(-1).argsort() + sorted_tokens = input[idxs // expert_ids.shape[1]] + + outputs = [] + start_idx = 0 + for i, num_tokens in enumerate(tokens_per_expert): + end_idx = start_idx + num_tokens + if num_tokens == 0: + continue + tokens_for_this_expert = sorted_tokens[start_idx:end_idx] + expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i]) + outputs.append(expert_out) + start_idx = end_idx + + outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0) + + new_x = torch.empty_like(outs) + new_x[idxs] = outs + t_output = ( 
+ new_x.view(*expert_ids.shape, -1) + .type(weights.dtype) + .mul_(weights.unsqueeze(dim=-1)) + .sum(dim=1) + .type(new_x.dtype) + ) + return t_output + +with torch.inference_mode(mode=True): moes = [] gate_projs = [] up_projs = [] @@ -51,63 +90,32 @@ with torch.inference_mode(mode=True): # validation for i in range(validation_iter): - moe = moes[i % layer_num] - expert_ids = torch.randint(0, expert_num, (qlen, n_routed_experts), dtype=torch.int64).contiguous() + expert_ids = torch.stack([torch.randperm(expert_num)[:n_routed_experts] for _ in range(qlen)]).contiguous() weights = torch.rand((qlen, n_routed_experts), dtype=torch.float32).contiguous() - input = torch.randn((qlen, 1, hidden_size), dtype=torch.float16).contiguous() - output = torch.empty((qlen, 1, hidden_size), dtype=torch.float16).contiguous() + input = torch.randn((qlen, hidden_size), dtype=torch.float16).contiguous() + output = torch.empty((qlen, hidden_size), dtype=torch.float16).contiguous() input = input / 100 - CPUInfer.submit(moe.forward, qlen, n_routed_experts, expert_ids.data_ptr(), weights.data_ptr(), input.data_ptr(), output.data_ptr()) + moe = moes[i % layer_num] + CPUInfer.submit( + moe.forward( + qlen, + n_routed_experts, + expert_ids.data_ptr(), + weights.data_ptr(), + input.data_ptr(), + output.data_ptr() + ) + ) CPUInfer.sync() # print('cpuinfer output', output) - def act_fn(x): - return x / (1.0 + torch.exp(-x)) - t_output = torch.zeros((qlen, 1, hidden_size), dtype=torch.float32).contiguous() gate_proj = gate_projs[i%layer_num] up_proj = up_projs[i%layer_num] down_proj = down_projs[i%layer_num] - for token_idx in range(qlen): - for i, expert_id in enumerate(expert_ids[token_idx]): - gate_buf = torch.mm(input[token_idx], gate_proj[expert_id].t()) - up_buf = torch.mm(input[token_idx], up_proj[expert_id].t()) - intermediate = act_fn(gate_buf) * up_buf - expert_output = torch.mm(intermediate, down_proj[expert_id].t()) - t_output[token_idx] += weights[token_idx][i] * expert_output + t_output = moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj) # print('torch output', t_output) diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output)) print('diff = ', diff) assert(diff < 0.001) - - # warm up - for i in range(warm_up_iter): - moe = moes[i % layer_num] - expert_ids = torch.randint(0, expert_num, (qlen, n_routed_experts), dtype=torch.int64).contiguous() - weights = torch.rand((qlen, n_routed_experts), dtype=torch.float32).contiguous() - input = torch.randn((qlen, hidden_size), dtype=torch.float16).contiguous() - output = torch.empty((qlen, hidden_size), dtype=torch.float16).contiguous() - input = input / 100 - CPUInfer.submit(moe.forward, qlen, n_routed_experts, expert_ids.data_ptr(), weights.data_ptr(), input.data_ptr(), output.data_ptr()) - CPUInfer.sync() - - # test - total_time = 0 - for i in range(test_iter): - moe = moes[i % layer_num] - expert_ids = torch.randint(0, expert_num, (qlen, n_routed_experts), dtype=torch.int64).contiguous() - weights = torch.rand((qlen, n_routed_experts), dtype=torch.float32).contiguous() - input = torch.randn((qlen, hidden_size), dtype=torch.float16).contiguous() - output = torch.empty((qlen, hidden_size), dtype=torch.float16).contiguous() - input = input / 100 - start = time.perf_counter() - CPUInfer.submit(moe.forward, qlen, n_routed_experts, expert_ids.data_ptr(), weights.data_ptr(), input.data_ptr(), output.data_ptr()) - CPUInfer.sync() - end = time.perf_counter() - total_time += end - start - print('Time: ', total_time) - 
print('Iteration: ', test_iter) - print('Time per iteration: ', total_time / test_iter) - print('Bandwidth: ', hidden_size * intermediate_size * 3 * n_routed_experts * 2 * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s') - print("All tasks completed.") \ No newline at end of file diff --git a/ktransformers/ktransformers_ext/ext_bindings.cpp b/ktransformers/ktransformers_ext/ext_bindings.cpp index 0aeead3..ef30037 100644 --- a/ktransformers/ktransformers_ext/ext_bindings.cpp +++ b/ktransformers/ktransformers_ext/ext_bindings.cpp @@ -3,8 +3,8 @@ * @Author : chenht2022 * @Date : 2024-07-22 02:03:22 * @Version : 1.0.0 - * @LastEditors : chenht2022 - * @LastEditTime : 2024-07-25 10:34:23 + * @LastEditors : chenht2022 + * @LastEditTime : 2024-08-07 10:39:37 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. **/ // Python bindings @@ -12,7 +12,6 @@ #include #include #include "cpu_backend/cpuinfer.h" -#include "cuda_runtime.h" #include "device_launch_parameters.h" #include "llamafile/flags.h" #include "operators/llamafile/linear.h" @@ -26,239 +25,155 @@ namespace py = pybind11; using namespace pybind11::literals; -// Binding functions for the Linear class class LinearBindings { public: - static void bind_forward(CPUInfer& cpuinfer, Linear* linear, py::args args, py::kwargs kwargs) { - auto input = args[0].cast(); - auto output = args[1].cast(); - cpuinfer.submit(&Linear::forward, linear, - (const void*)input, (void*)output); - } - - static void bind_warm_up(CPUInfer& cpuinfer, Linear* linear, py::args args, py::kwargs kwargs) { - cpuinfer.submit(&Linear::warm_up, linear); - } - - static void bind_functions(CPUInfer& cpuinfer, py::object func, py::args args, py::kwargs kwargs) { - auto linear = func.attr("__self__").cast(); - std::string func_name = py::str(func.attr("__func__").attr("__name__")); - - if (func_name == "forward") { - bind_forward(cpuinfer, linear, args, kwargs); - } else if (func_name == "warm_up") { - bind_warm_up(cpuinfer, linear, args, kwargs); - } else { - throw py::value_error("Unsupported function: " + - std::string(func_name)); + class WarmUpBindinds { + public: + struct Args { + CPUInfer* cpuinfer; + Linear* linear; + }; + static void inner(void* args) { + Args* args_ = (Args*)args; + args_->cpuinfer->enqueue(&Linear::warm_up, args_->linear); } - } + static std::pair interface(Linear& linear) { + Args* args = new Args{nullptr, &linear}; + return std::make_pair((intptr_t)&inner, (intptr_t)args); + } + }; + class ForwardBindings { + public: + struct Args { + CPUInfer* cpuinfer; + Linear* linear; + int qlen; + const void* input; + void* output; + }; + static void inner(void* args) { + Args* args_ = (Args*)args; + args_->cpuinfer->enqueue(&Linear::forward, args_->linear, args_->qlen, args_->input, args_->output); + } + static std::pair interface(Linear& linear, int qlen, intptr_t input, intptr_t output) { + Args* args = new Args{nullptr, &linear, qlen, (const void*)input, (void*)output}; + return std::make_pair((intptr_t)&inner, (intptr_t)args); + } + }; }; -// Binding functions for the MLP class class MLPBindings { public: - static void bind_forward(CPUInfer& cpuinfer, MLP* mlp, py::args args, py::kwargs kwargs) { - auto input = args[0].cast(); - auto output = args[1].cast(); - cpuinfer.submit(&MLP::forward, mlp, - (const void*)input, (void*)output); - } - - static void bind_warm_up(CPUInfer& cpuinfer, MLP* mlp, py::args args, py::kwargs kwargs) { - cpuinfer.submit(&MLP::warm_up, mlp); - } - - static void bind_functions(CPUInfer& cpuinfer, py::object func, py::args 
args, py::kwargs kwargs) { - auto mlp = func.attr("__self__").cast(); - std::string func_name = py::str(func.attr("__func__").attr("__name__")); - - if (func_name == "forward") { - bind_forward(cpuinfer, mlp, args, kwargs); - } else if (func_name == "warm_up") { - bind_warm_up(cpuinfer, mlp, args, kwargs); - } else { - throw py::value_error("Unsupported function: " + - std::string(func_name)); + class WarmUpBindinds { + public: + struct Args { + CPUInfer* cpuinfer; + MLP* mlp; + }; + static void inner(void* args) { + Args* args_ = (Args*)args; + args_->cpuinfer->enqueue(&MLP::warm_up, args_->mlp); } - } + static std::pair interface(MLP& mlp) { + Args* args = new Args{nullptr, &mlp}; + return std::make_pair((intptr_t)&inner, (intptr_t)args); + } + }; + class ForwardBindings { + public: + struct Args { + CPUInfer* cpuinfer; + MLP* mlp; + int qlen; + const void* input; + void* output; + }; + static void inner(void* args) { + Args* args_ = (Args*)args; + args_->cpuinfer->enqueue(&MLP::forward, args_->mlp, args_->qlen, args_->input, args_->output); + } + static std::pair interface(MLP& mlp, int qlen, intptr_t input, intptr_t output) { + Args* args = new Args{nullptr, &mlp, qlen, (const void*)input, (void*)output}; + return std::make_pair((intptr_t)&inner, (intptr_t)args); + } + }; }; -// Binding functions for the MOE class class MOEBindings { public: - static void bind_forward(CPUInfer& cpuinfer, MOE* moe, py::args args, py::kwargs kwargs) { - int qlen = args[0].cast(); - int k = args[1].cast(); - auto expert_ids = args[2].cast(); - auto weights = args[3].cast(); - auto input = args[4].cast(); - auto output = args[5].cast(); - cpuinfer.submit(&MOE::forward, moe, - qlen, k, (const uint64_t*)expert_ids, (const float*)weights, (const void*)input, (void*)output); - } - - static void bind_warm_up(CPUInfer& cpuinfer, MOE* moe, py::args args, py::kwargs kwargs) { - cpuinfer.submit(&MOE::warm_up, moe); - } - - static void bind_functions(CPUInfer& cpuinfer, py::object func, py::args args, py::kwargs kwargs) { - auto moe = func.attr("__self__").cast(); - std::string func_name = py::str(func.attr("__func__").attr("__name__")); - - if (func_name == "forward") { - bind_forward(cpuinfer, moe, args, kwargs); - } else if (func_name == "warm_up") { - bind_warm_up(cpuinfer, moe, args, kwargs); - } else { - throw py::value_error("Unsupported function: " + - std::string(func_name)); + class WarmUpBindinds { + public: + struct Args { + CPUInfer* cpuinfer; + MOE* moe; + }; + static void inner(void* args) { + Args* args_ = (Args*)args; + args_->cpuinfer->enqueue(&MOE::warm_up, args_->moe); } - } + static std::pair interface(MOE& moe) { + Args* args = new Args{nullptr, &moe}; + return std::make_pair((intptr_t)&inner, (intptr_t)args); + } + }; + class ForwardBindings { + public: + struct Args { + CPUInfer* cpuinfer; + MOE* moe; + int qlen; + int k; + const uint64_t* expert_ids; + const float* weights; + const void* input; + void* output; + }; + static void inner(void* args) { + Args* args_ = (Args*)args; + args_->cpuinfer->enqueue(&MOE::forward, args_->moe, args_->qlen, args_->k, args_->expert_ids, args_->weights, args_->input, args_->output); + } + static std::pair interface(MOE& moe, int qlen, int k, intptr_t expert_ids, intptr_t weights, intptr_t input, intptr_t output) { + Args* args = new Args{nullptr, &moe, qlen, k, (const uint64_t*)expert_ids, (const float*)weights, (const void*)input, (void*)output}; + return std::make_pair((intptr_t)&inner, (intptr_t)args); + } + }; }; -struct MOEForwardArgs { - CPUInfer* 
cpuinfer; - MOE* moe; - int qlen; - int k; - uint64_t* expert_ids; - float* weights; - void* input; - void* output; -}; - -void submit_moe_forward_with_host_args_ptr(void* host_args_ptr) { - MOEForwardArgs* host_args = (MOEForwardArgs*)host_args_ptr; - host_args->cpuinfer->submit(&MOE::forward, host_args->moe, - host_args->qlen, host_args->k, host_args->expert_ids, host_args->weights, host_args->input, host_args->output); -} - -void cpuinfer_sync(void* host_args_ptr) { - CPUInfer* cpuinfer = (CPUInfer*)host_args_ptr; - cpuinfer->sync(); -} - PYBIND11_MODULE(cpuinfer_ext, m) { + py::class_(m, "CPUInfer") + .def(py::init()) + .def("submit", &CPUInfer::submit) + .def("submit_with_cuda_stream", &CPUInfer::submit_with_cuda_stream) + .def("sync", &CPUInfer::sync) + .def("sync_with_cuda_stream", &CPUInfer::sync_with_cuda_stream); + auto linear_module = m.def_submodule("linear"); - py::class_(linear_module, "LinearConfig") - .def(py::init([](int hidden_size, int intermediate_size, int stride, intptr_t proj, int proj_type, int hidden_type) { - return LinearConfig(hidden_size, intermediate_size, stride, (void*)proj, (ggml_type)proj_type, (ggml_type)hidden_type); + .def(py::init([](int hidden_size, int intermediate_size, int stride, int group_max_len, intptr_t proj, int proj_type, int hidden_type) { + return LinearConfig(hidden_size, intermediate_size, stride, group_max_len, (void*)proj, (ggml_type)proj_type, (ggml_type)hidden_type); })); - py::class_(linear_module, "Linear") .def(py::init()) - .def("warm_up", [](Linear& linear) { - throw std::runtime_error("!!! Doing nothing, please use CPUInfer.submit to call it!!!\n"); - }) - .def("forward", [](Linear& linear, intptr_t input, intptr_t output) { - throw std::runtime_error("!!! Doing nothing, please use CPUInfer.submit to call it!!!\n"); - }); + .def("warm_up", &LinearBindings::WarmUpBindinds::interface) + .def("forward", &LinearBindings::ForwardBindings::interface); auto mlp_module = m.def_submodule("mlp"); - py::class_(mlp_module, "MLPConfig") - .def(py::init([](int hidden_size, int intermediate_size, int stride, intptr_t gate_proj, intptr_t up_proj, intptr_t down_proj, int gate_type, int up_type, int down_type, int hidden_type) { - return MLPConfig(hidden_size, intermediate_size, stride, (void*)gate_proj, (void*)up_proj, (void*)down_proj, (ggml_type)gate_type, (ggml_type)up_type, (ggml_type)down_type, (ggml_type)hidden_type); + .def(py::init([](int hidden_size, int intermediate_size, int stride, int group_max_len, intptr_t gate_proj, intptr_t up_proj, intptr_t down_proj, int gate_type, int up_type, int down_type, int hidden_type) { + return MLPConfig(hidden_size, intermediate_size, stride, group_max_len, (void*)gate_proj, (void*)up_proj, (void*)down_proj, (ggml_type)gate_type, (ggml_type)up_type, (ggml_type)down_type, (ggml_type)hidden_type); })); - py::class_(mlp_module, "MLP") .def(py::init()) - .def("warm_up", [](MLP& mlp) { - throw std::runtime_error("!!! Doing nothing, please use CPUInfer.submit to call it!!!\n"); - }) - .def("forward", [](MLP& mlp, intptr_t input, intptr_t output) { - throw std::runtime_error("!!! 
Doing nothing, please use CPUInfer.submit to call it!!!\n"); - }); + .def("warm_up", &MLPBindings::WarmUpBindinds::interface) + .def("forward", &MLPBindings::ForwardBindings::interface); auto moe_module = m.def_submodule("moe"); - py::class_(moe_module, "MOEConfig") .def(py::init([](int expert_num, int routed_expert_num, int hidden_size, int intermediate_size, int stride, int group_min_len, int group_max_len, intptr_t gate_proj, intptr_t up_proj, intptr_t down_proj, int gate_type, int up_type, int down_type, int hidden_type) { return MOEConfig(expert_num, routed_expert_num, hidden_size, intermediate_size, stride, group_min_len, group_max_len, (void*)gate_proj, (void*)up_proj, (void*)down_proj, (ggml_type)gate_type, (ggml_type)up_type, (ggml_type)down_type, (ggml_type)hidden_type); })); - py::class_(moe_module, "MOE") .def(py::init()) - .def("warm_up", [](MOE& moe) { - throw std::runtime_error("!!! Doing nothing, please use CPUInfer.submit to call it!!!\n"); - }) - .def("forward", [](MOE& moe, int k, uint64_t expert_ids, intptr_t weights, intptr_t input, intptr_t output) { - throw std::runtime_error("!!! Doing nothing, please use CPUInfer.submit to call it!!!\n"); - }); - - py::class_(m, "CPUInfer") - .def(py::init()) - .def("submit", - [linear_module, mlp_module, moe_module](CPUInfer& cpuinfer, py::object func, py::args args, py::kwargs kwargs) { - if (py::hasattr(func, "__self__") && - py::hasattr(func, "__func__")) { - std::string class_name = py::str(func.attr("__self__") - .attr("__class__") - .attr("__name__")); - if (class_name == "Linear") { - LinearBindings::bind_functions(cpuinfer, func, - args, kwargs); - } else if (class_name == "MLP") { - MLPBindings::bind_functions(cpuinfer, func, - args, kwargs); - } else if (class_name == "MOE") { - MOEBindings::bind_functions(cpuinfer, func, - args, kwargs); - } else { - // handle other classes - throw py::type_error("Unsupported class type: " + - class_name); - } - } else { - // handle cases where func does not have __self__ or - // __func__ - throw py::type_error( - "Invalid function object: missing " - "__self__ or __func__ attribute."); - } - }) - .def("submit_with_cuda_stream", - [linear_module, mlp_module, moe_module](CPUInfer& cpuinfer, intptr_t user_cuda_stream, py::object func, py::args args, py::kwargs kwargs) { - if (py::hasattr(func, "__self__") && - py::hasattr(func, "__func__")) { - std::string class_name = py::str(func.attr("__self__") - .attr("__class__") - .attr("__name__")); - if (class_name == "MOE") { - std::string func_name = py::str(func.attr("__func__").attr("__name__")); - if (func_name == "forward") { - auto moe = func.attr("__self__").cast(); - int qlen = args[0].cast(); - int k = args[1].cast(); - auto expert_ids = args[2].cast(); - auto weights = args[3].cast(); - auto input = args[4].cast(); - auto output = args[5].cast(); - MOEForwardArgs* moe_forward_args = new MOEForwardArgs{&cpuinfer, moe, qlen, k, (uint64_t*)expert_ids, (float*)weights, (void*)input, (void*)output}; - // submit_moe_forward_with_host_args_ptr(moe_forward_args); - cudaLaunchHostFunc((cudaStream_t)user_cuda_stream, (cudaHostFn_t)submit_moe_forward_with_host_args_ptr, moe_forward_args); - } else { - throw py::value_error("Unsupported function: " + - std::string(func_name)); - } - } else { - // handle other classes - throw py::type_error("Unsupported class type: " + - class_name); - } - } else { - // handle cases where func does not have __self__ or - // __func__ - throw py::type_error( - "Invalid function object: missing " - "__self__ or 
__func__ attribute."); - } - }) - .def("sync_with_cuda_stream", [](CPUInfer& cpuinfer, intptr_t user_cuda_stream) { - // cpuinfer_sync((void*)(&cpuinfer)); - cudaLaunchHostFunc((cudaStream_t)user_cuda_stream, (cudaHostFn_t)cpuinfer_sync, (void*)(&cpuinfer)); - }) - .def("sync", &CPUInfer::sync); + .def("warm_up", &MOEBindings::WarmUpBindinds::interface) + .def("forward", &MOEBindings::ForwardBindings::interface); } diff --git a/ktransformers/ktransformers_ext/operators/llamafile/linear.cpp b/ktransformers/ktransformers_ext/operators/llamafile/linear.cpp index bf1935e..7dcba57 100644 --- a/ktransformers/ktransformers_ext/operators/llamafile/linear.cpp +++ b/ktransformers/ktransformers_ext/operators/llamafile/linear.cpp @@ -3,7 +3,7 @@ * @Author : chenht2022 * @Date : 2024-07-12 10:07:58 * @Version : 1.0.0 - * @LastEditors : chenht2022 + * @LastEditors : chenht2022 * @LastEditTime : 2024-07-25 10:34:58 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. **/ @@ -13,9 +13,15 @@ Linear::Linear(LinearConfig config) { config_ = config; proj_ = config_.proj; - input_fp32_.resize(config_.input_size); - proj_input_.resize(config_.input_size * 4); - proj_output_.resize(config_.output_size); + std::vector> mem_requests; + mem_requests.push_back({(void**)&input_fp32_, sizeof(float) * config_.group_max_len * config_.input_size}); + mem_requests.push_back({(void**)&proj_input_, config_.group_max_len * config_.input_size * ggml_type_size(ggml_internal_get_type_traits(config_.proj_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.proj_type).vec_dot_type)}); + mem_requests.push_back({(void**)&proj_output_, sizeof(float) * config_.group_max_len * config_.output_size}); + shared_mem_buffer.alloc(this, mem_requests); +} + +Linear::~Linear() { + shared_mem_buffer.dealloc(this); } void Linear::warm_up(Backend* backend) { @@ -26,22 +32,42 @@ void Linear::warm_up(Backend* backend) { input_fp32[i] = 0; } from_float(input_fp32.data(), input.data(), config_.input_size, config_.hidden_type); - forward(input.data(), output.data(), backend); + forward_many(1, input.data(), output.data(), backend); } -void Linear::forward(const void* input, void* output, Backend* backend) { +void Linear::forward_many(int qlen, const void* input, void* output, Backend* backend) { const void* proj_input_ptr; if (config_.hidden_type == ggml_internal_get_type_traits(config_.proj_type).vec_dot_type) { proj_input_ptr = input; } else { - to_float(input, input_fp32_.data(), config_.input_size, config_.hidden_type); - from_float(input_fp32_.data(), proj_input_.data(), config_.input_size, ggml_internal_get_type_traits(config_.proj_type).vec_dot_type); - proj_input_ptr = proj_input_.data(); + to_float(input, input_fp32_, qlen * config_.input_size, config_.hidden_type); + from_float(input_fp32_, proj_input_, qlen * config_.input_size, ggml_internal_get_type_traits(config_.proj_type).vec_dot_type); + proj_input_ptr = proj_input_; } int nth = config_.output_size / config_.stride; backend->do_work_stealing_job(nth, [&](int task_id) { - int ith = task_id % nth; - llamafile_sgemm(config_.output_size, 1, config_.input_size / ggml_blck_size(config_.proj_type), proj_, config_.input_size / ggml_blck_size(config_.proj_type), proj_input_ptr, config_.input_size / ggml_blck_size(config_.proj_type), proj_output_.data(), config_.output_size, ith, nth, GGML_TASK_TYPE_COMPUTE, config_.proj_type, ggml_internal_get_type_traits(config_.proj_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT); + int ith = task_id; + void* proj_ptr = 
proj_ + ith * config_.stride * config_.input_size * ggml_type_size(config_.proj_type) / ggml_blck_size(config_.proj_type); + float* proj_output_ptr = proj_output_ + ith * config_.stride; + llamafile_sgemm(config_.stride, qlen, config_.input_size / ggml_blck_size(config_.proj_type), proj_ptr, config_.input_size / ggml_blck_size(config_.proj_type), proj_input_ptr, config_.input_size / ggml_blck_size(config_.proj_type), proj_output_ptr, config_.output_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.proj_type, ggml_internal_get_type_traits(config_.proj_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT); + if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) { + for (int i = 0; i < qlen; i++) { + float* output_fp32_ptr = proj_output_ + i * config_.output_size + ith * config_.stride; + void* output_ptr = output + i * config_.output_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type) + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type); + from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type); + } + } }); - from_float(proj_output_.data(), output, config_.output_size, config_.hidden_type); + if (config_.stride % ggml_blck_size(config_.hidden_type) != 0) { + from_float(proj_output_, output, qlen * config_.output_size, config_.hidden_type); + } +} + +void Linear::forward(int qlen, const void* input, void* output, Backend* backend) { + if (qlen <= 0) { + return; + } + int forward_len = std::min(qlen, config_.group_max_len); + forward_many(forward_len, input, output, backend); + forward(qlen - forward_len, input + forward_len * config_.input_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), output + forward_len * config_.output_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend); } \ No newline at end of file diff --git a/ktransformers/ktransformers_ext/operators/llamafile/linear.h b/ktransformers/ktransformers_ext/operators/llamafile/linear.h index 4285551..fd856f9 100644 --- a/ktransformers/ktransformers_ext/operators/llamafile/linear.h +++ b/ktransformers/ktransformers_ext/operators/llamafile/linear.h @@ -3,7 +3,7 @@ * @Author : chenht2022 * @Date : 2024-07-12 10:07:58 * @Version : 1.0.0 - * @LastEditors : chenht2022 + * @LastEditors : chenht2022 * @LastEditTime : 2024-07-25 10:35:00 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
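
The new Linear::forward splits a long request into groups of at most group_max_len rows, runs forward_many on each group, and advances the input/output pointers by whole rows of the hidden type (row bytes = elements * ggml_type_size / ggml_blck_size). The sketch below models only that grouping; the helper name and the bytes-per-row figures are illustrative assumptions, not values taken from ggml.

    # Toy model of the group_max_len chunking in Linear::forward; helper name and
    # byte-per-row figures are illustrative, not taken from ggml.

    def chunk_forward(qlen, group_max_len, in_row_bytes, out_row_bytes):
        # Yield (rows, input_byte_offset, output_byte_offset) per forward_many call.
        in_off = out_off = 0
        while qlen > 0:
            rows = min(qlen, group_max_len)
            yield rows, in_off, out_off
            qlen -= rows
            in_off += rows * in_row_bytes
            out_off += rows * out_row_bytes

    if __name__ == "__main__":
        # e.g. 2500 input rows in groups of <= 1024, assuming bf16 rows
        # (2 bytes/element) for a 4096 -> 11008 projection
        for rows, i_off, o_off in chunk_forward(2500, 1024, 4096 * 2, 11008 * 2):
            print(rows, i_off, o_off)
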
**/ @@ -22,34 +22,38 @@ #include "llama.cpp/ggml-quants.h" #include "llama.cpp/ggml.h" #include "llamafile/sgemm.h" +#include "shared_mem_buffer.h" struct LinearConfig { int input_size; int output_size; int stride; + int group_max_len; void* proj; ggml_type proj_type; ggml_type hidden_type; LinearConfig() {} - LinearConfig(int input_size, int output_size, int stride, void* proj, ggml_type proj_type, ggml_type hidden_type) - : input_size(input_size), output_size(output_size), stride(stride), proj(proj), proj_type(proj_type), hidden_type(hidden_type) {} + LinearConfig(int input_size, int output_size, int stride, int group_max_len, void* proj, ggml_type proj_type, ggml_type hidden_type) + : input_size(input_size), output_size(output_size), stride(stride), group_max_len(group_max_len), proj(proj), proj_type(proj_type), hidden_type(hidden_type) {} }; class Linear { public: Linear(LinearConfig); + ~Linear(); void warm_up(Backend* backend); - void forward(const void* input, void* output, Backend* backend); + void forward_many(int qlen, const void* input, void* output, Backend* backend); + void forward(int qlen, const void* input, void* output, Backend* backend); private: LinearConfig config_; void* proj_; // [output_size * input_size ( /32 if quantized)] - std::vector input_fp32_; // [input_size] - std::vector proj_input_; // [input_size * 4] - std::vector proj_output_; // [output_size] + float* input_fp32_; // [group_max_len * input_size] + uint8_t* proj_input_; // [group_max_len * input_size * ggml_type_size(ggml_internal_get_type_traits(proj_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(proj_type).vec_dot_type)] + float* proj_output_; // [group_max_len * output_size] }; #endif \ No newline at end of file diff --git a/ktransformers/ktransformers_ext/operators/llamafile/mlp.cpp b/ktransformers/ktransformers_ext/operators/llamafile/mlp.cpp index 632c210..8ef092f 100644 --- a/ktransformers/ktransformers_ext/operators/llamafile/mlp.cpp +++ b/ktransformers/ktransformers_ext/operators/llamafile/mlp.cpp @@ -3,7 +3,7 @@ * @Author : chenht2022 * @Date : 2024-07-16 10:43:18 * @Version : 1.0.0 - * @LastEditors : chenht2022 + * @LastEditors : chenht2022 * @LastEditTime : 2024-07-25 10:35:04 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
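
The scratch buffers are now sized in bytes for a whole group: group_max_len * n_elements * ggml_type_size(t) / ggml_blck_size(t), where t is the vec_dot type of the corresponding projection (or plain float32 for the fp32 staging buffers). A small Python restatement of that arithmetic; the helper and the (type_size, blck_size) pairs are illustrative assumptions, the real values come from ggml's type traits.

    def scratch_bytes(group_max_len, n_elements, type_size, blck_size):
        # bytes = group_max_len * n_elements * type_size / blck_size (integer math, as in C++)
        return group_max_len * n_elements * type_size // blck_size

    hidden_size, group_max_len = 4096, 1024
    fp32 = (4, 1)          # plain float32 staging buffer: 4 bytes per element
    q8_like = (34, 32)     # assumed quantized vec_dot type: 32 elements per 34-byte block

    print(scratch_bytes(group_max_len, hidden_size, *fp32))     # input_fp32_-style buffer
    print(scratch_bytes(group_max_len, hidden_size, *q8_like))  # gate_input_/up_input_-style buffer
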
**/ @@ -15,14 +15,20 @@ MLP::MLP(MLPConfig config) { up_proj_ = config_.up_proj; down_proj_ = config_.down_proj; - input_fp32_.resize(config_.hidden_size); - gate_input_.resize(config_.hidden_size * 4); - up_input_.resize(config_.hidden_size * 4); - gate_output_.resize(config_.intermediate_size); - up_output_.resize(config_.intermediate_size); - intermediate_fp32_.resize(config_.intermediate_size); - down_input_.resize(config_.intermediate_size * 4); - down_output_.resize(config_.hidden_size); + std::vector> mem_requests; + mem_requests.push_back({(void**)&input_fp32_, sizeof(float) * config_.group_max_len * config_.hidden_size}); + mem_requests.push_back({(void**)&gate_input_, config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type)}); + mem_requests.push_back({(void**)&up_input_, config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type)}); + mem_requests.push_back({(void**)&gate_output_, sizeof(float) * config_.group_max_len * config_.intermediate_size}); + mem_requests.push_back({(void**)&up_output_, sizeof(float) * config_.group_max_len * config_.intermediate_size}); + mem_requests.push_back({(void**)&intermediate_fp32_, sizeof(float) * config_.group_max_len * config_.intermediate_size}); + mem_requests.push_back({(void**)&down_input_, config_.group_max_len * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type)}); + mem_requests.push_back({(void**)&down_output_, sizeof(float) * config_.group_max_len * config_.hidden_size}); + shared_mem_buffer.alloc(this, mem_requests); +} + +MLP::~MLP() { + shared_mem_buffer.dealloc(this); } void MLP::warm_up(Backend* backend) { @@ -33,33 +39,33 @@ void MLP::warm_up(Backend* backend) { input_fp32[i] = 0; } from_float(input_fp32.data(), input.data(), config_.hidden_size, config_.hidden_type); - forward(input.data(), output.data(), backend); + forward_many(1, input.data(), output.data(), backend); } static float act_fn(float x) { return x / (1.0f + expf(-x)); } -void MLP::forward(const void* input, void* output, Backend* backend) { +void MLP::forward_many(int qlen, const void* input, void* output, Backend* backend) { const void* gate_input_ptr; const void* up_input_ptr; if (config_.hidden_type == ggml_internal_get_type_traits(config_.gate_type).vec_dot_type && config_.hidden_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) { gate_input_ptr = up_input_ptr = input; } else { - to_float(input, input_fp32_.data(), config_.hidden_size, config_.hidden_type); + to_float(input, input_fp32_, qlen * config_.hidden_size, config_.hidden_type); if (ggml_internal_get_type_traits(config_.gate_type).vec_dot_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) { - from_float(input_fp32_.data(), gate_input_.data(), config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type); - gate_input_ptr = up_input_ptr = gate_input_.data(); + from_float(input_fp32_, gate_input_, qlen * config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type); + gate_input_ptr = up_input_ptr = gate_input_; } else { if (config_.hidden_type != ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) 
{ - from_float(input_fp32_.data(), gate_input_.data(), config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type); - gate_input_ptr = gate_input_.data(); + from_float(input_fp32_, gate_input_, qlen * config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type); + gate_input_ptr = gate_input_; } else { gate_input_ptr = input; } if (config_.hidden_type != ggml_internal_get_type_traits(config_.up_type).vec_dot_type) { - from_float(input_fp32_.data(), up_input_.data(), config_.hidden_size, ggml_internal_get_type_traits(config_.up_type).vec_dot_type); - up_input_ptr = up_input_.data(); + from_float(input_fp32_, up_input_, qlen * config_.hidden_size, ggml_internal_get_type_traits(config_.up_type).vec_dot_type); + up_input_ptr = up_input_; } else { up_input_ptr = input; } @@ -69,35 +75,49 @@ void MLP::forward(const void* input, void* output, Backend* backend) { backend->do_work_stealing_job(nth, [&](int task_id) { int ith = task_id; void* gate_proj_ptr = gate_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type); - float* gate_output_ptr = gate_output_.data() + ith * config_.stride; - llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT); + float* gate_output_ptr = gate_output_ + ith * config_.stride; + llamafile_sgemm(config_.stride, qlen, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT); void* up_proj_ptr = up_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type); - float* up_output_ptr = up_output_.data() + ith * config_.stride; - llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT); - for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) { - intermediate_fp32_[i] = act_fn(gate_output_[i]) * up_output_[i]; - } - if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) == 0) { - float* intermediate_fp32_ptr = intermediate_fp32_.data() + ith * config_.stride; - void* down_input_ptr = down_input_.data() + ith * config_.stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type); - from_float(intermediate_fp32_ptr, down_input_ptr, config_.stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type); + float* up_output_ptr = up_output_ + ith * config_.stride; + llamafile_sgemm(config_.stride, qlen, 
config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT); + for (int i = 0; i < qlen; i++) { + for (int j = ith * config_.stride; j < (ith + 1) * config_.stride; j++) { + intermediate_fp32_[i * config_.intermediate_size + j] = act_fn(gate_output_[i * config_.intermediate_size + j]) * up_output_[i * config_.intermediate_size + j]; + } + if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) == 0) { + float* intermediate_fp32_ptr = intermediate_fp32_ + i * config_.intermediate_size + ith * config_.stride; + void* down_input_ptr = down_input_ + i * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) + ith * config_.stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type); + from_float(intermediate_fp32_ptr, down_input_ptr, config_.stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type); + } } }); if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) != 0) { - from_float(intermediate_fp32_.data(), down_input_.data(), config_.intermediate_size, ggml_internal_get_type_traits(config_.down_type).vec_dot_type); + from_float(intermediate_fp32_, down_input_, qlen * config_.intermediate_size, ggml_internal_get_type_traits(config_.down_type).vec_dot_type); } nth = config_.hidden_size / config_.stride; backend->do_work_stealing_job(nth, [&](int task_id) { int ith = task_id; void* down_proj_ptr = down_proj_ + ith * config_.stride * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type); - float* down_output_ptr = down_output_.data() + ith * config_.stride; - llamafile_sgemm(config_.stride, 1, config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_.data(), config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT); + float* down_output_ptr = down_output_ + ith * config_.stride; + llamafile_sgemm(config_.stride, qlen, config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_, config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.hidden_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT); if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) { - void* output_ptr = output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type); - from_float(down_output_ptr, output_ptr, config_.stride, config_.hidden_type); + for (int i = 0; i < qlen; i++) { + float* output_fp32_ptr = down_output_ + i * config_.hidden_size + ith * config_.stride; + void* output_ptr = output + i * 
config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type) + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type); + from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type); + } } }); if (config_.stride % ggml_blck_size(config_.hidden_type) != 0) { - from_float(down_output_.data(), output, config_.hidden_size, config_.hidden_type); + from_float(down_output_, output, qlen * config_.hidden_size, config_.hidden_type); } } + +void MLP::forward(int qlen, const void* input, void* output, Backend* backend) { + if (qlen <= 0) { + return; + } + int forward_len = std::min(qlen, config_.group_max_len); + forward_many(forward_len, input, output, backend); + forward(qlen - forward_len, input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend); +} \ No newline at end of file diff --git a/ktransformers/ktransformers_ext/operators/llamafile/mlp.h b/ktransformers/ktransformers_ext/operators/llamafile/mlp.h index 604db77..eb93294 100644 --- a/ktransformers/ktransformers_ext/operators/llamafile/mlp.h +++ b/ktransformers/ktransformers_ext/operators/llamafile/mlp.h @@ -3,7 +3,7 @@ * @Author : chenht2022 * @Date : 2024-07-12 10:07:58 * @Version : 1.0.0 - * @LastEditors : chenht2022 + * @LastEditors : chenht2022 * @LastEditTime : 2024-07-25 10:35:06 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. **/ @@ -22,11 +22,13 @@ #include "llama.cpp/ggml-quants.h" #include "llama.cpp/ggml.h" #include "llamafile/sgemm.h" +#include "shared_mem_buffer.h" struct MLPConfig { int hidden_size; int intermediate_size; int stride; + int group_max_len; void* gate_proj; void* up_proj; void* down_proj; @@ -37,15 +39,17 @@ struct MLPConfig { MLPConfig() {} - MLPConfig(int hidden_size, int intermediate_size, int stride, void* gate_proj, void* up_proj, void* down_proj, ggml_type gate_type, ggml_type up_type, ggml_type down_type, ggml_type hidden_type) - : hidden_size(hidden_size), intermediate_size(intermediate_size), stride(stride), gate_proj(gate_proj), up_proj(up_proj), down_proj(down_proj), gate_type(gate_type), up_type(up_type), down_type(down_type), hidden_type(hidden_type) {} + MLPConfig(int hidden_size, int intermediate_size, int stride, int group_max_len, void* gate_proj, void* up_proj, void* down_proj, ggml_type gate_type, ggml_type up_type, ggml_type down_type, ggml_type hidden_type) + : hidden_size(hidden_size), intermediate_size(intermediate_size), stride(stride), group_max_len(group_max_len), gate_proj(gate_proj), up_proj(up_proj), down_proj(down_proj), gate_type(gate_type), up_type(up_type), down_type(down_type), hidden_type(hidden_type) {} }; class MLP { public: MLP(MLPConfig); + ~MLP(); void warm_up(Backend* backend); - void forward(const void* input, void* output, Backend* backend); + void forward_many(int qlen, const void* input, void* output, Backend* backend); + void forward(int qlen, const void* input, void* output, Backend* backend); private: MLPConfig config_; @@ -53,14 +57,14 @@ class MLP { void* up_proj_; // [intermediate_size * hidden_size ( /32 if quantized)] void* down_proj_; // [hidden_size * intermediate_size ( /32 if quantized)] - std::vector input_fp32_; // [hidden_size] - std::vector gate_input_; // [hidden_size * 4] - std::vector up_input_; // [hidden_size * 4] - std::vector gate_output_; // 
[intermediate_size] - std::vector up_output_; // [intermediate_size] - std::vector intermediate_fp32_; // [intermediate_size] - std::vector down_input_; // [intermediate_size * 4] - std::vector down_output_; // [hidden_size] + float* input_fp32_; // [group_max_len * hidden_size] + uint8_t* gate_input_; // [group_max_len * hidden_size * ggml_type_size(ggml_internal_get_type_traits(gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(gate_type).vec_dot_type)] + uint8_t* up_input_; // [group_max_len * hidden_size * ggml_type_size(ggml_internal_get_type_traits(up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(up_type).vec_dot_type)] + float* gate_output_; // [group_max_len * intermediate_size] + float* up_output_; // [group_max_len * intermediate_size] + float* intermediate_fp32_; // [group_max_len * intermediate_size] + uint8_t* down_input_; // [group_max_len * intermediate_size * ggml_type_size(ggml_internal_get_type_traits(down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(down_type).vec_dot_type)] + float* down_output_; // [group_max_len * hidden_size] }; #endif \ No newline at end of file diff --git a/ktransformers/ktransformers_ext/operators/llamafile/moe.cpp b/ktransformers/ktransformers_ext/operators/llamafile/moe.cpp index aaea4a7..8010f54 100644 --- a/ktransformers/ktransformers_ext/operators/llamafile/moe.cpp +++ b/ktransformers/ktransformers_ext/operators/llamafile/moe.cpp @@ -1,97 +1,62 @@ /** - * @Description : + * @Description : * @Author : chenht2022 * @Date : 2024-07-22 02:03:22 * @Version : 1.0.0 - * @LastEditors : chenht2022 + * @LastEditors : chenht2022 * @LastEditTime : 2024-07-25 10:35:07 - * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. -**/ + * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
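
Between the gate/up GEMMs and the down GEMM, MLP::forward_many applies a SiLU-gated product per token over the [qlen, intermediate_size] block (act_fn(x) = x / (1 + exp(-x)) in mlp.cpp). A minimal Python restatement of that elementwise step, with hypothetical helper names and plain lists in place of the shared scratch buffers:

    import math

    def act_fn(x):
        # matches act_fn in mlp.cpp: SiLU, x / (1 + exp(-x))
        return x / (1.0 + math.exp(-x))

    def gated_intermediate(gate_out, up_out, qlen, intermediate_size):
        # intermediate[i, j] = silu(gate[i, j]) * up[i, j], over row-major flat lists
        out = [0.0] * (qlen * intermediate_size)
        for i in range(qlen):
            for j in range(intermediate_size):
                idx = i * intermediate_size + j
                out[idx] = act_fn(gate_out[idx]) * up_out[idx]
        return out
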
+ **/ #include "moe.h" #include #include "unistd.h" -void* MOE::buffer_ = nullptr; - MOE::MOE(MOEConfig config) { config_ = config; gate_proj_ = config_.gate_proj; up_proj_ = config_.up_proj; down_proj_ = config_.down_proj; - if (MOE::buffer_ == nullptr) { - uint64_t buffer_size = 0; - buffer_size += sizeof(float) * config_.group_max_len * config_.hidden_size; - buffer_size += config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type); - buffer_size += config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type); - buffer_size += config_.routed_expert_num * config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type); - buffer_size += config_.routed_expert_num * config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type); - buffer_size += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size; - buffer_size += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size; - buffer_size += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size; - buffer_size += config_.routed_expert_num * config_.group_max_len * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type); - buffer_size += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.hidden_size; - buffer_size += sizeof(float) * config_.group_max_len * config_.hidden_size; - buffer_ = malloc(buffer_size); - } - - uint64_t offset = 0; - s_input_fp32_ = (float*)(buffer_ + offset); - offset += sizeof(float) * config_.hidden_size; - s_gate_input_ = (uint8_t*)(buffer_ + offset); - offset += config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type); - s_up_input_ = (uint8_t*)(buffer_ + offset); - offset += config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type); + std::vector> s_mem_requests; + s_mem_requests.push_back({(void**)&s_input_fp32_, sizeof(float) * config_.hidden_size}); + s_mem_requests.push_back({(void**)&s_gate_input_, config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type)}); + s_mem_requests.push_back({(void**)&s_up_input_, config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type)}); s_gate_output_.resize(config_.routed_expert_num); s_up_output_.resize(config_.routed_expert_num); s_intermediate_fp32_.resize(config_.routed_expert_num); s_down_input_.resize(config_.routed_expert_num); s_down_output_.resize(config_.routed_expert_num); for 
(int i = 0; i < config_.routed_expert_num; i++) { - s_gate_output_[i] = (float*)(buffer_ + offset); - offset += sizeof(float) * config_.intermediate_size; - s_up_output_[i] = (float*)(buffer_ + offset); - offset += sizeof(float) * config_.intermediate_size; - s_intermediate_fp32_[i] = (float*)(buffer_ + offset); - offset += sizeof(float) * config_.intermediate_size; - s_down_input_[i] = (uint8_t*)(buffer_ + offset); - offset += config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type); - s_down_output_[i] = (float*)(buffer_ + offset); - offset += sizeof(float) * config_.hidden_size; + s_mem_requests.push_back({(void**)&s_gate_output_[i], sizeof(float) * config_.intermediate_size}); + s_mem_requests.push_back({(void**)&s_up_output_[i], sizeof(float) * config_.intermediate_size}); + s_mem_requests.push_back({(void**)&s_intermediate_fp32_[i], sizeof(float) * config_.intermediate_size}); + s_mem_requests.push_back({(void**)&s_down_input_[i], config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type)}); + s_mem_requests.push_back({(void**)&s_down_output_[i], sizeof(float) * config_.hidden_size}); } - s_output_fp32_ = (float*)(buffer_ + offset); + s_mem_requests.push_back({(void**)&s_output_fp32_, sizeof(float) * config_.hidden_size}); + shared_mem_buffer.alloc(this, s_mem_requests); - offset = 0; + std::vector> m_mem_requests; m_input_fp32_.resize(config_.group_max_len); m_gate_input_.resize(config_.group_max_len); m_up_input_.resize(config_.group_max_len); for (int i = 0; i < config_.group_max_len; i++) { - m_input_fp32_[i] = (float*)(buffer_ + offset); - offset += sizeof(float) * config_.hidden_size; - m_gate_input_[i] = (uint8_t*)(buffer_ + offset); - offset += config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type); - m_up_input_[i] = (uint8_t*)(buffer_ + offset); - offset += config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type); + m_mem_requests.push_back({(void**)&m_input_fp32_[i], sizeof(float) * config_.hidden_size}); + m_mem_requests.push_back({(void**)&m_gate_input_[i], config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type)}); + m_mem_requests.push_back({(void**)&m_up_input_[i], config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type)}); } - m_local_gate_input_ = (uint8_t*)(buffer_ + offset); - offset += config_.routed_expert_num * config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type); - m_local_up_input_ = (uint8_t*)(buffer_ + offset); - offset += config_.routed_expert_num * config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type); - m_local_gate_output_ = 
(float*)(buffer_ + offset); - offset += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size; - m_local_up_output_ = (float*)(buffer_ + offset); - offset += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size; - m_local_intermediate_fp32_ = (float*)(buffer_ + offset); - offset += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size; - m_local_down_input_ = (uint8_t*)(buffer_ + offset); - offset += config_.routed_expert_num * config_.group_max_len * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type); - m_local_down_output_ = (float*)(buffer_ + offset); - offset += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.hidden_size; + m_mem_requests.push_back({(void**)&m_local_gate_input_, config_.routed_expert_num * config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type)}); + m_mem_requests.push_back({(void**)&m_local_up_input_, config_.routed_expert_num * config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type)}); + m_mem_requests.push_back({(void**)&m_local_gate_output_, sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size}); + m_mem_requests.push_back({(void**)&m_local_up_output_, sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size}); + m_mem_requests.push_back({(void**)&m_local_intermediate_fp32_, sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size}); + m_mem_requests.push_back({(void**)&m_local_down_input_, config_.routed_expert_num * config_.group_max_len * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type)}); + m_mem_requests.push_back({(void**)&m_local_down_output_, sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.hidden_size}); m_output_fp32_.resize(config_.group_max_len); for (int i = 0; i < config_.group_max_len; i++) { - m_output_fp32_[i] = (float*)(buffer_ + offset); - offset += sizeof(float) * config_.hidden_size; + m_mem_requests.push_back({(void**)&m_output_fp32_[i], sizeof(float) * config_.hidden_size}); } + shared_mem_buffer.alloc(this, m_mem_requests); m_local_pos_.resize(config_.group_max_len); for (int i = 0; i < config_.group_max_len; i++) { @@ -107,6 +72,10 @@ MOE::MOE(MOEConfig config) { m_local_down_output_ptr_.resize(config_.expert_num); } +MOE::~MOE() { + shared_mem_buffer.dealloc(this); +} + void MOE::warm_up(Backend* backend) { std::vector input_fp32(config_.hidden_size); std::vector input(config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type)); diff --git a/ktransformers/ktransformers_ext/operators/llamafile/moe.h b/ktransformers/ktransformers_ext/operators/llamafile/moe.h index 0d279fe..a1470aa 100644 --- a/ktransformers/ktransformers_ext/operators/llamafile/moe.h +++ b/ktransformers/ktransformers_ext/operators/llamafile/moe.h @@ -3,7 +3,7 @@ * @Author : chenht2022 * @Date : 
2024-07-22 02:03:22 * @Version : 1.0.0 - * @LastEditors : chenht2022 + * @LastEditors : chenht2022 * @LastEditTime : 2024-07-25 10:35:10 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. **/ @@ -22,6 +22,7 @@ #include "llama.cpp/ggml-quants.h" #include "llama.cpp/ggml.h" #include "llamafile/sgemm.h" +#include "shared_mem_buffer.h" struct MOEConfig { int expert_num; @@ -48,13 +49,13 @@ struct MOEConfig { class MOE { public: MOE(MOEConfig); + ~MOE(); void warm_up(Backend* backend); void forward_one(int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend); void forward_many(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend); void forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend); private: - static void* buffer_; MOEConfig config_; void* gate_proj_; // [expert_num * intermediate_size * hidden_size ( /32 if quantized)] void* up_proj_; // [expert_num * intermediate_size * hidden_size ( /32 if quantized)] diff --git a/ktransformers/ktransformers_ext/operators/llamafile/shared_mem_buffer.cpp b/ktransformers/ktransformers_ext/operators/llamafile/shared_mem_buffer.cpp new file mode 100644 index 0000000..b1599da --- /dev/null +++ b/ktransformers/ktransformers_ext/operators/llamafile/shared_mem_buffer.cpp @@ -0,0 +1,55 @@ +/** + * @Description : + * @Author : chenht2022 + * @Date : 2024-08-05 04:49:08 + * @Version : 1.0.0 + * @LastEditors : chenht2022 + * @LastEditTime : 2024-08-05 09:21:29 + * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. + **/ +#include "shared_mem_buffer.h" +#include + +SharedMemBuffer::SharedMemBuffer() { + buffer_ = nullptr; + size_ = 0; +} + +SharedMemBuffer::~SharedMemBuffer() { + if (buffer_) { + free(buffer_); + } +} + +void SharedMemBuffer::alloc(void* object, std::vector> requests) { + uint64_t size = 0; + for (auto& request : requests) { + size += request.second; + } + if (size > size_) { + if (buffer_) { + free(buffer_); + } + buffer_ = malloc(size); + size_ = size; + for (auto& obj_requests : hist_requests_) { + for (auto& requests : obj_requests.second) { + arrange(requests); + } + } + } + arrange(requests); + hist_requests_[object].push_back(requests); +} + +void SharedMemBuffer::dealloc(void* object) { + hist_requests_.erase(object); +} + +void SharedMemBuffer::arrange(std::vector> requests) { + uint64_t offset = 0; + for (auto& request : requests) { + *(request.first) = buffer_ + offset; + offset += request.second; + } +} diff --git a/ktransformers/ktransformers_ext/operators/llamafile/shared_mem_buffer.h b/ktransformers/ktransformers_ext/operators/llamafile/shared_mem_buffer.h new file mode 100644 index 0000000..eeaccd4 --- /dev/null +++ b/ktransformers/ktransformers_ext/operators/llamafile/shared_mem_buffer.h @@ -0,0 +1,37 @@ +/** + * @Description : + * @Author : chenht2022 + * @Date : 2024-08-05 04:49:08 + * @Version : 1.0.0 + * @LastEditors : chenht2022 + * @LastEditTime : 2024-08-05 06:36:41 + * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
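
shared_mem_buffer.cpp keeps a single arena: each alloc lays the caller's requests out back-to-back from offset 0, and when a larger total is requested the arena is reallocated and every previously registered request list is re-arranged. The toy Python model below mirrors that policy only (it is not the C++ implementation) to make two consequences visible: all registered operators alias the same scratch memory, and addresses handed out earlier can move when a bigger request arrives, so pointers must not be cached elsewhere and two operators cannot be inside forward at the same time.

    class SharedMemModel:
        # Toy model of the SharedMemBuffer policy, with byte offsets standing in
        # for the pointers the real class hands out.

        def __init__(self):
            self.size = 0
            self.hist = {}     # object -> list of request-size lists
            self.layout = {}   # object -> offsets of its most recent request list

        def alloc(self, obj, sizes):
            if sum(sizes) > self.size:
                self.size = sum(sizes)        # grow the arena ...
                for o, reqs in self.hist.items():
                    for r in reqs:            # ... and re-arrange everyone already registered
                        self.arrange(o, r)
            self.hist.setdefault(obj, []).append(sizes)
            self.arrange(obj, sizes)
            return self.layout[obj]

        def dealloc(self, obj):
            self.hist.pop(obj, None)
            self.layout.pop(obj, None)

        def arrange(self, obj, sizes):
            offsets, off = [], 0
            for s in sizes:
                offsets.append(off)           # every list starts at offset 0,
                off += s                      # so registered objects alias each other
            self.layout[obj] = offsets

    buf = SharedMemModel()
    print(buf.alloc("linear_0", [64, 128]))   # [0, 64]
    print(buf.alloc("mlp_0", [256, 256]))     # [0, 256]; arena grew to 512, linear_0 re-laid out
    print(buf.size)                           # 512
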
+ **/ + +#ifndef CPUINFER_SHAREDMEMBUFFER_H +#define CPUINFER_SHAREDMEMBUFFER_H + +#include +#include +#include +#include + +class SharedMemBuffer { + public: + SharedMemBuffer(); + ~SharedMemBuffer(); + + void alloc(void* object, std::vector> requests); + void dealloc(void* object); + + private: + void* buffer_; + uint64_t size_; + std::map>>> hist_requests_; + + void arrange(std::vector> requests); +}; + +static SharedMemBuffer shared_mem_buffer; + +#endif \ No newline at end of file diff --git a/ktransformers/operators/experts.py b/ktransformers/operators/experts.py index 6adb657..0369f5f 100644 --- a/ktransformers/operators/experts.py +++ b/ktransformers/operators/experts.py @@ -155,7 +155,7 @@ class MLPCPUExperts(MLPExpertsBase): self.moe = MOE(moe_config) self.cpu_infer = MLPCPUExperts.CPU_INFER if warmup: - self.cpu_infer.submit(self.moe.warm_up) + self.cpu_infer.submit(self.moe.warm_up()) self.cpu_infer.sync() if MLPCPUExperts.output_gpu == None: MLPCPUExperts.input_tensor_cpu = torch.empty((self.config.hidden_size), device="cpu", pin_memory=True) @@ -168,7 +168,7 @@ class MLPCPUExperts(MLPExpertsBase): MLPCPUExperts.input_tensor_cpu.copy_(input_tensor, non_blocking=True) MLPCPUExperts.expert_ids_cpu.copy_(expert_ids, non_blocking=True) MLPCPUExperts.weights_cpu.copy_(weights, non_blocking=True) - self.cpu_infer.submit_with_cuda_stream(torch.cuda.current_stream().cuda_stream, self.moe.forward, 1, expert_ids.size(0), MLPCPUExperts.expert_ids_cpu.data_ptr(), MLPCPUExperts.weights_cpu.data_ptr(), MLPCPUExperts.input_tensor_cpu.data_ptr(), MLPCPUExperts.output_cpu.data_ptr()) + self.cpu_infer.submit_with_cuda_stream(torch.cuda.current_stream().cuda_stream, self.moe.forward(1, expert_ids.size(0), MLPCPUExperts.expert_ids_cpu.data_ptr(), MLPCPUExperts.weights_cpu.data_ptr(), MLPCPUExperts.input_tensor_cpu.data_ptr(), MLPCPUExperts.output_cpu.data_ptr())) def sync_for_one_decode(self): self.cpu_infer.sync_with_cuda_stream(torch.cuda.current_stream().cuda_stream) @@ -183,7 +183,7 @@ class MLPCPUExperts(MLPExpertsBase): MLPCPUExperts.input_tensor_cpu.copy_(input_tensor, non_blocking=True) MLPCPUExperts.expert_ids_cpu.copy_(expert_ids, non_blocking=True) MLPCPUExperts.weights_cpu.copy_(weights, non_blocking=True) - self.cpu_infer.submit_with_cuda_stream(torch.cuda.current_stream().cuda_stream, self.moe.forward, 1, expert_ids.size(1), MLPCPUExperts.expert_ids_cpu.data_ptr(), MLPCPUExperts.weights_cpu.data_ptr(), MLPCPUExperts.input_tensor_cpu.data_ptr(), MLPCPUExperts.output_cpu.data_ptr()) + self.cpu_infer.submit_with_cuda_stream(torch.cuda.current_stream().cuda_stream, self.moe.forward(1, expert_ids.size(1), MLPCPUExperts.expert_ids_cpu.data_ptr(), MLPCPUExperts.weights_cpu.data_ptr(), MLPCPUExperts.input_tensor_cpu.data_ptr(), MLPCPUExperts.output_cpu.data_ptr())) self.cpu_infer.sync_with_cuda_stream(torch.cuda.current_stream().cuda_stream) MLPCPUExperts.output_gpu.copy_(MLPCPUExperts.output_cpu, non_blocking=True) #print("capturing experts finish") @@ -193,7 +193,7 @@ class MLPCPUExperts(MLPExpertsBase): expert_ids = expert_ids.contiguous().cpu() weights = weights.contiguous().to(torch.float32).cpu() output = torch.empty_like(input_tensor).contiguous() - self.cpu_infer.submit(self.moe.forward, expert_ids.size(0), expert_ids.size(1), expert_ids.data_ptr(), weights.data_ptr(), input_tensor.data_ptr(), output.data_ptr()) + self.cpu_infer.submit(self.moe.forward(expert_ids.size(0), expert_ids.size(1), expert_ids.data_ptr(), weights.data_ptr(), input_tensor.data_ptr(), output.data_ptr())) 
        self.cpu_infer.sync()
        return output.to(device=object.__getattribute__(self, "device"))
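
In experts.py the submit API changes shape: instead of passing a bound method plus its arguments, the method is called first and whatever it returns is handed to CPUInfer.submit / submit_with_cuda_stream. The hypothetical helper below condenses the stream-ordered decode path shown in the hunks above, assuming cpu_infer (cpuinfer_ext.CPUInfer), moe (cpuinfer_ext.moe.MOE) and the pinned/CUDA staging tensors are created elsewhere.

    import torch

    def submit_one_decode(cpu_infer, moe, expert_ids, weights, hidden_gpu,
                          ids_cpu, weights_cpu, input_cpu, output_cpu, output_gpu):
        # Stream-ordered MOE decode step (qlen == 1); all *_cpu tensors are pinned,
        # dtypes follow the C++ op (uint64-compatible ids, float32 weights,
        # hidden-type activations). Hypothetical helper, for illustration only.
        stream = torch.cuda.current_stream().cuda_stream
        # async device-to-host copies into the pinned staging buffers, ordered on the stream
        input_cpu.copy_(hidden_gpu, non_blocking=True)
        ids_cpu.copy_(expert_ids, non_blocking=True)
        weights_cpu.copy_(weights, non_blocking=True)
        # enqueue the CPU expert computation behind those copies
        cpu_infer.submit_with_cuda_stream(
            stream,
            moe.forward(1, expert_ids.size(-1),
                        ids_cpu.data_ptr(), weights_cpu.data_ptr(),
                        input_cpu.data_ptr(), output_cpu.data_ptr()))
        # make later stream work wait for the CPU result, then copy it back
        cpu_infer.sync_with_cuda_stream(stream)
        output_gpu.copy_(output_cpu, non_blocking=True)
        return output_gpu
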
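
For prefill-sized batches the same operator is driven synchronously: inputs are made contiguous on the CPU, expert ids and float32 weights are passed by pointer, and forward is left to split qlen into group_max_len chunks internally. Again a hypothetical helper sketched from the hunks above, not the module's own code.

    import torch

    def moe_forward_cpu(cpu_infer, moe, expert_ids, weights, hidden_states):
        # Synchronous MOE forward over a whole batch. expert_ids: [qlen, k] integer
        # tensor, weights: [qlen, k], hidden_states: [qlen, hidden_size] in the
        # hidden ggml type. Hypothetical helper, for illustration only.
        expert_ids = expert_ids.contiguous().cpu()
        weights = weights.contiguous().to(torch.float32).cpu()
        hidden_states = hidden_states.contiguous().cpu()
        output = torch.empty_like(hidden_states)
        cpu_infer.submit(moe.forward(expert_ids.size(0), expert_ids.size(1),
                                     expert_ids.data_ptr(), weights.data_ptr(),
                                     hidden_states.data_ptr(), output.data_ptr()))
        cpu_infer.sync()
        return output
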