Mirror of https://github.com/kvcache-ai/ktransformers.git (synced 2025-09-05 20:19:51 +00:00)
1) Linear and MLP operators support qlen>1; 2) All operators now share a single memory buffer; 3) Refactor CPUInfer submit/sync logic.
This commit is contained in:
parent 442e13bc97
commit c1cc7d2cd2
21 changed files with 749 additions and 731 deletions
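The Python-facing change that runs through all of the diffs below: an operator method such as linear.forward(...) no longer does any work when called; it packs its arguments (now including qlen) into a task descriptor, CPUInfer.submit() enqueues that task, and CPUInfer.sync() waits for it. A minimal before/after sketch assembled from the benchmark diffs below; the sizes and the qlen value are illustrative only:

import torch
import cpuinfer_ext

input_size, output_size, stride, group_max_len, qlen = 16384, 5120, 16, 1024, 4
proj_type = 0     # ggml_type::GGML_TYPE_F32
hidden_type = 30  # ggml_type::GGML_TYPE_BF16

cpu_infer = cpuinfer_ext.CPUInfer(64)
proj = torch.randn((output_size, input_size), dtype=torch.float32).contiguous()
config = cpuinfer_ext.linear.LinearConfig(input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type)
linear = cpuinfer_ext.linear.Linear(config)

input = torch.randn((qlen, input_size), dtype=torch.bfloat16).contiguous()
output = torch.empty((qlen, output_size), dtype=torch.bfloat16).contiguous()

# Old interface (removed in this commit):
#   cpu_infer.submit(linear.forward, input.data_ptr(), output.data_ptr())
# New interface: forward() only builds the task; submit()/sync() run it on the CPU backend.
cpu_infer.submit(linear.forward(qlen, input.data_ptr(), output.data_ptr()))
cpu_infer.sync()

The same pattern applies to mlp.forward(qlen, input_ptr, output_ptr) and moe.forward(qlen, n_routed_experts, expert_ids_ptr, weights_ptr, input_ptr, output_ptr) in the diffs below.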
@@ -22,14 +22,13 @@ option(LLAMA_AVX2 "llama: enable AVX2"
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
option(LLAMA_AVX512_BF16 "llama: enable AVX512-BF16" OFF)
option(LLAMA_FMA "llama: enable FMA" OFF)
# in MSVC F16C is implied with AVX2/AVX512
if (NOT MSVC)
option(LLAMA_F16C "llama: enable F16C" OFF)
endif()
option(LLAMA_AVX512_FANCY_SIMD "llama: enable AVX512-VL, AVX512-BW, AVX512-DQ, AVX512-VNNI" OFF)
option(LLAMA_AVX512_BF16 "llama: enable AVX512-BF16" OFF)

# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
@@ -6,7 +6,7 @@ Author : chenht2022
Date : 2024-07-25 10:31:59
Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-07-25 10:32:51
LastEditTime : 2024-08-06 10:35:35
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import os, sys
@@ -15,16 +15,19 @@ sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch

def bench_linear(quant_mode: str):
with torch.inference_mode(mode=True):
input_size = 16384
output_size = 5120
stride = 16
group_max_len = 1024
layer_num = 10
qlen = 1
CPUInfer = cpuinfer_ext.CPUInfer(64)
warm_up_iter = 1000
test_iter = 10000

def bench_linear(quant_mode: str):
with torch.inference_mode(mode=True):

hidden_type = 30 # ggml_type::GGML_TYPE_BF16
if quant_mode == "fp32":
proj_type = 0 # ggml_type::GGML_TYPE_F32
@@ -66,30 +69,37 @@ def bench_linear(quant_mode: str):
projs = []
for _ in range(layer_num):
proj = torch.randn((output_size, input_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
config = cpuinfer_ext.linear.LinearConfig(input_size, output_size, stride, proj.data_ptr(), proj_type, hidden_type)
config = cpuinfer_ext.linear.LinearConfig(input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type)
linear = cpuinfer_ext.linear.Linear(config)
projs.append(proj)
linears.append(linear)
input = torch.randn((layer_num, qlen, input_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
output = torch.empty((layer_num, qlen, output_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()

# warm up
for i in range(warm_up_iter):
linear = linears[i % layer_num]
input = torch.randn((1, input_size), dtype=torch.bfloat16).contiguous()
output = torch.empty((1, output_size), dtype=torch.bfloat16).contiguous()
CPUInfer.submit(linear.forward, input.data_ptr(), output.data_ptr())
CPUInfer.submit(
linears[i % layer_num].forward(
qlen,
input[i % layer_num].data_ptr(),
output[i % layer_num].data_ptr()
)
)
CPUInfer.sync()

# test
total_time = 0
for i in range(test_iter):
linear = linears[i % layer_num]
input = torch.randn((1, input_size), dtype=torch.bfloat16).contiguous()
output = torch.empty((1, output_size), dtype=torch.bfloat16).contiguous()
start = time.perf_counter()
CPUInfer.submit(linear.forward, input.data_ptr(), output.data_ptr())
for i in range(test_iter):
CPUInfer.submit(
linears[i % layer_num].forward(
qlen,
input[i % layer_num].data_ptr(),
output[i % layer_num].data_ptr()
)
)
CPUInfer.sync()
end = time.perf_counter()
total_time += end - start
total_time = end - start
print('Quant mode: ', quant_mode)
print('Time(s): ', total_time)
print('Iteration: ', test_iter)
@@ -14,14 +14,17 @@ import time
import torch
import torch.nn.quantized as nnq

def bench_linear(quant_mode: str):
with torch.inference_mode(mode=True):
scale, zero_point = 0.1, 0 # Adjust scale and zero_point based on your dataset

input_size = 16384
output_size = 5120
layer_num = 10
qlen = 1
warm_up_iter = 1000
test_iter = 10000

def bench_linear(quant_mode: str):
with torch.inference_mode(mode=True):
if quant_mode == "fp32":
proj_type = torch.float32
bytes_per_elem = 4.000000
@@ -41,37 +44,32 @@ def bench_linear(quant_mode: str):
for _ in range(layer_num):
proj = torch.randn((output_size, input_size), dtype = torch.float32, device = "cuda").to("cpu").contiguous()
if quant_mode == "qint8":
scale, zero_point = 0.1, 0 # Adjust scale and zero_point based on your dataset
proj_q = torch.quantize_per_tensor(proj, scale, zero_point, torch.qint8)
quantized_layer = nnq.Linear(input_size, output_size)
quantized_layer.set_weight_bias(proj_q, None)
projs.append(quantized_layer)
else:
projs.append(proj.to(proj_type))
input = torch.randn((layer_num, qlen, input_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()

# warm up
for i in range(warm_up_iter):
input = torch.randn((1, input_size), dtype=torch.float32).contiguous()
if quant_mode == "qint8":
input_q = torch.quantize_per_tensor(input, scale, zero_point, torch.quint8)
quantized_layer = projs[i % layer_num]
t_output = quantized_layer(input_q)
if isinstance(projs[i % layer_num], nnq.Linear):
input_q = torch.quantize_per_tensor(input[i % layer_num].to(torch.float32), scale, zero_point, torch.quint8)
t_output = projs[i % layer_num](input_q)
else:
t_output = torch.mm(input.to(proj_type), projs[i % layer_num].t())
t_output = torch.mm(input[i % layer_num].to(proj_type), projs[i % layer_num].t())

# test
total_time = 0
for i in range(test_iter):
input = torch.randn((1, input_size), dtype=torch.float32).contiguous()
start = time.perf_counter()
if quant_mode == "qint8":
input_q = torch.quantize_per_tensor(input, scale, zero_point, torch.quint8)
quantized_layer = projs[i % layer_num]
t_output = quantized_layer(input_q)
for i in range(test_iter):
if isinstance(projs[i % layer_num], nnq.Linear):
input_q = torch.quantize_per_tensor(input[i % layer_num].to(torch.float32), scale, zero_point, torch.quint8)
t_output = projs[i % layer_num](input_q)
else:
t_output = torch.mm(input.to(proj_type), projs[i % layer_num].t())
t_output = torch.mm(input[i % layer_num].to(proj_type), projs[i % layer_num].t())
end = time.perf_counter()
total_time += end - start
total_time = end - start
print('Quant mode: ', quant_mode)
print('Time(s): ', total_time)
print('Iteration: ', test_iter)
@ -6,7 +6,7 @@ Author : chenht2022
|
|||
Date : 2024-07-16 10:43:18
|
||||
Version : 1.0.0
|
||||
LastEditors : chenht2022
|
||||
LastEditTime : 2024-07-25 10:32:55
|
||||
LastEditTime : 2024-08-06 10:36:04
|
||||
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
'''
|
||||
import os, sys
|
||||
|
@ -15,16 +15,19 @@ sys.path.append(os.path.dirname(__file__) + '/../build')
|
|||
import cpuinfer_ext
|
||||
import torch
|
||||
|
||||
def bench_mlp(quant_mode: str):
|
||||
with torch.inference_mode(mode=True):
|
||||
hidden_size = 5120
|
||||
intermediate_size = 3072
|
||||
stride = 16
|
||||
group_max_len = 1024
|
||||
layer_num = 10
|
||||
qlen = 1
|
||||
CPUInfer = cpuinfer_ext.CPUInfer(64)
|
||||
warm_up_iter = 1000
|
||||
test_iter = 10000
|
||||
|
||||
def bench_mlp(quant_mode: str):
|
||||
with torch.inference_mode(mode=True):
|
||||
|
||||
hidden_type = 30 # ggml_type::GGML_TYPE_BF16
|
||||
if quant_mode == "fp32":
|
||||
gate_type = 0 # ggml_type::GGML_TYPE_F32
|
||||
|
@ -93,32 +96,39 @@ def bench_mlp(quant_mode: str):
|
|||
gate_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
|
||||
up_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
|
||||
down_proj = torch.randn((hidden_size, intermediate_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
|
||||
config = cpuinfer_ext.mlp.MLPConfig(hidden_size, intermediate_size, stride, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type)
|
||||
config = cpuinfer_ext.mlp.MLPConfig(hidden_size, intermediate_size, stride, group_max_len, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type)
|
||||
mlp = cpuinfer_ext.mlp.MLP(config)
|
||||
gate_projs.append(gate_proj)
|
||||
up_projs.append(up_proj)
|
||||
down_projs.append(down_proj)
|
||||
mlps.append(mlp)
|
||||
input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
|
||||
output = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
|
||||
|
||||
# warm up
|
||||
for i in range(warm_up_iter):
|
||||
mlp = mlps[i % layer_num]
|
||||
input = torch.randn((1, hidden_size), dtype=torch.bfloat16).contiguous()
|
||||
output = torch.empty((1, hidden_size), dtype=torch.bfloat16).contiguous()
|
||||
CPUInfer.submit(mlp.forward, input.data_ptr(), output.data_ptr())
|
||||
CPUInfer.submit(
|
||||
mlps[i % layer_num].forward(
|
||||
qlen,
|
||||
input[i % layer_num].data_ptr(),
|
||||
output[i % layer_num].data_ptr()
|
||||
)
|
||||
)
|
||||
CPUInfer.sync()
|
||||
|
||||
# test
|
||||
total_time = 0
|
||||
for i in range(test_iter):
|
||||
mlp = mlps[i % layer_num]
|
||||
input = torch.randn((1, hidden_size), dtype=torch.bfloat16).contiguous()
|
||||
output = torch.empty((1, hidden_size), dtype=torch.bfloat16).contiguous()
|
||||
start = time.perf_counter()
|
||||
CPUInfer.submit(mlp.forward, input.data_ptr(), output.data_ptr())
|
||||
for i in range(test_iter):
|
||||
CPUInfer.submit(
|
||||
mlps[i % layer_num].forward(
|
||||
qlen,
|
||||
input[i % layer_num].data_ptr(),
|
||||
output[i % layer_num].data_ptr()
|
||||
)
|
||||
)
|
||||
CPUInfer.sync()
|
||||
end = time.perf_counter()
|
||||
total_time += end - start
|
||||
total_time = end - start
|
||||
print('Quant mode: ', quant_mode)
|
||||
print('Time(s): ', total_time)
|
||||
print('Iteration: ', test_iter)
|
||||
|
|
|
@ -14,17 +14,38 @@ import time
|
|||
import torch
|
||||
import torch.nn.quantized as nnq
|
||||
|
||||
def act_fn(x):
|
||||
return x / (1.0 + torch.exp(-x))
|
||||
scale, zero_point = 0.1, 0 # Adjust scale and zero_point based on your dataset
|
||||
|
||||
def bench_mlp(quant_mode: str):
|
||||
with torch.inference_mode(mode=True):
|
||||
hidden_size = 5120
|
||||
intermediate_size = 3072
|
||||
layer_num = 10
|
||||
qlen = 1
|
||||
warm_up_iter = 1000
|
||||
test_iter = 10000
|
||||
|
||||
def act_fn(x):
|
||||
return x / (1.0 + torch.exp(-x))
|
||||
|
||||
def mlp_torch(input, gate_proj, up_proj, down_proj):
|
||||
if isinstance(gate_proj, nnq.Linear):
|
||||
input_q = torch.quantize_per_tensor(input.to(torch.float32), scale, zero_point, torch.quint8)
|
||||
gate_buf = gate_proj(input_q)
|
||||
up_buf = up_proj(input_q)
|
||||
gate_buf = gate_buf.dequantize()
|
||||
up_buf = up_buf.dequantize()
|
||||
intermediate = act_fn(gate_buf) * up_buf
|
||||
intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8)
|
||||
expert_output = down_proj(intermediate_q)
|
||||
ret = expert_output.dequantize()
|
||||
else:
|
||||
gate_buf = torch.mm(input.to(gate_proj.dtype), gate_proj.t())
|
||||
up_buf = torch.mm(input.to(up_proj.dtype), up_proj.t())
|
||||
intermediate = act_fn(gate_buf) * up_buf
|
||||
ret = torch.mm(intermediate.to(down_proj.dtype), down_proj.t())
|
||||
return ret
|
||||
|
||||
def bench_mlp(quant_mode: str):
|
||||
with torch.inference_mode(mode=True):
|
||||
if quant_mode == "fp32":
|
||||
proj_type = torch.float32
|
||||
bytes_per_elem = 4.000000
|
||||
|
@ -48,7 +69,6 @@ def bench_mlp(quant_mode: str):
|
|||
up_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
|
||||
down_proj = torch.randn((hidden_size, intermediate_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
|
||||
if quant_mode == "qint8":
|
||||
scale, zero_point = 0.1, 0 # Adjust scale and zero_point based on your dataset
|
||||
gate_proj_q = torch.quantize_per_tensor(gate_proj, scale, zero_point, torch.qint8)
|
||||
quantized_gate = nnq.Linear(hidden_size, intermediate_size)
|
||||
quantized_gate.set_weight_bias(gate_proj_q, None)
|
||||
|
@ -65,58 +85,18 @@ def bench_mlp(quant_mode: str):
|
|||
gate_projs.append(gate_proj.to(proj_type))
|
||||
up_projs.append(up_proj.to(proj_type))
|
||||
down_projs.append(down_proj.to(proj_type))
|
||||
input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
|
||||
|
||||
# warm up
|
||||
for i in range(warm_up_iter):
|
||||
input = torch.randn((1, hidden_size), dtype=torch.float32).contiguous()
|
||||
if quant_mode == "qint8":
|
||||
input_q = torch.quantize_per_tensor(input, scale, zero_point, torch.quint8)
|
||||
quantized_gate = gate_projs[i % layer_num]
|
||||
gate_buf = quantized_gate(input_q)
|
||||
quantized_up = up_projs[i % layer_num]
|
||||
up_buf = quantized_gate(input_q)
|
||||
gate_buf = gate_buf.dequantize()
|
||||
up_buf = up_buf.dequantize()
|
||||
intermediate = act_fn(gate_buf) * up_buf
|
||||
intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8)
|
||||
quantized_down = down_projs[i % layer_num]
|
||||
t_output = quantized_down(intermediate_q)
|
||||
else:
|
||||
gate_proj = gate_projs[i%layer_num]
|
||||
up_proj = up_projs[i%layer_num]
|
||||
down_proj = down_projs[i%layer_num]
|
||||
gate_buf = torch.mm(input.to(proj_type), gate_proj.t())
|
||||
up_buf = torch.mm(input.to(proj_type), up_proj.t())
|
||||
intermediate = act_fn(gate_buf) * up_buf
|
||||
t_output = torch.mm(intermediate.to(proj_type), down_proj.t())
|
||||
mlp_torch(input[i % layer_num], gate_projs[i % layer_num], up_projs[i % layer_num], down_projs[i % layer_num])
|
||||
|
||||
# test
|
||||
total_time = 0
|
||||
for i in range(test_iter):
|
||||
input = torch.randn((1, hidden_size), dtype=torch.float32).contiguous()
|
||||
start = time.perf_counter()
|
||||
if quant_mode == "qint8":
|
||||
input_q = torch.quantize_per_tensor(input, scale, zero_point, torch.quint8)
|
||||
quantized_gate = gate_projs[i % layer_num]
|
||||
gate_buf = quantized_gate(input_q)
|
||||
quantized_up = up_projs[i % layer_num]
|
||||
up_buf = quantized_gate(input_q)
|
||||
gate_buf = gate_buf.dequantize()
|
||||
up_buf = up_buf.dequantize()
|
||||
intermediate = act_fn(gate_buf) * up_buf
|
||||
intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8)
|
||||
quantized_down = down_projs[i % layer_num]
|
||||
t_output = quantized_down(intermediate_q)
|
||||
else:
|
||||
gate_proj = gate_projs[i%layer_num]
|
||||
up_proj = up_projs[i%layer_num]
|
||||
down_proj = down_projs[i%layer_num]
|
||||
gate_buf = torch.mm(input.to(proj_type), gate_proj.t())
|
||||
up_buf = torch.mm(input.to(proj_type), up_proj.t())
|
||||
intermediate = act_fn(gate_buf) * up_buf
|
||||
t_output = torch.mm(intermediate.to(proj_type), down_proj.t())
|
||||
for i in range(test_iter):
|
||||
mlp_torch(input[i % layer_num], gate_projs[i % layer_num], up_projs[i % layer_num], down_projs[i % layer_num])
|
||||
end = time.perf_counter()
|
||||
total_time += end - start
|
||||
total_time = end - start
|
||||
print('Quant mode: ', quant_mode)
|
||||
print('Time(s): ', total_time)
|
||||
print('Iteration: ', test_iter)
|
||||
|
|
|
@ -6,7 +6,7 @@ Author : chenht2022
|
|||
Date : 2024-07-25 10:32:05
|
||||
Version : 1.0.0
|
||||
LastEditors : chenht2022
|
||||
LastEditTime : 2024-07-25 10:33:00
|
||||
LastEditTime : 2024-08-06 10:41:28
|
||||
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
'''
|
||||
import os, sys
|
||||
|
@ -15,9 +15,7 @@ sys.path.append(os.path.dirname(__file__) + '/../build')
|
|||
import cpuinfer_ext
|
||||
import torch
|
||||
|
||||
def bench_moe(quant_mode: str):
|
||||
with torch.inference_mode(mode=True):
|
||||
expert_num = 10
|
||||
expert_num = 160
|
||||
hidden_size = 5120
|
||||
intermediate_size = 1536
|
||||
stride = 16
|
||||
|
@ -30,6 +28,8 @@ def bench_moe(quant_mode: str):
|
|||
warm_up_iter = 1000
|
||||
test_iter = 10000
|
||||
|
||||
def bench_moe(quant_mode: str):
|
||||
with torch.inference_mode(mode=True):
|
||||
hidden_type = 30 # ggml_type::GGML_TYPE_BF16
|
||||
if quant_mode == "fp32":
|
||||
gate_type = 0 # ggml_type::GGML_TYPE_F32
|
||||
|
@ -104,32 +104,38 @@ def bench_moe(quant_mode: str):
|
|||
up_projs.append(up_proj)
|
||||
down_projs.append(down_proj)
|
||||
moes.append(moe)
|
||||
expert_ids = torch.randint(0, expert_num, (layer_num, qlen, n_routed_experts), dtype=torch.int64, device = "cuda").to("cpu").contiguous()
|
||||
expert_ids = torch.stack([torch.stack([torch.randperm(expert_num, dtype=torch.int64, device = "cuda")[:n_routed_experts] for _ in range(qlen)]) for _ in range(layer_num)]).to("cpu").contiguous()
|
||||
weights = torch.rand((layer_num, qlen, n_routed_experts), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
|
||||
input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
|
||||
output = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
|
||||
|
||||
# warm up
|
||||
for i in range(warm_up_iter):
|
||||
CPUInfer.submit(moes[i % layer_num].forward,
|
||||
CPUInfer.submit(
|
||||
moes[i % layer_num].forward(
|
||||
qlen,
|
||||
n_routed_experts,
|
||||
expert_ids[i % layer_num].data_ptr(),
|
||||
weights[i % layer_num].data_ptr(),
|
||||
input[i % layer_num].data_ptr(),
|
||||
output[i % layer_num].data_ptr())
|
||||
output[i % layer_num].data_ptr()
|
||||
)
|
||||
)
|
||||
CPUInfer.sync()
|
||||
|
||||
# test
|
||||
start = time.perf_counter()
|
||||
for i in range(test_iter):
|
||||
CPUInfer.submit(moes[i % layer_num].forward,
|
||||
CPUInfer.submit(
|
||||
moes[i % layer_num].forward(
|
||||
qlen,
|
||||
n_routed_experts,
|
||||
expert_ids[i % layer_num].data_ptr(),
|
||||
weights[i % layer_num].data_ptr(),
|
||||
input[i % layer_num].data_ptr(),
|
||||
output[i % layer_num].data_ptr())
|
||||
output[i % layer_num].data_ptr()
|
||||
)
|
||||
)
|
||||
CPUInfer.sync()
|
||||
end = time.perf_counter()
|
||||
total_time = end - start
|
||||
|
|
|
@ -14,19 +14,71 @@ import time
|
|||
import torch
|
||||
import torch.nn.quantized as nnq
|
||||
|
||||
def act_fn(x):
|
||||
return x / (1.0 + torch.exp(-x))
|
||||
scale, zero_point = 0.1, 0 # Adjust scale and zero_point based on your dataset
|
||||
|
||||
def bench_moe(quant_mode: str):
|
||||
with torch.inference_mode(mode=True):
|
||||
expert_num = 10
|
||||
expert_num = 160
|
||||
hidden_size = 5120
|
||||
intermediate_size = 1536
|
||||
n_routed_experts = 6
|
||||
layer_num = 10
|
||||
qlen = 1
|
||||
warm_up_iter = 1000
|
||||
test_iter = 10000
|
||||
|
||||
def act_fn(x):
|
||||
return x / (1.0 + torch.exp(-x))
|
||||
|
||||
def mlp_torch(input, gate_proj, up_proj, down_proj):
|
||||
if isinstance(gate_proj, nnq.Linear):
|
||||
input_q = torch.quantize_per_tensor(input.to(torch.float32), scale, zero_point, torch.quint8)
|
||||
gate_buf = gate_proj(input_q)
|
||||
up_buf = up_proj(input_q)
|
||||
gate_buf = gate_buf.dequantize()
|
||||
up_buf = up_buf.dequantize()
|
||||
intermediate = act_fn(gate_buf) * up_buf
|
||||
intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8)
|
||||
expert_output = down_proj(intermediate_q)
|
||||
ret = expert_output.dequantize()
|
||||
else:
|
||||
gate_buf = torch.mm(input.to(gate_proj.dtype), gate_proj.t())
|
||||
up_buf = torch.mm(input.to(up_proj.dtype), up_proj.t())
|
||||
intermediate = act_fn(gate_buf) * up_buf
|
||||
ret = torch.mm(intermediate.to(down_proj.dtype), down_proj.t())
|
||||
return ret
|
||||
|
||||
def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
|
||||
cnts = expert_ids.new_zeros((expert_ids.shape[0], expert_num))
|
||||
cnts.scatter_(1, expert_ids, 1)
|
||||
tokens_per_expert = cnts.sum(dim=0)
|
||||
idxs = expert_ids.view(-1).argsort()
|
||||
sorted_tokens = input[idxs // expert_ids.shape[1]]
|
||||
|
||||
outputs = []
|
||||
start_idx = 0
|
||||
for i, num_tokens in enumerate(tokens_per_expert):
|
||||
end_idx = start_idx + num_tokens
|
||||
if num_tokens == 0:
|
||||
continue
|
||||
tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
|
||||
expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i])
|
||||
outputs.append(expert_out)
|
||||
start_idx = end_idx
|
||||
|
||||
outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)
|
||||
|
||||
new_x = torch.empty_like(outs)
|
||||
new_x[idxs] = outs
|
||||
t_output = (
|
||||
new_x.view(*expert_ids.shape, -1)
|
||||
.type(weights.dtype)
|
||||
.mul_(weights.unsqueeze(dim=-1))
|
||||
.sum(dim=1)
|
||||
.type(new_x.dtype)
|
||||
)
|
||||
return t_output
|
||||
|
||||
def bench_moe(quant_mode: str):
|
||||
with torch.inference_mode(mode=True):
|
||||
if quant_mode == "fp32":
|
||||
proj_type = torch.float32
|
||||
bytes_per_elem = 4.000000
|
||||
|
@ -50,7 +102,6 @@ def bench_moe(quant_mode: str):
|
|||
up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
|
||||
down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
|
||||
if quant_mode == "qint8":
|
||||
scale, zero_point = 0.1, 0 # Adjust scale and zero_point based on your dataset
|
||||
quantized_gate_proj = []
|
||||
quantized_up_proj = []
|
||||
quantized_down_proj = []
|
||||
|
@ -74,82 +125,20 @@ def bench_moe(quant_mode: str):
|
|||
gate_projs.append(gate_proj.to(proj_type))
|
||||
up_projs.append(up_proj.to(proj_type))
|
||||
down_projs.append(down_proj.to(proj_type))
|
||||
expert_ids = torch.stack([torch.stack([torch.randperm(expert_num, dtype=torch.int64, device = "cuda")[:n_routed_experts] for _ in range(qlen)]) for _ in range(layer_num)]).to("cpu").contiguous()
|
||||
weights = torch.rand((layer_num, qlen, n_routed_experts), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
|
||||
input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
|
||||
|
||||
# warm up
|
||||
for i in range(warm_up_iter):
|
||||
expert_ids = torch.randint(0, expert_num, (n_routed_experts,), dtype=torch.int64).contiguous()
|
||||
weights = torch.rand((n_routed_experts,), dtype=torch.float32).contiguous()
|
||||
input = torch.randn((1, hidden_size), dtype=torch.float32).contiguous()
|
||||
if quant_mode == "qint8":
|
||||
input_q = torch.quantize_per_tensor(input, scale, zero_point, torch.quint8)
|
||||
t_output = torch.zeros((1, hidden_size), dtype=torch.float32).contiguous()
|
||||
gate_proj = gate_projs[i%layer_num]
|
||||
up_proj = up_projs[i%layer_num]
|
||||
down_proj = down_projs[i%layer_num]
|
||||
for i, expert_id in enumerate(expert_ids):
|
||||
quantized_gate = gate_proj[expert_id]
|
||||
gate_buf = quantized_gate(input_q)
|
||||
quantized_up = up_proj[expert_id]
|
||||
up_buf = quantized_up(input_q)
|
||||
gate_buf = gate_buf.dequantize()
|
||||
up_buf = up_buf.dequantize()
|
||||
intermediate = act_fn(gate_buf) * up_buf
|
||||
intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8)
|
||||
quantized_down = down_proj[expert_id]
|
||||
expert_output = quantized_down(intermediate_q)
|
||||
expert_output = expert_output.dequantize()
|
||||
t_output += weights[i] * expert_output
|
||||
else:
|
||||
t_output = torch.zeros((1, hidden_size), dtype=proj_type).contiguous()
|
||||
gate_proj = gate_projs[i%layer_num]
|
||||
up_proj = up_projs[i%layer_num]
|
||||
down_proj = down_projs[i%layer_num]
|
||||
for i, expert_id in enumerate(expert_ids):
|
||||
gate_buf = torch.mm(input.to(proj_type), gate_proj[expert_id].t())
|
||||
up_buf = torch.mm(input.to(proj_type), up_proj[expert_id].t())
|
||||
intermediate = act_fn(gate_buf) * up_buf
|
||||
expert_output = torch.mm(intermediate.to(proj_type), down_proj[expert_id].t())
|
||||
t_output += weights[i] * expert_output
|
||||
moe_torch(input[i % layer_num], expert_ids[i % layer_num], weights[i % layer_num], gate_projs[i % layer_num], up_projs[i % layer_num], down_projs[i % layer_num])
|
||||
|
||||
# test
|
||||
total_time = 0
|
||||
for i in range(test_iter):
|
||||
expert_ids = torch.randint(0, expert_num, (n_routed_experts,), dtype=torch.int64).contiguous()
|
||||
weights = torch.rand((n_routed_experts,), dtype=torch.float32).contiguous()
|
||||
input = torch.randn((1, hidden_size), dtype=torch.float32).contiguous()
|
||||
start = time.perf_counter()
|
||||
if quant_mode == "qint8":
|
||||
input_q = torch.quantize_per_tensor(input, scale, zero_point, torch.quint8)
|
||||
t_output = torch.zeros((1, hidden_size), dtype=torch.float32).contiguous()
|
||||
gate_proj = gate_projs[i%layer_num]
|
||||
up_proj = up_projs[i%layer_num]
|
||||
down_proj = down_projs[i%layer_num]
|
||||
for i, expert_id in enumerate(expert_ids):
|
||||
quantized_gate = gate_proj[expert_id]
|
||||
gate_buf = quantized_gate(input_q)
|
||||
quantized_up = up_proj[expert_id]
|
||||
up_buf = quantized_up(input_q)
|
||||
gate_buf = gate_buf.dequantize()
|
||||
up_buf = up_buf.dequantize()
|
||||
intermediate = act_fn(gate_buf) * up_buf
|
||||
intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8)
|
||||
quantized_down = down_proj[expert_id]
|
||||
expert_output = quantized_down(intermediate_q)
|
||||
expert_output = expert_output.dequantize()
|
||||
t_output += weights[i] * expert_output
|
||||
else:
|
||||
t_output = torch.zeros((1, hidden_size), dtype=proj_type).contiguous()
|
||||
gate_proj = gate_projs[i%layer_num]
|
||||
up_proj = up_projs[i%layer_num]
|
||||
down_proj = down_projs[i%layer_num]
|
||||
for i, expert_id in enumerate(expert_ids):
|
||||
gate_buf = torch.mm(input.to(proj_type), gate_proj[expert_id].t())
|
||||
up_buf = torch.mm(input.to(proj_type), up_proj[expert_id].t())
|
||||
intermediate = act_fn(gate_buf) * up_buf
|
||||
expert_output = torch.mm(intermediate.to(proj_type), down_proj[expert_id].t())
|
||||
t_output += weights[i] * expert_output
|
||||
for i in range(test_iter):
|
||||
moe_torch(input[i % layer_num], expert_ids[i % layer_num], weights[i % layer_num], gate_projs[i % layer_num], up_projs[i % layer_num], down_projs[i % layer_num])
|
||||
end = time.perf_counter()
|
||||
total_time += end - start
|
||||
total_time = end - start
|
||||
print('Quant mode: ', quant_mode)
|
||||
print('Time(s): ', total_time)
|
||||
print('Iteration: ', test_iter)
|
||||
|
|
|
@@ -4,7 +4,7 @@
* @Date : 2024-07-16 10:43:18
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:33:42
* @LastEditTime : 2024-08-07 09:47:43
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#ifndef CPUINFER_CPUINFER_H
@@ -17,6 +17,7 @@
#include <queue>
#include <thread>
#include <vector>
#include "cuda_runtime.h"

#include "backend.h"
#include "task_queue.h"
@@ -39,16 +40,39 @@ class CPUInfer {
}

template <typename Func, typename Obj, typename... Args>
void submit(Func f, Obj* obj, Args... args) {
void enqueue(Func f, Obj* obj, Args... args) {
task_queue_->enqueue([=]() {
std::invoke(f, *obj, args..., backend_);
});
}

void submit(std::pair<intptr_t, intptr_t> params) {
void (*func)(void*) = (void (*)(void*))params.first;
void* args = (void*)params.second;
*((CPUInfer**)args) = this;
func(args);
}

void sync() {
task_queue_->sync();
}

void submit_with_cuda_stream(intptr_t user_cuda_stream, std::pair<intptr_t, intptr_t> params) {
void (*func)(void*) = (void (*)(void*))params.first;
void* args = (void*)params.second;
*((CPUInfer**)args) = this;
cudaLaunchHostFunc((cudaStream_t)user_cuda_stream, (cudaHostFn_t)func, args);
}

static void sync_(void* cpu_infer_ptr) {
CPUInfer* cpuinfer = (CPUInfer*)cpu_infer_ptr;
cpuinfer->sync();
}

void sync_with_cuda_stream(intptr_t user_cuda_stream) {
cudaLaunchHostFunc((cudaStream_t)user_cuda_stream, (cudaHostFn_t)&sync_, (void*)this);
}

public:
Backend* backend_;
TaskQueue* task_queue_;
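To make the control flow above easier to follow: on the new path, a bound operator method no longer runs anything itself. It returns a std::pair<intptr_t, intptr_t> of (trampoline function, heap-allocated argument struct); CPUInfer::submit writes its own this pointer into the argument struct and invokes the trampoline, which calls enqueue() to push the real member-function call onto the task queue; sync() waits for the queue to drain, and the *_with_cuda_stream variants launch the same two steps through cudaLaunchHostFunc so they are ordered on a CUDA stream. Below is a pure-Python mock of that flow, for illustration only; MockCPUInfer and forward_task are invented names, not part of cpuinfer_ext.

from queue import Queue

class MockCPUInfer:
    """Pure-Python stand-in for the C++ CPUInfer above (illustration only)."""
    def __init__(self):
        self.tasks = Queue()

    def enqueue(self, fn):
        # C++: task_queue_->enqueue([=] { std::invoke(f, *obj, args..., backend_); });
        self.tasks.put(fn)

    def submit(self, task):
        # C++: submit(std::pair<intptr_t, intptr_t>) writes `this` into the packed
        # argument struct and calls the packed function, which calls enqueue().
        func, args = task
        func(self, *args)

    def sync(self):
        # C++: task_queue_->sync() blocks until the queue has drained.
        while not self.tasks.empty():
            self.tasks.get()()

def forward_task(qlen, src, dst):
    # Mirrors the pybind "interface" helpers: pack the arguments now,
    # enqueue the real work only when submit() runs the packed function.
    def run(cpuinfer, qlen, src, dst):
        cpuinfer.enqueue(lambda: dst.extend(2 * x for x in src[:qlen]))
    return (run, (qlen, src, dst))

infer = MockCPUInfer()
out = []
infer.submit(forward_task(2, [1, 2, 3], out))
infer.sync()
print(out)  # -> [2, 4]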
@@ -6,7 +6,7 @@ Author : chenht2022
Date : 2024-07-25 10:32:05
Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-07-25 10:34:00
LastEditTime : 2024-08-06 10:36:59
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import os, sys
@@ -15,23 +15,23 @@ sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch

with torch.inference_mode(mode=True):
input_size = 16384
output_size = 5120
stride = 32
group_max_len = 1024
proj_type = 1 # ggml_type::GGML_TYPE_F16
hidden_type = 1 # ggml_type::GGML_TYPE_F16
qlen = 30
layer_num = 10
CPUInfer = cpuinfer_ext.CPUInfer(48)
validation_iter = 100
warm_up_iter = 1000
test_iter = 10000

with torch.inference_mode(mode=True):
linears = []
projs = []
for _ in range(layer_num):
proj = torch.randn((output_size, input_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
config = cpuinfer_ext.linear.LinearConfig(input_size, output_size, stride, proj.data_ptr(), proj_type, hidden_type)
config = cpuinfer_ext.linear.LinearConfig(input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type)
linear = cpuinfer_ext.linear.Linear(config)
projs.append(proj)
linears.append(linear)
@@ -39,11 +39,17 @@ with torch.inference_mode(mode=True):
# validation
for i in range(validation_iter):
linear = linears[i % layer_num]
input = torch.randn((1, input_size), dtype=torch.float16).contiguous()
output = torch.empty((1, output_size), dtype=torch.float16).contiguous()
input = torch.randn((qlen, input_size), dtype=torch.float16).contiguous()
output = torch.empty((qlen, output_size), dtype=torch.float16).contiguous()
input = input / 100

CPUInfer.submit(linear.forward, input.data_ptr(), output.data_ptr())
CPUInfer.submit(
linear.forward(
qlen,
input.data_ptr(),
output.data_ptr()
)
)
CPUInfer.sync()
# print('cpuinfer output', output)

@@ -54,30 +60,3 @@ with torch.inference_mode(mode=True):
diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
print('diff = ', diff)
assert(diff < 0.001)

# warm up
for i in range(warm_up_iter):
linear = linears[i % layer_num]
input = torch.randn((1, input_size), dtype=torch.float16).contiguous()
output = torch.empty((1, output_size), dtype=torch.float16).contiguous()
input = input / 100
CPUInfer.submit(linear.forward, input.data_ptr(), output.data_ptr())
CPUInfer.sync()

# test
total_time = 0
for i in range(test_iter):
linear = linears[i % layer_num]
input = torch.randn((1, input_size), dtype=torch.float16).contiguous()
output = torch.empty((1, output_size), dtype=torch.float16).contiguous()
input = input / 100
start = time.perf_counter()
CPUInfer.submit(linear.forward, input.data_ptr(), output.data_ptr())
CPUInfer.sync()
end = time.perf_counter()
total_time += end - start
print('Time: ', total_time)
print('Iteration: ', test_iter)
print('Time per iteration: ', total_time / test_iter)
print('Bandwidth: ', input_size * output_size * 2 * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
print("All tasks completed.")
@ -6,7 +6,7 @@ Author : chenht2022
|
|||
Date : 2024-07-25 10:32:05
|
||||
Version : 1.0.0
|
||||
LastEditors : chenht2022
|
||||
LastEditTime : 2024-07-25 10:34:03
|
||||
LastEditTime : 2024-08-06 10:37:28
|
||||
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
'''
|
||||
import os, sys
|
||||
|
@ -15,20 +15,30 @@ sys.path.append(os.path.dirname(__file__) + '/../build')
|
|||
import cpuinfer_ext
|
||||
import torch
|
||||
|
||||
with torch.inference_mode(mode=True):
|
||||
hidden_size = 5120
|
||||
intermediate_size = 3072
|
||||
stride = 32
|
||||
group_max_len = 1024
|
||||
gate_type = 1 # ggml_type::GGML_TYPE_F16
|
||||
up_type = 1 # ggml_type::GGML_TYPE_F16
|
||||
down_type = 1 # ggml_type::GGML_TYPE_F16
|
||||
hidden_type = 1 # ggml_type::GGML_TYPE_F16
|
||||
qlen = 30
|
||||
layer_num = 10
|
||||
CPUInfer = cpuinfer_ext.CPUInfer(48)
|
||||
validation_iter = 100
|
||||
warm_up_iter = 1000
|
||||
test_iter = 10000
|
||||
|
||||
def act_fn(x):
|
||||
return x / (1.0 + torch.exp(-x))
|
||||
|
||||
def mlp_torch(input, gate_proj, up_proj, down_proj):
|
||||
gate_buf = torch.mm(input, gate_proj.t())
|
||||
up_buf = torch.mm(input, up_proj.t())
|
||||
intermediate = act_fn(gate_buf) * up_buf
|
||||
ret = torch.mm(intermediate, down_proj.t())
|
||||
return ret
|
||||
|
||||
with torch.inference_mode(mode=True):
|
||||
mlps = []
|
||||
gate_projs = []
|
||||
up_projs = []
|
||||
|
@ -37,7 +47,7 @@ with torch.inference_mode(mode=True):
|
|||
gate_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
|
||||
up_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
|
||||
down_proj = torch.randn((hidden_size, intermediate_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
|
||||
config = cpuinfer_ext.mlp.MLPConfig(hidden_size, intermediate_size, stride, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type)
|
||||
config = cpuinfer_ext.mlp.MLPConfig(hidden_size, intermediate_size, stride, group_max_len, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type)
|
||||
mlp = cpuinfer_ext.mlp.MLP(config)
|
||||
gate_projs.append(gate_proj)
|
||||
up_projs.append(up_proj)
|
||||
|
@ -47,52 +57,26 @@ with torch.inference_mode(mode=True):
|
|||
# validation
|
||||
for i in range(validation_iter):
|
||||
mlp = mlps[i % layer_num]
|
||||
input = torch.randn((1, hidden_size), dtype=torch.float16).contiguous()
|
||||
output = torch.empty((1, hidden_size), dtype=torch.float16).contiguous()
|
||||
input = torch.randn((qlen, hidden_size), dtype=torch.float16).contiguous()
|
||||
output = torch.empty((qlen, hidden_size), dtype=torch.float16).contiguous()
|
||||
input = input / 100
|
||||
|
||||
CPUInfer.submit(mlp.forward, input.data_ptr(), output.data_ptr())
|
||||
CPUInfer.submit(
|
||||
mlp.forward(
|
||||
qlen,
|
||||
input.data_ptr(),
|
||||
output.data_ptr()
|
||||
)
|
||||
)
|
||||
CPUInfer.sync()
|
||||
# print('cpuinfer output', output)
|
||||
|
||||
def act_fn(x):
|
||||
return x / (1.0 + torch.exp(-x))
|
||||
gate_proj = gate_projs[i%layer_num]
|
||||
up_proj = up_projs[i%layer_num]
|
||||
down_proj = down_projs[i%layer_num]
|
||||
gate_buf = torch.mm(input, gate_proj.t())
|
||||
up_buf = torch.mm(input, up_proj.t())
|
||||
intermediate = act_fn(gate_buf) * up_buf
|
||||
t_output = torch.mm(intermediate, down_proj.t())
|
||||
t_output = mlp_torch(input, gate_proj, up_proj, down_proj)
|
||||
# print('torch output', t_output)
|
||||
|
||||
diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
|
||||
print('diff = ', diff)
|
||||
assert(diff < 0.001)
|
||||
|
||||
# warm up
|
||||
for i in range(warm_up_iter):
|
||||
mlp = mlps[i % layer_num]
|
||||
input = torch.randn((1, hidden_size), dtype=torch.float16).contiguous()
|
||||
output = torch.empty((1, hidden_size), dtype=torch.float16).contiguous()
|
||||
input = input / 100
|
||||
CPUInfer.submit(mlp.forward, input.data_ptr(), output.data_ptr())
|
||||
CPUInfer.sync()
|
||||
|
||||
# test
|
||||
total_time = 0
|
||||
for i in range(test_iter):
|
||||
mlp = mlps[i % layer_num]
|
||||
input = torch.randn((1, hidden_size), dtype=torch.float16).contiguous()
|
||||
output = torch.empty((1, hidden_size), dtype=torch.float16).contiguous()
|
||||
input = input / 100
|
||||
start = time.time()
|
||||
CPUInfer.submit(mlp.forward, input.data_ptr(), output.data_ptr())
|
||||
CPUInfer.sync()
|
||||
end = time.time()
|
||||
total_time += end - start
|
||||
print('Time: ', total_time)
|
||||
print('Iteration: ', test_iter)
|
||||
print('Time per iteration: ', total_time / test_iter)
|
||||
print('Bandwidth: ', hidden_size * intermediate_size * 3 * 2 * test_iter / total_time / 1024 / 1024 / 1024, 'GB/s')
|
||||
print("All tasks completed.")
|
|
@ -6,7 +6,7 @@ Author : chenht2022
|
|||
Date : 2024-07-25 10:32:05
|
||||
Version : 1.0.0
|
||||
LastEditors : chenht2022
|
||||
LastEditTime : 2024-07-25 10:34:06
|
||||
LastEditTime : 2024-08-06 10:38:05
|
||||
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
'''
|
||||
import os, sys
|
||||
|
@ -15,8 +15,7 @@ sys.path.append(os.path.dirname(__file__) + '/../build')
|
|||
import cpuinfer_ext
|
||||
import torch
|
||||
|
||||
with torch.inference_mode(mode=True):
|
||||
expert_num = 10
|
||||
expert_num = 160
|
||||
hidden_size = 5120
|
||||
intermediate_size = 1536
|
||||
stride = 32
|
||||
|
@ -31,9 +30,49 @@ with torch.inference_mode(mode=True):
|
|||
layer_num = 10
|
||||
CPUInfer = cpuinfer_ext.CPUInfer(48)
|
||||
validation_iter = 100
|
||||
warm_up_iter = 1000
|
||||
test_iter = 10000
|
||||
|
||||
def act_fn(x):
|
||||
return x / (1.0 + torch.exp(-x))
|
||||
|
||||
def mlp_torch(input, gate_proj, up_proj, down_proj):
|
||||
gate_buf = torch.mm(input, gate_proj.t())
|
||||
up_buf = torch.mm(input, up_proj.t())
|
||||
intermediate = act_fn(gate_buf) * up_buf
|
||||
ret = torch.mm(intermediate, down_proj.t())
|
||||
return ret
|
||||
|
||||
def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
|
||||
cnts = expert_ids.new_zeros((expert_ids.shape[0], expert_num))
|
||||
cnts.scatter_(1, expert_ids, 1)
|
||||
tokens_per_expert = cnts.sum(dim=0)
|
||||
idxs = expert_ids.view(-1).argsort()
|
||||
sorted_tokens = input[idxs // expert_ids.shape[1]]
|
||||
|
||||
outputs = []
|
||||
start_idx = 0
|
||||
for i, num_tokens in enumerate(tokens_per_expert):
|
||||
end_idx = start_idx + num_tokens
|
||||
if num_tokens == 0:
|
||||
continue
|
||||
tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
|
||||
expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i])
|
||||
outputs.append(expert_out)
|
||||
start_idx = end_idx
|
||||
|
||||
outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)
|
||||
|
||||
new_x = torch.empty_like(outs)
|
||||
new_x[idxs] = outs
|
||||
t_output = (
|
||||
new_x.view(*expert_ids.shape, -1)
|
||||
.type(weights.dtype)
|
||||
.mul_(weights.unsqueeze(dim=-1))
|
||||
.sum(dim=1)
|
||||
.type(new_x.dtype)
|
||||
)
|
||||
return t_output
|
||||
|
||||
with torch.inference_mode(mode=True):
|
||||
moes = []
|
||||
gate_projs = []
|
||||
up_projs = []
|
||||
|
@ -51,63 +90,32 @@ with torch.inference_mode(mode=True):
|
|||
|
||||
# validation
|
||||
for i in range(validation_iter):
|
||||
moe = moes[i % layer_num]
|
||||
expert_ids = torch.randint(0, expert_num, (qlen, n_routed_experts), dtype=torch.int64).contiguous()
|
||||
expert_ids = torch.stack([torch.randperm(expert_num)[:n_routed_experts] for _ in range(qlen)]).contiguous()
|
||||
weights = torch.rand((qlen, n_routed_experts), dtype=torch.float32).contiguous()
|
||||
input = torch.randn((qlen, 1, hidden_size), dtype=torch.float16).contiguous()
|
||||
output = torch.empty((qlen, 1, hidden_size), dtype=torch.float16).contiguous()
|
||||
input = torch.randn((qlen, hidden_size), dtype=torch.float16).contiguous()
|
||||
output = torch.empty((qlen, hidden_size), dtype=torch.float16).contiguous()
|
||||
input = input / 100
|
||||
|
||||
CPUInfer.submit(moe.forward, qlen, n_routed_experts, expert_ids.data_ptr(), weights.data_ptr(), input.data_ptr(), output.data_ptr())
|
||||
moe = moes[i % layer_num]
|
||||
CPUInfer.submit(
|
||||
moe.forward(
|
||||
qlen,
|
||||
n_routed_experts,
|
||||
expert_ids.data_ptr(),
|
||||
weights.data_ptr(),
|
||||
input.data_ptr(),
|
||||
output.data_ptr()
|
||||
)
|
||||
)
|
||||
CPUInfer.sync()
|
||||
# print('cpuinfer output', output)
|
||||
|
||||
def act_fn(x):
|
||||
return x / (1.0 + torch.exp(-x))
|
||||
t_output = torch.zeros((qlen, 1, hidden_size), dtype=torch.float32).contiguous()
|
||||
gate_proj = gate_projs[i%layer_num]
|
||||
up_proj = up_projs[i%layer_num]
|
||||
down_proj = down_projs[i%layer_num]
|
||||
for token_idx in range(qlen):
|
||||
for i, expert_id in enumerate(expert_ids[token_idx]):
|
||||
gate_buf = torch.mm(input[token_idx], gate_proj[expert_id].t())
|
||||
up_buf = torch.mm(input[token_idx], up_proj[expert_id].t())
|
||||
intermediate = act_fn(gate_buf) * up_buf
|
||||
expert_output = torch.mm(intermediate, down_proj[expert_id].t())
|
||||
t_output[token_idx] += weights[token_idx][i] * expert_output
|
||||
t_output = moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj)
|
||||
# print('torch output', t_output)
|
||||
|
||||
diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
|
||||
print('diff = ', diff)
|
||||
assert(diff < 0.001)
|
||||
|
||||
# warm up
|
||||
for i in range(warm_up_iter):
|
||||
moe = moes[i % layer_num]
|
||||
expert_ids = torch.randint(0, expert_num, (qlen, n_routed_experts), dtype=torch.int64).contiguous()
|
||||
weights = torch.rand((qlen, n_routed_experts), dtype=torch.float32).contiguous()
|
||||
input = torch.randn((qlen, hidden_size), dtype=torch.float16).contiguous()
|
||||
output = torch.empty((qlen, hidden_size), dtype=torch.float16).contiguous()
|
||||
input = input / 100
|
||||
CPUInfer.submit(moe.forward, qlen, n_routed_experts, expert_ids.data_ptr(), weights.data_ptr(), input.data_ptr(), output.data_ptr())
|
||||
CPUInfer.sync()
|
||||
|
||||
# test
|
||||
total_time = 0
|
||||
for i in range(test_iter):
|
||||
moe = moes[i % layer_num]
|
||||
expert_ids = torch.randint(0, expert_num, (qlen, n_routed_experts), dtype=torch.int64).contiguous()
|
||||
weights = torch.rand((qlen, n_routed_experts), dtype=torch.float32).contiguous()
|
||||
input = torch.randn((qlen, hidden_size), dtype=torch.float16).contiguous()
|
||||
output = torch.empty((qlen, hidden_size), dtype=torch.float16).contiguous()
|
||||
input = input / 100
|
||||
start = time.perf_counter()
|
||||
CPUInfer.submit(moe.forward, qlen, n_routed_experts, expert_ids.data_ptr(), weights.data_ptr(), input.data_ptr(), output.data_ptr())
|
||||
CPUInfer.sync()
|
||||
end = time.perf_counter()
|
||||
total_time += end - start
|
||||
print('Time: ', total_time)
|
||||
print('Iteration: ', test_iter)
|
||||
print('Time per iteration: ', total_time / test_iter)
|
||||
print('Bandwidth: ', hidden_size * intermediate_size * 3 * n_routed_experts * 2 * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
|
||||
print("All tasks completed.")
|
|
@ -4,7 +4,7 @@
|
|||
* @Date : 2024-07-22 02:03:22
|
||||
* @Version : 1.0.0
|
||||
* @LastEditors : chenht2022
|
||||
* @LastEditTime : 2024-07-25 10:34:23
|
||||
* @LastEditTime : 2024-08-07 10:39:37
|
||||
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
**/
|
||||
// Python bindings
|
||||
|
@ -12,7 +12,6 @@
|
|||
#include <iostream>
|
||||
#include <memory>
|
||||
#include "cpu_backend/cpuinfer.h"
|
||||
#include "cuda_runtime.h"
|
||||
#include "device_launch_parameters.h"
|
||||
#include "llamafile/flags.h"
|
||||
#include "operators/llamafile/linear.h"
|
||||
|
@ -26,239 +25,155 @@
|
|||
namespace py = pybind11;
|
||||
using namespace pybind11::literals;
|
||||
|
||||
// Binding functions for the Linear class
|
||||
class LinearBindings {
|
||||
public:
|
||||
static void bind_forward(CPUInfer& cpuinfer, Linear* linear, py::args args, py::kwargs kwargs) {
|
||||
auto input = args[0].cast<intptr_t>();
|
||||
auto output = args[1].cast<intptr_t>();
|
||||
cpuinfer.submit(&Linear::forward, linear,
|
||||
(const void*)input, (void*)output);
|
||||
}
|
||||
|
||||
static void bind_warm_up(CPUInfer& cpuinfer, Linear* linear, py::args args, py::kwargs kwargs) {
|
||||
cpuinfer.submit(&Linear::warm_up, linear);
|
||||
}
|
||||
|
||||
static void bind_functions(CPUInfer& cpuinfer, py::object func, py::args args, py::kwargs kwargs) {
|
||||
auto linear = func.attr("__self__").cast<Linear*>();
|
||||
std::string func_name = py::str(func.attr("__func__").attr("__name__"));
|
||||
|
||||
if (func_name == "forward") {
|
||||
bind_forward(cpuinfer, linear, args, kwargs);
|
||||
} else if (func_name == "warm_up") {
|
||||
bind_warm_up(cpuinfer, linear, args, kwargs);
|
||||
} else {
|
||||
throw py::value_error("Unsupported function: " +
|
||||
std::string(func_name));
|
||||
class WarmUpBindinds {
|
||||
public:
|
||||
struct Args {
|
||||
CPUInfer* cpuinfer;
|
||||
Linear* linear;
|
||||
};
|
||||
static void inner(void* args) {
|
||||
Args* args_ = (Args*)args;
|
||||
args_->cpuinfer->enqueue(&Linear::warm_up, args_->linear);
|
||||
}
|
||||
static std::pair<intptr_t, intptr_t> interface(Linear& linear) {
|
||||
Args* args = new Args{nullptr, &linear};
|
||||
return std::make_pair((intptr_t)&inner, (intptr_t)args);
|
||||
}
|
||||
};
|
||||
class ForwardBindings {
|
||||
public:
|
||||
struct Args {
|
||||
CPUInfer* cpuinfer;
|
||||
Linear* linear;
|
||||
int qlen;
|
||||
const void* input;
|
||||
void* output;
|
||||
};
|
||||
static void inner(void* args) {
|
||||
Args* args_ = (Args*)args;
|
||||
args_->cpuinfer->enqueue(&Linear::forward, args_->linear, args_->qlen, args_->input, args_->output);
|
||||
}
|
||||
static std::pair<intptr_t, intptr_t> interface(Linear& linear, int qlen, intptr_t input, intptr_t output) {
|
||||
Args* args = new Args{nullptr, &linear, qlen, (const void*)input, (void*)output};
|
||||
return std::make_pair((intptr_t)&inner, (intptr_t)args);
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
// Binding functions for the MLP class
|
||||
class MLPBindings {
|
||||
public:
|
||||
static void bind_forward(CPUInfer& cpuinfer, MLP* mlp, py::args args, py::kwargs kwargs) {
|
||||
auto input = args[0].cast<intptr_t>();
|
||||
auto output = args[1].cast<intptr_t>();
|
||||
cpuinfer.submit(&MLP::forward, mlp,
|
||||
(const void*)input, (void*)output);
|
||||
}
|
||||
|
||||
static void bind_warm_up(CPUInfer& cpuinfer, MLP* mlp, py::args args, py::kwargs kwargs) {
|
||||
cpuinfer.submit(&MLP::warm_up, mlp);
|
||||
}
|
||||
|
||||
static void bind_functions(CPUInfer& cpuinfer, py::object func, py::args args, py::kwargs kwargs) {
|
||||
auto mlp = func.attr("__self__").cast<MLP*>();
|
||||
std::string func_name = py::str(func.attr("__func__").attr("__name__"));
|
||||
|
||||
if (func_name == "forward") {
|
||||
bind_forward(cpuinfer, mlp, args, kwargs);
|
||||
} else if (func_name == "warm_up") {
|
||||
bind_warm_up(cpuinfer, mlp, args, kwargs);
|
||||
} else {
|
||||
throw py::value_error("Unsupported function: " +
|
||||
std::string(func_name));
|
||||
class WarmUpBindinds {
|
||||
public:
|
||||
struct Args {
|
||||
CPUInfer* cpuinfer;
|
||||
MLP* mlp;
|
||||
};
|
||||
static void inner(void* args) {
|
||||
Args* args_ = (Args*)args;
|
||||
args_->cpuinfer->enqueue(&MLP::warm_up, args_->mlp);
|
||||
}
|
||||
static std::pair<intptr_t, intptr_t> interface(MLP& mlp) {
|
||||
Args* args = new Args{nullptr, &mlp};
|
||||
return std::make_pair((intptr_t)&inner, (intptr_t)args);
|
||||
}
|
||||
};
|
||||
class ForwardBindings {
|
||||
public:
|
||||
struct Args {
|
||||
CPUInfer* cpuinfer;
|
||||
MLP* mlp;
|
||||
int qlen;
|
||||
const void* input;
|
||||
void* output;
|
||||
};
|
||||
static void inner(void* args) {
|
||||
Args* args_ = (Args*)args;
|
||||
args_->cpuinfer->enqueue(&MLP::forward, args_->mlp, args_->qlen, args_->input, args_->output);
|
||||
}
|
||||
static std::pair<intptr_t, intptr_t> interface(MLP& mlp, int qlen, intptr_t input, intptr_t output) {
|
||||
Args* args = new Args{nullptr, &mlp, qlen, (const void*)input, (void*)output};
|
||||
return std::make_pair((intptr_t)&inner, (intptr_t)args);
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
// Binding functions for the MOE class
|
||||
class MOEBindings {
|
||||
public:
|
||||
static void bind_forward(CPUInfer& cpuinfer, MOE* moe, py::args args, py::kwargs kwargs) {
|
||||
int qlen = args[0].cast<int>();
|
||||
int k = args[1].cast<int>();
|
||||
auto expert_ids = args[2].cast<intptr_t>();
|
||||
auto weights = args[3].cast<intptr_t>();
|
||||
auto input = args[4].cast<intptr_t>();
|
||||
auto output = args[5].cast<intptr_t>();
|
||||
cpuinfer.submit(&MOE::forward, moe,
|
||||
qlen, k, (const uint64_t*)expert_ids, (const float*)weights, (const void*)input, (void*)output);
|
||||
}
|
||||
|
||||
static void bind_warm_up(CPUInfer& cpuinfer, MOE* moe, py::args args, py::kwargs kwargs) {
|
||||
cpuinfer.submit(&MOE::warm_up, moe);
|
||||
}
|
||||
|
||||
static void bind_functions(CPUInfer& cpuinfer, py::object func, py::args args, py::kwargs kwargs) {
|
||||
auto moe = func.attr("__self__").cast<MOE*>();
|
||||
std::string func_name = py::str(func.attr("__func__").attr("__name__"));
|
||||
|
||||
if (func_name == "forward") {
|
||||
bind_forward(cpuinfer, moe, args, kwargs);
|
||||
} else if (func_name == "warm_up") {
|
||||
bind_warm_up(cpuinfer, moe, args, kwargs);
|
||||
} else {
|
||||
throw py::value_error("Unsupported function: " +
|
||||
std::string(func_name));
|
||||
class WarmUpBindinds {
|
||||
public:
|
||||
struct Args {
|
||||
CPUInfer* cpuinfer;
|
||||
MOE* moe;
|
||||
};
|
||||
static void inner(void* args) {
|
||||
Args* args_ = (Args*)args;
|
||||
args_->cpuinfer->enqueue(&MOE::warm_up, args_->moe);
|
||||
}
|
||||
static std::pair<intptr_t, intptr_t> interface(MOE& moe) {
|
||||
Args* args = new Args{nullptr, &moe};
|
||||
return std::make_pair((intptr_t)&inner, (intptr_t)args);
|
||||
}
|
||||
};
|
||||
|
||||
struct MOEForwardArgs {
|
||||
class ForwardBindings {
|
||||
public:
|
||||
struct Args {
|
||||
CPUInfer* cpuinfer;
|
||||
MOE* moe;
|
||||
int qlen;
|
||||
int k;
|
||||
uint64_t* expert_ids;
|
||||
float* weights;
|
||||
void* input;
|
||||
const uint64_t* expert_ids;
|
||||
const float* weights;
|
||||
const void* input;
|
||||
void* output;
|
||||
};
|
||||
|
||||
void submit_moe_forward_with_host_args_ptr(void* host_args_ptr) {
|
||||
MOEForwardArgs* host_args = (MOEForwardArgs*)host_args_ptr;
|
||||
host_args->cpuinfer->submit(&MOE::forward, host_args->moe,
|
||||
host_args->qlen, host_args->k, host_args->expert_ids, host_args->weights, host_args->input, host_args->output);
|
||||
static void inner(void* args) {
|
||||
Args* args_ = (Args*)args;
|
||||
args_->cpuinfer->enqueue(&MOE::forward, args_->moe, args_->qlen, args_->k, args_->expert_ids, args_->weights, args_->input, args_->output);
|
||||
}
|
||||
|
||||
void cpuinfer_sync(void* host_args_ptr) {
|
||||
CPUInfer* cpuinfer = (CPUInfer*)host_args_ptr;
|
||||
cpuinfer->sync();
|
||||
static std::pair<intptr_t, intptr_t> interface(MOE& moe, int qlen, int k, intptr_t expert_ids, intptr_t weights, intptr_t input, intptr_t output) {
|
||||
Args* args = new Args{nullptr, &moe, qlen, k, (const uint64_t*)expert_ids, (const float*)weights, (const void*)input, (void*)output};
|
||||
return std::make_pair((intptr_t)&inner, (intptr_t)args);
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
PYBIND11_MODULE(cpuinfer_ext, m) {
|
||||
py::class_<CPUInfer>(m, "CPUInfer")
|
||||
.def(py::init<int>())
|
||||
.def("submit", &CPUInfer::submit)
|
||||
.def("submit_with_cuda_stream", &CPUInfer::submit_with_cuda_stream)
|
||||
.def("sync", &CPUInfer::sync)
|
||||
.def("sync_with_cuda_stream", &CPUInfer::sync_with_cuda_stream);
|
||||
|
||||
auto linear_module = m.def_submodule("linear");
|
||||
|
||||
py::class_<LinearConfig>(linear_module, "LinearConfig")
|
||||
.def(py::init([](int hidden_size, int intermediate_size, int stride, intptr_t proj, int proj_type, int hidden_type) {
|
||||
return LinearConfig(hidden_size, intermediate_size, stride, (void*)proj, (ggml_type)proj_type, (ggml_type)hidden_type);
|
||||
.def(py::init([](int hidden_size, int intermediate_size, int stride, int group_max_len, intptr_t proj, int proj_type, int hidden_type) {
|
||||
return LinearConfig(hidden_size, intermediate_size, stride, group_max_len, (void*)proj, (ggml_type)proj_type, (ggml_type)hidden_type);
|
||||
}));
|
||||
|
||||
py::class_<Linear>(linear_module, "Linear")
|
||||
.def(py::init<LinearConfig>())
|
||||
.def("warm_up", [](Linear& linear) {
|
||||
throw std::runtime_error("!!! Doing nothing, please use CPUInfer.submit to call it!!!\n");
|
||||
})
|
||||
.def("forward", [](Linear& linear, intptr_t input, intptr_t output) {
|
||||
throw std::runtime_error("!!! Doing nothing, please use CPUInfer.submit to call it!!!\n");
|
||||
});
|
||||
.def("warm_up", &LinearBindings::WarmUpBindinds::interface)
|
||||
.def("forward", &LinearBindings::ForwardBindings::interface);
|
||||
|
||||
auto mlp_module = m.def_submodule("mlp");
|
||||
|
||||
py::class_<MLPConfig>(mlp_module, "MLPConfig")
|
||||
.def(py::init([](int hidden_size, int intermediate_size, int stride, intptr_t gate_proj, intptr_t up_proj, intptr_t down_proj, int gate_type, int up_type, int down_type, int hidden_type) {
|
||||
return MLPConfig(hidden_size, intermediate_size, stride, (void*)gate_proj, (void*)up_proj, (void*)down_proj, (ggml_type)gate_type, (ggml_type)up_type, (ggml_type)down_type, (ggml_type)hidden_type);
|
||||
.def(py::init([](int hidden_size, int intermediate_size, int stride, int group_max_len, intptr_t gate_proj, intptr_t up_proj, intptr_t down_proj, int gate_type, int up_type, int down_type, int hidden_type) {
|
||||
return MLPConfig(hidden_size, intermediate_size, stride, group_max_len, (void*)gate_proj, (void*)up_proj, (void*)down_proj, (ggml_type)gate_type, (ggml_type)up_type, (ggml_type)down_type, (ggml_type)hidden_type);
|
||||
}));
|
||||
|
||||
py::class_<MLP>(mlp_module, "MLP")
|
||||
.def(py::init<MLPConfig>())
|
||||
.def("warm_up", [](MLP& mlp) {
|
||||
throw std::runtime_error("!!! Doing nothing, please use CPUInfer.submit to call it!!!\n");
|
||||
})
|
||||
.def("forward", [](MLP& mlp, intptr_t input, intptr_t output) {
|
||||
throw std::runtime_error("!!! Doing nothing, please use CPUInfer.submit to call it!!!\n");
|
||||
});
|
||||
.def("warm_up", &MLPBindings::WarmUpBindinds::interface)
|
||||
.def("forward", &MLPBindings::ForwardBindings::interface);
|
||||
|
||||
auto moe_module = m.def_submodule("moe");
|
||||
|
||||
py::class_<MOEConfig>(moe_module, "MOEConfig")
|
||||
.def(py::init([](int expert_num, int routed_expert_num, int hidden_size, int intermediate_size, int stride, int group_min_len, int group_max_len, intptr_t gate_proj, intptr_t up_proj, intptr_t down_proj, int gate_type, int up_type, int down_type, int hidden_type) {
|
||||
return MOEConfig(expert_num, routed_expert_num, hidden_size, intermediate_size, stride, group_min_len, group_max_len, (void*)gate_proj, (void*)up_proj, (void*)down_proj, (ggml_type)gate_type, (ggml_type)up_type, (ggml_type)down_type, (ggml_type)hidden_type);
|
||||
}));
|
||||
|
||||
py::class_<MOE>(moe_module, "MOE")
|
||||
.def(py::init<MOEConfig>())
|
||||
.def("warm_up", [](MOE& moe) {
|
||||
throw std::runtime_error("!!! Doing nothing, please use CPUInfer.submit to call it!!!\n");
|
||||
})
|
||||
.def("forward", [](MOE& moe, int k, uint64_t expert_ids, intptr_t weights, intptr_t input, intptr_t output) {
|
||||
throw std::runtime_error("!!! Doing nothing, please use CPUInfer.submit to call it!!!\n");
|
||||
});
|
||||
|
||||
py::class_<CPUInfer>(m, "CPUInfer")
|
||||
.def(py::init<int>())
|
||||
.def("submit",
|
||||
[linear_module, mlp_module, moe_module](CPUInfer& cpuinfer, py::object func, py::args args, py::kwargs kwargs) {
|
||||
if (py::hasattr(func, "__self__") &&
|
||||
py::hasattr(func, "__func__")) {
|
||||
std::string class_name = py::str(func.attr("__self__")
|
||||
.attr("__class__")
|
||||
.attr("__name__"));
|
||||
if (class_name == "Linear") {
|
||||
LinearBindings::bind_functions(cpuinfer, func,
|
||||
args, kwargs);
|
||||
} else if (class_name == "MLP") {
|
||||
MLPBindings::bind_functions(cpuinfer, func,
|
||||
args, kwargs);
|
||||
} else if (class_name == "MOE") {
|
||||
MOEBindings::bind_functions(cpuinfer, func,
|
||||
args, kwargs);
|
||||
} else {
|
||||
// handle other classes
|
||||
throw py::type_error("Unsupported class type: " +
|
||||
class_name);
|
||||
}
|
||||
} else {
|
||||
// handle cases where func does not have __self__ or
|
||||
// __func__
|
||||
throw py::type_error(
|
||||
"Invalid function object: missing "
|
||||
"__self__ or __func__ attribute.");
|
||||
}
|
||||
})
|
||||
.def("submit_with_cuda_stream",
|
||||
[linear_module, mlp_module, moe_module](CPUInfer& cpuinfer, intptr_t user_cuda_stream, py::object func, py::args args, py::kwargs kwargs) {
|
||||
if (py::hasattr(func, "__self__") &&
|
||||
py::hasattr(func, "__func__")) {
|
||||
std::string class_name = py::str(func.attr("__self__")
|
||||
.attr("__class__")
|
||||
.attr("__name__"));
|
||||
if (class_name == "MOE") {
|
||||
std::string func_name = py::str(func.attr("__func__").attr("__name__"));
|
||||
if (func_name == "forward") {
|
||||
auto moe = func.attr("__self__").cast<MOE*>();
|
||||
int qlen = args[0].cast<int>();
|
||||
int k = args[1].cast<int>();
|
||||
auto expert_ids = args[2].cast<intptr_t>();
|
||||
auto weights = args[3].cast<intptr_t>();
|
||||
auto input = args[4].cast<intptr_t>();
|
||||
auto output = args[5].cast<intptr_t>();
|
||||
MOEForwardArgs* moe_forward_args = new MOEForwardArgs{&cpuinfer, moe, qlen, k, (uint64_t*)expert_ids, (float*)weights, (void*)input, (void*)output};
|
||||
// submit_moe_forward_with_host_args_ptr(moe_forward_args);
|
||||
cudaLaunchHostFunc((cudaStream_t)user_cuda_stream, (cudaHostFn_t)submit_moe_forward_with_host_args_ptr, moe_forward_args);
|
||||
} else {
|
||||
throw py::value_error("Unsupported function: " +
|
||||
std::string(func_name));
|
||||
}
|
||||
} else {
|
||||
// handle other classes
|
||||
throw py::type_error("Unsupported class type: " +
|
||||
class_name);
|
||||
}
|
||||
} else {
|
||||
// handle cases where func does not have __self__ or
|
||||
// __func__
|
||||
throw py::type_error(
|
||||
"Invalid function object: missing "
|
||||
"__self__ or __func__ attribute.");
|
||||
}
|
||||
})
|
||||
.def("sync_with_cuda_stream", [](CPUInfer& cpuinfer, intptr_t user_cuda_stream) {
|
||||
// cpuinfer_sync((void*)(&cpuinfer));
|
||||
cudaLaunchHostFunc((cudaStream_t)user_cuda_stream, (cudaHostFn_t)cpuinfer_sync, (void*)(&cpuinfer));
|
||||
})
|
||||
.def("sync", &CPUInfer::sync);
|
||||
.def("warm_up", &MOEBindings::WarmUpBindinds::interface)
|
||||
.def("forward", &MOEBindings::ForwardBindings::interface);
|
||||
}
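A note on the binding refactor above: the removed submit lambda inspected the bound method's owning class at runtime and routed it to LinearBindings, MLPBindings or MOEBindings, while the new bindings register warm_up and forward through per-operator interface objects, so submit only ever receives a prepacked task. A minimal Python sketch of the old dispatch logic, for orientation only; dispatch_submit and the handlers dict are hypothetical names, not part of cpuinfer_ext:

# Sketch of the class-name dispatch the old C++ submit lambda performed.
# dispatch_submit and handlers are illustrative stand-ins, not real API.
def dispatch_submit(handlers, func, *args, **kwargs):
    owner = getattr(func, "__self__", None)
    raw = getattr(func, "__func__", None)
    if owner is None or raw is None:
        raise TypeError("Invalid function object: missing __self__ or __func__ attribute.")
    class_name = type(owner).__name__          # "Linear", "MLP" or "MOE"
    if class_name not in handlers:
        raise TypeError("Unsupported class type: " + class_name)
    handlers[class_name](owner, raw.__name__, args, kwargs)

With the interface-based bindings, callers instead pass the value returned by op.forward(...) directly into CPUInfer.submit, as the Python operator changes at the end of this diff show.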
@ -13,9 +13,15 @@ Linear::Linear(LinearConfig config) {
|
|||
config_ = config;
|
||||
proj_ = config_.proj;
|
||||
|
||||
input_fp32_.resize(config_.input_size);
|
||||
proj_input_.resize(config_.input_size * 4);
|
||||
proj_output_.resize(config_.output_size);
|
||||
std::vector<std::pair<void**, uint64_t>> mem_requests;
|
||||
mem_requests.push_back({(void**)&input_fp32_, sizeof(float) * config_.group_max_len * config_.input_size});
|
||||
mem_requests.push_back({(void**)&proj_input_, config_.group_max_len * config_.input_size * ggml_type_size(ggml_internal_get_type_traits(config_.proj_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.proj_type).vec_dot_type)});
|
||||
mem_requests.push_back({(void**)&proj_output_, sizeof(float) * config_.group_max_len * config_.output_size});
|
||||
shared_mem_buffer.alloc(this, mem_requests);
|
||||
}
|
||||
|
||||
Linear::~Linear() {
|
||||
shared_mem_buffer.dealloc(this);
|
||||
}
|
||||
|
||||
void Linear::warm_up(Backend* backend) {
|
||||
|
@ -26,22 +32,42 @@ void Linear::warm_up(Backend* backend) {
|
|||
input_fp32[i] = 0;
|
||||
}
|
||||
from_float(input_fp32.data(), input.data(), config_.input_size, config_.hidden_type);
|
||||
forward(input.data(), output.data(), backend);
|
||||
forward_many(1, input.data(), output.data(), backend);
|
||||
}
|
||||
|
||||
void Linear::forward(const void* input, void* output, Backend* backend) {
|
||||
void Linear::forward_many(int qlen, const void* input, void* output, Backend* backend) {
|
||||
const void* proj_input_ptr;
|
||||
if (config_.hidden_type == ggml_internal_get_type_traits(config_.proj_type).vec_dot_type) {
|
||||
proj_input_ptr = input;
|
||||
} else {
|
||||
to_float(input, input_fp32_.data(), config_.input_size, config_.hidden_type);
|
||||
from_float(input_fp32_.data(), proj_input_.data(), config_.input_size, ggml_internal_get_type_traits(config_.proj_type).vec_dot_type);
|
||||
proj_input_ptr = proj_input_.data();
|
||||
to_float(input, input_fp32_, qlen * config_.input_size, config_.hidden_type);
|
||||
from_float(input_fp32_, proj_input_, qlen * config_.input_size, ggml_internal_get_type_traits(config_.proj_type).vec_dot_type);
|
||||
proj_input_ptr = proj_input_;
|
||||
}
|
||||
int nth = config_.output_size / config_.stride;
|
||||
backend->do_work_stealing_job(nth, [&](int task_id) {
|
||||
int ith = task_id % nth;
|
||||
llamafile_sgemm(config_.output_size, 1, config_.input_size / ggml_blck_size(config_.proj_type), proj_, config_.input_size / ggml_blck_size(config_.proj_type), proj_input_ptr, config_.input_size / ggml_blck_size(config_.proj_type), proj_output_.data(), config_.output_size, ith, nth, GGML_TASK_TYPE_COMPUTE, config_.proj_type, ggml_internal_get_type_traits(config_.proj_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
|
||||
});
|
||||
from_float(proj_output_.data(), output, config_.output_size, config_.hidden_type);
|
||||
int ith = task_id;
|
||||
void* proj_ptr = proj_ + ith * config_.stride * config_.input_size * ggml_type_size(config_.proj_type) / ggml_blck_size(config_.proj_type);
|
||||
float* proj_output_ptr = proj_output_ + ith * config_.stride;
|
||||
llamafile_sgemm(config_.stride, qlen, config_.input_size / ggml_blck_size(config_.proj_type), proj_ptr, config_.input_size / ggml_blck_size(config_.proj_type), proj_input_ptr, config_.input_size / ggml_blck_size(config_.proj_type), proj_output_ptr, config_.output_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.proj_type, ggml_internal_get_type_traits(config_.proj_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
|
||||
if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) {
|
||||
for (int i = 0; i < qlen; i++) {
|
||||
float* output_fp32_ptr = proj_output_ + i * config_.output_size + ith * config_.stride;
|
||||
void* output_ptr = output + i * config_.output_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type) + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
|
||||
from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
|
||||
}
|
||||
}
|
||||
});
|
||||
if (config_.stride % ggml_blck_size(config_.hidden_type) != 0) {
|
||||
from_float(proj_output_, output, qlen * config_.output_size, config_.hidden_type);
|
||||
}
|
||||
}
|
||||
|
||||
void Linear::forward(int qlen, const void* input, void* output, Backend* backend) {
|
||||
if (qlen <= 0) {
|
||||
return;
|
||||
}
|
||||
int forward_len = std::min(qlen, config_.group_max_len);
|
||||
forward_many(forward_len, input, output, backend);
|
||||
forward(qlen - forward_len, input + forward_len * config_.input_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), output + forward_len * config_.output_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
|
||||
}
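The new forward wrapper above is tail-recursive: it handles min(qlen, group_max_len) rows via forward_many, then advances the input and output pointers by that many rows of the hidden type before recursing on the remainder. A rough iterative Python rendering of the same splitting, assuming bytes-like buffers and caller-supplied per-row byte strides (all names here are illustrative):

# Iterative equivalent of the tail-recursive Linear::forward / MLP::forward chunking.
def forward_chunked(forward_many, qlen, in_buf, out_buf,
                    group_max_len, in_row_bytes, out_row_bytes):
    in_view, out_view = memoryview(in_buf), memoryview(out_buf)
    off_in = off_out = 0
    while qlen > 0:
        n = min(qlen, group_max_len)           # forward_len in the C++ code
        forward_many(n, in_view[off_in:], out_view[off_out:])
        off_in += n * in_row_bytes             # input + forward_len * bytes per row
        off_out += n * out_row_bytes
        qlen -= n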
@ -22,34 +22,38 @@
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"
#include "shared_mem_buffer.h"

struct LinearConfig {
    int input_size;
    int output_size;
    int stride;
    int group_max_len;
    void* proj;
    ggml_type proj_type;
    ggml_type hidden_type;

    LinearConfig() {}

    LinearConfig(int input_size, int output_size, int stride, void* proj, ggml_type proj_type, ggml_type hidden_type)
        : input_size(input_size), output_size(output_size), stride(stride), proj(proj), proj_type(proj_type), hidden_type(hidden_type) {}
    LinearConfig(int input_size, int output_size, int stride, int group_max_len, void* proj, ggml_type proj_type, ggml_type hidden_type)
        : input_size(input_size), output_size(output_size), stride(stride), group_max_len(group_max_len), proj(proj), proj_type(proj_type), hidden_type(hidden_type) {}
};

class Linear {
   public:
    Linear(LinearConfig);
    ~Linear();
    void warm_up(Backend* backend);
    void forward(const void* input, void* output, Backend* backend);
    void forward_many(int qlen, const void* input, void* output, Backend* backend);
    void forward(int qlen, const void* input, void* output, Backend* backend);

   private:
    LinearConfig config_;
    void* proj_;  // [output_size * input_size ( /32 if quantized)]

    std::vector<float> input_fp32_;    // [input_size]
    std::vector<uint8_t> proj_input_;  // [input_size * 4]
    std::vector<float> proj_output_;   // [output_size]
    float* input_fp32_;    // [group_max_len * input_size]
    uint8_t* proj_input_;  // [group_max_len * input_size * ggml_type_size(ggml_internal_get_type_traits(proj_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(proj_type).vec_dot_type)]
    float* proj_output_;   // [group_max_len * output_size]
};

#endif
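The pointer comments above double as the sizes each Linear instance requests from the shared buffer in its constructor. A small sketch of that arithmetic, assuming a Q8_0-style vec_dot_type (34-byte blocks of 32 values) purely for illustration; substitute the traits of your actual proj_type:

# Byte counts behind the Linear buffer comments; the type parameters are example values.
def linear_buffer_sizes(group_max_len, input_size, output_size,
                        vec_dot_type_size=34, vec_dot_blck_size=32):
    input_fp32 = 4 * group_max_len * input_size                                     # float32 staging
    proj_input = group_max_len * input_size * vec_dot_type_size // vec_dot_blck_size
    proj_output = 4 * group_max_len * output_size                                   # float32 results
    return input_fp32, proj_input, proj_output

print(linear_buffer_sizes(group_max_len=128, input_size=4096, output_size=11008))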
@ -15,14 +15,20 @@ MLP::MLP(MLPConfig config) {
|
|||
up_proj_ = config_.up_proj;
|
||||
down_proj_ = config_.down_proj;
|
||||
|
||||
input_fp32_.resize(config_.hidden_size);
|
||||
gate_input_.resize(config_.hidden_size * 4);
|
||||
up_input_.resize(config_.hidden_size * 4);
|
||||
gate_output_.resize(config_.intermediate_size);
|
||||
up_output_.resize(config_.intermediate_size);
|
||||
intermediate_fp32_.resize(config_.intermediate_size);
|
||||
down_input_.resize(config_.intermediate_size * 4);
|
||||
down_output_.resize(config_.hidden_size);
|
||||
std::vector<std::pair<void**, uint64_t>> mem_requests;
|
||||
mem_requests.push_back({(void**)&input_fp32_, sizeof(float) * config_.group_max_len * config_.hidden_size});
|
||||
mem_requests.push_back({(void**)&gate_input_, config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type)});
|
||||
mem_requests.push_back({(void**)&up_input_, config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type)});
|
||||
mem_requests.push_back({(void**)&gate_output_, sizeof(float) * config_.group_max_len * config_.intermediate_size});
|
||||
mem_requests.push_back({(void**)&up_output_, sizeof(float) * config_.group_max_len * config_.intermediate_size});
|
||||
mem_requests.push_back({(void**)&intermediate_fp32_, sizeof(float) * config_.group_max_len * config_.intermediate_size});
|
||||
mem_requests.push_back({(void**)&down_input_, config_.group_max_len * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type)});
|
||||
mem_requests.push_back({(void**)&down_output_, sizeof(float) * config_.group_max_len * config_.hidden_size});
|
||||
shared_mem_buffer.alloc(this, mem_requests);
|
||||
}
|
||||
|
||||
MLP::~MLP() {
|
||||
shared_mem_buffer.dealloc(this);
|
||||
}
|
||||
|
||||
void MLP::warm_up(Backend* backend) {
|
||||
|
@ -33,33 +39,33 @@ void MLP::warm_up(Backend* backend) {
|
|||
input_fp32[i] = 0;
|
||||
}
|
||||
from_float(input_fp32.data(), input.data(), config_.hidden_size, config_.hidden_type);
|
||||
forward(input.data(), output.data(), backend);
|
||||
forward_many(1, input.data(), output.data(), backend);
|
||||
}
|
||||
|
||||
static float act_fn(float x) {
|
||||
return x / (1.0f + expf(-x));
|
||||
}
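act_fn above is SiLU, and forward_many multiplies it elementwise against the up projection before the result is quantized for the down projection. A one-function NumPy sketch of that combine step over a [qlen, intermediate_size] block (NumPy is used only to illustrate the math):

import numpy as np

def silu_gate(gate_output, up_output):
    # intermediate[i, j] = act_fn(gate[i, j]) * up[i, j], as in the inner loop of forward_many
    return (gate_output / (1.0 + np.exp(-gate_output))) * up_output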
void MLP::forward(const void* input, void* output, Backend* backend) {
|
||||
void MLP::forward_many(int qlen, const void* input, void* output, Backend* backend) {
|
||||
const void* gate_input_ptr;
|
||||
const void* up_input_ptr;
|
||||
if (config_.hidden_type == ggml_internal_get_type_traits(config_.gate_type).vec_dot_type && config_.hidden_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
|
||||
gate_input_ptr = up_input_ptr = input;
|
||||
} else {
|
||||
to_float(input, input_fp32_.data(), config_.hidden_size, config_.hidden_type);
|
||||
to_float(input, input_fp32_, qlen * config_.hidden_size, config_.hidden_type);
|
||||
if (ggml_internal_get_type_traits(config_.gate_type).vec_dot_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
|
||||
from_float(input_fp32_.data(), gate_input_.data(), config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
|
||||
gate_input_ptr = up_input_ptr = gate_input_.data();
|
||||
from_float(input_fp32_, gate_input_, qlen * config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
|
||||
gate_input_ptr = up_input_ptr = gate_input_;
|
||||
} else {
|
||||
if (config_.hidden_type != ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) {
|
||||
from_float(input_fp32_.data(), gate_input_.data(), config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
|
||||
gate_input_ptr = gate_input_.data();
|
||||
from_float(input_fp32_, gate_input_, qlen * config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
|
||||
gate_input_ptr = gate_input_;
|
||||
} else {
|
||||
gate_input_ptr = input;
|
||||
}
|
||||
if (config_.hidden_type != ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
|
||||
from_float(input_fp32_.data(), up_input_.data(), config_.hidden_size, ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
|
||||
up_input_ptr = up_input_.data();
|
||||
from_float(input_fp32_, up_input_, qlen * config_.hidden_size, ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
|
||||
up_input_ptr = up_input_;
|
||||
} else {
|
||||
up_input_ptr = input;
|
||||
}
|
||||
|
@ -69,35 +75,49 @@ void MLP::forward(const void* input, void* output, Backend* backend) {
|
|||
backend->do_work_stealing_job(nth, [&](int task_id) {
|
||||
int ith = task_id;
|
||||
void* gate_proj_ptr = gate_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
|
||||
float* gate_output_ptr = gate_output_.data() + ith * config_.stride;
|
||||
llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
|
||||
float* gate_output_ptr = gate_output_ + ith * config_.stride;
|
||||
llamafile_sgemm(config_.stride, qlen, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
|
||||
void* up_proj_ptr = up_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
|
||||
float* up_output_ptr = up_output_.data() + ith * config_.stride;
|
||||
llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
|
||||
for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
|
||||
intermediate_fp32_[i] = act_fn(gate_output_[i]) * up_output_[i];
|
||||
float* up_output_ptr = up_output_ + ith * config_.stride;
|
||||
llamafile_sgemm(config_.stride, qlen, config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
|
||||
for (int i = 0; i < qlen; i++) {
|
||||
for (int j = ith * config_.stride; j < (ith + 1) * config_.stride; j++) {
|
||||
intermediate_fp32_[i * config_.intermediate_size + j] = act_fn(gate_output_[i * config_.intermediate_size + j]) * up_output_[i * config_.intermediate_size + j];
|
||||
}
|
||||
if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) == 0) {
|
||||
float* intermediate_fp32_ptr = intermediate_fp32_.data() + ith * config_.stride;
|
||||
void* down_input_ptr = down_input_.data() + ith * config_.stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
|
||||
float* intermediate_fp32_ptr = intermediate_fp32_ + i * config_.intermediate_size + ith * config_.stride;
|
||||
void* down_input_ptr = down_input_ + i * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) + ith * config_.stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
|
||||
from_float(intermediate_fp32_ptr, down_input_ptr, config_.stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
|
||||
}
|
||||
}
|
||||
});
|
||||
if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) != 0) {
|
||||
from_float(intermediate_fp32_.data(), down_input_.data(), config_.intermediate_size, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
|
||||
from_float(intermediate_fp32_, down_input_, qlen * config_.intermediate_size, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
|
||||
}
|
||||
nth = config_.hidden_size / config_.stride;
|
||||
backend->do_work_stealing_job(nth, [&](int task_id) {
|
||||
int ith = task_id;
|
||||
void* down_proj_ptr = down_proj_ + ith * config_.stride * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
|
||||
float* down_output_ptr = down_output_.data() + ith * config_.stride;
|
||||
llamafile_sgemm(config_.stride, 1, config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_.data(), config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
|
||||
float* down_output_ptr = down_output_ + ith * config_.stride;
|
||||
llamafile_sgemm(config_.stride, qlen, config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_, config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.hidden_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
|
||||
if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) {
|
||||
void* output_ptr = output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
|
||||
from_float(down_output_ptr, output_ptr, config_.stride, config_.hidden_type);
|
||||
for (int i = 0; i < qlen; i++) {
|
||||
float* output_fp32_ptr = down_output_ + i * config_.hidden_size + ith * config_.stride;
|
||||
void* output_ptr = output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type) + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
|
||||
from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
|
||||
}
|
||||
}
|
||||
});
|
||||
if (config_.stride % ggml_blck_size(config_.hidden_type) != 0) {
|
||||
from_float(down_output_.data(), output, config_.hidden_size, config_.hidden_type);
|
||||
from_float(down_output_, output, qlen * config_.hidden_size, config_.hidden_type);
|
||||
}
|
||||
}
|
||||
|
||||
void MLP::forward(int qlen, const void* input, void* output, Backend* backend) {
|
||||
if (qlen <= 0) {
|
||||
return;
|
||||
}
|
||||
int forward_len = std::min(qlen, config_.group_max_len);
|
||||
forward_many(forward_len, input, output, backend);
|
||||
forward(qlen - forward_len, input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
|
||||
}
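Each work-stealing task above now computes one stride-wide band of output features for all qlen rows at once, writing into the full [qlen, out_features] result with the matrix width as the leading dimension. A NumPy sketch of that partitioning with the thread pool replaced by a plain loop (shapes and names are illustrative, not the sgemm call itself):

import numpy as np

def partitioned_matmul(weight, x, stride):
    # weight: [out_features, in_features], x: [qlen, in_features]
    out_features = weight.shape[0]
    assert out_features % stride == 0
    y = np.zeros((x.shape[0], out_features), dtype=np.float32)
    for ith in range(out_features // stride):      # one iteration models one task_id
        band = slice(ith * stride, (ith + 1) * stride)
        y[:, band] = x @ weight[band, :].T         # the stride-wide tile handled by llamafile_sgemm
    return y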
@ -22,11 +22,13 @@
|
|||
#include "llama.cpp/ggml-quants.h"
|
||||
#include "llama.cpp/ggml.h"
|
||||
#include "llamafile/sgemm.h"
|
||||
#include "shared_mem_buffer.h"
|
||||
|
||||
struct MLPConfig {
|
||||
int hidden_size;
|
||||
int intermediate_size;
|
||||
int stride;
|
||||
int group_max_len;
|
||||
void* gate_proj;
|
||||
void* up_proj;
|
||||
void* down_proj;
|
||||
|
@ -37,15 +39,17 @@ struct MLPConfig {
|
|||
|
||||
MLPConfig() {}
|
||||
|
||||
MLPConfig(int hidden_size, int intermediate_size, int stride, void* gate_proj, void* up_proj, void* down_proj, ggml_type gate_type, ggml_type up_type, ggml_type down_type, ggml_type hidden_type)
|
||||
: hidden_size(hidden_size), intermediate_size(intermediate_size), stride(stride), gate_proj(gate_proj), up_proj(up_proj), down_proj(down_proj), gate_type(gate_type), up_type(up_type), down_type(down_type), hidden_type(hidden_type) {}
|
||||
MLPConfig(int hidden_size, int intermediate_size, int stride, int group_max_len, void* gate_proj, void* up_proj, void* down_proj, ggml_type gate_type, ggml_type up_type, ggml_type down_type, ggml_type hidden_type)
|
||||
: hidden_size(hidden_size), intermediate_size(intermediate_size), stride(stride), group_max_len(group_max_len), gate_proj(gate_proj), up_proj(up_proj), down_proj(down_proj), gate_type(gate_type), up_type(up_type), down_type(down_type), hidden_type(hidden_type) {}
|
||||
};
|
||||
|
||||
class MLP {
|
||||
public:
|
||||
MLP(MLPConfig);
|
||||
~MLP();
|
||||
void warm_up(Backend* backend);
|
||||
void forward(const void* input, void* output, Backend* backend);
|
||||
void forward_many(int qlen, const void* input, void* output, Backend* backend);
|
||||
void forward(int qlen, const void* input, void* output, Backend* backend);
|
||||
|
||||
private:
|
||||
MLPConfig config_;
|
||||
|
@ -53,14 +57,14 @@ class MLP {
|
|||
void* up_proj_; // [intermediate_size * hidden_size ( /32 if quantized)]
|
||||
void* down_proj_; // [hidden_size * intermediate_size ( /32 if quantized)]
|
||||
|
||||
std::vector<float> input_fp32_; // [hidden_size]
|
||||
std::vector<uint8_t> gate_input_; // [hidden_size * 4]
|
||||
std::vector<uint8_t> up_input_; // [hidden_size * 4]
|
||||
std::vector<float> gate_output_; // [intermediate_size]
|
||||
std::vector<float> up_output_; // [intermediate_size]
|
||||
std::vector<float> intermediate_fp32_; // [intermediate_size]
|
||||
std::vector<uint8_t> down_input_; // [intermediate_size * 4]
|
||||
std::vector<float> down_output_; // [hidden_size]
|
||||
float* input_fp32_; // [group_max_len * hidden_size]
|
||||
uint8_t* gate_input_; // [group_max_len * hidden_size * ggml_type_size(ggml_internal_get_type_traits(gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(gate_type).vec_dot_type)]
|
||||
uint8_t* up_input_; // [group_max_len * hidden_size * ggml_type_size(ggml_internal_get_type_traits(up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(up_type).vec_dot_type)]
|
||||
float* gate_output_; // [group_max_len * intermediate_size]
|
||||
float* up_output_; // [group_max_len * intermediate_size]
|
||||
float* intermediate_fp32_; // [group_max_len * intermediate_size]
|
||||
uint8_t* down_input_; // [group_max_len * intermediate_size * ggml_type_size(ggml_internal_get_type_traits(down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(down_type).vec_dot_type)]
|
||||
float* down_output_; // [group_max_len * hidden_size]
|
||||
};
|
||||
|
||||
#endif
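Matching the updated pybind signature earlier in this diff, MLPConfig now takes group_max_len ahead of the projection pointers. A hedged construction sketch; the sizes, the fp32 weights and the numeric ggml_type ids are example values to adapt, not defaults baked into the extension:

import cpuinfer_ext
import torch

hidden_size, intermediate_size, stride, group_max_len = 1024, 2816, 16, 128
GGML_TYPE_F32, GGML_TYPE_BF16 = 0, 30   # ggml_type enum values; check them against your ggml build

gate = torch.randn(intermediate_size, hidden_size, dtype=torch.float32).contiguous()
up = torch.randn(intermediate_size, hidden_size, dtype=torch.float32).contiguous()
down = torch.randn(hidden_size, intermediate_size, dtype=torch.float32).contiguous()

config = cpuinfer_ext.mlp.MLPConfig(hidden_size, intermediate_size, stride, group_max_len,
                                    gate.data_ptr(), up.data_ptr(), down.data_ptr(),
                                    GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_BF16)
mlp = cpuinfer_ext.mlp.MLP(config)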
@ -11,87 +11,52 @@
|
|||
#include <iostream>
|
||||
#include "unistd.h"
|
||||
|
||||
void* MOE::buffer_ = nullptr;
|
||||
|
||||
MOE::MOE(MOEConfig config) {
|
||||
config_ = config;
|
||||
gate_proj_ = config_.gate_proj;
|
||||
up_proj_ = config_.up_proj;
|
||||
down_proj_ = config_.down_proj;
|
||||
|
||||
if (MOE::buffer_ == nullptr) {
|
||||
uint64_t buffer_size = 0;
|
||||
buffer_size += sizeof(float) * config_.group_max_len * config_.hidden_size;
|
||||
buffer_size += config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
|
||||
buffer_size += config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
|
||||
buffer_size += config_.routed_expert_num * config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
|
||||
buffer_size += config_.routed_expert_num * config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
|
||||
buffer_size += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size;
|
||||
buffer_size += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size;
|
||||
buffer_size += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size;
|
||||
buffer_size += config_.routed_expert_num * config_.group_max_len * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
|
||||
buffer_size += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.hidden_size;
|
||||
buffer_size += sizeof(float) * config_.group_max_len * config_.hidden_size;
|
||||
buffer_ = malloc(buffer_size);
|
||||
}
|
||||
|
||||
uint64_t offset = 0;
|
||||
s_input_fp32_ = (float*)(buffer_ + offset);
|
||||
offset += sizeof(float) * config_.hidden_size;
|
||||
s_gate_input_ = (uint8_t*)(buffer_ + offset);
|
||||
offset += config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
|
||||
s_up_input_ = (uint8_t*)(buffer_ + offset);
|
||||
offset += config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
|
||||
std::vector<std::pair<void**, uint64_t>> s_mem_requests;
|
||||
s_mem_requests.push_back({(void**)&s_input_fp32_, sizeof(float) * config_.hidden_size});
|
||||
s_mem_requests.push_back({(void**)&s_gate_input_, config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type)});
|
||||
s_mem_requests.push_back({(void**)&s_up_input_, config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type)});
|
||||
s_gate_output_.resize(config_.routed_expert_num);
|
||||
s_up_output_.resize(config_.routed_expert_num);
|
||||
s_intermediate_fp32_.resize(config_.routed_expert_num);
|
||||
s_down_input_.resize(config_.routed_expert_num);
|
||||
s_down_output_.resize(config_.routed_expert_num);
|
||||
for (int i = 0; i < config_.routed_expert_num; i++) {
|
||||
s_gate_output_[i] = (float*)(buffer_ + offset);
|
||||
offset += sizeof(float) * config_.intermediate_size;
|
||||
s_up_output_[i] = (float*)(buffer_ + offset);
|
||||
offset += sizeof(float) * config_.intermediate_size;
|
||||
s_intermediate_fp32_[i] = (float*)(buffer_ + offset);
|
||||
offset += sizeof(float) * config_.intermediate_size;
|
||||
s_down_input_[i] = (uint8_t*)(buffer_ + offset);
|
||||
offset += config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
|
||||
s_down_output_[i] = (float*)(buffer_ + offset);
|
||||
offset += sizeof(float) * config_.hidden_size;
|
||||
s_mem_requests.push_back({(void**)&s_gate_output_[i], sizeof(float) * config_.intermediate_size});
|
||||
s_mem_requests.push_back({(void**)&s_up_output_[i], sizeof(float) * config_.intermediate_size});
|
||||
s_mem_requests.push_back({(void**)&s_intermediate_fp32_[i], sizeof(float) * config_.intermediate_size});
|
||||
s_mem_requests.push_back({(void**)&s_down_input_[i], config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type)});
|
||||
s_mem_requests.push_back({(void**)&s_down_output_[i], sizeof(float) * config_.hidden_size});
|
||||
}
|
||||
s_output_fp32_ = (float*)(buffer_ + offset);
|
||||
s_mem_requests.push_back({(void**)&s_output_fp32_, sizeof(float) * config_.hidden_size});
|
||||
shared_mem_buffer.alloc(this, s_mem_requests);
|
||||
|
||||
offset = 0;
|
||||
std::vector<std::pair<void**, uint64_t>> m_mem_requests;
|
||||
m_input_fp32_.resize(config_.group_max_len);
|
||||
m_gate_input_.resize(config_.group_max_len);
|
||||
m_up_input_.resize(config_.group_max_len);
|
||||
for (int i = 0; i < config_.group_max_len; i++) {
|
||||
m_input_fp32_[i] = (float*)(buffer_ + offset);
|
||||
offset += sizeof(float) * config_.hidden_size;
|
||||
m_gate_input_[i] = (uint8_t*)(buffer_ + offset);
|
||||
offset += config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
|
||||
m_up_input_[i] = (uint8_t*)(buffer_ + offset);
|
||||
offset += config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
|
||||
m_mem_requests.push_back({(void**)&m_input_fp32_[i], sizeof(float) * config_.hidden_size});
|
||||
m_mem_requests.push_back({(void**)&m_gate_input_[i], config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type)});
|
||||
m_mem_requests.push_back({(void**)&m_up_input_[i], config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type)});
|
||||
}
|
||||
m_local_gate_input_ = (uint8_t*)(buffer_ + offset);
|
||||
offset += config_.routed_expert_num * config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
|
||||
m_local_up_input_ = (uint8_t*)(buffer_ + offset);
|
||||
offset += config_.routed_expert_num * config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
|
||||
m_local_gate_output_ = (float*)(buffer_ + offset);
|
||||
offset += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size;
|
||||
m_local_up_output_ = (float*)(buffer_ + offset);
|
||||
offset += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size;
|
||||
m_local_intermediate_fp32_ = (float*)(buffer_ + offset);
|
||||
offset += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size;
|
||||
m_local_down_input_ = (uint8_t*)(buffer_ + offset);
|
||||
offset += config_.routed_expert_num * config_.group_max_len * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
|
||||
m_local_down_output_ = (float*)(buffer_ + offset);
|
||||
offset += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.hidden_size;
|
||||
m_mem_requests.push_back({(void**)&m_local_gate_input_, config_.routed_expert_num * config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type)});
|
||||
m_mem_requests.push_back({(void**)&m_local_up_input_, config_.routed_expert_num * config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type)});
|
||||
m_mem_requests.push_back({(void**)&m_local_gate_output_, sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size});
|
||||
m_mem_requests.push_back({(void**)&m_local_up_output_, sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size});
|
||||
m_mem_requests.push_back({(void**)&m_local_intermediate_fp32_, sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size});
|
||||
m_mem_requests.push_back({(void**)&m_local_down_input_, config_.routed_expert_num * config_.group_max_len * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type)});
|
||||
m_mem_requests.push_back({(void**)&m_local_down_output_, sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.hidden_size});
|
||||
m_output_fp32_.resize(config_.group_max_len);
|
||||
for (int i = 0; i < config_.group_max_len; i++) {
|
||||
m_output_fp32_[i] = (float*)(buffer_ + offset);
|
||||
offset += sizeof(float) * config_.hidden_size;
|
||||
m_mem_requests.push_back({(void**)&m_output_fp32_[i], sizeof(float) * config_.hidden_size});
|
||||
}
|
||||
shared_mem_buffer.alloc(this, m_mem_requests);
|
||||
|
||||
m_local_pos_.resize(config_.group_max_len);
|
||||
for (int i = 0; i < config_.group_max_len; i++) {
|
||||
|
@ -107,6 +72,10 @@ MOE::MOE(MOEConfig config) {
|
|||
m_local_down_output_ptr_.resize(config_.expert_num);
|
||||
}
|
||||
|
||||
MOE::~MOE() {
|
||||
shared_mem_buffer.dealloc(this);
|
||||
}
|
||||
|
||||
void MOE::warm_up(Backend* backend) {
|
||||
std::vector<float> input_fp32(config_.hidden_size);
|
||||
std::vector<uint8_t> input(config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type));
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
#include "llama.cpp/ggml-quants.h"
|
||||
#include "llama.cpp/ggml.h"
|
||||
#include "llamafile/sgemm.h"
|
||||
#include "shared_mem_buffer.h"
|
||||
|
||||
struct MOEConfig {
|
||||
int expert_num;
|
||||
|
@ -48,13 +49,13 @@ struct MOEConfig {
|
|||
class MOE {
|
||||
public:
|
||||
MOE(MOEConfig);
|
||||
~MOE();
|
||||
void warm_up(Backend* backend);
|
||||
void forward_one(int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend);
|
||||
void forward_many(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend);
|
||||
void forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend);
|
||||
|
||||
private:
|
||||
static void* buffer_;
|
||||
MOEConfig config_;
|
||||
void* gate_proj_; // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
|
||||
void* up_proj_; // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
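The MOE entry points above are driven from Python the same way as Linear and MLP: forward(qlen, k, expert_ids, weights, input, output) packs its arguments into a task that CPUInfer.submit executes. A hedged end-to-end sketch of the prefill path, mirroring the operator change at the end of this diff; cpu_infer and moe are assumed to be already-constructed cpuinfer_ext objects:

import torch

def run_moe_prefill(cpu_infer, moe, expert_ids, weights, hidden_states):
    expert_ids = expert_ids.contiguous().cpu()               # [qlen, k] expert indices (int64)
    weights = weights.to(torch.float32).contiguous().cpu()   # [qlen, k] routing weights
    hidden_states = hidden_states.contiguous().cpu()         # [qlen, hidden_size] in hidden_type
    output = torch.empty_like(hidden_states)
    cpu_infer.submit(moe.forward(expert_ids.size(0), expert_ids.size(1),
                                 expert_ids.data_ptr(), weights.data_ptr(),
                                 hidden_states.data_ptr(), output.data_ptr()))
    cpu_infer.sync()
    return output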
@ -0,0 +1,55 @@
/**
 * @Description :
 * @Author : chenht2022
 * @Date : 2024-08-05 04:49:08
 * @Version : 1.0.0
 * @LastEditors : chenht2022
 * @LastEditTime : 2024-08-05 09:21:29
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#include "shared_mem_buffer.h"
#include <cstdio>

SharedMemBuffer::SharedMemBuffer() {
    buffer_ = nullptr;
    size_ = 0;
}

SharedMemBuffer::~SharedMemBuffer() {
    if (buffer_) {
        free(buffer_);
    }
}

void SharedMemBuffer::alloc(void* object, std::vector<std::pair<void**, uint64_t>> requests) {
    uint64_t size = 0;
    for (auto& request : requests) {
        size += request.second;
    }
    if (size > size_) {
        if (buffer_) {
            free(buffer_);
        }
        buffer_ = malloc(size);
        size_ = size;
        for (auto& obj_requests : hist_requests_) {
            for (auto& requests : obj_requests.second) {
                arrange(requests);
            }
        }
    }
    arrange(requests);
    hist_requests_[object].push_back(requests);
}

void SharedMemBuffer::dealloc(void* object) {
    hist_requests_.erase(object);
}

void SharedMemBuffer::arrange(std::vector<std::pair<void**, uint64_t>> requests) {
    uint64_t offset = 0;
    for (auto& request : requests) {
        *(request.first) = buffer_ + offset;
        offset += request.second;
    }
}
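SharedMemBuffer keeps a single malloc'd region sized to the largest combined request seen so far; whenever it grows, arrange is re-run over every previously registered request list so all operators' scratch pointers land inside the new block. A toy Python model of that behavior (pointer_box stands in for the void** slots; nothing here is real API):

class SharedMemBufferModel:
    # Toy model of shared_mem_buffer.cpp: one buffer reused by every operator.
    def __init__(self):
        self.size = 0
        self.hist = {}                              # object -> list of request lists

    def alloc(self, obj, requests):                 # requests: list of (pointer_box, nbytes)
        total = sum(nbytes for _, nbytes in requests)
        if total > self.size:                       # grow to the largest combined request
            self.size = total
            for req_lists in self.hist.values():
                for reqs in req_lists:              # re-point earlier operators (arrange)
                    self._arrange(reqs)
        self._arrange(requests)
        self.hist.setdefault(obj, []).append(requests)

    def dealloc(self, obj):
        self.hist.pop(obj, None)

    def _arrange(self, requests):
        offset = 0
        for pointer_box, nbytes in requests:        # pointer_box: one-element list acting as void**
            pointer_box[0] = offset                 # the C++ stores buffer_ + offset here
            offset += nbytes

The model also makes the implicit contract visible: because every operator's scratch tensors alias the same region, the design presumes operators run one at a time through submit/sync rather than concurrently.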
@ -0,0 +1,37 @@
/**
 * @Description :
 * @Author : chenht2022
 * @Date : 2024-08-05 04:49:08
 * @Version : 1.0.0
 * @LastEditors : chenht2022
 * @LastEditTime : 2024-08-05 06:36:41
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

#ifndef CPUINFER_SHAREDMEMBUFFER_H
#define CPUINFER_SHAREDMEMBUFFER_H

#include <cstdint>
#include <cstdlib>
#include <map>
#include <vector>

class SharedMemBuffer {
   public:
    SharedMemBuffer();
    ~SharedMemBuffer();

    void alloc(void* object, std::vector<std::pair<void**, uint64_t>> requests);
    void dealloc(void* object);

   private:
    void* buffer_;
    uint64_t size_;
    std::map<void*, std::vector<std::vector<std::pair<void**, uint64_t>>>> hist_requests_;

    void arrange(std::vector<std::pair<void**, uint64_t>> requests);
};

static SharedMemBuffer shared_mem_buffer;

#endif
@ -155,7 +155,7 @@ class MLPCPUExperts(MLPExpertsBase):
        self.moe = MOE(moe_config)
        self.cpu_infer = MLPCPUExperts.CPU_INFER
        if warmup:
            self.cpu_infer.submit(self.moe.warm_up)
            self.cpu_infer.submit(self.moe.warm_up())
            self.cpu_infer.sync()
        if MLPCPUExperts.output_gpu == None:
            MLPCPUExperts.input_tensor_cpu = torch.empty((self.config.hidden_size), device="cpu", pin_memory=True)
@ -168,7 +168,7 @@ class MLPCPUExperts(MLPExpertsBase):
        MLPCPUExperts.input_tensor_cpu.copy_(input_tensor, non_blocking=True)
        MLPCPUExperts.expert_ids_cpu.copy_(expert_ids, non_blocking=True)
        MLPCPUExperts.weights_cpu.copy_(weights, non_blocking=True)
        self.cpu_infer.submit_with_cuda_stream(torch.cuda.current_stream().cuda_stream, self.moe.forward, 1, expert_ids.size(0), MLPCPUExperts.expert_ids_cpu.data_ptr(), MLPCPUExperts.weights_cpu.data_ptr(), MLPCPUExperts.input_tensor_cpu.data_ptr(), MLPCPUExperts.output_cpu.data_ptr())
        self.cpu_infer.submit_with_cuda_stream(torch.cuda.current_stream().cuda_stream, self.moe.forward(1, expert_ids.size(0), MLPCPUExperts.expert_ids_cpu.data_ptr(), MLPCPUExperts.weights_cpu.data_ptr(), MLPCPUExperts.input_tensor_cpu.data_ptr(), MLPCPUExperts.output_cpu.data_ptr()))

    def sync_for_one_decode(self):
        self.cpu_infer.sync_with_cuda_stream(torch.cuda.current_stream().cuda_stream)
@ -183,7 +183,7 @@ class MLPCPUExperts(MLPExpertsBase):
        MLPCPUExperts.input_tensor_cpu.copy_(input_tensor, non_blocking=True)
        MLPCPUExperts.expert_ids_cpu.copy_(expert_ids, non_blocking=True)
        MLPCPUExperts.weights_cpu.copy_(weights, non_blocking=True)
        self.cpu_infer.submit_with_cuda_stream(torch.cuda.current_stream().cuda_stream, self.moe.forward, 1, expert_ids.size(1), MLPCPUExperts.expert_ids_cpu.data_ptr(), MLPCPUExperts.weights_cpu.data_ptr(), MLPCPUExperts.input_tensor_cpu.data_ptr(), MLPCPUExperts.output_cpu.data_ptr())
        self.cpu_infer.submit_with_cuda_stream(torch.cuda.current_stream().cuda_stream, self.moe.forward(1, expert_ids.size(1), MLPCPUExperts.expert_ids_cpu.data_ptr(), MLPCPUExperts.weights_cpu.data_ptr(), MLPCPUExperts.input_tensor_cpu.data_ptr(), MLPCPUExperts.output_cpu.data_ptr()))
        self.cpu_infer.sync_with_cuda_stream(torch.cuda.current_stream().cuda_stream)
        MLPCPUExperts.output_gpu.copy_(MLPCPUExperts.output_cpu, non_blocking=True)
        #print("capturing experts finish")
@ -193,7 +193,7 @@ class MLPCPUExperts(MLPExpertsBase):
        expert_ids = expert_ids.contiguous().cpu()
        weights = weights.contiguous().to(torch.float32).cpu()
        output = torch.empty_like(input_tensor).contiguous()
        self.cpu_infer.submit(self.moe.forward, expert_ids.size(0), expert_ids.size(1), expert_ids.data_ptr(), weights.data_ptr(), input_tensor.data_ptr(), output.data_ptr())
        self.cpu_infer.submit(self.moe.forward(expert_ids.size(0), expert_ids.size(1), expert_ids.data_ptr(), weights.data_ptr(), input_tensor.data_ptr(), output.data_ptr()))
        self.cpu_infer.sync()
        return output.to(device=object.__getattribute__(self, "device"))
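The decode path above keeps everything ordered on the GPU stream: inputs are staged into pinned CPU tensors with non-blocking copies, the packed moe.forward task is enqueued via submit_with_cuda_stream (cudaLaunchHostFunc under the hood, per the binding earlier in this diff), and sync_with_cuda_stream plus one more non-blocking copy bring the result back. A condensed, parameterized sketch of that sequence; the buffer arguments correspond to the MLPCPUExperts pinned and GPU tensors above:

import torch

def submit_decode(cpu_infer, moe, input_gpu, expert_ids_gpu, weights_gpu,
                  input_cpu, expert_ids_cpu, weights_cpu, output_cpu, output_gpu):
    stream = torch.cuda.current_stream().cuda_stream
    # stage inputs into pinned CPU tensors without blocking the GPU stream
    input_cpu.copy_(input_gpu, non_blocking=True)
    expert_ids_cpu.copy_(expert_ids_gpu, non_blocking=True)
    weights_cpu.copy_(weights_gpu, non_blocking=True)
    # the packed task runs as a host function enqueued on this CUDA stream
    cpu_infer.submit_with_cuda_stream(
        stream,
        moe.forward(1, expert_ids_cpu.size(-1),
                    expert_ids_cpu.data_ptr(), weights_cpu.data_ptr(),
                    input_cpu.data_ptr(), output_cpu.data_ptr()))
    cpu_infer.sync_with_cuda_stream(stream)
    output_gpu.copy_(output_cpu, non_blocking=True)
    return output_gpu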