[ADD] support multi-gpu qlen>1 q5_k

This commit is contained in:
chenxl 2024-08-12 11:17:29 +00:00
parent f293803156
commit f5f79f5c0e
63 changed files with 3271 additions and 1285 deletions

View file

@ -5,8 +5,8 @@ Description :
Author : Azure-Tang, Boxin Zhang, chenht2022
Date : 2024-07-25 11:25:24
Version : 0.1.0
LastEditors : Azure
LastEditTime : 2024-07-26 09:27:41
LastEditors : kkk1nak0
LastEditTime : 2024-08-11 12:14:39
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
@ -19,7 +19,9 @@ import torch
import sys, os
from ktransformers.operators.base_operator import BaseInjectedModule
sys.path.append(os.path.dirname(__file__) + "/../ktransformers_ext/build")
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build"))
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build", "Release"))
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build", "Debug"))
import cpuinfer_ext
from cpuinfer_ext.moe import MOEConfig, MOE
import ctypes
@ -78,6 +80,25 @@ class MLPExpertsBase(ABC):
gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate_exps.weight"]["ggml_type"]
up_type = self.gguf_loader.tensor_info[key + ".ffn_up_exps.weight"]["ggml_type"]
down_type = self.gguf_loader.tensor_info[key + ".ffn_down_exps.weight"]["ggml_type"]
elif key + ".ffn_down.0.weight" in self.gguf_loader.tensor_info:
# for supporting Mixtral-8x7B-Instuct
gate = []
up = []
down = []
for i in range(8):
gatei, upi, downi = f".ffn_gate.{i}.weight", f".ffn_up.{i}.weight", f".ffn_down.{i}.weight"
targets = [gatei, upi, downi]
tensors = self.load_multi(key, targets, device=device)
gate_it, up_it, down_it = tensors[gatei], tensors[upi], tensors[downi]
gate.append(gate_it)
up.append(up_it)
down.append(down_it)
gate = torch.stack(gate)
up = torch.stack(up)
down = torch.stack(down)
gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate.0.weight"]["ggml_type"]
up_type = self.gguf_loader.tensor_info[key + ".ffn_up.0.weight"]["ggml_type"]
down_type = self.gguf_loader.tensor_info[key + ".ffn_down.0.weight"]["ggml_type"]
else:
raise ValueError(f"Experts {key} not found in gguf_loader")
res = {key:{"gate": gate, "up": up, "down": down, "gate_type": gate_type, "up_type": up_type, "down_type": down_type}}
@ -94,7 +115,8 @@ class MLPCPUExperts(MLPExpertsBase):
expert_ids_cpu:Tensor = None
weights_cpu:Tensor = None
output_cpu:Tensor = None
output_gpu:Tensor = None
output_gpu_map:dict = {} # Manage output tensor buffer on different gpu
#stream_map:dict = {} # Manage cuda stream on different gpu
CPU_INFER = cpuinfer_ext.CPUInfer(Config().cpu_infer)
def __init__(
self,
@ -113,81 +135,83 @@ class MLPCPUExperts(MLPExpertsBase):
self.out_device = out_device
def load(self, w: dict | nn.Parameter | tuple | None = None, device:str|None = None, warmup:bool = False):
if device:
assert device.lower() == "cpu", "MLPCPUExperts can only be loaded on CPU, Parameter \"device\" can be cpu or None."
if w is None: w = self.load_weights()[self.key]
self.gate = w["gate"]
self.up = w["up"]
self.down = w["down"]
self.gate_type = w["gate_type"]
self.up_type = w["up_type"]
self.down_type = w["down_type"]
gate_ptr = ctypes.addressof(
ctypes.cast(self.gate.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents
)
up_ptr = ctypes.addressof(
ctypes.cast(self.up.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents
)
down_ptr = ctypes.addressof(
ctypes.cast(self.down.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents
)
# print(self.gate_qtype, self.up_qtype, self.down_qtype)
n_routed_experts = self.n_routed_experts
# n_routed_experts = len(self.orig_module)
moe_config = MOEConfig(
n_routed_experts,
self.config.num_experts_per_tok,
self.config.hidden_size,
self.config.moe_intermediate_size,
64,
10,
1024,
gate_ptr,
up_ptr,
down_ptr,
self.gate_type,
self.up_type,
self.down_type,
30, # TODO: get from model.dtype
)
# print(n_routed_experts, hidden_size, moe_intermediate_size)
num_experts_per_tok = self.config.num_experts_per_tok
self.moe = MOE(moe_config)
self.cpu_infer = MLPCPUExperts.CPU_INFER
if warmup:
self.cpu_infer.submit(self.moe.warm_up())
self.cpu_infer.sync()
if MLPCPUExperts.output_gpu == None:
MLPCPUExperts.input_tensor_cpu = torch.empty((self.config.hidden_size), device="cpu", pin_memory=True)
MLPCPUExperts.expert_ids_cpu = torch.empty((num_experts_per_tok), device="cpu", dtype=torch.long, pin_memory=True)
MLPCPUExperts.weights_cpu = torch.empty((num_experts_per_tok), device="cpu", dtype=torch.float32, pin_memory=True)
MLPCPUExperts.output_cpu = torch.empty((self.config.hidden_size), device="cpu", pin_memory=True)
MLPCPUExperts.output_gpu = torch.empty((self.config.hidden_size), device=self.out_device)
with torch.device(self.out_device):
if device:
assert device.lower() == "cpu", "MLPCPUExperts can only be loaded on CPU, Parameter \"device\" can be cpu or None."
if w is None: w = self.load_weights()[self.key]
self.gate = w["gate"]
self.up = w["up"]
self.down = w["down"]
self.gate_type = w["gate_type"]
self.up_type = w["up_type"]
self.down_type = w["down_type"]
gate_ptr = ctypes.addressof(
ctypes.cast(self.gate.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents
)
up_ptr = ctypes.addressof(
ctypes.cast(self.up.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents
)
down_ptr = ctypes.addressof(
ctypes.cast(self.down.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents
)
# print(self.gate_qtype, self.up_qtype, self.down_qtype)
n_routed_experts = self.n_routed_experts
# n_routed_experts = len(self.orig_module)
moe_config = MOEConfig(
n_routed_experts,
self.config.num_experts_per_tok,
self.config.hidden_size,
self.config.moe_intermediate_size,
64,
10,
1024,
gate_ptr,
up_ptr,
down_ptr,
self.gate_type,
self.up_type,
self.down_type,
30, # TODO: get from model.dtype
)
# print(n_routed_experts, hidden_size, moe_intermediate_size)
num_experts_per_tok = self.config.num_experts_per_tok
self.moe = MOE(moe_config)
self.cpu_infer = MLPCPUExperts.CPU_INFER
if warmup:
self.cpu_infer.submit(self.moe.warm_up())
self.cpu_infer.sync()
if self.out_device not in MLPCPUExperts.output_gpu_map:
MLPCPUExperts.output_gpu_map[self.out_device] = torch.zeros((self.config.hidden_size), device=self.out_device)
if MLPCPUExperts.input_tensor_cpu == None:
MLPCPUExperts.input_tensor_cpu = torch.zeros((self.config.hidden_size), device="cpu", pin_memory=True)
MLPCPUExperts.expert_ids_cpu = torch.zeros((num_experts_per_tok), device="cpu", dtype=torch.long, pin_memory=True)
MLPCPUExperts.weights_cpu = torch.zeros((num_experts_per_tok), device="cpu", dtype=torch.float32, pin_memory=True)
MLPCPUExperts.output_cpu = torch.zeros((self.config.hidden_size), device="cpu", pin_memory=True, dtype=torch.bfloat16)
def submit_for_one_decode(self, input_tensor, expert_ids, weights):
MLPCPUExperts.input_tensor_cpu.copy_(input_tensor, non_blocking=True)
MLPCPUExperts.expert_ids_cpu.copy_(expert_ids, non_blocking=True)
MLPCPUExperts.weights_cpu.copy_(weights, non_blocking=True)
self.cpu_infer.submit_with_cuda_stream(torch.cuda.current_stream().cuda_stream, self.moe.forward(1, expert_ids.size(0), MLPCPUExperts.expert_ids_cpu.data_ptr(), MLPCPUExperts.weights_cpu.data_ptr(), MLPCPUExperts.input_tensor_cpu.data_ptr(), MLPCPUExperts.output_cpu.data_ptr()))
self.cpu_infer.submit_with_cuda_stream(torch.cuda.current_stream(self.out_device).cuda_stream, self.moe.forward(1, expert_ids.size(0), MLPCPUExperts.expert_ids_cpu.data_ptr(), MLPCPUExperts.weights_cpu.data_ptr(), MLPCPUExperts.input_tensor_cpu.data_ptr(), MLPCPUExperts.output_cpu.data_ptr()))
def sync_for_one_decode(self):
self.cpu_infer.sync_with_cuda_stream(torch.cuda.current_stream().cuda_stream)
MLPCPUExperts.output_gpu.copy_(MLPCPUExperts.output_cpu, non_blocking=True)
#print("capturing experts finish")
return MLPCPUExperts.output_gpu
self.cpu_infer.sync_with_cuda_stream(torch.cuda.current_stream(self.out_device).cuda_stream)
MLPCPUExperts.output_gpu_map[self.out_device].copy_(MLPCPUExperts.output_cpu, non_blocking=True)
return MLPCPUExperts.output_gpu_map[self.out_device]
def forward(self, input_tensor, expert_ids, weights):
# generate, capture and run cuda graph
# print(expert_ids)
if input_tensor.size(0)==1:
# TODO: this branch is unreachable, but the shape of input_tensor([1,hidden_size]) and input_tensor_cpu([hidden_size]) is not compatible
#print("capturing experts")
MLPCPUExperts.input_tensor_cpu.copy_(input_tensor, non_blocking=True)
MLPCPUExperts.expert_ids_cpu.copy_(expert_ids, non_blocking=True)
MLPCPUExperts.weights_cpu.copy_(weights, non_blocking=True)
self.cpu_infer.submit_with_cuda_stream(torch.cuda.current_stream().cuda_stream, self.moe.forward(1, expert_ids.size(1), MLPCPUExperts.expert_ids_cpu.data_ptr(), MLPCPUExperts.weights_cpu.data_ptr(), MLPCPUExperts.input_tensor_cpu.data_ptr(), MLPCPUExperts.output_cpu.data_ptr()))
self.cpu_infer.sync_with_cuda_stream(torch.cuda.current_stream().cuda_stream)
MLPCPUExperts.output_gpu.copy_(MLPCPUExperts.output_cpu, non_blocking=True)
#print("capturing experts finish")
return MLPCPUExperts.output_gpu
MLPCPUExperts.output_gpu_map[self.out_device].copy_(MLPCPUExperts.output_cpu, non_blocking=True)
return MLPCPUExperts.output_gpu_map[self.out_device]
else:
input_tensor = input_tensor.contiguous().cpu()
expert_ids = expert_ids.contiguous().cpu()
@ -195,7 +219,7 @@ class MLPCPUExperts(MLPExpertsBase):
output = torch.empty_like(input_tensor).contiguous()
self.cpu_infer.submit(self.moe.forward(expert_ids.size(0), expert_ids.size(1), expert_ids.data_ptr(), weights.data_ptr(), input_tensor.data_ptr(), output.data_ptr()))
self.cpu_infer.sync()
return output.to(device=object.__getattribute__(self, "device"))
return output.to(device=object.__getattribute__(self, "out_device"))
def unload(self):
return
@ -222,6 +246,24 @@ class MLPCPUExperts(MLPExpertsBase):
gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate_exps.weight"]["ggml_type"]
up_type = self.gguf_loader.tensor_info[key + ".ffn_up_exps.weight"]["ggml_type"]
down_type = self.gguf_loader.tensor_info[key + ".ffn_down_exps.weight"]["ggml_type"]
elif key + ".ffn_down.0.weight" in self.gguf_loader.tensor_info:
# for supporting Mixtral-8x7B-Instuct
gate = []
up = []
down = []
for i in range(8):
gate_it = self.gguf_loader.get_mmap_tensor(f"{key}.ffn_gate.{i}.weight")
up_it = self.gguf_loader.get_mmap_tensor(f"{key}.ffn_up.{i}.weight")
down_it = self.gguf_loader.get_mmap_tensor(f"{key}.ffn_down.{i}.weight")
gate.append(gate_it)
up.append(up_it)
down.append(down_it)
gate = np.stack(gate)
up = np.stack(up)
down = np.stack(down)
gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate.0.weight"]["ggml_type"]
up_type = self.gguf_loader.tensor_info[key + ".ffn_up.0.weight"]["ggml_type"]
down_type = self.gguf_loader.tensor_info[key + ".ffn_down.0.weight"]["ggml_type"]
else:
raise ValueError(f"Experts {key} not found in gguf_loader")
res = {key:{"gate": gate, "up": up, "down": down, "gate_type": gate_type, "up_type": up_type, "down_type": down_type}}
@ -299,7 +341,7 @@ class MLPExpertsMarlin(MLPExpertsBase):
gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate_exps.weight"]["ggml_type"]
up_type = self.gguf_loader.tensor_info[key + ".ffn_up_exps.weight"]["ggml_type"]
down_type = self.gguf_loader.tensor_info[key + ".ffn_down_exps.weight"]["ggml_type"]
# tensors = self.load_multi(key, [".ffn_gate_exps.weight", ".ffn_up_exps.weight", ".ffn_down_exps.weight"])
# tensors = self.load_multi(key, [".ffn_gate_exps.weight", ".ffn_up_exps.weight", ".ffn_down_exps.weight"])
res = {key:{"gate": gate, "up": up, "down": down, "gate_type": gate_type, "up_type": up_type, "down_type": down_type}}
return res
@ -359,6 +401,11 @@ class MLPExpertsTorch(MLPExpertsBase):
self.down = None
def forward(self, hidden_states_cpu: torch.Tensor, selected_experts_cpu: torch.Tensor, routing_weights_cpu: torch.Tensor) -> torch.Tensor:
org_device = hidden_states_cpu.device
hidden_states_cpu = hidden_states_cpu.to(self.device)
selected_experts_cpu = selected_experts_cpu.to(self.device)
routing_weights_cpu = routing_weights_cpu.to(self.device)
batch_sequence_length, hidden_dim = hidden_states_cpu.size()
@ -388,27 +435,29 @@ class MLPExpertsTorch(MLPExpertsBase):
# the `top_x` tensor here.
final_hidden_states.index_add_(0, top_x, current_hidden_states)
return final_hidden_states.to(org_dtype)
return final_hidden_states.to(org_dtype, device=org_device)
EXPERTS_MAP = {
"MLPCPUExperts": MLPCPUExperts,
"MLPExpertsTorch": MLPExpertsTorch,
"MLPExpertsMarlin": MLPExpertsMarlin,
}
class KTransformersMLPExpert(BaseInjectedModule, MLPExpertsBase):
def __init__(self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
# device: str = "cuda",
prefill_device:str = "cuda",
prefill_mlp_type: str | None = "MLPExpertsTorch",
generate_device: str = "cpu",
generate_mlp_type: str | None = "MLPCPUExperts",
**kwargs):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
MLPExpertsBase.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
MLPExpertsBase.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
if generate_mlp_type is not None:
self.generate_experts = EXPERTS_MAP[generate_mlp_type](key, gguf_loader, config, len(orig_module), device=generate_device, **kwargs)
else:
@ -471,6 +520,7 @@ class KTransformersMLPExpert(BaseInjectedModule, MLPExpertsBase):
from ktransformers.models.modeling_deepseek import DeepseekV2MoE
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock
from ktransformers.models.modeling_mixtral import MixtralSparseMoeBlock
class Qwen2MoeSparseMoeBlockInjected(BaseInjectedModule, Qwen2MoeSparseMoeBlock):
@ -578,7 +628,6 @@ class Qwen2MoeSparseMoeBlockInjected(BaseInjectedModule, Qwen2MoeSparseMoeBlock)
return final_hidden_states
class DeepseekV2MoEInjected(BaseInjectedModule, DeepseekV2MoE):
def forward(self, hidden_states):
identity = hidden_states
@ -587,7 +636,7 @@ class DeepseekV2MoEInjected(BaseInjectedModule, DeepseekV2MoE):
topk_idx, topk_weight, aux_loss = self.gate(hidden_states)
hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
if sequence_length == 1:
if sequence_length == 1 and hasattr(self.experts.generate_experts, "submit_for_one_decode"):
self.experts.generate_experts.submit_for_one_decode(hidden_states[0], topk_idx[0], topk_weight[0])
if self.config.n_shared_experts is not None:
y_ = self.shared_experts(identity).squeeze(0)
@ -677,3 +726,102 @@ class DeepseekV2MoEInjected(BaseInjectedModule, DeepseekV2MoE):
.type(new_x.dtype)
)
return final_out
class MisrtalSparseMoEBlockInjected(BaseInjectedModule, MixtralSparseMoeBlock):
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
""" """
orig_shape = hidden_states.shape
batch_size, sequence_length, hidden_dim = hidden_states.shape
if self.training and self.jitter_noise > 0:
hidden_states *= torch.empty_like(hidden_states).uniform_(1.0 - self.jitter_noise, 1.0 + self.jitter_noise)
hidden_states = hidden_states.view(-1, hidden_dim)
# router_logits: (batch * sequence_length, n_experts)
router_logits = self.gate(hidden_states)
routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
# we cast back to the input dtype
routing_weights = routing_weights.to(hidden_states.dtype)
if sequence_length == 1 and hasattr(self.experts.generate_experts, "submit_for_one_decode"):
self.experts.generate_experts.submit_for_one_decode(hidden_states[0], selected_experts[0], routing_weights[0])
y = self.experts.generate_experts.sync_for_one_decode().unsqueeze(0)
y.resize_(*orig_shape)
return y, router_logits
hidden_states_expert = hidden_states.to(self.experts.device) if isinstance(self.experts, MLPExpertsBase) else hidden_states_expert.cpu()
selected_experts_expert = selected_experts.to(self.experts.device) if isinstance(self.experts, MLPExpertsBase) else selected_experts_expert.cpu()
routing_weights_expert = routing_weights.to(self.experts.device) if isinstance(self.experts, MLPExpertsBase) else routing_weights_expert.cpu()
if isinstance(self.experts, MLPExpertsBase):
y = (
self.moe_on_cpuinfer(
hidden_states_expert, selected_experts_expert, routing_weights_expert
)
.view(*orig_shape)
.to(device=hidden_states.device)
)
elif hidden_states_expert.size(0) > 10:
y = self.moe_infer(
hidden_states_expert, selected_experts_expert, routing_weights_expert, orig_shape
).to(device=hidden_states.device)
else:
y = self.moe_infer_simple(
hidden_states_expert, selected_experts_expert, routing_weights_expert
).to(device=hidden_states.device)
y.resize_(*orig_shape)
return y, router_logits
@torch.no_grad()
def moe_on_cpuinfer(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor:
outs = torch.empty_like(x)
outs = self.experts(x, topk_ids, topk_weight)
return outs
@torch.no_grad()
# TODO may bugs here
def moe_infer_simple(self, hidden_states_cpu: torch.Tensor, selected_experts_cpu: torch.Tensor, routing_weights_cpu: torch.Tensor) -> torch.Tensor:
'''
hidden_states_cpu: [num_tokens, hidden_size]
topk_ids, topk_weight: [num_tokens, num_selected_experts]
'''
outs = torch.zeros_like(hidden_states_cpu)
for token_idx in range(selected_experts_cpu.size(0)):
for expert_idx in range(selected_experts_cpu.size(1)):
expert = self.experts[selected_experts_cpu[token_idx, expert_idx]]
outs[token_idx] += expert.forward(hidden_states_cpu[token_idx]) * routing_weights_cpu[token_idx, expert_idx]
return outs
@torch.no_grad()
# TODO may bugs here
def moe_infer(self, hidden_states_cpu: torch.Tensor, selected_experts_cpu: torch.Tensor, routing_weights_cpu: torch.Tensor, orig_shape: tuple) -> torch.Tensor:
batch_size, sequence_length, hidden_dim = orig_shape
final_hidden_states = torch.zeros(
(batch_size * sequence_length, hidden_dim), dtype=hidden_states_cpu.dtype, device=hidden_states_cpu.device
)
# One hot encode the selected experts to create an expert mask
# this will be used to easily index which expert is going to be sollicitated
expert_mask = torch.nn.functional.one_hot(selected_experts_cpu, num_classes=self.num_experts).permute(2, 1, 0)
# Loop over all available experts in the model and perform the computation on each expert
for expert_idx in range(self.num_experts):
expert_layer = self.experts[expert_idx]
idx, top_x = torch.where(expert_mask[expert_idx])
# Index the correct hidden states and compute the expert hidden state for
# the current expert. We need to make sure to multiply the output hidden
# states by `routing_weights` on the corresponding tokens (top-1 and top-2)
current_state = hidden_states_cpu[None, top_x].reshape(-1, hidden_dim)
current_hidden_states = expert_layer.forward(current_state) * routing_weights_cpu[top_x, idx, None]
# However `index_add_` only support torch tensors for indexing so we'll use
# the `top_x` tensor here.
final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states_cpu.dtype))
return final_hidden_states