diff --git a/README.md b/README.md index a80fe67..9505c1b 100644 --- a/README.md +++ b/README.md @@ -276,11 +276,11 @@ Below is an example of a YAML template for replacing all original Linear modules name: "^model\\.layers\\..*$" # regular expression class: torch.nn.Linear # only match modules matching name and class simultaneously replace: - class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types device: "cpu" # which devices to load this module when initializing kwargs: generate_device: "cuda" - generate_linear_type: "QuantizedLinearMarlin" + generate_op: "KLinearMarlin" ``` Each rule in the YAML file has two parts: `match` and `replace`. The `match` part specifies which module should be replaced, and the `replace` part specifies the module to be injected into the model along with the initialization keywords. diff --git a/doc/en/deepseek-v2-injection.md b/doc/en/deepseek-v2-injection.md index c1ccd39..e5dc1c2 100644 --- a/doc/en/deepseek-v2-injection.md +++ b/doc/en/deepseek-v2-injection.md @@ -90,7 +90,7 @@ The YAML rule is listed below. - match: name: "^model\\.layers\\..*\\.self_attn$" # regular expression replace: - class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation + class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation ``` As we can see, each rule in the YAML file has two parts: `match` and `replace`. @@ -98,9 +98,9 @@ The match part specifies which module should be replaced, and the replace part s

Routed Experts

-For routed experts, the module we inject is a wrapper of CPUInfer, KTransformersMLPExpert. There are several implementations within a wrapper, and we need to specify keywords to tell the wrapper which implementation we want to use and how we intend to use it. +For routed experts, the module we inject is a wrapper of CPUInfer, KTransformersExperts. There are several implementations within a wrapper, and we need to specify keywords to tell the wrapper which implementation we want to use and how we intend to use it. -In KTransformers, some models exhibit different behaviors during prefilling and generation for better performance. KTransformersMLPExpert is one of them. All these special modules have a `device` keyword describing which device the module should be initialized on. Other keywords specify the behaviors during prefilling and generation and may be differ when using different injection modules. Here, we specify which implementation on which device we want to use during prefilling and generation, and which device the output should be on. +In KTransformers, some models exhibit different behaviors during prefilling and generation for better performance. KTransformersExperts is one of them. All these special modules have a `device` keyword describing which device the module should be initialized on. Other keywords specify the behaviors during prefilling and generation and may differ when using different injection modules. Here, we specify which implementation on which device we want to use during prefilling and generation, and which device the output should be on. Note that we only use these parameters when layer-wise prefilling is enabled; otherwise, prefilling is conducted with the same configuration as generation. In the original implementation of Transformers, MoE is implemented using `nn.ModuleList`. We don't want KTransformers to iterate through all the sub-modules in the list, so we set `recursive: False` in this rule to prevent recursive injection into submodules of the current module. Here is the YAML rule: @@ -109,13 +109,13 @@ In the original implementation of Transformers, MoE is implemented using `nn.Mod - match: name: "^model\\.layers\\..*\\.mlp\\.experts$" replace: - class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert parallelism + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert parallelism device: "cpu" # device to load this module on initialization kwargs: prefill_device: "cuda" - prefill_mlp_type: "MLPExpertsTorch" + prefill_op: "KExpertsTorch" generate_device: "cpu" - generate_mlp_type: "MLPCPUExperts" + generate_op: "KExpertsCPU" out_device: "cuda" recursive: False # don't recursively inject submodules of this module ``` @@ -126,7 +126,7 @@ If we inject the expert list as a custom module, we can't use the interface in ` - match: class: ktransformers.models.modeling_deepseek.DeepseekV2MoE replace: - class: ktransformers.operators.experts.DeepseekV2MoEInjected # MLP module with custom forward function + class: ktransformers.operators.experts.KDeepseekV2MoE # MLP module with custom forward function ```
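The prefill/generate split described above is essentially a dispatch pattern: the wrapper builds one backend from `prefill_op` and one from `generate_op`, then routes calls according to the current inference state. The sketch below is a simplified, hypothetical illustration of that pattern; the stub classes, registry, and state handling are invented for brevity and are not the actual KTransformersExperts implementation.

```python
from enum import Enum, auto

class InferenceState(Enum):
    PREFILL = auto()
    GENERATE = auto()

class TorchExpertsStub:          # stand-in for a dense "KExpertsTorch"-style backend
    def __init__(self, device): self.device = device
    def forward(self, x): return f"dense torch experts on {self.device}: {x}"

class CPUInferExpertsStub:       # stand-in for a "KExpertsCPU"-style backend
    def __init__(self, device): self.device = device
    def forward(self, x): return f"cpuinfer experts on {self.device}: {x}"

# Analogous in spirit to EXPERTS_MAP in experts.py: the strings written in the
# YAML kwargs are looked up here to build the concrete backends.
EXPERTS_REGISTRY = {"KExpertsTorch": TorchExpertsStub, "KExpertsCPU": CPUInferExpertsStub}

class ExpertsDispatcher:
    """Own one backend per phase and route forward() by the current inference state."""
    def __init__(self, prefill_op, prefill_device, generate_op, generate_device):
        self.prefill = EXPERTS_REGISTRY[prefill_op](prefill_device) if prefill_op else None
        self.generate = EXPERTS_REGISTRY[generate_op](generate_device) if generate_op else None
        self.state = InferenceState.GENERATE

    def forward(self, x):
        backend = self.prefill if self.state is InferenceState.PREFILL else self.generate
        return backend.forward(x)

experts = ExpertsDispatcher("KExpertsTorch", "cuda", "KExpertsCPU", "cpu")
print(experts.forward("hidden_states"))   # routed to the CPU generate backend
```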

Other Linear Modules

@@ -140,12 +140,12 @@ We also need to transfer some keywords similar to the injection of experts. Here name: "^model\\.layers\\.(?!.*self_attn).*$" # regular expression class: torch.nn.Linear # only match modules matching name and class simultaneously replace: - class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types kwargs: generate_device: "cuda" prefill_device: "cuda" - generate_op: "QuantizedLinearMarlin" - prefill_op: "QuantizedLinearTorch" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" ```
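The `match` patterns above lean on a negative lookahead so that this generic Linear rule skips anything under `self_attn`, leaving those projections to the attention rule. A quick, self-contained check of how that pattern behaves on some illustrative module paths (the names below are typical DeepSeek-V2 submodules, listed only for demonstration):

```python
import re

# Same expression as in the rule above: match any model.layers.* path that does
# not contain "self_attn" anywhere after the prefix.
pattern = re.compile(r"^model\.layers\.(?!.*self_attn).*$")

names = [
    "model.layers.0.mlp.gate_proj",           # name matches -> candidate for KTransformersLinear
    "model.layers.31.mlp.experts.7.up_proj",  # name matches
    "model.layers.0.self_attn.q_proj",        # excluded by the lookahead
]
for name in names:
    print(f"{name}: {bool(pattern.match(name))}")
```

Remember that the rule only fires when both checks pass: the name must match this regular expression and the module must be an instance of `torch.nn.Linear`.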

Pre-compute Buffers

diff --git a/ktransformers/operators/attention.py b/ktransformers/operators/attention.py index 0648f51..3cfb9fd 100644 --- a/ktransformers/operators/attention.py +++ b/ktransformers/operators/attention.py @@ -15,7 +15,7 @@ from ktransformers.util.custom_gguf import GGUFLoader from transformers.configuration_utils import PretrainedConfig from transformers.cache_utils import Cache -class DeepseekV2AttentionInjected(BaseInjectedModule, DeepseekV2Attention): +class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention): """Multi-headed attention from 'Attention Is All You Need' paper""" def __init__(self, diff --git a/ktransformers/operators/experts.py b/ktransformers/operators/experts.py index 75fb729..864c4b7 100644 --- a/ktransformers/operators/experts.py +++ b/ktransformers/operators/experts.py @@ -5,8 +5,8 @@ Description : Author : Azure-Tang, Boxin Zhang, chenht2022 Date : 2024-07-25 11:25:24 Version : 0.1.0 -LastEditors : kkk1nak0 -LastEditTime : 2024-08-11 12:14:39 +LastEditors : Azure +LastEditTime : 2024-08-15 02:36:29 Copyright (c) 2024 by KVCache.AI, All Rights Reserved. ''' @@ -31,13 +31,13 @@ from ktransformers.server.config.config import Config from transformers.activations import ACT2FN from transformers.configuration_utils import PretrainedConfig from abc import ABC, abstractmethod -from ktransformers.operators.linear import QuantizedLinearMarlin, QuantizedLinearTorch, KTransformerLinear +from ktransformers.operators.linear import KLinearMarlin, KLinearTorch, KTransformersLinear import time from ktransformers.operators.cpuinfer import CPUInfer # class Base(BaseInjectedModule, ABC): -class MLPExpertsBase(ABC): +class KExpertsBase(ABC): def __init__(self, key: str, gguf_loader: GGUFLoader, config: PretrainedConfig, orig_module: nn.Module, device: str = "cuda", **kwargs): # super().__init__(key, gguf_loader, config, orig_module, device, **kwargs) self.key = key @@ -111,7 +111,7 @@ class MLPExpertsBase(ABC): tensors[k] = self.gguf_loader.load_gguf_tensor(key + k, device=device) return tensors -class MLPCPUExperts(MLPExpertsBase): +class KExpertsCPU(KExpertsBase): input_tensor_cpu:Tensor = None expert_ids_cpu:Tensor = None weights_cpu:Tensor = None @@ -131,13 +131,13 @@ class MLPCPUExperts(MLPExpertsBase): **kwargs ): super().__init__(key, gguf_loader, config, orig_module, device, **kwargs) - assert device.lower() == "cpu", "MLPCPUExperts can only be loaded on CPU" + assert device.lower() == "cpu", "KExpertsCPU can only be loaded on CPU" self.n_routed_experts = n_routed_experts self.out_device = out_device def load(self, w: dict | nn.Parameter | tuple | None = None, device:str|None = None, warmup:bool = False): if device: - assert device.lower() == "cpu", "MLPCPUExperts can only be loaded on CPU, Parameter \"device\" can be cpu or None." + assert device.lower() == "cpu", "KExpertsCPU can only be loaded on CPU, Parameter \"device\" can be cpu or None." 
if w is None: w = self.load_weights()[self.key] self.gate = w["gate"] self.up = w["up"] @@ -176,28 +176,28 @@ class MLPCPUExperts(MLPExpertsBase): # print(n_routed_experts, hidden_size, moe_intermediate_size) num_experts_per_tok = self.config.num_experts_per_tok self.moe = MOE(moe_config) - self.cpu_infer = MLPCPUExperts.CPU_INFER + self.cpu_infer = KExpertsCPU.CPU_INFER if warmup: self.cpu_infer.submit(self.moe.warm_up()) self.cpu_infer.sync() - if self.out_device not in MLPCPUExperts.output_gpu_map: - MLPCPUExperts.output_gpu_map[self.out_device] = torch.zeros((self.config.hidden_size), device=self.out_device) - if MLPCPUExperts.input_tensor_cpu == None: - MLPCPUExperts.input_tensor_cpu = torch.zeros((self.config.hidden_size), device="cpu", pin_memory=True) - MLPCPUExperts.expert_ids_cpu = torch.zeros((num_experts_per_tok), device="cpu", dtype=torch.long, pin_memory=True) - MLPCPUExperts.weights_cpu = torch.zeros((num_experts_per_tok), device="cpu", dtype=torch.float32, pin_memory=True) - MLPCPUExperts.output_cpu = torch.zeros((self.config.hidden_size), device="cpu", pin_memory=True, dtype=torch.bfloat16) + if self.out_device not in KExpertsCPU.output_gpu_map: + KExpertsCPU.output_gpu_map[self.out_device] = torch.zeros((self.config.hidden_size), device=self.out_device) + if KExpertsCPU.input_tensor_cpu == None: + KExpertsCPU.input_tensor_cpu = torch.zeros((self.config.hidden_size), device="cpu", pin_memory=True) + KExpertsCPU.expert_ids_cpu = torch.zeros((num_experts_per_tok), device="cpu", dtype=torch.long, pin_memory=True) + KExpertsCPU.weights_cpu = torch.zeros((num_experts_per_tok), device="cpu", dtype=torch.float32, pin_memory=True) + KExpertsCPU.output_cpu = torch.zeros((self.config.hidden_size), device="cpu", pin_memory=True, dtype=torch.bfloat16) def submit_for_one_decode(self, input_tensor, expert_ids, weights): - MLPCPUExperts.input_tensor_cpu.copy_(input_tensor, non_blocking=True) - MLPCPUExperts.expert_ids_cpu.copy_(expert_ids, non_blocking=True) - MLPCPUExperts.weights_cpu.copy_(weights, non_blocking=True) - self.cpu_infer.submit_with_cuda_stream(torch.cuda.current_stream(self.out_device).cuda_stream, self.moe.forward(1, expert_ids.size(0), MLPCPUExperts.expert_ids_cpu.data_ptr(), MLPCPUExperts.weights_cpu.data_ptr(), MLPCPUExperts.input_tensor_cpu.data_ptr(), MLPCPUExperts.output_cpu.data_ptr())) + KExpertsCPU.input_tensor_cpu.copy_(input_tensor, non_blocking=True) + KExpertsCPU.expert_ids_cpu.copy_(expert_ids, non_blocking=True) + KExpertsCPU.weights_cpu.copy_(weights, non_blocking=True) + self.cpu_infer.submit_with_cuda_stream(torch.cuda.current_stream(self.out_device).cuda_stream, self.moe.forward(1, expert_ids.size(0), KExpertsCPU.expert_ids_cpu.data_ptr(), KExpertsCPU.weights_cpu.data_ptr(), KExpertsCPU.input_tensor_cpu.data_ptr(), KExpertsCPU.output_cpu.data_ptr())) def sync_for_one_decode(self): self.cpu_infer.sync_with_cuda_stream(torch.cuda.current_stream(self.out_device).cuda_stream) - MLPCPUExperts.output_gpu_map[self.out_device].copy_(MLPCPUExperts.output_cpu, non_blocking=True) - return MLPCPUExperts.output_gpu_map[self.out_device] + KExpertsCPU.output_gpu_map[self.out_device].copy_(KExpertsCPU.output_cpu, non_blocking=True) + return KExpertsCPU.output_gpu_map[self.out_device] def forward(self, input_tensor, expert_ids, weights): # generate, capture and run cuda graph @@ -205,13 +205,13 @@ class MLPCPUExperts(MLPExpertsBase): if input_tensor.size(0)==1: # TODO: this branch is unreachable, but the shape of input_tensor([1,hidden_size]) and 
input_tensor_cpu([hidden_size]) is not compatible #print("capturing experts") - MLPCPUExperts.input_tensor_cpu.copy_(input_tensor, non_blocking=True) - MLPCPUExperts.expert_ids_cpu.copy_(expert_ids, non_blocking=True) - MLPCPUExperts.weights_cpu.copy_(weights, non_blocking=True) - self.cpu_infer.submit_with_cuda_stream(torch.cuda.current_stream().cuda_stream, self.moe.forward(1, expert_ids.size(1), MLPCPUExperts.expert_ids_cpu.data_ptr(), MLPCPUExperts.weights_cpu.data_ptr(), MLPCPUExperts.input_tensor_cpu.data_ptr(), MLPCPUExperts.output_cpu.data_ptr())) + KExpertsCPU.input_tensor_cpu.copy_(input_tensor, non_blocking=True) + KExpertsCPU.expert_ids_cpu.copy_(expert_ids, non_blocking=True) + KExpertsCPU.weights_cpu.copy_(weights, non_blocking=True) + self.cpu_infer.submit_with_cuda_stream(torch.cuda.current_stream().cuda_stream, self.moe.forward(1, expert_ids.size(1), KExpertsCPU.expert_ids_cpu.data_ptr(), KExpertsCPU.weights_cpu.data_ptr(), KExpertsCPU.input_tensor_cpu.data_ptr(), KExpertsCPU.output_cpu.data_ptr())) self.cpu_infer.sync_with_cuda_stream(torch.cuda.current_stream().cuda_stream) - MLPCPUExperts.output_gpu_map[self.out_device].copy_(MLPCPUExperts.output_cpu, non_blocking=True) - return MLPCPUExperts.output_gpu_map[self.out_device] + KExpertsCPU.output_gpu_map[self.out_device].copy_(KExpertsCPU.output_cpu, non_blocking=True) + return KExpertsCPU.output_gpu_map[self.out_device] else: input_tensor = input_tensor.contiguous().cpu() expert_ids = expert_ids.contiguous().cpu() @@ -269,7 +269,7 @@ class MLPCPUExperts(MLPExpertsBase): res = {key:{"gate": gate, "up": up, "down": down, "gate_type": gate_type, "up_type": up_type, "down_type": down_type}} return res -class MLPExpertsMarlin(MLPExpertsBase): +class KExpertsMarlin(KExpertsBase): expert_num: int loaded_experts_idx: list[int] def __init__( @@ -290,11 +290,11 @@ class MLPExpertsMarlin(MLPExpertsBase): self.device = device # create empty marlin experts according to the number of experts per token # up - self.up_projs = [QuantizedLinearMarlin(key+ "." + "ffn_up_exps", gguf_loader, config, device=device) for i in range(self.expert_num)] + self.up_projs = [KLinearMarlin(key+ "." + "ffn_up_exps", gguf_loader, config, device=device) for i in range(self.expert_num)] # gate - self.gate_projs = [QuantizedLinearMarlin(key+ "." + "ffn_gate_exps", gguf_loader, config, device=device) for i in range(self.expert_num)] + self.gate_projs = [KLinearMarlin(key+ "." + "ffn_gate_exps", gguf_loader, config, device=device) for i in range(self.expert_num)] # down - self.down_projs = [QuantizedLinearMarlin(key+ "." + "ffn_down_exps", gguf_loader, config, device=device) for i in range(self.expert_num)] + self.down_projs = [KLinearMarlin(key+ "." 
+ "ffn_down_exps", gguf_loader, config, device=device) for i in range(self.expert_num)] def load(self, w: dict | nn.Parameter | tuple | None = None, device: str | None = None, warmup: bool = False): if device is None: device = self.device @@ -359,7 +359,7 @@ class MLPExpertsMarlin(MLPExpertsBase): outs = outs.to(device) return outs -class MLPExpertsTorch(MLPExpertsBase): +class KExpertsTorch(KExpertsBase): expert_num: int loaded_experts_idx: list[int] gate: torch.Tensor @@ -439,12 +439,12 @@ class MLPExpertsTorch(MLPExpertsBase): return final_hidden_states.to(org_dtype, device=org_device) EXPERTS_MAP = { - "MLPCPUExperts": MLPCPUExperts, - "MLPExpertsTorch": MLPExpertsTorch, - "MLPExpertsMarlin": MLPExpertsMarlin, + "KExpertsCPU": KExpertsCPU, + "KExpertsTorch": KExpertsTorch, + "KExpertsMarlin": KExpertsMarlin, } -class KTransformersMLPExpert(BaseInjectedModule, MLPExpertsBase): +class KTransformersExperts(BaseInjectedModule, KExpertsBase): def __init__(self, key: str, gguf_loader: GGUFLoader, @@ -452,22 +452,22 @@ class KTransformersMLPExpert(BaseInjectedModule, MLPExpertsBase): orig_module: nn.Module, # device: str = "cuda", prefill_device:str = "cuda", - prefill_mlp_type: str | None = "MLPExpertsTorch", + prefill_op: str | None = "KExpertsTorch", generate_device: str = "cpu", - generate_mlp_type: str | None = "MLPCPUExperts", + generate_op: str | None = "KExpertsCPU", **kwargs): BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs) - MLPExpertsBase.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs) - if generate_mlp_type is not None: - self.generate_experts = EXPERTS_MAP[generate_mlp_type](key, gguf_loader, config, len(orig_module), device=generate_device, **kwargs) + KExpertsBase.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs) + if generate_op is not None: + self.generate_experts = EXPERTS_MAP[generate_op](key, gguf_loader, config, len(orig_module), device=generate_device, **kwargs) else: self.generate_experts = None - if prefill_mlp_type is not None: - self.prefill_experts = EXPERTS_MAP[prefill_mlp_type](key, gguf_loader, config, len(orig_module), device=prefill_device, **kwargs) + if prefill_op is not None: + self.prefill_experts = EXPERTS_MAP[prefill_op](key, gguf_loader, config, len(orig_module), device=prefill_device, **kwargs) else: self.prefill_experts = None - self.gpu_mlp_type = prefill_mlp_type - self.cpu_mlp_type = generate_mlp_type + self.gpu_mlp_type = prefill_op + self.cpu_mlp_type = generate_op self.mode = InferenceState.UNLOAD def load(self, w: dict = None, mode: InferenceState = None, warmup: bool = True): @@ -523,7 +523,7 @@ from ktransformers.models.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock from ktransformers.models.modeling_mixtral import MixtralSparseMoeBlock -class Qwen2MoeSparseMoeBlockInjected(BaseInjectedModule, Qwen2MoeSparseMoeBlock): +class KQwen2MoeSparseMoeBlock(BaseInjectedModule, Qwen2MoeSparseMoeBlock): def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: """ """ orig_shape = hidden_states.shape @@ -548,16 +548,16 @@ class Qwen2MoeSparseMoeBlockInjected(BaseInjectedModule, Qwen2MoeSparseMoeBlock) y.resize_(*orig_shape) return y, router_logits - hidden_states_expert = hidden_states.to(self.experts.device) if isinstance(self.experts, MLPExpertsBase) else hidden_states_expert.cpu() - selected_experts_expert = selected_experts.to(self.experts.device) if isinstance(self.experts, MLPExpertsBase) else selected_experts_expert.cpu() - 
routing_weights_expert = routing_weights.to(self.experts.device) if isinstance(self.experts, MLPExpertsBase) else routing_weights_expert.cpu() + hidden_states_expert = hidden_states.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else hidden_states_expert.cpu() + selected_experts_expert = selected_experts.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else selected_experts_expert.cpu() + routing_weights_expert = routing_weights.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else routing_weights_expert.cpu() shared_expert_output = self.shared_expert(hidden_states) shared_expert_output = ( F.sigmoid(self.shared_expert_gate(hidden_states)) * shared_expert_output ) - if isinstance(self.experts, MLPExpertsBase): + if isinstance(self.experts, KExpertsBase): y = ( self.moe_on_cpuinfer( hidden_states_expert, selected_experts_expert, routing_weights_expert @@ -628,7 +628,7 @@ class Qwen2MoeSparseMoeBlockInjected(BaseInjectedModule, Qwen2MoeSparseMoeBlock) return final_hidden_states -class DeepseekV2MoEInjected(BaseInjectedModule, DeepseekV2MoE): +class KDeepseekV2MoE(BaseInjectedModule, DeepseekV2MoE): def forward(self, hidden_states): identity = hidden_states orig_shape = hidden_states.shape @@ -648,7 +648,7 @@ class DeepseekV2MoEInjected(BaseInjectedModule, DeepseekV2MoE): if self.config.n_shared_experts is not None: y_ = self.shared_experts(identity).squeeze(0) - if isinstance(self.experts, MLPExpertsBase): + if isinstance(self.experts, KExpertsBase): y = self.moe_on_cpuinfer(hidden_states, topk_idx, topk_weight).view(*orig_shape).to(device=hidden_states.device) elif hidden_states.size(0) > 10: # TODO may bugs here @@ -727,7 +727,7 @@ class DeepseekV2MoEInjected(BaseInjectedModule, DeepseekV2MoE): ) return final_out -class MisrtalSparseMoEBlockInjected(BaseInjectedModule, MixtralSparseMoeBlock): +class KMisrtalSparseMoEBlock(BaseInjectedModule, MixtralSparseMoeBlock): def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: """ """ @@ -751,11 +751,11 @@ class MisrtalSparseMoEBlockInjected(BaseInjectedModule, MixtralSparseMoeBlock): y.resize_(*orig_shape) return y, router_logits - hidden_states_expert = hidden_states.to(self.experts.device) if isinstance(self.experts, MLPExpertsBase) else hidden_states_expert.cpu() - selected_experts_expert = selected_experts.to(self.experts.device) if isinstance(self.experts, MLPExpertsBase) else selected_experts_expert.cpu() - routing_weights_expert = routing_weights.to(self.experts.device) if isinstance(self.experts, MLPExpertsBase) else routing_weights_expert.cpu() + hidden_states_expert = hidden_states.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else hidden_states_expert.cpu() + selected_experts_expert = selected_experts.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else selected_experts_expert.cpu() + routing_weights_expert = routing_weights.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else routing_weights_expert.cpu() - if isinstance(self.experts, MLPExpertsBase): + if isinstance(self.experts, KExpertsBase): y = ( self.moe_on_cpuinfer( hidden_states_expert, selected_experts_expert, routing_weights_expert diff --git a/ktransformers/operators/linear.py b/ktransformers/operators/linear.py index e984a90..146fb85 100644 --- a/ktransformers/operators/linear.py +++ b/ktransformers/operators/linear.py @@ -6,7 +6,7 @@ Author : Azure-Tang, Boxin Zhang Date : 2024-07-25 11:25:24 Version : 0.1.0 LastEditors : Azure -LastEditTime : 2024-07-26 
09:27:53 +LastEditTime : 2024-08-14 14:57:04 Copyright (c) 2024 by KVCache.AI, All Rights Reserved. ''' @@ -34,8 +34,8 @@ import cpuinfer_ext from ktransformers.operators.cpuinfer import CPUInfer from ktransformers.server.config.config import Config -#class QuantizedLinearBase(BaseInjectedModule, ABC): -class QuantizedLinearBase(ABC): +#class KLinearBase(BaseInjectedModule, ABC): +class KLinearBase(ABC): def __init__( self, key: str, @@ -106,7 +106,7 @@ class QuantizedLinearBase(ABC): pass -class QuantizedLinearTorch(QuantizedLinearBase): +class KLinearTorch(KLinearBase): def __init__( self, key: str, @@ -158,7 +158,7 @@ class QuantizedLinearTorch(QuantizedLinearBase): self.bias = None -class QuantizedLinearMarlin(QuantizedLinearBase): +class KLinearMarlin(KLinearBase): marlin_q_w: torch.Tensor marlin_s: torch.Tensor g_idx: torch.Tensor @@ -252,7 +252,7 @@ class QuantizedLinearMarlin(QuantizedLinearBase): self.sort_indices = None self.workspace = None -class QuantizedLinearCPUInfer(QuantizedLinearBase): +class KLinearCPUInfer(KLinearBase): CPU_INFER = CPUInfer(Config().cpu_infer) def __init__( self, @@ -281,7 +281,7 @@ class QuantizedLinearCPUInfer(QuantizedLinearBase): out_device = x.device self.input_tensor_cpu.copy_(x, non_blocking=True) qlen = origin_shape[1] - QuantizedLinearCPUInfer.CPU_INFER.submit_with_cuda_stream( + KLinearCPUInfer.CPU_INFER.submit_with_cuda_stream( torch.cuda.current_stream().cuda_stream, self.linear.forward( qlen, @@ -289,7 +289,7 @@ class QuantizedLinearCPUInfer(QuantizedLinearBase): self.output_cpu.data_ptr() ) ) - QuantizedLinearCPUInfer.CPU_INFER.sync_with_cuda_stream(torch.cuda.current_stream().cuda_stream) + KLinearCPUInfer.CPU_INFER.sync_with_cuda_stream(torch.cuda.current_stream().cuda_stream) self.output_gpu.copy_(self.output_cpu, non_blocking=True) if self.has_bias: self.output_gpu += self.bias @@ -301,14 +301,14 @@ class QuantizedLinearCPUInfer(QuantizedLinearBase): qlen = origin_shape[1] output_shape = (*origin_shape[:-1], self.out_features) output = torch.empty(output_shape, device=x.device, dtype=x.dtype) - QuantizedLinearCPUInfer.CPU_INFER.submit( + KLinearCPUInfer.CPU_INFER.submit( self.linear.forward( qlen, x.data_ptr(), output.data_ptr() ) ) - QuantizedLinearCPUInfer.CPU_INFER.sync() + KLinearCPUInfer.CPU_INFER.sync() if self.has_bias: output = output + self.bias output = output.to(dtype=dtype, device=out_device) @@ -329,8 +329,8 @@ class QuantizedLinearCPUInfer(QuantizedLinearBase): self.linear = cpuinfer_ext.linear.Linear(config) if warmup: - QuantizedLinearCPUInfer.CPU_INFER.submit(self.linear.warm_up()) - QuantizedLinearCPUInfer.CPU_INFER.sync() + KLinearCPUInfer.CPU_INFER.submit(self.linear.warm_up()) + KLinearCPUInfer.CPU_INFER.sync() self.input_tensor_cpu = torch.zeros((1, 1, self.in_features), device="cpu", pin_memory=True) self.output_cpu = torch.zeros((1, 1, self.out_features), device="cpu", pin_memory=True, dtype=torch.bfloat16) self.output_gpu = torch.zeros((1, 1, self.out_features), device=self.out_device) @@ -355,12 +355,12 @@ class QuantizedLinearCPUInfer(QuantizedLinearBase): self.bias = None LINEAR_MAP = { - "QuantizedLinearMarlin": QuantizedLinearMarlin, - "QuantizedLinearTorch": QuantizedLinearTorch, - "QuantizedLinearCPUInfer": QuantizedLinearCPUInfer + "KLinearMarlin": KLinearMarlin, + "KLinearTorch": KLinearTorch, + "KLinearCPUInfer": KLinearCPUInfer } -class KTransformerLinear(BaseInjectedModule, QuantizedLinearBase): +class KTransformersLinear(BaseInjectedModule, KLinearBase): def __init__( self, key: str, @@ -369,20 
+369,20 @@ class KTransformerLinear(BaseInjectedModule, QuantizedLinearBase): orig_module: nn.Module, # device: str = "cuda", generate_device: str = "cuda", - generate_op: str| None = "QuantizedLinearMarlin", + generate_op: str| None = "KLinearMarlin", prefill_device: str = "cuda", - prefill_op: str| None = "QuantizedLinearTorch", + prefill_op: str| None = "KLinearTorch", **kwargs, ): BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs) - QuantizedLinearBase.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs) + KLinearBase.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs) # build all the linear operators if prefill_op is not None: assert prefill_op in LINEAR_MAP, f"linear_type {prefill_op} not supported" - if prefill_op == "QuantizedLinearMarlin" and (orig_module.in_features%GPTQ_MARLIN_MIN_THREAD_N!=0 or orig_module.out_features%GPTQ_MARLIN_MIN_THREAD_N!=0): - print(f"This linear module's in_features or out_features is not divisible by GPTQ_MARLIN_MIN_THREAD_N({GPTQ_MARLIN_MIN_THREAD_N}), using QuantizedLinearTorch instead.") + if prefill_op == "KLinearMarlin" and (orig_module.in_features%GPTQ_MARLIN_MIN_THREAD_N!=0 or orig_module.out_features%GPTQ_MARLIN_MIN_THREAD_N!=0): + print(f"This linear module's in_features or out_features is not divisible by GPTQ_MARLIN_MIN_THREAD_N({GPTQ_MARLIN_MIN_THREAD_N}), using KLinearTorch instead.") print(f"module info: key:{key} orig_module:{orig_module}") - self.prefill_linear = QuantizedLinearTorch(key, gguf_loader, config, orig_module, prefill_device, **kwargs) + self.prefill_linear = KLinearTorch(key, gguf_loader, config, orig_module, prefill_device, **kwargs) else: self.prefill_linear = LINEAR_MAP[prefill_op](key, gguf_loader, config, orig_module, prefill_device, **kwargs) else: @@ -390,11 +390,11 @@ class KTransformerLinear(BaseInjectedModule, QuantizedLinearBase): if generate_op is not None: assert generate_op in LINEAR_MAP, f"linear_type {generate_op} not supported" - if generate_op == "QuantizedLinearMarlin" and (orig_module.in_features%GPTQ_MARLIN_MIN_THREAD_N!=0 or orig_module.out_features%GPTQ_MARLIN_MIN_THREAD_N!=0): - print(f"This linear module's in_features or out_features is not divisible by GPTQ_MARLIN_MIN_THREAD_N({GPTQ_MARLIN_MIN_THREAD_N}), using QuantizedLinearTorch instead.") + if generate_op == "KLinearMarlin" and (orig_module.in_features%GPTQ_MARLIN_MIN_THREAD_N!=0 or orig_module.out_features%GPTQ_MARLIN_MIN_THREAD_N!=0): + print(f"This linear module's in_features or out_features is not divisible by GPTQ_MARLIN_MIN_THREAD_N({GPTQ_MARLIN_MIN_THREAD_N}), using KLinearTorch instead.") print(f"module info: key:{key} orig_module:{orig_module}") - self.generate_op = "QuantizedLinearTorch" - self.generate_linear = QuantizedLinearTorch(key, gguf_loader, config, orig_module, generate_device, **kwargs) + self.generate_op = "KLinearTorch" + self.generate_linear = KLinearTorch(key, gguf_loader, config, orig_module, generate_device, **kwargs) else: self.generate_linear = LINEAR_MAP[generate_op](key, gguf_loader, config, orig_module, generate_device, **kwargs) else: diff --git a/ktransformers/operators/layer_wise_prefill.py b/ktransformers/operators/models.py similarity index 99% rename from ktransformers/operators/layer_wise_prefill.py rename to ktransformers/operators/models.py index 2a1d1fe..c95e1ee 100644 --- a/ktransformers/operators/layer_wise_prefill.py +++ b/ktransformers/operators/models.py @@ -6,7 +6,7 @@ Author : Azure-Tang Date 
: 2024-07-25 11:25:24 Version : 1.0.0 LastEditors : Azure -LastEditTime : 2024-08-08 10:09:14 +LastEditTime : 2024-08-14 14:53:05 Copyright (c) 2024 by KVCache.AI, All Rights Reserved. ''' @@ -155,7 +155,7 @@ QWEN2MOE_INPUTS_DOCSTRING = r""" "The bare Qwen2MoE Model outputting raw hidden-states without any specific head on top.", QWEN2MOE_START_DOCSTRING, ) -class Qwen2MoeModelKTransformers(BaseInjectedModule): +class KQwen2MoeModel(BaseInjectedModule): """ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2MoeDecoderLayer`] @@ -451,7 +451,7 @@ DeepseekV2_INPUTS_DOCSTRING = r""" """ -class DeepseekV2ModelKTransformers(BaseInjectedModule): +class KDeepseekV2Model(BaseInjectedModule): """ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV2DecoderLayer`] diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu-4.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu-4.yaml index 31c5c87..5f3b780 100644 --- a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu-4.yaml +++ b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu-4.yaml @@ -43,48 +43,48 @@ name: "^model\\.layers\\.([0-9])\\.(?!self_attn).*$" # regular expression class: torch.nn.Linear # only match modules matching name and class simultaneously replace: - class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types kwargs: generate_device: "cuda:0" prefill_device: "cuda:0" - generate_op: "QuantizedLinearMarlin" - prefill_op: "QuantizedLinearTorch" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" - match: name: "^model\\.layers\\.([1][0-9])\\.(?!self_attn).*$" # regular expression class: torch.nn.Linear # only match modules matching name and class simultaneously replace: - class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types kwargs: generate_device: "cuda:1" prefill_device: "cuda:1" - generate_op: "QuantizedLinearMarlin" - prefill_op: "QuantizedLinearTorch" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" - match: name: "^model\\.layers\\.([2][0-9])\\.(?!self_attn).*$" # regular expression class: torch.nn.Linear # only match modules matching name and class simultaneously replace: - class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types kwargs: generate_device: "cuda:2" prefill_device: "cuda:2" - generate_op: "QuantizedLinearMarlin" - prefill_op: "QuantizedLinearTorch" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" - match: name: "^model\\.layers\\.([345][0-9])\\.(?!self_attn).*$" # regular expression class: torch.nn.Linear # only match modules matching name and class simultaneously replace: - class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types kwargs: generate_device: "cuda:3" prefill_device: "cuda:3" - generate_op: "QuantizedLinearMarlin" - prefill_op: "QuantizedLinearTorch" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" - match: name: 
"^model\\.layers\\.([0-9])\\.mlp$" class: ktransformers.models.modeling_deepseek.DeepseekV2MoE replace: - class: ktransformers.operators.experts.DeepseekV2MoEInjected # mlp module with custom forward function + class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function kwargs: generate_device: "cuda:0" prefill_device: "cuda:0" @@ -92,7 +92,7 @@ name: "^model\\.layers\\.([1][0-9])\\.mlp$" class: ktransformers.models.modeling_deepseek.DeepseekV2MoE replace: - class: ktransformers.operators.experts.DeepseekV2MoEInjected # mlp module with custom forward function + class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function kwargs: generate_device: "cuda:1" prefill_device: "cuda:1" @@ -100,7 +100,7 @@ name: "^model\\.layers\\.([2][0-9])\\.mlp$" class: ktransformers.models.modeling_deepseek.DeepseekV2MoE replace: - class: ktransformers.operators.experts.DeepseekV2MoEInjected # mlp module with custom forward function + class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function kwargs: generate_device: "cuda:2" prefill_device: "cuda:2" @@ -108,7 +108,7 @@ name: "^model\\.layers\\.([345][0-9])\\.mlp$" class: ktransformers.models.modeling_deepseek.DeepseekV2MoE replace: - class: ktransformers.operators.experts.DeepseekV2MoEInjected # mlp module with custom forward function + class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function kwargs: generate_device: "cuda:3" prefill_device: "cuda:3" @@ -116,73 +116,73 @@ - match: name: "^model\\.layers\\.([0-9])\\.mlp\\.experts$" replace: - class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert paralleism + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism kwargs: prefill_device: "cuda:0" - prefill_mlp_type: "MLPExpertsTorch" + prefill_op: "KExpertsTorch" generate_device: "cpu" - generate_mlp_type: "MLPCPUExperts" + generate_op: "KExpertsCPU" out_device: "cuda:0" recursive: False # don't recursively inject submodules of this module - match: name: "^model\\.layers\\.([1][0-9])\\.mlp\\.experts$" replace: - class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert paralleism + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism kwargs: prefill_device: "cuda:1" - prefill_mlp_type: "MLPExpertsTorch" + prefill_op: "KExpertsTorch" generate_device: "cpu" - generate_mlp_type: "MLPCPUExperts" + generate_op: "KExpertsCPU" out_device: "cuda:1" recursive: False # don't recursively inject submodules of this module - match: name: "^model\\.layers\\.([2][0-9])\\.mlp\\.experts$" replace: - class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert paralleism + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism kwargs: prefill_device: "cuda:2" - prefill_mlp_type: "MLPExpertsTorch" + prefill_op: "KExpertsTorch" generate_device: "cpu" - generate_mlp_type: "MLPCPUExperts" + generate_op: "KExpertsCPU" out_device: "cuda:2" recursive: False # don't recursively inject submodules of this module - match: name: "^model\\.layers\\.([345][0-9])\\.mlp\\.experts$" replace: - class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert paralleism + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism 
kwargs: prefill_device: "cuda:3" - prefill_mlp_type: "MLPExpertsTorch" + prefill_op: "KExpertsTorch" generate_device: "cpu" - generate_mlp_type: "MLPCPUExperts" + generate_op: "KExpertsCPU" out_device: "cuda:3" recursive: False # don't recursively inject submodules of this module - match: name: "^model\\.layers\\.([0-9])\\.self_attn$" replace: - class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation + class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation kwargs: generate_device: "cuda:0" prefill_device: "cuda:0" - match: name: "^model\\.layers\\.([1][0-9])\\.self_attn$" replace: - class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation + class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation kwargs: generate_device: "cuda:1" prefill_device: "cuda:1" - match: name: "^model\\.layers\\.([2][0-9])\\.self_attn$" replace: - class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation + class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation kwargs: generate_device: "cuda:2" prefill_device: "cuda:2" - match: name: "^model\\.layers\\.([345][0-9])\\.self_attn$" replace: - class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation + class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation kwargs: generate_device: "cuda:3" prefill_device: "cuda:3" @@ -190,7 +190,7 @@ - match: name: "^model$" replace: - class: "ktransformers.operators.layer_wise_prefill.DeepseekV2ModelKTransformers" + class: "ktransformers.operators.layer_wise_prefill.KDeepseekV2Model" kwargs: per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill transfer_map: diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu.yaml index 15e8e10..882c75f 100644 --- a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu.yaml +++ b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu.yaml @@ -27,29 +27,29 @@ name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.(?!self_attn).*$" # regular expression class: torch.nn.Linear # only match modules matching name and class simultaneously replace: - class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types kwargs: generate_device: "cuda:0" prefill_device: "cuda:0" - generate_op: "QuantizedLinearMarlin" - prefill_op: "QuantizedLinearTorch" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" - match: name: "^model\\.layers\\.([345][0-9])\\.(?!self_attn).*$" # regular expression class: torch.nn.Linear # only match modules matching name and class simultaneously replace: - class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types kwargs: generate_device: "cuda:1" prefill_device: "cuda:1" - generate_op: "QuantizedLinearMarlin" - prefill_op: "QuantizedLinearTorch" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" - match: name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp$" class: ktransformers.models.modeling_deepseek.DeepseekV2MoE replace: - class: 
ktransformers.operators.experts.DeepseekV2MoEInjected # mlp module with custom forward function + class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function kwargs: generate_device: "cuda:0" prefill_device: "cuda:0" @@ -57,7 +57,7 @@ name: "^model\\.layers\\.([345][0-9])\\.mlp$" class: ktransformers.models.modeling_deepseek.DeepseekV2MoE replace: - class: ktransformers.operators.experts.DeepseekV2MoEInjected # mlp module with custom forward function + class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function kwargs: generate_device: "cuda:1" prefill_device: "cuda:1" @@ -65,45 +65,45 @@ - match: name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$" replace: - class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert paralleism + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism kwargs: prefill_device: "cuda:0" - prefill_mlp_type: "MLPExpertsTorch" + prefill_op: "KExpertsTorch" generate_device: "cpu" - generate_mlp_type: "MLPCPUExperts" + generate_op: "KExpertsCPU" out_device: "cuda:0" recursive: False # don't recursively inject submodules of this module - match: name: "^model\\.layers\\.([345][0-9])\\.mlp\\.experts$" replace: - class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert paralleism + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism kwargs: prefill_device: "cuda:1" - prefill_mlp_type: "MLPExpertsTorch" + prefill_op: "KExpertsTorch" generate_device: "cpu" - generate_mlp_type: "MLPCPUExperts" + generate_op: "KExpertsCPU" out_device: "cuda:1" recursive: False # don't recursively inject submodules of this module - match: name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.self_attn$" replace: - class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation + class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation kwargs: generate_device: "cuda:0" prefill_device: "cuda:0" - match: name: "^model\\.layers\\.([345][0-9])\\.self_attn$" replace: - class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation + class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation kwargs: generate_device: "cuda:1" prefill_device: "cuda:1" - match: name: "^model$" replace: - class: "ktransformers.operators.layer_wise_prefill.DeepseekV2ModelKTransformers" + class: "ktransformers.operators.layer_wise_prefill.KDeepseekV2Model" kwargs: per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill transfer_map: diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml index 47fe084..85d2e82 100644 --- a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml +++ b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml @@ -9,53 +9,53 @@ # name: "^model\\.layers\\.([1-5][0-9])\\.mlp\\.shared_experts.*$" # regular expression # class: torch.nn.Linear # only match modules matching name and class simultaneously # replace: -# class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types +# class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types # kwargs: # generate_device: "cpu" # prefill_device: "cuda" -# generate_op: 
"QuantizedLinearCPUInfer" -# prefill_op: "QuantizedLinearTorch" +# generate_op: "KLinearCPUInfer" +# prefill_op: "KLinearTorch" # out_device: "cuda" - match: name: "^model\\.layers\\.(?!.*self_attn).*$" # regular expression class: torch.nn.Linear # only match modules matching name and class simultaneously replace: - class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types kwargs: generate_device: "cuda" prefill_device: "cuda" - generate_op: "QuantizedLinearMarlin" - prefill_op: "QuantizedLinearTorch" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" - match: name: "^model\\.layers\\..*\\.mlp$" class: ktransformers.models.modeling_deepseek.DeepseekV2MoE replace: - class: ktransformers.operators.experts.DeepseekV2MoEInjected # mlp module with custom forward function + class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function kwargs: generate_device: "cuda" prefill_device: "cuda" - match: name: "^model\\.layers\\..*\\.mlp\\.experts$" replace: - class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert paralleism + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism kwargs: prefill_device: "cuda" - prefill_mlp_type: "MLPExpertsTorch" + prefill_op: "KExpertsTorch" generate_device: "cpu" - generate_mlp_type: "MLPCPUExperts" + generate_op: "KExpertsCPU" out_device: "cuda" recursive: False # don't recursively inject submodules of this module - match: name: "^model\\.layers\\..*\\.self_attn$" replace: - class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation + class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation kwargs: generate_device: "cuda" prefill_device: "cuda" - match: name: "^model$" replace: - class: "ktransformers.operators.layer_wise_prefill.DeepseekV2ModelKTransformers" + class: "ktransformers.operators.layer_wise_prefill.KDeepseekV2Model" kwargs: generate_device: "cuda" prefill_device: "cuda" diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat-multi-gpu.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat-multi-gpu.yaml index e79e4fd..4115592 100644 --- a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat-multi-gpu.yaml +++ b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat-multi-gpu.yaml @@ -27,29 +27,29 @@ name: "^model\\.layers\\.(0|[1-9])\\.(?!self_attn).*$" # regular expression class: torch.nn.Linear # only match modules matching name and class simultaneously replace: - class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types kwargs: generate_device: "cuda:0" prefill_device: "cuda:0" - generate_op: "QuantizedLinearMarlin" - prefill_op: "QuantizedLinearTorch" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" - match: name: "^model\\.layers\\.([12][0-9])\\.(?!self_attn).*$" # regular expression class: torch.nn.Linear # only match modules matching name and class simultaneously replace: - class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types kwargs: generate_device: "cuda:1" 
prefill_device: "cuda:1" - generate_op: "QuantizedLinearMarlin" - prefill_op: "QuantizedLinearTorch" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" - match: name: "^model\\.layers\\.(0|[1-9])\\.mlp$" class: ktransformers.models.modeling_deepseek.DeepseekV2MoE replace: - class: ktransformers.operators.experts.DeepseekV2MoEInjected # mlp module with custom forward function + class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function kwargs: generate_device: "cuda:0" prefill_device: "cuda:0" @@ -57,7 +57,7 @@ name: "^model\\.layers\\.([12][0-9])\\.mlp$" class: ktransformers.models.modeling_deepseek.DeepseekV2MoE replace: - class: ktransformers.operators.experts.DeepseekV2MoEInjected # mlp module with custom forward function + class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function kwargs: generate_device: "cuda:1" prefill_device: "cuda:1" @@ -65,45 +65,45 @@ - match: name: "^model\\.layers\\.(0|[1-9])\\.mlp\\.experts$" replace: - class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert paralleism + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism kwargs: prefill_device: "cuda:0" - prefill_mlp_type: "MLPExpertsTorch" + prefill_op: "KExpertsTorch" generate_device: "cpu" - generate_mlp_type: "MLPCPUExperts" + generate_op: "KExpertsCPU" out_device: "cuda:0" recursive: False # don't recursively inject submodules of this module - match: name: "^model\\.layers\\.([12][0-9])\\.mlp\\.experts$" replace: - class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert paralleism + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism kwargs: prefill_device: "cuda:1" - prefill_mlp_type: "MLPExpertsTorch" + prefill_op: "KExpertsTorch" generate_device: "cpu" - generate_mlp_type: "MLPCPUExperts" + generate_op: "KExpertsCPU" out_device: "cuda:1" recursive: False # don't recursively inject submodules of this module - match: name: "^model\\.layers\\.(0|[1-9])\\.self_attn$" replace: - class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation + class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation kwargs: generate_device: "cuda:0" prefill_device: "cuda:0" - match: name: "^model\\.layers\\.([12][0-9])\\.self_attn$" replace: - class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation + class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation kwargs: generate_device: "cuda:1" prefill_device: "cuda:1" - match: name: "^model$" replace: - class: "ktransformers.operators.layer_wise_prefill.DeepseekV2ModelKTransformers" + class: "ktransformers.operators.layer_wise_prefill.KDeepseekV2Model" kwargs: per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill transfer_map: diff --git a/ktransformers/optimize/optimize_rules/Mixtral.yaml b/ktransformers/optimize/optimize_rules/Mixtral.yaml index 21fdb72..ad7d293 100644 --- a/ktransformers/optimize/optimize_rules/Mixtral.yaml +++ b/ktransformers/optimize/optimize_rules/Mixtral.yaml @@ -9,26 +9,26 @@ name: "^model\\.layers\\..*$" class: torch.nn.Linear # only match modules matching name and class simultaneously replace: - class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types + class: 
ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types kwargs: generate_device: "cuda" prefill_device: "cuda" - generate_op: "QuantizedLinearMarlin" - prefill_op: "QuantizedLinearTorch" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" - match: name: "^model\\.layers\\..*\\.block_sparse_moe$" class: ktransformers.models.modeling_mixtral.MixtralSparseMoeBlock replace: - class: ktransformers.operators.experts.MisrtalSparseMoEBlockInjected + class: ktransformers.operators.experts.KMisrtalSparseMoEBlock - match: name: "^model\\.layers\\..*\\.block_sparse_moe\\.experts$" replace: - class: ktransformers.operators.experts.KTransformersMLPExpert + class: ktransformers.operators.experts.KTransformersExperts kwargs: prefill_device: "cuda" - prefill_mlp_type: "MLPExpertsTorch" + prefill_op: "KExpertsTorch" generate_device: "cpu" - generate_mlp_type: "MLPCPUExperts" + generate_op: "KExpertsCPU" out_device: "cuda" recursive: False # don't recursively inject submodules of this module diff --git a/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct-multi-gpu.yaml b/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct-multi-gpu.yaml index d48ebeb..37c8a36 100644 --- a/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct-multi-gpu.yaml +++ b/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct-multi-gpu.yaml @@ -10,27 +10,27 @@ name: "^model\\.layers\\.([012])$" # regular expression class: torch.nn.Linear # only match modules matching name and class simultaneously replace: - class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types kwargs: generate_device: "cuda:0" prefill_device: "cuda:0" - generate_op: "QuantizedLinearMarlin" - prefill_op: "QuantizedLinearTorch" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" - match: name: "^model\\.layers\\.([012])\\.mlp$" class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock replace: - class: ktransformers.operators.experts.Qwen2MoeSparseMoeBlockInjected # mlp module with custom forward function + class: ktransformers.operators.experts.KQwen2MoeSparseMoeBlock # mlp module with custom forward function - match: name: "^model\\.layers\\.([012])\\.mlp\\.experts$" replace: - class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert paralleism + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism # device: "cpu" # which devices to load this module when initializing kwargs: prefill_device: "cuda:0" - prefill_mlp_type: "MLPExpertsTorch" + prefill_op: "KExpertsTorch" generate_device: "cpu" - generate_mlp_type: "MLPCPUExperts" + generate_op: "KExpertsCPU" out_device: "cuda:0" recursive: False # don't recursively inject submodules of this module @@ -46,27 +46,27 @@ name: "^model\\.layers\\.([12][0-9]|[3-9])$" # regular expression class: torch.nn.Linear # only match modules matching name and class simultaneously replace: - class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types kwargs: generate_device: "cuda:1" prefill_device: "cuda:1" - generate_op: "QuantizedLinearMarlin" - prefill_op: "QuantizedLinearTorch" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" - match: name: 
"^model\\.layers\\.([12][0-9]|[3-9])\\.mlp$" class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock replace: - class: ktransformers.operators.experts.Qwen2MoeSparseMoeBlockInjected # mlp module with custom forward function + class: ktransformers.operators.experts.KQwen2MoeSparseMoeBlock # mlp module with custom forward function - match: name: "^model\\.layers\\.([12][0-9]|[3-9])\\.mlp\\.experts$" replace: - class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert paralleism + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism # device: "cpu" # which devices to load this module when initializing kwargs: prefill_device: "cuda:1" - prefill_mlp_type: "MLPExpertsTorch" + prefill_op: "KExpertsTorch" generate_device: "cpu" - generate_mlp_type: "MLPCPUExperts" + generate_op: "KExpertsCPU" out_device: "cuda:1" recursive: False # don't recursively inject submodules of this module @@ -89,7 +89,7 @@ - match: name: "^model$" replace: - class: "ktransformers.operators.layer_wise_prefill.Qwen2MoeModelKTransformers" + class: "ktransformers.operators.layer_wise_prefill.KQwen2MoeModel" kwargs: per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill transfer_map: diff --git a/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml b/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml index a48b15a..a44c750 100644 --- a/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml +++ b/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml @@ -9,36 +9,36 @@ name: "^model\\.layers\\..*$" # regular expression class: torch.nn.Linear # only match modules matching name and class simultaneously replace: - class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types kwargs: generate_device: "cuda" prefill_device: "cuda" - generate_op: "QuantizedLinearMarlin" - prefill_op: "QuantizedLinearTorch" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" - match: name: "^model\\.layers\\..*\\.mlp$" class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock replace: - class: ktransformers.operators.experts.Qwen2MoeSparseMoeBlockInjected # mlp module with custom forward function + class: ktransformers.operators.experts.KQwen2MoeSparseMoeBlock # mlp module with custom forward function kwargs: generate_device: "cuda" prefill_device: "cuda" - match: name: "^model\\.layers\\..*\\.mlp\\.experts$" replace: - class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert paralleism + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism # device: "cpu" # which devices to load this module when initializing kwargs: prefill_device: "cuda" - prefill_mlp_type: "MLPExpertsTorch" + prefill_op: "KExpertsTorch" generate_device: "cpu" - generate_mlp_type: "MLPCPUExperts" + generate_op: "KExpertsCPU" out_device: "cuda" recursive: False # don't recursively inject submodules of this module - match: name: "^model$" replace: - class: "ktransformers.operators.layer_wise_prefill.Qwen2MoeModelKTransformers" + class: "ktransformers.operators.layer_wise_prefill.KQwen2MoeModel" kwargs: per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill - match: diff --git a/ktransformers/tests/dequant_gpu.py b/ktransformers/tests/dequant_gpu.py index 
9c839c1..0dd5272 100644 --- a/ktransformers/tests/dequant_gpu.py +++ b/ktransformers/tests/dequant_gpu.py @@ -5,8 +5,8 @@ import sys current_path = os.path.abspath(os.path.dirname(__file__)) sys.path.append(current_path+"/../..") import numpy as np -# from ktransformers.operators.linear import KTransformerLinear, QuantizedLinearMarlin -# from ktransformers.operators.experts import KTransformersMLPExpert, MLPExpertsTorch +# from ktransformers.operators.linear import KTransformersLinear, KLinearMarlin +# from ktransformers.operators.experts import KTransformersExperts, KExpertsTorch from ktransformers.util.custom_gguf import GGUFLoader import torch import KTransformersOps diff --git a/ktransformers/tests/dequant_gpu_t.py b/ktransformers/tests/dequant_gpu_t.py index 8abc89d..4b2556d 100644 --- a/ktransformers/tests/dequant_gpu_t.py +++ b/ktransformers/tests/dequant_gpu_t.py @@ -7,8 +7,8 @@ import pycuda.autoinit import pycuda.driver as cuda from pycuda.compiler import SourceModule import numpy as np -from ktransformers.operators.linear import KTransformerLinear, QuantizedLinearMarlin -from ktransformers.operators.experts import KTransformersMLPExpert, MLPExpertsTorch +from ktransformers.operators.linear import KTransformersLinear, KLinearMarlin +from ktransformers.operators.experts import KTransformersExperts, KExpertsTorch from ktransformers.util.custom_gguf import GGUFLoader, dequantize_q4_k_gpu, dequantize_q4_k import torch import KTransformersOps
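A large part of the experts.py hunk above is mechanical renaming, but it keeps KExpertsCPU's core trick: class-level pinned host buffers that are filled with non-blocking copies and handed to CPUInfer by raw data pointer, with the result staged back into a persistent per-device GPU tensor. The following is a minimal, self-contained sketch of that pinned-buffer round trip in plain PyTorch; it needs a CUDA device, uses an ordinary copy as a placeholder for the CPU MoE kernel, and is not the actual KTransformers code path.

```python
import torch

hidden_size = 4096
device = "cuda:0"

# Persistent staging buffers, in the spirit of KExpertsCPU's class attributes:
# pinned host tensors plus one output tensor per target GPU.
input_cpu = torch.zeros(hidden_size, device="cpu", pin_memory=True)
output_cpu = torch.zeros(hidden_size, device="cpu", pin_memory=True, dtype=torch.bfloat16)
output_gpu = torch.zeros(hidden_size, device=device)

def one_decode_step(hidden_state_gpu: torch.Tensor) -> torch.Tensor:
    # Stage the activation into pinned host memory without blocking the stream.
    input_cpu.copy_(hidden_state_gpu, non_blocking=True)
    # The real code synchronizes through cpu_infer.sync_with_cuda_stream after
    # submitting moe.forward(...) with the buffers' data_ptr(); a plain sync
    # stands in for that here.
    torch.cuda.synchronize()
    output_cpu.copy_(input_cpu)              # placeholder for the CPU MoE kernel
    # Stage the result back into the persistent GPU buffer.
    output_gpu.copy_(output_cpu, non_blocking=True)
    return output_gpu

x = torch.randn(hidden_size, device=device)
print(one_decode_step(x).shape)
```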
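In the linear.py hunk, KTransformersLinear keeps its guard that silently swaps the Marlin kernel for the plain Torch implementation whenever a layer's feature dimensions are not divisible by GPTQ_MARLIN_MIN_THREAD_N. A condensed sketch of that selection logic follows; the registry classes are stubs and the constant's value is assumed here purely for illustration.

```python
# Assumed value for illustration; the real constant comes from the Marlin utilities.
GPTQ_MARLIN_MIN_THREAD_N = 64

class TorchLinearStub:       # stand-in for KLinearTorch
    def __init__(self, in_features, out_features):
        self.kind = f"torch {in_features}x{out_features}"

class MarlinLinearStub:      # stand-in for KLinearMarlin
    def __init__(self, in_features, out_features):
        self.kind = f"marlin {in_features}x{out_features}"

LINEAR_REGISTRY = {"KLinearMarlin": MarlinLinearStub, "KLinearTorch": TorchLinearStub}

def build_linear(op: str, in_features: int, out_features: int):
    """Pick the requested backend, falling back to the Torch path when the shape
    does not meet Marlin's thread-granularity requirement."""
    if op == "KLinearMarlin" and (
        in_features % GPTQ_MARLIN_MIN_THREAD_N or out_features % GPTQ_MARLIN_MIN_THREAD_N
    ):
        print(f"{in_features}x{out_features} is not divisible by "
              f"{GPTQ_MARLIN_MIN_THREAD_N}, using KLinearTorch instead")
        op = "KLinearTorch"
    return LINEAR_REGISTRY[op](in_features, out_features)

print(build_linear("KLinearMarlin", 5120, 1536).kind)   # divisible -> Marlin path
print(build_linear("KLinearMarlin", 5120, 1000).kind)   # falls back to the Torch path
```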
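Because the optimize-rule YAML files select implementations by string, out-of-tree rule files written against the old names stop resolving after this patch. The helper below is a small, hypothetical script (not part of the patch) that scans a rule file for the pre-rename identifiers and kwarg keys touched by this diff and prints the replacement for each hit.

```python
import re
import sys

# Old identifier or kwarg key -> new name, as renamed throughout this patch.
RENAMES = {
    "KTransformerLinear": "KTransformersLinear",
    "KTransformersMLPExpert": "KTransformersExperts",
    "QuantizedLinearMarlin": "KLinearMarlin",
    "QuantizedLinearTorch": "KLinearTorch",
    "QuantizedLinearCPUInfer": "KLinearCPUInfer",
    "MLPExpertsTorch": "KExpertsTorch",
    "MLPExpertsMarlin": "KExpertsMarlin",
    "MLPCPUExperts": "KExpertsCPU",
    "DeepseekV2AttentionInjected": "KDeepseekV2Attention",
    "DeepseekV2MoEInjected": "KDeepseekV2MoE",
    "Qwen2MoeSparseMoeBlockInjected": "KQwen2MoeSparseMoeBlock",
    "MisrtalSparseMoEBlockInjected": "KMisrtalSparseMoEBlock",
    "DeepseekV2ModelKTransformers": "KDeepseekV2Model",
    "Qwen2MoeModelKTransformers": "KQwen2MoeModel",
    "prefill_mlp_type": "prefill_op",
    "generate_mlp_type": "generate_op",
}
OLD_NAME = re.compile("|".join(map(re.escape, RENAMES)))

def report_stale_names(path: str) -> int:
    """Print every line that still uses a pre-rename name; return the hit count."""
    hits = 0
    with open(path) as f:
        for lineno, line in enumerate(f, 1):
            for match in OLD_NAME.finditer(line):
                hits += 1
                print(f"{path}:{lineno}: {match.group(0)} -> {RENAMES[match.group(0)]}")
    return hits

if __name__ == "__main__":
    stale = sum(report_stale_names(p) for p in sys.argv[1:])
    sys.exit(1 if stale else 0)
```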