Merge pull request #62 from Azure-Tang/main

[Fix] Fix problem that ktransformers cannot offload whole layer in cpu
This commit is contained in:
UnicornChan 2024-08-29 23:40:21 +08:00 committed by GitHub
commit f536a7085f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 48 additions and 45 deletions

View file

@ -25,7 +25,7 @@ rm -rf /var/lib/apt/lists/* &&
cd ktransformers && cd ktransformers &&
git submodule init && git submodule init &&
git submodule update && git submodule update &&
pip install ninja pyproject numpy && pip install ninja pyproject numpy cpufeature &&
pip install flash-attn && pip install flash-attn &&
CPU_INSTRUCT=NATIVE KTRANSFORMERS_FORCE_BUILD=TRUE TORCH_CUDA_ARCH_LIST="8.0;8.6;8.7;8.9" pip install . --no-build-isolation --verbose && CPU_INSTRUCT=NATIVE KTRANSFORMERS_FORCE_BUILD=TRUE TORCH_CUDA_ARCH_LIST="8.0;8.6;8.7;8.9" pip install . --no-build-isolation --verbose &&
pip cache purge pip cache purge

View file

@ -5,7 +5,7 @@ Description :
Author : kkk1nak0 Author : kkk1nak0
Date : 2024-08-15 07:34:46 Date : 2024-08-15 07:34:46
Version : 1.0.0 Version : 1.0.0
LastEditors : chenxl LastEditors : Azure-Tang
LastEditTime : 2024-08-28 15:19:03 LastEditTime : 2024-08-29 22:35:51
''' '''
__version__ = "0.1.3" __version__ = "0.1.4"

View file

@ -67,6 +67,7 @@ def local_chat(
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
if mode == 'long_context': if mode == 'long_context':
assert config.architectures[0] == "LlamaForCausalLM", "only LlamaForCausalLM support long_context mode"
torch.set_default_dtype(torch.float16) torch.set_default_dtype(torch.float16)
else: else:
torch.set_default_dtype(config.torch_dtype) torch.set_default_dtype(config.torch_dtype)
@ -143,6 +144,7 @@ def local_chat(
input_tensor = tokenizer.apply_chat_template( input_tensor = tokenizer.apply_chat_template(
messages, add_generation_prompt=True, return_tensors="pt" messages, add_generation_prompt=True, return_tensors="pt"
) )
if mode == 'long_context':
assert Config().long_context_config['max_seq_len'] > input_tensor.shape[1] + max_new_tokens, \ assert Config().long_context_config['max_seq_len'] > input_tensor.shape[1] + max_new_tokens, \
"please change max_seq_len in ~/.ktransformers/config.yaml" "please change max_seq_len in ~/.ktransformers/config.yaml"
torch.set_default_dtype( torch.set_default_dtype(

View file

@ -6,7 +6,7 @@ Author : Azure-Tang, Boxin Zhang, chenht2022
Date : 2024-07-25 11:25:24 Date : 2024-07-25 11:25:24
Version : 0.1.0 Version : 0.1.0
LastEditors : Azure LastEditors : Azure
LastEditTime : 2024-08-27 03:50:23 LastEditTime : 2024-08-29 09:41:10
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
''' '''
@ -202,7 +202,7 @@ class KExpertsCPU(KExpertsBase):
def forward(self, input_tensor, expert_ids, weights): def forward(self, input_tensor, expert_ids, weights):
# generate, capture and run cuda graph # generate, capture and run cuda graph
# print(expert_ids) # print(expert_ids)
if input_tensor.size(0)==1: if input_tensor.size(0)==1 and torch.cuda.is_current_stream_capturing():
# TODO: this branch is unreachable, but the shape of input_tensor([1,hidden_size]) and input_tensor_cpu([hidden_size]) is not compatible # TODO: this branch is unreachable, but the shape of input_tensor([1,hidden_size]) and input_tensor_cpu([hidden_size]) is not compatible
#print("capturing experts") #print("capturing experts")
KExpertsCPU.input_tensor_cpu.copy_(input_tensor, non_blocking=True) KExpertsCPU.input_tensor_cpu.copy_(input_tensor, non_blocking=True)
@ -636,7 +636,7 @@ class KDeepseekV2MoE(BaseInjectedModule, DeepseekV2MoE):
topk_idx, topk_weight, aux_loss = self.gate(hidden_states) topk_idx, topk_weight, aux_loss = self.gate(hidden_states)
hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
if sequence_length == 1 and hasattr(self.experts.generate_experts, "submit_for_one_decode"): if sequence_length == 1 and hasattr(self.experts.generate_experts, "submit_for_one_decode") and torch.cuda.is_current_stream_capturing():
self.experts.generate_experts.submit_for_one_decode(hidden_states[0], topk_idx[0], topk_weight[0]) self.experts.generate_experts.submit_for_one_decode(hidden_states[0], topk_idx[0], topk_weight[0])
if self.config.n_shared_experts is not None: if self.config.n_shared_experts is not None:
y_ = self.shared_experts(identity).squeeze(0) y_ = self.shared_experts(identity).squeeze(0)

View file

@ -6,7 +6,7 @@ Author : Azure-Tang, Boxin Zhang
Date : 2024-07-25 11:25:24 Date : 2024-07-25 11:25:24
Version : 0.1.0 Version : 0.1.0
LastEditors : Azure LastEditors : Azure
LastEditTime : 2024-08-14 14:57:04 LastEditTime : 2024-08-29 09:11:16
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
''' '''
@ -277,7 +277,7 @@ class KLinearCPUInfer(KLinearBase):
def forward(self, x: torch.Tensor) -> torch.Tensor: def forward(self, x: torch.Tensor) -> torch.Tensor:
origin_shape = x.shape # [batch_size, q_len, hidden_size] origin_shape = x.shape # [batch_size, q_len, hidden_size]
if origin_shape[1] == 1: if origin_shape[1] == 1 and torch.cuda.is_current_stream_capturing():
out_device = x.device out_device = x.device
self.input_tensor_cpu.copy_(x, non_blocking=True) self.input_tensor_cpu.copy_(x, non_blocking=True)
qlen = origin_shape[1] qlen = origin_shape[1]

View file

@ -670,8 +670,9 @@ class KDeepseekV2Model(BaseInjectedModule):
if self.transfer_map is not None and i in self.transfer_map: if self.transfer_map is not None and i in self.transfer_map:
prev_stream = torch.cuda.current_stream() prev_stream = torch.cuda.current_stream()
cur_device = self.transfer_map[i] cur_device = self.transfer_map[i]
if cur_device not in self.stream_device_map: if cur_device not in self.stream_device_map and cur_device.lower() != "cpu":
self.stream_device_map[cur_device] = torch.cuda.Stream(cur_device) self.stream_device_map[cur_device] = torch.cuda.Stream(cur_device)
if cur_device.lower() != "cpu":
torch.cuda.set_device(cur_device) torch.cuda.set_device(cur_device)
self.stream_device_map[cur_device].wait_stream(prev_stream) self.stream_device_map[cur_device].wait_stream(prev_stream)
torch.cuda.set_stream(self.stream_device_map[cur_device]) torch.cuda.set_stream(self.stream_device_map[cur_device])

View file

@ -7,7 +7,7 @@
prefill_device: "cpu" prefill_device: "cpu"
- match: - match:
name: "^model\\.layers\\.([0-9])\\." name: "^model\\.layers\\.([0-9]|[1][0-4])\\."
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
replace: replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbedding class: ktransformers.operators.RoPE.YarnRotaryEmbedding
@ -15,7 +15,7 @@
generate_device: "cuda:0" generate_device: "cuda:0"
prefill_device: "cuda:0" prefill_device: "cuda:0"
- match: - match:
name: "^model\\.layers\\.([1][0-9])\\." name: "^model\\.layers\\.([2][0-9]|[1][5-9])\\."
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
replace: replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbedding class: ktransformers.operators.RoPE.YarnRotaryEmbedding
@ -23,7 +23,7 @@
generate_device: "cuda:1" generate_device: "cuda:1"
prefill_device: "cuda:1" prefill_device: "cuda:1"
- match: - match:
name: "^model\\.layers\\.([2][0-9])\\." name: "^model\\.layers\\.([3][0-9]|[4][0-4])\\."
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
replace: replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbedding class: ktransformers.operators.RoPE.YarnRotaryEmbedding
@ -31,7 +31,7 @@
generate_device: "cuda:2" generate_device: "cuda:2"
prefill_device: "cuda:2" prefill_device: "cuda:2"
- match: - match:
name: "^model\\.layers\\.([345][0-9])\\." name: "^model\\.layers\\.([5][0-9]|[4][5-9])\\."
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
replace: replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbedding class: ktransformers.operators.RoPE.YarnRotaryEmbedding
@ -40,7 +40,7 @@
prefill_device: "cuda:3" prefill_device: "cuda:3"
- match: - match:
name: "^model\\.layers\\.([0-9])\\.(?!self_attn).*$" # regular expression name: "^model\\.layers\\.([0-9]|[1][0-4])\\.(?!self_attn\\.kv_b_proj).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously class: torch.nn.Linear # only match modules matching name and class simultaneously
replace: replace:
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
@ -50,7 +50,7 @@
generate_op: "KLinearMarlin" generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch" prefill_op: "KLinearTorch"
- match: - match:
name: "^model\\.layers\\.([1][0-9])\\.(?!self_attn).*$" # regular expression name: "^model\\.layers\\.([2][0-9]|[1][5-9])\\.(?!self_attn\\.kv_b_proj).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously class: torch.nn.Linear # only match modules matching name and class simultaneously
replace: replace:
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
@ -60,7 +60,7 @@
generate_op: "KLinearMarlin" generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch" prefill_op: "KLinearTorch"
- match: - match:
name: "^model\\.layers\\.([2][0-9])\\.(?!self_attn).*$" # regular expression name: "^model\\.layers\\.([3][0-9]|[4][0-4])\\.(?!self_attn\\.kv_b_proj).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously class: torch.nn.Linear # only match modules matching name and class simultaneously
replace: replace:
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
@ -70,7 +70,7 @@
generate_op: "KLinearMarlin" generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch" prefill_op: "KLinearTorch"
- match: - match:
name: "^model\\.layers\\.([345][0-9])\\.(?!self_attn).*$" # regular expression name: "^model\\.layers\\.([5][0-9]|[4][5-9])\\.(?!self_attn\\.kv_b_proj).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously class: torch.nn.Linear # only match modules matching name and class simultaneously
replace: replace:
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
@ -81,7 +81,7 @@
prefill_op: "KLinearTorch" prefill_op: "KLinearTorch"
- match: - match:
name: "^model\\.layers\\.([0-9])\\.mlp$" name: "^model\\.layers\\.([0-9]|[1][0-4])\\.mlp$"
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
replace: replace:
class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function
@ -89,7 +89,7 @@
generate_device: "cuda:0" generate_device: "cuda:0"
prefill_device: "cuda:0" prefill_device: "cuda:0"
- match: - match:
name: "^model\\.layers\\.([1][0-9])\\.mlp$" name: "^model\\.layers\\.([2][0-9]|[1][5-9])\\.mlp$"
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
replace: replace:
class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function
@ -97,7 +97,7 @@
generate_device: "cuda:1" generate_device: "cuda:1"
prefill_device: "cuda:1" prefill_device: "cuda:1"
- match: - match:
name: "^model\\.layers\\.([2][0-9])\\.mlp$" name: "^model\\.layers\\.([3][0-9]|[4][0-4])\\.mlp$"
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
replace: replace:
class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function
@ -105,7 +105,7 @@
generate_device: "cuda:2" generate_device: "cuda:2"
prefill_device: "cuda:2" prefill_device: "cuda:2"
- match: - match:
name: "^model\\.layers\\.([345][0-9])\\.mlp$" name: "^model\\.layers\\.([5][0-9]|[4][5-9])\\.mlp$"
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
replace: replace:
class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function
@ -114,7 +114,7 @@
prefill_device: "cuda:3" prefill_device: "cuda:3"
- match: - match:
name: "^model\\.layers\\.([0-9])\\.mlp\\.experts$" name: "^model\\.layers\\.([0-9]|[1][0-4])\\.mlp\\.experts$"
replace: replace:
class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism
kwargs: kwargs:
@ -125,7 +125,7 @@
out_device: "cuda:0" out_device: "cuda:0"
recursive: False # don't recursively inject submodules of this module recursive: False # don't recursively inject submodules of this module
- match: - match:
name: "^model\\.layers\\.([1][0-9])\\.mlp\\.experts$" name: "^model\\.layers\\.([2][0-9]|[1][5-9])\\.mlp\\.experts$"
replace: replace:
class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism
kwargs: kwargs:
@ -136,7 +136,7 @@
out_device: "cuda:1" out_device: "cuda:1"
recursive: False # don't recursively inject submodules of this module recursive: False # don't recursively inject submodules of this module
- match: - match:
name: "^model\\.layers\\.([2][0-9])\\.mlp\\.experts$" name: "^model\\.layers\\.([3][0-9]|[4][0-4])\\.mlp\\.experts$"
replace: replace:
class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism
kwargs: kwargs:
@ -147,7 +147,7 @@
out_device: "cuda:2" out_device: "cuda:2"
recursive: False # don't recursively inject submodules of this module recursive: False # don't recursively inject submodules of this module
- match: - match:
name: "^model\\.layers\\.([345][0-9])\\.mlp\\.experts$" name: "^model\\.layers\\.([5][0-9]|[4][5-9])\\.mlp\\.experts$"
replace: replace:
class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism
kwargs: kwargs:
@ -159,28 +159,28 @@
recursive: False # don't recursively inject submodules of this module recursive: False # don't recursively inject submodules of this module
- match: - match:
name: "^model\\.layers\\.([0-9])\\.self_attn$" name: "^model\\.layers\\.([0-9]|[1][0-4])\\.self_attn$"
replace: replace:
class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
kwargs: kwargs:
generate_device: "cuda:0" generate_device: "cuda:0"
prefill_device: "cuda:0" prefill_device: "cuda:0"
- match: - match:
name: "^model\\.layers\\.([1][0-9])\\.self_attn$" name: "^model\\.layers\\.([2][0-9]|[1][5-9])\\.self_attn$"
replace: replace:
class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
kwargs: kwargs:
generate_device: "cuda:1" generate_device: "cuda:1"
prefill_device: "cuda:1" prefill_device: "cuda:1"
- match: - match:
name: "^model\\.layers\\.([2][0-9])\\.self_attn$" name: "^model\\.layers\\.([3][0-9]|[4][0-4])\\.self_attn$"
replace: replace:
class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
kwargs: kwargs:
generate_device: "cuda:2" generate_device: "cuda:2"
prefill_device: "cuda:2" prefill_device: "cuda:2"
- match: - match:
name: "^model\\.layers\\.([345][0-9])\\.self_attn$" name: "^model\\.layers\\.([5][0-9]|[4][5-9])\\.self_attn$"
replace: replace:
class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
kwargs: kwargs:
@ -194,33 +194,33 @@
kwargs: kwargs:
per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
transfer_map: transfer_map:
10: "cuda:1" 15: "cuda:1"
20: "cuda:2" 30: "cuda:2"
30: "cuda:3" 45: "cuda:3"
- match: - match:
name: "^model\\.layers\\.([0-9])\\." name: "^model\\.layers\\.([0-9]|[1][0-4])\\."
replace: replace:
class: "default" class: "default"
kwargs: kwargs:
generate_device: "cuda:0" generate_device: "cuda:0"
prefill_device: "cuda:0" prefill_device: "cuda:0"
- match: - match:
name: "(^model\\.layers\\.([1][0-9])\\.)" name: "(^model\\.layers\\.([2][0-9]|[1][5-9])\\.)"
replace: replace:
class: "default" class: "default"
kwargs: kwargs:
generate_device: "cuda:1" generate_device: "cuda:1"
prefill_device: "cuda:1" prefill_device: "cuda:1"
- match: - match:
name: "(^model\\.layers\\.([2][0-9])\\.)" name: "(^model\\.layers\\.([3][0-9]|[4][0-4])\\.)"
replace: replace:
class: "default" class: "default"
kwargs: kwargs:
generate_device: "cuda:2" generate_device: "cuda:2"
prefill_device: "cuda:2" prefill_device: "cuda:2"
- match: - match:
name: "(^model\\.layers\\.([345][0-9])\\.)|(^model.norm)|(^lm_head)" name: "(^model\\.layers\\.([5][0-9]|[4][5-9])\\.)|(^model.norm)|(^lm_head)"
replace: replace:
class: "default" class: "default"
kwargs: kwargs:

View file

@ -24,7 +24,7 @@
prefill_device: "cuda:1" prefill_device: "cuda:1"
- match: - match:
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.(?!self_attn).*$" # regular expression name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.(?!self_attn\\.kv_b_proj).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously class: torch.nn.Linear # only match modules matching name and class simultaneously
replace: replace:
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
@ -35,7 +35,7 @@
prefill_op: "KLinearTorch" prefill_op: "KLinearTorch"
- match: - match:
name: "^model\\.layers\\.([345][0-9])\\.(?!self_attn).*$" # regular expression name: "^model\\.layers\\.([345][0-9])\\.(?!self_attn\\.kv_b_proj).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously class: torch.nn.Linear # only match modules matching name and class simultaneously
replace: replace:
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types