Mirror of https://github.com/kvcache-ai/ktransformers.git, synced 2025-09-10 23:34:35 +00:00
fix some bugs
commit 921061666c (parent d2cf81423f)
3 changed files with 7 additions and 7 deletions
@@ -158,12 +158,12 @@ def local_chat(
     if system != "Windows" and (config.architectures[0] == "DeepseekV2ForCausalLM" or config.architectures[0] == "DeepseekV3ForCausalLM") and flashinfer_enabled and get_compute_capability() >= 8:
         generated = prefill_and_generate(
-            model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode = mode, force_think = force_think, chunk_prefill_size = chunk_prefill_size,
+            model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode = mode, force_think = force_think, chunk_size = chunk_prefill_size,
             use_flashinfer_mla = True, num_heads = config.num_attention_heads, head_dim_ckv = config.kv_lora_rank, head_dim_kpe = config.qk_rope_head_dim, q_head_dim = config.qk_rope_head_dim + config.qk_nope_head_dim
         )
     else:
         generated = prefill_and_generate(
-            model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode = mode, force_think = force_think, chunk_prefill_size = chunk_prefill_size,
+            model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode = mode, force_think = force_think, chunk_size = chunk_prefill_size,
         )
     break
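This hunk renames the keyword at both prefill_and_generate call sites from chunk_prefill_size= to chunk_size=, matching the parameter name the callee actually declares while keeping the local variable chunk_prefill_size as the value. A minimal sketch of the failure mode, assuming a simplified signature for illustration (the real prefill_and_generate takes many more parameters):

# Minimal sketch of the keyword mismatch fixed above. The simplified
# signature is an assumption; only the parameter name matters here.
def prefill_and_generate(model, tokenizer, inputs, max_new_tokens, chunk_size=8192):
    # ... prefill in chunks of `chunk_size` tokens, then decode ...
    return f"prefilled with chunk_size={chunk_size}"

chunk_prefill_size = 4096

# Before the fix: the call used a keyword the callee does not declare.
try:
    prefill_and_generate(None, None, None, 300, chunk_prefill_size=chunk_prefill_size)
except TypeError as e:
    print(e)  # got an unexpected keyword argument 'chunk_prefill_size'

# After the fix: pass the local value under the parameter's real name.
print(prefill_and_generate(None, None, None, 300, chunk_size=chunk_prefill_size))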
@@ -680,9 +680,9 @@ class KQwen2MoeSparseMoeBlock(BaseInjectedModule, Qwen2MoeSparseMoeBlock):
             y.resize_(*orig_shape)
             return y, router_logits

-        hidden_states_expert = hidden_states.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else hidden_states_expert.cpu()
-        selected_experts_expert = selected_experts.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else selected_experts_expert.cpu()
-        routing_weights_expert = routing_weights.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else routing_weights_expert.cpu()
+        hidden_states_expert = hidden_states.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else hidden_states.cpu()
+        selected_experts_expert = selected_experts.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else selected_experts.cpu()
+        routing_weights_expert = routing_weights.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else routing_weights.cpu()

         shared_expert_output = self.shared_expert(hidden_states)
         shared_expert_output = (
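This hunk fixes three conditional expressions whose else branch read the very name being assigned (hidden_states_expert, selected_experts_expert, routing_weights_expert) before it was ever bound, so the first pass down the CPU path raised NameError; each branch now derives from the source tensor instead. A minimal sketch of the bug, with use_gpu_experts standing in for the isinstance(self.experts, KExpertsBase) check:

import torch

hidden_states = torch.randn(4, 8)
use_gpu_experts = False  # stand-in for isinstance(self.experts, KExpertsBase)

# Before the fix: the else branch reads hidden_states_expert before it is
# ever bound, so the first time the CPU path runs this raises NameError.
try:
    hidden_states_expert = hidden_states.to("cuda") if use_gpu_experts else hidden_states_expert.cpu()
except NameError as e:
    print(e)  # name 'hidden_states_expert' is not defined

# After the fix: both branches derive from the source tensor.
hidden_states_expert = hidden_states.to("cuda") if use_gpu_experts else hidden_states.cpu()
print(hidden_states_expert.device)  # cpu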
@@ -138,7 +138,7 @@ class KLinearTorch(KLinearBase):
         self.weight = None
         self.has_bias = False

-    def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor:
+    def forward(self, x: torch.Tensor, bsz_tensor: torch.Tensor=None, **kwargs) -> torch.Tensor:
         dtype = x.dtype
         out_device = x.device
         # TODO: support CUDA Graph when using cpu, but CPUInfer is recommended.
@@ -201,7 +201,7 @@ class KLinearQ8(KLinearBase):
         self.bias = None
         self.loaded = False

-    def forward(self, x: torch.Tensor) -> torch.Tensor:
+    def forward(self, x: torch.Tensor, bsz_tensor: torch.Tensor=None) -> torch.Tensor:
         orig_dtype = x.dtype
         out_device = x.device
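The last two hunks make the same interface fix: forward in KLinearTorch and KLinearQ8 now explicitly accepts bsz_tensor, so both linear backends match callers that forward it. KLinearTorch's **kwargs would previously have swallowed the argument silently, while KLinearQ8, which takes no **kwargs, would have raised TypeError outright. A minimal sketch of the KLinearQ8 case, with stripped-down classes and a hypothetical batch-size tensor as stand-ins:

import torch

class KLinearQ8Old:
    # Before the fix: no bsz_tensor parameter and no **kwargs, so any
    # caller that forwards bsz_tensor raises TypeError.
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x

class KLinearQ8New:
    # After the fix: the parameter is accepted (it may simply be unused by
    # this backend), keeping the call signature uniform across backends.
    def forward(self, x: torch.Tensor, bsz_tensor: torch.Tensor = None) -> torch.Tensor:
        return x

x = torch.randn(2, 4)
bsz = torch.tensor([2])  # hypothetical batch-size tensor a wrapper might forward

try:
    KLinearQ8Old().forward(x, bsz_tensor=bsz)
except TypeError as e:
    print(e)  # got an unexpected keyword argument 'bsz_tensor'

print(KLinearQ8New().forward(x, bsz_tensor=bsz).shape)  # torch.Size([2, 4])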