Dong Jianwei 2025-02-23 18:51:42 +08:00
parent cdb6f896bb
commit 95d937c51d
4 changed files with 13 additions and 8 deletions

View file

@@ -58,7 +58,7 @@ def local_chat(
     gguf_path: str | None = None,
     max_new_tokens: int = 300,
     cpu_infer: int = Config().cpu_infer,
-    use_cuda_graph: bool = True,
+    use_cuda_graph: bool = False,
     prompt_file : str | None = None,
     mode: str = "normal",
     force_think: bool = False,
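
The only change in this hunk flips the default of use_cuda_graph from True to False. A minimal sketch of how a caller can still opt back in explicitly, assuming the module path ktransformers/local_chat.py and placeholder model paths (not from this commit):

# Hypothetical invocation; the paths below are placeholders for illustration only.
from ktransformers.local_chat import local_chat  # module path assumed from the repo layout

local_chat(
    model_path="/path/to/model",   # placeholder
    gguf_path="/path/to/gguf",     # placeholder
    cpu_infer=33,
    use_cuda_graph=True,           # re-enable CUDA graphs now that the default is False
)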
@@ -160,6 +160,9 @@ def local_chat(
         input_tensor = tokenizer.apply_chat_template(
             messages, add_generation_prompt=True, return_tensors="pt"
         )
+        # input_tensor = torch.tensor([[0, 6657, 84646]], device=input_tensor.device)
         if force_think:
             token_thinks = torch.tensor([tokenizer.encode("<think>\\n",add_special_tokens=False)],device=input_tensor.device)
             input_tensor = torch.cat(
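
The force_think branch encodes a "<think>\n" marker and concatenates it with the prompt tensor; the hunk is truncated before the arguments of torch.cat, so the ordering below is an assumption. A self-contained sketch of the pattern with dummy token IDs:

import torch

# Dummy stand-ins: a pretend chat-template prompt and a pretend encoding of "<think>\n".
input_tensor = torch.tensor([[1, 2, 3]])   # shape [1, seq_len]
token_thinks = torch.tensor([[10, 11]])    # shape [1, marker_len]

# Concatenate along the sequence dimension; placing the marker after the prompt is an assumption here.
input_tensor = torch.cat([input_tensor, token_thinks], dim=1)
print(input_tensor)  # tensor([[ 1,  2,  3, 10, 11]])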
@@ -181,4 +184,6 @@ def local_chat(
 if __name__ == "__main__":
-    fire.Fire(local_chat)
+    # fire.Fire(local_chat)
+    # local_chat(model_path="/mnt/data/model/DeepSeek-R1", gguf_path="/mnt/data/model/DeepseekV3-q4km-gguf", cpu_infer=33, force_think=False)
+    local_chat(model_path="/mnt/data/model/Moonlight-16B-A3B-Instruct", gguf_path="/mnt/data/model/Moonlight-16B-A3B-Instruct-GGUF", cpu_infer=33, force_think=False)
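
With the fire.Fire entry point commented out, the script now runs a hard-coded Moonlight configuration. For reference, a self-contained sketch of the pattern that the commented-out line used; demo_chat is a hypothetical stand-in, and the flag mapping is python-fire's standard behavior, not something this commit adds:

# Minimal sketch of the fire entry point pattern; python-fire maps keyword parameters to CLI flags.
import fire

def demo_chat(model_path: str, gguf_path: str, cpu_infer: int = 33, force_think: bool = False):
    # Hypothetical stand-in for local_chat, used only to show the CLI surface.
    print(model_path, gguf_path, cpu_infer, force_think)

if __name__ == "__main__":
    fire.Fire(demo_chat)  # e.g. python demo.py --model_path=/path/to/model --gguf_path=/path/to/gguf --cpu_infer=33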

View file

@@ -441,10 +441,10 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
         # mla_wrapper run output: [tokens, self.num_heads, self.kv_lora_rank]
         # attn_output [bsz, q_len, self.num_heads, self.kv_lora_rank]
         # out_absorb [self.num_heads, self.v_head_dim, self.kv_lora_rank]
-        attn_output = attn_output.transpose(1, 2)
-        attn_output = torch.matmul(attn_output, out_absorb.mT)
-        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)
+        attn_output = attn_output.transpose(1, 2) # [bsz, self.num_heads, q_len, self.kv_lora_rank]
+        attn_output = torch.matmul(attn_output, out_absorb.mT) # [bsz, self.num_heads, q_len, self.v_head_dim]
+        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim) # [bsz, q_len, self.num_heads * self.v_head_dim]
         attn_output = self.o_proj(attn_output)
         return attn_output, None, past_key_value
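
This hunk only adds shape comments to the absorbed-attention output path. A standalone check of those shapes with dummy tensors (all dimension sizes below are made up for illustration):

import torch

bsz, q_len, num_heads, kv_lora_rank, v_head_dim = 2, 4, 8, 16, 32  # made-up sizes

attn_output = torch.randn(bsz, q_len, num_heads, kv_lora_rank)  # as in the first shape comment
out_absorb = torch.randn(num_heads, v_head_dim, kv_lora_rank)   # as in the third shape comment

attn_output = attn_output.transpose(1, 2)               # [bsz, num_heads, q_len, kv_lora_rank]
attn_output = torch.matmul(attn_output, out_absorb.mT)  # [bsz, num_heads, q_len, v_head_dim]
attn_output = attn_output.reshape(bsz, q_len, num_heads * v_head_dim)
print(attn_output.shape)  # torch.Size([2, 4, 256]) -- matches the annotated final shape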

View file

@@ -450,9 +450,9 @@ class KExpertsTorch(KExpertsBase):
             self.up[i] = w["up"][i, ...].to(device=device, dtype=self.dtype)
             self.down[i] = w["down"][i, ...].to(device=device, dtype=self.dtype)
-        self.up = torch.cat(self.gate, dim=0)
+        self.up = torch.cat(self.up, dim=0)
         self.gate = torch.cat(self.gate, dim=0)
-        self.down = torch.cat(self.gate, dim=0)
+        self.down = torch.cat(self.down, dim=0)
         return
 
     def unload(self):
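
The fix in this hunk makes each concatenation read from its own per-expert list; previously self.up and self.down were both rebuilt from self.gate. A self-contained sketch of the corrected pattern with dummy per-expert weights (sizes are made up):

import torch

num_experts, rows, cols = 4, 3, 5  # made-up sizes for illustration
gate = [torch.full((1, rows, cols), float(i)) for i in range(num_experts)]
up   = [torch.full((1, rows, cols), 10.0 + i) for i in range(num_experts)]
down = [torch.full((1, rows, cols), 20.0 + i) for i in range(num_experts)]

# Corrected pattern: each stacked weight is built from its own list of per-expert slices.
gate = torch.cat(gate, dim=0)
up = torch.cat(up, dim=0)
down = torch.cat(down, dim=0)
print(gate.shape, up.shape, down.shape)  # three [4, 3, 5] tensors; up/down are no longer copies of gate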

View file

@@ -1,7 +1,7 @@
 - match:
     class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
   replace:
-    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
+    class: ktransformers.operators.RoPE.RotaryEmbeddingV3
     kwargs:
       generate_device: "cuda"
       prefill_device: "cuda"
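
The rule now injects ktransformers.operators.RoPE.RotaryEmbeddingV3 in place of the Yarn variant. As a generic illustration only (not ktransformers' actual rule loader), a sketch of how a dotted class: string like the one above can be resolved with importlib:

import importlib

def resolve_class(dotted_path: str):
    # Generic helper for illustration: split "pkg.module.ClassName" and import the class.
    module_path, _, class_name = dotted_path.rpartition(".")
    return getattr(importlib.import_module(module_path), class_name)

# Requires ktransformers to be installed; otherwise this call would raise ImportError.
# rope_cls = resolve_class("ktransformers.operators.RoPE.RotaryEmbeddingV3")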