Mirror of https://github.com/kvcache-ai/ktransformers.git (synced 2025-09-10 14:51:06 +00:00)
commit e8e02e5ccc (parent 95d937c51d)

    support Moonlight

3 changed files with 4 additions and 10 deletions
Changed file 1 of 3 (hunks in def local_chat):

@@ -58,13 +58,12 @@ def local_chat(
     gguf_path: str | None = None,
     max_new_tokens: int = 300,
     cpu_infer: int = Config().cpu_infer,
-    use_cuda_graph: bool = False,
+    use_cuda_graph: bool = True,
     prompt_file : str | None = None,
     mode: str = "normal",
     force_think: bool = False,
 ):
-

     torch.set_grad_enabled(False)

     Config().cpu_infer = cpu_infer
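The only functional change in this hunk is the use_cuda_graph default flipping from False to True. For readers unfamiliar with the flag, below is a minimal, generic PyTorch sketch of what CUDA-graph capture and replay look like; it is illustrative only (ktransformers drives capture through its own machinery, and the linear layer here is a stand-in):

import torch

# Requires a CUDA device. Static input/output buffers are reused across replays.
static_x = torch.randn(1, 16, device="cuda")
static_out = torch.empty(1, 16, device="cuda")
layer = torch.nn.Linear(16, 16).cuda()

# Warm up on a side stream before capture, as the CUDA-graph docs recommend.
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    static_out.copy_(layer(static_x))
torch.cuda.current_stream().wait_stream(s)

# Capture one forward pass into a graph, then replay it on fresh data.
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    static_out.copy_(layer(static_x))

static_x.copy_(torch.randn(1, 16, device="cuda"))
g.replay()  # reruns the captured kernels on the new contents of static_x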
@@ -160,9 +159,6 @@ def local_chat(
     input_tensor = tokenizer.apply_chat_template(
         messages, add_generation_prompt=True, return_tensors="pt"
     )
-
-    # input_tensor = torch.tensor([[0, 6657, 84646]], device=input_tensor.device)
-
     if force_think:
         token_thinks = torch.tensor([tokenizer.encode("<think>\\n",add_special_tokens=False)],device=input_tensor.device)
         input_tensor = torch.cat(
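The force_think branch kept by this hunk encodes the literal "<think>\n" marker and concatenates it onto the templated prompt so generation starts inside a think block. Below is a small stand-alone sketch of that concatenation; the token ids are made up, and appending along dim=1 is an assumption about the torch.cat(...) call that the hunk truncates:

import torch

# Made-up ids standing in for the templated prompt and for tokenizer.encode("<think>\n").
input_tensor = torch.tensor([[1, 2, 3, 4]])   # shape [1, seq_len]
token_thinks = torch.tensor([[5, 6]])         # shape [1, n_think_ids]

# Assumed continuation of the truncated torch.cat(...): append the think ids
# after the generation prompt so the model begins its reply with "<think>\n".
input_tensor = torch.cat([input_tensor, token_thinks], dim=1)
print(input_tensor)  # tensor([[1, 2, 3, 4, 5, 6]])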
@@ -184,6 +180,4 @@ def local_chat(
 
 
 if __name__ == "__main__":
-    # fire.Fire(local_chat)
-    # local_chat(model_path="/mnt/data/model/DeepSeek-R1", gguf_path="/mnt/data/model/DeepseekV3-q4km-gguf", cpu_infer=33, force_think=False)
-    local_chat(model_path="/mnt/data/model/Moonlight-16B-A3B-Instruct", gguf_path="/mnt/data/model/Moonlight-16B-A3B-Instruct-GGUF", cpu_infer=33, force_think=False)
+    fire.Fire(local_chat)
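Re-enabling fire.Fire(local_chat) and deleting the hard-coded Moonlight test call turn the script back into a general command-line entry point, where every keyword argument of local_chat becomes a flag. A minimal sketch of the same pattern with a toy stand-in for local_chat (the demo function and its flags are illustrative, not the repo's code):

import fire

def local_chat_demo(model_path: str, gguf_path: str, cpu_infer: int = 33,
                    use_cuda_graph: bool = True, force_think: bool = False):
    # Stand-in for local_chat: fire maps each parameter to a --flag, so
    # `python demo.py --model_path=/path --cpu_infer=16 --use_cuda_graph=False`
    # reaches this function with the parsed values.
    print(model_path, gguf_path, cpu_infer, use_cuda_graph, force_think)

if __name__ == "__main__":
    fire.Fire(local_chat_demo)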
Changed file 2 of 3 (hunk in class KExpertsCPU):

@@ -159,7 +159,7 @@ class KExpertsCPU(KExpertsBase):
         down_ptr = ctypes.addressof(
             ctypes.cast(self.down.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents
         )
-        # print(self.gate_qtype, self.up_qtype, self.down_qtype)
+        #print(self.gate_type, self.up_type, self.down_type)
         n_routed_experts = self.n_routed_experts
         # n_routed_experts = len(self.orig_module)
         moe_config = MOEConfig(
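The only change in this hunk is renaming the commented-out debug print from the *_qtype attributes to the *_type attributes. For readers unfamiliar with the surrounding pointer dance, the cast/contents/addressof round trip turns the integer address of a numpy buffer back into the same raw address that can be handed to the native MoE backend. A small self-contained illustration of that round trip (the array here is a stand-in, not the real expert-weight pointer table):

import ctypes
import numpy as np

# Stand-in for self.down: a small uint64 buffer whose raw address we need.
down = np.zeros(4, dtype=np.uint64)

down_ptr = ctypes.addressof(
    ctypes.cast(down.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents
)

# The round trip recovers the buffer's base address unchanged.
assert down_ptr == down.ctypes.data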
Changed file 3 of 3 (hunk in def prefill_and_generate):

@@ -207,7 +207,7 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
         tokens.append(int(next_token))
         seq_length += 1
 
-        if next_token[0].item() == tokenizer.eos_token_id or tokenizer.decode(next_token) == '<|im_end|>':
+        if next_token[0].item() == tokenizer.eos_token_id or tokenizer.decode(next_token.tolist()) == '<|im_end|>':
             print(stream.end(), end="", flush=True)
             break
         else:
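The stop check now decodes next_token.tolist() instead of the tensor itself, so the comparison against '<|im_end|>' (presumably the end-of-turn marker used by Moonlight's chat template) gets a string built from plain Python ids rather than relying on the tokenizer accepting a torch.Tensor. A hedged sketch of the same check factored into a helper (is_stop_token and its arguments are hypothetical names; eos_token_id, decode, and the '<|im_end|>' literal come from the hunk):

import torch

def is_stop_token(next_token: torch.Tensor, tokenizer) -> bool:
    # next_token is a 1-element tensor holding the sampled id, as in the loop above.
    token_id = next_token[0].item()
    # .tolist() hands decode a plain list of ints, which any Hugging Face-style
    # tokenizer accepts, instead of a torch.Tensor.
    text = tokenizer.decode(next_token.tolist())
    return token_id == tokenizer.eos_token_id or text == "<|im_end|>"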