From 27990dc6fbbba6212c20bc3c981889fecc356ca8 Mon Sep 17 00:00:00 2001
From: qiyuxinlin <1668068727@qq.com>
Date: Mon, 28 Apr 2025 21:08:13 +0000
Subject: [PATCH] fix load bug

---
 ktransformers/operators/experts.py                            | 3 ++-
 ktransformers/server/balance_serve/inference/model_runner.py  | 2 +-
 requirements-local_chat.txt                                   | 1 +
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/ktransformers/operators/experts.py b/ktransformers/operators/experts.py
index 8e8f2b0..34f0af0 100644
--- a/ktransformers/operators/experts.py
+++ b/ktransformers/operators/experts.py
@@ -25,7 +25,6 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext
 sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build", "Debug"))
 import cpuinfer_ext
 from cpuinfer_ext.moe import MOEConfig, MOE
-from cpuinfer_ext.moe import AMX_MOEConfig, AMXBF16_MOE, AMXInt8_MOE
 import ctypes
 from ktransformers.util.custom_gguf import GGMLQuantizationType, GGUFLoader
 from ktransformers.util.utils import InferenceState
@@ -186,6 +185,7 @@ class KExpertsCPU(KExpertsBase):
             )
             self.moe = MOE(moe_config)
         elif self.backend == "AMXBF16":
+            from cpuinfer_ext.moe import AMX_MOEConfig, AMXBF16_MOE
             assert self.gate_type == GGMLQuantizationType.BF16
             assert self.up_type == GGMLQuantizationType.BF16
             assert self.down_type == GGMLQuantizationType.BF16
@@ -203,6 +203,7 @@ class KExpertsCPU(KExpertsBase):
             self.cpu_infer.submit(self.moe.load_weights())
             self.cpu_infer.sync()
         elif self.backend == "AMXInt8":
+            from cpuinfer_ext.moe import AMX_MOEConfig, AMXInt8_MOE
             assert self.gate_type == GGMLQuantizationType.BF16
             assert self.up_type == GGMLQuantizationType.BF16
             assert self.down_type == GGMLQuantizationType.BF16
diff --git a/ktransformers/server/balance_serve/inference/model_runner.py b/ktransformers/server/balance_serve/inference/model_runner.py
index 834fd1b..79b3053 100644
--- a/ktransformers/server/balance_serve/inference/model_runner.py
+++ b/ktransformers/server/balance_serve/inference/model_runner.py
@@ -85,7 +85,7 @@ class ModelRunner:
         elif isinstance(self.model, KQwen2MoeForCausalLM) or isinstance(self.model, KQwen3MoeForCausalLM):
             self.model.flash_infer_attn_plan(batch, self.bsz_tensor_buf, self.num_tokens_tensor_buf,
                                              num_q_heads=self.model.config.num_attention_heads, num_kv_heads=self.model.config.num_key_value_heads,
-                                             head_dim=self.model.config.head_dim if hasattr(self.model.config, 'head_num') else self.model.config.hidden_size // self.model.config.num_attention_heads,
+                                             head_dim=self.model.config.head_dim if hasattr(self.model.config, 'head_dim') else self.model.config.hidden_size // self.model.config.num_attention_heads,
                                              page_size=self.model.cache.page_size, causal=True,
                                              q_data_type=torch.bfloat16, kv_data_type=torch.bfloat16, cuda_graph_idx=cuda_graph_idx)
         else:
diff --git a/requirements-local_chat.txt b/requirements-local_chat.txt
index 25afaef..082adf7 100644
--- a/requirements-local_chat.txt
+++ b/requirements-local_chat.txt
@@ -7,3 +7,4 @@ cpufeature; sys_platform == 'win32' or sys_platform == 'Windows'
 protobuf
 tiktoken
 blobfile
+triton==3.3
\ No newline at end of file