support glm4moe

djw 2025-07-25 17:22:20 +00:00
parent 1677e90092
commit d03d92ba53
31 changed files with 2265 additions and 74 deletions

@@ -24,7 +24,11 @@ from ktransformers.models.custom_modeling_deepseek_v3 import KDeepseekV3ForCausa
 from ktransformers.models.custom_modeling_deepseek_v2 import KDeepseekV2ForCausalLM
 from ktransformers.models.custom_modeling_qwen2_moe import KQwen2MoeForCausalLM
 from ktransformers.models.custom_modeling_qwen3_moe import KQwen3MoeForCausalLM
+from ktransformers.models.custom_modeling_smallthinker import KSmallThinkerForCausalLM
+from ktransformers.models.custom_modeling_glm4_moe import KGlm4MoeForCausalLM
 from ktransformers.models.configuration_qwen3_moe import Qwen3MoeConfig
+from ktransformers.models.configuration_smallthinker import SmallthinkerConfig
+from ktransformers.models.configuration_glm4_moe import Glm4MoeConfig
 from ktransformers.server.balance_serve.inference.model_runner import ModelRunner
 from ktransformers.server.balance_serve.inference.sampling.sampler import Sampler, SamplingOptions
 from ktransformers.server.balance_serve.inference.query_manager import QueryManager
@@ -60,6 +64,8 @@ default_optimize_rules = {
     "DeepseekV3ForCausalLM": ktransformer_rules_dir + "DeepSeek-V3-Chat-serve.yaml",
     "Qwen2MoeForCausalLM": ktransformer_rules_dir + "Qwen2-serve.yaml",
     "Qwen3MoeForCausalLM": ktransformer_rules_dir + "Qwen3Moe-serve.yaml",
+    "SmallThinkerForCausalLM": ktransformer_rules_dir + "Smallthinker-serve.yaml",
+    "Glm4MoeForCausalLM": ktransformer_rules_dir + "Glm4Moe-serve.yaml",
 }
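
Outside the diff, a minimal sketch of how a rule table like this is typically consulted when resolving the optimize config; the lookup key (config.architectures[0]) and the error handling are assumptions for illustration, not taken from this commit:

    # Illustrative only: pick the serve YAML for the loaded checkpoint.
    arch = config.architectures[0]                  # e.g. "Glm4MoeForCausalLM"
    rule_path = default_optimize_rules.get(arch)
    if rule_path is None:
        raise ValueError(f"no serve rule registered for architecture {arch}")
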
@@ -123,15 +129,25 @@ class Engine:
         self.sched_client = SchedulerClient(args.sched_port)
         self.updates = []
-        try:
-            config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=True)
-        except:
-            if args.model_name == "Qwen3Moe":
-                config = Qwen3MoeConfig.from_pretrained(args.model_dir, trust_remote_code=True)
-            else:
-                assert False, f"model {args.model_name} not supported"
+        print(f"args.architectures: {args.architectures}")
+        if args.architectures == "Qwen3MoeForCausalLM":
+            config = Qwen3MoeConfig.from_pretrained(args.model_dir, trust_remote_code=True)
+        elif args.architectures == "Glm4MoeForCausalLM":
+            config = Glm4MoeConfig.from_pretrained(args.model_dir, trust_remote_code=True)
+        elif args.architectures == "SmallThinkerForCausalLM":
+            config = SmallthinkerConfig.from_pretrained(args.model_dir, trust_remote_code=True)
+            config._attn_implementation = "eager"
+            config.moe_intermediate_size = config.moe_ffn_hidden_size
+        else:
+            try:
+                config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=True)
+            except:
+                raise ValueError(f"Model {args.architectures} not supported. Please check your model directory or model name.")
         self.gen_queue = generated_token_queue
         with torch.device("meta"):
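
As an aside (not part of the commit), the new architecture-to-config dispatch above could also be written as a lookup table; this sketch assumes the same names the diff introduces and omits the SmallThinker-specific attribute tweaks:

    # Illustrative refactor only; mirrors the committed if/elif chain,
    # minus the SmallThinker eager-attention and moe_intermediate_size fix-ups.
    CONFIG_CLASSES = {
        "Qwen3MoeForCausalLM": Qwen3MoeConfig,
        "Glm4MoeForCausalLM": Glm4MoeConfig,
        "SmallThinkerForCausalLM": SmallthinkerConfig,
    }
    config_cls = CONFIG_CLASSES.get(args.architectures, AutoConfig)
    config = config_cls.from_pretrained(args.model_dir, trust_remote_code=True)
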
@@ -147,6 +163,13 @@ class Engine:
                     self.model = KQwen2MoeForCausalLM(config, self.cache)
                 else:
                     self.model = KQwen3MoeForCausalLM(config, self.cache)
+            elif config.architectures[0] == "SmallThinkerForCausalLM":
+                self.cache = KGQACache(config, self.args.page_size)
+                self.model = KSmallThinkerForCausalLM(config, self.cache)
+            elif config.architectures[0] == "Glm4MoeForCausalLM":
+                self.cache = KGQACache(config, self.args.page_size)
+                self.model = KGlm4MoeForCausalLM(config, self.cache)
         context = zmq.Context()
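
The GLM-4 MoE branch above pairs a KGQACache with the new causal-LM wrapper, built under the meta-device context. A standalone sketch of that path, where the checkpoint directory and page size are placeholders and the KGQACache import path is not shown in this diff:

    # Illustrative only: build the new GLM-4 MoE model skeleton without weights.
    import torch
    from ktransformers.models.configuration_glm4_moe import Glm4MoeConfig
    from ktransformers.models.custom_modeling_glm4_moe import KGlm4MoeForCausalLM
    # from ... import KGQACache   # module path not visible in this hunk

    config = Glm4MoeConfig.from_pretrained("/path/to/glm4-moe-checkpoint", trust_remote_code=True)
    with torch.device("meta"):
        cache = KGQACache(config, 256)      # second argument is the page size; 256 is a placeholder
        model = KGlm4MoeForCausalLM(config, cache)
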
@@ -197,7 +220,7 @@ class Engine:
         self.block_num = inference_context.k_cache[0].size(1)
         self.model_runner = ModelRunner(self.model, self.device, self.args.use_cuda_graph, page_size = args.page_size, block_num=self.block_num)
         #@TODO add config
-        if config.architectures[0] == "Qwen2MoeForCausalLM" or config.architectures[0] == "Qwen3MoeForCausalLM":
+        if config.architectures[0] == "Qwen2MoeForCausalLM" or config.architectures[0] == "Qwen3MoeForCausalLM" or config.architectures[0] == "Glm4MoeForCausalLM" or config.architectures[0] == "SmallThinkerForCausalLM":
             self.model.init_wrapper(self.args.use_cuda_graph, self.device, max(self.model_runner.cuda_graphs), args.max_batch_size, self.block_num)
         else:
             self.model.init_wrapper(self.args.use_cuda_graph, self.device, args.max_batch_size, self.block_num)
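
With four architectures now sharing the wrapper initialization that passes the cuda-graph sizes, the widened condition above could be expressed as a set-membership test; an illustrative equivalent, keeping the call signatures exactly as committed:

    # Illustrative refactor only; same behaviour as the committed if/else.
    GQA_MOE_ARCHS = {
        "Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM",
        "Glm4MoeForCausalLM", "SmallThinkerForCausalLM",
    }
    if config.architectures[0] in GQA_MOE_ARCHS:
        self.model.init_wrapper(self.args.use_cuda_graph, self.device,
                                max(self.model_runner.cuda_graphs),
                                args.max_batch_size, self.block_num)
    else:
        self.model.init_wrapper(self.args.use_cuda_graph, self.device,
                                args.max_batch_size, self.block_num)
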