From 33cbd47086d34bba26c0a007896f3b87f9853479 Mon Sep 17 00:00:00 2001
From: djw <1913953267@qq.com>
Date: Mon, 28 Apr 2025 18:15:35 +0000
Subject: [PATCH] support qwen3

---
 csrc/ktransformers_ext/CMakeLists.txt             |  1 -
 .../optimize_rules/Qwen2-serve-amx.yaml           | 96 +++++++++++++++++++
 .../optimize/optimize_rules/Qwen2-serve.yaml      |  1 -
 .../optimize_rules/Qwen3Moe-serve-amx.yaml        | 96 +++++++++++++++++++
 .../optimize_rules/Qwen3Moe-serve.yaml            |  1 -
 ktransformers/server/args.py                      |  1 -
 .../backend/interfaces/balance_serve.py           |  4 +-
 .../balance_serve/inference/model_runner.py       |  2 +-
 8 files changed, 195 insertions(+), 7 deletions(-)
 create mode 100644 ktransformers/optimize/optimize_rules/Qwen2-serve-amx.yaml
 create mode 100644 ktransformers/optimize/optimize_rules/Qwen3Moe-serve-amx.yaml

diff --git a/csrc/ktransformers_ext/CMakeLists.txt b/csrc/ktransformers_ext/CMakeLists.txt
index 6de2bcf..d9cd12d 100644
--- a/csrc/ktransformers_ext/CMakeLists.txt
+++ b/csrc/ktransformers_ext/CMakeLists.txt
@@ -151,7 +151,6 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
         message(STATUS "Compiler and/or CPU do NOT support AVX512F")
         set(HAS_AVX512 False)
     endif()
-    set(HAS_AVX512 False)
 
     # check AMX
     string(FIND "${LSCPU_OUTPUT}" "amx" COMPILER_SUPPORTS_AMX)
diff --git a/ktransformers/optimize/optimize_rules/Qwen2-serve-amx.yaml b/ktransformers/optimize/optimize_rules/Qwen2-serve-amx.yaml
new file mode 100644
index 0000000..27dba2b
--- /dev/null
+++ b/ktransformers/optimize/optimize_rules/Qwen2-serve-amx.yaml
@@ -0,0 +1,96 @@
+- match:
+    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
+  replace:
+    class: ktransformers.operators.RoPE.RotaryEmbedding
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+
+- match:
+    name: "^lm_head$" # regular expression
+    class: torch.nn.Linear # only match modules matching name and class simultaneously
+  replace:
+    class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+      generate_op: "KLinearMarlin"
+      prefill_op: "KLinearTorch"
+
+# - match:
+#     name: "^model\\.layers\\..*$" # regular expression
+#     class: torch.nn.Linear # only match modules matching name and class simultaneously
+#   replace:
+#     class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
+#     kwargs:
+#       generate_device: "cuda"
+#       prefill_device: "cuda"
+#       generate_op: "VLinearMarlin"
+#       prefill_op: "KLinearTorch"
+- match:
+    name: "^model\\.layers\\.(?!.*mlp\\.shared_expert_gate).*$" # regular expression
+    class: torch.nn.Linear # only match modules matching name and class simultaneously
+  replace:
+    class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+      generate_op: "VLinearMarlin"
+      prefill_op: "KLinearTorch"
+- match:
+    name: "^model\\.layers\\..*\\.mlp$"
+    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock
+  replace:
+    class: ktransformers.operators.experts.KQwen2MoeSparseMoeBlockV2 # mlp module with custom forward function
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+
+- match:
+    name: "^model\\.layers\\..*\\.mlp\\.experts$"
+  replace:
+    class: ktransformers.operators.experts.KTransformersExpertsV2 # custom MoE Kernel with expert parallelism
+    kwargs:
+      prefill_device: "cuda"
+      prefill_op: "KExpertsTorch"
+      generate_device: "cpu"
+      generate_op: "KExpertsCPU"
+      out_device: "cuda"
+      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
+  recursive: False # don't recursively inject submodules of this module
+- match:
+    name: "^model\\.layers\\..*\\.self_attn$"
+  replace:
+    class: ktransformers.operators.balance_serve_attention.KQwen2MoeAttention # optimized attention implementation
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+- match:
+    name: "^model$"
+  replace:
+    class: "ktransformers.operators.models.KQwen2MoeModel"
+    kwargs:
+      per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
+- match:
+    name: "^model.embed_tokens"
+  replace:
+    class: "default"
+    kwargs:
+      generate_device: "cpu"
+      prefill_device: "cpu"
+
+- match:
+    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRMSNorm
+  replace:
+    class: ktransformers.operators.layernorm.KQwen2MoeRMSNorm
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+
+- match:
+    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeMLP
+  replace:
+    class: ktransformers.operators.mlp.KQwen2MoeMLP
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
\ No newline at end of file
diff --git a/ktransformers/optimize/optimize_rules/Qwen2-serve.yaml b/ktransformers/optimize/optimize_rules/Qwen2-serve.yaml
index 27dba2b..41b41a7 100644
--- a/ktransformers/optimize/optimize_rules/Qwen2-serve.yaml
+++ b/ktransformers/optimize/optimize_rules/Qwen2-serve.yaml
@@ -56,7 +56,6 @@
       generate_device: "cpu"
       generate_op: "KExpertsCPU"
       out_device: "cuda"
-      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
   recursive: False # don't recursively inject submodules of this module
 - match:
     name: "^model\\.layers\\..*\\.self_attn$"
diff --git a/ktransformers/optimize/optimize_rules/Qwen3Moe-serve-amx.yaml b/ktransformers/optimize/optimize_rules/Qwen3Moe-serve-amx.yaml
new file mode 100644
index 0000000..8607ca0
--- /dev/null
+++ b/ktransformers/optimize/optimize_rules/Qwen3Moe-serve-amx.yaml
@@ -0,0 +1,96 @@
+- match:
+    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
+  replace:
+    class: ktransformers.operators.RoPE.RotaryEmbedding
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+
+- match:
+    name: "^lm_head$" # regular expression
+    class: torch.nn.Linear # only match modules matching name and class simultaneously
+  replace:
+    class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+      generate_op: "VLinearMarlin"
+      prefill_op: "KLinearTorch"
+
+# - match:
+#     name: "^model\\.layers\\..*$" # regular expression
+#     class: torch.nn.Linear # only match modules matching name and class simultaneously
+#   replace:
+#     class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
+#     kwargs:
+#       generate_device: "cuda"
+#       prefill_device: "cuda"
+#       generate_op: "VLinearMarlin"
+#       prefill_op: "KLinearTorch"
+- match:
+    name: "^model\\.layers\\.(?!.*mlp\\.shared_expert_gate).*$" # regular expression
+    class: torch.nn.Linear # only match modules matching name and class simultaneously
+  replace:
+    class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+      generate_op: "KLinearMarlin"
+      prefill_op: "KLinearTorch"
+- match:
+    name: "^model\\.layers\\..*\\.mlp$"
+    class: ktransformers.models.modeling_qwen3_moe.Qwen3MoeSparseMoeBlock
+  replace:
+    class: ktransformers.operators.experts.KQwen3MoeSparseMoeBlockV2 # mlp module with custom forward function
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+
+- match:
+    name: "^model\\.layers\\..*\\.mlp\\.experts$"
+  replace:
+    class: ktransformers.operators.experts.KTransformersExpertsV2 # custom MoE Kernel with expert parallelism
+    kwargs:
+      prefill_device: "cuda"
+      prefill_op: "KExpertsTorch"
+      generate_device: "cpu"
+      generate_op: "KExpertsCPU"
+      out_device: "cuda"
+      backend: "AMXBF16" # or "AMXInt8" or "llamafile" (default)
+  recursive: False # don't recursively inject submodules of this module
+- match:
+    name: "^model\\.layers\\..*\\.self_attn$"
+  replace:
+    class: ktransformers.operators.balance_serve_attention.KQwen3MoeAttention # optimized attention implementation
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+- match:
+    name: "^model$"
+  replace:
+    class: "ktransformers.operators.models.KQwen2MoeModel"
+    kwargs:
+      per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
+- match:
+    name: "^model.embed_tokens"
+  replace:
+    class: "default"
+    kwargs:
+      generate_device: "cpu"
+      prefill_device: "cpu"
+
+- match:
+    class: ktransformers.models.modeling_qwen3_moe.Qwen3MoeRMSNorm
+  replace:
+    class: ktransformers.operators.layernorm.KQwen3MoeRMSNorm
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+
+- match:
+    class: ktransformers.models.modeling_qwen3_moe.Qwen3MoeMLP
+  replace:
+    class: ktransformers.operators.mlp.KQwen2MoeMLP
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
\ No newline at end of file
diff --git a/ktransformers/optimize/optimize_rules/Qwen3Moe-serve.yaml b/ktransformers/optimize/optimize_rules/Qwen3Moe-serve.yaml
index 8607ca0..63f67da 100644
--- a/ktransformers/optimize/optimize_rules/Qwen3Moe-serve.yaml
+++ b/ktransformers/optimize/optimize_rules/Qwen3Moe-serve.yaml
@@ -56,7 +56,6 @@
       generate_device: "cpu"
       generate_op: "KExpertsCPU"
       out_device: "cuda"
-      backend: "AMXBF16" # or "AMXBF16" or "llamafile" (default)
   recursive: False # don't recursively inject submodules of this module
 - match:
     name: "^model\\.layers\\..*\\.self_attn$"
diff --git a/ktransformers/server/args.py b/ktransformers/server/args.py
index 95934e4..b2a6769 100644
--- a/ktransformers/server/args.py
+++ b/ktransformers/server/args.py
@@ -94,7 +94,6 @@ class ArgumentParser:
         parser.add_argument("--user_algorithm", type=str, default=self.cfg.user_algorithm)
         parser.add_argument("--force_think", action=argparse.BooleanOptionalAction, type=bool, default=self.cfg.user_force_think)
         parser.add_argument("--use_cuda_graph", action=argparse.BooleanOptionalAction, type=bool, default=self.cfg.use_cuda_graph)
-        # parser.add_argument("--use_cuda_graph", action=argparse.BooleanOptionalAction, type=bool, default=False)
 
         # web config
         parser.add_argument("--web_cross_domain", type=bool, default=self.cfg.web_cross_domain)
diff --git a/ktransformers/server/backend/interfaces/balance_serve.py b/ktransformers/server/backend/interfaces/balance_serve.py
index 6301e97..2d89332 100644
--- a/ktransformers/server/backend/interfaces/balance_serve.py
+++ b/ktransformers/server/backend/interfaces/balance_serve.py
@@ -56,8 +56,8 @@ ktransformer_rules_dir = (
     os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "..", "..", "./optimize/optimize_rules/")
 )
 default_optimize_rules = {
-    "DeepseekV3ForCausalLM": ktransformer_rules_dir + "Moonlight-16B-A3B-serve.yaml",
-    # "DeepseekV3ForCausalLM": ktransformer_rules_dir + "DeepSeek-V3-Chat-serve.yaml",
+    # "DeepseekV3ForCausalLM": ktransformer_rules_dir + "Moonlight-16B-A3B-serve.yaml",
"DeepseekV3ForCausalLM": ktransformer_rules_dir + "DeepSeek-V3-Chat-serve.yaml", "Qwen2MoeForCausalLM": ktransformer_rules_dir + "Qwen2-serve.yaml", "Qwen3MoeForCausalLM": ktransformer_rules_dir + "Qwen3Moe-serve.yaml", } diff --git a/ktransformers/server/balance_serve/inference/model_runner.py b/ktransformers/server/balance_serve/inference/model_runner.py index 0193576..834fd1b 100644 --- a/ktransformers/server/balance_serve/inference/model_runner.py +++ b/ktransformers/server/balance_serve/inference/model_runner.py @@ -85,7 +85,7 @@ class ModelRunner: elif isinstance(self.model, KQwen2MoeForCausalLM) or isinstance(self.model, KQwen3MoeForCausalLM): self.model.flash_infer_attn_plan(batch, self.bsz_tensor_buf, self.num_tokens_tensor_buf, num_q_heads=self.model.config.num_attention_heads, num_kv_heads=self.model.config.num_key_value_heads, - head_dim=128, + head_dim=self.model.config.head_dim if hasattr(self.model.config, 'head_num') else self.model.config.hidden_size // self.model.config.num_attention_heads, page_size=self.model.cache.page_size, causal=True, q_data_type=torch.bfloat16, kv_data_type=torch.bfloat16, cuda_graph_idx=cuda_graph_idx) else: