mirror of https://github.com/kvcache-ai/ktransformers.git (synced 2025-09-06 12:40:02 +00:00)

Commit 33cbd47086 (parent 68c2b2e6e6): support qwen3
8 changed files with 195 additions and 7 deletions
@@ -151,7 +151,6 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
     message(STATUS "Compiler and/or CPU do NOT support AVX512F")
     set(HAS_AVX512 False)
   endif()
-  set(HAS_AVX512 False)

   # check AMX
   string(FIND "${LSCPU_OUTPUT}" "amx" COMPILER_SUPPORTS_AMX)
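For context, the CMake probe above only substring-searches the lscpu output; string(FIND ...) stores the match offset (or -1 when absent) in COMPILER_SUPPORTS_AMX. A minimal Python sketch of the same idea, illustrative only and not part of this commit:

# Illustrative sketch, not part of the commit: detect AMX the same way the
# CMake check does, by looking for the "amx" flag in `lscpu` output.
import subprocess

def cpu_supports_amx() -> bool:
    try:
        out = subprocess.run(["lscpu"], capture_output=True, text=True, check=True).stdout
    except (OSError, subprocess.CalledProcessError):
        return False  # lscpu missing or failed: assume no AMX
    # CMake's string(FIND ...) yields -1 when the substring is absent;
    # Python's str.find() behaves the same way.
    return out.lower().find("amx") != -1

if __name__ == "__main__":
    print("AMX detected:", cpu_supports_amx())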
ktransformers/optimize/optimize_rules/Qwen2-serve-amx.yaml (new file, 96 lines)
@@ -0,0 +1,96 @@
- match:
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$" # regular expression
    class: torch.nn.Linear # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# - match:
#     name: "^model\\.layers\\..*$" # regular expression
#     class: torch.nn.Linear # only match modules matching name and class simultaneously
#   replace:
#     class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
#     kwargs:
#       generate_device: "cuda"
#       prefill_device: "cuda"
#       generate_op: "VLinearMarlin"
#       prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.(?!.*mlp\\.shared_expert_gate).*$" # regular expression
    class: torch.nn.Linear # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "VLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.KQwen2MoeSparseMoeBlockV2 # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExpertsV2 # custom MoE Kernel with expert parallelism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.balance_serve_attention.KQwen2MoeAttention # optimized attention implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KQwen2MoeModel"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

- match:
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRMSNorm
  replace:
    class: ktransformers.operators.layernorm.KQwen2MoeRMSNorm
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeMLP
  replace:
    class: ktransformers.operators.mlp.KQwen2MoeMLP
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
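Each entry in this rule file pairs a match section (a name regex and/or a fully qualified class) with a replace section naming the ktransformers operator to inject and its kwargs (device placement, linear and expert backends). A rough sketch of the matching semantics, assuming only the name-regex and class checks; the real injector also applies the replacement class, kwargs, and the recursive flag:

# Rough sketch of how a match/replace rule selects modules; illustrative only,
# not the actual ktransformers injection code.
import importlib
import re
import yaml
import torch.nn as nn

def _resolve(path: str):
    # Turn "torch.nn.Linear" into the class object it names.
    mod, _, cls = path.rpartition(".")
    return getattr(importlib.import_module(mod), cls)

def matches(rule: dict, model: nn.Module):
    name_re = rule["match"].get("name")    # e.g. "^lm_head$"
    cls_path = rule["match"].get("class")  # e.g. "torch.nn.Linear"
    target_cls = _resolve(cls_path) if cls_path else None
    for name, module in model.named_modules():
        if name_re and re.match(name_re, name) is None:
            continue
        if target_cls and not isinstance(module, target_cls):
            continue
        yield name, module

# Usage sketch: list which submodules the lm_head rule would replace.
with open("ktransformers/optimize/optimize_rules/Qwen2-serve-amx.yaml") as f:
    rules = yaml.safe_load(f)
model = nn.Module()  # stand-in; a real Qwen2MoeForCausalLM would go here
for name, _ in matches(rules[1], model):
    print(name, "->", rules[1]["replace"]["class"])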
@@ -56,7 +56,6 @@
       generate_device: "cpu"
       generate_op: "KExpertsCPU"
       out_device: "cuda"
-      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
   recursive: False # don't recursively inject submodules of this module
 - match:
     name: "^model\\.layers\\..*\\.self_attn$"
@@ -0,0 +1,96 @@
- match:
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$" # regular expression
    class: torch.nn.Linear # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "VLinearMarlin"
      prefill_op: "KLinearTorch"

# - match:
#     name: "^model\\.layers\\..*$" # regular expression
#     class: torch.nn.Linear # only match modules matching name and class simultaneously
#   replace:
#     class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
#     kwargs:
#       generate_device: "cuda"
#       prefill_device: "cuda"
#       generate_op: "VLinearMarlin"
#       prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.(?!.*mlp\\.shared_expert_gate).*$" # regular expression
    class: torch.nn.Linear # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_qwen3_moe.Qwen3MoeSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.KQwen3MoeSparseMoeBlockV2 # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExpertsV2 # custom MoE Kernel with expert parallelism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
      backend: "AMXBF16" # or "AMXInt8" or "llamafile" (default)
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.balance_serve_attention.KQwen3MoeAttention # optimized attention implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KQwen2MoeModel"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

- match:
    class: ktransformers.models.modeling_qwen3_moe.Qwen3MoeRMSNorm
  replace:
    class: ktransformers.operators.layernorm.KQwen3MoeRMSNorm
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    class: ktransformers.models.modeling_qwen3_moe.Qwen3MoeMLP
  replace:
    class: ktransformers.operators.mlp.KQwen2MoeMLP
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
@@ -56,7 +56,6 @@
       generate_device: "cpu"
       generate_op: "KExpertsCPU"
       out_device: "cuda"
-      backend: "AMXBF16" # or "AMXInt8" or "llamafile" (default)
   recursive: False # don't recursively inject submodules of this module
 - match:
     name: "^model\\.layers\\..*\\.self_attn$"
@@ -94,7 +94,6 @@ class ArgumentParser:
         parser.add_argument("--user_algorithm", type=str, default=self.cfg.user_algorithm)
         parser.add_argument("--force_think", action=argparse.BooleanOptionalAction, type=bool, default=self.cfg.user_force_think)
         parser.add_argument("--use_cuda_graph", action=argparse.BooleanOptionalAction, type=bool, default=self.cfg.use_cuda_graph)
-        # parser.add_argument("--use_cuda_graph", action=argparse.BooleanOptionalAction, type=bool, default=False)

         # web config
         parser.add_argument("--web_cross_domain", type=bool, default=self.cfg.web_cross_domain)
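Both --force_think and --use_cuda_graph rely on argparse.BooleanOptionalAction (Python 3.9+), so each flag automatically gains a negated --no-... form; the commented-out duplicate definition is what gets dropped here. A small standalone illustration of that behaviour, assuming stock argparse and not taken from this commit:

# Standalone illustration of BooleanOptionalAction (not code from this commit).
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--use_cuda_graph", action=argparse.BooleanOptionalAction, default=True)

print(parser.parse_args([]))                       # Namespace(use_cuda_graph=True)
print(parser.parse_args(["--use_cuda_graph"]))     # Namespace(use_cuda_graph=True)
print(parser.parse_args(["--no-use_cuda_graph"]))  # Namespace(use_cuda_graph=False)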
@@ -56,8 +56,8 @@ ktransformer_rules_dir = (
     os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "..", "..", "./optimize/optimize_rules/")
 )
 default_optimize_rules = {
-    "DeepseekV3ForCausalLM": ktransformer_rules_dir + "Moonlight-16B-A3B-serve.yaml",
-    # "DeepseekV3ForCausalLM": ktransformer_rules_dir + "DeepSeek-V3-Chat-serve.yaml",
+    # "DeepseekV3ForCausalLM": ktransformer_rules_dir + "Moonlight-16B-A3B-serve.yaml",
+    "DeepseekV3ForCausalLM": ktransformer_rules_dir + "DeepSeek-V3-Chat-serve.yaml",
     "Qwen2MoeForCausalLM": ktransformer_rules_dir + "Qwen2-serve.yaml",
     "Qwen3MoeForCausalLM": ktransformer_rules_dir + "Qwen3Moe-serve.yaml",
 }
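This table keys the optimize-rule YAML on the model's architecture string, so a Qwen3 MoE checkpoint resolves to Qwen3Moe-serve.yaml. A minimal sketch of that lookup; the checkpoint id is only an example and the exact loading path in the server is assumed, not quoted from this commit:

# Minimal sketch of the architecture -> rule-file lookup; illustrative only.
from transformers import AutoConfig

ktransformer_rules_dir = "ktransformers/optimize/optimize_rules/"
default_optimize_rules = {
    "DeepseekV3ForCausalLM": ktransformer_rules_dir + "DeepSeek-V3-Chat-serve.yaml",
    "Qwen2MoeForCausalLM": ktransformer_rules_dir + "Qwen2-serve.yaml",
    "Qwen3MoeForCausalLM": ktransformer_rules_dir + "Qwen3Moe-serve.yaml",
}

config = AutoConfig.from_pretrained("Qwen/Qwen3-30B-A3B", trust_remote_code=True)
arch = config.architectures[0]           # "Qwen3MoeForCausalLM" for Qwen3 MoE models
print(arch, "->", default_optimize_rules[arch])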
@@ -85,7 +85,7 @@ class ModelRunner:
         elif isinstance(self.model, KQwen2MoeForCausalLM) or isinstance(self.model, KQwen3MoeForCausalLM):
             self.model.flash_infer_attn_plan(batch, self.bsz_tensor_buf, self.num_tokens_tensor_buf,
                 num_q_heads=self.model.config.num_attention_heads, num_kv_heads=self.model.config.num_key_value_heads,
-                head_dim=128,
+                head_dim=self.model.config.head_dim if hasattr(self.model.config, 'head_num') else self.model.config.hidden_size // self.model.config.num_attention_heads,
                 page_size=self.model.cache.page_size, causal=True,
                 q_data_type=torch.bfloat16, kv_data_type=torch.bfloat16, cuda_graph_idx=cuda_graph_idx)
         else:
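The head_dim argument is no longer hard-coded to 128: Qwen3 MoE configs typically carry an explicit head_dim that differs from hidden_size / num_attention_heads, while Qwen2 MoE configs do not. Note the committed guard checks hasattr(config, 'head_num') while reading config.head_dim, which looks like it is meant to check 'head_dim'; the sketch below assumes that reading and uses hypothetical stand-in configs:

# Illustrative sketch of the intended head_dim fallback (not the committed line,
# which guards on 'head_num'); the stand-in config values are hypothetical examples.
def resolve_head_dim(config) -> int:
    if getattr(config, "head_dim", None) is not None:
        return config.head_dim
    return config.hidden_size // config.num_attention_heads

class Qwen2MoeLikeConfig:   # no explicit head_dim
    hidden_size, num_attention_heads = 3584, 28

class Qwen3MoeLikeConfig:   # explicit head_dim, larger than hidden_size // heads
    head_dim = 128
    hidden_size, num_attention_heads = 2048, 32

print(resolve_head_dim(Qwen2MoeLikeConfig()))  # 128 (3584 // 28)
print(resolve_head_dim(Qwen3MoeLikeConfig()))  # 128 (explicit, vs 2048 // 32 = 64)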