- match:
    name: "^model\\.layers\\.([012])\\."
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([012])\\."  # regular expression
    class: torch.nn.Linear  # only match modules that match both name and class
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel for quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.([012])\\.mlp$"
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.KQwen2MoeSparseMoeBlock  # MLP module with a custom forward function
- match:
    name: "^model\\.layers\\.([012])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE kernel with expert parallelism
    # device: "cpu"  # which device to load this module onto when initializing
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:0"
  recursive: False  # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])\\."
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])\\."  # regular expression
    class: torch.nn.Linear  # only match modules that match both name and class
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel for quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])\\.mlp$"
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.KQwen2MoeSparseMoeBlock  # MLP module with a custom forward function
- match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE kernel with expert parallelism
    # device: "cpu"  # which device to load this module onto when initializing
    kwargs:
      prefill_device: "cuda:1"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:1"
  recursive: False  # don't recursively inject submodules of this module
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
- match:
    name: "(^model.norm)|(^lm_head)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KQwen2MoeModel"
    kwargs:
      per_layer_prefill_intput_threshold: 0  # 0 disables layer-wise prefill
      transfer_map:
        3: "cuda:1"
- match:
    name: "^model\\.layers\\.([012])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
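# Editorial notes (not part of the original rule set):
#
# * The two trailing "default" rules are catch-alls: any module in layers 0-2 that was not
#   already handled by an earlier rule is simply placed on cuda:0, and any remaining module
#   in layers 3+ on cuda:1, so every weight ends up on a well-defined device.
# * transfer_map in the KQwen2MoeModel rule moves the hidden states to "cuda:1" starting at
#   layer 3, matching the layer split used by the regular expressions above.
# * Assuming a third GPU were available, the same mechanism could in principle be extended
#   with another entry (the layer boundary below is only illustrative):
#
#       transfer_map:
#         3: "cuda:1"
#         16: "cuda:2"
#
#   The per-layer rules (RoPE, KTransformersLinear, the MoE rules, and the "default"
#   catch-alls) would then also need a third set of regexes placing layers 16 and above on
#   "cuda:2".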