- match: name: "^model.embed_tokens" replace: class: "default" kwargs: generate_device: "cpu" prefill_device: "cpu" - match: name: "^model\\.layers\\.([0-9])\\." class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding replace: class: ktransformers.operators.RoPE.YarnRotaryEmbedding kwargs: generate_device: "cuda:0" prefill_device: "cuda:0" - match: name: "^model\\.layers\\.([1][0-9])\\." class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding replace: class: ktransformers.operators.RoPE.YarnRotaryEmbedding kwargs: generate_device: "cuda:1" prefill_device: "cuda:1" - match: name: "^model\\.layers\\.([2][0-9])\\." class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding replace: class: ktransformers.operators.RoPE.YarnRotaryEmbedding kwargs: generate_device: "cuda:2" prefill_device: "cuda:2" - match: name: "^model\\.layers\\.([345][0-9])\\." class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding replace: class: ktransformers.operators.RoPE.YarnRotaryEmbedding kwargs: generate_device: "cuda:3" prefill_device: "cuda:3" - match: name: "^model\\.layers\\.([0-9])\\.(?!self_attn).*$" # regular expression class: torch.nn.Linear # only match modules matching name and class simultaneously replace: class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types kwargs: generate_device: "cuda:0" prefill_device: "cuda:0" generate_op: "QuantizedLinearMarlin" prefill_op: "QuantizedLinearTorch" - match: name: "^model\\.layers\\.([1][0-9])\\.(?!self_attn).*$" # regular expression class: torch.nn.Linear # only match modules matching name and class simultaneously replace: class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types kwargs: generate_device: "cuda:1" prefill_device: "cuda:1" generate_op: "QuantizedLinearMarlin" prefill_op: "QuantizedLinearTorch" - match: name: "^model\\.layers\\.([2][0-9])\\.(?!self_attn).*$" # regular expression class: torch.nn.Linear # only match modules matching name and class simultaneously replace: class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types kwargs: generate_device: "cuda:2" prefill_device: "cuda:2" generate_op: "QuantizedLinearMarlin" prefill_op: "QuantizedLinearTorch" - match: name: "^model\\.layers\\.([345][0-9])\\.(?!self_attn).*$" # regular expression class: torch.nn.Linear # only match modules matching name and class simultaneously replace: class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types kwargs: generate_device: "cuda:3" prefill_device: "cuda:3" generate_op: "QuantizedLinearMarlin" prefill_op: "QuantizedLinearTorch" - match: name: "^model\\.layers\\.([0-9])\\.mlp$" class: ktransformers.models.modeling_deepseek.DeepseekV2MoE replace: class: ktransformers.operators.experts.DeepseekV2MoEInjected # mlp module with custom forward function kwargs: generate_device: "cuda:0" prefill_device: "cuda:0" - match: name: "^model\\.layers\\.([1][0-9])\\.mlp$" class: ktransformers.models.modeling_deepseek.DeepseekV2MoE replace: class: ktransformers.operators.experts.DeepseekV2MoEInjected # mlp module with custom forward function kwargs: generate_device: "cuda:1" prefill_device: "cuda:1" - match: name: "^model\\.layers\\.([2][0-9])\\.mlp$" class: ktransformers.models.modeling_deepseek.DeepseekV2MoE replace: class: ktransformers.operators.experts.DeepseekV2MoEInjected # mlp module with custom forward function kwargs: 
generate_device: "cuda:2" prefill_device: "cuda:2" - match: name: "^model\\.layers\\.([345][0-9])\\.mlp$" class: ktransformers.models.modeling_deepseek.DeepseekV2MoE replace: class: ktransformers.operators.experts.DeepseekV2MoEInjected # mlp module with custom forward function kwargs: generate_device: "cuda:3" prefill_device: "cuda:3" - match: name: "^model\\.layers\\.([0-9])\\.mlp\\.experts$" replace: class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert paralleism kwargs: prefill_device: "cuda:0" prefill_mlp_type: "MLPExpertsTorch" generate_device: "cpu" generate_mlp_type: "MLPCPUExperts" out_device: "cuda:0" recursive: False # don't recursively inject submodules of this module - match: name: "^model\\.layers\\.([1][0-9])\\.mlp\\.experts$" replace: class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert paralleism kwargs: prefill_device: "cuda:1" prefill_mlp_type: "MLPExpertsTorch" generate_device: "cpu" generate_mlp_type: "MLPCPUExperts" out_device: "cuda:1" recursive: False # don't recursively inject submodules of this module - match: name: "^model\\.layers\\.([2][0-9])\\.mlp\\.experts$" replace: class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert paralleism kwargs: prefill_device: "cuda:2" prefill_mlp_type: "MLPExpertsTorch" generate_device: "cpu" generate_mlp_type: "MLPCPUExperts" out_device: "cuda:2" recursive: False # don't recursively inject submodules of this module - match: name: "^model\\.layers\\.([345][0-9])\\.mlp\\.experts$" replace: class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE Kernel with expert paralleism kwargs: prefill_device: "cuda:3" prefill_mlp_type: "MLPExpertsTorch" generate_device: "cpu" generate_mlp_type: "MLPCPUExperts" out_device: "cuda:3" recursive: False # don't recursively inject submodules of this module - match: name: "^model\\.layers\\.([0-9])\\.self_attn$" replace: class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation kwargs: generate_device: "cuda:0" prefill_device: "cuda:0" - match: name: "^model\\.layers\\.([1][0-9])\\.self_attn$" replace: class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation kwargs: generate_device: "cuda:1" prefill_device: "cuda:1" - match: name: "^model\\.layers\\.([2][0-9])\\.self_attn$" replace: class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation kwargs: generate_device: "cuda:2" prefill_device: "cuda:2" - match: name: "^model\\.layers\\.([345][0-9])\\.self_attn$" replace: class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation kwargs: generate_device: "cuda:3" prefill_device: "cuda:3" - match: name: "^model$" replace: class: "ktransformers.operators.layer_wise_prefill.DeepseekV2ModelKTransformers" kwargs: per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill transfer_map: 10: "cuda:1" 20: "cuda:2" 30: "cuda:3" - match: name: "^model\\.layers\\.([0-9])\\." 
replace: class: "default" kwargs: generate_device: "cuda:0" prefill_device: "cuda:0" - match: name: "(^model\\.layers\\.([1][0-9])\\.)" replace: class: "default" kwargs: generate_device: "cuda:1" prefill_device: "cuda:1" - match: name: "(^model\\.layers\\.([2][0-9])\\.)" replace: class: "default" kwargs: generate_device: "cuda:2" prefill_device: "cuda:2" - match: name: "(^model\\.layers\\.([345][0-9])\\.)|(^model.norm)|(^lm_head)" replace: class: "default" kwargs: generate_device: "cuda:3" prefill_device: "cuda:3"