---
# Module-injection rules: each entry selects modules via `match` (by module
# name regex and/or class path) and swaps them for the class given in
# `replace`, passing `kwargs` to the replacement. Every rule below targets
# the "xpu" device unless stated otherwise.

# Rotary position embedding.
- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"

# Linear layers inside the decoder layers.
- match:
    # Regular expression applied to the module name.
    name: "^model\\.layers\\..*"
    # A module is matched only if name AND class both match.
    class: torch.nn.Linear
  replace:
    # Optimized kernel on quantized data types.
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
      generate_op: "KLinearIPEXLLM"
      prefill_op: "KLinearIPEXLLM"

# MoE block of each decoder layer.
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    # MLP module with a custom forward function.
    class: ktransformers.operators.experts.KDeepseekV2MoE
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"

# Experts container: prefill on "xpu", generation on "cpu", output on "xpu".
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    # Custom MoE kernel with expert parallelism.
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "xpu"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "xpu"
  # Don't recursively inject submodules of this module.
  # NOTE(review): placed at rule level (sibling of match/replace) when
  # restoring the collapsed layout -- confirm against the rule loader.
  recursive: false

# RMSNorm layers.
- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV2RMSNorm
  replace:
    class: ktransformers.operators.layernorm.KDeepseekRMSNormIPEXLLM
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"

# Self-attention of each decoder layer.
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    # Optimized MLA implementation.
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"

# Top-level model wrapper.
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      # 0 turns layer-wise prefill off. The key spelling ("intput") is
      # kept as-is because it is the name read at runtime.
      per_layer_prefill_intput_threshold: 0
      device: "xpu"

# Token embeddings stay on the CPU; "default" is passed through unchanged
# as the replacement class value.
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"