support smt and glm4

This commit is contained in:
djw 2025-07-25 12:48:51 +00:00
parent 712ad1fa3c
commit 48bc6185b5
9 changed files with 65 additions and 74 deletions

View file

@ -60,7 +60,7 @@
# Rule entry: `match.name` is a regex over module names; after YAML unescaping
# it reads ^model\.layers\..*\.self_attn$, i.e. names that start with
# "model.layers." and end with ".self_attn". `replace` names the class and the
# kwargs to use for the matched modules.
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    # Exactly one `class` key is allowed in a mapping. KGlm4MoeAttention
    # supersedes the earlier KSmallthinkerAttention value for this rule.
    # NOTE(review): the trailing "optimized MLA implementation" remark is
    # carried over unchanged -- confirm it still describes this class.
    class: ktransformers.operators.balance_serve_attention.KGlm4MoeAttention  # optimized MLA implementation
    kwargs:
      # Both the generate and prefill stages are placed on the "cuda" device.
      generate_device: "cuda"
      prefill_device: "cuda"