kvcache-ai-ktransformers/ktransformers/optimize/optimize_rules/Mixtral.yaml
Atream 5ec33d046d optimize gguf dequant, save mem, support Q2_K
use marlin for lm_head, lm_head only calc last token for prefill
extend context window to 19K for DeepSeek-V3/R1 within 24GB VRAM
2025-02-22 06:13:01 +00:00

59 lines
No EOL
1.7 KiB
YAML

- match:
class: ktransformers.models.modeling_mixtral.MixtralRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.RotaryEmbedding
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- match:
name: "^model\\.layers\\..*$"
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
- match:
name: "^lm_head"
class: torch.nn.Linear
replace:
class: ktransformers.operators.linear.KTransformersLinear
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
generate_op: "KLinearMarlin"
prefill_op: "KLinearTorch"
- match:
name: "^model\\.layers\\..*\\.block_sparse_moe$"
class: ktransformers.models.modeling_mixtral.MixtralSparseMoeBlock
replace:
class: ktransformers.operators.experts.KMistralSparseMoEBlock
- match:
name: "^model\\.layers\\..*\\.block_sparse_moe\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersExperts
kwargs:
prefill_device: "cuda"
prefill_op: "KExpertsTorch"
generate_device: "cpu"
generate_op: "KExpertsCPU"
out_device: "cuda"
recursive: False # don't recursively inject submodules of this module
- match:
name: "^model.embed_tokens"
replace:
class: "default"
kwargs:
generate_device: "cpu"
prefill_device: "cpu"
- match:
name: "^model\\.layers\\..*\\."
replace:
class: "default"
kwargs:
generate_device: "cuda"
prefill_device: "cuda"