mirror of
https://github.com/kvcache-ai/ktransformers.git
synced 2026-05-01 21:21:12 +00:00
[ADD] support multi-gpu qlen>1 q5_k
This commit is contained in:
parent
f293803156
commit
f5f79f5c0e
63 changed files with 3271 additions and 1285 deletions
|
|
@ -1,3 +1,10 @@
|
|||
- match:
|
||||
name: "^model\\.layers\\..*\\."
|
||||
replace:
|
||||
class: "default"
|
||||
kwargs:
|
||||
generate_device: "cuda"
|
||||
prefill_device: "cuda"
|
||||
- match:
|
||||
class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
|
||||
replace:
|
||||
|
|
@ -21,7 +28,7 @@
|
|||
name: "^model\\.layers\\..*\\.mlp\\.experts$"
|
||||
replace:
|
||||
class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE kernel with expert parallelism
|
||||
device: "cpu" # which device to load this module on when initializing
|
||||
# device: "cpu" # which device to load this module on when initializing
|
||||
kwargs:
|
||||
prefill_device: "cuda"
|
||||
prefill_mlp_type: "MLPExpertsTorch"
|
||||
|
|
@ -32,6 +39,13 @@
|
|||
- match:
|
||||
name: "^model$"
|
||||
replace:
|
||||
class: "ktransformers.operators.layer_wise_prefill.Qwen2MoeModelPerLayerPrefill"
|
||||
class: "ktransformers.operators.layer_wise_prefill.Qwen2MoeModelKTransformers"
|
||||
kwargs:
|
||||
per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
|
||||
- match:
|
||||
name: "^model.embed_tokens"
|
||||
replace:
|
||||
class: "default"
|
||||
kwargs:
|
||||
generate_device: "cpu"
|
||||
prefill_device: "cpu"
|
||||
Loading…
Add table
Add a link
Reference in a new issue