From aea4243712168abd47f46d9749f0430248ef0bb5 Mon Sep 17 00:00:00 2001
From: MorphisZhang
Date: Thu, 13 Feb 2025 16:32:28 +0800
Subject: [PATCH] Add optimization config for Deepseek V3/R1 with 4 GPUs

---
 .../DeepSeek-V3-Chat-multi-gpu-4.yaml         | 331 ++++++++++++++++++
 1 file changed, 331 insertions(+)
 create mode 100644 ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml

diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml
new file mode 100644
index 0000000..572f9e5
--- /dev/null
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml
@@ -0,0 +1,331 @@
+# Optimization rules for DeepSeek V3/R1 split across 4 GPUs.
+# Layer placement: 0–14 -> cuda:0, 15–29 -> cuda:1, 30–44 -> cuda:2,
+# 45–60 (plus model.norm and lm_head) -> cuda:3; embed_tokens stays on cpu.
+# mlp.experts modules use cpu for generation and the layer's GPU for prefill.
+
+- match:
+    name: "^model\\.embed_tokens"
+  replace:
+    class: "default"
+    kwargs:
+      generate_device: "cpu"
+      prefill_device: "cpu"
+
+# === Rotary Embedding Replacement ===
+
+# GPU 0: layers 0–14
+- match:
+    name: "^model\\.layers\\.([0-9]|1[0-4])\\."
+    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
+  replace:
+    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
+    kwargs:
+      generate_device: "cuda:0"
+      prefill_device: "cuda:0"
+
+# GPU 1: layers 15–29
+- match:
+    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\."
+    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
+  replace:
+    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
+    kwargs:
+      generate_device: "cuda:1"
+      prefill_device: "cuda:1"
+
+# GPU 2: layers 30–44
+- match:
+    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\."
+    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
+  replace:
+    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
+    kwargs:
+      generate_device: "cuda:2"
+      prefill_device: "cuda:2"
+
+# GPU 3: layers 45–60
+- match:
+    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\."
+    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
+  replace:
+    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
+    kwargs:
+      generate_device: "cuda:3"
+      prefill_device: "cuda:3"
+
+# === Linear Layers Replacement (excluding self_attn.kv_b_proj) ===
+
+# GPU 0: layers 0–14
+- match:
+    name: "^model\\.layers\\.([0-9]|1[0-4])\\.(?!self_attn\\.kv_b_proj).*$"
+    class: torch.nn.Linear
+  replace:
+    class: ktransformers.operators.linear.KTransformersLinear
+    kwargs:
+      generate_device: "cuda:0"
+      prefill_device: "cuda:0"
+      generate_op: "KLinearMarlin"
+      prefill_op: "KLinearTorch"
+
+# GPU 1: layers 15–29
+- match:
+    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.(?!self_attn\\.kv_b_proj).*$"
+    class: torch.nn.Linear
+  replace:
+    class: ktransformers.operators.linear.KTransformersLinear
+    kwargs:
+      generate_device: "cuda:1"
+      prefill_device: "cuda:1"
+      generate_op: "KLinearMarlin"
+      prefill_op: "KLinearTorch"
+
+# GPU 2: layers 30–44
+- match:
+    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.(?!self_attn\\.kv_b_proj).*$"
+    class: torch.nn.Linear
+  replace:
+    class: ktransformers.operators.linear.KTransformersLinear
+    kwargs:
+      generate_device: "cuda:2"
+      prefill_device: "cuda:2"
+      generate_op: "KLinearMarlin"
+      prefill_op: "KLinearTorch"
+
+# GPU 3: layers 45–60
+- match:
+    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.(?!self_attn\\.kv_b_proj).*$"
+    class: torch.nn.Linear
+  replace:
+    class: ktransformers.operators.linear.KTransformersLinear
+    kwargs:
+      generate_device: "cuda:3"
+      prefill_device: "cuda:3"
+      generate_op: "KLinearMarlin"
+      prefill_op: "KLinearTorch"
+
+# === MLP (MoE) Replacement ===
+
+# GPU 0: layers 0–14
+- match:
+    name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp$"
+    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
+  replace:
+    class: ktransformers.operators.experts.KDeepseekV3MoE
+    kwargs:
+      generate_device: "cuda:0"
+      prefill_device: "cuda:0"
+
+# GPU 1: layers 15–29
+- match:
+    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp$"
+    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
+  replace:
+    class: ktransformers.operators.experts.KDeepseekV3MoE
+    kwargs:
+      generate_device: "cuda:1"
+      prefill_device: "cuda:1"
+
+# GPU 2: layers 30–44
+- match:
+    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp$"
+    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
+  replace:
+    class: ktransformers.operators.experts.KDeepseekV3MoE
+    kwargs:
+      generate_device: "cuda:2"
+      prefill_device: "cuda:2"
+
+# GPU 3: layers 45–60
+- match:
+    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp$"
+    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
+  replace:
+    class: ktransformers.operators.experts.KDeepseekV3MoE
+    kwargs:
+      generate_device: "cuda:3"
+      prefill_device: "cuda:3"
+
+# === MLP Gate Replacement ===
+
+# GPU 0: layers 0–14
+- match:
+    name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp\\.gate$"
+    class: ktransformers.models.modeling_deepseek_v3.MoEGate
+  replace:
+    class: ktransformers.operators.gate.KMoEGate
+    kwargs:
+      generate_device: "cuda:0"
+      prefill_device: "cuda:0"
+
+# GPU 1: layers 15–29
+- match:
+    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp\\.gate$"
+    class: ktransformers.models.modeling_deepseek_v3.MoEGate
+  replace:
+    class: ktransformers.operators.gate.KMoEGate
+    kwargs:
+      generate_device: "cuda:1"
+      prefill_device: "cuda:1"
+
+# GPU 2: layers 30–44
+- match:
+    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp\\.gate$"
+    class: ktransformers.models.modeling_deepseek_v3.MoEGate
+  replace:
+    class: ktransformers.operators.gate.KMoEGate
+    kwargs:
+      generate_device: "cuda:2"
+      prefill_device: "cuda:2"
+
+# GPU 3: layers 45–60
+- match:
+    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp\\.gate$"
+    class: ktransformers.models.modeling_deepseek_v3.MoEGate
+  replace:
+    class: ktransformers.operators.gate.KMoEGate
+    kwargs:
+      generate_device: "cuda:3"
+      prefill_device: "cuda:3"
+
+# === MLP Experts Replacement ===
+
+# GPU 0: layers 0–14
+- match:
+    name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp\\.experts$"
+  replace:
+    class: ktransformers.operators.experts.KTransformersExperts
+    kwargs:
+      prefill_device: "cuda:0"
+      prefill_op: "KExpertsTorch"
+      generate_device: "cpu"
+      generate_op: "KExpertsCPU"
+      out_device: "cuda:0"
+  recursive: false
+
+# GPU 1: layers 15–29
+- match:
+    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp\\.experts$"
+  replace:
+    class: ktransformers.operators.experts.KTransformersExperts
+    kwargs:
+      prefill_device: "cuda:1"
+      prefill_op: "KExpertsTorch"
+      generate_device: "cpu"
+      generate_op: "KExpertsCPU"
+      out_device: "cuda:1"
+  recursive: false
+
+# GPU 2: layers 30–44
+- match:
+    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp\\.experts$"
+  replace:
+    class: ktransformers.operators.experts.KTransformersExperts
+    kwargs:
+      prefill_device: "cuda:2"
+      prefill_op: "KExpertsTorch"
+      generate_device: "cpu"
+      generate_op: "KExpertsCPU"
+      out_device: "cuda:2"
+  recursive: false
+
+# GPU 3: layers 45–60
+- match:
+    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp\\.experts$"
+  replace:
+    class: ktransformers.operators.experts.KTransformersExperts
+    kwargs:
+      prefill_device: "cuda:3"
+      prefill_op: "KExpertsTorch"
+      generate_device: "cpu"
+      generate_op: "KExpertsCPU"
+      out_device: "cuda:3"
+  recursive: false
+
+# === Self-Attention Replacement ===
+
+# GPU 0: layers 0–14
+- match:
+    name: "^model\\.layers\\.([0-9]|1[0-4])\\.self_attn$"
+  replace:
+    class: ktransformers.operators.attention.KDeepseekV2Attention
+    kwargs:
+      generate_device: "cuda:0"
+      prefill_device: "cuda:0"
+
+# GPU 1: layers 15–29
+- match:
+    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.self_attn$"
+  replace:
+    class: ktransformers.operators.attention.KDeepseekV2Attention
+    kwargs:
+      generate_device: "cuda:1"
+      prefill_device: "cuda:1"
+
+# GPU 2: layers 30–44
+- match:
+    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.self_attn$"
+  replace:
+    class: ktransformers.operators.attention.KDeepseekV2Attention
+    kwargs:
+      generate_device: "cuda:2"
+      prefill_device: "cuda:2"
+
+# GPU 3: layers 45–60
+- match:
+    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.self_attn$"
+  replace:
+    class: ktransformers.operators.attention.KDeepseekV2Attention
+    kwargs:
+      generate_device: "cuda:3"
+      prefill_device: "cuda:3"
+
+# === Overall Model Replacement with Transfer Map ===
+
+- match:
+    name: "^model$"
+  replace:
+    class: "ktransformers.operators.models.KDeepseekV2Model"
+    kwargs:
+      per_layer_prefill_intput_threshold: 0  # 0 disables layer-wise prefill
+      transfer_map:
+        15: "cuda:1"  # Layers 15–29 on GPU 1
+        30: "cuda:2"  # Layers 30–44 on GPU 2
+        45: "cuda:3"  # Layers 45–60 on GPU 3
+
+# === Default Catch-All for Other Modules ===
+
+# GPU 0: layers 0–14
+- match:
+    name: "^model\\.layers\\.([0-9]|1[0-4])\\."
+  replace:
+    class: "default"
+    kwargs:
+      generate_device: "cuda:0"
+      prefill_device: "cuda:0"
+
+# GPU 1: layers 15–29
+- match:
+    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\."
+  replace:
+    class: "default"
+    kwargs:
+      generate_device: "cuda:1"
+      prefill_device: "cuda:1"
+
+# GPU 2: layers 30–44
+- match:
+    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\."
+  replace:
+    class: "default"
+    kwargs:
+      generate_device: "cuda:2"
+      prefill_device: "cuda:2"
+
+# GPU 3: layers 45–60, plus the final modules model.norm and lm_head
+- match:
+    name: "(^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.)|(^model\\.norm)|(^lm_head)"
+  replace:
+    class: "default"
+    kwargs:
+      generate_device: "cuda:3"
+      prefill_device: "cuda:3"