[feature] experts can be injected using CPUInfer

[fix] fix ktransformers interface when using the new CUDAGraphRunner [fix] fix YAML and optimize logic; the top rule has the highest priority
2026-04-28 11:49:51 +00:00 · 2024-08-14 16:10:54 +08:00 · 2024-08-14 16:10:54 +08:00 · 412055d450
commit 412055d450
parent 80815dbc50
13 changed files with 318 additions and 158 deletions
--- a/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct-multi-gpu.yaml
+++ b/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct-multi-gpu.yaml
@@ -1,10 +1,3 @@
-- match:
-    name: "^model\\.layers\\.([012])\\."
-  replace:
-    class: "default"
-    kwargs:
-      generate_device: "cuda:0"
-      prefill_device: "cuda:0"
 - match:
    name: "^model\\.layers\\.([012])\\."
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
@@ -41,13 +34,6 @@
      out_device: "cuda:0"
  recursive: False # don't recursively inject submodules of this module

-- match:
-    name: "^model\\.layers\\.([12][0-9]|[3-9])\\."
-  replace:
-    class: "default"
-    kwargs:
-      generate_device: "cuda:1"
-      prefill_device: "cuda:1"
 - match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])\\."
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
@@ -109,3 +95,18 @@
      transfer_map: 
        3: "cuda:1"

+- match:
+    name: "^model\\.layers\\.([012])\\."
+  replace:
+    class: "default"
+    kwargs:
+      generate_device: "cuda:0"
+      prefill_device: "cuda:0"
+
+- match:
+    name: "^model\\.layers\\.([12][0-9]|[3-9])\\."
+  replace:
+    class: "default"
+    kwargs:
+      generate_device: "cuda:1"
+      prefill_device: "cuda:1"