[feature] experts can be injected using CPUInfer

[fix] fix ktransformers interface when use new CUDAGraphRunner [fix] fix YAML and optimize logic, the top rule has the highest priority
2025-09-10 06:14:58 +00:00 · 2024-08-14 16:10:54 +08:00 · 2024-08-14 16:10:54 +08:00 · 412055d450
commit 412055d450
parent 80815dbc50
13 changed files with 318 additions and 158 deletions
--- a/ktransformers/util/utils.py
+++ b/ktransformers/util/utils.py
@ -89,7 +89,7 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    torch._dynamo.config.suppress_errors = True
    batch_size, seq_length = inputs.shape
-    device_map = model.config.gguf_loader.tensor_device_map
+    device_map = model.gguf_loader.tensor_device_map
    torch_device = get_device('blk.0.self_attn', device_map)
    torch_device = "cuda:0" if torch_device == "cuda" else torch_device
    inputs = inputs.to(torch_device)