[feature] experts can be injected using CPUInfer

[fix] fix ktransformers interface when using the new CUDAGraphRunner
[fix] fix YAML and optimize logic; the top rule has the highest priority
This commit is contained in:
Atream 2024-08-14 16:10:54 +08:00
parent 80815dbc50
commit 412055d450
13 changed files with 318 additions and 158 deletions

View file

@@ -89,7 +89,7 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
os.environ["TOKENIZERS_PARALLELISM"] = "false"
torch._dynamo.config.suppress_errors = True
batch_size, seq_length = inputs.shape
-device_map = model.config.gguf_loader.tensor_device_map
+device_map = model.gguf_loader.tensor_device_map
torch_device = get_device('blk.0.self_attn', device_map)
torch_device = "cuda:0" if torch_device == "cuda" else torch_device
inputs = inputs.to(torch_device)