Mirror of https://github.com/kvcache-ai/ktransformers.git (synced 2025-09-13 08:39:42 +00:00)
fix KExpertsMarlin on GPU without CUDA Graph
parent f5f6c6b95d
commit f327695079
2 changed files with 13 additions and 0 deletions
@@ -53,6 +53,17 @@
       generate_op: "KExpertsCPU"
       out_device: "cuda"
   recursive: False # don't recursively inject submodules of this module
+# if want to use more VRAM, use experts Marlin and disable CUDA Graph(disable CUDA Graph may cause low performance)
+#- match:
+#    name: "^model\\.layers\\..*\\.mlp\\.experts$"
+#  replace:
+#    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
+#    kwargs:
+#      prefill_device: "cuda"
+#      prefill_op: "KExpertsTorch"
+#      generate_device: "cuda"
+#      generate_op: "KExpertsMarlin"
+#  recursive: False # don't recursively inject submodules of this module
 - match:
     name: "^model\\.layers\\..*\\.self_attn$"
   replace:
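For reference, the "name" fields in these rules appear to be Python regular expressions matched against module paths in the model. Below is a minimal, self-contained sketch (illustrative only, not part of the commit; the module names are assumed examples, and the real matching is done by ktransformers' injection machinery) of which modules the commented-out Marlin rule's pattern would select:

import re

# Pattern copied from the commented-out rule above.
pattern = re.compile(r"^model\.layers\..*\.mlp\.experts$")

candidates = [
    "model.layers.0.mlp.experts",   # selected: would be replaced by KTransformersExperts
    "model.layers.27.mlp.experts",  # selected
    "model.layers.0.self_attn",     # not selected: covered by the self_attn rule instead
]

for name in candidates:
    print(f"{name}: {'match' if pattern.match(name) else 'no match'}")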
@@ -310,6 +310,8 @@ class GGUFLoader:
         values = GGML_DEQUANTIZE[ggml_name](data)
         values = torch.from_numpy(values.copy())
 
+        if ggml_name == "BF16":
+            values = values.view(torch.bfloat16)
         values = values.view(shape[-2::-1])
 
         return values
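The two added lines give BF16 tensors a special path: NumPy has no native bfloat16 dtype, so the dequantized buffer arrives as raw 16-bit data and has to be reinterpreted as bfloat16 on the torch side before the final reshape. A minimal sketch of that bit-reinterpretation (the example values and the int16 carrier dtype are assumptions for illustration, not taken from the loader):

import numpy as np
import torch

# bfloat16 bit patterns for 1.0 (0x3F80) and 2.0 (0x4000), carried in int16 words
# because NumPy cannot represent bfloat16 directly.
raw = np.array([0x3F80, 0x4000], dtype=np.int16)

t = torch.from_numpy(raw.copy())   # 16-bit integer tensor holding the raw bits
bf16 = t.view(torch.bfloat16)      # reinterpret the same bytes as bfloat16 (no copy)
print(bf16)                        # tensor([1., 2.], dtype=torch.bfloat16)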