use compile for gate, slight performance improvement

This commit is contained in:
Atream 2025-03-14 12:43:28 +00:00
parent 6c4ed59175
commit a889288fc1
9 changed files with 155 additions and 37 deletions

View file

@@ -477,7 +477,6 @@ class KTransformersLinear(BaseInjectedModule, KLinearBase):
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
# device: str = "cuda",
generate_device: str = "cuda",
generate_op: str| None = "KLinearMarlin",
prefill_device: str = "cuda",