Mirror of https://github.com/kvcache-ai/ktransformers.git (synced 2025-09-10 15:29:39 +00:00)
optimize gguf dequant, save mem, support Q2_K
use marlin for lm_head; lm_head only computes the last token during prefill; extend the context window to 19K for DeepSeek-V3/R1 within 24GB VRAM
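The second half of the message bundles two ideas: route the lm_head matmul through a marlin (quantized GEMM) kernel, and skip computing logits for all but the final prefill position. A minimal sketch of the latter, assuming hidden states shaped [batch, seq_len, hidden_size]; the function name is a hypothetical illustration, not the repo's actual API:

```python
import torch

def lm_head_last_token(hidden_states: torch.Tensor,
                       lm_head: torch.nn.Module) -> torch.Tensor:
    """During prefill, only the final position's logits are needed to
    sample the next token, so slice before the expensive projection.

    hidden_states: [batch, seq_len, hidden_size]
    returns:       [batch, 1, vocab_size]
    """
    last_hidden = hidden_states[:, -1:, :]  # keep only the last token
    return lm_head(last_hidden)             # vocab projection on one row
```

With DeepSeek-V3's vocabulary of roughly 129k entries, this avoids materializing a [seq_len, vocab_size] logits tensor during prefill, which is presumably part of how the 19K context fits within 24GB of VRAM.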
This commit is contained in:
parent 7e1fe256c8
commit 5ec33d046d

27 changed files with 435 additions and 259 deletions
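For the Q2_K support named in the title, here is a minimal NumPy sketch of dequantizing one super-block, assuming the standard ggml Q2_K layout (per 256 weights: 16 bytes of packed 4-bit scales/mins, 64 bytes of 2-bit quants, then fp16 d and dmin — 84 bytes total). It mirrors ggml's reference dequantizer, not ktransformers' actual optimized path:

```python
import numpy as np

QK_K = 256  # elements per Q2_K super-block

def dequantize_q2_k(block: bytes) -> np.ndarray:
    """Dequantize one 84-byte Q2_K block into 256 float32 values."""
    scales = np.frombuffer(block, dtype=np.uint8, count=16, offset=0)
    qs     = np.frombuffer(block, dtype=np.uint8, count=64, offset=16)
    d, dmin = np.frombuffer(block, dtype=np.float16,
                            count=2, offset=80).astype(np.float32)

    out = np.empty(QK_K, dtype=np.float32)
    y, i = 0, 0
    for half in range(2):            # two 128-element halves
        q = qs[half * 32 : half * 32 + 32]
        for shift in (0, 2, 4, 6):   # four 2-bit planes per half
            for lo, hi in ((0, 16), (16, 32)):
                sc = scales[i]; i += 1
                dl = d * (sc & 0xF)        # 4-bit sub-block scale
                ml = dmin * (sc >> 4)      # 4-bit sub-block min
                out[y:y + 16] = dl * ((q[lo:hi] >> shift) & 3) - ml
                y += 16
    return out
```

Each 16-element sub-block is reconstructed as d·scale·q − dmin·min, so only two fp16 values plus one scale byte per sub-block survive quantization — hence the memory savings the title mentions.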
```diff
@@ -79,7 +79,7 @@ def load_cur_state_dict(module: nn.Module, gguf_loader: GGUFLoader, prefix: str
         raise Exception(f"can't find {translated_key} in GGUF file!")
 
 def load_weights(module:nn.Module, gguf_loader:GGUFLoader, prefix=''):
-    # print(f"recursively loading weights {prefix},{return_when_injected=}, {only_load_injected=}")
+    #print(f"recursively loading weights {prefix}")
     if not isinstance(module, base_operator.BaseInjectedModule):
         load_cur_state_dict(module, gguf_loader, prefix)
     for name, child in module._modules.items():
```
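The hunk is cut off inside the recursion. A sketch of how the traversal plausibly completes, reusing the names shown in the diff; the child-prefix convention (prefix + name + ".") and the None guard are assumptions, not taken from the repo:

```python
def load_weights(module: nn.Module, gguf_loader: GGUFLoader, prefix=''):
    # Injected modules (BaseInjectedModule) manage their own loading,
    # so only plain nn.Modules pull tensors straight from the GGUF file.
    if not isinstance(module, base_operator.BaseInjectedModule):
        load_cur_state_dict(module, gguf_loader, prefix)
    # Recurse into children, extending the dotted key prefix so GGUF
    # tensor names like "model.layers.0...." resolve at each level.
    for name, child in module._modules.items():
        if child is not None:  # PyTorch _modules may contain None
            load_weights(child, gguf_loader, prefix + name + ".")
```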