optimize gguf dequant, save mem, support Q2_K

use marlin for lm_head, lm_head only calc last token for prefill
extend context window to 19K for DeepSeek-V3/R1 within 24GB VRAM
This commit is contained in:
Atream 2025-02-22 06:13:01 +00:00
parent 7e1fe256c8
commit 5ec33d046d
27 changed files with 435 additions and 259 deletions

View file

@ -79,7 +79,7 @@ def load_cur_state_dict(module: nn.Module, gguf_loader: GGUFLoader, prefix: str
raise Exception(f"can't find {translated_key} in GGUF file!")
def load_weights(module:nn.Module, gguf_loader:GGUFLoader, prefix=''):
# print(f"recursively loading weights {prefix},{return_when_injected=}, {only_load_injected=}")
#print(f"recursively loading weights {prefix}")
if not isinstance(module, base_operator.BaseInjectedModule):
load_cur_state_dict(module, gguf_loader, prefix)
for name, child in module._modules.items():