optimize gguf dequant, save mem, support Q2_K

Use Marlin for lm_head; lm_head computes only the last token during prefill. Extend the context window to 19K for DeepSeek-V3/R1 within 24GB VRAM.
2025-09-10 15:29:39 +00:00 · 2025-02-22 06:13:01 +00:00 · 2025-02-22 06:13:01 +00:00 · 5ec33d046d
commit 5ec33d046d
parent 7e1fe256c8
27 changed files with 435 additions and 259 deletions
--- a/ktransformers/util/utils.py
+++ b/ktransformers/util/utils.py
@@ -79,7 +79,7 @@ def load_cur_state_dict(module: nn.Module, gguf_loader: GGUFLoader, prefix: str
            raise Exception(f"can't find {translated_key} in GGUF file!")
        
 def load_weights(module:nn.Module, gguf_loader:GGUFLoader, prefix=''):
-    # print(f"recursively loading weights {prefix},{return_when_injected=}, {only_load_injected=}")
+    #print(f"recursively loading weights {prefix}")
    if not isinstance(module, base_operator.BaseInjectedModule):
        load_cur_state_dict(module, gguf_loader, prefix)
        for name, child in module._modules.items():