Support Windows. Support q4_0 and q5_0 dequantization on CPU. Add the copyright notice from pygguf (it was added before, but disappeared after a merge). Add some TODOs in the code.

2025-09-10 15:29:39 +00:00 · 2024-08-07 12:19:06 +08:00 · 2024-08-07 12:19:06 +08:00 · 0a2fd52cea
commit 0a2fd52cea
parent 442e13bc97
32 changed files with 248 additions and 108 deletions
--- a/ktransformers/models/custom_cache.py
+++ b/ktransformers/models/custom_cache.py
@ -46,6 +46,7 @@ class StaticCache(transformers.StaticCache):
        self.value_cache: List[torch.Tensor] = []
        cache_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim)
        if config.architectures[0] == "DeepseekV2ForCausalLM":
+            # TODO: for deepseek, cache_shape is different whether using Absorbed MLA, check it automatically
            # key_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.qk_rope_head_dim + config.qk_nope_head_dim)
            # value_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.v_head_dim)
            key_shape = (max_batch_size, 1, self.max_cache_len, config.qk_rope_head_dim)