Support Windows. Support q4_0 and q5_0 dequant on CPU. Add the copyright notice from pygguf (it was added before, but disappeared after a merge). Add some TODOs in the code.

2025-09-10 23:34:35 +00:00 · 2024-08-07 12:19:06 +08:00 · 2024-08-07 12:19:06 +08:00 · 0a2fd52cea
commit 0a2fd52cea
parent 442e13bc97
32 changed files with 248 additions and 108 deletions
--- a/ktransformers/util/utils.py
+++ b/ktransformers/util/utils.py
@@ -79,13 +79,15 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
        logits = cuda_graph_runner(cur_token, position_ids, cache_position)
        past_key_values.change_seq_length(1)
        """
+        inputs_embeds = model.model.embed_tokens(cur_token.to("cpu")).to("cuda")
+        custom_stream = torch.cuda.Stream()
        with torch.cuda.stream(custom_stream):
-            logits=model(cur_token, 
-                         position_ids=position_ids,
-                         cache_position=cache_position,
-                         past_key_values=past_key_values,
-                         return_dict=False, use_cache=True)[0]
-        #"""            
+            logits=model(inputs_embeds = inputs_embeds, 
+                         position_ids = position_ids,
+                         cache_position = cache_position,
+                         past_key_values = past_key_values,
+                         return_dict = False, use_cache = True) [0]
+        """            
        torch.cuda.synchronize()
        #print(logits)
        next_token_scores = logits_warper(inputs, logits[:, -1, :])
@@ -108,7 +110,6 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
        generated_ids[:, cache_position] = inputs.to(torch_device).to(torch.int)
        past_key_values.cur_idx=cache_position
        start_time = time.time()
-        #custom_stream = torch.cuda.Stream()

        inputs_embeds = model.model.embed_tokens(inputs.to("cpu")).to("cuda")
        logits = model(