mirror of
https://github.com/kvcache-ai/ktransformers.git
synced 2025-09-10 23:34:35 +00:00
Support Windows. Support q4_0 and q5_0 dequantization on CPU. Add the copyright notice from pygguf (it was added before, but disappeared after a merge). Add some TODOs in the code.
This commit is contained in:
parent
442e13bc97
commit
0a2fd52cea
32 changed files with 248 additions and 108 deletions
|
@ -79,13 +79,15 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
|
|||
logits = cuda_graph_runner(cur_token, position_ids, cache_position)
|
||||
past_key_values.change_seq_length(1)
|
||||
"""
|
||||
inputs_embeds = model.model.embed_tokens(cur_token.to("cpu")).to("cuda")
|
||||
custom_stream = torch.cuda.Stream()
|
||||
with torch.cuda.stream(custom_stream):
|
||||
logits=model(cur_token,
|
||||
position_ids=position_ids,
|
||||
cache_position=cache_position,
|
||||
past_key_values=past_key_values,
|
||||
return_dict=False, use_cache=True)[0]
|
||||
#"""
|
||||
logits=model(inputs_embeds = inputs_embeds,
|
||||
position_ids = position_ids,
|
||||
cache_position = cache_position,
|
||||
past_key_values = past_key_values,
|
||||
return_dict = False, use_cache = True) [0]
|
||||
"""
|
||||
torch.cuda.synchronize()
|
||||
#print(logits)
|
||||
next_token_scores = logits_warper(inputs, logits[:, -1, :])
|
||||
|
@ -108,7 +110,6 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
|
|||
generated_ids[:, cache_position] = inputs.to(torch_device).to(torch.int)
|
||||
past_key_values.cur_idx=cache_position
|
||||
start_time = time.time()
|
||||
#custom_stream = torch.cuda.Stream()
|
||||
|
||||
inputs_embeds = model.model.embed_tokens(inputs.to("cpu")).to("cuda")
|
||||
logits = model(
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue