Mirror of https://github.com/kvcache-ai/ktransformers.git (synced 2025-09-11 07:44:35 +00:00)
optimize GPU
commit 7e1fe256c8 (parent cf4da5fd47)
8 changed files with 677 additions and 156 deletions
```diff
@@ -168,10 +168,7 @@ def local_chat(
    if mode == 'long_context':
        assert Config().long_context_config['max_seq_len'] > input_tensor.shape[1] + max_new_tokens, \
        "please change max_seq_len in ~/.ktransformers/config.yaml"
    torch.set_default_dtype(
        torch.bfloat16
    ) # TODO: Remove this, replace dtype using config

    if system != "Windows" and (config.architectures[0] == "DeepseekV2ForCausalLM" or "DeepseekV3ForCausalLM") and flashinfer_enabled:
        generated = prefill_and_generate(
            model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode = mode, force_think = force_think,
```
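In the `long_context` branch, the assertion enforces that the configured context window can hold both the prompt and every requested new token. A minimal sketch of that capacity check with illustrative numbers (the concrete values below are assumptions, not from the commit):

```python
# Sketch of the long-context capacity check, with made-up numbers.
max_seq_len = 8192      # long_context_config['max_seq_len'] from ~/.ktransformers/config.yaml
prompt_len = 6000       # stands in for input_tensor.shape[1]
max_new_tokens = 1000

# Guarantees the cache has room for the prompt plus all generated tokens.
assert max_seq_len > prompt_len + max_new_tokens, \
    "please change max_seq_len in ~/.ktransformers/config.yaml"
```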
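The `torch.set_default_dtype(torch.bfloat16)` call (which the in-line TODO plans to replace with a config-driven dtype) changes the element type that new floating-point tensors receive by default. A quick self-contained demonstration:

```python
import torch

torch.set_default_dtype(torch.bfloat16)
x = torch.zeros(2, 2)   # floating-point factory functions now produce bfloat16
print(x.dtype)          # torch.bfloat16

torch.set_default_dtype(torch.float32)  # restore PyTorch's usual default
```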
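One detail worth flagging in the flashinfer dispatch condition: `config.architectures[0] == "DeepseekV2ForCausalLM" or "DeepseekV3ForCausalLM"` is always truthy, because `or` returns the non-empty string literal rather than comparing against it. A hedged sketch of the membership test presumably intended; the stand-in values below are assumptions so the snippet runs on its own (in the hunk they come from `platform.system()`, the model config, and the flashinfer probe):

```python
# Stand-in values; assumptions for illustration only.
system = "Linux"
flashinfer_enabled = True
architectures = ["DeepseekV3ForCausalLM"]

# Always-truthy form from the hunk: the parenthesized `or` yields the
# non-empty string literal whenever the equality check is False.
buggy = system != "Windows" and (architectures[0] == "DeepseekV2ForCausalLM" or "DeepseekV3ForCausalLM") and flashinfer_enabled

# Membership test that actually checks both architecture names.
fixed = (
    system != "Windows"
    and architectures[0] in ("DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM")
    and flashinfer_enabled
)
print(buggy, fixed)  # both True here, but `buggy` stays truthy for any architecture
```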