optimize GPU

Atream 2025-02-21 05:06:57 +00:00
parent cf4da5fd47
commit 7e1fe256c8
8 changed files with 677 additions and 156 deletions

@@ -168,10 +168,7 @@ def local_chat(
    if mode == 'long_context':
        assert Config().long_context_config['max_seq_len'] > input_tensor.shape[1] + max_new_tokens, \
            "please change max_seq_len in ~/.ktransformers/config.yaml"
    torch.set_default_dtype(
        torch.bfloat16
    )  # TODO: Remove this, replace dtype using config
    if system != "Windows" and config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"] and flashinfer_enabled:
        generated = prefill_and_generate(
            model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode = mode, force_think = force_think,
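
A note on the architecture gate in the hunk above: in Python, an expression of the form `x == "A" or "B"` parses as `(x == "A") or "B"`, and a non-empty string literal is truthy on its own, so that form would enable the flashinfer path for every model. A membership test is what restricts dispatch to the two DeepSeek classes. A minimal, self-contained illustration (the `arch` value here is hypothetical, not from the diff):

# `x == a or b` parses as `(x == a) or b`; the non-empty string
# "DeepseekV3ForCausalLM" is truthy by itself, so the naive form
# evaluates truthy for any architecture.
arch = "LlamaForCausalLM"  # hypothetical architecture name

buggy = arch == "DeepseekV2ForCausalLM" or "DeepseekV3ForCausalLM"
fixed = arch in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]

print(bool(buggy))  # True  -- would match every model
print(fixed)        # False -- only the two DeepSeek architectures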