fix flashinfer precision

Author: Atream
Date: 2025-03-07 14:07:00 +00:00
parent 96d75d53df
commit d453c320f1
5 changed files with 151 additions and 61 deletions


@@ -239,7 +239,7 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
         if use_flashinfer_mla:
             MLAWrapperSingleton.plan_all(None,None,None,position_ids.squeeze(1)+1,
                                          num_heads, head_dim_ckv, head_dim_kpe, past_key_values.page_size,
-                                         q_head_dim ** (-0.5), torch.bfloat16, torch.bfloat16)
+                                         model.model.layers[0].self_attn.softmax_scale, torch.bfloat16, torch.bfloat16)
         global warm_uped
         if use_cuda_graph and ( (warm_uped == True and int(i) == 1) or (warm_uped == False and int(i) == 2) ):
             warm_uped = True
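
Why this is a precision fix: q_head_dim ** (-0.5) is only the base attention scale. In DeepSeek-style MLA, the attention module's softmax_scale additionally folds in a YaRN mscale correction when rope scaling is enabled, so re-deriving the scale from the head dim inside plan_all silently dropped that correction and the FlashInfer path attended with a slightly wrong scale. Reading model.model.layers[0].self_attn.softmax_scale reuses whatever value the layers were actually initialized with. Below is a minimal sketch of the discrepancy, assuming DeepSeek-style config names (rope_scaling["factor"], mscale_all_dim) and the standard YaRN mscale formula; the concrete numbers are illustrative, not taken from this repo's config.

    import math

    def yarn_get_mscale(scale: float = 1.0, mscale: float = 1.0) -> float:
        # Standard YaRN attention-magnitude correction.
        if scale <= 1:
            return 1.0
        return 0.1 * mscale * math.log(scale) + 1.0

    q_head_dim = 192        # e.g. qk_nope_head_dim + qk_rope_head_dim in DeepSeek MLA
    factor = 40.0           # hypothetical rope_scaling["factor"]
    mscale_all_dim = 1.0    # hypothetical rope_scaling["mscale_all_dim"]

    naive_scale = q_head_dim ** (-0.5)       # what the old code passed to plan_all
    m = yarn_get_mscale(factor, mscale_all_dim)
    softmax_scale = naive_scale * m * m      # what self_attn.softmax_scale would hold

    print(naive_scale)    # ~0.0722
    print(softmax_scale)  # ~0.1352, noticeably different once rope scaling is on

With rope scaling disabled (factor <= 1) the two values coincide, which is why the mismatch shows up as a precision drift only on long-context configurations rather than as an outright failure.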