fix-update-flashinfer_wrapper_local_chat

This commit is contained in:
Atream 2025-02-25 12:47:31 +00:00
parent 5474be5299
commit 477ac28a9c
4 changed files with 15 additions and 4 deletions

View file

@@ -177,6 +177,7 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
else:
inputs_embeds = model.model.embed_tokens(inputs.to("cpu")).to(torch_device)
if use_flashinfer_mla:
MLAWrapperSingleton.update_buffer(past_key_values.max_pages)
MLAWrapperSingleton.need_plan_all()
logits = model(