Merge branch 'fix_precision_MLA' of https://github.com/kvcache-ai/ktransformers into server-prefix-cache

ceerrep 2025-02-18 11:44:28 +08:00
commit 73d072f609
3 changed files with 14 additions and 4 deletions

@@ -104,7 +104,10 @@ class KTransformersInterface(TransformersInterface):
             torch.cuda.synchronize()
             logits = logits[0, -1, :]
             return self.logits_to_token(logits)
+        if self.args.use_cuda_graph:
+            warm_uped = True
         if self.use_static_cache:
             mask = torch.ones((1, self.seq_length)).to(torch_device)
             logits = self.model(
@@ -118,7 +121,6 @@ class KTransformersInterface(TransformersInterface):
         else:
             logits = self.model(self.current_ids, return_dict=False)[0]
         logits = logits[0, -1, :]
-        warm_uped = True
         return self.logits_to_token(logits)
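
In brief: `warm_uped = True` used to be set unconditionally at the end of the eager decode path; after this commit it is set only when `self.args.use_cuda_graph` is enabled, and it is set before the eager forward pass rather than after it. The sketch below is a minimal, self-contained illustration of that gating, not ktransformers code: `warm_uped` and `use_cuda_graph` are taken from the diff, while `FakeArgs` and `decode_step` are hypothetical names introduced only for the example.

# Minimal sketch (assumption: warm_uped is a module-level flag that gates
# CUDA-graph capture/replay on a later decode step).
warm_uped = False


class FakeArgs:
    def __init__(self, use_cuda_graph: bool) -> None:
        self.use_cuda_graph = use_cuda_graph


def decode_step(args: FakeArgs) -> None:
    """One eager decode step; marks warm-up done only if CUDA graphs are enabled."""
    global warm_uped
    if args.use_cuda_graph:
        # A later decode step may now capture and replay a CUDA graph.
        warm_uped = True
    # ... eager forward pass and token sampling would happen here ...


decode_step(FakeArgs(use_cuda_graph=False))
assert warm_uped is False  # before this commit, the flag would flip even here

decode_step(FakeArgs(use_cuda_graph=True))
assert warm_uped is True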