mirror of
https://github.com/kvcache-ai/ktransformers.git
synced 2025-09-10 15:29:39 +00:00
Fix: cannot offload a whole layer to CPU
This commit is contained in:
parent
35d7aed207
commit
6735beb5b6
4 changed files with 14 additions and 11 deletions
|
@@ -670,11 +670,12 @@ class KDeepseekV2Model(BaseInjectedModule):
                 if self.transfer_map is not None and i in self.transfer_map:
                     prev_stream = torch.cuda.current_stream()
                     cur_device = self.transfer_map[i]
-                    if cur_device not in self.stream_device_map:
+                    if cur_device not in self.stream_device_map and cur_device.lower() != "cpu":
                         self.stream_device_map[cur_device] = torch.cuda.Stream(cur_device)
-                    torch.cuda.set_device(cur_device)
-                    self.stream_device_map[cur_device].wait_stream(prev_stream)
-                    torch.cuda.set_stream(self.stream_device_map[cur_device])
+                    if cur_device.lower() != "cpu":
+                        torch.cuda.set_device(cur_device)
+                        self.stream_device_map[cur_device].wait_stream(prev_stream)
+                        torch.cuda.set_stream(self.stream_device_map[cur_device])
                     hidden_states = hidden_states.to(
                         self.transfer_map[i], non_blocking=True
                     )
Loading…
Add table
Add a link
Reference in a new issue