fix(llamafile): resolve deferred experts data race and update README (#1646)
Some checks are pending
Book-CI / test-1 (push) Waiting to run
Book-CI / test-2 (push) Waiting to run
Book-CI / test (push) Waiting to run
Deploy / deploy (macos-latest) (push) Waiting to run
Deploy / deploy (ubuntu-latest) (push) Waiting to run
Deploy / deploy (windows-latest) (push) Waiting to run

This commit is contained in:
Jiaqi Liao 2025-11-26 23:19:37 +08:00 committed by GitHub
parent 51745a9ea1
commit e7d1c1de09
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 696 additions and 88 deletions

View file

@@ -62,7 +62,8 @@ class KExpertsCPUBuffer:
for _ in range(cls.buffer_depth)
]
bsz_tensor_cpu = [
torch.zeros((1,), device="cpu", dtype=torch.int32, pin_memory=True) for _ in range(cls.buffer_depth)
torch.full((1,), batch_size, device="cpu", dtype=torch.int32, pin_memory=True)
for _ in range(cls.buffer_depth)
]
output_gpu = [
torch.zeros((batch_size, hidden_size), device=hidden_states.device, dtype=hidden_states.dtype)
@@ -256,8 +257,6 @@ class BaseMoEWrapper(ABC):
next_slot = (current_slot + 1) % KExpertsCPUBuffer.buffer_depth
bsz_slot_tensor = bsz_tensor_cpu[current_slot]
bsz_slot_tensor.fill_(batch_size)
deferred_experts_ids_cpu[current_slot].fill_(-1)
topk_ids_long = topk_ids.to(torch.long)
immediate_ids: torch.Tensor