Fix kt-kernel for new wrapper (#1588)

* update README for kt-kernel

* style: format C++ and Python code in kt-kernel

  - Format C++ files: task_queue, ext_bindings, and MoE operators
  - Format Python utility modules: amx, llamafile, and loader
  - Improve code readability and consistency
Jiaqi Liao 2025-11-10 21:47:34 +08:00 committed by GitHub
parent 9bc00e587b
commit 94c25626dc
10 changed files with 219 additions and 179 deletions

@@ -18,13 +18,13 @@ import ctypes
 import kt_kernel_ext
 class KExpertsCPUBuffer:
     """
     CPU buffer management for expert computation.
     Manages pinned memory buffers for efficient GPU-CPU data transfer.
     """
     capture_bs: List = list()
     capture_buffers: Dict = dict()
     temp_bs: int = 0
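
The docstring above refers to pinned (page-locked) host memory, which is what allows GPU-CPU copies to run asynchronously. A minimal sketch of that pattern, not taken from this commit, with illustrative shapes and assuming a CUDA-capable PyTorch build:

import torch

batch_size, hidden_size = 8, 4096  # illustrative shapes, not the wrapper's defaults

# Pinned host memory lets the copy be issued asynchronously; pageable memory would not.
staging_cpu = torch.zeros((batch_size, hidden_size), dtype=torch.bfloat16, pin_memory=True)
output_gpu = torch.zeros((batch_size, hidden_size), device="cuda", dtype=torch.bfloat16)

copy_stream = torch.cuda.Stream()
with torch.cuda.stream(copy_stream):
    # non_blocking=True only overlaps with compute because staging_cpu is pinned
    output_gpu.copy_(staging_cpu, non_blocking=True)
copy_stream.synchronize()

Without pinned memory the transfer cannot reliably overlap with compute, which is why the buffers in this class are allocated with pin_memory=True.
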
@@ -62,8 +62,7 @@ class KExpertsCPUBuffer:
             for _ in range(cls.buffer_depth)
         ]
         bsz_tensor_cpu = [
-            torch.zeros((1,), device="cpu", dtype=torch.int32, pin_memory=True)
-            for _ in range(cls.buffer_depth)
+            torch.zeros((1,), device="cpu", dtype=torch.int32, pin_memory=True) for _ in range(cls.buffer_depth)
         ]
         output_gpu = [
             torch.zeros((batch_size, hidden_size), device=hidden_states.device, dtype=hidden_states.dtype)
@@ -129,7 +128,6 @@ class BaseMoEWrapper(ABC):
             max_deferred_experts_per_token: Number of experts per token to defer on this layer. Defaults to 0 (no defer).
             method: Backend method string
         """
-        print(f"Init {self.__class__.__name__}")
         self.layer_idx = layer_idx
         self.num_experts = num_experts
         self.num_experts_per_tok = num_experts_per_tok
@@ -139,7 +137,9 @@ class BaseMoEWrapper(ABC):
         self.weight_path = weight_path
         self.chunked_prefill_size = chunked_prefill_size
         self.cpu_save = cpu_save
-        self.max_deferred_experts_per_token = int(max_deferred_experts_per_token) if max_deferred_experts_per_token is not None else 0
+        self.max_deferred_experts_per_token = (
+            int(max_deferred_experts_per_token) if max_deferred_experts_per_token is not None else 0
+        )
         BaseMoEWrapper._layer_has_pending_deferred[self.layer_idx] = False
         self.method = method
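
The reformatted assignment above implements the docstring's "Defaults to 0 (no defer)" behavior: a None value is coerced to 0, anything else to an int. A standalone, purely illustrative sketch of that normalization (the helper name is hypothetical, not part of kt-kernel):

from typing import Optional

def normalize_deferred(max_deferred_experts_per_token: Optional[int]) -> int:
    # None means "do not defer any experts on this layer"
    return int(max_deferred_experts_per_token) if max_deferred_experts_per_token is not None else 0

assert normalize_deferred(None) == 0  # deferral disabled
assert normalize_deferred(3) == 3     # defer up to 3 experts per token
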