Fix kt-kernel for new wrapper (#1588)
Some checks are pending
Book-CI / test (push) Waiting to run
Book-CI / test-1 (push) Waiting to run
Book-CI / test-2 (push) Waiting to run
Deploy / deploy (macos-latest) (push) Waiting to run
Deploy / deploy (ubuntu-latest) (push) Waiting to run
Deploy / deploy (windows-latest) (push) Waiting to run

* update README for kt-kernel

* style: format C++ and Python code in kt-kernel

  - Format C++ files: task_queue, ext_bindings, and MoE operators
  - Format Python utility modules: amx, llamafile, and loader
  - Improve code readability and consistency
This commit is contained in:
Jiaqi Liao 2025-11-10 21:47:34 +08:00 committed by GitHub
parent 9bc00e587b
commit 94c25626dc
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 219 additions and 179 deletions

View file

@@ -1,12 +1,15 @@
import torch
from typing import Optional
import os
# Use relative imports for package structure
from ..experts_base import BaseMoEWrapper
from .loader import GGUFLoader
from kt_kernel_ext.moe import MOEConfig
try:
from kt_kernel_ext.moe import MOE
_HAS_LLAMAFILE_SUPPORT = True
except (ImportError, AttributeError):
_HAS_LLAMAFILE_SUPPORT = False
@@ -14,6 +17,7 @@ except (ImportError, AttributeError):
from kt_kernel_ext.kvcache import ggml_type
class LlamafileMoEWrapper(BaseMoEWrapper):
"""
Llamafile-based MoE wrapper implementation.
@@ -162,27 +166,17 @@ class LlamafileMoEWrapper(BaseMoEWrapper):
)
if physical_to_logical_map_cpu is None:
physical_to_logical_map_cpu = torch.arange(
self.num_experts,
dtype=torch.int32,
device="cpu"
)
physical_to_logical_map_cpu = torch.arange(self.num_experts, dtype=torch.int32, device="cpu")
print(f" Using default identity mapping for {self.num_experts} experts")
base_key = f"blk.{self.layer_idx}"
# Load quantized tensors from GGUF
gate_data, gate_type = self.gguf_loader.get_undequanted_tensor_and_ggml_type(
f"{base_key}.ffn_gate_exps.weight"
)
gate_data, gate_type = self.gguf_loader.get_undequanted_tensor_and_ggml_type(f"{base_key}.ffn_gate_exps.weight")
up_data, up_type = self.gguf_loader.get_undequanted_tensor_and_ggml_type(
f"{base_key}.ffn_up_exps.weight"
)
up_data, up_type = self.gguf_loader.get_undequanted_tensor_and_ggml_type(f"{base_key}.ffn_up_exps.weight")
down_data, down_type = self.gguf_loader.get_undequanted_tensor_and_ggml_type(
f"{base_key}.ffn_down_exps.weight"
)
down_data, down_type = self.gguf_loader.get_undequanted_tensor_and_ggml_type(f"{base_key}.ffn_down_exps.weight")
# Keep tensors alive
self.weights_to_keep = (gate_data, up_data, down_data)