Mirror of https://github.com/kvcache-ai/ktransformers.git, synced 2026-04-30 21:00:07 +00:00
Fix kt-kernel for new wrapper (#1588)
* update README for kt-kernel
* style: format C++ and Python code in kt-kernel
  - Format C++ files: task_queue, ext_bindings, and MoE operators
  - Format Python utility modules: amx, llamafile, and loader
  - Improve code readability and consistency
This commit is contained in: parent 9bc00e587b, commit 94c25626dc
10 changed files with 219 additions and 179 deletions
@@ -1,12 +1,15 @@
import torch
from typing import Optional
import os

# Use relative imports for package structure
from ..experts_base import BaseMoEWrapper
from .loader import GGUFLoader
from kt_kernel_ext.moe import MOEConfig

try:
    from kt_kernel_ext.moe import MOE

    _HAS_LLAMAFILE_SUPPORT = True
except (ImportError, AttributeError):
    _HAS_LLAMAFILE_SUPPORT = False
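The try/except guard above is the standard optional-dependency pattern: the module still imports cleanly when kt_kernel_ext was built without llamafile kernels, and _HAS_LLAMAFILE_SUPPORT records the outcome. A minimal sketch of how such a flag is typically consumed downstream; the _require_llamafile helper is illustrative only, not part of kt-kernel:

def _require_llamafile() -> None:
    # Hypothetical guard, not from the kt-kernel source: fail fast with a
    # clear message instead of a late NameError on the missing MOE class.
    if not _HAS_LLAMAFILE_SUPPORT:
        raise RuntimeError(
            "kt_kernel_ext was built without llamafile MoE support; "
            "rebuild the extension to use LlamafileMoEWrapper."
        )

A wrapper constructor would call _require_llamafile() before touching MOE, so a misconfigured build surfaces one actionable error rather than an obscure failure mid-inference.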
@@ -14,6 +17,7 @@ except (ImportError, AttributeError):

from kt_kernel_ext.kvcache import ggml_type


class LlamafileMoEWrapper(BaseMoEWrapper):
    """
    Llamafile-based MoE wrapper implementation.
@@ -162,27 +166,17 @@ class LlamafileMoEWrapper(BaseMoEWrapper):
        )

        if physical_to_logical_map_cpu is None:
-            physical_to_logical_map_cpu = torch.arange(
-                self.num_experts,
-                dtype=torch.int32,
-                device="cpu"
-            )
+            physical_to_logical_map_cpu = torch.arange(self.num_experts, dtype=torch.int32, device="cpu")
            print(f" Using default identity mapping for {self.num_experts} experts")

        base_key = f"blk.{self.layer_idx}"

        # Load quantized tensors from GGUF
-        gate_data, gate_type = self.gguf_loader.get_undequanted_tensor_and_ggml_type(
-            f"{base_key}.ffn_gate_exps.weight"
-        )
+        gate_data, gate_type = self.gguf_loader.get_undequanted_tensor_and_ggml_type(f"{base_key}.ffn_gate_exps.weight")

-        up_data, up_type = self.gguf_loader.get_undequanted_tensor_and_ggml_type(
-            f"{base_key}.ffn_up_exps.weight"
-        )
+        up_data, up_type = self.gguf_loader.get_undequanted_tensor_and_ggml_type(f"{base_key}.ffn_up_exps.weight")

-        down_data, down_type = self.gguf_loader.get_undequanted_tensor_and_ggml_type(
-            f"{base_key}.ffn_down_exps.weight"
-        )
+        down_data, down_type = self.gguf_loader.get_undequanted_tensor_and_ggml_type(f"{base_key}.ffn_down_exps.weight")

        # Keep tensors alive
        self.weights_to_keep = (gate_data, up_data, down_data)
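Two details in this hunk deserve a note. First, the default mapping is the identity: physical expert slot i holds logical expert i, which is exactly the table torch.arange produces. A standalone illustration in plain PyTorch, assuming nothing beyond what the diff shows:

import torch

num_experts = 8
# Default placement: physical slot i holds logical expert i.
physical_to_logical = torch.arange(num_experts, dtype=torch.int32, device="cpu")
print(physical_to_logical.tolist())  # [0, 1, 2, 3, 4, 5, 6, 7]

# A rearranged placement (e.g. after expert load balancing) would be a
# permutation of these entries rather than the identity.

Second, the assignment to self.weights_to_keep is a lifetime guard: the quantized buffers returned by the GGUF loader are presumably consumed by the native MoE kernels, so the wrapper holds Python references to keep the underlying storage from being garbage-collected while the extension still points into it.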