Mirror of https://github.com/kvcache-ai/ktransformers.git (synced 2026-05-01 21:21:12 +00:00)
update kt-kernel
parent 1a925769d9 · commit f854d03bd7
119 changed files with 4459 additions and 6368 deletions
@@ -2,8 +2,8 @@ import os, sys
 import time
 os.environ["BLAS_NUM_THREADS"] = "1"
 sys.path.insert(0, os.path.dirname(__file__) + "/../build")
-import cpuinfer_ext
-from cpuinfer_ext.kvcache import ggml_type
+import kt_kernel_ext
+from kt_kernel_ext.kvcache import ggml_type
 import torch
 import logging
 import sys
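The rename from cpuinfer_ext to kt_kernel_ext is mechanical, so scripts that must run against both pre- and post-rename builds can hedge with a fallback import. A minimal sketch, not part of this commit, assuming both builds expose the same kvcache submodule:

# Hypothetical compatibility shim, not in this commit: try the new name first.
try:
    import kt_kernel_ext as ext
except ImportError:
    import cpuinfer_ext as ext  # builds from before the kt-kernel rename

ggml_type = ext.kvcache.ggml_type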
@@ -22,7 +22,7 @@ logger = logging.getLogger("reader")
 from gguf.gguf_reader import GGUFReader
 # load_layers = 6
 load_layers = None
-CPUInfer = cpuinfer_ext.CPUInfer(304)
+CPUInfer = kt_kernel_ext.CPUInfer(304)
 max_qlen = 4096
 max_kvlen = 4096
 page_size = 256
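The worker count 304 handed to CPUInfer is hardcoded for the authors' test host. A sketch of deriving it from the machine instead; the cap is our assumption, not something this commit prescribes:

import os

# Derive the worker count from the host rather than hardcoding 304,
# capping at the hardware thread count (illustrative policy only).
num_threads = min(304, os.cpu_count() or 1)
CPUInfer = kt_kernel_ext.CPUInfer(num_threads)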
@@ -136,7 +136,7 @@ def build_mla(layer_idx, json_config, gguf_weights):
     rope_theta = json_config["rope_theta"]
     rope_scaling = json_config["rope_scaling"]
 
-    config = cpuinfer_ext.mla.MLAConfig(
+    config = kt_kernel_ext.mla.MLAConfig(
         hidden_size,
         q_lora_rank,
         kv_lora_rank,
@@ -191,12 +191,12 @@ def build_mla(layer_idx, json_config, gguf_weights):
     config.page_count = pages_count
 
     if q_a_type == "F32":
-        mla = cpuinfer_ext.mla.MLA_F32(config)
+        mla = kt_kernel_ext.mla.MLA_F32(config)
     elif q_a_type == "F16":
-        mla = cpuinfer_ext.mla.MLA_F16(config)
+        mla = kt_kernel_ext.mla.MLA_F16(config)
     elif q_a_type == "BF16":
-        # mla = cpuinfer_ext.mla.MLA_F32(config)
-        mla = cpuinfer_ext.mla.MLA_QUAN_F32(config)
+        # mla = kt_kernel_ext.mla.MLA_F32(config)
+        mla = kt_kernel_ext.mla.MLA_QUAN_F32(config)
     else:
         raise ValueError(f"Unsupported data type: {q_a_type}")
 
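Since every branch above differs only in the constructor it picks, the same dispatch can be written table-driven. A minimal sketch using only the constructors visible in this hunk; the table and function names are ours:

# Only MLA_F32, MLA_F16, and MLA_QUAN_F32 appear in this diff; BF16 is
# routed through the quantized-F32 kernel, mirroring the elif chain above.
_MLA_BY_DTYPE = {
    "F32": kt_kernel_ext.mla.MLA_F32,
    "F16": kt_kernel_ext.mla.MLA_F16,
    "BF16": kt_kernel_ext.mla.MLA_QUAN_F32,
}

def make_mla(config, q_a_type):
    try:
        return _MLA_BY_DTYPE[q_a_type](config)
    except KeyError:
        raise ValueError(f"Unsupported data type: {q_a_type}")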
@@ -207,7 +207,7 @@ def build_mla(layer_idx, json_config, gguf_weights):
 
 def build_ffn(layer_idx, json_config, gguf_weights):
     if f"blk.{layer_idx}.ffn_gate.weight" in gguf_weights:  # dense
-        config = cpuinfer_ext.moe.MOEConfig(
+        config = kt_kernel_ext.moe.MOEConfig(
             json_config["num_experts_per_tok"] + json_config["n_shared_experts"],
             json_config["num_experts_per_tok"] + json_config["n_shared_experts"],
             json_config["hidden_size"],
@@ -227,12 +227,12 @@ def build_ffn(layer_idx, json_config, gguf_weights):
         config.down_proj = down.data_ptr()
         config.down_type = type_to_ggml_type(down_type)
 
-        moe = cpuinfer_ext.moe.KMLInt8_MOE(config)
+        moe = kt_kernel_ext.moe.KMLInt8_MOE(config)
         moe.load_weights()
         return moe
 
     elif f"blk.{layer_idx}.ffn_gate_exps.weight" in gguf_weights:
-        config = cpuinfer_ext.moe.MOEConfig(
+        config = kt_kernel_ext.moe.MOEConfig(
             json_config["n_routed_experts"] + json_config["n_shared_experts"],
             json_config["num_experts_per_tok"] + json_config["n_shared_experts"],
             json_config["hidden_size"],
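The two MOEConfig call sites differ only in their first argument: the dense branch passes the active expert count twice, while the routed branch passes the full expert pool first and the per-token active count second. A sketch of that arithmetic; the helper name is ours:

def expert_counts(json_config, routed):
    # Experts that fire per token: the routed top-k plus the always-on shared experts.
    active = json_config["num_experts_per_tok"] + json_config["n_shared_experts"]
    # Total experts instantiated: the whole routed pool for MoE layers,
    # or just the active set for dense layers.
    if routed:
        total = json_config["n_routed_experts"] + json_config["n_shared_experts"]
    else:
        total = active
    return total, active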
@@ -267,7 +267,7 @@ def build_ffn(layer_idx, json_config, gguf_weights):
         config.down_proj = down.data_ptr()
         config.down_type = type_to_ggml_type(down_type)
 
-        moe = cpuinfer_ext.moe.KMLInt8_MOE(config)
+        moe = kt_kernel_ext.moe.KMLInt8_MOE(config)
         moe.load_weights()
         return moe
 
@@ -276,7 +276,7 @@ def build_ffn(layer_idx, json_config, gguf_weights):
 
 
 def build_moegate(layer_idx, json_config, gguf_weights):
-    config = cpuinfer_ext.gate.GateConfig(
+    config = kt_kernel_ext.gate.GateConfig(
         json_config["hidden_size"],
         json_config["num_experts_per_tok"],
         json_config["n_routed_experts"],
@@ -296,7 +296,7 @@ def build_moegate(layer_idx, json_config, gguf_weights):
     config.e_score_correction_bias = bias.data_ptr()
     config.e_score_correction_bias_type = type_to_ggml_type(bias_type)
 
-    gate = cpuinfer_ext.gate.MoEGate(config)
+    gate = kt_kernel_ext.gate.MoEGate(config)
 
     return gate
 
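Weights cross into the extension as raw data_ptr() addresses paired with a ggml type tag, as the e_score_correction_bias lines above show. A sketch of that convention; the helper name set_bias is ours, and the keep-alive caveat is general data_ptr() hygiene rather than something this commit states:

def set_bias(config, bias_tensor, bias_type):
    # Only the raw address is handed over; the caller must keep bias_tensor
    # referenced for as long as the kernel may read from it.
    config.e_score_correction_bias = bias_tensor.data_ptr()
    config.e_score_correction_bias_type = type_to_ggml_type(bias_type)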
@@ -304,7 +304,7 @@ def build_moegate(layer_idx, json_config, gguf_weights):
 
 def build_llm(json_config, gguf_weights):
 
-    general_config = cpuinfer_ext.GeneralConfig()
+    general_config = kt_kernel_ext.GeneralConfig()
     general_config.vocab_size = json_config["vocab_size"]
     general_config.hidden_size = json_config["hidden_size"]
     general_config.num_experts_per_tok = json_config["num_experts_per_tok"]
@@ -326,8 +326,8 @@ def build_llm(json_config, gguf_weights):
 
     general_config.pool = CPUInfer.backend_
 
-    llm = cpuinfer_ext.DeepseekV3ForCausalLM(general_config)
-    model = cpuinfer_ext.DeepseekV3Model(general_config)
+    llm = kt_kernel_ext.DeepseekV3ForCausalLM(general_config)
+    model = kt_kernel_ext.DeepseekV3Model(general_config)
     llm.model = model
 
 
@@ -335,7 +335,7 @@ def build_llm(json_config, gguf_weights):
     real_load_layers = json_config["num_hidden_layers"] if load_layers is None else load_layers
 
     for i in range(real_load_layers):
-        layer = cpuinfer_ext.DeepseekV3DecoderLayer(general_config,i)
+        layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config,i)
         attn_norm, attn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.attn_norm.weight")
         ffn_norm, ffn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.ffn_norm.weight")
 
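The commented-out load_layers = 6 near the top of the file is the knob behind this loop: real_load_layers falls back to the full num_hidden_layers only when load_layers is None. A hedged usage sketch for smoke-testing the bindings without building every layer:

# Assumption for illustration: two layers are enough to exercise the bindings.
load_layers = 2
real_load_layers = json_config["num_hidden_layers"] if load_layers is None else load_layers
for i in range(real_load_layers):  # builds only layers 0 and 1
    layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config, i)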