Merge pull request #1276 from kvcache-ai/support_load_safetensor

Support safetensor loading; remove the architectures argument
This commit is contained in:
wang jiahao 2025-05-12 11:10:26 +08:00 committed by GitHub
commit 8456222852
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
30 changed files with 1075 additions and 328 deletions


@ -1,7 +1,7 @@
import os
import sys
sys.path.insert(0,"/home/zbx/ktransformers")
from ktransformers.util.custom_gguf import GGUFLoader
from ktransformers.util.custom_loader import GGUFLoader
import torch
gguf_loader_1 = GGUFLoader("/mnt/data/model/DeepseekV3-q4km-gguf")

install-with-cache.sh Executable file

@ -0,0 +1,26 @@
#!/bin/bash
set -e
# clear build dirs
# rm -rf build
# rm -rf *.egg-info
# rm -rf csrc/build
# rm -rf csrc/ktransformers_ext/build
# rm -rf csrc/ktransformers_ext/cuda/build
# rm -rf csrc/ktransformers_ext/cuda/dist
# rm -rf csrc/ktransformers_ext/cuda/*.egg-info
rm -rf ~/.ktransformers
echo "Installing python dependencies from requirements.txt"
pip install -r requirements-local_chat.txt
pip install -r ktransformers/server/requirements.txt
echo "Installing ktransformers"
KTRANSFORMERS_FORCE_BUILD=TRUE USE_BALANCE_SERVE=1 pip install -v . --no-build-isolation
pip install third_party/custom_flashinfer/ -v
# SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
# echo "Copying thirdparty libs to $SITE_PACKAGES"
# cp -a csrc/balance_serve/build/third_party/prometheus-cpp/lib/libprometheus-cpp-*.so* $SITE_PACKAGES/
# patchelf --set-rpath '$ORIGIN' $SITE_PACKAGES/sched_ext.cpython*
echo "Installation completed successfully"


@ -66,7 +66,7 @@ class StaticCache(transformers.StaticCache):
self.page_table_list = []
for idx in range(config.num_hidden_layers):
if isinstance(device, dict):
target_device = device[f"blk.{idx}.self_attn"]["generate_device"]
target_device = device[f"model.layers.{idx}.self_attn"]["generate_device"]
else:
target_device = device
@ -91,7 +91,7 @@ class StaticCache(transformers.StaticCache):
# Note: `mark_static_address` is used to tag the cache as a fixed data pointer, preventing cuda graph
# breaks when updating the cache.
if isinstance(device, dict):
target_device = device[f"blk.{idx}.self_attn"]["generate_device"]
target_device = device[f"model.layers.{idx}.self_attn"]["generate_device"]
else:
target_device = device
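When `device` is a mapping rather than a single device string, it is now keyed by the Hugging Face module path instead of the GGUF-style `blk.{idx}` prefix. A minimal sketch of such a mapping (layer indices and devices are hypothetical):

# Hypothetical per-module device map using the new HF-style keys (illustration only)
device = {
    "model.layers.0.self_attn": {"generate_device": "cuda:0", "prefill_device": "cuda:0"},
    "model.layers.1.self_attn": {"generate_device": "cuda:1", "prefill_device": "cuda:1"},
}
target_device = device["model.layers.0.self_attn"]["generate_device"]  # -> "cuda:0"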


@ -39,7 +39,7 @@ class KQwen2MoeForCausalLM(Qwen2MoePreTrainedModel):
self.cache = cache
self.vocab_size = config.vocab_size
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.attn = [None] * 10
self.attn = [None] * 100
def init_wrapper(self, use_cuda_graph, device, max_batch_token, max_batch_size, max_pages, cuda_graph_idx = 0):
self.attn[cuda_graph_idx] = flashInferAttn(use_cuda_graph=use_cuda_graph, max_batch_token=max_batch_token, max_batch_size=max_batch_size, max_pages=max_pages, device=device)


@ -39,7 +39,7 @@ class KQwen3MoeForCausalLM(Qwen3MoePreTrainedModel):
self.cache = cache
self.vocab_size = config.vocab_size
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.attn = [None] * 10
self.attn = [None] * 100
def init_wrapper(self, use_cuda_graph, device, max_batch_token, max_batch_size, max_pages, cuda_graph_idx = 0):
self.attn[cuda_graph_idx] = flashInferAttn(use_cuda_graph=use_cuda_graph, max_batch_token=max_batch_token, max_batch_size=max_batch_size, max_pages=max_pages, device=device)


@ -23,7 +23,7 @@ from ktransformers.models.modeling_deepseek import (
yarn_find_correction_range
)
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.custom_gguf import GGUFLoader
from ktransformers.util.custom_loader import GGUFLoader
from ktransformers.util.utils import InferenceState
from transformers.configuration_utils import PretrainedConfig
import torch


@ -15,7 +15,7 @@ from ktransformers.models.modeling_llama import LlamaRotaryEmbedding
from ktransformers.models.modeling_deepseek import DeepseekV2Attention, apply_rotary_pos_emb
from typing import Optional, Tuple
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.custom_gguf import GGUFLoader
from ktransformers.util.custom_loader import GGUFLoader
from ktransformers.util.utils import get_compute_capability
import logging
from transformers.configuration_utils import PretrainedConfig


@ -11,7 +11,7 @@ from ktransformers.models.modeling_qwen2_moe import Qwen2MoeAttention
from ktransformers.models.modeling_qwen3_moe import Qwen3MoeAttention
from typing import Optional, Tuple
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.custom_gguf import GGUFLoader
from ktransformers.util.custom_loader import GGUFLoader
import logging
from transformers.configuration_utils import PretrainedConfig
from flashinfer import BatchMLAPagedAttentionWrapper


@ -6,7 +6,7 @@ Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
from typing import Any
from torch import nn, Tensor
from ktransformers.util.custom_gguf import GGUFLoader
from ktransformers.util.custom_loader import GGUFLoader
from transformers.configuration_utils import PretrainedConfig
import ktransformers.util.utils as utils
class BaseInjectedModule(nn.Module):


@ -26,7 +26,8 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext
import cpuinfer_ext
from cpuinfer_ext.moe import MOEConfig, MOE
import ctypes
from ktransformers.util.custom_gguf import GGMLQuantizationType, GGUFLoader
from ktransformers.util.custom_gguf import GGMLQuantizationType
from ktransformers.util.custom_loader import GGUFLoader, SafeTensorLoader, ModelLoader
from ktransformers.util.utils import InferenceState
from ktransformers.server.config.config import Config
from transformers.activations import ACT2FN
@ -39,8 +40,18 @@ from ktransformers.operators.cpuinfer import CPUInfer
def deduplicate_and_sort(lst):
return sorted(set(lst))
def generate_cuda_graphs(chunk_size: int) -> list:
assert chunk_size <= 1024 or chunk_size % 1024 == 0, "chunk_size must be <= 1024 or a multiple of 1024"
base_list = [1, 2, 3, Config().max_batch_size, 64, 256, 512, chunk_size]
if chunk_size <= 1024:
return base_list
multiples = [i for i in range(1024, chunk_size + 1, 1024)]
return deduplicate_and_sort(base_list + multiples)
#cuda_graphs = [Config().chunk_size]
cuda_graphs = deduplicate_and_sort([1, 2, 3, Config().max_batch_size, 64, Config().chunk_size])
cuda_graphs = generate_cuda_graphs(Config().chunk_size)
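# Worked example (illustration only; assumes Config().max_batch_size == 32):
#   generate_cuda_graphs(4096)
#   -> deduplicate_and_sort([1, 2, 3, 32, 64, 256, 512, 4096] + [1024, 2048, 3072, 4096])
#   -> [1, 2, 3, 32, 64, 256, 512, 1024, 2048, 3072, 4096]
# For chunk_size <= 1024 the base list is returned as-is (unsorted, possibly with duplicates).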
# class Base(BaseInjectedModule, ABC):
class KExpertsBase(ABC):
def __init__(self, key: str, gguf_loader: GGUFLoader, config: PretrainedConfig, orig_module: nn.Module, device: str = "cuda", **kwargs):
@ -77,7 +88,7 @@ class KExpertsBase(ABC):
down_type = None
for key in keys:
if key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
if self.gguf_loader.has_tensor(key + ".ffn_gate_exps.weight"):
targets = [".ffn_gate_exps.weight", ".ffn_up_exps.weight", ".ffn_down_exps.weight" ]
tensors = self.load_multi(key, targets, device=device)
gate = tensors[".ffn_gate_exps.weight"]
@ -86,7 +97,7 @@ class KExpertsBase(ABC):
gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate_exps.weight"]["ggml_type"]
up_type = self.gguf_loader.tensor_info[key + ".ffn_up_exps.weight"]["ggml_type"]
down_type = self.gguf_loader.tensor_info[key + ".ffn_down_exps.weight"]["ggml_type"]
elif key + ".ffn_down.0.weight" in self.gguf_loader.tensor_info:
elif self.gguf_loader.has_tensor(key + ".ffn_down.0.weight"):
# for supporting Mixtral-8x7B-Instruct
gate = []
up = []
@ -194,7 +205,7 @@ class KExpertsCPU(KExpertsBase):
self.config.num_experts_per_tok,
self.config.hidden_size,
self.config.moe_intermediate_size,
25600,
max(cuda_graphs),
gate_ptr,
up_ptr,
down_ptr,
@ -212,7 +223,7 @@ class KExpertsCPU(KExpertsBase):
self.config.num_experts_per_tok,
self.config.hidden_size,
self.config.moe_intermediate_size,
25600,
max(cuda_graphs),
gate_ptr,
up_ptr,
down_ptr,
@ -325,14 +336,19 @@ class KExpertsCPU(KExpertsBase):
down_type = None
for key in keys:
if self.gguf_loader.safetensor_loader is not None:
# using a temp ugly way to temprary load the tensor
gate = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_gate_exps.weight").numpy()
up = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_up_exps.weight").numpy()
down = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_down_exps.weight").numpy()
gate_type = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_gate_exps.ggml_type").item()
up_type = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_up_exps.ggml_type").item()
down_type = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_down_exps.ggml_type").item()
if isinstance(self.gguf_loader, SafeTensorLoader):
res = self.gguf_loader.load_experts(key)
return {key: res}
elif self.gguf_loader.has_tensor(key + ".ffn_gate_exps.weight"):
gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight")
down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight")
# gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate_exps.weight"]["ggml_type"]
# up_type = self.gguf_loader.tensor_info[key + ".ffn_up_exps.weight"]["ggml_type"]
# down_type = self.gguf_loader.tensor_info[key + ".ffn_down_exps.weight"]["ggml_type"]
gate_type = self.gguf_loader.get_ggml_type(key + ".ffn_gate_exps.weight")
up_type = self.gguf_loader.get_ggml_type(key + ".ffn_up_exps.weight")
down_type = self.gguf_loader.get_ggml_type(key + ".ffn_down_exps.weight")
elif key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
@ -356,9 +372,9 @@ class KExpertsCPU(KExpertsBase):
gate = np.stack(gate)
up = np.stack(up)
down = np.stack(down)
gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate.0.weight"]["ggml_type"]
up_type = self.gguf_loader.tensor_info[key + ".ffn_up.0.weight"]["ggml_type"]
down_type = self.gguf_loader.tensor_info[key + ".ffn_down.0.weight"]["ggml_type"]
gate_type = self.gguf_loader.get_ggml_type(key + ".ffn_gate.0.weight")
up_type = self.gguf_loader.get_ggml_type(key + ".ffn_up.0.weight")
down_type = self.gguf_loader.get_ggml_type(key + ".ffn_down.0.weight")
else:
raise ValueError(f"Experts {key} not found in gguf_loader")
res = {key:{"gate": gate, "up": up, "down": down, "gate_type": gate_type, "up_type": up_type, "down_type": down_type}}
@ -445,7 +461,7 @@ class KExpertsMarlin(KExpertsBase):
down = None
for key in keys:
if key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
if self.gguf_loader.has_tensor(key + ".ffn_gate_exps.weight"):
gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight")
down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight")


@ -40,7 +40,7 @@ class flashInferAttn():
self.kv_layout = kv_layout
self.use_cuda_graph = use_cuda_graph
if flashInferAttn.float_workspace_buffer is None:
flashInferAttn.float_workspace_buffer = torch.empty(1024 * 1024 * 1024, dtype=torch.uint8, device=device)
flashInferAttn.float_workspace_buffer = torch.empty(max_batch_token * 1024 * 1024, dtype=torch.uint8, device=device)
self.qo_indptr_buf = torch.empty((max_batch_size+1,), dtype=torch.int32, device=device)
self.paged_kv_indptr_buf = torch.empty((max_batch_size+1,), dtype=torch.int32, device=device)
self.paged_kv_indices_buf = torch.empty((max_pages,), dtype=torch.int32, device=device)
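The flashinfer float workspace buffer is now sized in proportion to max_batch_token (one MiB of uint8 per batched token slot) instead of the previous fixed 1 GiB allocation. A quick check under an assumed value:

# Illustration only: max_batch_token is an assumed value (e.g. the chunk size)
max_batch_token = 1024
workspace_bytes = max_batch_token * 1024 * 1024
print(workspace_bytes / 2**30)  # -> 1.0 GiB, the same as the old fixed allocation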


@ -6,7 +6,7 @@ import os
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.operators.linear import KTransformersLinear
from ktransformers.util.custom_gguf import GGUFLoader
from ktransformers.util.custom_loader import GGUFLoader, ModelLoader, SafeTensorLoader
from transformers.configuration_utils import PretrainedConfig
from abc import ABC, abstractmethod
@ -55,24 +55,20 @@ class KMoEGateBase(ABC):
down_type = None
for key in keys:
key = ".".join(key.split(".")[:-1])
if self.gguf_loader.safetensor_loader is not None:
targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]
weight = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_gate_inp.weight")
e_score_correction_bias = self.gguf_loader.safetensor_loader.load_tensor(key + ".exp_probs_b.bias")
weight_type = weight.dtype
e_score_correction_bias_type = e_score_correction_bias.dtype
res = {"weight": weight, "e_score_correction_bias": e_score_correction_bias, "weight_type": weight_type, "e_score_correction_bias_type": e_score_correction_bias_type}
elif key + ".ffn_gate_inp.weight" in self.gguf_loader.tensor_info:
targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]
# key = ".".join(key.split(".")[:-1])
if isinstance(self.gguf_loader, SafeTensorLoader):
res = self.gguf_loader.load_gate(key, device=device)
elif self.gguf_loader.has_tensor(key+".weight"):
# targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]
targets = [".weight", ".e_score_correction_bias"]
tensors = self.load_multi(key, targets, device=device)
weight = tensors[".ffn_gate_inp.weight"]
e_score_correction_bias = tensors[".exp_probs_b.bias"]
weight_type = self.gguf_loader.tensor_info[key + ".ffn_gate_inp.weight"]["ggml_type"]
e_score_correction_bias_type = self.gguf_loader.tensor_info[key + ".exp_probs_b.bias"]["ggml_type"]
weight = tensors[".weight"]
e_score_correction_bias = tensors[".e_score_correction_bias"]
# weight_type = self.gguf_loader.tensor_info[key + ".weight"]["ggml_type"]
res = {"weight": weight, "e_score_correction_bias": e_score_correction_bias}
else:
raise ValueError(f"Experts {key} not found in gguf_loader")
res = {"weight": weight, "e_score_correction_bias": e_score_correction_bias, "weight_type": weight_type, "e_score_correction_bias_type": e_score_correction_bias_type}
return res
def load_multi(self, key: str, keys: list[str], device: str = "cpu"):
@ -106,8 +102,6 @@ class KMoEGate(BaseInjectedModule, KMoEGateBase):
if w is None: w = self.load_weights(device=device)
if isinstance(w, dict):
self.weight_type = w["weight_type"]
self.e_score_correction_bias_type = w["e_score_correction_bias_type"]
self.orig_module.weight = nn.Parameter(w["weight"])
self.orig_module.e_score_correction_bias = nn.Parameter(w["e_score_correction_bias"])
else:
@ -175,8 +169,6 @@ class KMoEGateQwen2Moe(BaseInjectedModule, KMoEGateBase):
if w is None: w = self.load_weights(device=device)
if isinstance(w, dict):
self.weight_type = w["weight_type"]
self.e_score_correction_bias_type = w["e_score_correction_bias_type"]
self.orig_module.weight = nn.Parameter(w["weight"])
self.orig_module.e_score_correction_bias = nn.Parameter(w["e_score_correction_bias"])
else:


@ -29,7 +29,7 @@ from ktransformers.models.modeling_deepseek_v3 import DeepseekV3RMSNorm
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeRMSNorm
from ktransformers.models.modeling_qwen3_moe import Qwen3MoeRMSNorm
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.custom_gguf import GGUFLoader
from ktransformers.util.custom_loader import GGUFLoader
from flashinfer.norm import (
fused_add_rmsnorm,
rmsnorm,


@ -16,7 +16,7 @@ import torch
from torch import Tensor, nn
import KTransformersOps
import vLLMMarlin
from ktransformers.util.custom_gguf import GGUFLoader
from ktransformers.util.custom_loader import GGUFLoader, SafeTensorLoader
from ktransformers.util.utils import InferenceState
from ktransformers.ktransformers_ext.operators.custom_marlin.quantize.utils.marlin_utils import (
MarlinWorkspace,
@ -83,15 +83,15 @@ class KLinearBase(ABC):
keys = [self.key]
for key in keys:
if self.gguf_loader.safetensor_loader is not None:
if isinstance(self.gguf_loader, SafeTensorLoader):
# using safetensor_loader
tensor = self.gguf_loader.safetensor_loader.load_tensor(key+'.weight')
if key+'.weight_scale_inv' in self.gguf_loader.safetensor_loader.tensor_file_map:
weight_scale_inv = self.gguf_loader.safetensor_loader.load_tensor(key+'.weight_scale_inv')
tensor = self.gguf_loader.load_tensor(key+'.weight')
if self.gguf_loader.has_tensor(key+'.weight_scale_inv'):
weight_scale_inv = self.gguf_loader.load_tensor(key+'.weight_scale_inv')
return nn.Parameter(tensor), nn.Parameter(weight_scale_inv)
return nn.Parameter(tensor)
elif key + ".weight" in self.gguf_loader.tensor_file_map:
elif self.gguf_loader.has_tensor(key + ".weight"):
if key + ".bias" in self.gguf_loader.tensor_file_map:
tensors = self.load_multi(key, ["weight", "bias"], device=device)
tensor = tensors["weight"]
@ -760,7 +760,7 @@ class KLinearCPUInfer(KLinearBase):
self.output_gpu = torch.zeros((1, 1, self.out_features), device=self.out_device)
def load_weights(self, w: dict | nn.Parameter | tuple | None = None, device: str = "cpu"):
if self.key + ".weight" in self.gguf_loader.tensor_info:
if self.gguf_loader.has_tensor(self.key + ".weight"):
if self.key + ".bias" in self.gguf_loader.tensor_file_map:
self.weight = self.gguf_loader.get_mmap_tensor(self.key + ".weight")
self.weight_type = self.gguf_loader.tensor_info[self.key + ".weight"]["ggml_type"]


@ -1,6 +1,6 @@
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.custom_gguf import GGUFLoader
from ktransformers.util.custom_loader import GGUFLoader
from transformers import PretrainedConfig
import torch.nn as nn
from ktransformers.models.modeling_deepseek_v3 import DeepseekV3MLP


@ -58,7 +58,7 @@ from transformers.models.qwen2_moe.configuration_qwen2_moe import Qwen2MoeConfig
from ktransformers.models.configuration_llama import LlamaConfig
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.utils import InferenceState, get_compute_capability
from ktransformers.util.custom_gguf import GGUFLoader
from ktransformers.util.custom_loader import GGUFLoader
from transformers.configuration_utils import PretrainedConfig
from ktransformers.models.modeling_llama import (
LlamaDecoderLayer,


@ -12,7 +12,7 @@ from torch import nn
from transformers import AutoConfig
from transformers.configuration_utils import PretrainedConfig
# from operators import BaseInjectedModule
from ktransformers.util.custom_gguf import GGUFLoader, translate_name_to_gguf
from ktransformers.util.custom_loader import GGUFLoader, ModelLoaderFactory
from ktransformers.util.utils import set_module, load_weights
import itertools
import copy
@ -54,7 +54,7 @@ def del_meta(module:nn.Module):
def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, prefix: str="", default_device: str = "cuda:0"):
module_name = prefix[:-1]
translated_name = translate_name_to_gguf(prefix)[:-1]
# translated_name = translate_name_to_gguf(prefix)[:-1]
#print("gen_optimize_config", prefix, module_name, translated_name)
recursive = True
for rule in rule_list:
@ -76,7 +76,7 @@ def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, p
if "replace" in rule:
replace_meta = rule["replace"]
if module_name not in out_data:
out_data[module_name]={"key": translated_name,
out_data[module_name]={"key": module_name,
"class": replace_meta["class"] if "class" in replace_meta else "default",
# "device": replace_meta["device"] if "device" in replace_meta else default_device,
"kwargs": copy.deepcopy(replace_meta["kwargs"]) if "kwargs" in replace_meta else dict()}
@ -91,7 +91,7 @@ def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, p
if module_name not in out_data:
out_data[module_name]= {
"class": "default",
"key": translated_name,
"key": module_name,
"kwargs": {"generate_device": default_device,
"prefill_device": default_device}
}
@ -123,12 +123,12 @@ def optimize_and_load_gguf(module: nn.Module, rule_file: str, gguf_path: str, mo
model_config = translate_model_config(model_config)
gguf_loader=GGUFLoader(gguf_path)
weights_loader = ModelLoaderFactory.create_loader(gguf_path)
with torch.device("meta"):
inject(module, optimize_config, model_config, gguf_loader)
inject(module, optimize_config, model_config, weights_loader)
# preload lm_head because of its large intermediate result
load_weights(module.lm_head, gguf_loader, "lm_head.")
load_weights(module, gguf_loader)
module.gguf_loader = gguf_loader
load_weights(module.lm_head, weights_loader, "lm_head.")
load_weights(module, weights_loader)
module.gguf_loader = weights_loader
del_meta(module)
torch.cuda.empty_cache()
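For reference, a minimal sketch of how the new loader path is driven end to end, mirroring the change above (the model path is a placeholder and `module` is assumed to be an already-injected nn.Module):

# Minimal usage sketch (path is a placeholder; mirrors optimize_and_load_gguf above)
from ktransformers.util.custom_loader import ModelLoaderFactory
from ktransformers.util.utils import load_weights

weights_loader = ModelLoaderFactory.create_loader("/path/to/model")  # SafeTensorLoader or GGUFLoader
load_weights(module.lm_head, weights_loader, "lm_head.")             # preload lm_head first
load_weights(module, weights_loader)                                 # then the rest of the module tree
module.gguf_loader = weights_loader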


@ -0,0 +1,91 @@
- match:
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- match:
name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
generate_op: "KLinearFP8"
prefill_op: "KLinearTorch"
- match:
name: "^model\\.layers\\..*\\.mlp$"
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
replace:
class: ktransformers.operators.experts.KDeepseekV3MoEV2 # mlp module with custom forward function
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- match:
class: ktransformers.models.modeling_deepseek_v3.MoEGate
replace:
class: ktransformers.operators.gate.KMoEGate
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "^model\\.layers\\..*\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersExpertsV2 # custom MoE Kernel with expert parallelism
kwargs:
prefill_device: "cuda"
prefill_op: "KExpertsTorch"
generate_device: "cpu"
generate_op: "KExpertsCPU"
out_device: "cuda"
backend: "llamafile"
recursive: False # don't recursively inject submodules of this module
- match:
name: "^model\\.layers\\..*\\.self_attn$"
replace:
class: ktransformers.operators.balance_serve_attention.flashinfer_attn # optimized MLA implementation
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- match:
name: "^model$"
replace:
class: "ktransformers.operators.models.KDeepseekV2Model"
kwargs:
per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
- match:
name: "^model.embed_tokens"
replace:
class: "default"
kwargs:
generate_device: "cpu"
prefill_device: "cpu"
- match:
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RMSNorm
replace:
class: ktransformers.operators.layernorm.RMSNorm
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- match:
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MLP
replace:
class: ktransformers.operators.mlp.kDeepseekV3MLP
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- match:
name: "^lm_head$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
generate_op: "VLinearMarlin"
prefill_op: "KLinearTorch"


@ -128,10 +128,7 @@ class ArgumentParser:
else:
args.model_dir = self.cfg.model_dir
args.model_path = self.cfg.model_path
# set config from args
for key, value in vars(args).items():
if value is not None and hasattr(self.cfg, key):
setattr(self.cfg, key, value)
# we add the name not match args individually
self.cfg.model_device = args.device
self.cfg.mount_web = args.web
@ -140,10 +137,15 @@ class ArgumentParser:
self.cfg.user_force_think = args.force_think
model_config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=True)
if args.architectures == "Qwen3MoeForCausalLM" or args.architectures == "Qwen2MoeForCausalLM" :
if model_config.architectures[0] == "Qwen3MoeForCausalLM" or model_config.architectures[0] == "Qwen2MoeForCausalLM" :
args.gpu_memory_size = args.cache_lens*2*2*model_config.num_hidden_layers*model_config.num_key_value_heads*model_config.head_dim
args.architectures = model_config.architectures[0]
else:
args.gpu_memory_size = args.cache_lens*2*576*61
# set config from args
for key, value in vars(args).items():
if value is not None and hasattr(self.cfg, key):
setattr(self.cfg, key, value)
self.cfg.gpu_memory_size = args.gpu_memory_size
free_ports = get_free_ports(3, [args.port])
args.sched_port = free_ports[0]
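A hedged illustration of the KV-cache sizing above; cache_lens and the Qwen shape values are assumptions, while 576 and 61 are the hard-coded values from the fallback branch:

# Illustration only (assumed values except for the hard-coded 576 and 61)
cache_lens = 16384
# Qwen-style MoE branch: 2 (K and V) * 2 bytes (fp16/bf16) per element
num_hidden_layers, num_key_value_heads, head_dim = 48, 4, 128
qwen_bytes = cache_lens * 2 * 2 * num_hidden_layers * num_key_value_heads * head_dim
# Fallback branch: 576-wide compressed KV per token over 61 layers, 2 bytes per element (assumed DeepSeek-style MLA cache)
fallback_bytes = cache_lens * 2 * 576 * 61
print(qwen_bytes / 2**30, fallback_bytes / 2**30)  # approximate sizes in GiB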


@ -197,7 +197,7 @@ class Engine:
self.block_num = inference_context.k_cache[0].size(1)
#@TODO add config
if config.architectures[0] == "Qwen2MoeForCausalLM" or config.architectures[0] == "Qwen3MoeForCausalLM":
self.model.init_wrapper(self.args.use_cuda_graph, self.device, 1024 ,args.max_batch_size, self.block_num) # TODO: 1024 is a magic number(max_batch_tokens)
self.model.init_wrapper(self.args.use_cuda_graph, self.device, Config().chunk_size, args.max_batch_size, self.block_num) # max_batch_token now follows Config().chunk_size
else:
self.model.init_wrapper(self.args.use_cuda_graph, self.device, args.max_batch_size, self.block_num)


@ -200,7 +200,7 @@ class ForwardBatchInput:
device=None,
tokens: torch.Tensor = None,
num_mini_batches: int = 1,
max_seq_length: int = 1024, # TODO: add to yaml
max_seq_length: int = 4096, # TODO: add to yaml
prefill_query_length: int = (Config().chunk_size - Config().max_decode_batch_size) // Config().max_prefill_batch_size, # TODO: use config
prefill_active_length: int = (Config().chunk_size - Config().max_decode_batch_size) // Config().max_prefill_batch_size,
gen_prefill: bool = True,
@ -223,12 +223,12 @@ class ForwardBatchInput:
decode_querys_info = []
for i in range(min(decode_batch_size, cuda_lens)):
query_info = QueryInfo(i+Config().max_prefill_batch_size, prefill_query_length, max_seq_length, page_size, device, is_prefill=False, offset=offset)
query_info = QueryInfo(i+Config().max_prefill_batch_size, prefill_query_length, 256, page_size, device, is_prefill=False, offset=offset)
offset += max_seq_length // page_size
if tokens is not None:
query_info.query_tokens[prefill_active_length:prefill_active_length + 1].copy_(tokens)
if decode_active_position is None:
query_info.active_position = prefill_active_length
query_info.active_position = 255
else:
query_info.active_position = decode_active_position[i]


@ -39,6 +39,17 @@ def pad_num_tokens(num_tokens):
def deduplicate_and_sort(lst):
return sorted(set(lst))
def generate_cuda_graphs(chunk_size: int) -> list:
# assert if chunk_size does not meet the requirements
assert chunk_size <= 1024 or chunk_size % 1024 == 0, "chunk_size must be <= 1024 or a multiple of 1024"
base_list = [1, 2, 3, Config().max_batch_size, 64, 256, 512, chunk_size]
if chunk_size <= 1024:
return base_list
multiples = [i for i in range(1024, chunk_size + 1, 1024)]
return deduplicate_and_sort(base_list + multiples)
class ModelRunner:
"""A CudaGraphRunner runs the forward pass of a model with CUDA graph and torch.compile."""
@ -56,7 +67,7 @@ class ModelRunner:
self.features_buf = None
self.output = None
self.graph_memory_pool = None
self.cuda_graphs = deduplicate_and_sort([1, 2, 3, Config().max_batch_size, 64, Config().chunk_size])
self.cuda_graphs = generate_cuda_graphs(Config().chunk_size)
self.use_cuda_graph = use_cuda_graph
self.model_time = 0
self.page_size = page_size


@ -7,7 +7,7 @@ sys.path.append(current_path+"/../..")
import numpy as np
# from ktransformers.operators.linear import KTransformersLinear, KLinearMarlin
# from ktransformers.operators.experts import KTransformersExperts, KExpertsTorch
from ktransformers.util.custom_gguf import GGUFLoader
from ktransformers.util.custom_loader import GGUFLoader
import torch
import KTransformersOps
torch.set_default_dtype(torch.bfloat16)


@ -9,7 +9,7 @@ from pycuda.compiler import SourceModule
import numpy as np
from ktransformers.operators.linear import KTransformersLinear, KLinearMarlin
from ktransformers.operators.experts import KTransformersExperts, KExpertsTorch
from ktransformers.util.custom_gguf import GGUFLoader, dequantize_q4_k_gpu, dequantize_q4_k
from ktransformers.util.custom_loader import GGUFLoader, dequantize_q4_k_gpu, dequantize_q4_k
import torch
import KTransformersOps
torch.set_default_dtype(torch.bfloat16)


@ -159,5 +159,7 @@ if __name__ == "__main__":
prompt = ktansformer_prompt1024
elif args.prompt_lens == 2048:
prompt = ktansformer_prompt1024 * 2
elif args.prompt_lens == 4096:
prompt = ktansformer_prompt1024 * 4
asyncio.run(main(args.concurrent, prompt, max_tokens, model))


@ -25,7 +25,6 @@ import os
from enum import IntEnum
import torch
import KTransformersOps
from .custom_loader import SafeTensorLoader
import ctypes
import math
@ -166,238 +165,6 @@ DATA_TYPES = {
"FP8": 13,
}
class GGUFLoader:
tensor_info: dict
gguf_path: str
tensor_file_map: dict # {tensor_name: tensor_file_path}
gguf_file_meta: dict
safetensor_loader: SafeTensorLoader
def __init__(self, gguf_path: str):
# Check dir exist
if not os.path.exists(gguf_path):
raise FileNotFoundError(f"GGUF dir not found: {gguf_path}")
if os.path.isfile(gguf_path):
gguf_path = os.path.dirname(gguf_path)
self.safetensor_loader = None
self.tensor_info = {}
self.gguf_path = gguf_path
self.tensor_file_map = {}
self.file_data_map = {}
self.gguf_file_meta = {}
self.tensor_device_map = {}
# I know this is ugly, but I don't want to change the original code too much
# TODO: merge gguf load and other loads.
safetensor_loader = SafeTensorLoader(gguf_path)
if safetensor_loader.tensor_file_map:
self.safetensor_loader = safetensor_loader
return
# Walk through all the .gguf files in the directory
found_gguf = False
for root, dirs, files in os.walk(gguf_path):
for file in files:
if file.endswith(".gguf"):
found_gguf = True
file_name = os.path.join(root, file)
with open(file_name, "rb") as f:
self.load_gguf(f)
if file_name not in self.file_data_map:
self.file_data_map[file_name] = np.memmap(file_name, mode = 'r')
if not found_gguf:
raise FileNotFoundError(f"Cannot find any .gguf files in: {gguf_path}")
def load_gguf(self, f):
f.seek(0)
assert f.read(4) == b'GGUF'
values = struct.unpack("<IQQ", f.read(4+8+8))
version, n_tensors, n_kv = values
if version != 3:
warnings.warn(f"Version {version} has never been tested, might not work")
info = {}
for _ in range(n_kv):
name = read_value(f, DATA_TYPES["string"])
data_type = struct.unpack("<I", f.read(4))[0]
info[name] = read_value(f, data_type)
tensor_info = {}
for _ in range(n_tensors):
name = read_value(f, DATA_TYPES["string"])
shape_len = read_value(f, DATA_TYPES["uint32"])
shape = [read_value(f, DATA_TYPES["uint64"]) for _ in range(shape_len)]
ggml_type = read_value(f, DATA_TYPES["uint32"])
bad_offset = read_value(f, DATA_TYPES["uint64"])
n_elems = int(math.prod(shape))
block_size, type_size = GGML_QUANT_SIZES[ggml_type]
n_bytes = n_elems * type_size // block_size
np_dims = tuple(reversed(shape))
item_type: npt.DTypeLike
if ggml_type == GGMLQuantizationType.F16:
item_count = n_elems
item_type = np.float16
elif ggml_type == GGMLQuantizationType.F32:
item_count = n_elems
item_type = np.float32
elif ggml_type == GGMLQuantizationType.F64:
item_count = n_elems
item_type = np.float64
elif ggml_type == GGMLQuantizationType.I8:
item_count = n_elems
item_type = np.int8
elif ggml_type == GGMLQuantizationType.I16:
item_count = n_elems
item_type = np.int16
elif ggml_type == GGMLQuantizationType.I32:
item_count = n_elems
item_type = np.int32
elif ggml_type == GGMLQuantizationType.I64:
item_count = n_elems
item_type = np.int64
else:
item_count = n_bytes
item_type = np.uint8
np_dims = quant_shape_to_byte_shape(np_dims, ggml_type)
tensor_info[name] = {
"ggml_type": ggml_type,
"shape": shape,
"bad_offset": bad_offset,
"item_type": item_type,
"item_count": item_count,
"np_dims": np_dims
}
start = f.tell()
# Alignment is 32 by default.
# https://github.com/ggerganov/ggml/blob/e1daebbf9d38d510ba456c4d50b4500a73ac2b14/docs/gguf.md?plain=1#L253
alignment = info.get("general.alignment", 32)
# Inconveniently, the offset defined in gguf files is relative to the
# end of the header and is unaligned.
# We need to compute the absolute file offset ourselves instead.
for t in tensor_info.values():
offset = start + t["bad_offset"]
offset += (alignment - offset % alignment) % alignment
t["offset"] = offset
for name in tensor_info:
self.tensor_file_map[name] = f.name
self.tensor_info.update(tensor_info)
self.gguf_file_meta.update(info)
def get_mmap_tensor(self, name):
t = self.tensor_info[name]
mmap_data = self.file_data_map[ self.tensor_file_map[name] ]
offset = t["offset"]
item_type = t["item_type"]
item_count = t["item_count"]
itemsize = int(np.empty([], dtype = item_type).itemsize)
return mmap_data[offset : offset + itemsize * item_count]
def get_undequanted_tensor_and_ggml_type(self, name):
t = self.tensor_info[name]
data = self.get_mmap_tensor(name)
ggml_type = t["ggml_type"]
data = torch.from_numpy(data)
return data, ggml_type
def load_expert_tensor(self, name, data, expert_id, elements_per_expert, device = "cuda", target_dtype = torch.get_default_dtype())->torch.Tensor:
t = self.tensor_info[name]
if device.lower() == "cpu":
print(f"loading expert {expert_id} of {name} with CPU")
shape = t["shape"]
ggml_type = t["ggml_type"]
if ggml_type not in GGML_NAMES:
raise NotImplementedError(f"ggml_type {ggml_type} not implemented")
ggml_name = GGML_NAMES[ggml_type]
# TODO: experts may fused in quant block, split it
assert elements_per_expert % GGML_ELEMENTS_PER_BLOCK[ggml_name] == 0, "experts may fused in quant block, please use CPU dequant"
blocks_per_experts = elements_per_expert // GGML_ELEMENTS_PER_BLOCK[ggml_name]
block_size = GGML_BLOCK_SIZES[ggml_name]
offset = expert_id * block_size * blocks_per_experts
data = data[offset: offset + block_size * blocks_per_experts]
if "cuda" in device.lower():
values = GGML_DEQUANTIZE_GPU[ggml_name](data, device, target_dtype)
else:
values = GGML_DEQUANTIZE[ggml_name](data)
values = torch.from_numpy(values.copy())
if ggml_name == "BF16":
values = values.view(torch.bfloat16)
values = values.view(shape[-2::-1])
return values
def load_gguf_tensor(self, name: str, device:str = "cpu", target_dtype = None)->torch.Tensor:
t = self.tensor_info[name]
if device.lower() == "cpu":
print(f"loading {name} with CPU")
if target_dtype == None:
target_dtype = torch.get_default_dtype()
shape = t["shape"]
ggml_type = t["ggml_type"]
if ggml_type not in GGML_NAMES:
raise NotImplementedError(f"ggml_type {ggml_type} not implemented")
ggml_name = GGML_NAMES[ggml_type]
data = self.get_mmap_tensor(name)
block_size = GGML_BLOCK_SIZES[ggml_name]
elements_per_block = GGML_ELEMENTS_PER_BLOCK[ggml_name]
num_elements = int(np.prod(shape))
num_blocks = num_elements // elements_per_block
blocks_per_iter = 16384
if num_blocks > blocks_per_iter: # dequant large tensor
values = torch.empty((num_blocks, elements_per_block), dtype=target_dtype, device=device)
for i in range( (num_blocks + blocks_per_iter - 1) // blocks_per_iter):
blocks_begin = i * blocks_per_iter
blocks_end = min(blocks_begin + blocks_per_iter, num_blocks)
if "cuda" in device.lower():
cur_values = GGML_DEQUANTIZE_GPU[ggml_name](data[blocks_begin*block_size : blocks_end*block_size], device, target_dtype)
else:
cur_values = GGML_DEQUANTIZE[ggml_name](data[blocks_begin*block_size : blocks_end*block_size])
cur_values = torch.from_numpy(cur_values.copy())
cur_values = cur_values.view(-1, elements_per_block)
if ggml_name == "BF16":
cur_values = cur_values.view(torch.bfloat16)
values[blocks_begin : blocks_end] = cur_values
else:
if "cuda" in device.lower():
values = GGML_DEQUANTIZE_GPU[ggml_name](data, device)
else:
values = GGML_DEQUANTIZE[ggml_name](data)
values = torch.from_numpy(values)
if ggml_name == "BF16":
values = values.view(torch.bfloat16)
values = values.view(shape[::-1])
if "attn_q" in name and self.gguf_file_meta['general.architecture'] in ["llama"]:
n_head = self.gguf_file_meta['llama.attention.head_count']
values = (values.reshape(n_head, values.shape[0] // n_head // 2, 2, *values.shape[1:])
.swapaxes(1, 2)
.reshape(values.shape))
elif "attn_k" in name and self.gguf_file_meta['general.architecture'] in ["llama"]:
n_head = self.gguf_file_meta['llama.attention.head_count_kv']
values = (values.reshape(n_head, values.shape[0] // n_head // 2, 2, *values.shape[1:])
.swapaxes(1, 2)
.reshape(values.shape))
return values
def read_value(f, data_type):
if data_type == DATA_TYPES["string"]:
@ -921,6 +688,7 @@ def translate_name_to_gguf(name):
name = name.replace(".gate_up_proj.", ".up_proj")
name = name.replace(".mlp.shared_experts.down_proj", ".ffn_down_shexp")
name = name.replace(".mlp.gate.e_score_correction_bias", ".exp_probs_b.bias")
name = name.replace(".mlp.gate", ".ffn_gate_inp")
name = name.replace(".mlp.shared_experts.gate_proj", ".ffn_gate_shexp")
name = name.replace(".mlp.shared_experts.up_proj", ".ffn_up_shexp")


@ -10,12 +10,35 @@ import torch
import KTransformersOps
from safetensors import safe_open
from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant
from ktransformers.util.custom_gguf import *
from safetensors.torch import save_file
from abc import ABC, abstractmethod
from typing import Dict, Any, Optional, Union
class SafeTensorLoader:
class ModelLoader(ABC):
"""
Abstract base class for model loaders.
Defines the interface that all model loaders must implement.
"""
tensor_file_map = {}
tensor_type_map = {}
file_handle_map = {}
@abstractmethod
def has_tensor(self, name: str):
"""
Check if the tensor exists in the loader.
Args:
name: Name of the tensor to check
Returns:
bool: True if the tensor exists, False otherwise
"""
pass
class SafeTensorLoader(ModelLoader):
tensor_file_map: dict
tensor_type_map: dict
file_handle_map: dict
tensor_device_map: dict
def __init__(self, file_path: str):
self.__load_tensor_file_map(file_path)
@ -28,6 +51,10 @@ class SafeTensorLoader:
folder_path = os.path.dirname(file_path)
else:
folder_path = file_path
self.file_handle_map = {}
self.tensor_file_map = {}
self.tensor_type_map = {}
self.tensor_device_map = {}
found_safetensor = False
for root, _, files in os.walk(folder_path):
@ -57,7 +84,11 @@ class SafeTensorLoader:
# raise FileNotFoundError(f"No Safetensor files found in {folder_path}")
def load_tensor(self, key: str, device: str="cpu"):
if key not in self.tensor_file_map:
if translate_name_to_gguf(key) in self.tensor_file_map:
key = translate_name_to_gguf(key)
elif key in self.tensor_file_map:
pass
else:
raise KeyError(f"Key {key} not found in Safetensor files")
file = self.tensor_file_map[key]
f = self.file_handle_map.get(file)
@ -66,13 +97,145 @@ class SafeTensorLoader:
tensor = f.get_tensor(key)
return tensor.to(device)
def load_experts(self, key: str, device: str="cpu"):
'''
Load experts from safetensor
key: the name of the experts
device: the device to load the experts to
return: dict,
{up: tensor, down: tensor, gate: tensor, up_type: int, down_type: int, gate_type: int}
{xxx}_type: the ggml type of the corresponding tensor
'''
if self.has_tensor(translate_name_to_gguf(key)+".ffn_gate_exps.weight"):
# legacy branch for loading hybrid model
base_key = translate_name_to_gguf(key)
# Load experts from safetensor
gate_key = f"{base_key}.ffn_gate_exps.weight"
gate_type_key = f"{base_key}.ffn_gate_exps.ggml_type"
up_key = f"{base_key}.ffn_up_exps.weight"
up_type_key = f"{base_key}.ffn_up_exps.ggml_type"
down_key = f"{base_key}.ffn_down_exps.weight"
down_type_key = f"{base_key}.ffn_down_exps.ggml_type"
gate_tensor = self.load_tensor(gate_key, device).numpy()
up_tensor = self.load_tensor(up_key, device).numpy()
down_tensor = self.load_tensor(down_key, device).numpy()
gate_type = self.load_tensor(gate_type_key, device).item()
up_type = self.load_tensor(up_type_key, device).item()
down_type = self.load_tensor(down_type_key, device).item()
return {
"up": up_tensor,
"gate": gate_tensor,
"down": down_tensor,
"up_type": up_type,
"gate_type": gate_type,
"down_type": down_type
}
else:
# Load experts from safetensor
base_key = key # e.g. "model.layers.3.mlp.experts"
experts_count = 0
# First, count how many experts we have by checking for expert 0's up_proj
while self.has_tensor(f"{base_key}.{experts_count}.up_proj.weight"):
experts_count += 1
if experts_count == 0:
raise ValueError(f"No experts found for key {base_key}")
# Initialize empty lists to store tensors for each projection type
up_projs = []
gate_projs = []
down_projs = []
# Load all expert weights
for expert_id in range(experts_count):
up_key = f"{base_key}.{expert_id}.up_proj.weight"
gate_key = f"{base_key}.{expert_id}.gate_proj.weight"
down_key = f"{base_key}.{expert_id}.down_proj.weight"
up_tensor = self.load_tensor(up_key, device)
gate_tensor = self.load_tensor(gate_key, device)
down_tensor = self.load_tensor(down_key, device)
up_projs.append(up_tensor)
gate_projs.append(gate_tensor)
down_projs.append(down_tensor)
# Stack the tensors along a new dimension
up_tensor = torch.stack(up_projs, dim=0)
gate_tensor = torch.stack(gate_projs, dim=0)
down_tensor = torch.stack(down_projs, dim=0)
# Get original dtype for GGML type determination
orig_up_dtype = up_tensor.dtype
orig_gate_dtype = gate_tensor.dtype
orig_down_dtype = down_tensor.dtype
# Convert to numpy with proper bfloat16 support
up_numpy = up_tensor.view(torch.uint16).numpy()
gate_numpy = gate_tensor.view(torch.uint16).numpy()
down_numpy = down_tensor.view(torch.uint16).numpy()
# Determine tensor data types for GGML conversion
def get_ggml_type(dtype):
if dtype == torch.float32:
return GGMLQuantizationType.F32
elif dtype == torch.float16:
return GGMLQuantizationType.F16
elif dtype == torch.bfloat16:
return GGMLQuantizationType.BF16
else:
raise ValueError(f"Unsupported tensor dtype: {dtype}")
return {
"up": up_numpy,
"gate": gate_numpy,
"down": down_numpy,
"up_type": get_ggml_type(orig_up_dtype),
"gate_type": get_ggml_type(orig_gate_dtype),
"down_type": get_ggml_type(orig_down_dtype)
}
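# Usage sketch (illustration only; the path and layer index are hypothetical):
#   loader = SafeTensorLoader("/path/to/safetensors/dir")
#   experts = loader.load_experts("model.layers.3.mlp.experts")
#   experts["up"].shape   # (num_experts, *per-expert up_proj shape), stacked along dim 0
#   experts["up_type"]    # e.g. GGMLQuantizationType.BF16 for a bf16 checkpoint
# Note: 16-bit weights are returned as uint16-viewed numpy arrays so the CPU MoE backend
# can reinterpret them; the *_type fields carry the corresponding GGML types.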
def load_gate(self, key: str, device: str="cpu"):
'''
Load gate from safetensor
key: the name of the gate
device: the device to load the gate to
return: dict,
{'weight': tensor, 'e_score_correction_bias': tensor}
'''
target = ["weight", "e_score_correction_bias"]
res = {'weight': None, 'e_score_correction_bias': None}
if self.has_tensor(translate_name_to_gguf(key)+".ffn_gate_exps.weight"):
# legacy branch for loading hybrid model
base_key = key
for k in target:
translated_key = translate_name_to_gguf(f"{base_key}.{k}")
if self.has_tensor(translated_key):
tensor = self.load_tensor(translated_key, device)
res[k] = tensor
else:
# Load gate from safetensor
base_key = key
for k in target:
if self.has_tensor(f"{base_key}.{k}"):
tensor = self.load_tensor(f"{base_key}.{k}", device)
res[k] = tensor
return res
def close_all_handles(self):
for handle in self.file_handle_map.values():
handle.close()
self.file_handle_map.clear()
def load_dequantized_tensor(self, key:str, device: str="cpu"):
if key not in self.tensor_file_map:
if key in self.tensor_file_map:
pass
elif translate_name_to_gguf(key) in self.tensor_file_map:
key = translate_name_to_gguf(key)
else:
raise KeyError(f"Key {key} not found in Safetensor files")
file = self.tensor_file_map[key]
f = self.file_handle_map.get(file)
@ -84,3 +247,314 @@ class SafeTensorLoader:
weight_scale_inv = f.get_tensor(key[:-7] + ".weight_scale_inv").to(device)
tensor = weight_dequant(tensor, weight_scale_inv)
return tensor.to(device)
def has_tensor(self, name: str):
return name in self.tensor_file_map or translate_name_to_gguf(name) in self.tensor_file_map
class GGUFLoader(ModelLoader):
tensor_info: dict
gguf_path: str
tensor_file_map: dict # {tensor_name: tensor_file_path}
gguf_file_meta: dict
safetensor_loader: SafeTensorLoader
def __init__(self, gguf_path: str):
# Check dir exist
if not os.path.exists(gguf_path):
raise FileNotFoundError(f"GGUF dir not found: {gguf_path}")
if os.path.isfile(gguf_path):
gguf_path = os.path.dirname(gguf_path)
self.safetensor_loader = None
self.tensor_info = {}
self.gguf_path = gguf_path
self.tensor_file_map = {}
self.file_data_map = {}
self.gguf_file_meta = {}
self.tensor_device_map = {}
# Walk through all the .gguf files in the directory
found_gguf = False
for root, dirs, files in os.walk(gguf_path):
for file in files:
if file.endswith(".gguf"):
found_gguf = True
file_name = os.path.join(root, file)
with open(file_name, "rb") as f:
self.load_gguf(f)
if file_name not in self.file_data_map:
self.file_data_map[file_name] = np.memmap(file_name, mode = 'r')
if not found_gguf:
raise FileNotFoundError(f"Cannot find any .gguf files in: {gguf_path}")
def load_gguf(self, f):
f.seek(0)
assert f.read(4) == b'GGUF'
values = struct.unpack("<IQQ", f.read(4+8+8))
version, n_tensors, n_kv = values
if version != 3:
warnings.warn(f"Version {version} has never been tested, might not work")
info = {}
for _ in range(n_kv):
name = read_value(f, DATA_TYPES["string"])
data_type = struct.unpack("<I", f.read(4))[0]
info[name] = read_value(f, data_type)
tensor_info = {}
for _ in range(n_tensors):
name = read_value(f, DATA_TYPES["string"])
shape_len = read_value(f, DATA_TYPES["uint32"])
shape = [read_value(f, DATA_TYPES["uint64"]) for _ in range(shape_len)]
ggml_type = read_value(f, DATA_TYPES["uint32"])
bad_offset = read_value(f, DATA_TYPES["uint64"])
n_elems = int(math.prod(shape))
block_size, type_size = GGML_QUANT_SIZES[ggml_type]
n_bytes = n_elems * type_size // block_size
np_dims = tuple(reversed(shape))
item_type: npt.DTypeLike
if ggml_type == GGMLQuantizationType.F16:
item_count = n_elems
item_type = np.float16
elif ggml_type == GGMLQuantizationType.F32:
item_count = n_elems
item_type = np.float32
elif ggml_type == GGMLQuantizationType.F64:
item_count = n_elems
item_type = np.float64
elif ggml_type == GGMLQuantizationType.I8:
item_count = n_elems
item_type = np.int8
elif ggml_type == GGMLQuantizationType.I16:
item_count = n_elems
item_type = np.int16
elif ggml_type == GGMLQuantizationType.I32:
item_count = n_elems
item_type = np.int32
elif ggml_type == GGMLQuantizationType.I64:
item_count = n_elems
item_type = np.int64
else:
item_count = n_bytes
item_type = np.uint8
np_dims = quant_shape_to_byte_shape(np_dims, ggml_type)
tensor_info[name] = {
"ggml_type": ggml_type,
"shape": shape,
"bad_offset": bad_offset,
"item_type": item_type,
"item_count": item_count,
"np_dims": np_dims
}
start = f.tell()
# Alignment is 32 by default.
# https://github.com/ggerganov/ggml/blob/e1daebbf9d38d510ba456c4d50b4500a73ac2b14/docs/gguf.md?plain=1#L253
alignment = info.get("general.alignment", 32)
# Inconveniently, the offset defined in gguf files is relative to the
# end of the header and is unaligned.
# We need to compute the absolute file offset ourselves instead.
for t in tensor_info.values():
offset = start + t["bad_offset"]
offset += (alignment - offset % alignment) % alignment
t["offset"] = offset
for name in tensor_info:
self.tensor_file_map[name] = f.name
self.tensor_info.update(tensor_info)
self.gguf_file_meta.update(info)
def get_mmap_tensor(self, name):
name = translate_name_to_gguf(name)
t = self.tensor_info[name]
mmap_data = self.file_data_map[ self.tensor_file_map[name] ]
offset = t["offset"]
item_type = t["item_type"]
item_count = t["item_count"]
itemsize = int(np.empty([], dtype = item_type).itemsize)
return mmap_data[offset : offset + itemsize * item_count]
def get_undequanted_tensor_and_ggml_type(self, name):
name = translate_name_to_gguf(name)
t = self.tensor_info[name]
data = self.get_mmap_tensor(name)
ggml_type = t["ggml_type"]
data = torch.from_numpy(data)
return data, ggml_type
def load_expert_tensor(self, name, data, expert_id, elements_per_expert, device = "cuda", target_dtype = torch.get_default_dtype())->torch.Tensor:
name = translate_name_to_gguf(name)
t = self.tensor_info[name]
shape = t["shape"]
ggml_type = t["ggml_type"]
if ggml_type not in GGML_NAMES:
raise NotImplementedError(f"ggml_type {ggml_type} not implemented")
ggml_name = GGML_NAMES[ggml_type]
# TODO: experts may fused in quant block, split it
assert elements_per_expert % GGML_ELEMENTS_PER_BLOCK[ggml_name] == 0, "experts may be fused in a quant block, please use CPU dequant"
blocks_per_experts = elements_per_expert // GGML_ELEMENTS_PER_BLOCK[ggml_name]
block_size = GGML_BLOCK_SIZES[ggml_name]
offset = expert_id * block_size * blocks_per_experts
data = data[offset: offset + block_size * blocks_per_experts]
if "cuda" in device.lower():
values = GGML_DEQUANTIZE_GPU[ggml_name](data, device, target_dtype)
else:
values = GGML_DEQUANTIZE[ggml_name](data)
values = torch.from_numpy(values.copy())
if ggml_name == "BF16":
values = values.view(torch.bfloat16)
values = values.view(shape[-2::-1])
return values
def load_gguf_tensor(self, name: str, device:str = "cpu", target_dtype = None)->torch.Tensor:
name = translate_name_to_gguf(name)
t = self.tensor_info[name]
if target_dtype == None:
target_dtype = torch.get_default_dtype()
shape = t["shape"]
ggml_type = t["ggml_type"]
if ggml_type not in GGML_NAMES:
raise NotImplementedError(f"ggml_type {ggml_type} not implemented")
ggml_name = GGML_NAMES[ggml_type]
data = self.get_mmap_tensor(name)
block_size = GGML_BLOCK_SIZES[ggml_name]
elements_per_block = GGML_ELEMENTS_PER_BLOCK[ggml_name]
num_elements = int(np.prod(shape))
num_blocks = num_elements // elements_per_block
blocks_per_iter = 16384
if num_blocks > blocks_per_iter: # dequant large tensor
values = torch.empty((num_blocks, elements_per_block), dtype=target_dtype, device=device)
for i in range( (num_blocks + blocks_per_iter - 1) // blocks_per_iter):
blocks_begin = i * blocks_per_iter
blocks_end = min(blocks_begin + blocks_per_iter, num_blocks)
if "cuda" in device.lower():
cur_values = GGML_DEQUANTIZE_GPU[ggml_name](data[blocks_begin*block_size : blocks_end*block_size], device, target_dtype)
else:
cur_values = GGML_DEQUANTIZE[ggml_name](data[blocks_begin*block_size : blocks_end*block_size])
cur_values = torch.from_numpy(cur_values.copy())
cur_values = cur_values.view(-1, elements_per_block)
if ggml_name == "BF16":
cur_values = cur_values.view(torch.bfloat16)
values[blocks_begin : blocks_end] = cur_values
else:
if "cuda" in device.lower():
values = GGML_DEQUANTIZE_GPU[ggml_name](data, device)
else:
values = GGML_DEQUANTIZE[ggml_name](data)
values = torch.from_numpy(values)
if ggml_name == "BF16":
values = values.view(torch.bfloat16)
values = values.view(shape[::-1])
if "attn_q" in name and self.gguf_file_meta['general.architecture'] in ["llama"]:
n_head = self.gguf_file_meta['llama.attention.head_count']
values = (values.reshape(n_head, values.shape[0] // n_head // 2, 2, *values.shape[1:])
.swapaxes(1, 2)
.reshape(values.shape))
elif "attn_k" in name and self.gguf_file_meta['general.architecture'] in ["llama"]:
n_head = self.gguf_file_meta['llama.attention.head_count_kv']
values = (values.reshape(n_head, values.shape[0] // n_head // 2, 2, *values.shape[1:])
.swapaxes(1, 2)
.reshape(values.shape))
return values
def has_tensor(self, name: str):
name = translate_name_to_gguf(name)
return name in self.tensor_info
def get_ggml_type(self, name: str):
name = translate_name_to_gguf(name)
if name not in self.tensor_info:
raise KeyError(f"Key {name} not found in GGUF files")
return self.tensor_info[name]["ggml_type"]
class ModelLoaderFactory:
"""
Factory class for creating model loaders.
Automatically detects the model format based on file extensions in the directory.
"""
@staticmethod
def create_loader(path: str):
"""
Create a model loader for the given path by detecting the model format.
The function checks for the presence of .safetensors or .gguf files
in the specified path and creates the appropriate loader.
Args:
path: Path to the model directory or file
Returns:
An appropriate ModelLoader instance (SafeTensorLoader or GGUFLoader)
Raises:
FileNotFoundError: If no supported model files are found in the path
"""
if not os.path.exists(path):
raise FileNotFoundError(f"Path not found: {path}")
# Normalize to directory path if a file was provided
if os.path.isfile(path):
if path.endswith(".safetensors"):
return SafeTensorLoader(path)
elif path.endswith(".gguf"):
return GGUFLoader(path)
else:
folder_path = os.path.dirname(path)
else:
folder_path = path
# Check for safetensors files
has_safetensors = False
has_gguf = False
for root, _, files in os.walk(folder_path):
for file in files:
if file.endswith(".safetensors"):
has_safetensors = True
break
elif file.endswith(".gguf"):
has_gguf = True
break
if has_safetensors or has_gguf:
break
# Create the appropriate loader based on detected file types
# Prioritize SafeTensor over GGUF if both are present
if has_safetensors:
try:
return SafeTensorLoader(folder_path)
except Exception as e:
print(f"Failed to create SafeTensorLoader: {e}")
# Fall through to try GGUF if SafeTensor fails
if not has_gguf:
raise
if has_gguf:
try:
return GGUFLoader(folder_path)
except Exception as e:
print(f"Failed to create GGUFLoader: {e}")
raise
# No supported model files found
raise FileNotFoundError(f"No .safetensors or .gguf files found in: {folder_path}")


@ -22,8 +22,7 @@ from transformers import (
EtaLogitsWarper,
)
from ktransformers.util.custom_gguf import translate_name_to_gguf
from ktransformers.util.custom_gguf import GGUFLoader
from ktransformers.util.custom_loader import ModelLoaderFactory, ModelLoader, SafeTensorLoader, GGUFLoader, translate_name_to_gguf
from ktransformers.operators import base_operator
from ktransformers.models.custom_cache import StaticCache
from ktransformers.util.cuda_graph_runner import CUDAGraphRunner
@ -98,25 +97,24 @@ def get_all_used_cuda_device(device_map:dict):
all_device_list = list(all_device_list)
return all_device_list
def load_cur_state_dict(module: nn.Module, gguf_loader: GGUFLoader, prefix: str = ""):
def load_cur_state_dict(module: nn.Module, gguf_loader: ModelLoader, prefix: str = ""):
prefix = prefix.replace("orig_module.", "")
persistent_buffers = {k: v for k, v in module._buffers.items() if k not in module._non_persistent_buffers_set}
local_name_params = itertools.chain(module._parameters.items(), persistent_buffers.items())
local_state = {k: v for k, v in local_name_params if v is not None}
for name, param in local_state.items():
key = prefix + name
translated_key = translate_name_to_gguf(key)
translated_key = key
# TODO: Merge all loaders.
# I know this is ugly but let's do it for now.
if gguf_loader.safetensor_loader is not None:
load_dequantized_tensor = gguf_loader.safetensor_loader.load_dequantized_tensor
tensor_file_map = gguf_loader.safetensor_loader.tensor_file_map
if isinstance(gguf_loader, SafeTensorLoader):
load_dequantized_tensor = gguf_loader.load_dequantized_tensor
else:
load_dequantized_tensor = gguf_loader.load_gguf_tensor
tensor_file_map = gguf_loader.tensor_file_map
if translated_key in tensor_file_map:
if gguf_loader.has_tensor(translated_key):
target_dtype = torch.get_default_dtype()
device = get_device(translated_key[:translated_key.rfind(".")], gguf_loader.tensor_device_map)
print(f"loading {translated_key} to {device}")
@ -128,7 +126,7 @@ def load_cur_state_dict(module: nn.Module, gguf_loader: GGUFLoader, prefix: str
#print(load_config.tensor_file_map.keys())
raise Exception(f"can't find {translated_key} in GGUF file!")
def load_weights(module:nn.Module, gguf_loader:GGUFLoader, prefix=''):
def load_weights(module:nn.Module, gguf_loader:ModelLoader, prefix=''):
#print(f"recursively loading weights {prefix}")
if not isinstance(module, base_operator.BaseInjectedModule):
load_cur_state_dict(module, gguf_loader, prefix)


@ -0,0 +1,367 @@
from abc import ABC, abstractmethod
import os
import torch
import numpy as np
from safetensors import safe_open
from typing import Dict, Any, Optional, Union
class ModelLoader(ABC):
"""
Abstract base class for model loaders.
Defines the interface that all model loaders must implement.
"""
@abstractmethod
def load_tensor(self, name: str, device: str = "cpu") -> torch.Tensor:
"""
Load a tensor by name.
Args:
name: Name of the tensor to load
device: Device to load the tensor to
Returns:
The loaded tensor
"""
pass
@classmethod
@abstractmethod
def supports_format(cls, path: str) -> bool:
"""
Check if this loader supports the given path format.
Args:
path: Path to check
Returns:
True if this loader supports the given path, False otherwise
"""
pass
class SafeTensorLoader(ModelLoader):
"""
Loader for SafeTensor format models.
"""
def __init__(self, path: str):
"""
Initialize the SafeTensor loader.
Args:
path: Path to the model directory or file
"""
self.tensor_file_map = {} # Maps tensor names to file paths
self.file_handle_map = {} # Maps file names to file handles
self._load_tensor_file_map(path)
def _load_tensor_file_map(self, path: str) -> None:
"""
Load the tensor file map from the given path.
Args:
path: Path to the model directory or file
"""
# Normalize path to directory
if not os.path.exists(path):
raise FileNotFoundError(f"Path not found: {path}")
if os.path.isfile(path):
folder_path = os.path.dirname(path)
else:
folder_path = path
found_safetensor = False
for root, _, files in os.walk(folder_path):
files = sorted(files)
for file in files:
if file.endswith(".safetensors"):
found_safetensor = True
file_path = os.path.join(root, file)
if file not in self.file_handle_map:
try:
handle = safe_open(file_path, framework="pt")
self.file_handle_map[file] = handle
except Exception as e:
print(f"Error opening Safetensor file {file_path}: {e}")
continue
f = self.file_handle_map.get(file)
if f is None:
continue
try:
for key in f.keys():
self.tensor_file_map[key] = file
except Exception as e:
print(f"Error reading Safetensor file {file_path}: {e}")
if not found_safetensor:
# Not raising an error here allows for the factory to try other loaders
print(f"No Safetensor files found in {folder_path}")
def load_tensor(self, name: str, device: str = "cpu") -> torch.Tensor:
"""
Load a tensor by name.
Args:
name: Name of the tensor to load
device: Device to load the tensor to
Returns:
The loaded tensor
"""
if name not in self.tensor_file_map:
raise KeyError(f"Key {name} not found in Safetensor files")
file = self.tensor_file_map[name]
f = self.file_handle_map.get(file)
if f is None:
raise FileNotFoundError(f"File {file} not found in Safetensor files")
tensor = f.get_tensor(name)
return tensor.to(device)
def load_dequantized_tensor(self, name: str, device: str = "cpu") -> torch.Tensor:
"""
Load and dequantize a tensor.
Args:
name: Name of the tensor to load
device: Device to load the tensor to
Returns:
The dequantized tensor
"""
if name not in self.tensor_file_map:
raise KeyError(f"Key {name} not found in Safetensor files")
file = self.tensor_file_map[name]
f = self.file_handle_map.get(file)
if f is None:
raise FileNotFoundError(f"File {file} not found in Safetensor files")
tensor = f.get_tensor(name).to(device)
if name.endswith(".weight"):
if name[:-7] + ".weight_scale_inv" in self.tensor_file_map:
weight_scale_inv = f.get_tensor(name[:-7] + ".weight_scale_inv").to(device)
# Assuming weight_dequant function is imported
from ktransformers.ktransformers_ext.triton.fp8gemm import weight_dequant
tensor = weight_dequant(tensor, weight_scale_inv)
return tensor.to(device)
def close_all_handles(self) -> None:
"""
Close all file handles.
"""
for handle in self.file_handle_map.values():
handle.close()
self.file_handle_map.clear()
@classmethod
def supports_format(cls, path: str) -> bool:
"""
Check if this loader supports the given path format.
Args:
path: Path to check
Returns:
True if safetensor files are found in the path, False otherwise
"""
# Normalize path to directory
if not os.path.exists(path):
return False
if os.path.isfile(path):
if path.endswith(".safetensors"):
return True
folder_path = os.path.dirname(path)
else:
folder_path = path
# Check if any safetensor files exist in the folder
for root, _, files in os.walk(folder_path):
for file in files:
if file.endswith(".safetensors"):
return True
return False
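# GGUF binary layout that _load_gguf below walks (format version 3):
#   magic "GGUF" | version:uint32 | n_tensors:uint64 | n_kv:uint64
#   n_kv metadata pairs:    key(string), value_type(uint32), value
#   n_tensors tensor infos: name(string), n_dims(uint32), dims(uint64 x n_dims),
#                           ggml_type(uint32), offset(uint64)
#   tensor data follows, padded to general.alignment (default 32); each stored
#   offset is relative to the start of this data section.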
class GGUFLoader(ModelLoader):
"""
Loader for GGUF format models.
"""
def __init__(self, path: str):
"""
Initialize the GGUF loader.
Args:
path: Path to the model directory or file
"""
# Check if path exists
if not os.path.exists(path):
raise FileNotFoundError(f"GGUF dir not found: {path}")
if os.path.isfile(path):
self.gguf_path = os.path.dirname(path)
else:
self.gguf_path = path
self.tensor_info = {} # Stores tensor metadata
self.tensor_file_map = {} # Maps tensor names to file paths
self.file_data_map = {} # Maps file paths to memory-mapped data
self.gguf_file_meta = {} # Stores GGUF metadata
# For compatibility with the factory pattern
self.safetensor_loader = None
# Scan all GGUF files in the directory
found_gguf = False
for root, _, files in os.walk(self.gguf_path):
for file in files:
if file.endswith(".gguf"):
found_gguf = True
file_path = os.path.join(root, file)
with open(file_path, "rb") as f:
self._load_gguf(f)
if file_path not in self.file_data_map:
self.file_data_map[file_path] = np.memmap(file_path, mode='r')
if not found_gguf:
raise FileNotFoundError(f"Cannot find any .gguf files in: {self.gguf_path}")
def _load_gguf(self, f) -> None:
"""
Load GGUF file metadata and tensor info.
Args:
f: File handle of the GGUF file
"""
# Implementation should follow the original GGUFLoader._load_gguf
# This is a simplified version for illustration
f.seek(0)
assert f.read(4) == b'GGUF'
# Read header
values = struct.unpack("<IQQ", f.read(4+8+8))
version, n_tensors, n_kv = values
if version != 3:
warnings.warn(f"Version {version} has never been tested, might not work")
# Read key-value pairs
info = {}
for _ in range(n_kv):
name = self._read_value(f, 8) # DATA_TYPES["string"]
data_type = struct.unpack("<I", f.read(4))[0]
info[name] = self._read_value(f, data_type)
# Read tensor info
tensor_info = {}
for _ in range(n_tensors):
name = self._read_value(f, 8) # DATA_TYPES["string"]
shape_len = self._read_value(f, 4) # DATA_TYPES["uint32"]
shape = [self._read_value(f, 10) for _ in range(shape_len)] # DATA_TYPES["uint64"]
ggml_type = self._read_value(f, 4) # DATA_TYPES["uint32"]
offset = self._read_value(f, 10) # DATA_TYPES["uint64"]
# Additional tensor metadata would be calculated here
# For brevity, we're omitting the detailed tensor metadata calculation
tensor_info[name] = {
"ggml_type": ggml_type,
"shape": shape,
"offset": offset,
# ... other tensor metadata
}
start = f.tell()
alignment = info.get("general.alignment", 32)
# Calculate actual file offsets
for t in tensor_info.values():
offset = start + t["offset"]
offset += (alignment - offset % alignment) % alignment
t["offset"] = offset
# Update file maps
for name in tensor_info:
self.tensor_file_map[name] = f.name
self.tensor_info.update(tensor_info)
self.gguf_file_meta.update(info)
def _read_value(self, f, data_type) -> Any:
"""
Read a value from the file according to its data type.
Args:
f: File handle
data_type: Type of data to read
Returns:
The read value
"""
# Simplified implementation
# In a complete implementation, this would handle all data types
if data_type == 8: # DATA_TYPES["string"]
length = struct.unpack("<Q", f.read(8))[0]
return f.read(length).decode("utf-8")
elif data_type == 4: # DATA_TYPES["uint32"]
return struct.unpack("<I", f.read(4))[0]
elif data_type == 10: # DATA_TYPES["uint64"]
return struct.unpack("<Q", f.read(8))[0]
# ... handling for other data types
return None
def load_tensor(self, name: str, device: str = "cpu") -> torch.Tensor:
"""
Load a tensor by name.
Args:
name: Name of the tensor to load
device: Device to load the tensor to
Returns:
The loaded tensor
"""
# This should call load_gguf_tensor with the appropriate parameters
return self.load_gguf_tensor(name, device)
def load_gguf_tensor(self, name: str, device: str = "cpu", target_dtype = None) -> torch.Tensor:
"""
Load a GGUF tensor by name.
Args:
name: Name of the tensor to load
device: Device to load the tensor to
target_dtype: Target data type for the tensor
Returns:
The loaded tensor
"""
# Implementation would follow the original GGUFLoader.load_gguf_tensor
# This is a placeholder for illustration
if name not in self.tensor_info:
raise KeyError(f"Tensor {name} not found")
# Actual implementation would dequantize the tensor data
# and return a torch.Tensor
return torch.zeros(1, device=device) # Placeholder
@classmethod
def supports_format(cls, path: str) -> bool:
"""
Check if this loader supports the given path format.
Args:
path: Path to check
Returns:
True if GGUF files are found in the path, False otherwise
"""
# Normalize path to directory
if not os.path.exists(path):
return False
if os.path.isfile(path):
return path.endswith(".gguf")
# Check if any GGUF files exist in the folder
for root, _, files in os.walk(path):
for file in files:
if file.endswith(".gguf"):
return True
return False
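One note on the interface: load_cur_state_dict above calls gguf_loader.has_tensor(...), which does not appear in this excerpt of custom_loader.py; the full module is expected to provide it on each loader. A hedged reconstruction of what such a membership check amounts to, based on the maps the two classes keep (not the repository's exact code):

    # Reconstruction for illustration; the real has_tensor implementations may differ.
    from ktransformers.util.custom_loader import ModelLoader, SafeTensorLoader, GGUFLoader

    def has_tensor(loader: ModelLoader, name: str) -> bool:
        if isinstance(loader, SafeTensorLoader):
            return name in loader.tensor_file_map   # safetensor keys are stored verbatim
        if isinstance(loader, GGUFLoader):
            return name in loader.tensor_info       # GGUF tensors are indexed by GGUF names
        return False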

View file

@ -6,7 +6,7 @@ import sys
# sys.path.insert(0, "/home/azure/ktransformers")
import argparse
import torch
from ktransformers.util.custom_gguf import GGUFLoader, translate_name_to_gguf
from ktransformers.util.custom_loader import GGUFLoader, translate_name_to_gguf
from safetensors import safe_open
from safetensors.torch import save_file
import re
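For conversion scripts like this one, only the import path changes; call sites stay the same. A small usage sketch with placeholder paths and names (translate_name_to_gguf maps Hugging Face parameter names to their GGUF equivalents):

    # Placeholder path and tensor name; shown only to exercise the relocated imports.
    from ktransformers.util.custom_loader import GGUFLoader, translate_name_to_gguf

    loader = GGUFLoader("/path/to/gguf-dir")   # directory containing *.gguf shards
    gguf_name = translate_name_to_gguf("model.layers.0.self_attn.q_proj.weight")
    tensor = loader.load_gguf_tensor(gguf_name, device="cpu")
    print(gguf_name, tuple(tensor.shape), tensor.dtype)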