diff --git a/csrc/ktransformers_ext/cuda/test_dequant.py b/csrc/ktransformers_ext/cuda/test_dequant.py index abca745..c39d6c7 100644 --- a/csrc/ktransformers_ext/cuda/test_dequant.py +++ b/csrc/ktransformers_ext/cuda/test_dequant.py @@ -1,7 +1,7 @@ import os import sys sys.path.insert(0,"/home/zbx/ktransformers") -from ktransformers.util.custom_gguf import GGUFLoader +from ktransformers.util.custom_loader import GGUFLoader import torch gguf_loader_1 = GGUFLoader("/mnt/data/model/DeepseekV3-q4km-gguf") diff --git a/install-with-cache.sh b/install-with-cache.sh new file mode 100755 index 0000000..cef4341 --- /dev/null +++ b/install-with-cache.sh @@ -0,0 +1,26 @@ +#!/bin/bash +set -e + +# clear build dirs +# rm -rf build +# rm -rf *.egg-info +# rm -rf csrc/build +# rm -rf csrc/ktransformers_ext/build +# rm -rf csrc/ktransformers_ext/cuda/build +# rm -rf csrc/ktransformers_ext/cuda/dist +# rm -rf csrc/ktransformers_ext/cuda/*.egg-info +rm -rf ~/.ktransformers +echo "Installing python dependencies from requirements.txt" +pip install -r requirements-local_chat.txt +pip install -r ktransformers/server/requirements.txt +echo "Installing ktransformers" +KTRANSFORMERS_FORCE_BUILD=TRUE USE_BALANCE_SERVE=1 pip install -v . --no-build-isolation +pip install third_party/custom_flashinfer/ -v + +# SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])") +# echo "Copying thirdparty libs to $SITE_PACKAGES" +# cp -a csrc/balance_serve/build/third_party/prometheus-cpp/lib/libprometheus-cpp-*.so* $SITE_PACKAGES/ +# patchelf --set-rpath '$ORIGIN' $SITE_PACKAGES/sched_ext.cpython* + + +echo "Installation completed successfully" diff --git a/ktransformers/models/custom_cache.py b/ktransformers/models/custom_cache.py index e4a271e..c2901ac 100644 --- a/ktransformers/models/custom_cache.py +++ b/ktransformers/models/custom_cache.py @@ -66,7 +66,7 @@ class StaticCache(transformers.StaticCache): self.page_table_list = [] for idx in range(config.num_hidden_layers): if isinstance(device, dict): - target_device = device[f"blk.{idx}.self_attn"]["generate_device"] + target_device = device[f"model.layers.{idx}.self_attn"]["generate_device"] else: target_device = device @@ -91,7 +91,7 @@ class StaticCache(transformers.StaticCache): # Note: `mark_static_address` is used to tag the cache as an fixed data pointer, preventing cuda graph # breaks when updating the cache. 
if isinstance(device, dict): - target_device = device[f"blk.{idx}.self_attn"]["generate_device"] + target_device = device[f"model.layers.{idx}.self_attn"]["generate_device"] else: target_device = device diff --git a/ktransformers/models/custom_modeling_qwen2_moe.py b/ktransformers/models/custom_modeling_qwen2_moe.py index 5740c14..1c84cbf 100644 --- a/ktransformers/models/custom_modeling_qwen2_moe.py +++ b/ktransformers/models/custom_modeling_qwen2_moe.py @@ -39,7 +39,7 @@ class KQwen2MoeForCausalLM(Qwen2MoePreTrainedModel): self.cache = cache self.vocab_size = config.vocab_size self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - self.attn = [None] * 10 + self.attn = [None] * 100 def init_wrapper(self, use_cuda_graph, device, max_batch_token, max_batch_size, max_pages, cuda_graph_idx = 0): self.attn[cuda_graph_idx] = flashInferAttn(use_cuda_graph=use_cuda_graph, max_batch_token=max_batch_token, max_batch_size=max_batch_size, max_pages=max_pages, device=device) diff --git a/ktransformers/models/custom_modeling_qwen3_moe.py b/ktransformers/models/custom_modeling_qwen3_moe.py index 1cb8c46..32b9797 100644 --- a/ktransformers/models/custom_modeling_qwen3_moe.py +++ b/ktransformers/models/custom_modeling_qwen3_moe.py @@ -39,7 +39,7 @@ class KQwen3MoeForCausalLM(Qwen3MoePreTrainedModel): self.cache = cache self.vocab_size = config.vocab_size self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - self.attn = [None] * 10 + self.attn = [None] * 100 def init_wrapper(self, use_cuda_graph, device, max_batch_token, max_batch_size, max_pages, cuda_graph_idx = 0): self.attn[cuda_graph_idx] = flashInferAttn(use_cuda_graph=use_cuda_graph, max_batch_token=max_batch_token, max_batch_size=max_batch_size, max_pages=max_pages, device=device) diff --git a/ktransformers/operators/RoPE.py b/ktransformers/operators/RoPE.py index 75d1a6e..85d6556 100644 --- a/ktransformers/operators/RoPE.py +++ b/ktransformers/operators/RoPE.py @@ -23,7 +23,7 @@ from ktransformers.models.modeling_deepseek import ( yarn_find_correction_range ) from ktransformers.operators.base_operator import BaseInjectedModule -from ktransformers.util.custom_gguf import GGUFLoader +from ktransformers.util.custom_loader import GGUFLoader from ktransformers.util.utils import InferenceState from transformers.configuration_utils import PretrainedConfig import torch diff --git a/ktransformers/operators/attention.py b/ktransformers/operators/attention.py index b5b7c13..caceb98 100644 --- a/ktransformers/operators/attention.py +++ b/ktransformers/operators/attention.py @@ -15,7 +15,7 @@ from ktransformers.models.modeling_llama import LlamaRotaryEmbedding from ktransformers.models.modeling_deepseek import DeepseekV2Attention, apply_rotary_pos_emb from typing import Optional, Tuple from ktransformers.operators.base_operator import BaseInjectedModule -from ktransformers.util.custom_gguf import GGUFLoader +from ktransformers.util.custom_loader import GGUFLoader from ktransformers.util.utils import get_compute_capability import logging from transformers.configuration_utils import PretrainedConfig diff --git a/ktransformers/operators/balance_serve_attention.py b/ktransformers/operators/balance_serve_attention.py index a785413..c7e9328 100644 --- a/ktransformers/operators/balance_serve_attention.py +++ b/ktransformers/operators/balance_serve_attention.py @@ -11,7 +11,7 @@ from ktransformers.models.modeling_qwen2_moe import Qwen2MoeAttention from ktransformers.models.modeling_qwen3_moe import Qwen3MoeAttention from 
typing import Optional, Tuple from ktransformers.operators.base_operator import BaseInjectedModule -from ktransformers.util.custom_gguf import GGUFLoader +from ktransformers.util.custom_loader import GGUFLoader import logging from transformers.configuration_utils import PretrainedConfig from flashinfer import BatchMLAPagedAttentionWrapper diff --git a/ktransformers/operators/base_operator.py b/ktransformers/operators/base_operator.py index 0fa2efd..5e49709 100644 --- a/ktransformers/operators/base_operator.py +++ b/ktransformers/operators/base_operator.py @@ -6,7 +6,7 @@ Copyright (c) 2024 by KVCache.AI, All Rights Reserved. ''' from typing import Any from torch import nn, Tensor -from ktransformers.util.custom_gguf import GGUFLoader +from ktransformers.util.custom_loader import GGUFLoader from transformers.configuration_utils import PretrainedConfig import ktransformers.util.utils as utils class BaseInjectedModule(nn.Module): diff --git a/ktransformers/operators/experts.py b/ktransformers/operators/experts.py index 34f0af0..279439d 100644 --- a/ktransformers/operators/experts.py +++ b/ktransformers/operators/experts.py @@ -26,7 +26,8 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext import cpuinfer_ext from cpuinfer_ext.moe import MOEConfig, MOE import ctypes -from ktransformers.util.custom_gguf import GGMLQuantizationType, GGUFLoader +from ktransformers.util.custom_gguf import GGMLQuantizationType +from ktransformers.util.custom_loader import GGUFLoader, SafeTensorLoader, ModelLoader from ktransformers.util.utils import InferenceState from ktransformers.server.config.config import Config from transformers.activations import ACT2FN @@ -39,8 +40,18 @@ from ktransformers.operators.cpuinfer import CPUInfer def deduplicate_and_sort(lst): return sorted(set(lst)) +def generate_cuda_graphs(chunk_size: int) -> list: + assert chunk_size <= 1024 or chunk_size % 1024 == 0, "chunk_size must <= 1024 or a multiple of 1024" + base_list = [1, 2, 3, Config().max_batch_size, 64, 256, 512, chunk_size] + + if chunk_size <= 1024: + return base_list + + multiples = [i for i in range(1024, chunk_size + 1, 1024)] + + return deduplicate_and_sort(base_list + multiples) #cuda_graphs = [Config().chunk_size] -cuda_graphs = deduplicate_and_sort([1, 2, 3, Config().max_batch_size, 64, Config().chunk_size]) +cuda_graphs = generate_cuda_graphs(Config().chunk_size) # class Base(BaseInjectedModule, ABC): class KExpertsBase(ABC): def __init__(self, key: str, gguf_loader: GGUFLoader, config: PretrainedConfig, orig_module: nn.Module, device: str = "cuda", **kwargs): @@ -77,7 +88,7 @@ class KExpertsBase(ABC): down_type = None for key in keys: - if key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info: + if self.gguf_loader.has_tensor(key + ".ffn_gate_exps.weight"): targets = [".ffn_gate_exps.weight", ".ffn_up_exps.weight", ".ffn_down_exps.weight" ] tensors = self.load_multi(key, targets, device=device) gate = tensors[".ffn_gate_exps.weight"] @@ -86,7 +97,7 @@ class KExpertsBase(ABC): gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate_exps.weight"]["ggml_type"] up_type = self.gguf_loader.tensor_info[key + ".ffn_up_exps.weight"]["ggml_type"] down_type = self.gguf_loader.tensor_info[key + ".ffn_down_exps.weight"]["ggml_type"] - elif key + ".ffn_down.0.weight" in self.gguf_loader.tensor_info: + elif self.gguf_loader.has_tensor(key + ".ffn_down.0.weight"): # for supporting Mixtral-8x7B-Instuct gate = [] up = [] @@ -194,7 +205,7 @@ class KExpertsCPU(KExpertsBase): 
self.config.num_experts_per_tok, self.config.hidden_size, self.config.moe_intermediate_size, - 25600, + max(cuda_graphs), gate_ptr, up_ptr, down_ptr, @@ -212,7 +223,7 @@ class KExpertsCPU(KExpertsBase): self.config.num_experts_per_tok, self.config.hidden_size, self.config.moe_intermediate_size, - 25600, + max(cuda_graphs), gate_ptr, up_ptr, down_ptr, @@ -325,14 +336,19 @@ class KExpertsCPU(KExpertsBase): down_type = None for key in keys: - if self.gguf_loader.safetensor_loader is not None: - # using a temp ugly way to temprary load the tensor - gate = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_gate_exps.weight").numpy() - up = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_up_exps.weight").numpy() - down = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_down_exps.weight").numpy() - gate_type = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_gate_exps.ggml_type").item() - up_type = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_up_exps.ggml_type").item() - down_type = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_down_exps.ggml_type").item() + if isinstance(self.gguf_loader, SafeTensorLoader): + res = self.gguf_loader.load_experts(key) + return {key: res} + elif self.gguf_loader.has_tensor(key + ".ffn_gate_exps.weight"): + gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight") + up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight") + down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight") + # gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate_exps.weight"]["ggml_type"] + # up_type = self.gguf_loader.tensor_info[key + ".ffn_up_exps.weight"]["ggml_type"] + # down_type = self.gguf_loader.tensor_info[key + ".ffn_down_exps.weight"]["ggml_type"] + gate_type = self.gguf_loader.get_ggml_type(key + ".ffn_gate_exps.weight") + up_type = self.gguf_loader.get_ggml_type(key + ".ffn_up_exps.weight") + down_type = self.gguf_loader.get_ggml_type(key + ".ffn_down_exps.weight") elif key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info: gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight") @@ -356,9 +372,9 @@ class KExpertsCPU(KExpertsBase): gate = np.stack(gate) up = np.stack(up) down = np.stack(down) - gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate.0.weight"]["ggml_type"] - up_type = self.gguf_loader.tensor_info[key + ".ffn_up.0.weight"]["ggml_type"] - down_type = self.gguf_loader.tensor_info[key + ".ffn_down.0.weight"]["ggml_type"] + gate_type = self.gguf_loader.get_ggml_type(key + ".ffn_gate.0.weight") + up_type = self.gguf_loader.get_ggml_type(key + ".ffn_up.0.weight") + down_type = self.gguf_loader.get_ggml_type(key + ".ffn_down.0.weight") else: raise ValueError(f"Experts {key} not found in gguf_loader") res = {key:{"gate": gate, "up": up, "down": down, "gate_type": gate_type, "up_type": up_type, "down_type": down_type}} @@ -445,7 +461,7 @@ class KExpertsMarlin(KExpertsBase): down = None for key in keys: - if key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info: + if self.gguf_loader.has_tensor(key + ".ffn_gate_exps.weight"): gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight") up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight") down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight") diff --git a/ktransformers/operators/flashinfer_batch_prefill_wrapper.py b/ktransformers/operators/flashinfer_batch_prefill_wrapper.py index e934654..287affb 100644 --- 
a/ktransformers/operators/flashinfer_batch_prefill_wrapper.py +++ b/ktransformers/operators/flashinfer_batch_prefill_wrapper.py @@ -40,7 +40,7 @@ class flashInferAttn(): self.kv_layout = kv_layout self.use_cuda_graph = use_cuda_graph if flashInferAttn.float_workspace_buffer is None: - flashInferAttn.float_workspace_buffer = torch.empty(1024 * 1024 * 1024, dtype=torch.uint8, device=device) + flashInferAttn.float_workspace_buffer = torch.empty(max_batch_token * 1024 * 1024, dtype=torch.uint8, device=device) self.qo_indptr_buf = torch.empty((max_batch_size+1,), dtype=torch.int32, device=device) self.paged_kv_indptr_buf = torch.empty((max_batch_size+1,), dtype=torch.int32, device=device) self.paged_kv_indices_buf = torch.empty((max_pages,), dtype=torch.int32, device=device) diff --git a/ktransformers/operators/gate.py b/ktransformers/operators/gate.py index cf5799e..a6f95ac 100644 --- a/ktransformers/operators/gate.py +++ b/ktransformers/operators/gate.py @@ -6,7 +6,7 @@ import os from ktransformers.operators.base_operator import BaseInjectedModule from ktransformers.operators.base_operator import BaseInjectedModule from ktransformers.operators.linear import KTransformersLinear -from ktransformers.util.custom_gguf import GGUFLoader +from ktransformers.util.custom_loader import GGUFLoader, ModelLoader, SafeTensorLoader from transformers.configuration_utils import PretrainedConfig from abc import ABC, abstractmethod @@ -55,24 +55,20 @@ class KMoEGateBase(ABC): down_type = None for key in keys: - key = ".".join(key.split(".")[:-1]) - if self.gguf_loader.safetensor_loader is not None: - targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"] - weight = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_gate_inp.weight") - e_score_correction_bias = self.gguf_loader.safetensor_loader.load_tensor(key + ".exp_probs_b.bias") - weight_type = weight.dtype - e_score_correction_bias_type = e_score_correction_bias.dtype - res = {"weight": weight, "e_score_correction_bias": e_score_correction_bias, "weight_type": weight_type, "e_score_correction_bias_type": e_score_correction_bias_type} - elif key + ".ffn_gate_inp.weight" in self.gguf_loader.tensor_info: - targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"] + # key = ".".join(key.split(".")[:-1]) + if isinstance(self.gguf_loader, SafeTensorLoader): + res = self.gguf_loader.load_gate(key, device=device) + elif self.gguf_loader.has_tensor(key+".weight"): + # targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"] + targets = [".weight", ".e_score_correction_bias"] tensors = self.load_multi(key, targets, device=device) - weight = tensors[".ffn_gate_inp.weight"] - e_score_correction_bias = tensors[".exp_probs_b.bias"] - weight_type = self.gguf_loader.tensor_info[key + ".ffn_gate_inp.weight"]["ggml_type"] - e_score_correction_bias_type = self.gguf_loader.tensor_info[key + ".exp_probs_b.bias"]["ggml_type"] + weight = tensors[".weight"] + e_score_correction_bias = tensors[".e_score_correction_bias"] + # weight_type = self.gguf_loader.tensor_info[key + ".weight"]["ggml_type"] + res = {"weight": weight, "e_score_correction_bias": e_score_correction_bias} else: raise ValueError(f"Experts {key} not found in gguf_loader") - res = {"weight": weight, "e_score_correction_bias": e_score_correction_bias, "weight_type": weight_type, "e_score_correction_bias_type": e_score_correction_bias_type} + return res def load_multi(self, key: str, keys: list[str], device: str = "cpu"): @@ -106,8 +102,6 @@ class KMoEGate(BaseInjectedModule, KMoEGateBase): if w is None: w = 
self.load_weights(device=device) if isinstance(w, dict): - self.weight_type = w["weight_type"] - self.e_score_correction_bias_type = w["e_score_correction_bias_type"] self.orig_module.weight = nn.Parameter(w["weight"]) self.orig_module.e_score_correction_bias = nn.Parameter(w["e_score_correction_bias"]) else: @@ -175,8 +169,6 @@ class KMoEGateQwen2Moe(BaseInjectedModule, KMoEGateBase): if w is None: w = self.load_weights(device=device) if isinstance(w, dict): - self.weight_type = w["weight_type"] - self.e_score_correction_bias_type = w["e_score_correction_bias_type"] self.orig_module.weight = nn.Parameter(w["weight"]) self.orig_module.e_score_correction_bias = nn.Parameter(w["e_score_correction_bias"]) else: diff --git a/ktransformers/operators/layernorm.py b/ktransformers/operators/layernorm.py index 62c5cba..06c569b 100644 --- a/ktransformers/operators/layernorm.py +++ b/ktransformers/operators/layernorm.py @@ -29,7 +29,7 @@ from ktransformers.models.modeling_deepseek_v3 import DeepseekV3RMSNorm from ktransformers.models.modeling_qwen2_moe import Qwen2MoeRMSNorm from ktransformers.models.modeling_qwen3_moe import Qwen3MoeRMSNorm from ktransformers.operators.base_operator import BaseInjectedModule -from ktransformers.util.custom_gguf import GGUFLoader +from ktransformers.util.custom_loader import GGUFLoader from flashinfer.norm import ( fused_add_rmsnorm, rmsnorm, diff --git a/ktransformers/operators/linear.py b/ktransformers/operators/linear.py index 293826e..2b12b15 100644 --- a/ktransformers/operators/linear.py +++ b/ktransformers/operators/linear.py @@ -16,7 +16,7 @@ import torch from torch import Tensor, nn import KTransformersOps import vLLMMarlin -from ktransformers.util.custom_gguf import GGUFLoader +from ktransformers.util.custom_loader import GGUFLoader, SafeTensorLoader from ktransformers.util.utils import InferenceState from ktransformers.ktransformers_ext.operators.custom_marlin.quantize.utils.marlin_utils import ( MarlinWorkspace, @@ -83,15 +83,15 @@ class KLinearBase(ABC): keys = [self.key] for key in keys: - if self.gguf_loader.safetensor_loader is not None: + if isinstance(self.gguf_loader, SafeTensorLoader): # using safetensor_loader - tensor = self.gguf_loader.safetensor_loader.load_tensor(key+'.weight') - if key+'.weight_scale_inv' in self.gguf_loader.safetensor_loader.tensor_file_map: - weight_scale_inv = self.gguf_loader.safetensor_loader.load_tensor(key+'.weight_scale_inv') + tensor = self.gguf_loader.load_tensor(key+'.weight') + if self.gguf_loader.has_tensor(key+'.weight_scale_inv'): + weight_scale_inv = self.gguf_loader.load_tensor(key+'.weight_scale_inv') return nn.Parameter(tensor), nn.Parameter(weight_scale_inv) return nn.Parameter(tensor) - elif key + ".weight" in self.gguf_loader.tensor_file_map: + elif self.gguf_loader.has_tensor(key + ".weight"): if key + ".bias" in self.gguf_loader.tensor_file_map: tensors = self.load_multi(key, ["weight", "bias"], device=device) tensor = tensors["weight"] @@ -760,7 +760,7 @@ class KLinearCPUInfer(KLinearBase): self.output_gpu = torch.zeros((1, 1, self.out_features), device=self.out_device) def load_weights(self, w: dict | nn.Parameter | tuple | None = None, device: str = "cpu"): - if self.key + ".weight" in self.gguf_loader.tensor_info: + if self.gguf_loader.has_tensor(self.key + ".weight"): if self.key + ".bias" in self.gguf_loader.tensor_file_map: self.weight = self.gguf_loader.get_mmap_tensor(self.key + ".weight") self.weight_type = self.gguf_loader.tensor_info[self.key + ".weight"]["ggml_type"] diff --git 
a/ktransformers/operators/mlp.py b/ktransformers/operators/mlp.py index 02648b1..77d7d05 100644 --- a/ktransformers/operators/mlp.py +++ b/ktransformers/operators/mlp.py @@ -1,6 +1,6 @@ from ktransformers.operators.base_operator import BaseInjectedModule -from ktransformers.util.custom_gguf import GGUFLoader +from ktransformers.util.custom_loader import GGUFLoader from transformers import PretrainedConfig import torch.nn as nn from ktransformers.models.modeling_deepseek_v3 import DeepseekV3MLP diff --git a/ktransformers/operators/models.py b/ktransformers/operators/models.py index e495543..4aa223d 100644 --- a/ktransformers/operators/models.py +++ b/ktransformers/operators/models.py @@ -58,7 +58,7 @@ from transformers.models.qwen2_moe.configuration_qwen2_moe import Qwen2MoeConfig from ktransformers.models.configuration_llama import LlamaConfig from ktransformers.operators.base_operator import BaseInjectedModule from ktransformers.util.utils import InferenceState, get_compute_capability -from ktransformers.util.custom_gguf import GGUFLoader +from ktransformers.util.custom_loader import GGUFLoader from transformers.configuration_utils import PretrainedConfig from ktransformers.models.modeling_llama import ( LlamaDecoderLayer, diff --git a/ktransformers/optimize/optimize.py b/ktransformers/optimize/optimize.py index 331e6cf..72c8407 100644 --- a/ktransformers/optimize/optimize.py +++ b/ktransformers/optimize/optimize.py @@ -12,7 +12,7 @@ from torch import nn from transformers import AutoConfig from transformers.configuration_utils import PretrainedConfig # from operators import BaseInjectedModule -from ktransformers.util.custom_gguf import GGUFLoader, translate_name_to_gguf +from ktransformers.util.custom_loader import GGUFLoader, ModelLoaderFactory from ktransformers.util.utils import set_module, load_weights import itertools import copy @@ -54,7 +54,7 @@ def del_meta(module:nn.Module): def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, prefix: str="", default_device: str = "cuda:0"): module_name = prefix[:-1] - translated_name = translate_name_to_gguf(prefix)[:-1] + # translated_name = translate_name_to_gguf(prefix)[:-1] #print("gen_optimize_config", prefix, module_name, translated_name) recursive = True for rule in rule_list: @@ -76,7 +76,7 @@ def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, p if "replace" in rule: replace_meta = rule["replace"] if module_name not in out_data: - out_data[module_name]={"key": translated_name, + out_data[module_name]={"key": module_name, "class": replace_meta["class"] if "class" in replace_meta else "default", # "device": replace_meta["device"] if "device" in replace_meta else default_device, "kwargs": copy.deepcopy(replace_meta["kwargs"]) if "kwargs" in replace_meta else dict()} @@ -91,7 +91,7 @@ def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, p if module_name not in out_data: out_data[module_name]= { "class": "default", - "key": translated_name, + "key": module_name, "kwargs": {"generate_device": default_device, "prefill_device": default_device} } @@ -123,12 +123,12 @@ def optimize_and_load_gguf(module: nn.Module, rule_file: str, gguf_path: str, mo model_config = translate_model_config(model_config) - gguf_loader=GGUFLoader(gguf_path) + weights_loader = ModelLoaderFactory.create_loader(gguf_path) with torch.device("meta"): - inject(module, optimize_config, model_config, gguf_loader) + inject(module, optimize_config, model_config, weights_loader) # pre load lm_head 
because its big inter result
-    load_weights(module.lm_head, gguf_loader, "lm_head.")
-    load_weights(module, gguf_loader)
-    module.gguf_loader = gguf_loader
+    load_weights(module.lm_head, weights_loader, "lm_head.")
+    load_weights(module, weights_loader)
+    module.gguf_loader = weights_loader
     del_meta(module)
     torch.cuda.empty_cache()
diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-fp8-linear-ggml-experts-serve-amx.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-fp8-linear-ggml-experts-serve-amx.yaml
new file mode 100644
index 0000000..670f6d5
--- /dev/null
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-fp8-linear-ggml-experts-serve-amx.yaml
@@ -0,0 +1,91 @@
+- match:
+    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
+  replace:
+    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+- match:
+    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression
+    class: torch.nn.Linear # only match modules matching name and class simultaneously
+  replace:
+    class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+      generate_op: "KLinearFP8"
+      prefill_op: "KLinearTorch"
+- match:
+    name: "^model\\.layers\\..*\\.mlp$"
+    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
+  replace:
+    class: ktransformers.operators.experts.KDeepseekV3MoEV2 # mlp module with custom forward function
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+- match:
+    class: ktransformers.models.modeling_deepseek_v3.MoEGate
+  replace:
+    class: ktransformers.operators.gate.KMoEGate
+    kwargs:
+      generate_device: "cuda:0"
+      prefill_device: "cuda:0"
+- match:
+    name: "^model\\.layers\\..*\\.mlp\\.experts$"
+  replace:
+    class: ktransformers.operators.experts.KTransformersExpertsV2 # custom MoE Kernel with expert parallelism
+    kwargs:
+      prefill_device: "cuda"
+      prefill_op: "KExpertsTorch"
+      generate_device: "cpu"
+      generate_op: "KExpertsCPU"
+      out_device: "cuda"
+      backend: "llamafile"
+  recursive: False # don't recursively inject submodules of this module
+- match:
+    name: "^model\\.layers\\..*\\.self_attn$"
+  replace:
+    class: ktransformers.operators.balance_serve_attention.flashinfer_attn # optimized MLA implementation
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+- match:
+    name: "^model$"
+  replace:
+    class: "ktransformers.operators.models.KDeepseekV2Model"
+    kwargs:
+      per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
+- match:
+    name: "^model.embed_tokens"
+  replace:
+    class: "default"
+    kwargs:
+      generate_device: "cpu"
+      prefill_device: "cpu"
+
+- match:
+    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RMSNorm
+  replace:
+    class: ktransformers.operators.layernorm.RMSNorm
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+
+- match:
+    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MLP
+  replace:
+    class: ktransformers.operators.mlp.kDeepseekV3MLP
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+
+- match:
+    name: "^lm_head$" # regular expression
+    class: torch.nn.Linear # only match modules matching name and class simultaneously
+  replace:
+    class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+      generate_op: "VLinearMarlin"
+      prefill_op: "KLinearTorch"
\ No newline at end of file
diff --git a/ktransformers/server/args.py b/ktransformers/server/args.py
index 1210e14..748bd47 100644
--- a/ktransformers/server/args.py
+++ b/ktransformers/server/args.py
@@ -128,10 +128,7 @@ class ArgumentParser:
         else:
             args.model_dir = self.cfg.model_dir
             args.model_path = self.cfg.model_path
-        # set config from args
-        for key, value in vars(args).items():
-            if value is not None and hasattr(self.cfg, key):
-                setattr(self.cfg, key, value)
+        # set the config fields whose names do not match the args individually
         self.cfg.model_device = args.device
         self.cfg.mount_web = args.web
 
@@ -140,10 +137,15 @@ class ArgumentParser:
         self.cfg.user_force_think = args.force_think
 
         model_config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=True)
-        if args.architectures == "Qwen3MoeForCausalLM" or args.architectures == "Qwen2MoeForCausalLM" :
+        if model_config.architectures[0] == "Qwen3MoeForCausalLM" or model_config.architectures[0] == "Qwen2MoeForCausalLM" :
             args.gpu_memory_size = args.cache_lens*2*2*model_config.num_hidden_layers*model_config.num_key_value_heads*model_config.head_dim
+            args.architectures = model_config.architectures[0]
         else:
             args.gpu_memory_size = args.cache_lens*2*576*61
+        # set config from args
+        for key, value in vars(args).items():
+            if value is not None and hasattr(self.cfg, key):
+                setattr(self.cfg, key, value)
         self.cfg.gpu_memory_size = args.gpu_memory_size
         free_ports = get_free_ports(3, [args.port])
         args.sched_port = free_ports[0]
diff --git a/ktransformers/server/backend/interfaces/balance_serve.py b/ktransformers/server/backend/interfaces/balance_serve.py
index 2d89332..7c0ae01 100644
--- a/ktransformers/server/backend/interfaces/balance_serve.py
+++ b/ktransformers/server/backend/interfaces/balance_serve.py
@@ -197,7 +197,7 @@ class Engine:
 
         self.block_num = inference_context.k_cache[0].size(1)
         #@TODO add config
         if config.architectures[0] == "Qwen2MoeForCausalLM" or config.architectures[0] == "Qwen3MoeForCausalLM":
-            self.model.init_wrapper(self.args.use_cuda_graph, self.device, 1024 ,args.max_batch_size, self.block_num) # TODO: 1024 is a magic number(max_batch_tokens)
+            self.model.init_wrapper(self.args.use_cuda_graph, self.device, Config().chunk_size, args.max_batch_size, self.block_num) # TODO: 1024 is a magic number(max_batch_tokens)
         else:
             self.model.init_wrapper(self.args.use_cuda_graph, self.device, args.max_batch_size, self.block_num)
diff --git a/ktransformers/server/balance_serve/inference/forward_batch.py b/ktransformers/server/balance_serve/inference/forward_batch.py
index 7022d9e..26b4d3d 100644
--- a/ktransformers/server/balance_serve/inference/forward_batch.py
+++ b/ktransformers/server/balance_serve/inference/forward_batch.py
@@ -200,7 +200,7 @@ class ForwardBatchInput:
                 device=None,
                 tokens: torch.Tensor = None,
                 num_mini_batches: int = 1,
-                max_seq_length: int = 1024, # TODO: add to yaml
+                max_seq_length: int = 4096, # TODO: add to yaml
                 prefill_query_length: int = (Config().chunk_size - Config().max_decode_batch_size) // Config().max_prefill_batch_size, # TODO: use config
                 prefill_active_length: int = (Config().chunk_size - Config().max_decode_batch_size) // Config().max_prefill_batch_size,
                 gen_prefill: bool = True,
@@ -223,12 +223,12 @@ class ForwardBatchInput:
 
         decode_querys_info = []
         for i in range(min(decode_batch_size, cuda_lens)):
-            query_info = QueryInfo(i+Config().max_prefill_batch_size, prefill_query_length, max_seq_length, page_size, device, is_prefill=False, offset=offset)
+            query_info = QueryInfo(i+Config().max_prefill_batch_size, prefill_query_length, 256, page_size, device, is_prefill=False, offset=offset)
             offset += max_seq_length // page_size
             if tokens is not None:
                 query_info.query_tokens[prefill_active_length:prefill_active_length + 1].copy_(tokens)
             if decode_active_position is None:
-                query_info.active_position = prefill_active_length
+                query_info.active_position = 255
             else:
                 query_info.active_position = decode_active_position[i]
 
diff --git a/ktransformers/server/balance_serve/inference/model_runner.py b/ktransformers/server/balance_serve/inference/model_runner.py
index 79b3053..06f8b16 100644
--- a/ktransformers/server/balance_serve/inference/model_runner.py
+++ b/ktransformers/server/balance_serve/inference/model_runner.py
@@ -39,6 +39,17 @@ def pad_num_tokens(num_tokens):
 
 def deduplicate_and_sort(lst):
     return sorted(set(lst))
 
+def generate_cuda_graphs(chunk_size: int) -> list:
+    # assert if the input does not meet the requirements
+    assert chunk_size <= 1024 or chunk_size % 1024 == 0, "chunk_size must <= 1024 or a multiple of 1024"
+    base_list = [1, 2, 3, Config().max_batch_size, 64, 256, 512, chunk_size]
+
+    if chunk_size <= 1024:
+        return base_list
+
+    multiples = [i for i in range(1024, chunk_size + 1, 1024)]
+
+    return deduplicate_and_sort(base_list + multiples)
 class ModelRunner:
     """A CudaGraphRunner runs the forward pass of a model with CUDA graph and torch.compile."""
 
@@ -56,7 +67,7 @@ class ModelRunner:
         self.features_buf = None
         self.output = None
         self.graph_memory_pool = None
-        self.cuda_graphs = deduplicate_and_sort([1, 2, 3, Config().max_batch_size, 64, Config().chunk_size])
+        self.cuda_graphs = generate_cuda_graphs(Config().chunk_size)
         self.use_cuda_graph = use_cuda_graph
         self.model_time = 0
         self.page_size = page_size
diff --git a/ktransformers/tests/dequant_gpu.py b/ktransformers/tests/dequant_gpu.py
index 0dd5272..3dbd794 100644
--- a/ktransformers/tests/dequant_gpu.py
+++ b/ktransformers/tests/dequant_gpu.py
@@ -7,7 +7,7 @@ sys.path.append(current_path+"/../..")
 import numpy as np
 # from ktransformers.operators.linear import KTransformersLinear, KLinearMarlin
 # from ktransformers.operators.experts import KTransformersExperts, KExpertsTorch
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
 import torch
 import KTransformersOps
 torch.set_default_dtype(torch.bfloat16)
diff --git a/ktransformers/tests/dequant_gpu_t.py b/ktransformers/tests/dequant_gpu_t.py
index 4b2556d..06de4a0 100644
--- a/ktransformers/tests/dequant_gpu_t.py
+++ b/ktransformers/tests/dequant_gpu_t.py
@@ -9,7 +9,7 @@ from pycuda.compiler import SourceModule
 import numpy as np
 from ktransformers.operators.linear import KTransformersLinear, KLinearMarlin
 from ktransformers.operators.experts import KTransformersExperts, KExpertsTorch
-from ktransformers.util.custom_gguf import GGUFLoader, dequantize_q4_k_gpu, dequantize_q4_k
+from ktransformers.util.custom_loader import GGUFLoader, dequantize_q4_k_gpu, dequantize_q4_k
 import torch
 import KTransformersOps
 torch.set_default_dtype(torch.bfloat16)
diff --git a/ktransformers/tests/test_speed.py b/ktransformers/tests/test_speed.py
index b45bf87..6f435b4 100644
--- a/ktransformers/tests/test_speed.py
+++ b/ktransformers/tests/test_speed.py
@@ -159,5 +159,7 @@ if __name__ == "__main__":
         prompt = ktansformer_prompt1024
     elif args.prompt_lens == 2048:
         prompt = ktansformer_prompt1024 * 2
+    elif args.prompt_lens == 4096:
+        prompt = ktansformer_prompt1024 * 4
     asyncio.run(main(args.concurrent, prompt, max_tokens, model))
 
diff --git a/ktransformers/util/custom_gguf.py
b/ktransformers/util/custom_gguf.py index b3d98d3..4518366 100644 --- a/ktransformers/util/custom_gguf.py +++ b/ktransformers/util/custom_gguf.py @@ -25,7 +25,6 @@ import os from enum import IntEnum import torch import KTransformersOps -from .custom_loader import SafeTensorLoader import ctypes import math @@ -166,238 +165,6 @@ DATA_TYPES = { "FP8": 13, } -class GGUFLoader: - tensor_info: dict - gguf_path: str - tensor_file_map: dict # {tensor_name: tensor_file_path} - gguf_file_meta: dict - safetensor_loader: SafeTensorLoader - def __init__(self, gguf_path: str): - # Check dir exist - if not os.path.exists(gguf_path): - raise FileNotFoundError(f"GGUF dir not found: {gguf_path}") - if os.path.isfile(gguf_path): - gguf_path = os.path.dirname(gguf_path) - - self.safetensor_loader = None - - self.tensor_info = {} - self.gguf_path = gguf_path - self.tensor_file_map = {} - self.file_data_map = {} - self.gguf_file_meta = {} - self.tensor_device_map = {} - - # I know this is ugly, but I don't want to change the original code too much - # TODO: merge gguf load and other loads. - safetensor_loader = SafeTensorLoader(gguf_path) - if safetensor_loader.tensor_file_map: - self.safetensor_loader = safetensor_loader - return - # Walk through all the .gguf files in the directory - found_gguf = False - for root, dirs, files in os.walk(gguf_path): - for file in files: - if file.endswith(".gguf"): - found_gguf = True - file_name = os.path.join(root, file) - with open(file_name, "rb") as f: - self.load_gguf(f) - if file_name not in self.file_data_map: - self.file_data_map[file_name] = np.memmap(file_name, mode = 'r') - if not found_gguf: - raise FileNotFoundError(f"Cannot find any .gguf files in: {gguf_path}") - - def load_gguf(self, f): - f.seek(0) - assert f.read(4) == b'GGUF' - values = struct.unpack("torch.Tensor: - t = self.tensor_info[name] - if device.lower() == "cpu": - print(f"loading expert {expert_id} of {name} with CPU") - shape = t["shape"] - ggml_type = t["ggml_type"] - if ggml_type not in GGML_NAMES: - raise NotImplementedError(f"ggml_type {ggml_type} not implemented") - ggml_name = GGML_NAMES[ggml_type] - - # TODO: experts may fused in quant block, split it - assert elements_per_expert % GGML_ELEMENTS_PER_BLOCK[ggml_name] == 0, "experts may fused in quant block, please use CPU dequant" - - blocks_per_experts = elements_per_expert // GGML_ELEMENTS_PER_BLOCK[ggml_name] - block_size = GGML_BLOCK_SIZES[ggml_name] - offset = expert_id * block_size * blocks_per_experts - data = data[offset: offset + block_size * blocks_per_experts] - - if "cuda" in device.lower(): - values = GGML_DEQUANTIZE_GPU[ggml_name](data, device, target_dtype) - else: - values = GGML_DEQUANTIZE[ggml_name](data) - values = torch.from_numpy(values.copy()) - - if ggml_name == "BF16": - values = values.view(torch.bfloat16) - values = values.view(shape[-2::-1]) - - return values - - def load_gguf_tensor(self, name: str, device:str = "cpu", target_dtype = None)->torch.Tensor: - t = self.tensor_info[name] - if device.lower() == "cpu": - print(f"loading {name} with CPU") - if target_dtype == None: - target_dtype = torch.get_default_dtype() - - shape = t["shape"] - ggml_type = t["ggml_type"] - - if ggml_type not in GGML_NAMES: - raise NotImplementedError(f"ggml_type {ggml_type} not implemented") - - ggml_name = GGML_NAMES[ggml_type] - - data = self.get_mmap_tensor(name) - - block_size = GGML_BLOCK_SIZES[ggml_name] - elements_per_block = GGML_ELEMENTS_PER_BLOCK[ggml_name] - num_elements = int(np.prod(shape)) - num_blocks = num_elements 
// elements_per_block - - blocks_per_iter = 16384 - if num_blocks > blocks_per_iter: # dequant large tensor - values = torch.empty((num_blocks, elements_per_block), dtype=target_dtype, device=device) - for i in range( (num_blocks + blocks_per_iter - 1) // blocks_per_iter): - blocks_begin = i * blocks_per_iter - blocks_end = min(blocks_begin + blocks_per_iter, num_blocks) - if "cuda" in device.lower(): - cur_values = GGML_DEQUANTIZE_GPU[ggml_name](data[blocks_begin*block_size : blocks_end*block_size], device, target_dtype) - else: - cur_values = GGML_DEQUANTIZE[ggml_name](data[blocks_begin*block_size : blocks_end*block_size]) - cur_values = torch.from_numpy(cur_values.copy()) - - cur_values = cur_values.view(-1, elements_per_block) - if ggml_name == "BF16": - cur_values = cur_values.view(torch.bfloat16) - values[blocks_begin : blocks_end] = cur_values - else: - if "cuda" in device.lower(): - values = GGML_DEQUANTIZE_GPU[ggml_name](data, device) - else: - values = GGML_DEQUANTIZE[ggml_name](data) - values = torch.from_numpy(values) - - if ggml_name == "BF16": - values = values.view(torch.bfloat16) - - - values = values.view(shape[::-1]) - if "attn_q" in name and self.gguf_file_meta['general.architecture'] in ["llama"]: - n_head = self.gguf_file_meta['llama.attention.head_count'] - values = (values.reshape(n_head, values.shape[0] // n_head // 2, 2, *values.shape[1:]) - .swapaxes(1, 2) - .reshape(values.shape)) - elif "attn_k" in name and self.gguf_file_meta['general.architecture'] in ["llama"]: - n_head = self.gguf_file_meta['llama.attention.head_count_kv'] - values = (values.reshape(n_head, values.shape[0] // n_head // 2, 2, *values.shape[1:]) - .swapaxes(1, 2) - .reshape(values.shape)) - return values def read_value(f, data_type): if data_type == DATA_TYPES["string"]: @@ -921,6 +688,7 @@ def translate_name_to_gguf(name): name = name.replace(".gate_up_proj.", ".up_proj") name = name.replace(".mlp.shared_experts.down_proj", ".ffn_down_shexp") + name = name.replace(".mlp.gate.e_score_correction_bias", ".exp_probs_b.bias") name = name.replace(".mlp.gate", ".ffn_gate_inp") name = name.replace(".mlp.shared_experts.gate_proj", ".ffn_gate_shexp") name = name.replace(".mlp.shared_experts.up_proj", ".ffn_up_shexp") diff --git a/ktransformers/util/custom_loader.py b/ktransformers/util/custom_loader.py index ecc09a0..93b94c4 100644 --- a/ktransformers/util/custom_loader.py +++ b/ktransformers/util/custom_loader.py @@ -10,12 +10,35 @@ import torch import KTransformersOps from safetensors import safe_open from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant +from ktransformers.util.custom_gguf import * from safetensors.torch import save_file +from abc import ABC, abstractmethod +from typing import Dict, Any, Optional, Union -class SafeTensorLoader: +class ModelLoader(ABC): + """ + Abstract base class for model loaders. + Defines the interface that all model loaders must implement. + """ tensor_file_map = {} - tensor_type_map = {} - file_handle_map = {} + @abstractmethod + def has_tensor(cls, name: str): + """ + Check if the tensor exists in the loader. 
+ + Args: + name: Name of the tensor to check + + Returns: + bool: True if the tensor exists, False otherwise + """ + pass + +class SafeTensorLoader(ModelLoader): + tensor_file_map: dict + tensor_type_map: dict + file_handle_map: dict + tensor_device_map: dict def __init__(self, file_path: str): self.__load_tensor_file_map(file_path) @@ -28,6 +51,10 @@ class SafeTensorLoader: folder_path = os.path.dirname(file_path) else: folder_path = file_path + self.file_handle_map = {} + self.tensor_file_map = {} + self.tensor_type_map = {} + self.tensor_device_map = {} found_safetensor = False for root, _, files in os.walk(folder_path): @@ -57,7 +84,11 @@ class SafeTensorLoader: # raise FileNotFoundError(f"No Safetensor files found in {folder_path}") def load_tensor(self, key: str, device: str="cpu"): - if key not in self.tensor_file_map: + if translate_name_to_gguf(key) in self.tensor_file_map: + key = translate_name_to_gguf(key) + elif key in self.tensor_file_map: + pass + else: raise KeyError(f"Key {key} not found in Safetensor files") file = self.tensor_file_map[key] f = self.file_handle_map.get(file) @@ -66,13 +97,145 @@ class SafeTensorLoader: tensor = f.get_tensor(key) return tensor.to(device) + def load_experts(self, key: str, device: str="cpu"): + ''' + Load experts from safetensor + key: the name of the experts + device: the device to load the experts to + return: dict, + {up: tensor, down: tensor, gate: tensor, up_type: int, down_type: int, gate_type: int} + {xxx}_type: the type of the up tensor, corresponding to the ggml type + ''' + if self.has_tensor(translate_name_to_gguf(key)+".ffn_gate_exps.weight"): + # legacy branch for loading hybrid model + base_key = translate_name_to_gguf(key) + # Load experts from safetensor + gate_key = f"{base_key}.ffn_gate_exps.weight" + gate_type_key = f"{base_key}.ffn_gate_exps.ggml_type" + up_key = f"{base_key}.ffn_up_exps.weight" + up_type_key = f"{base_key}.ffn_up_exps.ggml_type" + down_key = f"{base_key}.ffn_down_exps.weight" + down_type_key = f"{base_key}.ffn_down_exps.ggml_type" + gate_tensor = self.load_tensor(gate_key, device).numpy() + up_tensor = self.load_tensor(up_key, device).numpy() + down_tensor = self.load_tensor(down_key, device).numpy() + gate_type = self.load_tensor(gate_type_key, device).item() + up_type = self.load_tensor(up_type_key, device).item() + down_type = self.load_tensor(down_type_key, device).item() + + return { + "up": up_tensor, + "gate": gate_tensor, + "down": down_tensor, + "up_type": up_type, + "gate_type": gate_type, + "down_type": down_type + } + + else: + # Load experts from safetensor + base_key = key # e.g. 
"model.layers.3.mlp.experts" + experts_count = 0 + + # First, count how many experts we have by checking for expert 0's up_proj + while self.has_tensor(f"{base_key}.{experts_count}.up_proj.weight"): + experts_count += 1 + + if experts_count == 0: + raise ValueError(f"No experts found for key {base_key}") + + # Initialize empty lists to store tensors for each projection type + up_projs = [] + gate_projs = [] + down_projs = [] + + # Load all expert weights + for expert_id in range(experts_count): + up_key = f"{base_key}.{expert_id}.up_proj.weight" + gate_key = f"{base_key}.{expert_id}.gate_proj.weight" + down_key = f"{base_key}.{expert_id}.down_proj.weight" + + up_tensor = self.load_tensor(up_key, device) + gate_tensor = self.load_tensor(gate_key, device) + down_tensor = self.load_tensor(down_key, device) + + up_projs.append(up_tensor) + gate_projs.append(gate_tensor) + down_projs.append(down_tensor) + + # Stack the tensors along a new dimension + up_tensor = torch.stack(up_projs, dim=0) + gate_tensor = torch.stack(gate_projs, dim=0) + down_tensor = torch.stack(down_projs, dim=0) + + # Get original dtype for GGML type determination + orig_up_dtype = up_tensor.dtype + orig_gate_dtype = gate_tensor.dtype + orig_down_dtype = down_tensor.dtype + + # Convert to numpy with proper bfloat16 support + up_numpy = up_tensor.view(torch.uint16).numpy() + gate_numpy = gate_tensor.view(torch.uint16).numpy() + down_numpy = down_tensor.view(torch.uint16).numpy() + + # Determine tensor data types for GGML conversion + def get_ggml_type(dtype): + if dtype == torch.float32: + return GGMLQuantizationType.F32 + elif dtype == torch.float16: + return GGMLQuantizationType.F16 + elif dtype == torch.bfloat16: + return GGMLQuantizationType.BF16 + else: + raise ValueError(f"Unsupported tensor dtype: {dtype}") + + return { + "up": up_numpy, + "gate": gate_numpy, + "down": down_numpy, + "up_type": get_ggml_type(orig_up_dtype), + "gate_type": get_ggml_type(orig_gate_dtype), + "down_type": get_ggml_type(orig_down_dtype) + } + + def load_gate(self, key: str, device: str="cpu"): + ''' + Load gate from safetensor + key: the name of the gate + device: the device to load the gate to + return: dict, + {'weight': tensor, 'e_score_correction_bias': tensor} + ''' + target = ["weight", "e_score_correction_bias"] + res = {'weight': None, 'e_score_correction_bias': None} + if self.has_tensor(translate_name_to_gguf(key)+".ffn_gate_exps.weight"): + # legacy branch for loading hybrid model + base_key = key + for k in target: + translated_key = translate_name_to_gguf(f"{base_key}.{k}") + if self.has_tensor(translated_key): + tensor = self.load_tensor(translated_key, device) + res[k] = tensor + else: + # Load gate from safetensor + base_key = key + for k in target: + if self.has_tensor(f"{base_key}.{k}"): + tensor = self.load_tensor(f"{base_key}.{k}", device) + res[k] = tensor + return res + def close_all_handles(self): for handle in self.file_handle_map.values(): handle.close() self.file_handle_map.clear() def load_dequantized_tensor(self, key:str, device: str="cpu"): - if key not in self.tensor_file_map: + if key in self.tensor_file_map and translate_name_to_gguf(key): + pass + elif translate_name_to_gguf(key) in self.tensor_file_map: + key = translate_name_to_gguf(key) + else: raise KeyError(f"Key {key} not found in Safetensor files") file = self.tensor_file_map[key] f = self.file_handle_map.get(file) @@ -83,4 +246,315 @@ class SafeTensorLoader: if key[:-7] + ".weight_scale_inv" in self.tensor_file_map: weight_scale_inv = 
f.get_tensor(key[:-7] + ".weight_scale_inv").to(device) tensor = weight_dequant(tensor, weight_scale_inv) - return tensor.to(device) \ No newline at end of file + return tensor.to(device) + + def has_tensor(self, name: str): + return name in self.tensor_file_map or translate_name_to_gguf(name) in self.tensor_file_map + +class GGUFLoader(ModelLoader): + tensor_info: dict + gguf_path: str + tensor_file_map: dict # {tensor_name: tensor_file_path} + gguf_file_meta: dict + safetensor_loader: SafeTensorLoader + def __init__(self, gguf_path: str): + # Check dir exist + if not os.path.exists(gguf_path): + raise FileNotFoundError(f"GGUF dir not found: {gguf_path}") + if os.path.isfile(gguf_path): + gguf_path = os.path.dirname(gguf_path) + + self.safetensor_loader = None + + self.tensor_info = {} + self.gguf_path = gguf_path + self.tensor_file_map = {} + self.file_data_map = {} + self.gguf_file_meta = {} + self.tensor_device_map = {} + + # Walk through all the .gguf files in the directory + found_gguf = False + for root, dirs, files in os.walk(gguf_path): + for file in files: + if file.endswith(".gguf"): + found_gguf = True + file_name = os.path.join(root, file) + with open(file_name, "rb") as f: + self.load_gguf(f) + if file_name not in self.file_data_map: + self.file_data_map[file_name] = np.memmap(file_name, mode = 'r') + if not found_gguf: + raise FileNotFoundError(f"Cannot find any .gguf files in: {gguf_path}") + + def load_gguf(self, f): + f.seek(0) + assert f.read(4) == b'GGUF' + values = struct.unpack("torch.Tensor: + name = translate_name_to_gguf(name) + t = self.tensor_info[name] + shape = t["shape"] + ggml_type = t["ggml_type"] + if ggml_type not in GGML_NAMES: + raise NotImplementedError(f"ggml_type {ggml_type} not implemented") + ggml_name = GGML_NAMES[ggml_type] + + # TODO: experts may fused in quant block, split it + assert elements_per_expert % GGML_ELEMENTS_PER_BLOCK[ggml_name] == 0, "experts may fused in quant block, please use CPU dequant" + + blocks_per_experts = elements_per_expert // GGML_ELEMENTS_PER_BLOCK[ggml_name] + block_size = GGML_BLOCK_SIZES[ggml_name] + offset = expert_id * block_size * blocks_per_experts + data = data[offset: offset + block_size * blocks_per_experts] + + if "cuda" in device.lower(): + values = GGML_DEQUANTIZE_GPU[ggml_name](data, device, target_dtype) + else: + values = GGML_DEQUANTIZE[ggml_name](data) + values = torch.from_numpy(values.copy()) + + if ggml_name == "BF16": + values = values.view(torch.bfloat16) + values = values.view(shape[-2::-1]) + + return values + + def load_gguf_tensor(self, name: str, device:str = "cpu", target_dtype = None)->torch.Tensor: + name = translate_name_to_gguf(name) + t = self.tensor_info[name] + if target_dtype == None: + target_dtype = torch.get_default_dtype() + + shape = t["shape"] + ggml_type = t["ggml_type"] + + if ggml_type not in GGML_NAMES: + raise NotImplementedError(f"ggml_type {ggml_type} not implemented") + + ggml_name = GGML_NAMES[ggml_type] + + data = self.get_mmap_tensor(name) + + block_size = GGML_BLOCK_SIZES[ggml_name] + elements_per_block = GGML_ELEMENTS_PER_BLOCK[ggml_name] + num_elements = int(np.prod(shape)) + num_blocks = num_elements // elements_per_block + + blocks_per_iter = 16384 + if num_blocks > blocks_per_iter: # dequant large tensor + values = torch.empty((num_blocks, elements_per_block), dtype=target_dtype, device=device) + for i in range( (num_blocks + blocks_per_iter - 1) // blocks_per_iter): + blocks_begin = i * blocks_per_iter + blocks_end = min(blocks_begin + blocks_per_iter, 
num_blocks) + if "cuda" in device.lower(): + cur_values = GGML_DEQUANTIZE_GPU[ggml_name](data[blocks_begin*block_size : blocks_end*block_size], device, target_dtype) + else: + cur_values = GGML_DEQUANTIZE[ggml_name](data[blocks_begin*block_size : blocks_end*block_size]) + cur_values = torch.from_numpy(cur_values.copy()) + + cur_values = cur_values.view(-1, elements_per_block) + if ggml_name == "BF16": + cur_values = cur_values.view(torch.bfloat16) + values[blocks_begin : blocks_end] = cur_values + else: + if "cuda" in device.lower(): + values = GGML_DEQUANTIZE_GPU[ggml_name](data, device) + else: + values = GGML_DEQUANTIZE[ggml_name](data) + values = torch.from_numpy(values) + + if ggml_name == "BF16": + values = values.view(torch.bfloat16) + + + values = values.view(shape[::-1]) + if "attn_q" in name and self.gguf_file_meta['general.architecture'] in ["llama"]: + n_head = self.gguf_file_meta['llama.attention.head_count'] + values = (values.reshape(n_head, values.shape[0] // n_head // 2, 2, *values.shape[1:]) + .swapaxes(1, 2) + .reshape(values.shape)) + elif "attn_k" in name and self.gguf_file_meta['general.architecture'] in ["llama"]: + n_head = self.gguf_file_meta['llama.attention.head_count_kv'] + values = (values.reshape(n_head, values.shape[0] // n_head // 2, 2, *values.shape[1:]) + .swapaxes(1, 2) + .reshape(values.shape)) + return values + def has_tensor(self, name: str): + name = translate_name_to_gguf(name) + return name in self.tensor_info + + def get_ggml_type(self, name: str): + name = translate_name_to_gguf(name) + if name not in self.tensor_info: + raise KeyError(f"Key {name} not found in GGUF files") + return self.tensor_info[name]["ggml_type"] + +class ModelLoaderFactory: + """ + Factory class for creating model loaders. + Automatically detects the model format based on file extensions in the directory. + """ + + @staticmethod + def create_loader(path: str): + """ + Create a model loader for the given path by detecting the model format. + The function checks for the presence of .safetensors or .gguf files + in the specified path and creates the appropriate loader. 
+ + Args: + path: Path to the model directory or file + + Returns: + An appropriate ModelLoader instance (SafeTensorLoader or GGUFLoader) + + Raises: + FileNotFoundError: If no supported model files are found in the path + """ + if not os.path.exists(path): + raise FileNotFoundError(f"Path not found: {path}") + + # Normalize to directory path if a file was provided + if os.path.isfile(path): + if path.endswith(".safetensors"): + return SafeTensorLoader(path) + elif path.endswith(".gguf"): + return GGUFLoader(path) + else: + folder_path = os.path.dirname(path) + else: + folder_path = path + + # Check for safetensors files + has_safetensors = False + has_gguf = False + + for root, _, files in os.walk(folder_path): + for file in files: + if file.endswith(".safetensors"): + has_safetensors = True + break + elif file.endswith(".gguf"): + has_gguf = True + break + if has_safetensors or has_gguf: + break + + # Create the appropriate loader based on detected file types + # Prioritize SafeTensor over GGUF if both are present + if has_safetensors: + try: + return SafeTensorLoader(folder_path) + except Exception as e: + print(f"Failed to create SafeTensorLoader: {e}") + # Fall through to try GGUF if SafeTensor fails + if not has_gguf: + raise + + if has_gguf: + try: + return GGUFLoader(folder_path) + except Exception as e: + print(f"Failed to create GGUFLoader: {e}") + raise + + # No supported model files found + raise FileNotFoundError(f"No .safetensors or .gguf files found in: {folder_path}") \ No newline at end of file diff --git a/ktransformers/util/utils.py b/ktransformers/util/utils.py index 5a83a45..3ddf639 100644 --- a/ktransformers/util/utils.py +++ b/ktransformers/util/utils.py @@ -22,8 +22,7 @@ from transformers import ( EtaLogitsWarper, ) -from ktransformers.util.custom_gguf import translate_name_to_gguf -from ktransformers.util.custom_gguf import GGUFLoader +from ktransformers.util.custom_loader import ModelLoaderFactory, ModelLoader, SafeTensorLoader, GGUFLoader, translate_name_to_gguf from ktransformers.operators import base_operator from ktransformers.models.custom_cache import StaticCache from ktransformers.util.cuda_graph_runner import CUDAGraphRunner @@ -98,25 +97,24 @@ def get_all_used_cuda_device(device_map:dict): all_device_list = list(all_device_list) return all_device_list -def load_cur_state_dict(module: nn.Module, gguf_loader: GGUFLoader, prefix: str = ""): +def load_cur_state_dict(module: nn.Module, gguf_loader: ModelLoader, prefix: str = ""): prefix = prefix.replace("orig_module.", "") persistent_buffers = {k: v for k, v in module._buffers.items() if k not in module._non_persistent_buffers_set} local_name_params = itertools.chain(module._parameters.items(), persistent_buffers.items()) local_state = {k: v for k, v in local_name_params if v is not None} for name, param in local_state.items(): key = prefix + name - translated_key = translate_name_to_gguf(key) + translated_key = key # TODO: Merge all loader. # I know this is ugly but lets do it for now. 
- if gguf_loader.safetensor_loader is not None: - load_dequantized_tensor = gguf_loader.safetensor_loader.load_dequantized_tensor - tensor_file_map = gguf_loader.safetensor_loader.tensor_file_map + if isinstance(gguf_loader, SafeTensorLoader): + load_dequantized_tensor = gguf_loader.load_dequantized_tensor else: load_dequantized_tensor = gguf_loader.load_gguf_tensor tensor_file_map = gguf_loader.tensor_file_map - if translated_key in tensor_file_map: + if gguf_loader.has_tensor(translated_key): target_dtype = torch.get_default_dtype() device = get_device(translated_key[:translated_key.rfind(".")], gguf_loader.tensor_device_map) print(f"loading {translated_key} to {device}") @@ -128,7 +126,7 @@ def load_cur_state_dict(module: nn.Module, gguf_loader: GGUFLoader, prefix: str #print(load_config.tensor_file_map.keys()) raise Exception(f"can't find {translated_key} in GGUF file!") -def load_weights(module:nn.Module, gguf_loader:GGUFLoader, prefix=''): +def load_weights(module:nn.Module, gguf_loader:ModelLoader, prefix=''): #print(f"recursively loading weights {prefix}") if not isinstance(module, base_operator.BaseInjectedModule): load_cur_state_dict(module, gguf_loader, prefix) diff --git a/ktransformers/util/weight_loader.py b/ktransformers/util/weight_loader.py new file mode 100644 index 0000000..9dda646 --- /dev/null +++ b/ktransformers/util/weight_loader.py @@ -0,0 +1,367 @@ +from abc import ABC, abstractmethod +import os +import torch +import numpy as np +from safetensors import safe_open +from typing import Dict, Any, Optional, Union + +class ModelLoader(ABC): + """ + Abstract base class for model loaders. + Defines the interface that all model loaders must implement. + """ + + @abstractmethod + def load_tensor(self, name: str, device: str = "cpu") -> torch.Tensor: + """ + Load a tensor by name. + + Args: + name: Name of the tensor to load + device: Device to load the tensor to + + Returns: + The loaded tensor + """ + pass + + @classmethod + @abstractmethod + def supports_format(cls, path: str) -> bool: + """ + Check if this loader supports the given path format. + + Args: + path: Path to check + + Returns: + True if this loader supports the given path, False otherwise + """ + pass + + +class SafeTensorLoader(ModelLoader): + """ + Loader for SafeTensor format models. + """ + + def __init__(self, path: str): + """ + Initialize the SafeTensor loader. + + Args: + path: Path to the model directory or file + """ + self.tensor_file_map = {} # Maps tensor names to file paths + self.file_handle_map = {} # Maps file names to file handles + self._load_tensor_file_map(path) + + def _load_tensor_file_map(self, path: str) -> None: + """ + Load the tensor file map from the given path. 
+
+        Args:
+            path: Path to the model directory or file
+        """
+        # Normalize path to directory
+        if not os.path.exists(path):
+            raise FileNotFoundError(f"Path not found: {path}")
+        if os.path.isfile(path):
+            folder_path = os.path.dirname(path)
+        else:
+            folder_path = path
+
+        found_safetensor = False
+        for root, _, files in os.walk(folder_path):
+            files = sorted(files)
+            for file in files:
+                if file.endswith(".safetensors"):
+                    found_safetensor = True
+                    file_path = os.path.join(root, file)
+                    if file not in self.file_handle_map:
+                        try:
+                            handle = safe_open(file_path, framework="pt")
+                            self.file_handle_map[file] = handle
+                        except Exception as e:
+                            print(f"Error opening Safetensor file {file_path}: {e}")
+                            continue
+
+                    f = self.file_handle_map.get(file)
+                    if f is None:
+                        continue
+                    try:
+                        for key in f.keys():
+                            self.tensor_file_map[key] = file
+                    except Exception as e:
+                        print(f"Error reading Safetensor file {file_path}: {e}")
+
+        if not found_safetensor:
+            # Not raising an error here allows for the factory to try other loaders
+            print(f"No Safetensor files found in {folder_path}")
+
+    def load_tensor(self, name: str, device: str = "cpu") -> torch.Tensor:
+        """
+        Load a tensor by name.
+
+        Args:
+            name: Name of the tensor to load
+            device: Device to load the tensor to
+
+        Returns:
+            The loaded tensor
+        """
+        if name not in self.tensor_file_map:
+            raise KeyError(f"Key {name} not found in Safetensor files")
+        file = self.tensor_file_map[name]
+        f = self.file_handle_map.get(file)
+        if f is None:
+            raise FileNotFoundError(f"File {file} not found in Safetensor files")
+        tensor = f.get_tensor(name)
+        return tensor.to(device)
+
+    def load_dequantized_tensor(self, name: str, device: str = "cpu") -> torch.Tensor:
+        """
+        Load and dequantize a tensor.
+
+        Args:
+            name: Name of the tensor to load
+            device: Device to load the tensor to
+
+        Returns:
+            The dequantized tensor
+        """
+        if name not in self.tensor_file_map:
+            raise KeyError(f"Key {name} not found in Safetensor files")
+        file = self.tensor_file_map[name]
+        f = self.file_handle_map.get(file)
+        if f is None:
+            raise FileNotFoundError(f"File {file} not found in Safetensor files")
+        tensor = f.get_tensor(name).to(device)
+        if name.endswith(".weight"):
+            if name[:-7] + ".weight_scale_inv" in self.tensor_file_map:
+                weight_scale_inv = f.get_tensor(name[:-7] + ".weight_scale_inv").to(device)
+                # Assuming weight_dequant function is imported
+                from ktransformers.ktransformers_ext.triton.fp8gemm import weight_dequant
+                tensor = weight_dequant(tensor, weight_scale_inv)
+        return tensor.to(device)
+
+    def close_all_handles(self) -> None:
+        """
+        Close all file handles.
+        """
+        for handle in self.file_handle_map.values():
+            handle.close()
+        self.file_handle_map.clear()
+
+    @classmethod
+    def supports_format(cls, path: str) -> bool:
+        """
+        Check if this loader supports the given path format.
+
+        Args:
+            path: Path to check
+
+        Returns:
+            True if safetensor files are found in the path, False otherwise
+        """
+        # Normalize path to directory
+        if not os.path.exists(path):
+            return False
+        if os.path.isfile(path):
+            if path.endswith(".safetensors"):
+                return True
+            folder_path = os.path.dirname(path)
+        else:
+            folder_path = path
+
+        # Check if any safetensor files exist in the folder
+        for root, _, files in os.walk(folder_path):
+            for file in files:
+                if file.endswith(".safetensors"):
+                    return True
+        return False
+
+
+class GGUFLoader(ModelLoader):
+    """
+    Loader for GGUF format models.
+    """
+
+    def __init__(self, path: str):
+        """
+        Initialize the GGUF loader.
+
+        Args:
+            path: Path to the model directory or file
+        """
+        # Check if path exists
+        if not os.path.exists(path):
+            raise FileNotFoundError(f"GGUF dir not found: {path}")
+        if os.path.isfile(path):
+            self.gguf_path = os.path.dirname(path)
+        else:
+            self.gguf_path = path
+
+        self.tensor_info = {}  # Stores tensor metadata
+        self.tensor_file_map = {}  # Maps tensor names to file paths
+        self.file_data_map = {}  # Maps file paths to memory-mapped data
+        self.gguf_file_meta = {}  # Stores GGUF metadata
+
+        # For compatibility with the factory pattern
+        self.safetensor_loader = None
+
+        # Scan all GGUF files in the directory
+        found_gguf = False
+        for root, _, files in os.walk(self.gguf_path):
+            for file in files:
+                if file.endswith(".gguf"):
+                    found_gguf = True
+                    file_path = os.path.join(root, file)
+                    with open(file_path, "rb") as f:
+                        self._load_gguf(f)
+                    if file_path not in self.file_data_map:
+                        self.file_data_map[file_path] = np.memmap(file_path, mode='r')
+
+        if not found_gguf:
+            raise FileNotFoundError(f"Cannot find any .gguf files in: {self.gguf_path}")
+
+    def _load_gguf(self, f) -> None:
+        """
+        Load GGUF file metadata and tensor info.
+
+        Args:
+            f: File handle of the GGUF file
+        """
+        # Implementation should follow the original GGUFLoader._load_gguf
+        # This is a simplified version for illustration
+        f.seek(0)
+        assert f.read(4) == b'GGUF'
+
+        # Read header: version (uint32), tensor count (uint64), metadata kv count (uint64)
+        values = struct.unpack("<IQQ", f.read(4 + 8 + 8))
+        version, n_tensors, n_kv = values
+        self.gguf_file_meta["version"] = version
+
+        # A complete implementation would now read n_kv metadata key-value
+        # pairs (via read_value) and n_tensors tensor-info records, populating
+        # self.gguf_file_meta, self.tensor_info and self.tensor_file_map.
+
+    def read_value(self, f, data_type) -> Any:
+        """
+        Read a value from the file according to its data type.
+
+        Args:
+            f: File handle
+            data_type: Type of data to read
+
+        Returns:
+            The read value
+        """
+        # Simplified implementation
+        # In a complete implementation, this would handle all data types
+        if data_type == 8:  # DATA_TYPES["string"]
+            length = struct.unpack("<Q", f.read(8))[0]
+            return f.read(length).decode("utf-8")
+        # Remaining scalar and array types would be unpacked here with their
+        # corresponding struct formats.
+        raise NotImplementedError(f"Unhandled GGUF data type: {data_type}")
+
+    def load_tensor(self, name: str, device: str = "cpu") -> torch.Tensor:
+        """
+        Load a tensor by name.
+
+        Args:
+            name: Name of the tensor to load
+            device: Device to load the tensor to
+
+        Returns:
+            The loaded tensor
+        """
+        # This should call load_gguf_tensor with the appropriate parameters
+        return self.load_gguf_tensor(name, device)
+
+    def load_gguf_tensor(self, name: str, device: str = "cpu", target_dtype = None) -> torch.Tensor:
+        """
+        Load a GGUF tensor by name.
+
+        Args:
+            name: Name of the tensor to load
+            device: Device to load the tensor to
+            target_dtype: Target data type for the tensor
+
+        Returns:
+            The loaded tensor
+        """
+        # Implementation would follow the original GGUFLoader.load_gguf_tensor
+        # This is a placeholder for illustration
+        if name not in self.tensor_info:
+            raise KeyError(f"Tensor {name} not found")
+
+        # Actual implementation would dequantize the tensor data
+        # and return a torch.Tensor
+        return torch.zeros(1, device=device)  # Placeholder
+
+    @classmethod
+    def supports_format(cls, path: str) -> bool:
+        """
+        Check if this loader supports the given path format.
+
+        Args:
+            path: Path to check
+
+        Returns:
+            True if GGUF files are found in the path, False otherwise
+        """
+        # Normalize path to directory
+        if not os.path.exists(path):
+            return False
+        if os.path.isfile(path):
+            return path.endswith(".gguf")
+
+        # Check if any GGUF files exist in the folder
+        for root, _, files in os.walk(path):
+            for file in files:
+                if file.endswith(".gguf"):
+                    return True
+        return False
\ No newline at end of file
diff --git a/merge_tensors/merge_safetensor_gguf.py b/merge_tensors/merge_safetensor_gguf.py
index efeab3b..f299ab9 100644
--- a/merge_tensors/merge_safetensor_gguf.py
+++ b/merge_tensors/merge_safetensor_gguf.py
@@ -6,7 +6,7 @@ import sys
 # sys.path.insert(0, "/home/azure/ktransformers")
 import argparse
 import torch
-from ktransformers.util.custom_gguf import GGUFLoader, translate_name_to_gguf
+from ktransformers.util.custom_loader import GGUFLoader, translate_name_to_gguf
 from safetensors import safe_open
 from safetensors.torch import save_file
 import re
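
Note: the updated load_cur_state_dict in ktransformers/util/utils.py calls gguf_loader.has_tensor(...), but the abridged weight_loader.py above does not define that method; the concrete implementation is expected to live in ktransformers.util.custom_loader. The sketch below shows one minimal way such a method could work, assuming it only needs the tensor_file_map index that both loaders build while scanning their files; the class name _HasTensorSketch is hypothetical and not part of this patch.

from typing import Dict


class _HasTensorSketch:
    """Hypothetical mixin sketching has_tensor for the loaders above.

    Assumes the loader fills self.tensor_file_map with one entry per tensor
    name at construction time, as both constructors in weight_loader.py do.
    """

    tensor_file_map: Dict[str, str]

    def has_tensor(self, name: str) -> bool:
        # Membership in the name -> file index answers availability without
        # touching the weight data itself.
        return name in self.tensor_file_map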
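
For orientation, the intended call pattern on top of this refactor looks roughly like the sketch below. The factory method name create_loader, the checkpoint path, and the tensor name are illustrative assumptions; only ModelLoaderFactory, SafeTensorLoader, has_tensor, load_dequantized_tensor and load_gguf_tensor are taken from the diff itself.

from ktransformers.util.custom_loader import ModelLoaderFactory, SafeTensorLoader

# Hypothetical checkpoint directory; it may hold .safetensors files, .gguf
# files, or both (safetensors are preferred when both are present).
model_dir = "/path/to/checkpoint"

# The factory walks the directory and returns a SafeTensorLoader or GGUFLoader.
# (Method name assumed from the factory docstring above; adjust if it differs.)
loader = ModelLoaderFactory.create_loader(model_dir)

# Mirror of the branch in load_cur_state_dict: keys stay in plain HF-style
# form now that translate_name_to_gguf is no longer applied on this path.
name = "model.layers.0.self_attn.q_proj.weight"  # illustrative key
if loader.has_tensor(name):
    if isinstance(loader, SafeTensorLoader):
        tensor = loader.load_dequantized_tensor(name, device="cpu")
    else:
        tensor = loader.load_gguf_tensor(name, device="cpu")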