Mirror of https://github.com/kvcache-ai/ktransformers.git (synced 2025-09-05 20:19:51 +00:00)
Merge pull request #1276 from kvcache-ai/support_load_safetensor

Support safetensor load; delete the architectures argument.

Commit: 8456222852
30 changed files with 1075 additions and 328 deletions
@@ -1,7 +1,7 @@
 import os
 import sys
 sys.path.insert(0,"/home/zbx/ktransformers")
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
 import torch

 gguf_loader_1 = GGUFLoader("/mnt/data/model/DeepseekV3-q4km-gguf")

install-with-cache.sh (new executable file, 26 lines)
@@ -0,0 +1,26 @@
+#!/bin/bash
+set -e
+
+# clear build dirs
+# rm -rf build
+# rm -rf *.egg-info
+# rm -rf csrc/build
+# rm -rf csrc/ktransformers_ext/build
+# rm -rf csrc/ktransformers_ext/cuda/build
+# rm -rf csrc/ktransformers_ext/cuda/dist
+# rm -rf csrc/ktransformers_ext/cuda/*.egg-info
+rm -rf ~/.ktransformers
+echo "Installing python dependencies from requirements.txt"
+pip install -r requirements-local_chat.txt
+pip install -r ktransformers/server/requirements.txt
+echo "Installing ktransformers"
+KTRANSFORMERS_FORCE_BUILD=TRUE USE_BALANCE_SERVE=1 pip install -v . --no-build-isolation
+pip install third_party/custom_flashinfer/ -v
+
+# SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
+# echo "Copying thirdparty libs to $SITE_PACKAGES"
+# cp -a csrc/balance_serve/build/third_party/prometheus-cpp/lib/libprometheus-cpp-*.so* $SITE_PACKAGES/
+# patchelf --set-rpath '$ORIGIN' $SITE_PACKAGES/sched_ext.cpython*
+
+
+echo "Installation completed successfully"
@@ -66,7 +66,7 @@ class StaticCache(transformers.StaticCache):
         self.page_table_list = []
         for idx in range(config.num_hidden_layers):
             if isinstance(device, dict):
-                target_device = device[f"blk.{idx}.self_attn"]["generate_device"]
+                target_device = device[f"model.layers.{idx}.self_attn"]["generate_device"]
             else:
                 target_device = device

@@ -91,7 +91,7 @@ class StaticCache(transformers.StaticCache):
             # Note: `mark_static_address` is used to tag the cache as an fixed data pointer, preventing cuda graph
             # breaks when updating the cache.
             if isinstance(device, dict):
-                target_device = device[f"blk.{idx}.self_attn"]["generate_device"]
+                target_device = device[f"model.layers.{idx}.self_attn"]["generate_device"]
             else:
                 target_device = device

@@ -39,7 +39,7 @@ class KQwen2MoeForCausalLM(Qwen2MoePreTrainedModel):
         self.cache = cache
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-        self.attn = [None] * 10
+        self.attn = [None] * 100

     def init_wrapper(self, use_cuda_graph, device, max_batch_token, max_batch_size, max_pages, cuda_graph_idx = 0):
         self.attn[cuda_graph_idx] = flashInferAttn(use_cuda_graph=use_cuda_graph, max_batch_token=max_batch_token, max_batch_size=max_batch_size, max_pages=max_pages, device=device)
@@ -39,7 +39,7 @@ class KQwen3MoeForCausalLM(Qwen3MoePreTrainedModel):
         self.cache = cache
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-        self.attn = [None] * 10
+        self.attn = [None] * 100

     def init_wrapper(self, use_cuda_graph, device, max_batch_token, max_batch_size, max_pages, cuda_graph_idx = 0):
         self.attn[cuda_graph_idx] = flashInferAttn(use_cuda_graph=use_cuda_graph, max_batch_token=max_batch_token, max_batch_size=max_batch_size, max_pages=max_pages, device=device)
@@ -23,7 +23,7 @@ from ktransformers.models.modeling_deepseek import (
     yarn_find_correction_range
 )
 from ktransformers.operators.base_operator import BaseInjectedModule
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
 from ktransformers.util.utils import InferenceState
 from transformers.configuration_utils import PretrainedConfig
 import torch

@@ -15,7 +15,7 @@ from ktransformers.models.modeling_llama import LlamaRotaryEmbedding
 from ktransformers.models.modeling_deepseek import DeepseekV2Attention, apply_rotary_pos_emb
 from typing import Optional, Tuple
 from ktransformers.operators.base_operator import BaseInjectedModule
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
 from ktransformers.util.utils import get_compute_capability
 import logging
 from transformers.configuration_utils import PretrainedConfig
@@ -11,7 +11,7 @@ from ktransformers.models.modeling_qwen2_moe import Qwen2MoeAttention
 from ktransformers.models.modeling_qwen3_moe import Qwen3MoeAttention
 from typing import Optional, Tuple
 from ktransformers.operators.base_operator import BaseInjectedModule
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
 import logging
 from transformers.configuration_utils import PretrainedConfig
 from flashinfer import BatchMLAPagedAttentionWrapper
@@ -6,7 +6,7 @@ Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 '''
 from typing import Any
 from torch import nn, Tensor
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
 from transformers.configuration_utils import PretrainedConfig
 import ktransformers.util.utils as utils
 class BaseInjectedModule(nn.Module):
@@ -26,7 +26,8 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext
 import cpuinfer_ext
 from cpuinfer_ext.moe import MOEConfig, MOE
 import ctypes
-from ktransformers.util.custom_gguf import GGMLQuantizationType, GGUFLoader
+from ktransformers.util.custom_gguf import GGMLQuantizationType
+from ktransformers.util.custom_loader import GGUFLoader, SafeTensorLoader, ModelLoader
 from ktransformers.util.utils import InferenceState
 from ktransformers.server.config.config import Config
 from transformers.activations import ACT2FN
@@ -39,8 +40,18 @@ from ktransformers.operators.cpuinfer import CPUInfer

 def deduplicate_and_sort(lst):
     return sorted(set(lst))
+def generate_cuda_graphs(chunk_size: int) -> list:
+    assert chunk_size <= 1024 or chunk_size % 1024 == 0, "chunk_size must <= 1024 or a multiple of 1024"
+    base_list = [1, 2, 3, Config().max_batch_size, 64, 256, 512, chunk_size]
+
+    if chunk_size <= 1024:
+        return base_list
+
+    multiples = [i for i in range(1024, chunk_size + 1, 1024)]
+
+    return deduplicate_and_sort(base_list + multiples)
 #cuda_graphs = [Config().chunk_size]
-cuda_graphs = deduplicate_and_sort([1, 2, 3, Config().max_batch_size, 64, Config().chunk_size])
+cuda_graphs = generate_cuda_graphs(Config().chunk_size)
 # class Base(BaseInjectedModule, ABC):
 class KExpertsBase(ABC):
     def __init__(self, key: str, gguf_loader: GGUFLoader, config: PretrainedConfig, orig_module: nn.Module, device: str = "cuda", **kwargs):
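For reference, the bucket list produced by the new generate_cuda_graphs helper can be checked in isolation. The sketch below is a standalone copy of the logic added above; the max_batch_size of 4 and chunk_size of 2048 are illustrative values, not repository defaults.

# Standalone sketch of the CUDA-graph size schedule built by the new helper.
def generate_cuda_graphs_sketch(chunk_size: int, max_batch_size: int = 4) -> list:
    # Same constraint as the added code: small chunks, or multiples of 1024.
    assert chunk_size <= 1024 or chunk_size % 1024 == 0
    base_list = [1, 2, 3, max_batch_size, 64, 256, 512, chunk_size]
    if chunk_size <= 1024:
        return sorted(set(base_list))
    multiples = list(range(1024, chunk_size + 1, 1024))
    return sorted(set(base_list + multiples))

print(generate_cuda_graphs_sketch(2048))  # [1, 2, 3, 4, 64, 256, 512, 1024, 2048]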
@@ -77,7 +88,7 @@ class KExpertsBase(ABC):
         down_type = None

         for key in keys:
-            if key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
+            if self.gguf_loader.has_tensor(key + ".ffn_gate_exps.weight"):
                 targets = [".ffn_gate_exps.weight", ".ffn_up_exps.weight", ".ffn_down_exps.weight" ]
                 tensors = self.load_multi(key, targets, device=device)
                 gate = tensors[".ffn_gate_exps.weight"]
@@ -86,7 +97,7 @@ class KExpertsBase(ABC):
                 gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate_exps.weight"]["ggml_type"]
                 up_type = self.gguf_loader.tensor_info[key + ".ffn_up_exps.weight"]["ggml_type"]
                 down_type = self.gguf_loader.tensor_info[key + ".ffn_down_exps.weight"]["ggml_type"]
-            elif key + ".ffn_down.0.weight" in self.gguf_loader.tensor_info:
+            elif self.gguf_loader.has_tensor(key + ".ffn_down.0.weight"):
                 # for supporting Mixtral-8x7B-Instuct
                 gate = []
                 up = []
@@ -194,7 +205,7 @@ class KExpertsCPU(KExpertsBase):
             self.config.num_experts_per_tok,
             self.config.hidden_size,
             self.config.moe_intermediate_size,
-            25600,
+            max(cuda_graphs),
             gate_ptr,
             up_ptr,
             down_ptr,
@@ -212,7 +223,7 @@ class KExpertsCPU(KExpertsBase):
             self.config.num_experts_per_tok,
             self.config.hidden_size,
             self.config.moe_intermediate_size,
-            25600,
+            max(cuda_graphs),
             gate_ptr,
             up_ptr,
             down_ptr,
@@ -325,14 +336,19 @@ class KExpertsCPU(KExpertsBase):
         down_type = None

         for key in keys:
-            if self.gguf_loader.safetensor_loader is not None:
-                # using a temp ugly way to temprary load the tensor
-                gate = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_gate_exps.weight").numpy()
-                up = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_up_exps.weight").numpy()
-                down = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_down_exps.weight").numpy()
-                gate_type = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_gate_exps.ggml_type").item()
-                up_type = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_up_exps.ggml_type").item()
-                down_type = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_down_exps.ggml_type").item()
+            if isinstance(self.gguf_loader, SafeTensorLoader):
+                res = self.gguf_loader.load_experts(key)
+                return {key: res}
+            elif self.gguf_loader.has_tensor(key + ".ffn_gate_exps.weight"):
+                gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
+                up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight")
+                down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight")
+                # gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate_exps.weight"]["ggml_type"]
+                # up_type = self.gguf_loader.tensor_info[key + ".ffn_up_exps.weight"]["ggml_type"]
+                # down_type = self.gguf_loader.tensor_info[key + ".ffn_down_exps.weight"]["ggml_type"]
+                gate_type = self.gguf_loader.get_ggml_type(key + ".ffn_gate_exps.weight")
+                up_type = self.gguf_loader.get_ggml_type(key + ".ffn_up_exps.weight")
+                down_type = self.gguf_loader.get_ggml_type(key + ".ffn_down_exps.weight")

             elif key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
                 gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
@@ -356,9 +372,9 @@ class KExpertsCPU(KExpertsBase):
                 gate = np.stack(gate)
                 up = np.stack(up)
                 down = np.stack(down)
-                gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate.0.weight"]["ggml_type"]
-                up_type = self.gguf_loader.tensor_info[key + ".ffn_up.0.weight"]["ggml_type"]
-                down_type = self.gguf_loader.tensor_info[key + ".ffn_down.0.weight"]["ggml_type"]
+                gate_type = self.gguf_loader.get_ggml_type(key + ".ffn_gate.0.weight")
+                up_type = self.gguf_loader.get_ggml_type(key + ".ffn_up.0.weight")
+                down_type = self.gguf_loader.get_ggml_type(key + ".ffn_down.0.weight")
             else:
                 raise ValueError(f"Experts {key} not found in gguf_loader")
             res = {key:{"gate": gate, "up": up, "down": down, "gate_type": gate_type, "up_type": up_type, "down_type": down_type}}
@@ -445,7 +461,7 @@ class KExpertsMarlin(KExpertsBase):
         down = None

         for key in keys:
-            if key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
+            if self.gguf_loader.has_tensor(key + ".ffn_gate_exps.weight"):
                 gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
                 up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight")
                 down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight")
@@ -40,7 +40,7 @@ class flashInferAttn():
         self.kv_layout = kv_layout
         self.use_cuda_graph = use_cuda_graph
         if flashInferAttn.float_workspace_buffer is None:
-            flashInferAttn.float_workspace_buffer = torch.empty(1024 * 1024 * 1024, dtype=torch.uint8, device=device)
+            flashInferAttn.float_workspace_buffer = torch.empty(max_batch_token * 1024 * 1024, dtype=torch.uint8, device=device)
         self.qo_indptr_buf = torch.empty((max_batch_size+1,), dtype=torch.int32, device=device)
         self.paged_kv_indptr_buf = torch.empty((max_batch_size+1,), dtype=torch.int32, device=device)
         self.paged_kv_indices_buf = torch.empty((max_pages,), dtype=torch.int32, device=device)
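Note on the hunk above: the shared flashinfer float workspace is no longer a fixed 1 GiB allocation; it now scales with max_batch_token. A quick sanity check of the arithmetic (illustrative only):

# Workspace size in bytes, before vs. after the change.
old_size = 1024 * 1024 * 1024                      # fixed 1 GiB
new_size = lambda max_batch_token: max_batch_token * 1024 * 1024
assert new_size(1024) == old_size                  # a 1024-token budget reproduces the old size
assert new_size(256) == 256 * 1024 * 1024          # smaller budgets shrink the buffer proportionally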
@@ -6,7 +6,7 @@ import os
 from ktransformers.operators.base_operator import BaseInjectedModule
 from ktransformers.operators.base_operator import BaseInjectedModule
 from ktransformers.operators.linear import KTransformersLinear
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader, ModelLoader, SafeTensorLoader
 from transformers.configuration_utils import PretrainedConfig
 from abc import ABC, abstractmethod

@@ -55,24 +55,20 @@ class KMoEGateBase(ABC):
         down_type = None

         for key in keys:
-            key = ".".join(key.split(".")[:-1])
-            if self.gguf_loader.safetensor_loader is not None:
-                targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]
-                weight = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_gate_inp.weight")
-                e_score_correction_bias = self.gguf_loader.safetensor_loader.load_tensor(key + ".exp_probs_b.bias")
-                weight_type = weight.dtype
-                e_score_correction_bias_type = e_score_correction_bias.dtype
-                res = {"weight": weight, "e_score_correction_bias": e_score_correction_bias, "weight_type": weight_type, "e_score_correction_bias_type": e_score_correction_bias_type}
-            elif key + ".ffn_gate_inp.weight" in self.gguf_loader.tensor_info:
-                targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]
+            # key = ".".join(key.split(".")[:-1])
+            if isinstance(self.gguf_loader, SafeTensorLoader):
+                res = self.gguf_loader.load_gate(key, device=device)
+            elif self.gguf_loader.has_tensor(key+".weight"):
+                # targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]
+                targets = [".weight", ".e_score_correction_bias"]
                 tensors = self.load_multi(key, targets, device=device)
-                weight = tensors[".ffn_gate_inp.weight"]
-                e_score_correction_bias = tensors[".exp_probs_b.bias"]
-                weight_type = self.gguf_loader.tensor_info[key + ".ffn_gate_inp.weight"]["ggml_type"]
-                e_score_correction_bias_type = self.gguf_loader.tensor_info[key + ".exp_probs_b.bias"]["ggml_type"]
+                weight = tensors[".weight"]
+                e_score_correction_bias = tensors[".e_score_correction_bias"]
+                # weight_type = self.gguf_loader.tensor_info[key + ".weight"]["ggml_type"]
+                res = {"weight": weight, "e_score_correction_bias": e_score_correction_bias}
             else:
                 raise ValueError(f"Experts {key} not found in gguf_loader")
-            res = {"weight": weight, "e_score_correction_bias": e_score_correction_bias, "weight_type": weight_type, "e_score_correction_bias_type": e_score_correction_bias_type}
         return res

     def load_multi(self, key: str, keys: list[str], device: str = "cpu"):
@@ -106,8 +102,6 @@ class KMoEGate(BaseInjectedModule, KMoEGateBase):
         if w is None: w = self.load_weights(device=device)

         if isinstance(w, dict):
-            self.weight_type = w["weight_type"]
-            self.e_score_correction_bias_type = w["e_score_correction_bias_type"]
             self.orig_module.weight = nn.Parameter(w["weight"])
             self.orig_module.e_score_correction_bias = nn.Parameter(w["e_score_correction_bias"])
         else:
@@ -175,8 +169,6 @@ class KMoEGateQwen2Moe(BaseInjectedModule, KMoEGateBase):
         if w is None: w = self.load_weights(device=device)

         if isinstance(w, dict):
-            self.weight_type = w["weight_type"]
-            self.e_score_correction_bias_type = w["e_score_correction_bias_type"]
             self.orig_module.weight = nn.Parameter(w["weight"])
             self.orig_module.e_score_correction_bias = nn.Parameter(w["e_score_correction_bias"])
         else:
@@ -29,7 +29,7 @@ from ktransformers.models.modeling_deepseek_v3 import DeepseekV3RMSNorm
 from ktransformers.models.modeling_qwen2_moe import Qwen2MoeRMSNorm
 from ktransformers.models.modeling_qwen3_moe import Qwen3MoeRMSNorm
 from ktransformers.operators.base_operator import BaseInjectedModule
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
 from flashinfer.norm import (
     fused_add_rmsnorm,
     rmsnorm,
@@ -16,7 +16,7 @@ import torch
 from torch import Tensor, nn
 import KTransformersOps
 import vLLMMarlin
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader, SafeTensorLoader
 from ktransformers.util.utils import InferenceState
 from ktransformers.ktransformers_ext.operators.custom_marlin.quantize.utils.marlin_utils import (
     MarlinWorkspace,
@@ -83,15 +83,15 @@ class KLinearBase(ABC):
         keys = [self.key]

         for key in keys:
-            if self.gguf_loader.safetensor_loader is not None:
+            if isinstance(self.gguf_loader, SafeTensorLoader):
                 # using safetensor_loader
-                tensor = self.gguf_loader.safetensor_loader.load_tensor(key+'.weight')
-                if key+'.weight_scale_inv' in self.gguf_loader.safetensor_loader.tensor_file_map:
-                    weight_scale_inv = self.gguf_loader.safetensor_loader.load_tensor(key+'.weight_scale_inv')
+                tensor = self.gguf_loader.load_tensor(key+'.weight')
+                if self.gguf_loader.has_tensor(key+'.weight_scale_inv'):
+                    weight_scale_inv = self.gguf_loader.load_tensor(key+'.weight_scale_inv')
                     return nn.Parameter(tensor), nn.Parameter(weight_scale_inv)
                 return nn.Parameter(tensor)

-            elif key + ".weight" in self.gguf_loader.tensor_file_map:
+            elif self.gguf_loader.has_tensor(key + ".weight"):
                 if key + ".bias" in self.gguf_loader.tensor_file_map:
                     tensors = self.load_multi(key, ["weight", "bias"], device=device)
                     tensor = tensors["weight"]
@@ -760,7 +760,7 @@ class KLinearCPUInfer(KLinearBase):
         self.output_gpu = torch.zeros((1, 1, self.out_features), device=self.out_device)

     def load_weights(self, w: dict | nn.Parameter | tuple | None = None, device: str = "cpu"):
-        if self.key + ".weight" in self.gguf_loader.tensor_info:
+        if self.gguf_loader.has_tensor(self.key + ".weight"):
             if self.key + ".bias" in self.gguf_loader.tensor_file_map:
                 self.weight = self.gguf_loader.get_mmap_tensor(self.key + ".weight")
                 self.weight_type = self.gguf_loader.tensor_info[self.key + ".weight"]["ggml_type"]
@@ -1,6 +1,6 @@

 from ktransformers.operators.base_operator import BaseInjectedModule
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
 from transformers import PretrainedConfig
 import torch.nn as nn
 from ktransformers.models.modeling_deepseek_v3 import DeepseekV3MLP
@@ -58,7 +58,7 @@ from transformers.models.qwen2_moe.configuration_qwen2_moe import Qwen2MoeConfig
 from ktransformers.models.configuration_llama import LlamaConfig
 from ktransformers.operators.base_operator import BaseInjectedModule
 from ktransformers.util.utils import InferenceState, get_compute_capability
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
 from transformers.configuration_utils import PretrainedConfig
 from ktransformers.models.modeling_llama import (
     LlamaDecoderLayer,
@@ -12,7 +12,7 @@ from torch import nn
 from transformers import AutoConfig
 from transformers.configuration_utils import PretrainedConfig
 # from operators import BaseInjectedModule
-from ktransformers.util.custom_gguf import GGUFLoader, translate_name_to_gguf
+from ktransformers.util.custom_loader import GGUFLoader, ModelLoaderFactory
 from ktransformers.util.utils import set_module, load_weights
 import itertools
 import copy
@@ -54,7 +54,7 @@ def del_meta(module:nn.Module):

 def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, prefix: str="", default_device: str = "cuda:0"):
     module_name = prefix[:-1]
-    translated_name = translate_name_to_gguf(prefix)[:-1]
+    # translated_name = translate_name_to_gguf(prefix)[:-1]
     #print("gen_optimize_config", prefix, module_name, translated_name)
     recursive = True
     for rule in rule_list:
@@ -76,7 +76,7 @@ def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, p
         if "replace" in rule:
             replace_meta = rule["replace"]
             if module_name not in out_data:
-                out_data[module_name]={"key": translated_name,
+                out_data[module_name]={"key": module_name,
                                        "class": replace_meta["class"] if "class" in replace_meta else "default",
                                        # "device": replace_meta["device"] if "device" in replace_meta else default_device,
                                        "kwargs": copy.deepcopy(replace_meta["kwargs"]) if "kwargs" in replace_meta else dict()}
@@ -91,7 +91,7 @@ def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, p
     if module_name not in out_data:
         out_data[module_name]= {
             "class": "default",
-            "key": translated_name,
+            "key": module_name,
             "kwargs": {"generate_device": default_device,
                        "prefill_device": default_device}
         }
@@ -123,12 +123,12 @@ def optimize_and_load_gguf(module: nn.Module, rule_file: str, gguf_path: str, mo

     model_config = translate_model_config(model_config)

-    gguf_loader=GGUFLoader(gguf_path)
+    weights_loader = ModelLoaderFactory.create_loader(gguf_path)
     with torch.device("meta"):
-        inject(module, optimize_config, model_config, gguf_loader)
+        inject(module, optimize_config, model_config, weights_loader)
     # pre load lm_head because its big inter result
-    load_weights(module.lm_head, gguf_loader, "lm_head.")
-    load_weights(module, gguf_loader)
-    module.gguf_loader = gguf_loader
+    load_weights(module.lm_head, weights_loader, "lm_head.")
+    load_weights(module, weights_loader)
+    module.gguf_loader = weights_loader
     del_meta(module)
     torch.cuda.empty_cache()
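The hunk above swaps the hard-coded GGUFLoader for ModelLoaderFactory.create_loader, so optimize_and_load_gguf no longer cares which on-disk format backs the weights. The factory body is not part of this hunk; the sketch below is only a plausible reading of the dispatch it performs (the loader class names come from this PR, the directory-walk and extension logic is an assumption):

import os
from ktransformers.util.custom_loader import SafeTensorLoader, GGUFLoader

def create_loader_sketch(path: str):
    # Hypothetical dispatch: prefer safetensors when any are present, else fall back to GGUF.
    for _, _, files in os.walk(path):
        if any(f.endswith(".safetensors") for f in files):
            return SafeTensorLoader(path)
    return GGUFLoader(path)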
@@ -0,0 +1,91 @@
+- match:
+    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
+  replace:
+    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+- match:
+    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
+    class: torch.nn.Linear  # only match modules matching name and class simultaneously
+  replace:
+    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+      generate_op: "KLinearFP8"
+      prefill_op: "KLinearTorch"
+- match:
+    name: "^model\\.layers\\..*\\.mlp$"
+    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
+  replace:
+    class: ktransformers.operators.experts.KDeepseekV3MoEV2  # mlp module with custom forward function
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+- match:
+    class: ktransformers.models.modeling_deepseek_v3.MoEGate
+  replace:
+    class: ktransformers.operators.gate.KMoEGate
+    kwargs:
+      generate_device: "cuda:0"
+      prefill_device: "cuda:0"
+- match:
+    name: "^model\\.layers\\..*\\.mlp\\.experts$"
+  replace:
+    class: ktransformers.operators.experts.KTransformersExpertsV2  # custom MoE Kernel with expert paralleism
+    kwargs:
+      prefill_device: "cuda"
+      prefill_op: "KExpertsTorch"
+      generate_device: "cpu"
+      generate_op: "KExpertsCPU"
+      out_device: "cuda"
+      backend: "llamafile"
+  recursive: False  # don't recursively inject submodules of this module
+- match:
+    name: "^model\\.layers\\..*\\.self_attn$"
+  replace:
+    class: ktransformers.operators.balance_serve_attention.flashinfer_attn  # optimized MLA implementation
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+- match:
+    name: "^model$"
+  replace:
+    class: "ktransformers.operators.models.KDeepseekV2Model"
+    kwargs:
+      per_layer_prefill_intput_threshold: 0  # 0 is close layer wise prefill
+- match:
+    name: "^model.embed_tokens"
+  replace:
+    class: "default"
+    kwargs:
+      generate_device: "cpu"
+      prefill_device: "cpu"
+
+- match:
+    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RMSNorm
+  replace:
+    class: ktransformers.operators.layernorm.RMSNorm
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+
+- match:
+    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MLP
+  replace:
+    class: ktransformers.operators.mlp.kDeepseekV3MLP
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+
+- match:
+    name: "^lm_head$"  # regular expression
+    class: torch.nn.Linear  # only match modules matching name and class simultaneously
+  replace:
+    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+      generate_op: "VLinearMarlin"
+      prefill_op: "KLinearTorch"
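The 91 added lines form an injection-rule YAML of the kind ktransformers feeds to optimize_and_load_gguf; each "- match:" block pairs a module selector (a name regex and/or class) with the optimized operator that replaces it. A hedged usage sketch follows (the paths are placeholders and the exact import path of optimize_and_load_gguf is assumed from the hunk above, not confirmed by this diff):

# Illustrative only: wire a rule file like the one above into model optimization.
from transformers import AutoConfig
from ktransformers.optimize.optimize import optimize_and_load_gguf  # assumed module path

config = AutoConfig.from_pretrained("/path/to/DeepSeek-V3", trust_remote_code=True)
# with torch.device("meta"):
#     model = build_model(config)          # hypothetical model construction step
# optimize_and_load_gguf(model, "/path/to/rules.yaml", "/path/to/weights", config)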
@@ -128,10 +128,7 @@ class ArgumentParser:
         else:
             args.model_dir = self.cfg.model_dir
             args.model_path = self.cfg.model_path
-        # set config from args
-        for key, value in vars(args).items():
-            if value is not None and hasattr(self.cfg, key):
-                setattr(self.cfg, key, value)
+
         # we add the name not match args individually
         self.cfg.model_device = args.device
         self.cfg.mount_web = args.web
|
||||||
self.cfg.user_force_think = args.force_think
|
self.cfg.user_force_think = args.force_think
|
||||||
|
|
||||||
model_config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=True)
|
model_config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=True)
|
||||||
if args.architectures == "Qwen3MoeForCausalLM" or args.architectures == "Qwen2MoeForCausalLM" :
|
if model_config.architectures[0] == "Qwen3MoeForCausalLM" or model_config.architectures[0] == "Qwen2MoeForCausalLM" :
|
||||||
args.gpu_memory_size = args.cache_lens*2*2*model_config.num_hidden_layers*model_config.num_key_value_heads*model_config.head_dim
|
args.gpu_memory_size = args.cache_lens*2*2*model_config.num_hidden_layers*model_config.num_key_value_heads*model_config.head_dim
|
||||||
|
args.architectures = model_config.architectures[0]
|
||||||
else:
|
else:
|
||||||
args.gpu_memory_size = args.cache_lens*2*576*61
|
args.gpu_memory_size = args.cache_lens*2*576*61
|
||||||
|
# set config from args
|
||||||
|
for key, value in vars(args).items():
|
||||||
|
if value is not None and hasattr(self.cfg, key):
|
||||||
|
setattr(self.cfg, key, value)
|
||||||
self.cfg.gpu_memory_size = args.gpu_memory_size
|
self.cfg.gpu_memory_size = args.gpu_memory_size
|
||||||
free_ports = get_free_ports(3, [args.port])
|
free_ports = get_free_ports(3, [args.port])
|
||||||
args.sched_port = free_ports[0]
|
args.sched_port = free_ports[0]
|
||||||
|
|
|
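With the two hunks above the server no longer relies on a separately supplied architectures argument; the value is read from the model's own config and written back into args before the settings are copied onto the server config. A minimal sketch of the same derivation (the model path is a placeholder):

# Illustrative only: derive the architecture from config.json instead of a CLI flag.
from transformers import AutoConfig

model_config = AutoConfig.from_pretrained("/path/to/model", trust_remote_code=True)
architecture = model_config.architectures[0]   # e.g. "Qwen3MoeForCausalLM"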
@@ -197,7 +197,7 @@ class Engine:
         self.block_num = inference_context.k_cache[0].size(1)
         #@TODO add config
         if config.architectures[0] == "Qwen2MoeForCausalLM" or config.architectures[0] == "Qwen3MoeForCausalLM":
-            self.model.init_wrapper(self.args.use_cuda_graph, self.device, 1024 ,args.max_batch_size, self.block_num) # TODO: 1024 is a magic number(max_batch_tokens)
+            self.model.init_wrapper(self.args.use_cuda_graph, self.device, Config().chunk_size, args.max_batch_size, self.block_num) # TODO: 1024 is a magic number(max_batch_tokens)
         else:
             self.model.init_wrapper(self.args.use_cuda_graph, self.device, args.max_batch_size, self.block_num)

@@ -200,7 +200,7 @@ class ForwardBatchInput:
         device=None,
         tokens: torch.Tensor = None,
         num_mini_batches: int = 1,
-        max_seq_length: int = 1024, # TODO: add to yaml
+        max_seq_length: int = 4096, # TODO: add to yaml
         prefill_query_length: int = (Config().chunk_size - Config().max_decode_batch_size) // Config().max_prefill_batch_size, # TODO: use config
         prefill_active_length: int = (Config().chunk_size - Config().max_decode_batch_size) // Config().max_prefill_batch_size,
         gen_prefill: bool = True,
@@ -223,12 +223,12 @@ class ForwardBatchInput:

         decode_querys_info = []
         for i in range(min(decode_batch_size, cuda_lens)):
-            query_info = QueryInfo(i+Config().max_prefill_batch_size, prefill_query_length, max_seq_length, page_size, device, is_prefill=False, offset=offset)
+            query_info = QueryInfo(i+Config().max_prefill_batch_size, prefill_query_length, 256, page_size, device, is_prefill=False, offset=offset)
             offset += max_seq_length // page_size
             if tokens is not None:
                 query_info.query_tokens[prefill_active_length:prefill_active_length + 1].copy_(tokens)
             if decode_active_position is None:
-                query_info.active_position = prefill_active_length
+                query_info.active_position = 255
             else:
                 query_info.active_position = decode_active_position[i]

@@ -39,6 +39,17 @@ def pad_num_tokens(num_tokens):

 def deduplicate_and_sort(lst):
     return sorted(set(lst))
+def generate_cuda_graphs(chunk_size: int) -> list:
+    # assert if the input does not meet the requirements
+    assert chunk_size <= 1024 or chunk_size % 1024 == 0, "chunk_size must <= 1024 or a multiple of 1024"
+    base_list = [1, 2, 3, Config().max_batch_size, 64, 256, 512, chunk_size]
+
+    if chunk_size <= 1024:
+        return base_list
+
+    multiples = [i for i in range(1024, chunk_size + 1, 1024)]
+
+    return deduplicate_and_sort(base_list + multiples)
 class ModelRunner:
     """A CudaGraphRunner runs the forward pass of a model with CUDA graph and torch.compile."""

@@ -56,7 +67,7 @@ class ModelRunner:
         self.features_buf = None
         self.output = None
         self.graph_memory_pool = None
-        self.cuda_graphs = deduplicate_and_sort([1, 2, 3, Config().max_batch_size, 64, Config().chunk_size])
+        self.cuda_graphs = generate_cuda_graphs(Config().chunk_size)
         self.use_cuda_graph = use_cuda_graph
         self.model_time = 0
         self.page_size = page_size
@@ -7,7 +7,7 @@ sys.path.append(current_path+"/../..")
 import numpy as np
 # from ktransformers.operators.linear import KTransformersLinear, KLinearMarlin
 # from ktransformers.operators.experts import KTransformersExperts, KExpertsTorch
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
 import torch
 import KTransformersOps
 torch.set_default_dtype(torch.bfloat16)
@@ -9,7 +9,7 @@ from pycuda.compiler import SourceModule
 import numpy as np
 from ktransformers.operators.linear import KTransformersLinear, KLinearMarlin
 from ktransformers.operators.experts import KTransformersExperts, KExpertsTorch
-from ktransformers.util.custom_gguf import GGUFLoader, dequantize_q4_k_gpu, dequantize_q4_k
+from ktransformers.util.custom_loader import GGUFLoader, dequantize_q4_k_gpu, dequantize_q4_k
 import torch
 import KTransformersOps
 torch.set_default_dtype(torch.bfloat16)
@@ -159,5 +159,7 @@ if __name__ == "__main__":
         prompt = ktansformer_prompt1024
     elif args.prompt_lens == 2048:
         prompt = ktansformer_prompt1024 * 2
+    elif args.prompt_lens == 4096:
+        prompt = ktansformer_prompt1024 * 4
     asyncio.run(main(args.concurrent, prompt, max_tokens, model))

@@ -25,7 +25,6 @@ import os
 from enum import IntEnum
 import torch
 import KTransformersOps
-from .custom_loader import SafeTensorLoader
 import ctypes
 import math

|
@ -166,238 +165,6 @@ DATA_TYPES = {
|
||||||
"FP8": 13,
|
"FP8": 13,
|
||||||
}
|
}
|
||||||
|
|
||||||
class GGUFLoader:
|
|
||||||
tensor_info: dict
|
|
||||||
gguf_path: str
|
|
||||||
tensor_file_map: dict # {tensor_name: tensor_file_path}
|
|
||||||
gguf_file_meta: dict
|
|
||||||
safetensor_loader: SafeTensorLoader
|
|
||||||
def __init__(self, gguf_path: str):
|
|
||||||
# Check dir exist
|
|
||||||
if not os.path.exists(gguf_path):
|
|
||||||
raise FileNotFoundError(f"GGUF dir not found: {gguf_path}")
|
|
||||||
if os.path.isfile(gguf_path):
|
|
||||||
gguf_path = os.path.dirname(gguf_path)
|
|
||||||
|
|
||||||
self.safetensor_loader = None
|
|
||||||
|
|
||||||
self.tensor_info = {}
|
|
||||||
self.gguf_path = gguf_path
|
|
||||||
self.tensor_file_map = {}
|
|
||||||
self.file_data_map = {}
|
|
||||||
self.gguf_file_meta = {}
|
|
||||||
self.tensor_device_map = {}
|
|
||||||
|
|
||||||
# I know this is ugly, but I don't want to change the original code too much
|
|
||||||
# TODO: merge gguf load and other loads.
|
|
||||||
safetensor_loader = SafeTensorLoader(gguf_path)
|
|
||||||
if safetensor_loader.tensor_file_map:
|
|
||||||
self.safetensor_loader = safetensor_loader
|
|
||||||
return
|
|
||||||
# Walk through all the .gguf files in the directory
|
|
||||||
found_gguf = False
|
|
||||||
for root, dirs, files in os.walk(gguf_path):
|
|
||||||
for file in files:
|
|
||||||
if file.endswith(".gguf"):
|
|
||||||
found_gguf = True
|
|
||||||
file_name = os.path.join(root, file)
|
|
||||||
with open(file_name, "rb") as f:
|
|
||||||
self.load_gguf(f)
|
|
||||||
if file_name not in self.file_data_map:
|
|
||||||
self.file_data_map[file_name] = np.memmap(file_name, mode = 'r')
|
|
||||||
if not found_gguf:
|
|
||||||
raise FileNotFoundError(f"Cannot find any .gguf files in: {gguf_path}")
|
|
||||||
|
|
||||||
def load_gguf(self, f):
|
|
||||||
f.seek(0)
|
|
||||||
assert f.read(4) == b'GGUF'
|
|
||||||
values = struct.unpack("<IQQ", f.read(4+8+8))
|
|
||||||
version, n_tensors, n_kv = values
|
|
||||||
if version != 3:
|
|
||||||
warnings.warn(f"Version {version} has never been tested, might not work")
|
|
||||||
|
|
||||||
info = {}
|
|
||||||
for _ in range(n_kv):
|
|
||||||
name = read_value(f, DATA_TYPES["string"])
|
|
||||||
|
|
||||||
data_type = struct.unpack("<I", f.read(4))[0]
|
|
||||||
|
|
||||||
info[name] = read_value(f, data_type)
|
|
||||||
|
|
||||||
tensor_info = {}
|
|
||||||
for _ in range(n_tensors):
|
|
||||||
name = read_value(f, DATA_TYPES["string"])
|
|
||||||
shape_len = read_value(f, DATA_TYPES["uint32"])
|
|
||||||
shape = [read_value(f, DATA_TYPES["uint64"]) for _ in range(shape_len)]
|
|
||||||
ggml_type = read_value(f, DATA_TYPES["uint32"])
|
|
||||||
bad_offset = read_value(f, DATA_TYPES["uint64"])
|
|
||||||
n_elems = int(math.prod(shape))
|
|
||||||
block_size, type_size = GGML_QUANT_SIZES[ggml_type]
|
|
||||||
n_bytes = n_elems * type_size // block_size
|
|
||||||
np_dims = tuple(reversed(shape))
|
|
||||||
|
|
||||||
item_type: npt.DTypeLike
|
|
||||||
if ggml_type == GGMLQuantizationType.F16:
|
|
||||||
item_count = n_elems
|
|
||||||
item_type = np.float16
|
|
||||||
elif ggml_type == GGMLQuantizationType.F32:
|
|
||||||
item_count = n_elems
|
|
||||||
item_type = np.float32
|
|
||||||
elif ggml_type == GGMLQuantizationType.F64:
|
|
||||||
item_count = n_elems
|
|
||||||
item_type = np.float64
|
|
||||||
elif ggml_type == GGMLQuantizationType.I8:
|
|
||||||
item_count = n_elems
|
|
||||||
item_type = np.int8
|
|
||||||
elif ggml_type == GGMLQuantizationType.I16:
|
|
||||||
item_count = n_elems
|
|
||||||
item_type = np.int16
|
|
||||||
elif ggml_type == GGMLQuantizationType.I32:
|
|
||||||
item_count = n_elems
|
|
||||||
item_type = np.int32
|
|
||||||
elif ggml_type == GGMLQuantizationType.I64:
|
|
||||||
item_count = n_elems
|
|
||||||
item_type = np.int64
|
|
||||||
else:
|
|
||||||
item_count = n_bytes
|
|
||||||
item_type = np.uint8
|
|
||||||
np_dims = quant_shape_to_byte_shape(np_dims, ggml_type)
|
|
||||||
|
|
||||||
tensor_info[name] = {
|
|
||||||
"ggml_type": ggml_type,
|
|
||||||
"shape": shape,
|
|
||||||
"bad_offset": bad_offset,
|
|
||||||
"item_type": item_type,
|
|
||||||
"item_count": item_count,
|
|
||||||
"np_dims": np_dims
|
|
||||||
}
|
|
||||||
|
|
||||||
start = f.tell()
|
|
||||||
# Alignment is 32 by default.
|
|
||||||
# https://github.com/ggerganov/ggml/blob/e1daebbf9d38d510ba456c4d50b4500a73ac2b14/docs/gguf.md?plain=1#L253
|
|
||||||
alignment = info.get("general.alignment", 32)
|
|
||||||
|
|
||||||
# Inconveniently, the offset defined in gguf files is relative to the
|
|
||||||
# end of the header and is unaligned.
|
|
||||||
# We need to compute the absolute file offset ourselves instead.
|
|
||||||
for t in tensor_info.values():
|
|
||||||
offset = start + t["bad_offset"]
|
|
||||||
offset += (alignment - offset % alignment) % alignment
|
|
||||||
t["offset"] = offset
|
|
||||||
|
|
||||||
for name in tensor_info:
|
|
||||||
self.tensor_file_map[name] = f.name
|
|
||||||
self.tensor_info.update(tensor_info)
|
|
||||||
self.gguf_file_meta.update(info)
|
|
||||||
|
|
||||||
def get_mmap_tensor(self, name):
|
|
||||||
t = self.tensor_info[name]
|
|
||||||
mmap_data = self.file_data_map[ self.tensor_file_map[name] ]
|
|
||||||
|
|
||||||
offset = t["offset"]
|
|
||||||
item_type = t["item_type"]
|
|
||||||
item_count = t["item_count"]
|
|
||||||
itemsize = int(np.empty([], dtype = item_type).itemsize)
|
|
||||||
return mmap_data[offset : offset + itemsize * item_count]
|
|
||||||
|
|
||||||
def get_undequanted_tensor_and_ggml_type(self, name):
|
|
||||||
t = self.tensor_info[name]
|
|
||||||
data = self.get_mmap_tensor(name)
|
|
||||||
ggml_type = t["ggml_type"]
|
|
||||||
data = torch.from_numpy(data)
|
|
||||||
return data, ggml_type
|
|
||||||
|
|
||||||
def load_expert_tensor(self, name, data, expert_id, elements_per_expert, device = "cuda", target_dtype = torch.get_default_dtype())->torch.Tensor:
|
|
||||||
t = self.tensor_info[name]
|
|
||||||
if device.lower() == "cpu":
|
|
||||||
print(f"loading expert {expert_id} of {name} with CPU")
|
|
||||||
shape = t["shape"]
|
|
||||||
ggml_type = t["ggml_type"]
|
|
||||||
if ggml_type not in GGML_NAMES:
|
|
||||||
raise NotImplementedError(f"ggml_type {ggml_type} not implemented")
|
|
||||||
ggml_name = GGML_NAMES[ggml_type]
|
|
||||||
|
|
||||||
# TODO: experts may fused in quant block, split it
|
|
||||||
assert elements_per_expert % GGML_ELEMENTS_PER_BLOCK[ggml_name] == 0, "experts may fused in quant block, please use CPU dequant"
|
|
||||||
|
|
||||||
blocks_per_experts = elements_per_expert // GGML_ELEMENTS_PER_BLOCK[ggml_name]
|
|
||||||
block_size = GGML_BLOCK_SIZES[ggml_name]
|
|
||||||
offset = expert_id * block_size * blocks_per_experts
|
|
||||||
data = data[offset: offset + block_size * blocks_per_experts]
|
|
||||||
|
|
||||||
if "cuda" in device.lower():
|
|
||||||
values = GGML_DEQUANTIZE_GPU[ggml_name](data, device, target_dtype)
|
|
||||||
else:
|
|
||||||
values = GGML_DEQUANTIZE[ggml_name](data)
|
|
||||||
values = torch.from_numpy(values.copy())
|
|
||||||
|
|
||||||
if ggml_name == "BF16":
|
|
||||||
values = values.view(torch.bfloat16)
|
|
||||||
values = values.view(shape[-2::-1])
|
|
||||||
|
|
||||||
return values
|
|
||||||
|
|
||||||
def load_gguf_tensor(self, name: str, device:str = "cpu", target_dtype = None)->torch.Tensor:
|
|
||||||
t = self.tensor_info[name]
|
|
||||||
if device.lower() == "cpu":
|
|
||||||
print(f"loading {name} with CPU")
|
|
||||||
if target_dtype == None:
|
|
||||||
target_dtype = torch.get_default_dtype()
|
|
||||||
|
|
||||||
shape = t["shape"]
|
|
||||||
ggml_type = t["ggml_type"]
|
|
||||||
|
|
||||||
if ggml_type not in GGML_NAMES:
|
|
||||||
raise NotImplementedError(f"ggml_type {ggml_type} not implemented")
|
|
||||||
|
|
||||||
ggml_name = GGML_NAMES[ggml_type]
|
|
||||||
|
|
||||||
data = self.get_mmap_tensor(name)
|
|
||||||
|
|
||||||
block_size = GGML_BLOCK_SIZES[ggml_name]
|
|
||||||
elements_per_block = GGML_ELEMENTS_PER_BLOCK[ggml_name]
|
|
||||||
num_elements = int(np.prod(shape))
|
|
||||||
num_blocks = num_elements // elements_per_block
|
|
||||||
|
|
||||||
blocks_per_iter = 16384
|
|
||||||
if num_blocks > blocks_per_iter: # dequant large tensor
|
|
||||||
values = torch.empty((num_blocks, elements_per_block), dtype=target_dtype, device=device)
|
|
||||||
for i in range( (num_blocks + blocks_per_iter - 1) // blocks_per_iter):
|
|
||||||
blocks_begin = i * blocks_per_iter
|
|
||||||
blocks_end = min(blocks_begin + blocks_per_iter, num_blocks)
|
|
||||||
if "cuda" in device.lower():
|
|
||||||
cur_values = GGML_DEQUANTIZE_GPU[ggml_name](data[blocks_begin*block_size : blocks_end*block_size], device, target_dtype)
|
|
||||||
else:
|
|
||||||
cur_values = GGML_DEQUANTIZE[ggml_name](data[blocks_begin*block_size : blocks_end*block_size])
|
|
||||||
cur_values = torch.from_numpy(cur_values.copy())
|
|
||||||
|
|
||||||
cur_values = cur_values.view(-1, elements_per_block)
|
|
||||||
if ggml_name == "BF16":
|
|
||||||
cur_values = cur_values.view(torch.bfloat16)
|
|
||||||
values[blocks_begin : blocks_end] = cur_values
|
|
||||||
else:
|
|
||||||
if "cuda" in device.lower():
|
|
||||||
values = GGML_DEQUANTIZE_GPU[ggml_name](data, device)
|
|
||||||
else:
|
|
||||||
values = GGML_DEQUANTIZE[ggml_name](data)
|
|
||||||
values = torch.from_numpy(values)
|
|
||||||
|
|
||||||
if ggml_name == "BF16":
|
|
||||||
values = values.view(torch.bfloat16)
|
|
||||||
|
|
||||||
|
|
||||||
values = values.view(shape[::-1])
|
|
||||||
if "attn_q" in name and self.gguf_file_meta['general.architecture'] in ["llama"]:
|
|
||||||
n_head = self.gguf_file_meta['llama.attention.head_count']
|
|
||||||
values = (values.reshape(n_head, values.shape[0] // n_head // 2, 2, *values.shape[1:])
|
|
||||||
.swapaxes(1, 2)
|
|
||||||
.reshape(values.shape))
|
|
||||||
elif "attn_k" in name and self.gguf_file_meta['general.architecture'] in ["llama"]:
|
|
||||||
n_head = self.gguf_file_meta['llama.attention.head_count_kv']
|
|
||||||
values = (values.reshape(n_head, values.shape[0] // n_head // 2, 2, *values.shape[1:])
|
|
||||||
.swapaxes(1, 2)
|
|
||||||
.reshape(values.shape))
|
|
||||||
return values
|
|
||||||
|
|
||||||
def read_value(f, data_type):
|
def read_value(f, data_type):
|
||||||
if data_type == DATA_TYPES["string"]:
|
if data_type == DATA_TYPES["string"]:
|
||||||
|
@@ -921,6 +688,7 @@ def translate_name_to_gguf(name):
    name = name.replace(".gate_up_proj.", ".up_proj")

    name = name.replace(".mlp.shared_experts.down_proj", ".ffn_down_shexp")
    name = name.replace(".mlp.gate.e_score_correction_bias", ".exp_probs_b.bias")
    name = name.replace(".mlp.gate", ".ffn_gate_inp")
    name = name.replace(".mlp.shared_experts.gate_proj", ".ffn_gate_shexp")
    name = name.replace(".mlp.shared_experts.up_proj", ".ffn_up_shexp")
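The inserted rule sits above the bare ".mlp.gate" rule on purpose: the replacements run sequentially, so the longer ".mlp.gate.e_score_correction_bias" pattern must be rewritten first. A small illustrative sketch using only the rules visible in this hunk (the real translate_name_to_gguf applies many more):

# Illustrative only: ordering matters because str.replace runs in sequence.
name = "model.layers.3.mlp.gate.e_score_correction_bias"   # hypothetical HF-style key
name = name.replace(".mlp.gate.e_score_correction_bias", ".exp_probs_b.bias")
name = name.replace(".mlp.gate", ".ffn_gate_inp")           # no longer matches; already rewritten
# -> "model.layers.3.exp_probs_b.bias" (before the remaining rules elsewhere in the function)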
@@ -10,12 +10,35 @@ import torch
import KTransformersOps
from safetensors import safe_open
from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant
from ktransformers.util.custom_gguf import *
from safetensors.torch import save_file
from abc import ABC, abstractmethod
from typing import Dict, Any, Optional, Union

class ModelLoader(ABC):
    """
    Abstract base class for model loaders.
    Defines the interface that all model loaders must implement.
    """
    tensor_file_map = {}

    @abstractmethod
    def has_tensor(cls, name: str):
        """
        Check if the tensor exists in the loader.

        Args:
            name: Name of the tensor to check

        Returns:
            bool: True if the tensor exists, False otherwise
        """
        pass

class SafeTensorLoader(ModelLoader):
    tensor_file_map: dict
    tensor_type_map: dict
    file_handle_map: dict
    tensor_device_map: dict

    def __init__(self, file_path: str):
        self.__load_tensor_file_map(file_path)
@@ -28,6 +51,10 @@ class SafeTensorLoader:
            folder_path = os.path.dirname(file_path)
        else:
            folder_path = file_path
        self.file_handle_map = {}
        self.tensor_file_map = {}
        self.tensor_type_map = {}
        self.tensor_device_map = {}

        found_safetensor = False
        for root, _, files in os.walk(folder_path):
@@ -57,7 +84,11 @@ class SafeTensorLoader:
            # raise FileNotFoundError(f"No Safetensor files found in {folder_path}")

    def load_tensor(self, key: str, device: str="cpu"):
        if translate_name_to_gguf(key) in self.tensor_file_map:
            key = translate_name_to_gguf(key)
        elif key in self.tensor_file_map:
            pass
        else:
            raise KeyError(f"Key {key} not found in Safetensor files")
        file = self.tensor_file_map[key]
        f = self.file_handle_map.get(file)
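The effect is that load_tensor now accepts either a HuggingFace-style key or its GGUF translation. A hedged sketch, assuming loader wraps a hybrid checkpoint whose tensors are stored under GGUF-style names (the key is illustrative):

# Sketch: both spellings of the same key resolve to one tensor on a GGUF-named checkpoint.
hf_key = "model.layers.0.mlp.shared_experts.up_proj.weight"   # illustrative HF-style key
w1 = loader.load_tensor(hf_key)                                # translated to the GGUF name internally
w2 = loader.load_tensor(translate_name_to_gguf(hf_key))        # pre-translated key resolves to the same file
assert torch.equal(w1, w2)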
@@ -66,13 +97,145 @@ class SafeTensorLoader:
        tensor = f.get_tensor(key)
        return tensor.to(device)

    def load_experts(self, key: str, device: str="cpu"):
        '''
        Load experts from safetensor
        key: the name of the experts
        device: the device to load the experts to
        return: dict,
            {up: tensor, down: tensor, gate: tensor, up_type: int, down_type: int, gate_type: int}
            {xxx}_type: the type of the up tensor, corresponding to the ggml type
        '''
        if self.has_tensor(translate_name_to_gguf(key)+".ffn_gate_exps.weight"):
            # legacy branch for loading hybrid model
            base_key = translate_name_to_gguf(key)
            # Load experts from safetensor
            gate_key = f"{base_key}.ffn_gate_exps.weight"
            gate_type_key = f"{base_key}.ffn_gate_exps.ggml_type"
            up_key = f"{base_key}.ffn_up_exps.weight"
            up_type_key = f"{base_key}.ffn_up_exps.ggml_type"
            down_key = f"{base_key}.ffn_down_exps.weight"
            down_type_key = f"{base_key}.ffn_down_exps.ggml_type"
            gate_tensor = self.load_tensor(gate_key, device).numpy()
            up_tensor = self.load_tensor(up_key, device).numpy()
            down_tensor = self.load_tensor(down_key, device).numpy()
            gate_type = self.load_tensor(gate_type_key, device).item()
            up_type = self.load_tensor(up_type_key, device).item()
            down_type = self.load_tensor(down_type_key, device).item()

            return {
                "up": up_tensor,
                "gate": gate_tensor,
                "down": down_tensor,
                "up_type": up_type,
                "gate_type": gate_type,
                "down_type": down_type
            }

        else:
            # Load experts from safetensor
            base_key = key  # e.g. "model.layers.3.mlp.experts"
            experts_count = 0

            # First, count how many experts we have by checking for expert 0's up_proj
            while self.has_tensor(f"{base_key}.{experts_count}.up_proj.weight"):
                experts_count += 1

            if experts_count == 0:
                raise ValueError(f"No experts found for key {base_key}")

            # Initialize empty lists to store tensors for each projection type
            up_projs = []
            gate_projs = []
            down_projs = []

            # Load all expert weights
            for expert_id in range(experts_count):
                up_key = f"{base_key}.{expert_id}.up_proj.weight"
                gate_key = f"{base_key}.{expert_id}.gate_proj.weight"
                down_key = f"{base_key}.{expert_id}.down_proj.weight"

                up_tensor = self.load_tensor(up_key, device)
                gate_tensor = self.load_tensor(gate_key, device)
                down_tensor = self.load_tensor(down_key, device)

                up_projs.append(up_tensor)
                gate_projs.append(gate_tensor)
                down_projs.append(down_tensor)

            # Stack the tensors along a new dimension
            up_tensor = torch.stack(up_projs, dim=0)
            gate_tensor = torch.stack(gate_projs, dim=0)
            down_tensor = torch.stack(down_projs, dim=0)

            # Get original dtype for GGML type determination
            orig_up_dtype = up_tensor.dtype
            orig_gate_dtype = gate_tensor.dtype
            orig_down_dtype = down_tensor.dtype

            # Convert to numpy with proper bfloat16 support
            up_numpy = up_tensor.view(torch.uint16).numpy()
            gate_numpy = gate_tensor.view(torch.uint16).numpy()
            down_numpy = down_tensor.view(torch.uint16).numpy()

            # Determine tensor data types for GGML conversion
            def get_ggml_type(dtype):
                if dtype == torch.float32:
                    return GGMLQuantizationType.F32
                elif dtype == torch.float16:
                    return GGMLQuantizationType.F16
                elif dtype == torch.bfloat16:
                    return GGMLQuantizationType.BF16
                else:
                    raise ValueError(f"Unsupported tensor dtype: {dtype}")

            return {
                "up": up_numpy,
                "gate": gate_numpy,
                "down": down_numpy,
                "up_type": get_ggml_type(orig_up_dtype),
                "gate_type": get_ggml_type(orig_gate_dtype),
                "down_type": get_ggml_type(orig_down_dtype)
            }

    def load_gate(self, key: str, device: str="cpu"):
        '''
        Load gate from safetensor
        key: the name of the gate
        device: the device to load the gate to
        return: dict,
            {'weight': tensor, 'e_score_correction_bias': tensor}
        '''
        target = ["weight", "e_score_correction_bias"]
        res = {'weight': None, 'e_score_correction_bias': None}
        if self.has_tensor(translate_name_to_gguf(key)+".ffn_gate_exps.weight"):
            # legacy branch for loading hybrid model
            base_key = key
            for k in target:
                translated_key = translate_name_to_gguf(f"{base_key}.{k}")
                if self.has_tensor(translated_key):
                    tensor = self.load_tensor(translated_key, device)
                    res[k] = tensor
        else:
            # Load gate from safetensor
            base_key = key
            for k in target:
                if self.has_tensor(f"{base_key}.{k}"):
                    tensor = self.load_tensor(f"{base_key}.{k}", device)
                    res[k] = tensor
        return res

    def close_all_handles(self):
        for handle in self.file_handle_map.values():
            handle.close()
        self.file_handle_map.clear()

    def load_dequantized_tensor(self, key:str, device: str="cpu"):
        if key in self.tensor_file_map and translate_name_to_gguf(key):
            pass
        elif translate_name_to_gguf(key) in self.tensor_file_map:
            key = translate_name_to_gguf(key)
        else:
            raise KeyError(f"Key {key} not found in Safetensor files")
        file = self.tensor_file_map[key]
        f = self.file_handle_map.get(file)
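A hedged usage sketch for the two new helpers, assuming a safetensors checkpoint laid out with per-expert projection weights (the layer index and key names are illustrative):

# Sketch: gather one MoE layer's routed experts and its gating parameters.
experts = loader.load_experts("model.layers.3.mlp.experts", device="cpu")
up, gate, down = experts["up"], experts["gate"], experts["down"]   # numpy arrays stacked along dim 0
print(up.shape, experts["up_type"])                                 # (n_experts, ...), plus a GGML type id

router = loader.load_gate("model.layers.3.mlp.gate", device="cpu")
print(router["weight"].shape, router["e_score_correction_bias"] is not None)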
@@ -84,3 +247,314 @@ class SafeTensorLoader:
            weight_scale_inv = f.get_tensor(key[:-7] + ".weight_scale_inv").to(device)
            tensor = weight_dequant(tensor, weight_scale_inv)
        return tensor.to(device)

    def has_tensor(self, name: str):
        return name in self.tensor_file_map or translate_name_to_gguf(name) in self.tensor_file_map

class GGUFLoader(ModelLoader):
    tensor_info: dict
    gguf_path: str
    tensor_file_map: dict # {tensor_name: tensor_file_path}
    gguf_file_meta: dict
    safetensor_loader: SafeTensorLoader

    def __init__(self, gguf_path: str):
        # Check dir exist
        if not os.path.exists(gguf_path):
            raise FileNotFoundError(f"GGUF dir not found: {gguf_path}")
        if os.path.isfile(gguf_path):
            gguf_path = os.path.dirname(gguf_path)

        self.safetensor_loader = None

        self.tensor_info = {}
        self.gguf_path = gguf_path
        self.tensor_file_map = {}
        self.file_data_map = {}
        self.gguf_file_meta = {}
        self.tensor_device_map = {}

        # Walk through all the .gguf files in the directory
        found_gguf = False
        for root, dirs, files in os.walk(gguf_path):
            for file in files:
                if file.endswith(".gguf"):
                    found_gguf = True
                    file_name = os.path.join(root, file)
                    with open(file_name, "rb") as f:
                        self.load_gguf(f)
                        if file_name not in self.file_data_map:
                            self.file_data_map[file_name] = np.memmap(file_name, mode = 'r')
        if not found_gguf:
            raise FileNotFoundError(f"Cannot find any .gguf files in: {gguf_path}")

    def load_gguf(self, f):
        f.seek(0)
        assert f.read(4) == b'GGUF'
        values = struct.unpack("<IQQ", f.read(4+8+8))
        version, n_tensors, n_kv = values
        if version != 3:
            warnings.warn(f"Version {version} has never been tested, might not work")

        info = {}
        for _ in range(n_kv):
            name = read_value(f, DATA_TYPES["string"])
            data_type = struct.unpack("<I", f.read(4))[0]
            info[name] = read_value(f, data_type)

        tensor_info = {}
        for _ in range(n_tensors):
            name = read_value(f, DATA_TYPES["string"])
            shape_len = read_value(f, DATA_TYPES["uint32"])
            shape = [read_value(f, DATA_TYPES["uint64"]) for _ in range(shape_len)]
            ggml_type = read_value(f, DATA_TYPES["uint32"])
            bad_offset = read_value(f, DATA_TYPES["uint64"])
            n_elems = int(math.prod(shape))
            block_size, type_size = GGML_QUANT_SIZES[ggml_type]
            n_bytes = n_elems * type_size // block_size
            np_dims = tuple(reversed(shape))

            item_type: npt.DTypeLike
            if ggml_type == GGMLQuantizationType.F16:
                item_count = n_elems
                item_type = np.float16
            elif ggml_type == GGMLQuantizationType.F32:
                item_count = n_elems
                item_type = np.float32
            elif ggml_type == GGMLQuantizationType.F64:
                item_count = n_elems
                item_type = np.float64
            elif ggml_type == GGMLQuantizationType.I8:
                item_count = n_elems
                item_type = np.int8
            elif ggml_type == GGMLQuantizationType.I16:
                item_count = n_elems
                item_type = np.int16
            elif ggml_type == GGMLQuantizationType.I32:
                item_count = n_elems
                item_type = np.int32
            elif ggml_type == GGMLQuantizationType.I64:
                item_count = n_elems
                item_type = np.int64
            else:
                item_count = n_bytes
                item_type = np.uint8
                np_dims = quant_shape_to_byte_shape(np_dims, ggml_type)

            tensor_info[name] = {
                "ggml_type": ggml_type,
                "shape": shape,
                "bad_offset": bad_offset,
                "item_type": item_type,
                "item_count": item_count,
                "np_dims": np_dims
            }

        start = f.tell()
        # Alignment is 32 by default.
        # https://github.com/ggerganov/ggml/blob/e1daebbf9d38d510ba456c4d50b4500a73ac2b14/docs/gguf.md?plain=1#L253
        alignment = info.get("general.alignment", 32)

        # Inconveniently, the offset defined in gguf files is relative to the
        # end of the header and is unaligned.
        # We need to compute the absolute file offset ourselves instead.
        for t in tensor_info.values():
            offset = start + t["bad_offset"]
            offset += (alignment - offset % alignment) % alignment
            t["offset"] = offset

        for name in tensor_info:
            self.tensor_file_map[name] = f.name
        self.tensor_info.update(tensor_info)
        self.gguf_file_meta.update(info)

    def get_mmap_tensor(self, name):
        name = translate_name_to_gguf(name)
        t = self.tensor_info[name]
        mmap_data = self.file_data_map[ self.tensor_file_map[name] ]

        offset = t["offset"]
        item_type = t["item_type"]
        item_count = t["item_count"]
        itemsize = int(np.empty([], dtype = item_type).itemsize)
        return mmap_data[offset : offset + itemsize * item_count]

    def get_undequanted_tensor_and_ggml_type(self, name):
        name = translate_name_to_gguf(name)
        t = self.tensor_info[name]
        data = self.get_mmap_tensor(name)
        ggml_type = t["ggml_type"]
        data = torch.from_numpy(data)
        return data, ggml_type

    def load_expert_tensor(self, name, data, expert_id, elements_per_expert, device = "cuda", target_dtype = torch.get_default_dtype())->torch.Tensor:
        name = translate_name_to_gguf(name)
        t = self.tensor_info[name]
        shape = t["shape"]
        ggml_type = t["ggml_type"]
        if ggml_type not in GGML_NAMES:
            raise NotImplementedError(f"ggml_type {ggml_type} not implemented")
        ggml_name = GGML_NAMES[ggml_type]

        # TODO: experts may fused in quant block, split it
        assert elements_per_expert % GGML_ELEMENTS_PER_BLOCK[ggml_name] == 0, "experts may fused in quant block, please use CPU dequant"

        blocks_per_experts = elements_per_expert // GGML_ELEMENTS_PER_BLOCK[ggml_name]
        block_size = GGML_BLOCK_SIZES[ggml_name]
        offset = expert_id * block_size * blocks_per_experts
        data = data[offset: offset + block_size * blocks_per_experts]

        if "cuda" in device.lower():
            values = GGML_DEQUANTIZE_GPU[ggml_name](data, device, target_dtype)
        else:
            values = GGML_DEQUANTIZE[ggml_name](data)
            values = torch.from_numpy(values.copy())

        if ggml_name == "BF16":
            values = values.view(torch.bfloat16)
        values = values.view(shape[-2::-1])

        return values

    def load_gguf_tensor(self, name: str, device:str = "cpu", target_dtype = None)->torch.Tensor:
        name = translate_name_to_gguf(name)
        t = self.tensor_info[name]
        if target_dtype == None:
            target_dtype = torch.get_default_dtype()

        shape = t["shape"]
        ggml_type = t["ggml_type"]

        if ggml_type not in GGML_NAMES:
            raise NotImplementedError(f"ggml_type {ggml_type} not implemented")

        ggml_name = GGML_NAMES[ggml_type]

        data = self.get_mmap_tensor(name)

        block_size = GGML_BLOCK_SIZES[ggml_name]
        elements_per_block = GGML_ELEMENTS_PER_BLOCK[ggml_name]
        num_elements = int(np.prod(shape))
        num_blocks = num_elements // elements_per_block

        blocks_per_iter = 16384
        if num_blocks > blocks_per_iter: # dequant large tensor
            values = torch.empty((num_blocks, elements_per_block), dtype=target_dtype, device=device)
            for i in range( (num_blocks + blocks_per_iter - 1) // blocks_per_iter):
                blocks_begin = i * blocks_per_iter
                blocks_end = min(blocks_begin + blocks_per_iter, num_blocks)
                if "cuda" in device.lower():
                    cur_values = GGML_DEQUANTIZE_GPU[ggml_name](data[blocks_begin*block_size : blocks_end*block_size], device, target_dtype)
                else:
                    cur_values = GGML_DEQUANTIZE[ggml_name](data[blocks_begin*block_size : blocks_end*block_size])
                    cur_values = torch.from_numpy(cur_values.copy())

                cur_values = cur_values.view(-1, elements_per_block)
                if ggml_name == "BF16":
                    cur_values = cur_values.view(torch.bfloat16)
                values[blocks_begin : blocks_end] = cur_values
        else:
            if "cuda" in device.lower():
                values = GGML_DEQUANTIZE_GPU[ggml_name](data, device)
            else:
                values = GGML_DEQUANTIZE[ggml_name](data)
                values = torch.from_numpy(values)

            if ggml_name == "BF16":
                values = values.view(torch.bfloat16)

        values = values.view(shape[::-1])
        if "attn_q" in name and self.gguf_file_meta['general.architecture'] in ["llama"]:
            n_head = self.gguf_file_meta['llama.attention.head_count']
            values = (values.reshape(n_head, values.shape[0] // n_head // 2, 2, *values.shape[1:])
                      .swapaxes(1, 2)
                      .reshape(values.shape))
        elif "attn_k" in name and self.gguf_file_meta['general.architecture'] in ["llama"]:
            n_head = self.gguf_file_meta['llama.attention.head_count_kv']
            values = (values.reshape(n_head, values.shape[0] // n_head // 2, 2, *values.shape[1:])
                      .swapaxes(1, 2)
                      .reshape(values.shape))
        return values

    def has_tensor(self, name: str):
        name = translate_name_to_gguf(name)
        return name in self.tensor_info

    def get_ggml_type(self, name: str):
        name = translate_name_to_gguf(name)
        if name not in self.tensor_info:
            raise KeyError(f"Key {name} not found in GGUF files")
        return self.tensor_info[name]["ggml_type"]

class ModelLoaderFactory:
    """
    Factory class for creating model loaders.
    Automatically detects the model format based on file extensions in the directory.
    """

    @staticmethod
    def create_loader(path: str):
        """
        Create a model loader for the given path by detecting the model format.
        The function checks for the presence of .safetensors or .gguf files
        in the specified path and creates the appropriate loader.

        Args:
            path: Path to the model directory or file

        Returns:
            An appropriate ModelLoader instance (SafeTensorLoader or GGUFLoader)

        Raises:
            FileNotFoundError: If no supported model files are found in the path
        """
        if not os.path.exists(path):
            raise FileNotFoundError(f"Path not found: {path}")

        # Normalize to directory path if a file was provided
        if os.path.isfile(path):
            if path.endswith(".safetensors"):
                return SafeTensorLoader(path)
            elif path.endswith(".gguf"):
                return GGUFLoader(path)
            else:
                folder_path = os.path.dirname(path)
        else:
            folder_path = path

        # Check for safetensors files
        has_safetensors = False
        has_gguf = False

        for root, _, files in os.walk(folder_path):
            for file in files:
                if file.endswith(".safetensors"):
                    has_safetensors = True
                    break
                elif file.endswith(".gguf"):
                    has_gguf = True
                    break
            if has_safetensors or has_gguf:
                break

        # Create the appropriate loader based on detected file types
        # Prioritize SafeTensor over GGUF if both are present
        if has_safetensors:
            try:
                return SafeTensorLoader(folder_path)
            except Exception as e:
                print(f"Failed to create SafeTensorLoader: {e}")
                # Fall through to try GGUF if SafeTensor fails
                if not has_gguf:
                    raise

        if has_gguf:
            try:
                return GGUFLoader(folder_path)
            except Exception as e:
                print(f"Failed to create GGUFLoader: {e}")
                raise

        # No supported model files found
        raise FileNotFoundError(f"No .safetensors or .gguf files found in: {folder_path}")
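A minimal sketch of the factory in use (the checkpoint path and tensor name are placeholders):

# Sketch: format detection is automatic; safetensors is preferred over GGUF when both are present.
from ktransformers.util.custom_loader import ModelLoaderFactory, SafeTensorLoader

loader = ModelLoaderFactory.create_loader("/path/to/checkpoint_dir")   # SafeTensorLoader or GGUFLoader
key = "model.embed_tokens.weight"                                      # illustrative tensor name
if loader.has_tensor(key):
    tensor = (loader.load_dequantized_tensor(key)
              if isinstance(loader, SafeTensorLoader)
              else loader.load_gguf_tensor(key))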
@@ -22,8 +22,7 @@ from transformers import (
    EtaLogitsWarper,
)

from ktransformers.util.custom_loader import ModelLoaderFactory, ModelLoader, SafeTensorLoader, GGUFLoader, translate_name_to_gguf
from ktransformers.operators import base_operator
from ktransformers.models.custom_cache import StaticCache
from ktransformers.util.cuda_graph_runner import CUDAGraphRunner
@@ -98,25 +97,24 @@ def get_all_used_cuda_device(device_map:dict):
    all_device_list = list(all_device_list)
    return all_device_list

def load_cur_state_dict(module: nn.Module, gguf_loader: ModelLoader, prefix: str = ""):
    prefix = prefix.replace("orig_module.", "")
    persistent_buffers = {k: v for k, v in module._buffers.items() if k not in module._non_persistent_buffers_set}
    local_name_params = itertools.chain(module._parameters.items(), persistent_buffers.items())
    local_state = {k: v for k, v in local_name_params if v is not None}
    for name, param in local_state.items():
        key = prefix + name
        translated_key = key

        # TODO: Merge all loader.
        # I know this is ugly but lets do it for now.
        if isinstance(gguf_loader, SafeTensorLoader):
            load_dequantized_tensor = gguf_loader.load_dequantized_tensor
        else:
            load_dequantized_tensor = gguf_loader.load_gguf_tensor
            tensor_file_map = gguf_loader.tensor_file_map

        if gguf_loader.has_tensor(translated_key):
            target_dtype = torch.get_default_dtype()
            device = get_device(translated_key[:translated_key.rfind(".")], gguf_loader.tensor_device_map)
            print(f"loading {translated_key} to {device}")
@@ -128,7 +126,7 @@ def load_cur_state_dict(module: nn.Module, gguf_loader: GGUFLoader, prefix: str
            #print(load_config.tensor_file_map.keys())
            raise Exception(f"can't find {translated_key} in GGUF file!")

def load_weights(module:nn.Module, gguf_loader:ModelLoader, prefix=''):
    #print(f"recursively loading weights {prefix}")
    if not isinstance(module, base_operator.BaseInjectedModule):
        load_cur_state_dict(module, gguf_loader, prefix)
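Putting the pieces together, the typical entry point looks roughly like this (a sketch; the surrounding optimize-and-load flow is outside this hunk, and "model" stands for an already-instantiated torch module):

# Sketch: any ModelLoader produced by the factory can drive the recursive weight load.
loader = ModelLoaderFactory.create_loader("/path/to/checkpoint_dir")   # placeholder path
load_weights(model, loader)   # walks the module tree and fills each submodule's parameters,
                              # delegating the per-tensor work to load_cur_state_dict above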
367 ktransformers/util/weight_loader.py Normal file
@@ -0,0 +1,367 @@
from abc import ABC, abstractmethod
import os
import torch
import numpy as np
from safetensors import safe_open
from typing import Dict, Any, Optional, Union

class ModelLoader(ABC):
    """
    Abstract base class for model loaders.
    Defines the interface that all model loaders must implement.
    """

    @abstractmethod
    def load_tensor(self, name: str, device: str = "cpu") -> torch.Tensor:
        """
        Load a tensor by name.

        Args:
            name: Name of the tensor to load
            device: Device to load the tensor to

        Returns:
            The loaded tensor
        """
        pass

    @classmethod
    @abstractmethod
    def supports_format(cls, path: str) -> bool:
        """
        Check if this loader supports the given path format.

        Args:
            path: Path to check

        Returns:
            True if this loader supports the given path, False otherwise
        """
        pass


class SafeTensorLoader(ModelLoader):
    """
    Loader for SafeTensor format models.
    """

    def __init__(self, path: str):
        """
        Initialize the SafeTensor loader.

        Args:
            path: Path to the model directory or file
        """
        self.tensor_file_map = {}  # Maps tensor names to file paths
        self.file_handle_map = {}  # Maps file names to file handles
        self._load_tensor_file_map(path)

    def _load_tensor_file_map(self, path: str) -> None:
        """
        Load the tensor file map from the given path.

        Args:
            path: Path to the model directory or file
        """
        # Normalize path to directory
        if not os.path.exists(path):
            raise FileNotFoundError(f"Path not found: {path}")
        if os.path.isfile(path):
            folder_path = os.path.dirname(path)
        else:
            folder_path = path

        found_safetensor = False
        for root, _, files in os.walk(folder_path):
            files = sorted(files)
            for file in files:
                if file.endswith(".safetensors"):
                    found_safetensor = True
                    file_path = os.path.join(root, file)
                    if file not in self.file_handle_map:
                        try:
                            handle = safe_open(file_path, framework="pt")
                            self.file_handle_map[file] = handle
                        except Exception as e:
                            print(f"Error opening Safetensor file {file_path}: {e}")
                            continue

                    f = self.file_handle_map.get(file)
                    if f is None:
                        continue
                    try:
                        for key in f.keys():
                            self.tensor_file_map[key] = file
                    except Exception as e:
                        print(f"Error reading Safetensor file {file_path}: {e}")

        if not found_safetensor:
            # Not raising an error here allows for the factory to try other loaders
            print(f"No Safetensor files found in {folder_path}")

    def load_tensor(self, name: str, device: str = "cpu") -> torch.Tensor:
        """
        Load a tensor by name.

        Args:
            name: Name of the tensor to load
            device: Device to load the tensor to

        Returns:
            The loaded tensor
        """
        if name not in self.tensor_file_map:
            raise KeyError(f"Key {name} not found in Safetensor files")
        file = self.tensor_file_map[name]
        f = self.file_handle_map.get(file)
        if f is None:
            raise FileNotFoundError(f"File {file} not found in Safetensor files")
        tensor = f.get_tensor(name)
        return tensor.to(device)

    def load_dequantized_tensor(self, name: str, device: str = "cpu") -> torch.Tensor:
        """
        Load and dequantize a tensor.

        Args:
            name: Name of the tensor to load
            device: Device to load the tensor to

        Returns:
            The dequantized tensor
        """
        if name not in self.tensor_file_map:
            raise KeyError(f"Key {name} not found in Safetensor files")
        file = self.tensor_file_map[name]
        f = self.file_handle_map.get(file)
        if f is None:
            raise FileNotFoundError(f"File {file} not found in Safetensor files")
        tensor = f.get_tensor(name).to(device)
        if name.endswith(".weight"):
            if name[:-7] + ".weight_scale_inv" in self.tensor_file_map:
                weight_scale_inv = f.get_tensor(name[:-7] + ".weight_scale_inv").to(device)
                # Assuming weight_dequant function is imported
                from ktransformers.ktransformers_ext.triton.fp8gemm import weight_dequant
                tensor = weight_dequant(tensor, weight_scale_inv)
        return tensor.to(device)

    def close_all_handles(self) -> None:
        """
        Close all file handles.
        """
        for handle in self.file_handle_map.values():
            handle.close()
        self.file_handle_map.clear()

    @classmethod
    def supports_format(cls, path: str) -> bool:
        """
        Check if this loader supports the given path format.

        Args:
            path: Path to check

        Returns:
            True if safetensor files are found in the path, False otherwise
        """
        # Normalize path to directory
        if not os.path.exists(path):
            return False
        if os.path.isfile(path):
            if path.endswith(".safetensors"):
                return True
            folder_path = os.path.dirname(path)
        else:
            folder_path = path

        # Check if any safetensor files exist in the folder
        for root, _, files in os.walk(folder_path):
            for file in files:
                if file.endswith(".safetensors"):
                    return True
        return False


class GGUFLoader(ModelLoader):
    """
    Loader for GGUF format models.
    """

    def __init__(self, path: str):
        """
        Initialize the GGUF loader.

        Args:
            path: Path to the model directory or file
        """
        # Check if path exists
        if not os.path.exists(path):
            raise FileNotFoundError(f"GGUF dir not found: {path}")
        if os.path.isfile(path):
            self.gguf_path = os.path.dirname(path)
        else:
            self.gguf_path = path

        self.tensor_info = {}      # Stores tensor metadata
        self.tensor_file_map = {}  # Maps tensor names to file paths
        self.file_data_map = {}    # Maps file paths to memory-mapped data
        self.gguf_file_meta = {}   # Stores GGUF metadata

        # For compatibility with the factory pattern
        self.safetensor_loader = None

        # Scan all GGUF files in the directory
        found_gguf = False
        for root, _, files in os.walk(self.gguf_path):
            for file in files:
                if file.endswith(".gguf"):
                    found_gguf = True
                    file_path = os.path.join(root, file)
                    with open(file_path, "rb") as f:
                        self._load_gguf(f)
                        if file_path not in self.file_data_map:
                            self.file_data_map[file_path] = np.memmap(file_path, mode='r')

        if not found_gguf:
            raise FileNotFoundError(f"Cannot find any .gguf files in: {self.gguf_path}")

    def _load_gguf(self, f) -> None:
        """
        Load GGUF file metadata and tensor info.

        Args:
            f: File handle of the GGUF file
        """
        # Implementation should follow the original GGUFLoader._load_gguf
        # This is a simplified version for illustration
        f.seek(0)
        assert f.read(4) == b'GGUF'

        # Read header
        values = struct.unpack("<IQQ", f.read(4+8+8))
        version, n_tensors, n_kv = values
        if version != 3:
            warnings.warn(f"Version {version} has never been tested, might not work")

        # Read key-value pairs
        info = {}
        for _ in range(n_kv):
            name = self._read_value(f, 8)  # DATA_TYPES["string"]
            data_type = struct.unpack("<I", f.read(4))[0]
            info[name] = self._read_value(f, data_type)

        # Read tensor info
        tensor_info = {}
        for _ in range(n_tensors):
            name = self._read_value(f, 8)  # DATA_TYPES["string"]
            shape_len = self._read_value(f, 4)  # DATA_TYPES["uint32"]
            shape = [self._read_value(f, 10) for _ in range(shape_len)]  # DATA_TYPES["uint64"]
            ggml_type = self._read_value(f, 4)  # DATA_TYPES["uint32"]
            offset = self._read_value(f, 10)  # DATA_TYPES["uint64"]

            # Additional tensor metadata would be calculated here
            # For brevity, we're omitting the detailed tensor metadata calculation
            tensor_info[name] = {
                "ggml_type": ggml_type,
                "shape": shape,
                "offset": offset,
                # ... other tensor metadata
            }

        start = f.tell()
        alignment = info.get("general.alignment", 32)

        # Calculate actual file offsets
        for t in tensor_info.values():
            offset = start + t["offset"]
            offset += (alignment - offset % alignment) % alignment
            t["offset"] = offset

        # Update file maps
        for name in tensor_info:
            self.tensor_file_map[name] = f.name

        self.tensor_info.update(tensor_info)
        self.gguf_file_meta.update(info)

    def _read_value(self, f, data_type) -> Any:
        """
        Read a value from the file according to its data type.

        Args:
            f: File handle
            data_type: Type of data to read

        Returns:
            The read value
        """
        # Simplified implementation
        # In a complete implementation, this would handle all data types
        if data_type == 8:  # DATA_TYPES["string"]
            length = struct.unpack("<Q", f.read(8))[0]
            return f.read(length).decode("utf-8")
        elif data_type == 4:  # DATA_TYPES["uint32"]
            return struct.unpack("<I", f.read(4))[0]
        elif data_type == 10:  # DATA_TYPES["uint64"]
            return struct.unpack("<Q", f.read(8))[0]
        # ... handling for other data types
        return None

    def load_tensor(self, name: str, device: str = "cpu") -> torch.Tensor:
        """
        Load a tensor by name.

        Args:
            name: Name of the tensor to load
            device: Device to load the tensor to

        Returns:
            The loaded tensor
        """
        # This should call load_gguf_tensor with the appropriate parameters
        return self.load_gguf_tensor(name, device)

    def load_gguf_tensor(self, name: str, device: str = "cpu", target_dtype = None) -> torch.Tensor:
        """
        Load a GGUF tensor by name.

        Args:
            name: Name of the tensor to load
            device: Device to load the tensor to
            target_dtype: Target data type for the tensor

        Returns:
            The loaded tensor
        """
        # Implementation would follow the original GGUFLoader.load_gguf_tensor
        # This is a placeholder for illustration
        if name not in self.tensor_info:
            raise KeyError(f"Tensor {name} not found")

        # Actual implementation would dequantize the tensor data
        # and return a torch.Tensor
        return torch.zeros(1, device=device)  # Placeholder

    @classmethod
    def supports_format(cls, path: str) -> bool:
        """
        Check if this loader supports the given path format.

        Args:
            path: Path to check

        Returns:
            True if GGUF files are found in the path, False otherwise
        """
        # Normalize path to directory
        if not os.path.exists(path):
            return False
        if os.path.isfile(path):
            return path.endswith(".gguf")

        # Check if any GGUF files exist in the folder
        for root, _, files in os.walk(path):
            for file in files:
                if file.endswith(".gguf"):
                    return True
        return False
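A small sketch of how the supports_format hooks defined above could drive format selection (illustrative only; the repository's own dispatch lives in ModelLoaderFactory in custom_loader.py):

# Sketch: probe each loader class and instantiate the first one that claims the path.
def pick_loader(path: str):
    for loader_cls in (SafeTensorLoader, GGUFLoader):
        if loader_cls.supports_format(path):
            return loader_cls(path)
    raise FileNotFoundError(f"No supported model files found in: {path}")

# loader = pick_loader("/path/to/checkpoint_dir")   # placeholder path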
@@ -6,7 +6,7 @@ import sys
# sys.path.insert(0, "/home/azure/ktransformers")
import argparse
import torch
from ktransformers.util.custom_loader import GGUFLoader, translate_name_to_gguf
from safetensors import safe_open
from safetensors.torch import save_file
import re