Mirror of https://github.com/kvcache-ai/ktransformers.git (synced 2025-09-12 08:09:42 +00:00)

Commit b982815325 ("support npu"), parent a641aa8063
22 changed files with 162 additions and 1562 deletions
.gitignore (vendored): 1 line added

@@ -30,3 +30,4 @@ csrc/demo
 CMakeFiles
 kvc2/
 sched/
+build*/
(File diff suppressed because it is too large.)
Deleted file: a CMake-generated install script from a local build_test/ directory.

@@ -1,76 +0,0 @@
-# Install script for directory: /home/djw/py311_717/ktransformers/csrc/ktransformers_ext
-
-# Set the install prefix
-if(NOT DEFINED CMAKE_INSTALL_PREFIX)
-  set(CMAKE_INSTALL_PREFIX "/usr/local")
-endif()
-string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
-
-# Set the install configuration name.
-if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
-  if(BUILD_TYPE)
-    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
-           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
-  else()
-    set(CMAKE_INSTALL_CONFIG_NAME "Debug")
-  endif()
-  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
-endif()
-
-# Set the component getting installed.
-if(NOT CMAKE_INSTALL_COMPONENT)
-  if(COMPONENT)
-    message(STATUS "Install component: \"${COMPONENT}\"")
-    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
-  else()
-    set(CMAKE_INSTALL_COMPONENT)
-  endif()
-endif()
-
-# Install shared libraries without execute permission?
-if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
-  set(CMAKE_INSTALL_SO_NO_EXE "1")
-endif()
-
-# Is this installation the result of a crosscompile?
-if(NOT DEFINED CMAKE_CROSSCOMPILING)
-  set(CMAKE_CROSSCOMPILING "FALSE")
-endif()
-
-# Set path to fallback-tool for dependency-resolution.
-if(NOT DEFINED CMAKE_OBJDUMP)
-  set(CMAKE_OBJDUMP "/usr/bin/objdump")
-endif()
-
-if(NOT CMAKE_INSTALL_LOCAL_ONLY)
-  # Include the install script for the subdirectory.
-  include("/home/djw/py311_717/ktransformers/build_test/third_party/pybind11/cmake_install.cmake")
-endif()
-
-if(NOT CMAKE_INSTALL_LOCAL_ONLY)
-  # Include the install script for the subdirectory.
-  include("/home/djw/py311_717/ktransformers/build_test/third_party/llama.cpp/cmake_install.cmake")
-endif()
-
-string(REPLACE ";" "\n" CMAKE_INSTALL_MANIFEST_CONTENT
-       "${CMAKE_INSTALL_MANIFEST_FILES}")
-if(CMAKE_INSTALL_LOCAL_ONLY)
-  file(WRITE "/home/djw/py311_717/ktransformers/build_test/install_local_manifest.txt"
-     "${CMAKE_INSTALL_MANIFEST_CONTENT}")
-endif()
-if(CMAKE_INSTALL_COMPONENT)
-  if(CMAKE_INSTALL_COMPONENT MATCHES "^[a-zA-Z0-9_.+-]+$")
-    set(CMAKE_INSTALL_MANIFEST "install_manifest_${CMAKE_INSTALL_COMPONENT}.txt")
-  else()
-    string(MD5 CMAKE_INST_COMP_HASH "${CMAKE_INSTALL_COMPONENT}")
-    set(CMAKE_INSTALL_MANIFEST "install_manifest_${CMAKE_INST_COMP_HASH}.txt")
-    unset(CMAKE_INST_COMP_HASH)
-  endif()
-else()
-  set(CMAKE_INSTALL_MANIFEST "install_manifest.txt")
-endif()
-
-if(NOT CMAKE_INSTALL_LOCAL_ONLY)
-  file(WRITE "/home/djw/py311_717/ktransformers/build_test/${CMAKE_INSTALL_MANIFEST}"
-     "${CMAKE_INSTALL_MANIFEST_CONTENT}")
-endif()
Binary file not shown.

@@ -318,20 +318,7 @@ elseif (UNIX)
         endif()
     endif()
 elseif (KTRANSFORMERS_USE_XPU)
     add_compile_definitions(KTRANSFORMERS_USE_XPU=1)
-elseif (KTRANSFORMERS_USE_NPU)
-    include(CheckLanguage)
-    check_language(CUDA)
-    if(CMAKE_CUDA_COMPILER)
-        message(STATUS "CUDA detected")
-        find_package(CUDAToolkit REQUIRED)
-        include_directories(${CUDAToolkit_INCLUDE_DIRS})
-    endif()
-    message(STATUS "enabling CUDA")
-    enable_language(CUDA)
-    add_compile_definitions(KTRANSFORMERS_USE_CUDA=1)
-else()
-    find_package(CUDA REQUIRED)
-    include_directories("${CUDA_INCLUDE_DIRS}")
+elseif (KTRANSFORMERS_USE_CUDA)
     include(CheckLanguage)
     check_language(CUDA)
     if(CMAKE_CUDA_COMPILER)

@@ -397,7 +384,7 @@ elseif(UNIX)
     elseif(KTRANSFORMERS_USE_MUSA)
         target_link_libraries(${PROJECT_NAME} PRIVATE MUSA::musart)
     elseif(KTRANSFORMERS_USE_XPU)
-    else()
+    elseif(KTRANSFORMERS_USE_CUDA AND NOT KTRANSFORMERS_USE_MUSA)
        target_link_libraries(${PROJECT_NAME} PRIVATE "${CUDAToolkit_LIBRARY_DIR}/libcudart.so")
     endif()
 endif()

@@ -148,10 +148,10 @@ def local_chat(
     logging.basicConfig(level=logging.INFO)

     system = platform.system()
-    if system == "Windows":
-        os.system("cls") if local_rank == 0 else None
-    else:
-        os.system("clear") if local_rank == 0 else None
+    # if system == "Windows":
+    #     os.system("cls") if local_rank == 0 else None
+    # else:
+    #     os.system("clear") if local_rank == 0 else None

     print(f"{model=}") if local_rank == 0 else None

@@ -11,7 +11,7 @@ from transformers.cache_utils import Cache
 from ktransformers.models.modeling_deepseek import DeepseekV2Attention, apply_rotary_pos_emb
 from ktransformers.operators.base_operator import BaseInjectedModule
 from ktransformers.util.ascend.ascend_utils import get_tensor_parallel_size, allreduce_wrapper
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
 from ktransformers.util.utils import get_compute_capability, get_use_npu_graph, CUR_DEVICE
 from ktransformers.util.vendors import device_manager, GPUVendor
 from ktransformers.util import utils
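
The same import move repeats across the operator files below: GGUFLoader now comes from ktransformers.util.custom_loader, and the NPU code paths decide between GGUF and safetensors checkpoints by checking the loader's safetensor_loader attribute. A minimal sketch of that dispatch, assuming the loader can be built from a plain checkpoint directory and that safetensor_loader stays None for pure GGUF directories:

    # Sketch only; the directory and the tensor name are placeholders, not values from the diff.
    from ktransformers.util.custom_loader import GGUFLoader

    loader = GGUFLoader("/path/to/checkpoint")      # scans the directory for .safetensors or .gguf files
    key = "blk.0.attn_norm.weight"                  # hypothetical GGUF-style tensor name

    if loader.safetensor_loader is not None:
        # safetensors path (used by the NPU / W8A8 checkpoints in this commit)
        tensor = loader.safetensor_loader.load_tensor(key)
    else:
        # classic GGUF path: dequantize from the memory-mapped file
        tensor = loader.load_gguf_tensor(key, device="cpu")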

@@ -8,7 +8,7 @@ from transformers import PretrainedConfig

 from ktransformers.util.ascend.ascend_utils import get_tensor_parallel_size, get_tensor_parallel_group
 from ktransformers.operators.experts import KExpertsCPU, KTransformersExperts, EXPERTS_MAP, KDeepseekV3MoE, cuda_graphs
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
 from ktransformers.util.utils import CUR_DEVICE, get_use_npu_graph, InferenceState

@@ -5,8 +5,8 @@ from transformers import PretrainedConfig

 from ktransformers.operators.base_operator import BaseInjectedModule
 from ktransformers.util import utils
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
+from ktransformers.util.custom_loader import translate_name_to_gguf

 class KDeepseekV3RMSNormW8A8(BaseInjectedModule):
     def __init__(self,

@@ -14,9 +14,9 @@ from ktransformers.util.ascend.ascend_utils import (
     get_tensor_parallel_group
 )
 from ktransformers.util import utils
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
 from ktransformers.util.utils import InferenceState
+from ktransformers.util.custom_loader import translate_name_to_gguf

 class KLinearW8A8(KLinearBase):
     def __init__(

@@ -39,6 +39,11 @@ class KLinearW8A8(KLinearBase):
         for key in keys:
             if device is None:
                 device = utils.CUR_DEVICE

+            key = translate_name_to_gguf(key)
+            if key == "lm_head":
+                key = "output"
+
             if key + ".weight" in self.gguf_loader.safetensor_loader.tensor_file_map:
                 if key + ".deq_scale" in self.gguf_loader.safetensor_loader.tensor_file_map:
                     qweight = self.gguf_loader.safetensor_loader.load_tensor(f"{key}.weight")

@@ -47,25 +52,25 @@ class KLinearW8A8(KLinearBase):
                     input_scale = self.gguf_loader.safetensor_loader.load_tensor(f"{key}.input_scale")
                     input_offset = self.gguf_loader.safetensor_loader.load_tensor(f"{key}.input_offset")
                     tensors = (qweight, deq_scale, quant_bias, input_scale, input_offset)
+                    print(f"Loading {key} with shape {qweight.shape}, {deq_scale.shape}, {quant_bias.shape}, {input_scale.shape}, {input_offset.shape}")
+                    print(tensors)
                     return tensors
                 elif key + ".weight_scale" in self.gguf_loader.safetensor_loader.tensor_file_map:
                     if key.endswith("ffn_gate_shexp"):
                         parts = key.split(".")
                         layer = parts[1]
                         gate_weight = self.gguf_loader.safetensor_loader.load_tensor(f"blk.{layer}.ffn_gate_shexp.weight")
-                        gate_weight = get_safetensors_cut_weight(self.key, gate_weight).t()
                         up_weight = self.gguf_loader.safetensor_loader.load_tensor(f"blk.{layer}.ffn_up_shexp.weight")
-                        up_weight = get_safetensors_cut_weight(self.key, up_weight).t()
+                        gate_up_weight = torch.cat((gate_weight, up_weight), 0)
                         gate_scale = self.gguf_loader.safetensor_loader.load_tensor(f"blk.{layer}.ffn_gate_shexp.weight_scale")
-                        gate_scale = get_safetensors_cut_weight(self.key, gate_scale)
                         up_scale = self.gguf_loader.safetensor_loader.load_tensor(f"blk.{layer}.ffn_up_shexp.weight_scale")
-                        up_scale = get_safetensors_cut_weight(self.key, up_scale)
-                        gate_up_weight = torch.cat((gate_weight, up_weight), 1)
                         gate_up_scale = torch.cat((gate_scale, up_scale), 0)
                         gate_offset = self.gguf_loader.safetensor_loader.load_tensor(f"blk.{layer}.ffn_gate_shexp.weight_offset")
                         up_offset = self.gguf_loader.safetensor_loader.load_tensor(f"blk.{layer}.ffn_up_shexp.weight_offset")
                         gate_up_offset = torch.cat((gate_offset, up_offset), 0)
                         tensors = (gate_up_weight, gate_up_scale, gate_up_offset)
+                        print(f"Loading {key} as ffn_gate_shexp with shape {gate_up_weight.shape}, {gate_up_scale.shape}, {gate_up_offset.shape}")
+                        print(tensors)
                     elif key.endswith("ffn_up_shexp"):
                         return fake_tensor
                     else:
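
The hunk above stops slicing the shared-expert weights with get_safetensors_cut_weight and instead concatenates the gate and up projections along dim 0, so a single quantized matmul produces both halves of the SwiGLU-style activation. A small, self-contained torch sketch of that fusion (shapes and variable names are illustrative only, not taken from the repo):

    import torch

    # Illustrative shapes: hidden size 16, intermediate size 32 per projection.
    hidden, inter = 16, 32
    gate_w = torch.randn(inter, hidden)   # stands in for ffn_gate_shexp.weight
    up_w = torch.randn(inter, hidden)     # stands in for ffn_up_shexp.weight

    # Fuse the two projections into one weight, as the diff does with torch.cat(..., 0).
    gate_up_w = torch.cat((gate_w, up_w), dim=0)        # (2*inter, hidden)

    x = torch.randn(4, hidden)                          # a small batch of activations
    gate_up = x @ gate_up_w.t()                         # one matmul instead of two
    gate_out, up_out = gate_up.chunk(2, dim=-1)         # split back into the two halves
    y = torch.nn.functional.silu(gate_out) * up_out     # SwiGLU-style combine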

@@ -73,10 +78,11 @@ class KLinearW8A8(KLinearBase):
                     weight_scale = self.gguf_loader.safetensor_loader.load_tensor(f"{key}.weight_scale")
                     weight_offset = self.gguf_loader.safetensor_loader.load_tensor(f"{key}.weight_offset")
                     tensors = (qweight, weight_scale, weight_offset)
+                    print(f"Loading {key} with shape {qweight.shape}, {weight_scale.shape}, {weight_offset.shape}")
+                    print(tensors)
                     return tensors
                 else:
                     weight = self.gguf_loader.safetensor_loader.load_tensor(f"{key}.weight")
-                    weight = get_safetensors_cut_weight(self.key, weight)
                     return weight
             else:
                 raise FileNotFoundError(f"Weight file not found for key {key}")

@@ -49,6 +49,7 @@ class KDeepseekV3MLPW8A8A2V2(BaseInjectedModule, DeepseekV3MLP):
         original_dtype = x.dtype
         quant_out, dynamic_scale = torch_npu.npu_dynamic_quant(x)
         dynamic_scale = dynamic_scale.view(-1)
+
         gate_up_x = torch_npu.npu_quant_matmul(
             quant_out,
             self.orig_module.gate_proj.weight,

@@ -36,6 +36,7 @@ from abc import ABC, abstractmethod
 from ktransformers.operators.linear import KLinearMarlin, KLinearTorch, KTransformersLinear
 import time
 from ktransformers.operators.cpuinfer import CPUInfer
+from ktransformers.util.custom_loader import translate_name_to_gguf


 def deduplicate_and_sort(lst):

@@ -396,6 +397,16 @@ class KExpertsCPU(KExpertsBase):
             gate_type = self.gguf_loader.get_ggml_type(key + ".ffn_gate.0.weight")
             up_type = self.gguf_loader.get_ggml_type(key + ".ffn_up.0.weight")
             down_type = self.gguf_loader.get_ggml_type(key + ".ffn_down.0.weight")
+        elif self.gguf_loader.safetensor_loader is not None:
+            # for npu
+            # using a temp ugly way to temprary load the tensor
+            translate_key = translate_name_to_gguf(key)
+            gate = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".ffn_gate_exps.weight").numpy()
+            up = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".ffn_up_exps.weight").numpy()
+            down = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".ffn_down_exps.weight").numpy()
+            gate_type = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".ffn_gate_exps.ggml_type").item()
+            up_type = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".ffn_up_exps.ggml_type").item()
+            down_type = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".ffn_down_exps.ggml_type").item()
         else:
             raise ValueError(f"Experts {key} not found in gguf_loader")
         res = {key:{"gate": gate, "up": up, "down": down, "gate_type": gate_type, "up_type": up_type, "down_type": down_type}}

@@ -9,7 +9,7 @@ from ktransformers.operators.linear import KTransformersLinear
 from ktransformers.util.custom_loader import GGUFLoader, ModelLoader, SafeTensorLoader
 from transformers.configuration_utils import PretrainedConfig
 from abc import ABC, abstractmethod
+from ktransformers.util.custom_loader import translate_name_to_gguf

 # class Base(BaseInjectedModule, ABC):
 class KMoEGateBase(ABC):

@@ -55,8 +55,18 @@ class KMoEGateBase(ABC):
         down_type = None

         for key in keys:
-            # key = ".".join(key.split(".")[:-1])
-            if isinstance(self.gguf_loader, SafeTensorLoader):
+            if self.gguf_loader.safetensor_loader is not None:
+                # for npu
+                translate_key = translate_name_to_gguf(key)
+                translate_key = ".".join(translate_key.split(".")[:2])
+                targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]
+                weight = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".ffn_gate_inp.weight")
+                e_score_correction_bias = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".exp_probs_b.bias")
+                weight_type = weight.dtype
+                e_score_correction_bias_type = e_score_correction_bias.dtype
+                res = {"weight": weight, "e_score_correction_bias": e_score_correction_bias, "weight_type": weight_type, "e_score_correction_bias_type": e_score_correction_bias_type}
+            elif isinstance(self.gguf_loader, SafeTensorLoader):
                 res = self.gguf_loader.load_gate(key, device=device)
             elif self.gguf_loader.has_tensor(key+".weight"):
                 # targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]

@@ -38,7 +38,10 @@ if not torch.xpu.is_available():
     )
 from ktransformers.operators.base_operator import BaseInjectedModule
 from transformers.configuration_utils import PretrainedConfig
-from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant
+try:
+    from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant
+except:
+    print("no triton")
 from abc import ABC, abstractmethod
 import sys, os
 sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build"))
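
The bare try/except keeps the import working on machines without Triton, but it leaves fp8_gemm, act_quant and weight_dequant undefined when the import fails. A slightly more defensive version of the same guard (a sketch, not code from the repository; HAS_TRITON_FP8 and maybe_fp8_gemm are made-up names) records availability in a flag and fails with a clear message if the kernels are used anyway:

    # Sketch of an optional-dependency guard.
    try:
        from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant
        HAS_TRITON_FP8 = True
    except ImportError:
        HAS_TRITON_FP8 = False
        fp8_gemm = act_quant = weight_dequant = None  # callers must check the flag first

    def maybe_fp8_gemm(*args, **kwargs):
        # Fail with a clear message instead of a NameError deep inside a forward pass.
        if not HAS_TRITON_FP8:
            raise RuntimeError("Triton fp8gemm kernels are not available on this platform")
        return fp8_gemm(*args, **kwargs)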

@@ -18,7 +18,15 @@ import itertools
 import copy
 from ktransformers.util import utils

+
+try:
+    import torch_npu
+    use_torch_npu = torch_npu.npu.is_available()
+except:
+    use_torch_npu = False
+

 def inject(module, local_optimization_dict, model_config:AutoConfig ,gguf_loader:GGUFLoader, prefix=''):

     for name, child in module._modules.items():
         if child is not None:
             child_prefix = prefix + name

@@ -124,6 +132,7 @@ def optimize_and_load_gguf(module: nn.Module, rule_file: str, gguf_path: str, mo

     model_config = translate_model_config(model_config)

+    if use_torch_npu:
     if q4_gguf_path:
         q4_gguf_loader = GGUFLoader(q4_gguf_path)
         utils.Q4_GGUF_LODER = q4_gguf_loader

@@ -24,7 +24,14 @@ from typing import Sequence
 import os
 from enum import IntEnum
 import torch
-if not torch.xpu.is_available():
+
+try:
+    import torch_npu
+    use_torch_npu = torch_npu.npu.is_available()
+except:
+    use_torch_npu = False
+
+if not torch.xpu.is_available() and not use_torch_npu:
     import KTransformersOps
 import ctypes
 import math
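
KTransformersOps is the prebuilt CUDA extension, so the guard above skips importing it whenever an Intel XPU or an Ascend NPU is the active accelerator. The probing logic generalizes to a small helper; this sketch (the helper name and the returned strings are my own, not from the repo) shows the order of checks:

    import torch

    def detect_backend() -> str:
        # Illustrative helper, not part of the repository.
        try:
            import torch_npu                      # registers Ascend support when installed
            if torch_npu.npu.is_available():
                return "npu"
        except ImportError:
            pass
        if hasattr(torch, "xpu") and torch.xpu.is_available():
            return "xpu"
        if torch.cuda.is_available():
            return "cuda"
        return "cpu"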

@@ -53,6 +53,7 @@ class SafeTensorLoader(ModelLoader):
     def __init__(self, file_path: str):

         self.__load_tensor_file_map(file_path)
+        # print(self.tensor_file_map)

     def __load_tensor_file_map(self, file_path: str):
         # 处理传入路径,确保是文件夹路径
|
||||||
|
|
||||||
|
|
||||||
def load_tensor(self, key: str, device: str="cpu"):
|
def load_tensor(self, key: str, device: str="cpu"):
|
||||||
|
|
||||||
if translate_name_to_gguf(key) in self.tensor_file_map:
|
if translate_name_to_gguf(key) in self.tensor_file_map:
|
||||||
key = translate_name_to_gguf(key)
|
key = translate_name_to_gguf(key)
|
||||||
elif key in self.tensor_file_map:
|
elif key in self.tensor_file_map:
|
||||||
|
@ -267,6 +269,7 @@ class SafeTensorLoader(ModelLoader):
|
||||||
|
|
||||||
class W8A8SafeTensorLoader(SafeTensorLoader):
|
class W8A8SafeTensorLoader(SafeTensorLoader):
|
||||||
def load_tensor(self, key: str, device: str = "cpu"):
|
def load_tensor(self, key: str, device: str = "cpu"):
|
||||||
|
key = translate_name_to_gguf(key)
|
||||||
if key not in self.tensor_file_map:
|
if key not in self.tensor_file_map:
|
||||||
raise KeyError(f"Key {key} not found in Safetensor files")
|
raise KeyError(f"Key {key} not found in Safetensor files")
|
||||||
file = self.tensor_file_map[key]
|
file = self.tensor_file_map[key]
|
||||||
|

@@ -308,13 +311,6 @@ class GGUFLoader(ModelLoader):
             gguf_path = os.path.dirname(gguf_path)

         safetensor_loader = SafeTensorLoader(gguf_path)
-        if quantize == "w8a8_dynamic":
-            safetensor_loader = W8A8SafeTensorLoader(gguf_path)
-        else:
-            safetensor_loader = SafeTensorLoader(gguf_path)
-        if safetensor_loader.tensor_file_map:
-            self.safetensor_loader = safetensor_loader
-            return

         self.tensor_info = {}
         self.gguf_path = gguf_path

@@ -323,6 +319,14 @@ class GGUFLoader(ModelLoader):
         self.gguf_file_meta = {}
         self.tensor_device_map = {}

+        if quantize == "w8a8_dynamic":
+            safetensor_loader = W8A8SafeTensorLoader(gguf_path)
+        else:
+            safetensor_loader = SafeTensorLoader(gguf_path)
+        if safetensor_loader.tensor_file_map:
+            self.safetensor_loader = safetensor_loader
+            return
+
         # Walk through all the .gguf files in the directory
         found_gguf = False
         for root, dirs, files in os.walk(gguf_path):
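
With this reordering the loader's bookkeeping (tensor_info, gguf_path, tensor_device_map) is initialized before the safetensors probe, and W8A8SafeTensorLoader is chosen only for w8a8_dynamic checkpoints. The selection boils down to a small factory; a sketch assuming quantize is a plain string argument and that an empty tensor_file_map means no .safetensors files were found:

    # Sketch of the loader selection this hunk implements, not a verbatim copy of the repo code.
    from ktransformers.util.custom_loader import SafeTensorLoader, W8A8SafeTensorLoader

    def pick_safetensor_loader(gguf_path: str, quantize: str | None):
        if quantize == "w8a8_dynamic":
            loader = W8A8SafeTensorLoader(gguf_path)   # W8A8 checkpoints carry deq_scale / weight_scale tensors
        else:
            loader = SafeTensorLoader(gguf_path)
        # No .safetensors files found: the caller falls back to scanning for .gguf files instead.
        return loader if loader.tensor_file_map else None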

@@ -431,6 +435,7 @@ class GGUFLoader(ModelLoader):
         return mmap_data[offset : offset + itemsize * item_count]

     def get_undequanted_tensor_and_ggml_type(self, name):
+
         name = translate_name_to_gguf(name)
         t = self.tensor_info[name]
         data = self.get_mmap_tensor(name)

@@ -439,6 +444,7 @@ class GGUFLoader(ModelLoader):
         return data, ggml_type

     def load_expert_tensor(self, name, data, expert_id, elements_per_expert, device = "cuda", target_dtype = torch.get_default_dtype())->torch.Tensor:
+
         name = translate_name_to_gguf(name)
         t = self.tensor_info[name]
         shape = t["shape"]

@@ -468,6 +474,7 @@ class GGUFLoader(ModelLoader):
         return values

     def load_gguf_tensor(self, name: str, device:str = "cpu", target_dtype = None)->torch.Tensor:
+
         name = translate_name_to_gguf(name)
         t = self.tensor_info[name]
         if target_dtype == None:

@@ -533,10 +540,12 @@ class GGUFLoader(ModelLoader):
                 .reshape(values.shape))
         return values

     def has_tensor(self, name: str):
+
         name = translate_name_to_gguf(name)
         return name in self.tensor_info

     def get_ggml_type(self, name: str):
+
         name = translate_name_to_gguf(name)
         if name not in self.tensor_info:
             raise KeyError(f"Key {name} not found in GGUF files")

@@ -135,10 +135,47 @@ def get_all_used_cuda_device(device_map:dict):
     all_device_list = list(all_device_list)
     return all_device_list

+def get_current_device():
+    return f"npu:{torch.npu.current_device()}"
+
+
+def load_cur_state_dict_npu(module: nn.Module, gguf_loader: ModelLoader, prefix: str = "", device="npu"):
+    prefix = prefix.replace("orig_module.", "")
+    persistent_buffers = {k: v for k, v in module._buffers.items() if k not in module._non_persistent_buffers_set}
+    local_name_params = itertools.chain(module._parameters.items(), persistent_buffers.items())
+    local_state = {k: v for k, v in local_name_params if v is not None}
+    for name, param in local_state.items():
+        key = prefix + name
+        translated_key = translate_name_to_gguf(key)
+        # TODO: Merge all loader.
+        # I know this is ugly but lets do it for now.
+        if gguf_loader.safetensor_loader is not None:
+            load_dequantized_tensor = gguf_loader.safetensor_loader.load_dequantized_tensor
+            tensor_file_map = gguf_loader.safetensor_loader.tensor_file_map
+        else:
+            load_dequantized_tensor = gguf_loader.load_gguf_tensor
+            tensor_file_map = gguf_loader.tensor_file_map
+
+        if translated_key in tensor_file_map:
+            target_dtype = torch.get_default_dtype()
+            device = get_device(translated_key[:translated_key.rfind(".")], gguf_loader.tensor_device_map)
+            # Todo need fix
+            device = "cpu" if "embd" in translated_key else get_current_device()
+            print(f"loading layer {translated_key} to {device}")
+            torch.cuda.empty_cache()
+            weights = load_dequantized_tensor(translated_key, device=device).to(dtype=target_dtype)
+            set_param(module, name, weights)
+            del weights
+        else:
+            #print(load_config.tensor_file_map.keys())
+            raise Exception(f"can't find {translated_key} in GGUF file!")
+
+
-# TODO: support NPU
 def load_cur_state_dict(module: nn.Module, gguf_loader: ModelLoader, prefix: str = "", device="cuda"):
+    if use_torch_npu:
+        load_cur_state_dict_npu(module, gguf_loader, prefix, device)
+        return
+
     prefix = prefix.replace("orig_module.", "")
     persistent_buffers = {k: v for k, v in module._buffers.items() if k not in module._non_persistent_buffers_set}
     local_name_params = itertools.chain(module._parameters.items(), persistent_buffers.items())
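
The new load_cur_state_dict_npu reuses the bookkeeping PyTorch itself uses for state dicts: walk a module's parameters and persistent buffers, build a flat key from the module prefix, translate it to the GGUF naming scheme, and assign whatever the loader returns (embeddings stay on CPU, everything else goes to the current NPU). A self-contained sketch of just the walking and keying step, with a toy module instead of the real model and a made-up prefix:

    import itertools
    import torch.nn as nn

    def iter_param_keys(module: nn.Module, prefix: str = ""):
        # Mirrors the loop in load_cur_state_dict_npu: parameters plus persistent buffers.
        persistent_buffers = {k: v for k, v in module._buffers.items()
                              if k not in module._non_persistent_buffers_set}
        local_name_params = itertools.chain(module._parameters.items(), persistent_buffers.items())
        for name, param in local_name_params:
            if param is not None:
                yield prefix + name, param

    m = nn.Linear(4, 2)
    for key, param in iter_param_keys(m, prefix="blk.0.attn_output."):
        print(key, tuple(param.shape))
    # blk.0.attn_output.weight (2, 4)
    # blk.0.attn_output.bias (2,)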

@@ -214,7 +251,7 @@ def xpu_fp16_model(config):
         return False

 def load_weights(module:nn.Module, gguf_loader:ModelLoader, prefix='', device="cuda"):
-    #print(f"recursively loading weights {prefix}")
+    # print(f"recursively loading weights {prefix}")
     if not isinstance(module, base_operator.BaseInjectedModule):
         load_cur_state_dict(module, gguf_loader, prefix, device=device)
         for name, child in module._modules.items():

@@ -314,6 +351,7 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
                                cache_position=cache_position,
                                past_key_values=past_key_values,
                                return_dict=False, use_cache=True)[0]
+            print(logits)
            if past_key_values != None:
                past_key_values.change_seq_length(1)
            all_cuda_device = ['npu:' + str(index) for index in range(torch.distributed.get_world_size())]

@@ -361,7 +399,6 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
            if past_key_values != None and isinstance(past_key_values, StaticCache):
                past_key_values.change_seq_length(1)
            sync_all_device(all_cuda_device)
-           #print(logits)
            next_token_scores = logits_warper(inputs, logits[:, -1, :])
            if generation_config.do_sample:
                probs = nn.functional.softmax(next_token_scores, dim=-1)

@@ -410,6 +447,9 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
                next_token = decode_one_tokens(cuda_graph_runner, next_token.unsqueeze(0), position_ids,
                                               cache_position, past_key_values, logits_warper, generation_config,
                                               use_cuda_graph).to(torch_device)
+
+            print(next_token)
+
            inputs = torch.cat((inputs, next_token.unsqueeze(0)), dim=-1)
            generated_ids[:, cache_position] = next_token.int()
            tokens.append(int(next_token))

@@ -596,8 +636,7 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
         cache_position = torch.tensor([seq_length], device=torch_device, dtype=torch.int32)
         position_ids = cache_position.unsqueeze(0)
         seq_length += 1
-        if use_torch_npu:
-            past_key_values.position += 1

         cuda_graph_runner = None

@@ -7,3 +7,4 @@ cpufeature; sys_platform == 'win32' or sys_platform == 'Windows'
 protobuf
 tiktoken
 blobfile
+einops

run_local_chat_npu.sh (new file): 22 lines added

@@ -0,0 +1,22 @@
+#!/bin/bash
+export CAPTURE_PLUGIN_PATH=ktransformers/util/npu_graph_so/arm
+export USE_MERGE=0
+export INF_NAN_MODE_FORCE_DISABLE=1
+export TASK_QUEUE_ENABLE=0
+#export PROF_DECODE=1
+#export PROF_PREFILL=1
+
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
+source /usr/local/Ascend/nnal/atb/set_env.sh
+
+torchrun --nproc_per_node 1 \
+    --master_port 25565 \
+    -m ktransformers.local_chat_npu \
+    --cpu_infer 20 \
+    --model_path /mnt/data/models/DeepSeek-R1-q4km-w8a8 \
+    --gguf_path /mnt/data/models/DeepSeek-R1-q4km-w8a8 \
+    --optimize_config_path ./ktransformers/optimize/optimize_rules/npu/DeepSeek-V3-Chat-800IA2-npu.yaml \
+    --max_new_tokens 30 \
+    --tp 1
+
+# --use_cuda_graph True \
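
Before launching the script it can be worth confirming that torch_npu is importable and that the Ascend environment scripts were actually sourced; a small optional pre-flight check (my sketch, not part of the repository) might look like this:

    # Optional pre-flight check before running run_local_chat_npu.sh.
    import os

    try:
        import torch_npu
        print("torch_npu available:", torch_npu.npu.is_available(),
              "device count:", torch_npu.npu.device_count())
    except ImportError:
        raise SystemExit("torch_npu is not installed; source the Ascend toolkit env scripts first")

    # Echo the graph-capture plugin path the launch script exports, if it is set.
    print("CAPTURE_PLUGIN_PATH =", os.environ.get("CAPTURE_PLUGIN_PATH", "<unset>"))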

setup.py: 2 lines added

@@ -186,6 +186,8 @@ class VersionInfo:
         else:
             print("Using native cpu instruct")
         if sys.platform.startswith("linux"):
+            if KTRANSFORMERS_BUILD_NPU:
+                return 'aarch64'
             with open('/proc/cpuinfo', 'r', encoding="utf-8") as cpu_f:
                 cpuinfo = cpu_f.read()
                 flags_line = [line for line in cpuinfo.split(
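
The hunk only shows KTRANSFORMERS_BUILD_NPU being consulted; where the flag is defined is outside this diff. If it follows the usual environment-variable convention for optional backends, the definition would look roughly like this (an assumption, not taken from setup.py):

    # Assumed definition; the actual spelling in setup.py may differ.
    import os

    KTRANSFORMERS_BUILD_NPU = os.environ.get("KTRANSFORMERS_BUILD_NPU", "OFF").upper() in ("1", "ON", "TRUE")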

@@ -5,5 +5,6 @@

 #ifdef __aarch64__
 #define llamafile_sgemm llamafile_sgemm_arm80
+#define iqk_mul_mat iqk_mul_mat_arm80
 #include "tinyblas_cpu_sgemm.inc"
 #endif // __aarch64__