support npu

Dongjw 2025-07-23 09:54:55 +00:00
parent a641aa8063
commit b982815325
22 changed files with 162 additions and 1562 deletions

1
.gitignore vendored
View file

@@ -30,3 +30,4 @@ csrc/demo
CMakeFiles
kvc2/
sched/
+build*/

File diff suppressed because it is too large

View file

@@ -1,76 +0,0 @@
# Install script for directory: /home/djw/py311_717/ktransformers/csrc/ktransformers_ext
# Set the install prefix
if(NOT DEFINED CMAKE_INSTALL_PREFIX)
set(CMAKE_INSTALL_PREFIX "/usr/local")
endif()
string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
# Set the install configuration name.
if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
if(BUILD_TYPE)
string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
else()
set(CMAKE_INSTALL_CONFIG_NAME "Debug")
endif()
message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
endif()
# Set the component getting installed.
if(NOT CMAKE_INSTALL_COMPONENT)
if(COMPONENT)
message(STATUS "Install component: \"${COMPONENT}\"")
set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
else()
set(CMAKE_INSTALL_COMPONENT)
endif()
endif()
# Install shared libraries without execute permission?
if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
set(CMAKE_INSTALL_SO_NO_EXE "1")
endif()
# Is this installation the result of a crosscompile?
if(NOT DEFINED CMAKE_CROSSCOMPILING)
set(CMAKE_CROSSCOMPILING "FALSE")
endif()
# Set path to fallback-tool for dependency-resolution.
if(NOT DEFINED CMAKE_OBJDUMP)
set(CMAKE_OBJDUMP "/usr/bin/objdump")
endif()
if(NOT CMAKE_INSTALL_LOCAL_ONLY)
# Include the install script for the subdirectory.
include("/home/djw/py311_717/ktransformers/build_test/third_party/pybind11/cmake_install.cmake")
endif()
if(NOT CMAKE_INSTALL_LOCAL_ONLY)
# Include the install script for the subdirectory.
include("/home/djw/py311_717/ktransformers/build_test/third_party/llama.cpp/cmake_install.cmake")
endif()
string(REPLACE ";" "\n" CMAKE_INSTALL_MANIFEST_CONTENT
"${CMAKE_INSTALL_MANIFEST_FILES}")
if(CMAKE_INSTALL_LOCAL_ONLY)
file(WRITE "/home/djw/py311_717/ktransformers/build_test/install_local_manifest.txt"
"${CMAKE_INSTALL_MANIFEST_CONTENT}")
endif()
if(CMAKE_INSTALL_COMPONENT)
if(CMAKE_INSTALL_COMPONENT MATCHES "^[a-zA-Z0-9_.+-]+$")
set(CMAKE_INSTALL_MANIFEST "install_manifest_${CMAKE_INSTALL_COMPONENT}.txt")
else()
string(MD5 CMAKE_INST_COMP_HASH "${CMAKE_INSTALL_COMPONENT}")
set(CMAKE_INSTALL_MANIFEST "install_manifest_${CMAKE_INST_COMP_HASH}.txt")
unset(CMAKE_INST_COMP_HASH)
endif()
else()
set(CMAKE_INSTALL_MANIFEST "install_manifest.txt")
endif()
if(NOT CMAKE_INSTALL_LOCAL_ONLY)
file(WRITE "/home/djw/py311_717/ktransformers/build_test/${CMAKE_INSTALL_MANIFEST}"
"${CMAKE_INSTALL_MANIFEST_CONTENT}")
endif()

Binary file not shown.

View file

@@ -318,20 +318,7 @@ elseif (UNIX)
endif()
elseif (KTRANSFORMERS_USE_XPU)
add_compile_definitions(KTRANSFORMERS_USE_XPU=1)
-elseif (KTRANSFORMERS_USE_NPU)
-include(CheckLanguage)
-check_language(CUDA)
-if(CMAKE_CUDA_COMPILER)
-message(STATUS "CUDA detected")
-find_package(CUDAToolkit REQUIRED)
-include_directories(${CUDAToolkit_INCLUDE_DIRS})
-endif()
-message(STATUS "enabling CUDA")
-enable_language(CUDA)
-add_compile_definitions(KTRANSFORMERS_USE_CUDA=1)
-else()
-find_package(CUDA REQUIRED)
-include_directories("${CUDA_INCLUDE_DIRS}")
+elseif (KTRANSFORMERS_USE_CUDA)
include(CheckLanguage)
check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
@@ -397,7 +384,7 @@ elseif(UNIX)
elseif(KTRANSFORMERS_USE_MUSA)
target_link_libraries(${PROJECT_NAME} PRIVATE MUSA::musart)
elseif(KTRANSFORMERS_USE_XPU)
-else()
+elseif(KTRANSFORMERS_USE_CUDA AND NOT KTRANSFORMERS_USE_MUSA)
target_link_libraries(${PROJECT_NAME} PRIVATE "${CUDAToolkit_LIBRARY_DIR}/libcudart.so")
endif()
endif()

View file

@@ -148,10 +148,10 @@ def local_chat(
logging.basicConfig(level=logging.INFO)
system = platform.system()
-if system == "Windows":
-os.system("cls") if local_rank == 0 else None
-else:
-os.system("clear") if local_rank == 0 else None
+# if system == "Windows":
+# os.system("cls") if local_rank == 0 else None
+# else:
+# os.system("clear") if local_rank == 0 else None
print(f"{model=}") if local_rank == 0 else None

View file

@@ -11,7 +11,7 @@ from transformers.cache_utils import Cache
from ktransformers.models.modeling_deepseek import DeepseekV2Attention, apply_rotary_pos_emb
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.ascend.ascend_utils import get_tensor_parallel_size, allreduce_wrapper
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
from ktransformers.util.utils import get_compute_capability, get_use_npu_graph, CUR_DEVICE
from ktransformers.util.vendors import device_manager, GPUVendor
from ktransformers.util import utils

View file

@@ -8,7 +8,7 @@ from transformers import PretrainedConfig
from ktransformers.util.ascend.ascend_utils import get_tensor_parallel_size, get_tensor_parallel_group
from ktransformers.operators.experts import KExpertsCPU, KTransformersExperts, EXPERTS_MAP, KDeepseekV3MoE, cuda_graphs
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
from ktransformers.util.utils import CUR_DEVICE, get_use_npu_graph, InferenceState

View file

@@ -5,8 +5,8 @@ from transformers import PretrainedConfig
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util import utils
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
+from ktransformers.util.custom_loader import translate_name_to_gguf
class KDeepseekV3RMSNormW8A8(BaseInjectedModule):
def __init__(self,

View file

@@ -14,9 +14,9 @@ from ktransformers.util.ascend.ascend_utils import (
get_tensor_parallel_group
)
from ktransformers.util import utils
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
from ktransformers.util.utils import InferenceState
+from ktransformers.util.custom_loader import translate_name_to_gguf
class KLinearW8A8(KLinearBase):
def __init__(
@@ -39,6 +39,11 @@ class KLinearW8A8(KLinearBase):
for key in keys:
if device is None:
device = utils.CUR_DEVICE
+key = translate_name_to_gguf(key)
+if key == "lm_head":
+key = "output"
if key + ".weight" in self.gguf_loader.safetensor_loader.tensor_file_map:
if key + ".deq_scale" in self.gguf_loader.safetensor_loader.tensor_file_map:
qweight = self.gguf_loader.safetensor_loader.load_tensor(f"{key}.weight")
@@ -47,25 +52,25 @@ class KLinearW8A8(KLinearBase):
input_scale = self.gguf_loader.safetensor_loader.load_tensor(f"{key}.input_scale")
input_offset = self.gguf_loader.safetensor_loader.load_tensor(f"{key}.input_offset")
tensors = (qweight, deq_scale, quant_bias, input_scale, input_offset)
+print(f"Loading {key} with shape {qweight.shape}, {deq_scale.shape}, {quant_bias.shape}, {input_scale.shape}, {input_offset.shape}")
+print(tensors)
return tensors
elif key + ".weight_scale" in self.gguf_loader.safetensor_loader.tensor_file_map:
if key.endswith("ffn_gate_shexp"):
parts = key.split(".")
layer = parts[1]
gate_weight = self.gguf_loader.safetensor_loader.load_tensor(f"blk.{layer}.ffn_gate_shexp.weight")
-gate_weight = get_safetensors_cut_weight(self.key, gate_weight).t()
up_weight = self.gguf_loader.safetensor_loader.load_tensor(f"blk.{layer}.ffn_up_shexp.weight")
-up_weight = get_safetensors_cut_weight(self.key, up_weight).t()
+gate_up_weight = torch.cat((gate_weight, up_weight), 0)
gate_scale = self.gguf_loader.safetensor_loader.load_tensor(f"blk.{layer}.ffn_gate_shexp.weight_scale")
-gate_scale = get_safetensors_cut_weight(self.key, gate_scale)
up_scale = self.gguf_loader.safetensor_loader.load_tensor(f"blk.{layer}.ffn_up_shexp.weight_scale")
-up_scale = get_safetensors_cut_weight(self.key, up_scale)
-gate_up_weight = torch.cat((gate_weight, up_weight), 1)
gate_up_scale = torch.cat((gate_scale, up_scale), 0)
gate_offset = self.gguf_loader.safetensor_loader.load_tensor(f"blk.{layer}.ffn_gate_shexp.weight_offset")
up_offset = self.gguf_loader.safetensor_loader.load_tensor(f"blk.{layer}.ffn_up_shexp.weight_offset")
gate_up_offset = torch.cat((gate_offset, up_offset), 0)
tensors = (gate_up_weight, gate_up_scale, gate_up_offset)
+print(f"Loading {key} as ffn_gate_shexp with shape {gate_up_weight.shape}, {gate_up_scale.shape}, {gate_up_offset.shape}")
+print(tensors)
elif key.endswith("ffn_up_shexp"):
return fake_tensor
else:
@@ -73,10 +78,11 @@ class KLinearW8A8(KLinearBase):
weight_scale = self.gguf_loader.safetensor_loader.load_tensor(f"{key}.weight_scale")
weight_offset = self.gguf_loader.safetensor_loader.load_tensor(f"{key}.weight_offset")
tensors = (qweight, weight_scale, weight_offset)
+print(f"Loading {key} with shape {qweight.shape}, {weight_scale.shape}, {weight_offset.shape}")
+print(tensors)
return tensors
else:
weight = self.gguf_loader.safetensor_loader.load_tensor(f"{key}.weight")
-weight = get_safetensors_cut_weight(self.key, weight)
return weight
else:
raise FileNotFoundError(f"Weight file not found for key {key}")
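
In the W8A8 hunk above, the NPU path now fuses the shared-expert gate and up projections at load time (concatenating weight, scale and offset along dim 0) instead of slicing them with get_safetensors_cut_weight. Below is a minimal sketch of that fusion under the naming used in the diff (blk.<layer>.ffn_gate_shexp / ffn_up_shexp); the `loader` argument stands in for gguf_loader.safetensor_loader and is an assumption, not the commit's exact code.

```python
import torch

def load_fused_gate_up(loader, layer: int):
    """Illustrative only: fuse the shared-expert gate/up weight, scale and offset along dim 0."""
    names = [f"blk.{layer}.ffn_gate_shexp", f"blk.{layer}.ffn_up_shexp"]
    weight = torch.cat([loader.load_tensor(f"{n}.weight") for n in names], 0)
    scale = torch.cat([loader.load_tensor(f"{n}.weight_scale") for n in names], 0)
    offset = torch.cat([loader.load_tensor(f"{n}.weight_offset") for n in names], 0)
    # In the diff, the matching ffn_up_shexp key returns a placeholder tensor,
    # so the fused tensor is only materialized once per layer.
    return weight, scale, offset
```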

View file

@@ -49,6 +49,7 @@ class KDeepseekV3MLPW8A8A2V2(BaseInjectedModule, DeepseekV3MLP):
original_dtype = x.dtype
quant_out, dynamic_scale = torch_npu.npu_dynamic_quant(x)
dynamic_scale = dynamic_scale.view(-1)
gate_up_x = torch_npu.npu_quant_matmul(
quant_out,
self.orig_module.gate_proj.weight,

View file

@@ -36,6 +36,7 @@ from abc import ABC, abstractmethod
from ktransformers.operators.linear import KLinearMarlin, KLinearTorch, KTransformersLinear
import time
from ktransformers.operators.cpuinfer import CPUInfer
+from ktransformers.util.custom_loader import translate_name_to_gguf
def deduplicate_and_sort(lst):
@@ -396,6 +397,16 @@ class KExpertsCPU(KExpertsBase):
gate_type = self.gguf_loader.get_ggml_type(key + ".ffn_gate.0.weight")
up_type = self.gguf_loader.get_ggml_type(key + ".ffn_up.0.weight")
down_type = self.gguf_loader.get_ggml_type(key + ".ffn_down.0.weight")
+elif self.gguf_loader.safetensor_loader is not None:
+# for npu
+# using a temp ugly way to temprary load the tensor
+translate_key = translate_name_to_gguf(key)
+gate = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".ffn_gate_exps.weight").numpy()
+up = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".ffn_up_exps.weight").numpy()
+down = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".ffn_down_exps.weight").numpy()
+gate_type = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".ffn_gate_exps.ggml_type").item()
+up_type = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".ffn_up_exps.ggml_type").item()
+down_type = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".ffn_down_exps.ggml_type").item()
else:
raise ValueError(f"Experts {key} not found in gguf_loader")
res = {key:{"gate": gate, "up": up, "down": down, "gate_type": gate_type, "up_type": up_type, "down_type": down_type}}
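
The new elif branch above is what lets KExpertsCPU run on hosts where the experts ship in safetensors rather than GGUF: each expert weight stays in GGUF-packed form and a companion `<name>.ggml_type` scalar records its quantization type. A hedged sketch of reading those companion scalars, assuming the same SafeTensorLoader.load_tensor interface used in the diff:

```python
def expert_quant_types(loader, translate_key: str) -> dict:
    """Illustrative only: read the per-projection ggml_type scalars stored beside the expert weights."""
    types = {}
    for proj in ("ffn_gate_exps", "ffn_up_exps", "ffn_down_exps"):
        # the type is stored as a 0-d tensor, hence .item()
        types[proj] = int(loader.load_tensor(f"{translate_key}.{proj}.ggml_type").item())
    return types
```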

View file

@@ -9,7 +9,7 @@ from ktransformers.operators.linear import KTransformersLinear
from ktransformers.util.custom_loader import GGUFLoader, ModelLoader, SafeTensorLoader
from transformers.configuration_utils import PretrainedConfig
from abc import ABC, abstractmethod
+from ktransformers.util.custom_loader import translate_name_to_gguf
# class Base(BaseInjectedModule, ABC):
class KMoEGateBase(ABC):
@@ -55,8 +55,18 @@ class KMoEGateBase(ABC):
down_type = None
for key in keys:
+# key = ".".join(key.split(".")[:-1])
-if isinstance(self.gguf_loader, SafeTensorLoader):
+if self.gguf_loader.safetensor_loader is not None:
+# for npu
+translate_key = translate_name_to_gguf(key)
+translate_key = ".".join(translate_key.split(".")[:2])
+targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]
+weight = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".ffn_gate_inp.weight")
+e_score_correction_bias = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".exp_probs_b.bias")
+weight_type = weight.dtype
+e_score_correction_bias_type = e_score_correction_bias.dtype
+res = {"weight": weight, "e_score_correction_bias": e_score_correction_bias, "weight_type": weight_type, "e_score_correction_bias_type": e_score_correction_bias_type}
+elif isinstance(self.gguf_loader, SafeTensorLoader):
res = self.gguf_loader.load_gate(key, device=device)
elif self.gguf_loader.has_tensor(key+".weight"):
# targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]
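
The NPU branch above rewrites the injected module key before probing the safetensor map: the key is translated to GGUF naming, truncated to its blk.<layer> prefix, and the gate weight / correction-bias names are appended. A small worked example of just that string handling (the input key is hypothetical):

```python
key = "blk.3.ffn_gate_inp"                             # hypothetical, already GGUF-style
translate_key = ".".join(key.split(".")[:2])           # -> "blk.3"
weight_name = translate_key + ".ffn_gate_inp.weight"   # -> "blk.3.ffn_gate_inp.weight"
bias_name = translate_key + ".exp_probs_b.bias"        # -> "blk.3.exp_probs_b.bias"
```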

View file

@@ -38,7 +38,10 @@ if not torch.xpu.is_available():
)
from ktransformers.operators.base_operator import BaseInjectedModule
from transformers.configuration_utils import PretrainedConfig
-from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant
+try:
+from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant
+except:
+print("no triton")
from abc import ABC, abstractmethod
import sys, os
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build"))
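
The try/except above makes the Triton fp8 kernels optional so the module can still be imported on NPU machines without Triton. A slightly more defensive variant of the same guard, offered only as a suggestion (it narrows the bare except and fails loudly if the fp8 path is actually reached; the same idea would apply to act_quant and weight_dequant):

```python
try:
    from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant
    HAS_TRITON_FP8 = True
except ImportError:
    HAS_TRITON_FP8 = False

    def fp8_gemm(*args, **kwargs):
        # only reached if an fp8 code path is exercised on a backend without Triton
        raise RuntimeError("fp8_gemm requires the Triton fp8 kernels, which are not available on this backend")
```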

View file

@@ -18,7 +18,15 @@ import itertools
import copy
from ktransformers.util import utils
+try:
+import torch_npu
+use_torch_npu = torch_npu.npu.is_available()
+except:
+use_torch_npu = False
def inject(module, local_optimization_dict, model_config:AutoConfig ,gguf_loader:GGUFLoader, prefix=''):
for name, child in module._modules.items():
if child is not None:
child_prefix = prefix + name
@@ -124,9 +132,10 @@ def optimize_and_load_gguf(module: nn.Module, rule_file: str, gguf_path: str, mo
model_config = translate_model_config(model_config)
-if q4_gguf_path:
-q4_gguf_loader = GGUFLoader(q4_gguf_path)
-utils.Q4_GGUF_LODER = q4_gguf_loader
+if use_torch_npu:
+if q4_gguf_path:
+q4_gguf_loader = GGUFLoader(q4_gguf_path)
+utils.Q4_GGUF_LODER = q4_gguf_loader
gguf_loader = GGUFLoader(gguf_path, getattr(model_config, "quantize", None))
with torch.device("meta"):
inject(module, optimize_config, model_config, gguf_loader)

View file

@@ -24,7 +24,14 @@ from typing import Sequence
import os
from enum import IntEnum
import torch
-if not torch.xpu.is_available():
+try:
+import torch_npu
+use_torch_npu = torch_npu.npu.is_available()
+except:
+use_torch_npu = False
+if not torch.xpu.is_available() and not use_torch_npu:
import KTransformersOps
import ctypes
import math
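
The commit repeats the same probe in optimize.py and here in custom_gguf.py: import torch_npu if present, remember whether an NPU is reachable, and skip importing KTransformersOps (a CUDA extension) on XPU or NPU hosts. A small helper that consolidates that probe, shown as a suggestion only and not part of the commit; torch_npu.npu.is_available() is the call used in the diff:

```python
def npu_available() -> bool:
    """Return True when torch_npu is importable and reports an available NPU."""
    try:
        import torch_npu
        return torch_npu.npu.is_available()
    except ImportError:
        return False
```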

View file

@@ -53,6 +53,7 @@ class SafeTensorLoader(ModelLoader):
def __init__(self, file_path: str):
self.__load_tensor_file_map(file_path)
+# print(self.tensor_file_map)
def __load_tensor_file_map(self, file_path: str):
# Normalize the given path and make sure it is a directory path
@@ -96,6 +97,7 @@ class SafeTensorLoader(ModelLoader):
def load_tensor(self, key: str, device: str="cpu"):
if translate_name_to_gguf(key) in self.tensor_file_map:
key = translate_name_to_gguf(key)
elif key in self.tensor_file_map:
@@ -267,6 +269,7 @@ class SafeTensorLoader(ModelLoader):
class W8A8SafeTensorLoader(SafeTensorLoader):
def load_tensor(self, key: str, device: str = "cpu"):
+key = translate_name_to_gguf(key)
if key not in self.tensor_file_map:
raise KeyError(f"Key {key} not found in Safetensor files")
file = self.tensor_file_map[key]
@@ -308,13 +311,6 @@ class GGUFLoader(ModelLoader):
gguf_path = os.path.dirname(gguf_path)
safetensor_loader = SafeTensorLoader(gguf_path)
-if quantize == "w8a8_dynamic":
-safetensor_loader = W8A8SafeTensorLoader(gguf_path)
-else:
-safetensor_loader = SafeTensorLoader(gguf_path)
-if safetensor_loader.tensor_file_map:
-self.safetensor_loader = safetensor_loader
-return
self.tensor_info = {}
self.gguf_path = gguf_path
@@ -323,6 +319,14 @@ class GGUFLoader(ModelLoader):
self.gguf_file_meta = {}
self.tensor_device_map = {}
+if quantize == "w8a8_dynamic":
+safetensor_loader = W8A8SafeTensorLoader(gguf_path)
+else:
+safetensor_loader = SafeTensorLoader(gguf_path)
+if safetensor_loader.tensor_file_map:
+self.safetensor_loader = safetensor_loader
+return
# Walk through all the .gguf files in the directory
found_gguf = False
for root, dirs, files in os.walk(gguf_path):
@@ -431,6 +435,7 @@ class GGUFLoader(ModelLoader):
return mmap_data[offset : offset + itemsize * item_count]
def get_undequanted_tensor_and_ggml_type(self, name):
name = translate_name_to_gguf(name)
t = self.tensor_info[name]
data = self.get_mmap_tensor(name)
@@ -439,6 +444,7 @@ class GGUFLoader(ModelLoader):
return data, ggml_type
def load_expert_tensor(self, name, data, expert_id, elements_per_expert, device = "cuda", target_dtype = torch.get_default_dtype())->torch.Tensor:
name = translate_name_to_gguf(name)
t = self.tensor_info[name]
shape = t["shape"]
@@ -468,6 +474,7 @@ class GGUFLoader(ModelLoader):
return values
def load_gguf_tensor(self, name: str, device:str = "cpu", target_dtype = None)->torch.Tensor:
name = translate_name_to_gguf(name)
t = self.tensor_info[name]
if target_dtype == None:
@@ -533,10 +540,12 @@ class GGUFLoader(ModelLoader):
.reshape(values.shape))
return values
def has_tensor(self, name: str):
name = translate_name_to_gguf(name)
return name in self.tensor_info
def get_ggml_type(self, name: str):
name = translate_name_to_gguf(name)
if name not in self.tensor_info:
raise KeyError(f"Key {name} not found in GGUF files")
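
The two GGUFLoader constructor hunks above move the safetensors fallback below the field initialisation, presumably so that a GGUFLoader backed by safetensors still carries tensor_info and tensor_device_map before the early return. A condensed sketch of the resulting constructor flow, assuming the SafeTensorLoader / W8A8SafeTensorLoader classes defined in this file; this is an illustration, not the verbatim implementation:

```python
class GGUFLoaderSketch:
    def __init__(self, gguf_path: str, quantize: str | None = None):
        self.tensor_info = {}
        self.tensor_device_map = {}
        self.safetensor_loader = None
        # w8a8_dynamic checkpoints get the W8A8 loader, everything else the plain one
        st_loader = W8A8SafeTensorLoader(gguf_path) if quantize == "w8a8_dynamic" else SafeTensorLoader(gguf_path)
        if st_loader.tensor_file_map:   # safetensors found: no GGUF scanning needed
            self.safetensor_loader = st_loader
            return
        ...  # otherwise walk gguf_path for *.gguf files and fill tensor_info
```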

View file

@@ -135,10 +135,47 @@ def get_all_used_cuda_device(device_map:dict):
all_device_list = list(all_device_list)
return all_device_list
+def get_current_device():
+return f"npu:{torch.npu.current_device()}"
+def load_cur_state_dict_npu(module: nn.Module, gguf_loader: ModelLoader, prefix: str = "", device="npu"):
+prefix = prefix.replace("orig_module.", "")
+persistent_buffers = {k: v for k, v in module._buffers.items() if k not in module._non_persistent_buffers_set}
+local_name_params = itertools.chain(module._parameters.items(), persistent_buffers.items())
+local_state = {k: v for k, v in local_name_params if v is not None}
+for name, param in local_state.items():
+key = prefix + name
+translated_key = translate_name_to_gguf(key)
+# TODO: Merge all loader.
+# I know this is ugly but lets do it for now.
+if gguf_loader.safetensor_loader is not None:
+load_dequantized_tensor = gguf_loader.safetensor_loader.load_dequantized_tensor
+tensor_file_map = gguf_loader.safetensor_loader.tensor_file_map
+else:
+load_dequantized_tensor = gguf_loader.load_gguf_tensor
+tensor_file_map = gguf_loader.tensor_file_map
+if translated_key in tensor_file_map:
+target_dtype = torch.get_default_dtype()
+device = get_device(translated_key[:translated_key.rfind(".")], gguf_loader.tensor_device_map)
+# Todo need fix
+device = "cpu" if "embd" in translated_key else get_current_device()
+print(f"loading layer {translated_key} to {device}")
+torch.cuda.empty_cache()
+weights = load_dequantized_tensor(translated_key, device=device).to(dtype=target_dtype)
+set_param(module, name, weights)
+del weights
+else:
+#print(load_config.tensor_file_map.keys())
+raise Exception(f"can't find {translated_key} in GGUF file!")
+# TODO: support NPU
def load_cur_state_dict(module: nn.Module, gguf_loader: ModelLoader, prefix: str = "", device="cuda"):
+if use_torch_npu:
+load_cur_state_dict_npu(module, gguf_loader, prefix, device)
+return
prefix = prefix.replace("orig_module.", "")
persistent_buffers = {k: v for k, v in module._buffers.items() if k not in module._non_persistent_buffers_set}
local_name_params = itertools.chain(module._parameters.items(), persistent_buffers.items())
@@ -214,7 +251,7 @@ def xpu_fp16_model(config):
return False
def load_weights(module:nn.Module, gguf_loader:ModelLoader, prefix='', device="cuda"):
-#print(f"recursively loading weights {prefix}")
+# print(f"recursively loading weights {prefix}")
if not isinstance(module, base_operator.BaseInjectedModule):
load_cur_state_dict(module, gguf_loader, prefix, device=device)
for name, child in module._modules.items():
@@ -314,6 +351,7 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
cache_position=cache_position,
past_key_values=past_key_values,
return_dict=False, use_cache=True)[0]
+print(logits)
if past_key_values != None:
past_key_values.change_seq_length(1)
all_cuda_device = ['npu:' + str(index) for index in range(torch.distributed.get_world_size())]
@@ -361,7 +399,6 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
if past_key_values != None and isinstance(past_key_values, StaticCache):
past_key_values.change_seq_length(1)
sync_all_device(all_cuda_device)
-#print(logits)
next_token_scores = logits_warper(inputs, logits[:, -1, :])
if generation_config.do_sample:
probs = nn.functional.softmax(next_token_scores, dim=-1)
@@ -410,6 +447,9 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
next_token = decode_one_tokens(cuda_graph_runner, next_token.unsqueeze(0), position_ids,
cache_position, past_key_values, logits_warper, generation_config,
use_cuda_graph).to(torch_device)
+print(next_token)
inputs = torch.cat((inputs, next_token.unsqueeze(0)), dim=-1)
generated_ids[:, cache_position] = next_token.int()
tokens.append(int(next_token))
@@ -596,8 +636,7 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
cache_position = torch.tensor([seq_length], device=torch_device, dtype=torch.int32)
position_ids = cache_position.unsqueeze(0)
seq_length += 1
-if use_torch_npu:
-past_key_values.position += 1
cuda_graph_runner = None

View file

@@ -7,3 +7,4 @@ cpufeature; sys_platform == 'win32' or sys_platform == 'Windows'
protobuf
tiktoken
blobfile
+einops

22
run_local_chat_npu.sh Normal file
View file

@@ -0,0 +1,22 @@
#!/bin/bash
export CAPTURE_PLUGIN_PATH=ktransformers/util/npu_graph_so/arm
export USE_MERGE=0
export INF_NAN_MODE_FORCE_DISABLE=1
export TASK_QUEUE_ENABLE=0
#export PROF_DECODE=1
#export PROF_PREFILL=1
source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh
torchrun --nproc_per_node 1 \
--master_port 25565 \
-m ktransformers.local_chat_npu \
--cpu_infer 20 \
--model_path /mnt/data/models/DeepSeek-R1-q4km-w8a8\
--gguf_path /mnt/data/models/DeepSeek-R1-q4km-w8a8 \
--optimize_config_path ./ktransformers/optimize/optimize_rules/npu/DeepSeek-V3-Chat-800IA2-npu.yaml \
--max_new_tokens 30 \
--tp 1
# --use_cuda_graph True \

View file

@@ -186,6 +186,8 @@ class VersionInfo:
else:
print("Using native cpu instruct")
if sys.platform.startswith("linux"):
+if KTRANSFORMERS_BUILD_NPU:
+return 'aarch64'
with open('/proc/cpuinfo', 'r', encoding="utf-8") as cpu_f:
cpuinfo = cpu_f.read()
flags_line = [line for line in cpuinfo.split(

View file

@@ -5,5 +5,6 @@
#ifdef __aarch64__
#define llamafile_sgemm llamafile_sgemm_arm80
+#define iqk_mul_mat iqk_mul_mat_arm80
#include "tinyblas_cpu_sgemm.inc"
#endif // __aarch64__