support npu

This commit is contained in:
Dongjw 2025-07-23 09:54:55 +00:00
parent a641aa8063
commit b982815325
22 changed files with 162 additions and 1562 deletions

.gitignore (vendored): 1 change

@ -30,3 +30,4 @@ csrc/demo
CMakeFiles
kvc2/
sched/
build*/

File diff suppressed because it is too large.


@ -1,76 +0,0 @@
# Install script for directory: /home/djw/py311_717/ktransformers/csrc/ktransformers_ext
# Set the install prefix
if(NOT DEFINED CMAKE_INSTALL_PREFIX)
set(CMAKE_INSTALL_PREFIX "/usr/local")
endif()
string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
# Set the install configuration name.
if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
if(BUILD_TYPE)
string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
else()
set(CMAKE_INSTALL_CONFIG_NAME "Debug")
endif()
message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
endif()
# Set the component getting installed.
if(NOT CMAKE_INSTALL_COMPONENT)
if(COMPONENT)
message(STATUS "Install component: \"${COMPONENT}\"")
set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
else()
set(CMAKE_INSTALL_COMPONENT)
endif()
endif()
# Install shared libraries without execute permission?
if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
set(CMAKE_INSTALL_SO_NO_EXE "1")
endif()
# Is this installation the result of a crosscompile?
if(NOT DEFINED CMAKE_CROSSCOMPILING)
set(CMAKE_CROSSCOMPILING "FALSE")
endif()
# Set path to fallback-tool for dependency-resolution.
if(NOT DEFINED CMAKE_OBJDUMP)
set(CMAKE_OBJDUMP "/usr/bin/objdump")
endif()
if(NOT CMAKE_INSTALL_LOCAL_ONLY)
# Include the install script for the subdirectory.
include("/home/djw/py311_717/ktransformers/build_test/third_party/pybind11/cmake_install.cmake")
endif()
if(NOT CMAKE_INSTALL_LOCAL_ONLY)
# Include the install script for the subdirectory.
include("/home/djw/py311_717/ktransformers/build_test/third_party/llama.cpp/cmake_install.cmake")
endif()
string(REPLACE ";" "\n" CMAKE_INSTALL_MANIFEST_CONTENT
"${CMAKE_INSTALL_MANIFEST_FILES}")
if(CMAKE_INSTALL_LOCAL_ONLY)
file(WRITE "/home/djw/py311_717/ktransformers/build_test/install_local_manifest.txt"
"${CMAKE_INSTALL_MANIFEST_CONTENT}")
endif()
if(CMAKE_INSTALL_COMPONENT)
if(CMAKE_INSTALL_COMPONENT MATCHES "^[a-zA-Z0-9_.+-]+$")
set(CMAKE_INSTALL_MANIFEST "install_manifest_${CMAKE_INSTALL_COMPONENT}.txt")
else()
string(MD5 CMAKE_INST_COMP_HASH "${CMAKE_INSTALL_COMPONENT}")
set(CMAKE_INSTALL_MANIFEST "install_manifest_${CMAKE_INST_COMP_HASH}.txt")
unset(CMAKE_INST_COMP_HASH)
endif()
else()
set(CMAKE_INSTALL_MANIFEST "install_manifest.txt")
endif()
if(NOT CMAKE_INSTALL_LOCAL_ONLY)
file(WRITE "/home/djw/py311_717/ktransformers/build_test/${CMAKE_INSTALL_MANIFEST}"
"${CMAKE_INSTALL_MANIFEST_CONTENT}")
endif()

Binary file not shown.


@ -318,20 +318,7 @@ elseif (UNIX)
endif()
elseif (KTRANSFORMERS_USE_XPU)
add_compile_definitions(KTRANSFORMERS_USE_XPU=1)
elseif (KTRANSFORMERS_USE_NPU)
include(CheckLanguage)
check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
message(STATUS "CUDA detected")
find_package(CUDAToolkit REQUIRED)
include_directories(${CUDAToolkit_INCLUDE_DIRS})
endif()
message(STATUS "enabling CUDA")
enable_language(CUDA)
add_compile_definitions(KTRANSFORMERS_USE_CUDA=1)
else()
find_package(CUDA REQUIRED)
include_directories("${CUDA_INCLUDE_DIRS}")
elseif (KTRANSFORMERS_USE_CUDA)
include(CheckLanguage)
check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
@ -397,7 +384,7 @@ elseif(UNIX)
elseif(KTRANSFORMERS_USE_MUSA)
target_link_libraries(${PROJECT_NAME} PRIVATE MUSA::musart)
elseif(KTRANSFORMERS_USE_XPU)
else()
elseif(KTRANSFORMERS_USE_CUDA AND NOT KTRANSFORMERS_USE_MUSA)
target_link_libraries(${PROJECT_NAME} PRIVATE "${CUDAToolkit_LIBRARY_DIR}/libcudart.so")
endif()
endif()


@ -148,10 +148,10 @@ def local_chat(
logging.basicConfig(level=logging.INFO)
system = platform.system()
if system == "Windows":
os.system("cls") if local_rank == 0 else None
else:
os.system("clear") if local_rank == 0 else None
# if system == "Windows":
# os.system("cls") if local_rank == 0 else None
# else:
# os.system("clear") if local_rank == 0 else None
print(f"{model=}") if local_rank == 0 else None


@ -11,7 +11,7 @@ from transformers.cache_utils import Cache
from ktransformers.models.modeling_deepseek import DeepseekV2Attention, apply_rotary_pos_emb
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.ascend.ascend_utils import get_tensor_parallel_size, allreduce_wrapper
from ktransformers.util.custom_gguf import GGUFLoader
from ktransformers.util.custom_loader import GGUFLoader
from ktransformers.util.utils import get_compute_capability, get_use_npu_graph, CUR_DEVICE
from ktransformers.util.vendors import device_manager, GPUVendor
from ktransformers.util import utils


@ -8,7 +8,7 @@ from transformers import PretrainedConfig
from ktransformers.util.ascend.ascend_utils import get_tensor_parallel_size, get_tensor_parallel_group
from ktransformers.operators.experts import KExpertsCPU, KTransformersExperts, EXPERTS_MAP, KDeepseekV3MoE, cuda_graphs
from ktransformers.util.custom_gguf import GGUFLoader
from ktransformers.util.custom_loader import GGUFLoader
from ktransformers.util.utils import CUR_DEVICE, get_use_npu_graph, InferenceState


@ -5,8 +5,8 @@ from transformers import PretrainedConfig
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util import utils
from ktransformers.util.custom_gguf import GGUFLoader
from ktransformers.util.custom_loader import GGUFLoader
from ktransformers.util.custom_loader import translate_name_to_gguf
class KDeepseekV3RMSNormW8A8(BaseInjectedModule):
def __init__(self,


@ -14,9 +14,9 @@ from ktransformers.util.ascend.ascend_utils import (
get_tensor_parallel_group
)
from ktransformers.util import utils
from ktransformers.util.custom_gguf import GGUFLoader
from ktransformers.util.custom_loader import GGUFLoader
from ktransformers.util.utils import InferenceState
from ktransformers.util.custom_loader import translate_name_to_gguf
class KLinearW8A8(KLinearBase):
def __init__(
@ -39,6 +39,11 @@ class KLinearW8A8(KLinearBase):
for key in keys:
if device is None:
device = utils.CUR_DEVICE
key = translate_name_to_gguf(key)
if key == "lm_head":
key = "output"
if key + ".weight" in self.gguf_loader.safetensor_loader.tensor_file_map:
if key + ".deq_scale" in self.gguf_loader.safetensor_loader.tensor_file_map:
qweight = self.gguf_loader.safetensor_loader.load_tensor(f"{key}.weight")
@ -47,25 +52,25 @@ class KLinearW8A8(KLinearBase):
input_scale = self.gguf_loader.safetensor_loader.load_tensor(f"{key}.input_scale")
input_offset = self.gguf_loader.safetensor_loader.load_tensor(f"{key}.input_offset")
tensors = (qweight, deq_scale, quant_bias, input_scale, input_offset)
print(f"Loading {key} with shape {qweight.shape}, {deq_scale.shape}, {quant_bias.shape}, {input_scale.shape}, {input_offset.shape}")
print(tensors)
return tensors
elif key + ".weight_scale" in self.gguf_loader.safetensor_loader.tensor_file_map:
if key.endswith("ffn_gate_shexp"):
parts = key.split(".")
layer = parts[1]
gate_weight = self.gguf_loader.safetensor_loader.load_tensor(f"blk.{layer}.ffn_gate_shexp.weight")
gate_weight = get_safetensors_cut_weight(self.key, gate_weight).t()
up_weight = self.gguf_loader.safetensor_loader.load_tensor(f"blk.{layer}.ffn_up_shexp.weight")
up_weight = get_safetensors_cut_weight(self.key, up_weight).t()
gate_up_weight = torch.cat((gate_weight, up_weight), 0)
gate_scale = self.gguf_loader.safetensor_loader.load_tensor(f"blk.{layer}.ffn_gate_shexp.weight_scale")
gate_scale = get_safetensors_cut_weight(self.key, gate_scale)
up_scale = self.gguf_loader.safetensor_loader.load_tensor(f"blk.{layer}.ffn_up_shexp.weight_scale")
up_scale = get_safetensors_cut_weight(self.key, up_scale)
gate_up_weight = torch.cat((gate_weight, up_weight), 1)
gate_up_scale = torch.cat((gate_scale, up_scale), 0)
gate_offset = self.gguf_loader.safetensor_loader.load_tensor(f"blk.{layer}.ffn_gate_shexp.weight_offset")
up_offset = self.gguf_loader.safetensor_loader.load_tensor(f"blk.{layer}.ffn_up_shexp.weight_offset")
gate_up_offset = torch.cat((gate_offset, up_offset), 0)
tensors = (gate_up_weight, gate_up_scale, gate_up_offset)
print(f"Loading {key} as ffn_gate_shexp with shape {gate_up_weight.shape}, {gate_up_scale.shape}, {gate_up_offset.shape}")
print(tensors)
elif key.endswith("ffn_up_shexp"):
return fake_tensor
else:
@ -73,10 +78,11 @@ class KLinearW8A8(KLinearBase):
weight_scale = self.gguf_loader.safetensor_loader.load_tensor(f"{key}.weight_scale")
weight_offset = self.gguf_loader.safetensor_loader.load_tensor(f"{key}.weight_offset")
tensors = (qweight, weight_scale, weight_offset)
print(f"Loading {key} with shape {qweight.shape}, {weight_scale.shape}, {weight_offset.shape}")
print(tensors)
return tensors
else:
weight = self.gguf_loader.safetensor_loader.load_tensor(f"{key}.weight")
weight = get_safetensors_cut_weight(self.key, weight)
return weight
else:
raise FileNotFoundError(f"Weight file not found for key {key}")
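
For readers skimming the hunk above: the W8A8 path first translates the injected module's key into GGUF naming, remaps lm_head to output, then picks one of three layouts (fully quantized, weight-only quantized, plain) by probing for companion tensors. A minimal sketch of that decision flow, assuming only the SafeTensorLoader interface visible in the diff (tensor_file_map, load_tensor) and leaving out the shared-expert concatenation and tensor-parallel slicing:

# Illustrative sketch, not part of the commit. `loader` stands in for the
# SafeTensorLoader used above; only tensor_file_map and load_tensor are assumed.
from ktransformers.util.custom_loader import translate_name_to_gguf

def load_w8a8_weight(loader, key: str):
    key = translate_name_to_gguf(key)           # module name -> GGUF-style name
    if key == "lm_head":                        # GGUF stores the output head as "output"
        key = "output"
    files = loader.tensor_file_map
    if key + ".weight" not in files:
        raise FileNotFoundError(f"Weight file not found for key {key}")
    if key + ".deq_scale" in files:             # fully quantized linear (W8A8)
        names = ("weight", "deq_scale", "quant_bias", "input_scale", "input_offset")
        return tuple(loader.load_tensor(f"{key}.{n}") for n in names)
    if key + ".weight_scale" in files:          # weight-only quantized linear
        names = ("weight", "weight_scale", "weight_offset")
        return tuple(loader.load_tensor(f"{key}.{n}") for n in names)
    return loader.load_tensor(f"{key}.weight")  # plain (unquantized) weight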


@ -49,6 +49,7 @@ class KDeepseekV3MLPW8A8A2V2(BaseInjectedModule, DeepseekV3MLP):
original_dtype = x.dtype
quant_out, dynamic_scale = torch_npu.npu_dynamic_quant(x)
dynamic_scale = dynamic_scale.view(-1)
gate_up_x = torch_npu.npu_quant_matmul(
quant_out,
self.orig_module.gate_proj.weight,


@ -36,6 +36,7 @@ from abc import ABC, abstractmethod
from ktransformers.operators.linear import KLinearMarlin, KLinearTorch, KTransformersLinear
import time
from ktransformers.operators.cpuinfer import CPUInfer
from ktransformers.util.custom_loader import translate_name_to_gguf
def deduplicate_and_sort(lst):
@ -396,6 +397,16 @@ class KExpertsCPU(KExpertsBase):
gate_type = self.gguf_loader.get_ggml_type(key + ".ffn_gate.0.weight")
up_type = self.gguf_loader.get_ggml_type(key + ".ffn_up.0.weight")
down_type = self.gguf_loader.get_ggml_type(key + ".ffn_down.0.weight")
elif self.gguf_loader.safetensor_loader is not None:
# for npu
# using a temporary (admittedly ugly) way to load these tensors
translate_key = translate_name_to_gguf(key)
gate = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".ffn_gate_exps.weight").numpy()
up = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".ffn_up_exps.weight").numpy()
down = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".ffn_down_exps.weight").numpy()
gate_type = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".ffn_gate_exps.ggml_type").item()
up_type = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".ffn_up_exps.ggml_type").item()
down_type = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".ffn_down_exps.ggml_type").item()
else:
raise ValueError(f"Experts {key} not found in gguf_loader")
res = {key:{"gate": gate, "up": up, "down": down, "gate_type": gate_type, "up_type": up_type, "down_type": down_type}}
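
The new elif branch above bypasses the GGUF mmap path entirely whenever a safetensor loader is present. A condensed sketch of what it does, with loader standing in for gguf_loader.safetensor_loader and the expert tensor names (ffn_gate_exps, ffn_up_exps, ffn_down_exps plus their ggml_type companions) taken from the diff:

# Illustrative sketch of the NPU fallback: fetch the fused expert weights and
# their ggml types straight from safetensors instead of a .gguf file.
from ktransformers.util.custom_loader import translate_name_to_gguf

def load_experts_from_safetensors(loader, key: str):
    k = translate_name_to_gguf(key)
    out = {}
    for part in ("gate", "up", "down"):
        out[part] = loader.load_tensor(f"{k}.ffn_{part}_exps.weight").numpy()
        out[f"{part}_type"] = loader.load_tensor(f"{k}.ffn_{part}_exps.ggml_type").item()
    return {key: out}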


@ -9,7 +9,7 @@ from ktransformers.operators.linear import KTransformersLinear
from ktransformers.util.custom_loader import GGUFLoader, ModelLoader, SafeTensorLoader
from transformers.configuration_utils import PretrainedConfig
from abc import ABC, abstractmethod
from ktransformers.util.custom_loader import translate_name_to_gguf
# class Base(BaseInjectedModule, ABC):
class KMoEGateBase(ABC):
@ -55,8 +55,18 @@ class KMoEGateBase(ABC):
down_type = None
for key in keys:
# key = ".".join(key.split(".")[:-1])
if isinstance(self.gguf_loader, SafeTensorLoader):
if self.gguf_loader.safetensor_loader is not None:
# for npu
translate_key = translate_name_to_gguf(key)
translate_key = ".".join(translate_key.split(".")[:2])
targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]
weight = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".ffn_gate_inp.weight")
e_score_correction_bias = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".exp_probs_b.bias")
weight_type = weight.dtype
e_score_correction_bias_type = e_score_correction_bias.dtype
res = {"weight": weight, "e_score_correction_bias": e_score_correction_bias, "weight_type": weight_type, "e_score_correction_bias_type": e_score_correction_bias_type}
elif isinstance(self.gguf_loader, SafeTensorLoader):
res = self.gguf_loader.load_gate(key, device=device)
elif self.gguf_loader.has_tensor(key+".weight"):
# targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]
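
The router follows the same idea: the translated key is trimmed to its blk.<layer> prefix, then the gate weight and the e-score correction bias are read directly. A short sketch under the same assumptions as the expert-loading example above:

# Illustrative sketch of the NPU gate-loading branch.
from ktransformers.util.custom_loader import translate_name_to_gguf

def load_gate_from_safetensors(loader, key: str):
    blk = ".".join(translate_name_to_gguf(key).split(".")[:2])   # e.g. "blk.12"
    weight = loader.load_tensor(blk + ".ffn_gate_inp.weight")
    bias = loader.load_tensor(blk + ".exp_probs_b.bias")
    return {"weight": weight, "e_score_correction_bias": bias,
            "weight_type": weight.dtype, "e_score_correction_bias_type": bias.dtype}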


@ -38,7 +38,10 @@ if not torch.xpu.is_available():
)
from ktransformers.operators.base_operator import BaseInjectedModule
from transformers.configuration_utils import PretrainedConfig
from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant
try:
from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant
except:
print("no triton")
from abc import ABC, abstractmethod
import sys, os
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build"))


@ -18,7 +18,15 @@ import itertools
import copy
from ktransformers.util import utils
try:
import torch_npu
use_torch_npu = torch_npu.npu.is_available()
except:
use_torch_npu = False
def inject(module, local_optimization_dict, model_config:AutoConfig ,gguf_loader:GGUFLoader, prefix=''):
for name, child in module._modules.items():
if child is not None:
child_prefix = prefix + name
@ -124,6 +132,7 @@ def optimize_and_load_gguf(module: nn.Module, rule_file: str, gguf_path: str, mo
model_config = translate_model_config(model_config)
if use_torch_npu:
if q4_gguf_path:
q4_gguf_loader = GGUFLoader(q4_gguf_path)
utils.Q4_GGUF_LODER = q4_gguf_loader
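
Several files in this commit gate NPU-specific behaviour behind the same optional-import probe (it appears again in the next hunk). A standalone version of the pattern; the committed code uses a bare except, and ImportError is narrowed here only to make the intent explicit:

# Probe for the optional Ascend backend without making torch_npu a hard
# dependency of CUDA/XPU builds.
try:
    import torch_npu                                  # Ascend NPU bindings
    use_torch_npu = torch_npu.npu.is_available()      # installed but no device -> False
except ImportError:
    use_torch_npu = False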


@ -24,7 +24,14 @@ from typing import Sequence
import os
from enum import IntEnum
import torch
if not torch.xpu.is_available():
try:
import torch_npu
use_torch_npu = torch_npu.npu.is_available()
except:
use_torch_npu = False
if not torch.xpu.is_available() and not use_torch_npu:
import KTransformersOps
import ctypes
import math


@ -53,6 +53,7 @@ class SafeTensorLoader(ModelLoader):
def __init__(self, file_path: str):
self.__load_tensor_file_map(file_path)
# print(self.tensor_file_map)
def __load_tensor_file_map(self, file_path: str):
# Normalize the given path and make sure it is a directory path
@ -96,6 +97,7 @@ class SafeTensorLoader(ModelLoader):
def load_tensor(self, key: str, device: str="cpu"):
if translate_name_to_gguf(key) in self.tensor_file_map:
key = translate_name_to_gguf(key)
elif key in self.tensor_file_map:
@ -267,6 +269,7 @@ class SafeTensorLoader(ModelLoader):
class W8A8SafeTensorLoader(SafeTensorLoader):
def load_tensor(self, key: str, device: str = "cpu"):
key = translate_name_to_gguf(key)
if key not in self.tensor_file_map:
raise KeyError(f"Key {key} not found in Safetensor files")
file = self.tensor_file_map[key]
@ -308,13 +311,6 @@ class GGUFLoader(ModelLoader):
gguf_path = os.path.dirname(gguf_path)
safetensor_loader = SafeTensorLoader(gguf_path)
if quantize == "w8a8_dynamic":
safetensor_loader = W8A8SafeTensorLoader(gguf_path)
else:
safetensor_loader = SafeTensorLoader(gguf_path)
if safetensor_loader.tensor_file_map:
self.safetensor_loader = safetensor_loader
return
self.tensor_info = {}
self.gguf_path = gguf_path
@ -323,6 +319,14 @@ class GGUFLoader(ModelLoader):
self.gguf_file_meta = {}
self.tensor_device_map = {}
if quantize == "w8a8_dynamic":
safetensor_loader = W8A8SafeTensorLoader(gguf_path)
else:
safetensor_loader = SafeTensorLoader(gguf_path)
if safetensor_loader.tensor_file_map:
self.safetensor_loader = safetensor_loader
return
# Walk through all the .gguf files in the directory
found_gguf = False
for root, dirs, files in os.walk(gguf_path):
@ -431,6 +435,7 @@ class GGUFLoader(ModelLoader):
return mmap_data[offset : offset + itemsize * item_count]
def get_undequanted_tensor_and_ggml_type(self, name):
name = translate_name_to_gguf(name)
t = self.tensor_info[name]
data = self.get_mmap_tensor(name)
@ -439,6 +444,7 @@ class GGUFLoader(ModelLoader):
return data, ggml_type
def load_expert_tensor(self, name, data, expert_id, elements_per_expert, device = "cuda", target_dtype = torch.get_default_dtype())->torch.Tensor:
name = translate_name_to_gguf(name)
t = self.tensor_info[name]
shape = t["shape"]
@ -468,6 +474,7 @@ class GGUFLoader(ModelLoader):
return values
def load_gguf_tensor(self, name: str, device:str = "cpu", target_dtype = None)->torch.Tensor:
name = translate_name_to_gguf(name)
t = self.tensor_info[name]
if target_dtype == None:
@ -533,10 +540,12 @@ class GGUFLoader(ModelLoader):
.reshape(values.shape))
return values
def has_tensor(self, name: str):
name = translate_name_to_gguf(name)
return name in self.tensor_info
def get_ggml_type(self, name: str):
name = translate_name_to_gguf(name)
if name not in self.tensor_info:
raise KeyError(f"Key {name} not found in GGUF files")
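
Taken together, the GGUFLoader hunks above do three things: select W8A8SafeTensorLoader when quantize == "w8a8_dynamic", stop before the GGUF directory walk as soon as a safetensor map is found, and funnel every public lookup through translate_name_to_gguf. A condensed, illustrative view of that control flow, with field initialisation trimmed to the parts touched here:

# Illustrative sketch, not the real class: only the dispatch added in this
# commit is shown.
from ktransformers.util.custom_loader import (
    SafeTensorLoader, W8A8SafeTensorLoader, translate_name_to_gguf)

class GGUFLoaderSketch:
    def __init__(self, gguf_path: str, quantize: str = None):
        self.tensor_info = {}
        self.safetensor_loader = None
        if quantize == "w8a8_dynamic":
            st = W8A8SafeTensorLoader(gguf_path)      # NPU W8A8 checkpoints
        else:
            st = SafeTensorLoader(gguf_path)
        if st.tensor_file_map:                        # safetensors found: skip the GGUF walk
            self.safetensor_loader = st
            return
        # ... otherwise walk *.gguf files exactly as before ...

    def has_tensor(self, name: str) -> bool:
        return translate_name_to_gguf(name) in self.tensor_info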


@ -135,10 +135,47 @@ def get_all_used_cuda_device(device_map:dict):
all_device_list = list(all_device_list)
return all_device_list
def get_current_device():
return f"npu:{torch.npu.current_device()}"
def load_cur_state_dict_npu(module: nn.Module, gguf_loader: ModelLoader, prefix: str = "", device="npu"):
prefix = prefix.replace("orig_module.", "")
persistent_buffers = {k: v for k, v in module._buffers.items() if k not in module._non_persistent_buffers_set}
local_name_params = itertools.chain(module._parameters.items(), persistent_buffers.items())
local_state = {k: v for k, v in local_name_params if v is not None}
for name, param in local_state.items():
key = prefix + name
translated_key = translate_name_to_gguf(key)
# TODO: Merge all loaders.
# I know this is ugly, but let's do it for now.
if gguf_loader.safetensor_loader is not None:
load_dequantized_tensor = gguf_loader.safetensor_loader.load_dequantized_tensor
tensor_file_map = gguf_loader.safetensor_loader.tensor_file_map
else:
load_dequantized_tensor = gguf_loader.load_gguf_tensor
tensor_file_map = gguf_loader.tensor_file_map
if translated_key in tensor_file_map:
target_dtype = torch.get_default_dtype()
device = get_device(translated_key[:translated_key.rfind(".")], gguf_loader.tensor_device_map)
# TODO: needs a proper fix
device = "cpu" if "embd" in translated_key else get_current_device()
print(f"loading layer {translated_key} to {device}")
torch.cuda.empty_cache()
weights = load_dequantized_tensor(translated_key, device=device).to(dtype=target_dtype)
set_param(module, name, weights)
del weights
else:
#print(load_config.tensor_file_map.keys())
raise Exception(f"can't find {translated_key} in GGUF file!")
# TODO: support NPU
def load_cur_state_dict(module: nn.Module, gguf_loader: ModelLoader, prefix: str = "", device="cuda"):
if use_torch_npu:
load_cur_state_dict_npu(module, gguf_loader, prefix, device)
return
prefix = prefix.replace("orig_module.", "")
persistent_buffers = {k: v for k, v in module._buffers.items() if k not in module._non_persistent_buffers_set}
local_name_params = itertools.chain(module._parameters.items(), persistent_buffers.items())
@ -214,7 +251,7 @@ def xpu_fp16_model(config):
return False
def load_weights(module:nn.Module, gguf_loader:ModelLoader, prefix='', device="cuda"):
#print(f"recursively loading weights {prefix}")
# print(f"recursively loading weights {prefix}")
if not isinstance(module, base_operator.BaseInjectedModule):
load_cur_state_dict(module, gguf_loader, prefix, device=device)
for name, child in module._modules.items():
@ -314,6 +351,7 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
cache_position=cache_position,
past_key_values=past_key_values,
return_dict=False, use_cache=True)[0]
print(logits)
if past_key_values != None:
past_key_values.change_seq_length(1)
all_cuda_device = ['npu:' + str(index) for index in range(torch.distributed.get_world_size())]
@ -361,7 +399,6 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
if past_key_values != None and isinstance(past_key_values, StaticCache):
past_key_values.change_seq_length(1)
sync_all_device(all_cuda_device)
#print(logits)
next_token_scores = logits_warper(inputs, logits[:, -1, :])
if generation_config.do_sample:
probs = nn.functional.softmax(next_token_scores, dim=-1)
@ -410,6 +447,9 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
next_token = decode_one_tokens(cuda_graph_runner, next_token.unsqueeze(0), position_ids,
cache_position, past_key_values, logits_warper, generation_config,
use_cuda_graph).to(torch_device)
print(next_token)
inputs = torch.cat((inputs, next_token.unsqueeze(0)), dim=-1)
generated_ids[:, cache_position] = next_token.int()
tokens.append(int(next_token))
@ -596,8 +636,7 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
cache_position = torch.tensor([seq_length], device=torch_device, dtype=torch.int32)
position_ids = cache_position.unsqueeze(0)
seq_length += 1
if use_torch_npu:
past_key_values.position += 1
cuda_graph_runner = None
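
The load_cur_state_dict_npu path introduced above boils down to: translate the parameter key, use whichever loader the checkpoint actually provides, keep embeddings on the host, and place everything else on the current NPU. A compact sketch of that per-parameter dispatch; the helper names come from the diff, while the wrapper itself is illustrative:

# Illustrative sketch of the per-parameter NPU loading dispatch.
import torch
import torch_npu  # registers the torch.npu namespace
from ktransformers.util.custom_loader import translate_name_to_gguf

def npu_device_for(translated_key: str) -> str:
    # embeddings stay on the host; everything else goes to the active NPU
    return "cpu" if "embd" in translated_key else f"npu:{torch.npu.current_device()}"

def load_param(gguf_loader, key: str) -> torch.Tensor:
    translated = translate_name_to_gguf(key)
    if gguf_loader.safetensor_loader is not None:      # NPU safetensor checkpoints
        load = gguf_loader.safetensor_loader.load_dequantized_tensor
        fmap = gguf_loader.safetensor_loader.tensor_file_map
    else:                                              # plain GGUF fallback
        load = gguf_loader.load_gguf_tensor
        fmap = gguf_loader.tensor_file_map
    if translated not in fmap:
        raise KeyError(f"can't find {translated} in the checkpoint")
    weights = load(translated, device=npu_device_for(translated))
    return weights.to(dtype=torch.get_default_dtype())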


@ -7,3 +7,4 @@ cpufeature; sys_platform == 'win32' or sys_platform == 'Windows'
protobuf
tiktoken
blobfile
einops

run_local_chat_npu.sh (new file): 22 lines added

@ -0,0 +1,22 @@
#!/bin/bash
export CAPTURE_PLUGIN_PATH=ktransformers/util/npu_graph_so/arm
export USE_MERGE=0
export INF_NAN_MODE_FORCE_DISABLE=1
export TASK_QUEUE_ENABLE=0
#export PROF_DECODE=1
#export PROF_PREFILL=1
source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh
torchrun --nproc_per_node 1 \
--master_port 25565 \
-m ktransformers.local_chat_npu \
--cpu_infer 20 \
--model_path /mnt/data/models/DeepSeek-R1-q4km-w8a8 \
--gguf_path /mnt/data/models/DeepSeek-R1-q4km-w8a8 \
--optimize_config_path ./ktransformers/optimize/optimize_rules/npu/DeepSeek-V3-Chat-800IA2-npu.yaml \
--max_new_tokens 30 \
--tp 1
# --use_cuda_graph True \


@ -186,6 +186,8 @@ class VersionInfo:
else:
print("Using native cpu instruct")
if sys.platform.startswith("linux"):
if KTRANSFORMERS_BUILD_NPU:
return 'aarch64'
with open('/proc/cpuinfo', 'r', encoding="utf-8") as cpu_f:
cpuinfo = cpu_f.read()
flags_line = [line for line in cpuinfo.split(


@ -5,5 +5,6 @@
#ifdef __aarch64__
#define llamafile_sgemm llamafile_sgemm_arm80
#define iqk_mul_mat iqk_mul_mat_arm80
#include "tinyblas_cpu_sgemm.inc"
#endif // __aarch64__