diff --git a/doc/zh/DeepseekR1_tutorial_zh_for_Ascend_NPU.md b/doc/zh/DeepseekR1_tutorial_zh_for_Ascend_NPU.md index 6bc3de9d..31acc0e1 100644 --- a/doc/zh/DeepseekR1_tutorial_zh_for_Ascend_NPU.md +++ b/doc/zh/DeepseekR1_tutorial_zh_for_Ascend_NPU.md @@ -37,8 +37,9 @@ conda install -c conda-forge libstdcxx-ng # 安装`GLIBCXX-3.4.32` apt install zlib1g-dev libtbb-dev libssl-dev libaio-dev libcurl4-openssl-dev pip3 install numpy==1.26.4 # 适配torch/torch_npu pip3 install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cpu -pip3 install packaging ninja transformers==4.43.2 fire protobuf attrs decorator cloudpickle ml-dtypes scipy tornado absl-py psutil +pip3 install packaging ninja fire protobuf attrs decorator cloudpickle ml-dtypes scipy tornado absl-py psutil pip3 install sqlalchemy +pip3 install transformers==4.57.1 #此处注意运行时transformers版本要求4.57.1(其他版本未验证) #pip3 install cpufeature # only for x86 ``` @@ -121,7 +122,7 @@ python ktransformers/server/main.py \ --gguf_path /mnt/data/models/DeepSeek-R1-q4km-w8a8 \ --model_name DeepSeekV3ForCausalLM \ --cpu_infer 60 \ ---optimize_config_path /home/huawei/ktransformers/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-300IA2-npu-serve.yaml \ +--optimize_config_path ./ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-300IA2-npu-serve.yaml \ --max_new_tokens 128 \ --max_batch_size 4 \ --use_cuda_graph \ diff --git a/ktransformers/models/ascend/custom_ascend_modeling_deepseek_v3.py b/ktransformers/models/ascend/custom_ascend_modeling_deepseek_v3.py index e6a81b7b..6034c30f 100644 --- a/ktransformers/models/ascend/custom_ascend_modeling_deepseek_v3.py +++ b/ktransformers/models/ascend/custom_ascend_modeling_deepseek_v3.py @@ -114,16 +114,6 @@ class KNPUDeepseekV3ForCausalLM(DeepseekV3PreTrainedModel): print("########################################") print("hidden_states is ", hidden_states) print("########################################") - # with torch.npu.stream(self.call_stream): - # position_ids, page_idx, page_offset, block_tables, hidden_states, bsz, q_len, hidden_size = param - # print("########################################") - # print("position_ids is ", position_ids) - # print("page_idx is ", page_idx) - # print("page_offset is ", page_offset) - # print("block_tables is ", block_tables) - # print("hidden_states is ", hidden_states) - # print("#########################################") - def forward( self, @@ -172,27 +162,8 @@ class KNPUDeepseekV3ForCausalLM(DeepseekV3PreTrainedModel): q_len_raw = None kv_len_raw = batch.minibatch.d_kv_len_list bsz_real = None - # if utils._USE_NPU_GRAPH: - # from libgraph_capture import graph_capture_launch_callback - # param = (position_ids, page_idx, page_offset, block_tables, hidden_states, bsz, q_len, hidden_size) - # graph_capture_launch_callback(self.print_callback, param, 1, self.stream.npu_stream) - # else: - # param = (position_ids, page_idx, page_offset, block_tables, hidden_states, bsz, q_len, hidden_size) - # self.print_callback(param) - - # with torch_npu.npu.stream(self.stream): - # print_ex("####: before decode layer...") for i, decode_layer in enumerate(self.model.layers): - # if not is_prefill: - # if utils._USE_NPU_GRAPH: - # from libgraph_capture import graph_capture_launch_callback - # param = (hidden_states, ) - # graph_capture_launch_callback(self.print_callback, param, 1, self.stream.npu_stream) - # else: - # param = (hidden_states, ) - # self.print_callback(param) - # attn residual = hidden_states hidden_states = decode_layer.input_layernorm(hidden_states) diff --git a/ktransformers/server/backend/interfaces/ktransformers.py b/ktransformers/server/backend/interfaces/ktransformers.py index 0839098e..80bc7013 100644 --- a/ktransformers/server/backend/interfaces/ktransformers.py +++ b/ktransformers/server/backend/interfaces/ktransformers.py @@ -232,12 +232,6 @@ class KTransformersInterface(TransformersInterface): ) self.seq_length = 1 - # flat_prev_ids = self.generated_ids.flatten() - # for i in range(min(self.seq_length, flat_input_ids.shape[0]) - 1): - # if flat_input_ids[i] == flat_prev_ids[i]: - # same_prefix += 1 - # else: - # break logger.debug(f"same prefix len: {same_prefix}") self.cache.remove_suffix(same_prefix) diff --git a/ktransformers/server/utils/serve_profiling.py b/ktransformers/server/utils/serve_profiling.py index 69d6b8c6..0fd51756 100644 --- a/ktransformers/server/utils/serve_profiling.py +++ b/ktransformers/server/utils/serve_profiling.py @@ -106,19 +106,3 @@ class ProfStatItem: PROF_TIME_STAT = ProfTimeStat() - -# j=0 -# start_time = PROF_TIME_STAT.record_start_time() -# for i in range(500): -# j+=1 -# PROF_TIME_STAT.add_time_stat(ProfStatKey.ExpertsSummitCurrLayer, start_time, False) - -# for i in range(500): -# j+=1 -# PROF_TIME_STAT.add_time_stat(ProfStatKey.ExpertsSummitCurrLayer, start_time, False) - -# for i in range(500): -# j+=1 -# PROF_TIME_STAT.add_time_stat(ProfStatKey.ExpertsSummitCurrLayer, start_time, False) - -# PROF_TIME_STAT.print_all() \ No newline at end of file