From fb9b1f2b007ce3fc9c2a1bc5bae5e812fd0aab74 Mon Sep 17 00:00:00 2001
From: "Li, Zonghang" <870644199@qq.com>
Date: Mon, 9 Jun 2025 13:04:22 +0400
Subject: [PATCH] reformat llama.cpp

---
 src/llama.cpp | 61 +++++++++++++++++++++++++++------------------
 1 file changed, 32 insertions(+), 29 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 0e615b67..c03d92a5 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -7472,10 +7472,7 @@ static void llm_load_qwen2_tensors(
         uint32_t n_world,
         uint32_t my_rank,
         const uint32_t * n_layer_window,
-        bool * use_mmap_buffer,
         bool set_needed) {
-    (void)use_mmap_buffer; // unused in this function
-
     const auto tn = LLM_TN(model.arch);
 
     ggml_context * ctx_input = nullptr;
@@ -7590,10 +7587,10 @@ static bool llm_load_tensors_impl(
             GGML_ASSERT(local_i != -1);
 
             if (local_i % window_size >= window_size - n_gpu_layers) {
-                // LLAMA_LOG_INFO("Layer %i assigned to gpu (cache index %i)\n", i, local_i);
+                LLAMA_LOG_DEBUG("Layer %i assigned to gpu (cache index %i)\n", i, local_i);
                 model.buft_layer[local_i] = llama_default_buffer_type_offload(model, main_gpu);
             } else {
-                // LLAMA_LOG_INFO("Layer %i assigned to cpu (cache index %i)\n", i, local_i);
+                LLAMA_LOG_DEBUG("Layer %i assigned to cpu (cache index %i)\n", i, local_i);
                 model.buft_layer[local_i] = llama_default_buffer_type_cpu(model, true);
             }
         }
@@ -7603,8 +7600,8 @@ static bool llm_load_tensors_impl(
     if (my_rank == 0) {
         model.buft_input = llama_default_buffer_type_cpu(model, true);
         model.buft_output = llama_default_buffer_type_cpu(model, true);
-        // LLAMA_LOG_INFO("Layer input assigned to cpu\n");
-        // LLAMA_LOG_INFO("Layer output assigned to cpu\n");
+        LLAMA_LOG_DEBUG("Layer input assigned to cpu\n");
+        LLAMA_LOG_DEBUG("Layer output assigned to cpu\n");
     }
 
     // count used buffer types
@@ -8212,7 +8209,7 @@ static bool llm_load_tensors_impl(
                     }
                 } break;
            case LLM_ARCH_QWEN2:
-                llm_load_qwen2_tensors(ml, model, ctx_map, n_world, my_rank, n_layer_window, &use_mmap_buffer, true);
+                llm_load_qwen2_tensors(ml, model, ctx_map, n_world, my_rank, n_layer_window, true);
                break;
            case LLM_ARCH_QWEN2MOE:
                {
@@ -11182,8 +11179,6 @@ struct llm_build_context {
 
        cur = llm_build_out_embd(ctx0, lctx, hparams, cb);
 
-        // cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-
        cur = llm_build_norm(ctx0, cur, hparams,
                model.output_norm, NULL,
                LLM_NORM_RMS, cb, -1);
@@ -12724,18 +12719,19 @@ struct llm_build_context {
    }
 
    std::vector<struct ggml_cgraph *> build_qwen2() {
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
        const int64_t n_embd_head = hparams.n_embd_head_v;
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_head == hparams.n_rot);
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
 
        // create a vector to hold the subgraphs
        std::vector<struct ggml_cgraph *> sub_gfs;
        struct ggml_cgraph * sub_gf = nullptr;
 
-        struct ggml_tensor * cur = nullptr;
-        struct ggml_tensor * inpL = nullptr;
-        struct ggml_tensor * inpB = nullptr;
-
+        struct ggml_tensor * cur = nullptr;
+        struct ggml_tensor * inpL = nullptr;
+        struct ggml_tensor * inpB = nullptr;
        const uint32_t n_world = this->cparams.n_world;
        const uint32_t my_rank = this->cparams.rank;
        const uint32_t * n_layer_window = this->cparams.n_layer_window;
@@ -12751,7 +12747,7 @@ struct llm_build_context {
            sub_gfs.push_back(sub_gf);
 
            sub_gf = nullptr;
-            inpL = nullptr;
+            inpL = nullptr;
        }
 
        // inpB - contains the output embedding from other nodes
@@ -12763,6 +12759,7 @@ struct llm_build_context {
        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
        for (int il = 0; il < n_layer; ++il) {
            if (!this_layer_is_mine(il, n_world, my_rank, n_layer_window)) {
                // if we have an active sub-graph, add it to the list
@@ -12771,7 +12768,6 @@ struct llm_build_context {
                sub_gfs.push_back(sub_gf);
                sub_gf = nullptr;
            }
-            // synchronous input tensor
            if (inpL != inpB) {
                inpL = inpB;
            }
@@ -12801,21 +12797,27 @@ struct llm_build_context {
                // compute Q and K and RoPE them
                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[local_il].wq, cur);
                cb(Qcur, "Qcur", il);
-                Qcur = ggml_add(ctx0, Qcur, model.layers[local_il].bq);
-                cb(Qcur, "Qcur", il);
+                if (model.layers[local_il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[local_il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
 
                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[local_il].wk, cur);
                cb(Kcur, "Kcur", il);
-                Kcur = ggml_add(ctx0, Kcur, model.layers[local_il].bk);
-                cb(Kcur, "Kcur", il);
+                if (model.layers[local_il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[local_il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
 
                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[local_il].wv, cur);
                cb(Vcur, "Vcur", il);
-                Vcur = ggml_add(ctx0, Vcur, model.layers[local_il].bv);
-                cb(Vcur, "Vcur", il);
+                if (model.layers[local_il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[local_il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
 
                Qcur = ggml_rope_ext(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow
                );
@@ -12830,7 +12832,7 @@ struct llm_build_context {
 
                cur = llm_build_kv(ctx0, lctx, kv_self, sub_gf,
                        model.layers[local_il].wo, model.layers[local_il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
            }
 
            if (il == n_layer - 1) {
@@ -12840,7 +12842,7 @@ struct llm_build_context {
                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
            }
 
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // shortcut
            cb(ffn_inp, "ffn_inp", il);
 
            // feed-forward network
@@ -12858,6 +12860,8 @@ struct llm_build_context {
            cb(cur, "ffn_out", il);
 
            cur = ggml_add(ctx0, cur, ffn_inp); // shortcut
+            cb(cur, "ffn_out", il);
+
            cur = lctx.cvec.apply_to(ctx0, cur, il);
            cb(cur, "l_out", il);
 
@@ -17034,7 +17038,6 @@ static std::vector<struct ggml_cgraph *> llama_build_graph(
            } break;
        case LLM_ARCH_QWEN2:
            {
-                // result.push_back(llm.build_qwen2());
                result = llm.build_qwen2();
            } break;
        case LLM_ARCH_QWEN2MOE:
@@ -21896,7 +21899,7 @@ void llama_model_n_flops(
            llm_load_llama_tensors(*ml, *model, ctx_map, 1, 0, n_layer_window, &use_mmap_buffer, false);
            break;
        case LLM_ARCH_QWEN2:
-            llm_load_qwen2_tensors(*ml, *model, ctx_map, 1, 0, n_layer_window, &use_mmap_buffer, false);
+            llm_load_qwen2_tensors(*ml, *model, ctx_map, 1, 0, n_layer_window, false);
            break;
        default:
            throw std::runtime_error("unsupported architecture\n");