From f99e08b9fe74c038e88606aca78c9447068e5e6f Mon Sep 17 00:00:00 2001
From: leeetao <3122669219@qq.com>
Date: Sun, 23 Feb 2025 08:27:37 +0000
Subject: [PATCH] Added inference support for the Deepseek distilled model

---
 include/llama.h |   2 +
 src/llama.cpp   | 256 ++++++++++++++++++++++++++++++++++--------------
 2 files changed, 185 insertions(+), 73 deletions(-)

diff --git a/include/llama.h b/include/llama.h
index b7c170ab..d7920930 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -105,6 +105,8 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH  = 24,
         LLAMA_VOCAB_PRE_TYPE_EXAONE        = 25,
         LLAMA_VOCAB_PRE_TYPE_CHAMELEON     = 26,
+        LLAMA_VOCAB_PRE_TYPE_MINERVA       = 27,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
     };
 
     enum llama_rope_type {
diff --git a/src/llama.cpp b/src/llama.cpp
index 30226968..bb2cf244 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6695,7 +6695,7 @@ static void llm_load_vocab(
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
             vocab.tokenizer_clean_spaces = false;
         } else if (
-                tokenizer_pre == "qwen2") {
+                tokenizer_pre == "qwen2" || tokenizer_pre == "deepseek-r1-qwen") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
             vocab.tokenizer_clean_spaces = false;
         } else if (
@@ -7383,6 +7383,85 @@ static void llm_load_llama_tensors(
     }
 }
 
+static void llm_load_qwen2_tensors(
+        llama_model_loader & ml,
+        llama_model & model,
+        std::map<ggml_backend_buffer_type_t, ggml_context *> & ctx_map,
+        uint32_t n_world,
+        uint32_t my_rank,
+        const uint32_t * n_layer_window,
+        bool * use_mmap_buffer,
+        bool set_needed) {
+    const auto tn = LLM_TN(model.arch);
+
+    ggml_context * ctx_input        = nullptr;
+    ggml_context * ctx_output       = nullptr;
+    ggml_context * ctx_output_split = nullptr;
+
+    if (my_rank == 0) {
+        ctx_input        = ctx_map.at(model.buft_input.buft);
+        ctx_output       = ctx_map.at(model.buft_output.buft);
+        ctx_output_split = ctx_map.at(model.buft_output.buft_matrix);
+    }
+
+    auto ctx_for_layer       = [&](int i) { return ctx_map.at(model.buft_layer[i].buft); };
+    auto ctx_for_layer_split = [&](int i) { return ctx_map.at(model.buft_layer[i].buft_matrix); };
+
+    const llama_hparams & hparams = model.hparams;
+    const int64_t n_embd     = hparams.n_embd;
+    const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+    const int64_t n_ff       = hparams.n_ff();
+    const int64_t n_vocab    = hparams.n_vocab;
+    const int64_t n_layer    = hparams.n_layer;
+
+    if (my_rank == 0) {
+        // token embedding
+        model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0, set_needed);
+
+        // output
+        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0, set_needed);
+        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED, set_needed);
+
+        // if output is NULL, init from the input tok embed
+        if (model.output == NULL) {
+            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED, set_needed);
+        }
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        if (!this_layer_is_mine(i, n_world, my_rank, n_layer_window)) {
+            continue;
+        }
+
+        int local_i = map_layer_to_local_id(i, n_world, my_rank, n_layer_window);
+        ggml_context * ctx_layer = ctx_for_layer(local_i);
+        ggml_context * ctx_split = ctx_for_layer_split(local_i);
+
+        auto & layer = model.layers[local_i];
+
+        // attention norm
+        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0, set_needed);
+
+        // attention
+        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd},     0, set_needed);
+        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0, set_needed);
+        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0, set_needed);
+        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},     0, set_needed);
+
+        // attention bias tensors
+        layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd},     0, set_needed);
+        layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0, set_needed);
+        layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0, set_needed);
+
+        // feed-forward
+        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0, set_needed);
+
+        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0, set_needed);
+        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0, set_needed);
+        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0, set_needed);
+    }
+}
+
 // Returns false if cancelled by progress_callback
 static bool llm_load_tensors_impl(
     llama_model_loader & ml,
@@ -8049,44 +8129,8 @@ static bool llm_load_tensors_impl(
             }
         } break;
         case LLM_ARCH_QWEN2:
-            {
-                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-
-                // output
-                {
-                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                    model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                    // if output is NULL, init from the input tok embed
-                    if (model.output == NULL) {
-                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-                    }
-                }
-
-                for (int i = 0; i < n_layer; ++i) {
-                    ggml_context * ctx_layer = ctx_for_layer(i);
-                    ggml_context * ctx_split = ctx_for_layer_split(i);
-
-                    auto & layer = model.layers[i];
-
-                    layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-
-                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-
-                    // optional bias tensors
-                    layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
-                    layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
-                    layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
-
-                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-
-                    layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
-                    layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
-                    layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
-                }
-            } break;
+            llm_load_qwen2_tensors(ml, model, ctx_map, n_world, my_rank, n_layer_window, &use_mmap_buffer, true);
+            break;
         case LLM_ARCH_QWEN2MOE:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -12591,18 +12635,40 @@ struct llm_build_context {
         return gf;
     }
 
-    struct ggml_cgraph * build_qwen2() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+    std::vector<struct ggml_cgraph *> build_qwen2() {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
+        // create a vector to hold the subgraphs
+        std::vector<struct ggml_cgraph *> sub_gfs;
+        struct ggml_cgraph * sub_gf = nullptr;
+        struct ggml_tensor * cur    = nullptr;
+        struct ggml_tensor * inpL   = nullptr;
+        struct ggml_tensor * inpB   = nullptr;
 
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        const uint32_t   n_world        = this->cparams.n_world;
+        const uint32_t   my_rank        = this->cparams.rank;
+        const uint32_t * n_layer_window = this->cparams.n_layer_window;
+
+        if (my_rank == 0) {
+            sub_gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+            // inp_embd - contains the input embedding
+            inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+            // build the input layer as a separate subgraph
+            ggml_build_forward_expand(sub_gf, inpL);
+            sub_gfs.push_back(sub_gf);
+
+            sub_gf = nullptr;
+            inpL   = nullptr;
+        }
+
+        // inpB - contains the output embedding from other nodes
+        inpB = llm_build_backend_embd(ctx0, lctx, hparams, batch, cb);
+
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();
 
@@ -12610,30 +12676,54 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
         for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * inpSA = inpL;
+            if (!this_layer_is_mine(il, n_world, my_rank, n_layer_window)) {
+                // if we have an active sub-graph, add it to the list
+                if (sub_gf != nullptr && inpL != nullptr) {
+                    ggml_build_forward_expand(sub_gf, cur);
+                    sub_gfs.push_back(sub_gf);
+                    sub_gf = nullptr;
+                }
+                // synchronize the input tensor with the boundary embedding
+                if (inpL != inpB) {
+                    inpL = inpB;
+                }
+                continue;
+            }
 
+            if (inpL == nullptr) {
+                inpL = inpB;
+            }
+
+            // start a new sub-graph
+            if (sub_gf == nullptr) {
+                sub_gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+            }
+
+            struct ggml_tensor * inpSA = inpL; // used for the residual shortcut
+            int local_il = map_layer_to_local_id(il, n_world, my_rank, n_layer_window);
+
             // norm
             cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm, NULL,
-                    LLM_NORM_RMS, cb, il);
+                    model.layers[local_il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
             cb(cur, "attn_norm", il);
 
             // self-attention
             {
                 // compute Q and K and RoPE them
-                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[local_il].wq, cur);
                 cb(Qcur, "Qcur", il);
-                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                Qcur = ggml_add(ctx0, Qcur, model.layers[local_il].bq);
                 cb(Qcur, "Qcur", il);
 
-                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[local_il].wk, cur);
                 cb(Kcur, "Kcur", il);
-                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                Kcur = ggml_add(ctx0, Kcur, model.layers[local_il].bk);
                 cb(Kcur, "Kcur", il);
 
-                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[local_il].wv, cur);
                 cb(Vcur, "Vcur", il);
-                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                Vcur = ggml_add(ctx0, Vcur, model.layers[local_il].bv);
                 cb(Vcur, "Vcur", il);
 
                 Qcur = ggml_rope_ext(
@@ -12650,9 +12740,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);
 
-                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
-                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                cur = llm_build_kv(ctx0, lctx, kv_self, sub_gf,
+                        model.layers[local_il].wo, model.layers[local_il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -12667,19 +12757,19 @@ struct llm_build_context {
 
             // feed-forward network
             cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                    model.layers[il].ffn_norm, NULL,
+                    model.layers[local_il].ffn_norm, NULL,
                     LLM_NORM_RMS, cb, il);
             cb(cur, "ffn_norm", il);
 
             cur = llm_build_ffn(ctx0, lctx, cur,
-                    model.layers[il].ffn_up,   NULL, NULL,
-                    model.layers[il].ffn_gate, NULL, NULL,
-                    model.layers[il].ffn_down, NULL, NULL,
+                    model.layers[local_il].ffn_up,   NULL, NULL,
+                    model.layers[local_il].ffn_gate, NULL, NULL,
+                    model.layers[local_il].ffn_down, NULL, NULL,
                     NULL,
                     LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
             cb(cur, "ffn_out", il);
 
-            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = ggml_add(ctx0, cur, ffn_inp); // residual shortcut
             cur = lctx.cvec.apply_to(ctx0, cur, il);
             cb(cur, "l_out", il);
 
@@ -12687,20 +12777,35 @@ struct llm_build_context {
             inpL = cur;
         }
 
-        cur = inpL;
+        // add the last active sub-graph to the list
+        if (sub_gf != nullptr) {
+            ggml_build_forward_expand(sub_gf, cur);
+            sub_gfs.push_back(sub_gf);
+            sub_gf = nullptr;
+        }
 
-        cur = llm_build_norm(ctx0, cur, hparams,
-                model.output_norm, NULL,
-                LLM_NORM_RMS, cb, -1);
-        cb(cur, "result_norm", -1);
+        // output norm and lm_head
+        if (my_rank == 0) {
+            // start a new sub-graph for the output
+            sub_gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
-        // lm_head
-        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
-        cb(cur, "result_output", -1);
+            cur = llm_build_out_embd(ctx0, lctx, hparams, cb);
 
-        ggml_build_forward_expand(gf, cur);
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.output_norm, NULL,
+                    LLM_NORM_RMS, cb, -1);
+            cb(cur, "result_norm", -1);
 
-        return gf;
+            // lm_head
+            cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+            cb(cur, "result_output", -1);
+
+            ggml_build_forward_expand(sub_gf, cur);
+            sub_gfs.push_back(sub_gf);
+            sub_gf = nullptr;
+        }
+
+        return sub_gfs;
     }
 
     struct ggml_cgraph * build_qwen2moe() {
@@ -16788,7 +16893,7 @@ static std::vector<struct ggml_cgraph *> llama_build_graph(
 
     llm.init();
 
-    GGML_ASSERT(model.arch == LLM_ARCH_LLAMA && "this model is currently not supported");
+    GGML_ASSERT((model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_QWEN2) && "this model is currently not supported");
 
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
@@ -16841,7 +16946,7 @@ static std::vector<struct ggml_cgraph *> llama_build_graph(
             } break;
         case LLM_ARCH_QWEN2:
             {
-                result.push_back(llm.build_qwen2());
+                result = llm.build_qwen2();
             } break;
         case LLM_ARCH_QWEN2MOE:
             {
@@ -21235,8 +21341,12 @@ void llama_model_n_flops(
         case LLM_ARCH_MINICPM:
        case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
+            llm_load_llama_tensors(*ml, *model, ctx_map, 1, 0, n_layer_window, &use_mmap_buffer, false);
             break;
+        case LLM_ARCH_QWEN2:
+            llm_load_qwen2_tensors(*ml, *model, ctx_map, 1, 0, n_layer_window, &use_mmap_buffer, false);
+            break;
         default:
             throw std::runtime_error("unsupported architecture\n");
     }
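
Note on the two helpers the patch relies on: this_layer_is_mine() and
map_layer_to_local_id() are not part of this diff and are assumed to exist
elsewhere in the tree. Below is a minimal sketch of one plausible
implementation, assuming n_layer_window[r] is the number of consecutive
layers rank r owns per round of a cyclic layer schedule; the names and
exact semantics are assumptions, not the fork's actual code:

    #include <cstdint>

    // length of one scheduling round = sum of all ranks' window sizes
    static uint32_t window_total(uint32_t n_world, const uint32_t * n_layer_window) {
        uint32_t total = 0;
        for (uint32_t r = 0; r < n_world; ++r) {
            total += n_layer_window[r];
        }
        return total;
    }

    // true if global layer i falls inside my_rank's window of the current round
    static bool this_layer_is_mine(uint32_t i, uint32_t n_world, uint32_t my_rank,
                                   const uint32_t * n_layer_window) {
        uint32_t pos = i % window_total(n_world, n_layer_window);
        for (uint32_t r = 0; r < n_world; ++r) {
            if (pos < n_layer_window[r]) {
                return r == my_rank;
            }
            pos -= n_layer_window[r];
        }
        return false; // unreachable for a well-formed window
    }

    // index of global layer i among the layers my_rank actually holds,
    // i.e. the index used for model.layers[] on this node
    static int map_layer_to_local_id(uint32_t i, uint32_t n_world, uint32_t my_rank,
                                     const uint32_t * n_layer_window) {
        const uint32_t total = window_total(n_world, n_layer_window);
        uint32_t pos = i % total;
        for (uint32_t r = 0; r < my_rank; ++r) {
            pos -= n_layer_window[r]; // skip windows owned by earlier ranks
        }
        return (int)((i / total) * n_layer_window[my_rank] + pos);
    }

Under this reading, the loader allocates model.layers[] densely per node,
which is why the patch indexes weights with local_i/local_il while keeping
the global index i/il for tensor names and callbacks.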
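build_qwen2() now returns one subgraph per contiguous run of locally owned
layers, plus the input-embedding and output-head subgraphs on rank 0. The
patch does not show how that vector is consumed; the sketch below is a
hypothetical driver loop, where recv_embd()/send_embd() and sched stand in
for whatever transport and backend scheduler the fork actually wires up:

    // hypothetical per-rank driver; not from the patch
    std::vector<struct ggml_cgraph *> sub_gfs = llm.build_qwen2();
    for (size_t k = 0; k < sub_gfs.size(); ++k) {
        if (!(my_rank == 0 && k == 0)) {
            recv_embd(inpB);                      // activations from the previous node
        }
        ggml_backend_sched_graph_compute(sched, sub_gfs[k]); // run the local slice
        struct ggml_cgraph * g = sub_gfs[k];
        if (!(my_rank == 0 && k + 1 == sub_gfs.size())) {
            send_embd(g->nodes[g->n_nodes - 1]);  // hand the boundary tensor onward
        }
    }

Splitting the graph at ownership boundaries keeps each rank's compute in a
single ggml graph while leaving the inter-node exchange (inpB, produced by
llm_build_backend_embd) to the runtime between subgraph executions.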