Merge branch 'tao' into dev

This commit is contained in:
leeetao  2025-02-24 16:48:43 +00:00
commit 224d14eb4c
4 changed files with 198 additions and 74 deletions

View file

@ -873,6 +873,10 @@ static bool assign_layers_to_device(
// model-specific constants // model-specific constants
const int n_embd_k_gqa = llama_model_n_embd_k_gqa(model); const int n_embd_k_gqa = llama_model_n_embd_k_gqa(model);
const int n_embd_v_gqa = llama_model_n_embd_v_gqa(model); const int n_embd_v_gqa = llama_model_n_embd_v_gqa(model);
if (n_embd_k_gqa <= 0 || n_embd_v_gqa <= 0) {
LOG_ERR("Invalid model parameters,n_embd_k_gqa and n_embd_v_gqa cannot be less than 0");
return false;
}
const int n_kv = cparams.n_ctx; const int n_kv = cparams.n_ctx;
const int64_t b = dev_info_set[0].model_bytes.nb_layer; const int64_t b = dev_info_set[0].model_bytes.nb_layer;

View file

@ -105,6 +105,8 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24, LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25, LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26, LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
}; };
enum llama_rope_type { enum llama_rope_type {

View file

@ -2318,6 +2318,7 @@ enum e_model {
MODEL_1B, MODEL_1B,
MODEL_1_3B, MODEL_1_3B,
MODEL_1_4B, MODEL_1_4B,
MODEL_1_5B,
MODEL_1_6B, MODEL_1_6B,
MODEL_2B, MODEL_2B,
MODEL_2_8B, MODEL_2_8B,
@ -2336,6 +2337,7 @@ enum e_model {
MODEL_16B, MODEL_16B,
MODEL_20B, MODEL_20B,
MODEL_30B, MODEL_30B,
MODEL_32B,
MODEL_34B, MODEL_34B,
MODEL_35B, MODEL_35B,
MODEL_40B, MODEL_40B,
@ -5675,6 +5677,7 @@ static const char * llama_model_type_name(e_model type) {
case MODEL_1B: return "1B"; case MODEL_1B: return "1B";
case MODEL_1_3B: return "1.3B"; case MODEL_1_3B: return "1.3B";
case MODEL_1_4B: return "1.4B"; case MODEL_1_4B: return "1.4B";
case MODEL_1_5B: return "1.5B";
case MODEL_1_6B: return "1.6B"; case MODEL_1_6B: return "1.6B";
case MODEL_2B: return "2B"; case MODEL_2B: return "2B";
case MODEL_2_8B: return "2.8B"; case MODEL_2_8B: return "2.8B";
@ -5693,6 +5696,7 @@ static const char * llama_model_type_name(e_model type) {
case MODEL_16B: return "16B"; case MODEL_16B: return "16B";
case MODEL_20B: return "20B"; case MODEL_20B: return "20B";
case MODEL_30B: return "30B"; case MODEL_30B: return "30B";
case MODEL_32B: return "32B";
case MODEL_34B: return "34B"; case MODEL_34B: return "34B";
case MODEL_35B: return "35B"; case MODEL_35B: return "35B";
case MODEL_40B: return "40B"; case MODEL_40B: return "40B";
@ -6046,8 +6050,12 @@ static void llm_load_hparams(
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) { switch (hparams.n_layer) {
case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break; case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
case 28: model.type = hparams.n_embd == 1536 ? e_model::MODEL_1_5B : e_model::MODEL_7B; break;
case 32: model.type = e_model::MODEL_7B; break; case 32: model.type = e_model::MODEL_7B; break;
case 36: model.type = e_model::MODEL_3B; break;
case 40: model.type = hparams.n_head() == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break; case 40: model.type = hparams.n_head() == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
case 48: model.type = e_model::MODEL_14B; break;
case 64: model.type = e_model::MODEL_32B; break;
case 80: model.type = e_model::MODEL_70B; break; case 80: model.type = e_model::MODEL_70B; break;
default: model.type = e_model::MODEL_UNKNOWN; default: model.type = e_model::MODEL_UNKNOWN;
} }
@ -6695,7 +6703,7 @@ static void llm_load_vocab(
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R; vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
vocab.tokenizer_clean_spaces = false; vocab.tokenizer_clean_spaces = false;
} else if ( } else if (
tokenizer_pre == "qwen2") { tokenizer_pre == "qwen2" || tokenizer_pre == "deepseek-r1-qwen") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2; vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
vocab.tokenizer_clean_spaces = false; vocab.tokenizer_clean_spaces = false;
} else if ( } else if (
@ -7383,6 +7391,86 @@ static void llm_load_llama_tensors(
} }
} }
// Creates the Qwen2 tensors that this rank is responsible for.
//
// Rank 0 additionally creates the token embedding, the output norm and the
// output projection. Every rank then walks all model layers, skips the ones
// for which this_layer_is_mine() returns false, and stores the remaining
// layers in model.layers at the index returned by map_layer_to_local_id().
//
// ml              - loader used to create every tensor
// model           - receives the created tensors; its arch, hparams and
//                   buffer types are read
// ctx_map         - ggml context per buffer type; looked up with at(), so a
//                   missing buffer type throws std::out_of_range
// n_world         - forwarded to the layer-ownership helpers
// my_rank         - rank of this device; rank 0 owns the input/output tensors
// n_layer_window  - forwarded to the layer-ownership helpers
// use_mmap_buffer - not used by this architecture
// set_needed      - forwarded unchanged to every create_tensor() call
static void llm_load_qwen2_tensors(
        llama_model_loader & ml,
        llama_model & model,
        std::map<ggml_backend_buffer_type_t, ggml_context *> & ctx_map,
        uint32_t n_world,
        uint32_t my_rank,
        const uint32_t * n_layer_window,
        bool * use_mmap_buffer,
        bool set_needed) {
    // Qwen2 never touches this flag; mark it used to avoid an unused-parameter warning.
    (void) use_mmap_buffer;

    const auto tn = LLM_TN(model.arch);

    // input/output contexts are only resolved on rank 0, the only rank that uses them
    ggml_context * ctx_input        = nullptr;
    ggml_context * ctx_output       = nullptr;
    ggml_context * ctx_output_split = nullptr;

    if (my_rank == 0) {
        ctx_input        = ctx_map.at(model.buft_input.buft);
        ctx_output       = ctx_map.at(model.buft_output.buft);
        ctx_output_split = ctx_map.at(model.buft_output.buft_matrix);
    }

    // both lambdas are indexed by the LOCAL layer id
    auto ctx_for_layer       = [&](int i) { return ctx_map.at(model.buft_layer[i].buft); };
    auto ctx_for_layer_split = [&](int i) { return ctx_map.at(model.buft_layer[i].buft_matrix); };

    // bind by reference: the hyper-parameters are only read here, no need to copy the struct
    const auto & hparams = model.hparams;

    const int64_t n_embd     = hparams.n_embd;
    const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); // used for both the K and V projections
    const int64_t n_ff       = hparams.n_ff();
    const int64_t n_vocab    = hparams.n_vocab;
    const int64_t n_layer    = hparams.n_layer;

    if (my_rank == 0) {
        // token embedding
        model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0, set_needed);

        // output
        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0, set_needed);
        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED, set_needed);

        // if the optional output tensor was not created, init it from the input token embedding
        if (model.output == nullptr) {
            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED, set_needed);
        }
    }

    for (int i = 0; i < n_layer; ++i) {
        if (!this_layer_is_mine(i, n_world, my_rank, n_layer_window)) {
            continue;
        }

        // tensor names use the global layer index i, storage uses the local index
        const int local_i = map_layer_to_local_id(i, n_world, my_rank, n_layer_window);

        ggml_context * ctx_layer = ctx_for_layer(local_i);
        ggml_context * ctx_split = ctx_for_layer_split(local_i);

        auto & layer = model.layers[local_i];

        // attention norm
        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0, set_needed);

        // attention
        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd},     0, set_needed);
        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0, set_needed);
        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0, set_needed);
        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},     0, set_needed);

        // attention bias tensors (created with flags 0, i.e. without TENSOR_NOT_REQUIRED)
        layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd},     0, set_needed);
        layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0, set_needed);
        layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0, set_needed);

        // feed-forward
        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd},       0, set_needed);
        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0, set_needed);
        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0, set_needed);
        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0, set_needed);
    }
}
// Returns false if cancelled by progress_callback // Returns false if cancelled by progress_callback
static bool llm_load_tensors_impl( static bool llm_load_tensors_impl(
llama_model_loader & ml, llama_model_loader & ml,
@ -8049,44 +8137,8 @@ static bool llm_load_tensors_impl(
} }
} break; } break;
case LLM_ARCH_QWEN2: case LLM_ARCH_QWEN2:
{ llm_load_qwen2_tensors(ml, model, ctx_map, n_world, my_rank, n_layer_window, &use_mmap_buffer, true);
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); break;
// output
{
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
// if output is NULL, init from the input tok embed
if (model.output == NULL) {
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
}
}
for (int i = 0; i < n_layer; ++i) {
ggml_context * ctx_layer = ctx_for_layer(i);
ggml_context * ctx_split = ctx_for_layer_split(i);
auto & layer = model.layers[i];
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
// optional bias tensors
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
}
} break;
case LLM_ARCH_QWEN2MOE: case LLM_ARCH_QWEN2MOE:
{ {
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@ -12591,18 +12643,40 @@ struct llm_build_context {
return gf; return gf;
} }
struct ggml_cgraph * build_qwen2() { std::vector<ggml_cgraph *> build_qwen2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot); GGML_ASSERT(n_embd_head == hparams.n_rot);
struct ggml_tensor * cur; // create a vector to hold the subgraphs
struct ggml_tensor * inpL; std::vector<struct ggml_cgraph *> sub_gfs;
struct ggml_cgraph * sub_gf = nullptr;
struct ggml_tensor * cur = nullptr;
struct ggml_tensor * inpL = nullptr;
struct ggml_tensor * inpB = nullptr;
const uint32_t n_world = this->cparams.n_world;
const uint32_t my_rank = this->cparams.rank;
const uint32_t * n_layer_window = this->cparams.n_layer_window;
if (my_rank == 0) {
sub_gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// inp_embd - contains the input embedding
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
// build the input layer as a separate subgraph
ggml_build_forward_expand(sub_gf, inpL);
sub_gfs.push_back(sub_gf);
sub_gf = nullptr;
inpL = nullptr;
}
// inpB - contains the output embedding from other nodes
inpB = llm_build_backend_embd(ctx0, lctx, hparams, batch, cb);
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
@ -12610,30 +12684,54 @@ struct llm_build_context {
struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL; if (!this_layer_is_mine(il, n_world, my_rank, n_layer_window)) {
// if we have an active sub-graph, add it to the list
if (sub_gf != nullptr && inpL != nullptr) {
ggml_build_forward_expand(sub_gf, cur);
sub_gfs.push_back(sub_gf);
sub_gf = nullptr;
}
// synchronous input tensor
if (inpL != inpB) {
inpL = inpB;
}
continue;
}
if (inpL == nullptr) {
inpL = inpB;
}
// start a new sub-graph
if (sub_gf == nullptr) {
sub_gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
}
struct ggml_tensor * inpSA = inpL; // use for shortcut
int local_il = map_layer_to_local_id(il, n_world, my_rank, n_layer_window);
// norm // norm
cur = llm_build_norm(ctx0, inpL, hparams, cur = llm_build_norm(ctx0, inpL, hparams,
model.layers[il].attn_norm, NULL, model.layers[local_il].attn_norm, NULL,
LLM_NORM_RMS, cb, il); LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il); cb(cur, "attn_norm", il);
// self-attention // self-attention
{ {
// compute Q and K and RoPE them // compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[local_il].wq, cur);
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); Qcur = ggml_add(ctx0, Qcur, model.layers[local_il].bq);
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[local_il].wk, cur);
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); Kcur = ggml_add(ctx0, Kcur, model.layers[local_il].bk);
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[local_il].wv, cur);
cb(Vcur, "Vcur", il); cb(Vcur, "Vcur", il);
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); Vcur = ggml_add(ctx0, Vcur, model.layers[local_il].bv);
cb(Vcur, "Vcur", il); cb(Vcur, "Vcur", il);
Qcur = ggml_rope_ext( Qcur = ggml_rope_ext(
@ -12650,8 +12748,8 @@ struct llm_build_context {
); );
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, lctx, kv_self, gf, cur = llm_build_kv(ctx0, lctx, kv_self, sub_gf,
model.layers[il].wo, model.layers[il].bo, model.layers[local_il].wo, model.layers[local_il].bo,
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
} }
@ -12667,19 +12765,19 @@ struct llm_build_context {
// feed-forward network // feed-forward network
cur = llm_build_norm(ctx0, ffn_inp, hparams, cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm, NULL, model.layers[local_il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il); LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il); cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, lctx, cur, cur = llm_build_ffn(ctx0, lctx, cur,
model.layers[il].ffn_up, NULL, NULL, model.layers[local_il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL, model.layers[local_il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL, model.layers[local_il].ffn_down, NULL, NULL,
NULL, NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il); LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
cur = ggml_add(ctx0, cur, ffn_inp); cur = ggml_add(ctx0, cur, ffn_inp); // shortcut
cur = lctx.cvec.apply_to(ctx0, cur, il); cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il); cb(cur, "l_out", il);
@ -12687,7 +12785,19 @@ struct llm_build_context {
inpL = cur; inpL = cur;
} }
cur = inpL; // add the last active sub-graph to the list
if (sub_gf != nullptr) {
ggml_build_forward_expand(sub_gf, cur);
sub_gfs.push_back(sub_gf);
sub_gf = nullptr;
}
// output norm and lm_head
if (my_rank == 0) {
// start a new sub-graph for the output
sub_gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
cur = llm_build_out_embd(ctx0, lctx, hparams, cb);
cur = llm_build_norm(ctx0, cur, hparams, cur = llm_build_norm(ctx0, cur, hparams,
model.output_norm, NULL, model.output_norm, NULL,
@ -12698,9 +12808,12 @@ struct llm_build_context {
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
cb(cur, "result_output", -1); cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur); ggml_build_forward_expand(sub_gf, cur);
sub_gfs.push_back(sub_gf);
sub_gf = nullptr;
}
return gf; return sub_gfs;
} }
struct ggml_cgraph * build_qwen2moe() { struct ggml_cgraph * build_qwen2moe() {
@ -16788,7 +16901,7 @@ static std::vector<struct ggml_cgraph *> llama_build_graph(
llm.init(); llm.init();
GGML_ASSERT(model.arch == LLM_ARCH_LLAMA && "this model is currently not supported"); GGML_ASSERT((model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_QWEN2) && "this model is currently not supported");
switch (model.arch) { switch (model.arch) {
case LLM_ARCH_LLAMA: case LLM_ARCH_LLAMA:
@ -16841,7 +16954,8 @@ static std::vector<struct ggml_cgraph *> llama_build_graph(
} break; } break;
case LLM_ARCH_QWEN2: case LLM_ARCH_QWEN2:
{ {
result.push_back(llm.build_qwen2()); // result.push_back(llm.build_qwen2());
result = llm.build_qwen2();
} break; } break;
case LLM_ARCH_QWEN2MOE: case LLM_ARCH_QWEN2MOE:
{ {
@ -21248,8 +21362,12 @@ void llama_model_n_flops(
case LLM_ARCH_MINICPM: case LLM_ARCH_MINICPM:
case LLM_ARCH_GRANITE: case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE: case LLM_ARCH_GRANITE_MOE:
llm_load_llama_tensors(*ml, *model, ctx_map, 1, 0, n_layer_window, &use_mmap_buffer, false); llm_load_llama_tensors(*ml, *model, ctx_map, 1, 0, n_layer_window, &use_mmap_buffer, false);
break; break;
case LLM_ARCH_QWEN2:
llm_load_qwen2_tensors(*ml, *model, ctx_map, 1, 0, n_layer_window, &use_mmap_buffer, false);
break;
default: default:
throw std::runtime_error("unsupported architecture\n"); throw std::runtime_error("unsupported architecture\n");
} }