Mirror of https://github.com/Lizonghang/prima.cpp.git, synced 2025-09-06 18:09:50 +00:00
add llama_model_n_flops
commit 477ecf2084 (parent 10f6f92c7e)
4 changed files with 445 additions and 107 deletions
@@ -841,6 +841,8 @@ static void llama_assign_n_layer_window(
         return;
     }
 
+    (void)my_rank;
+
     std::fill_n(n_layer_window, n_world, DEFAULT_N_LAYER_WINDOW);
 }
 
@@ -894,7 +896,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 
     device_info dev_info;
     dev_info.rank = params.rank;
-    llama_profile_device(&dev_info, model, params.model.c_str(), params.cpuparams.n_threads);
+    llama_profile_device(&dev_info, model, ml, params.model.c_str(), params.cpuparams.n_threads);
 
     // create llama context
     struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
@@ -67,6 +67,21 @@ struct device_info {
         : rank(0), device_name(""), disk_read_bandwidth(0.0f), cpu_props(), memory(), gpu_support(), gpu_props() {}
 };
 
+struct flops_info {
+    // model flops
+    int64_t input_flops;
+    int64_t output_flops;
+    int64_t layer_flops;
+
+    // model params
+    int64_t input_params;
+    int64_t output_params;
+    int64_t layer_params;
+
+    flops_info()
+        : input_flops(0), output_flops(0), layer_flops(0), input_params(0), output_params(0), layer_params(0) {}
+};
+
 enum profiler_backend_type {
     PROFILER_BACKEND_TYPE_CPU   = 0,
     PROFILER_BACKEND_TYPE_METAL = 1,
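For reference, the three *_flops / *_params pairs describe the input embedding, a single transformer layer, and the output head separately. A minimal sketch of how a caller could combine them into whole-model totals, assuming the per-layer counts are uniform across all layers (the helper functions below are illustrative and are not part of this commit):

    // Illustrative only: aggregate flops_info into whole-model totals.
    // layer_flops/layer_params are measured for one transformer layer,
    // so they are scaled by the number of layers.
    static int64_t flops_info_total_flops(const struct flops_info & ffo, int64_t n_layer) {
        return ffo.input_flops + ffo.output_flops + n_layer * ffo.layer_flops;
    }

    static int64_t flops_info_total_params(const struct flops_info & ffo, int64_t n_layer) {
        return ffo.input_params + ffo.output_params + n_layer * ffo.layer_params;
    }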
@@ -410,7 +410,13 @@ extern "C" {
     // Call once at the start of the program
     LLAMA_API void llama_backend_init(void);
 
-    LLAMA_API void llama_profile_device (struct device_info * dev_info, struct llama_model * model, const char * test_file, int n_threads);
+    LLAMA_API void llama_profile_device(
+                        struct device_info * dev_info,
+                        struct llama_model * model,
+                        struct llama_model_loader * ml,
+                        const char * test_file,
+                        int n_threads);
 
     LLAMA_API ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device);
 
     //optional:
@@ -518,6 +524,14 @@ extern "C" {
     // Returns the total number of parameters in the model
     LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
 
+    // Return the total number of float operations in the model
+    LLAMA_API void llama_model_n_flops(
+                        struct llama_model * model,
+                        struct llama_model_loader * ml,
+                        struct flops_info * ffo,
+                        const int64_t n_input,
+                        const int64_t n_history);
+
     // Get a llama model tensor
     LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
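A minimal usage sketch of the new API, mirroring the call made from llama_profile_device later in this diff; model and ml are assumed to come from the regular model-loading path:

    struct flops_info ffo = flops_info{};
    // count FLOPs and parameters for one new token (n_input = 1)
    // with 10 tokens already in the KV cache (n_history = 10)
    llama_model_n_flops(model, ml, &ffo, 1, 10);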
src/llama.cpp (517 changes)
@@ -2880,21 +2880,21 @@ struct llama_model {
     llama_vocab vocab;
 
     // TODO: should init all tensors to nullptr
-    struct ggml_tensor * tok_embd;
-    struct ggml_tensor * type_embd;
-    struct ggml_tensor * pos_embd;
-    struct ggml_tensor * tok_norm;
-    struct ggml_tensor * tok_norm_b;
+    struct ggml_tensor * tok_embd = nullptr;
+    struct ggml_tensor * type_embd = nullptr;
+    struct ggml_tensor * pos_embd = nullptr;
+    struct ggml_tensor * tok_norm = nullptr;
+    struct ggml_tensor * tok_norm_b = nullptr;
 
-    struct ggml_tensor * output_norm;
-    struct ggml_tensor * output_norm_b;
-    struct ggml_tensor * output;
-    struct ggml_tensor * output_b;
-    struct ggml_tensor * output_norm_enc;
+    struct ggml_tensor * output_norm = nullptr;
+    struct ggml_tensor * output_norm_b = nullptr;
+    struct ggml_tensor * output = nullptr;
+    struct ggml_tensor * output_b = nullptr;
+    struct ggml_tensor * output_norm_enc = nullptr;
 
     // classifier
-    struct ggml_tensor * cls;
-    struct ggml_tensor * cls_b;
+    struct ggml_tensor * cls = nullptr;
+    struct ggml_tensor * cls_b = nullptr;
     struct ggml_tensor * cls_out   = nullptr;
     struct ggml_tensor * cls_out_b = nullptr;
 
@@ -3546,7 +3546,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
     GGML_UNUSED(model);
 }
 
-void llama_profile_device(device_info * dev_info, struct llama_model * model, const char * test_file, int n_threads) {
+void llama_profile_device(device_info * dev_info, struct llama_model * model, llama_model_loader * ml, const char * test_file, int n_threads) {
     dev_info->device_name = device_name();
     dev_info->cpu_props.cores = device_cpu_cores();
     dev_info->cpu_props.flops_f32 = device_cpu_flops(model, GGML_TYPE_F32, n_threads);
@@ -3568,7 +3568,6 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, co
     dev_info->gpu_support.blas = device_has_blas();
     dev_info->gpu_support.sycl = device_has_sycl();
 
-
     ggml_backend_dev_props cpu_props;
     ggml_backend_dev_props gpu_props;
     device_get_props(model, -1, &cpu_props); // -1 for cpu
@@ -3582,10 +3581,21 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, co
     dev_info->gpu_props.memory_free = round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100;
     dev_info->gpu_props.memory_total = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100;
     dev_info->gpu_props.metal_flops = device_metal_flops(model, GGML_TYPE_F32);
     dev_info->gpu_props.cuda_flops_f32 = device_cuda_flops(model, GGML_TYPE_F32);
     dev_info->gpu_props.cuda_flops_f16 = device_cuda_flops(model, GGML_TYPE_F16);
     dev_info->gpu_props.cuda_flops_q8 = device_cuda_flops(model, GGML_TYPE_Q8_0);
     dev_info->gpu_props.cuda_flops_q4k = device_cuda_flops(model, GGML_TYPE_Q4_K);
+
+    if (dev_info->rank == 0) {
+        struct flops_info ffo = flops_info{};
+        llama_model_n_flops(model, ml, &ffo, 1, 10);
+        LLAMA_LOG_INFO("input_flops: %llu\n", ffo.input_flops);
+        LLAMA_LOG_INFO("output_flops: %llu\n", ffo.output_flops);
+        LLAMA_LOG_INFO("layer_flops: %llu\n", ffo.layer_flops);
+        LLAMA_LOG_INFO("input_params: %llu\n", ffo.input_params);
+        LLAMA_LOG_INFO("output_params: %llu\n", ffo.output_params);
+        LLAMA_LOG_INFO("layer_params: %llu\n", ffo.layer_params);
+    }
 }
 
 ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device) {
@@ -7141,6 +7151,124 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     }
 }
 
+static void llm_load_llama_tensors(
+        llama_model_loader & ml,
+        llama_model & model,
+        std::map<ggml_backend_buffer_type_t, ggml_context *> & ctx_map,
+        uint32_t n_world,
+        uint32_t my_rank,
+        const uint32_t * n_layer_window,
+        bool * use_mmap_buffer) {
+    const auto tn = LLM_TN(model.arch);
+
+    ggml_context * ctx_input = nullptr;
+    ggml_context * ctx_output = nullptr;
+    ggml_context * ctx_output_split = nullptr;
+
+    if (my_rank == 0) {
+        ctx_input = ctx_map.at(model.buft_input.buft);
+        ctx_output = ctx_map.at(model.buft_output.buft);
+        ctx_output_split = ctx_map.at(model.buft_output.buft_matrix);
+    }
+
+    auto ctx_for_layer = [&](int i) { return ctx_map.at(model.buft_layer[i].buft); };
+    auto ctx_for_layer_split = [&](int i) { return ctx_map.at(model.buft_layer[i].buft_matrix); };
+
+    const llama_hparams hparams = model.hparams;
+    const int64_t n_head = hparams.n_head();
+    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+    const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+    const int64_t n_embd_head_k = hparams.n_embd_head_k;
+    const int64_t n_ff = hparams.n_ff();
+    const int64_t n_embd_gqa = n_embd_v_gqa;
+    const int64_t n_vocab = hparams.n_vocab;
+    const int64_t n_rot = hparams.n_rot;
+    const int64_t n_expert = hparams.n_expert;
+    const int64_t n_layer = hparams.n_layer;
+
+    if (my_rank == 0) {
+        // token embedding
+        model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+        // output
+        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+        // if output is NULL, init from the input tok embed
+        if (model.output == NULL) {
+            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+        }
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        if (!this_layer_is_mine(i, n_world, my_rank, n_layer_window)) {
+            continue;
+        }
+
+        int local_i = map_layer_to_local_id(i, n_world, my_rank, n_layer_window);
+        ggml_context * ctx_layer = ctx_for_layer(local_i);
+        ggml_context * ctx_split = ctx_for_layer_split(local_i);
+
+        auto & layer = model.layers[local_i];
+
+        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+
+        // optional bias tensors
+        layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+        layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+        layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+        layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+        layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+
+        if (n_expert == 0) {
+            layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+            layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+            layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+
+            // optional MLP bias
+            layer.ffn_gate_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+            layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+            layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+        } else {
+            layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+            layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
+            if (layer.ffn_gate_exps) {
+                layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert});
+                layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
+            } else {
+                // merge split expert into a single tensor for compatibility with older models
+                // requires disabling mmap
+                *use_mmap_buffer = false;
+
+                ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
+                ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
+                ggml_type type_up = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
+
+                layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
+                layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
+                layer.ffn_up_exps = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert);
+
+                ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
+                ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
+                ggml_set_name(layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str());
+
+                for (uint32_t x = 0; x < n_expert; ++x) {
+                    // the individual experts are loaded into a view of the merged tensor
+                    ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, layer.ffn_gate_exps->nb[2] * x);
+                    ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), {n_ff, n_embd}, layer.ffn_down_exps->nb[2] * x);
+                    ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, layer.ffn_up_exps->nb[2] * x);
+                }
+            }
+        }
+    }
+}
+
 // Returns false if cancelled by progress_callback
 static bool llm_load_tensors_impl(
         llama_model_loader & ml,
@@ -7195,7 +7323,7 @@ static bool llm_load_tensors_impl(
 
     // assign the input and output layers on CPU by default
     if (my_rank == 0) {
         model.buft_input = llama_default_buffer_type_cpu(model, true);
         model.buft_output = llama_default_buffer_type_cpu(model, true);
         LLAMA_LOG_INFO("Layer input assigned to cpu\n");
         LLAMA_LOG_INFO("Layer output assigned to cpu\n");
@@ -7280,91 +7408,8 @@ static bool llm_load_tensors_impl(
             case LLM_ARCH_MINICPM:
             case LLM_ARCH_GRANITE:
             case LLM_ARCH_GRANITE_MOE:
-                {
-                    if (my_rank == 0) {
-                        model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-
-                        // output
-                        {
-                            model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-                            // if output is NULL, init from the input tok embed
-                            if (model.output == NULL) {
-                                model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-                            }
-                        }
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        if (!this_layer_is_mine(i, n_world, my_rank, n_layer_window)) {
-                            continue;
-                        }
-
-                        int local_i = map_layer_to_local_id(i, n_world, my_rank, n_layer_window);
-                        ggml_context * ctx_layer = ctx_for_layer(local_i);
-                        ggml_context * ctx_split = ctx_for_layer_split(local_i);
-
-                        auto & layer = model.layers[local_i];
-
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
-
-                        // optional bias tensors
-                        layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-
-                        if (n_expert == 0) {
-                            layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
-                            layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                            layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
-
-                            // optional MLP bias
-                            layer.ffn_gate_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                            layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                            layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        } else {
-                            layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
-                            layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                            if (layer.ffn_gate_exps) {
-                                layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert});
-                                layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
-                            } else {
-                                // merge split expert into a single tensor for compatibility with older models
-                                // requires disabling mmap
-                                use_mmap_buffer = false;
-
-                                ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
-                                ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
-                                ggml_type type_up = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
-
-                                layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
-                                layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
-                                layer.ffn_up_exps = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert);
-
-                                ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
-                                ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
-                                ggml_set_name(layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str());
-
-                                for (uint32_t x = 0; x < n_expert; ++x) {
-                                    // the individual experts are loaded into a view of the merged tensor
-                                    ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, layer.ffn_gate_exps->nb[2] * x);
-                                    ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), {n_ff, n_embd}, layer.ffn_down_exps->nb[2] * x);
-                                    ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, layer.ffn_up_exps->nb[2] * x);
-                                }
-                            }
-                        }
-                    }
-                } break;
+                llm_load_llama_tensors(ml, model, ctx_map, n_world, my_rank, n_layer_window, &use_mmap_buffer);
+                break;
             case LLM_ARCH_MINICPM3:
                 {
                     const int64_t n_embd_head_qk_rope = hparams.n_rot;
@@ -9285,7 +9330,7 @@ static struct ggml_tensor * llm_build_inp_embd(
 
         inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
     } else {
         lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
         inpL = lctx.inp_embd;
         ggml_set_input(lctx.inp_embd);
     }
@@ -19841,6 +19886,8 @@ int llama_send_device_info(struct llama_context * ctx, struct device_info * dev_
         LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
         return -1;
     }
+
+    return 0;
 }
 
 int llama_broadcast_n_layer_window(struct llama_context * ctx, uint32_t * n_layer_window) {
@@ -20586,6 +20633,266 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
     return nparams;
 }
 
+static void llama_model_reset_tensors(struct llama_model * model) {
+    model->buft_input.buft = nullptr;
+    model->buft_input.buft_matrix = nullptr;
+    model->buft_output.buft = nullptr;
+    model->buft_output.buft_matrix = nullptr;
+    for (int i = 0; i < (int)model->hparams.n_layer; ++i) {
+        model->buft_layer[i].buft = nullptr;
+        model->buft_layer[i].buft_matrix = nullptr;
+    }
+
+    // layers
+    model->buft_layer.resize(0);
+    model->layers.resize(0);
+
+    // input
+    model->tok_embd = nullptr;
+    model->type_embd = nullptr;
+    model->pos_embd = nullptr;
+    model->tok_norm = nullptr;
+    model->tok_norm_b = nullptr;
+
+    // output
+    model->output_norm = nullptr;
+    model->output_norm_b = nullptr;
+    model->output = nullptr;
+    model->output_b = nullptr;
+    model->output_norm_enc = nullptr;
+
+    // classifier
+    model->cls = nullptr;
+    model->cls_b = nullptr;
+    model->cls_out = nullptr;
+    model->cls_out_b = nullptr;
+}
+
+void llama_model_n_flops(struct llama_model * model, struct llama_model_loader * ml, struct flops_info * ffo, const int64_t n_input, const int64_t n_history) {
+    const llama_hparams hparams = model->hparams;
+    const int64_t n_layer = hparams.n_layer;
+    const int64_t n_vocab = hparams.n_vocab;
+    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_head = hparams.n_head();
+    const int64_t n_ff = hparams.n_ff();
+    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+    const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+    const int64_t n_embd_head_k = hparams.n_embd_head_k;
+    const int64_t n_expert = hparams.n_expert;
+
+    // assign all the tensors on CPU by default
+    model->buft_input = llama_default_buffer_type_cpu(*model, true);
+    model->buft_output = llama_default_buffer_type_cpu(*model, true);
+    model->buft_layer.resize(n_layer);
+    for (int i = 0; i < (int)n_layer; ++i) {
+        model->buft_layer[i] = llama_default_buffer_type_cpu(*model, true);
+    }
+
+    // count used buffer types
+    std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
+    buft_layer_count[model->buft_input.buft]++;
+    buft_layer_count[model->buft_input.buft_matrix]++;
+    buft_layer_count[model->buft_output.buft]++;
+    buft_layer_count[model->buft_output.buft_matrix]++;
+    for (int i = 0; i < (int)n_layer; ++i) {
+        buft_layer_count[model->buft_layer[i].buft]++;
+        buft_layer_count[model->buft_layer[i].buft_matrix]++;
+    }
+
+    // create one context per buffer type
+    size_t ctx_size = ggml_tensor_overhead() * (ml->n_tensors + 1);
+
+    // for moe merged tensors
+    ctx_size += ggml_tensor_overhead() * n_layer * 3;
+
+    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+    std::vector<struct ggml_context *> ctxs;
+    for (auto & it : buft_layer_count) {
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ ctx_size,
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ true,
+        };
+        ggml_context * ctx = ggml_init(params);
+        if (!ctx) {
+            throw std::runtime_error(format("failed to create context\n"));
+        }
+        ctx_map[it.first] = ctx;
+        ctxs.push_back(ctx);
+    }
+
+    const uint32_t n_layer_window[32] = {(uint32_t)n_layer};
+    bool use_mmap_buffer = false;
+
+    model->layers.resize(n_layer);
+
+    switch (model->arch) {
+        case LLM_ARCH_LLAMA:
+        case LLM_ARCH_REFACT:
+        case LLM_ARCH_MINICPM:
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
+            llm_load_llama_tensors(*ml, *model, ctx_map, 1, 0, n_layer_window, &use_mmap_buffer);
+            break;
+        default:
+            throw std::runtime_error("unsupported architecture\n");
+    }
+
+    std::unordered_map<std::string, int> tensor_name_map = {
+        {"token_embd.weight", 1},
+        {"output_norm.weight", 2},
+        {"output.weight", 3},
+        {"blk.0.attn_norm.weight", 4},
+        {"blk.0.attn_q.weight", 5},
+        {"blk.0.attn_k.weight", 6},
+        {"blk.0.attn_v.weight", 7},
+        {"blk.0.attn_output.weight", 8},
+        {"blk.0.ffn_gate.weight", 9},
+        {"blk.0.ffn_down.weight", 10},
+        {"blk.0.ffn_up.weight", 11},
+        {"blk.0.ffn_norm.weight", 12},
+        {"rope_freqs.weight", 13},
+        // optional: bias tensors
+        {"blk.0.attn_q.bias", 14},
+        {"blk.0.attn_k.bias", 15},
+        {"blk.0.attn_v.bias", 16},
+        {"blk.0.attn_output.bias", 17},
+        {"blk.0.ffn_gate.bias", 18},
+        {"blk.0.ffn_down.bias", 19},
+        {"blk.0.ffn_up.bias", 20},
+        // optional: expert tensors
+        {"blk.0.ffn_gate_inp.weight", 21},
+        {"blk.0.ffn_gate_exps.weight", 22},
+        {"blk.0.ffn_down_exps.weight", 23},
+        {"blk.0.ffn_up_exps.weight", 24},
+    };
+
+    for (ggml_context * ctx : ctxs) {
+        for (auto * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
+            auto it = tensor_name_map.find(ggml_get_name(cur));
+            if (it != tensor_name_map.end()) {
+                switch (it->second) {
+                    case 1: { // "token_embd.weight"
+                        ffo->input_flops += (2 * n_input * n_embd * n_vocab - n_input * n_embd);
+                        ffo->input_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    case 2: { // "output_norm.weight"
+                        ffo->output_flops += n_input * (8 * n_embd + 1);
+                        ffo->output_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    case 3: { // "output.weight"
+                        ffo->output_flops += 2 * n_input * n_embd * n_vocab;
+                        ffo->output_flops += 5 * n_input * n_vocab;
+                        ffo->output_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    case 4:  // "blk.0.attn_norm.weight"
+                    case 12: // "blk.0.ffn_norm.weight"
+                    {
+                        ffo->layer_flops += n_input * (8 * n_embd + 1);
+                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    case 5: { // "blk.0.attn_q.weight"
+                        ffo->layer_flops += 2 * n_input * n_embd * (n_head * n_embd_head_k);
+                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    case 6: { // "blk.0.attn_k.weight"
+                        ffo->layer_flops += 2 * n_input * n_embd * (n_head * n_embd_k_gqa);
+                        ffo->layer_flops += 2 * n_input * (n_input + n_history) * n_embd_head_k * n_head; // Q*K with KVCache
+                        ffo->layer_flops += 7 * n_input * (n_input + n_history) * n_head; // scale, mask, and softmax
+                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    case 7: { // "blk.0.attn_v.weight"
+                        ffo->layer_flops += 2 * n_input * n_embd * (n_head * n_embd_v_gqa);
+                        ffo->layer_flops += n_input * (n_input + n_history) * n_embd_head_k * n_head; // QKV with KVCache
+                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    case 8: { // "blk.0.attn_output.weight"
+                        ffo->layer_flops += 2 * n_input * (n_head * n_embd_head_k) * n_embd;
+                        ffo->layer_flops += n_input * n_embd; // shortcut
+                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    case 9: { // "blk.0.ffn_gate.weight"
+                        ffo->layer_flops += 2 * n_input * n_embd * n_ff;
+                        ffo->layer_flops += 5 * n_input * n_ff; // SiLU
+                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    case 10: { // "blk.0.ffn_down.weight"
+                        ffo->layer_flops += 2 * n_input * n_embd * n_ff;
+                        ffo->layer_flops += n_input * n_embd; // shortcut
+                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    case 11: { // "blk.0.ffn_up.weight"
+                        ffo->layer_flops += 2 * n_input * n_embd * n_ff;
+                        ffo->layer_flops += n_input * n_ff; // silu(gate(x)) * up(x)
+                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    case 13: { // rope_freqs.weight, for Q and K
+                        ffo->layer_flops += 8 * n_input * n_head * n_embd_head_k;
+                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    // optional: bias tensors
+                    case 14: // "blk.0.attn_q.bias"
+                    case 15: // "blk.0.attn_k.bias"
+                    case 16: // "blk.0.attn_v.bias"
+                    case 17: // "blk.0.attn_output.bias"
+                    case 19: // "blk.0.ffn_down.bias"
+                    {
+                        ffo->layer_flops += n_input * n_embd;
+                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    case 18: // "blk.0.ffn_gate.bias"
+                    case 20: // "blk.0.ffn_up.bias"
+                    {
+                        ffo->layer_flops += n_input * n_ff;
+                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    // optional: expert tensors
+                    case 21: { // "blk.0.ffn_gate_inp.weight"
+                        ffo->layer_flops += 2 * n_input * n_embd * n_expert;
+                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    case 22: // "blk.0.ffn_gate_exps.weight"
+                    case 23: // "blk.0.ffn_down_exps.weight"
+                    case 24: // "blk.0.ffn_up_exps.weight"
+                    {
+                        ffo->layer_flops += 2 * n_input * n_embd * n_ff * n_expert;
+                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    default:
+                        LLAMA_LOG_INFO("Uncaught tensor\n");
+                        return;
+                }
+            }
+        }
+    }
+
+    // reset ml, model, and clear contexts
+    ml->n_created = 0;
+    ml->size_data = 0;
+    llama_model_reset_tensors(model);
+    for (ggml_context * ctx : ctxs) {
+        ggml_free(ctx);
+    }
+    ctxs.clear();
+    ctx_map.clear();
+}
+
 struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
     auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
         [name](const std::pair<std::string, struct ggml_tensor *> & it) {
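The per-tensor counts in the switch above amount to the usual 2*m*n FLOPs per matrix product plus small elementwise terms. A rough closed-form sketch of the per-layer total that those cases accumulate for a dense (non-MoE) block, assembled from the same terms and under the same assumptions (a batch of n_input tokens, n_history tokens already in the KV cache); this helper is illustrative only and not part of the commit:

    // Illustrative only: per-layer FLOPs estimate for a dense LLaMA-style block,
    // summing the same terms as llama_model_n_flops() (expert/bias terms omitted).
    static int64_t llama_layer_flops_estimate(
            int64_t n_input, int64_t n_history,
            int64_t n_embd, int64_t n_head, int64_t n_embd_head_k,
            int64_t n_embd_k_gqa, int64_t n_embd_v_gqa, int64_t n_ff) {
        const int64_t n_kv = n_input + n_history;
        int64_t flops = 0;
        flops += 2 * n_input * (8 * n_embd + 1);                  // attn_norm + ffn_norm
        flops += 2 * n_input * n_embd * (n_head * n_embd_head_k); // Q projection
        flops += 2 * n_input * n_embd * (n_head * n_embd_k_gqa);  // K projection (as counted above)
        flops += 2 * n_input * n_embd * (n_head * n_embd_v_gqa);  // V projection (as counted above)
        flops += 2 * n_input * n_kv * n_embd_head_k * n_head;     // Q*K^T against the KV cache
        flops += 7 * n_input * n_kv * n_head;                     // scale, mask, softmax
        flops += n_input * n_kv * n_embd_head_k * n_head;         // softmax(Q*K^T)*V
        flops += 2 * n_input * (n_head * n_embd_head_k) * n_embd; // attention output projection
        flops += n_input * n_embd;                                // attention shortcut
        flops += 8 * n_input * n_head * n_embd_head_k;            // RoPE on Q and K
        flops += 3 * (2 * n_input * n_embd * n_ff);               // ffn_gate, ffn_up, ffn_down
        flops += 5 * n_input * n_ff;                              // SiLU
        flops += n_input * n_ff;                                  // silu(gate(x)) * up(x)
        flops += n_input * n_embd;                                // FFN shortcut
        return flops;
    }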