mirror of https://github.com/Lizonghang/prima.cpp.git
fix model bytes counter

parent 2d79554694
commit 8e9ab45458

4 changed files with 152 additions and 190 deletions
@@ -1285,33 +1285,17 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c
 // estimate the memory access delay, except for the input embedding because it has been considered in n_flops.inp_embd_ms
 static float device_memory_access_delay(struct device_info & dev_info, struct llama_model * model, const struct llama_context_params cparams, int n_layers) {
-    struct model_params n_params = dev_info.model_params;
+    auto n_bytes = dev_info.model_bytes;
     int n_gpu_layers = std::min(static_cast<int>(cparams.n_gpu_layers), n_layers);

-    int64_t layer_bytes =
-        n_params.layer_f32 * 4 +
-        n_params.layer_f16 * 2 +
-        n_params.layer_q4k * 4 / 8 +
-        n_params.layer_q5k * 5 / 8 +
-        n_params.layer_q6k * 6 / 8 +
-        n_params.layer_q80;
-
-    int64_t output_bytes =
-        n_params.output_f32 * 4 +
-        n_params.output_f16 * 2 +
-        n_params.output_q4k * 4 / 8 +
-        n_params.output_q5k * 5 / 8 +
-        n_params.output_q6k * 6 / 8 +
-        n_params.output_q80;
-
     uint64_t cpu_kv_size;
     uint64_t gpu_kv_size;

 #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
     llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, true);

-    int64_t vram_bytes = layer_bytes * n_gpu_layers + gpu_kv_size;
-    int64_t ram_bytes = layer_bytes * (n_layers - n_gpu_layers) + output_bytes + cpu_kv_size;
+    int64_t vram_bytes = n_bytes.nb_layer * n_gpu_layers + gpu_kv_size;
+    int64_t ram_bytes = n_bytes.nb_layer * (n_layers - n_gpu_layers) + n_bytes.nb_output + cpu_kv_size;

 #ifdef GGML_USE_CUDA
     double vram_access_delay = (double)(vram_bytes) / 1e6 / dev_info.gpu_props.cuda_read_vram_bw;
@@ -1327,53 +1311,33 @@ static float device_memory_access_delay(struct device_info & dev_info, struct ll
     (void)n_gpu_layers;
     (void)gpu_kv_size;
-    int64_t ram_bytes = layer_bytes * n_layers + output_bytes + cpu_kv_size;
+    int64_t ram_bytes = n_bytes.nb_layer * n_layers + n_bytes.nb_output + cpu_kv_size;
     double ram_access_delay = (double)(ram_bytes) / 1e6 / dev_info.memory.cpu_read_ram_bw;
     return static_cast<float>(ram_access_delay); // ms
 #endif
 }

 static float device_disk_access_delay(struct device_info & dev_info, struct llama_model * model, const struct llama_context_params cparams) {
-    auto n_params = dev_info.model_params;
+    auto n_bytes = dev_info.model_bytes;
     int n_layers = llama_model_n_layers(model);
     int n_gpu_layers = std::min(static_cast<int>(cparams.n_gpu_layers), n_layers);
     int n_vocab = llama_n_vocab(model);

-    int64_t input_bytes = (
-        n_params.input_f32 * 4 +
-        n_params.input_f16 * 2 +
-        n_params.input_q4k * 4 / 8 +
-        n_params.input_q5k * 5 / 8 +
-        n_params.input_q6k * 6 / 8 +
-        n_params.input_q80) / n_vocab; // lookup table, retrieve only n_embd elements
-
-    int64_t cpu_total_bytes = input_bytes;
-
-    int64_t layer_bytes =
-        n_params.layer_f32 * 4 +
-        n_params.layer_f16 * 2 +
-        n_params.layer_q4k * 4 / 8 +
-        n_params.layer_q5k * 5 / 8 +
-        n_params.layer_q6k * 6 / 8 +
-        n_params.layer_q80;
+    int64_t cpu_total_bytes = 0;
+    int64_t input_bytes = n_bytes.nb_input / n_vocab; // lookup table, retrieve only n_embd elements
+    cpu_total_bytes += input_bytes;

 #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
-    cpu_total_bytes += layer_bytes * (n_layers - n_gpu_layers);
+    cpu_total_bytes += n_bytes.nb_layer * (n_layers - n_gpu_layers);
 #if defined(GGML_USE_METAL)
-    int64_t gpu_total_bytes = layer_bytes * n_gpu_layers;
+    int64_t gpu_total_bytes = n_bytes.nb_layer * n_gpu_layers;
 #endif
 #else
     (void)n_gpu_layers;
-    cpu_total_bytes += layer_bytes * n_layers;
+    cpu_total_bytes += n_bytes.nb_layer * n_layers;
 #endif

-    cpu_total_bytes += (
-        n_params.output_f32 * 4 +
-        n_params.output_f16 * 2 +
-        n_params.output_q4k * 4 / 8 +
-        n_params.output_q5k * 5 / 8 +
-        n_params.output_q6k * 6 / 8 +
-        n_params.output_q80);
+    cpu_total_bytes += n_bytes.nb_output;

     uint64_t cpu_kv_size;
     uint64_t gpu_kv_size;
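Both helpers above convert a byte count into milliseconds by dividing by a bandwidth given in GB/s, since 1 GB/s moves 1e6 bytes per millisecond. A minimal standalone sketch of that arithmetic, not taken from the repository and using invented numbers:

#include <cstdint>
#include <cstdio>

// bytes / 1e6 / (GB/s) = milliseconds, the same conversion used in
// device_memory_access_delay() and device_disk_access_delay() above.
static double access_delay_ms(int64_t bytes, double bw_gb_per_s) {
    return (double)bytes / 1e6 / bw_gb_per_s;
}

int main() {
    int64_t layer_bytes = 218103808;                                      // hypothetical per-layer weight size
    std::printf("%.3f ms\n", access_delay_ms(32 * layer_bytes, 20.0));    // 32 layers read at 20 GB/s
    return 0;
}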
@@ -1850,6 +1814,18 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q80);
     LOG_INF("\n");

+    LOG_INF("| Model bytes (input) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_bytes.nb_input);
+    LOG_INF("\n");
+
+    LOG_INF("| Model bytes (layer) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_bytes.nb_layer);
+    LOG_INF("\n");
+
+    LOG_INF("| Model bytes (output) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_bytes.nb_output);
+    LOG_INF("\n");
+
     // todo: calculate for each device, not only master
     float latency = 0.0f;
     int n_layers = llama_model_n_layers (model);
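The new rows splice the PRId64 format macro between string literals, the same pattern as the existing model-params rows. A tiny self-contained sketch of how one such row renders (illustrative only; LOG_INF is replaced by printf and the value is invented):

#include <cinttypes>
#include <cstdio>

int main() {
    int64_t nb_layer = 218103808;                 // invented value
    // adjacent string literals concatenate around PRId64, as in the LOG_INF calls above
    std::printf("| Model bytes (layer) ");
    std::printf("| %-10" PRId64 " ", nb_layer);
    std::printf("\n");
    return 0;
}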
@@ -181,6 +181,17 @@ struct model_params {
         layer_q80 (0) {}
 };

+struct model_bytes {
+    int64_t nb_input;
+    int64_t nb_layer;
+    int64_t nb_output;
+
+    model_bytes() :
+        nb_input (0),
+        nb_layer (0),
+        nb_output(0) {}
+};
+
 struct disk_props {
     float read_seq_bw; // in GB/s
     float read_rnd_bw; // in GB/s

@@ -204,6 +215,7 @@ struct device_info {
     struct gpu_props gpu_props;
     struct model_flops model_flops;
     struct model_params model_params;
+    struct model_bytes model_bytes;

     device_info() :
         rank(0),

@@ -214,7 +226,8 @@ struct device_info {
         gpu_support(),
         gpu_props(),
         model_flops(),
-        model_params() {}
+        model_params(),
+        model_bytes() {}
 };

 enum profiler_backend_type {
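model_bytes records the weight footprint of the input embedding, of an average transformer layer, and of the output head. One illustrative use of those counters (not code from the repository; the struct below merely mirrors the fields above and the sizes are invented) is a rough whole-model estimate of nb_input + n_layers * nb_layer + nb_output:

#include <cstdint>
#include <cstdio>

struct model_bytes_sketch {     // mirrors the fields of model_bytes above
    int64_t nb_input  = 0;      // token embedding table
    int64_t nb_layer  = 0;      // average bytes of one transformer layer
    int64_t nb_output = 0;      // output norm + output head
};

static int64_t total_weight_bytes(const model_bytes_sketch & b, int64_t n_layers) {
    return b.nb_input + n_layers * b.nb_layer + b.nb_output;
}

int main() {
    model_bytes_sketch b{525336576, 218103808, 525336576};  // invented sizes
    std::printf("~%lld bytes for a 32-layer model\n", (long long)total_weight_bytes(b, 32));
    return 0;
}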
@@ -563,6 +563,7 @@ extern "C" {
             struct llama_model_loader * ml,
             struct model_flops * n_flops,
             struct model_params * n_params,
+            struct model_bytes * n_bytes,
             const int64_t n_history,
             const int64_t n_ctx,
             enum ggml_type * inp_embd_dtype,

202 src/llama.cpp
@@ -3589,9 +3589,10 @@ void llama_profile_device(

     struct model_flops * n_flops = &dev_info->model_flops;
     struct model_params * n_params = &dev_info->model_params;
+    struct model_bytes * n_bytes = &dev_info->model_bytes;
     if (dev_info->rank == 0) {
         enum ggml_type inp_embd_dtype = GGML_TYPE_F32;
-        llama_model_n_flops(model, ml, n_flops, n_params, n_predict, n_ctx, &inp_embd_dtype, flash_attn);
+        llama_model_n_flops(model, ml, n_flops, n_params, n_bytes, n_predict, n_ctx, &inp_embd_dtype, flash_attn);
         n_flops->inp_embd_ms = device_inp_embd_delay(model, inp_embd_dtype, 1, n_threads);
     }
@@ -20881,6 +20882,26 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
     }
 }

+static void count_n_bytes(struct model_bytes * n_bytes, enum profiler_layer_type ltype, size_t n) {
+    int64_t n_i64t = static_cast<int64_t>(n);
+    switch (ltype) {
+        case PROFILER_LAYER_INPUT:
+            n_bytes->nb_input += n_i64t;
+            break;
+
+        case PROFILER_LAYER_OUTPUT:
+            n_bytes->nb_output += n_i64t;
+            break;
+
+        case PROFILER_LAYER_BACKEND:
+            n_bytes->nb_layer += n_i64t;
+            break;
+
+        default:
+            throw std::runtime_error("Unrecognized profiler layer type\n");
+    }
+}
+
 void llama_model_compute_buf_size(
     uint64_t * cpu_buf,
     uint64_t * gpu_buf,
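count_n_bytes simply buckets a tensor's byte size into one of the three counters by layer type. A self-contained usage sketch, with the enum and struct re-declared locally and invented sizes standing in for ggml_nbytes():

#include <cstdint>
#include <cstdio>
#include <stdexcept>

enum profiler_layer_type { PROFILER_LAYER_INPUT, PROFILER_LAYER_OUTPUT, PROFILER_LAYER_BACKEND };
struct model_bytes { int64_t nb_input = 0, nb_layer = 0, nb_output = 0; };

static void count_n_bytes(model_bytes * n_bytes, profiler_layer_type ltype, size_t n) {
    int64_t n_i64t = static_cast<int64_t>(n);
    switch (ltype) {
        case PROFILER_LAYER_INPUT:   n_bytes->nb_input  += n_i64t; break;
        case PROFILER_LAYER_OUTPUT:  n_bytes->nb_output += n_i64t; break;
        case PROFILER_LAYER_BACKEND: n_bytes->nb_layer  += n_i64t; break;
        default: throw std::runtime_error("Unrecognized profiler layer type\n");
    }
}

int main() {
    model_bytes b;
    count_n_bytes(&b, PROFILER_LAYER_INPUT,   262144000); // e.g. token_embd.weight
    count_n_bytes(&b, PROFILER_LAYER_BACKEND, 218103808); // e.g. the tensors of one blk.*
    count_n_bytes(&b, PROFILER_LAYER_OUTPUT,  262144000); // e.g. output.weight
    std::printf("input=%lld layer=%lld output=%lld\n",
                (long long)b.nb_input, (long long)b.nb_layer, (long long)b.nb_output);
    return 0;
}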
@@ -20977,6 +20998,7 @@ void llama_model_n_flops(
         struct llama_model_loader * ml,
         struct model_flops * n_flops,
         struct model_params * n_params,
+        struct model_bytes * n_bytes,
         const int64_t n_history,
         const int64_t n_ctx,
         enum ggml_type * inp_embd_dtype,
@@ -21052,151 +21074,101 @@ void llama_model_n_flops(
throw std::runtime_error("unsupported architecture\n");
}

std::unordered_map<std::string, int> tensor_name_map = {
{"token_embd.weight", 1},
{"output_norm.weight", 2},
{"output.weight", 3},
{"blk.0.attn_norm.weight", 4},
{"blk.0.attn_q.weight", 5},
{"blk.0.attn_k.weight", 6},
{"blk.0.attn_v.weight", 7},
{"blk.0.attn_output.weight", 8},
{"blk.0.ffn_gate.weight", 9},
{"blk.0.ffn_down.weight", 10},
{"blk.0.ffn_up.weight", 11},
{"blk.0.ffn_norm.weight", 12},
{"rope_freqs.weight", 13},
// optional: bias tensors
{"blk.0.attn_q.bias", 14},
{"blk.0.attn_k.bias", 15},
{"blk.0.attn_v.bias", 16},
{"blk.0.attn_output.bias", 17},
{"blk.0.ffn_gate.bias", 18},
{"blk.0.ffn_down.bias", 19},
{"blk.0.ffn_up.bias", 20},
// optional: expert tensors
{"blk.0.ffn_gate_inp.weight", 21},
{"blk.0.ffn_gate_exps.weight", 22},
{"blk.0.ffn_down_exps.weight", 23},
{"blk.0.ffn_up_exps.weight", 24},
};
bool rope_used = false;

for (auto * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
auto it = tensor_name_map.find(ggml_get_name(cur));
if (it != tensor_name_map.end()) {
switch (it->second) {
case 1: { // "token_embd.weight"
count_n_params(n_params, cur->type, PROFILER_LAYER_INPUT, ggml_nelements(cur));
*inp_embd_dtype = cur->type;
break;
}
case 2: { // "output_norm.weight"
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 4 * n_embd + 1); // rms_norm
count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, n_embd); // norm weights
count_n_params(n_params, cur->type, PROFILER_LAYER_OUTPUT, ggml_nelements(cur));
break;
}
case 3: { // "output.weight"
count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, 2 * n_embd * n_vocab);
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 5 * n_vocab); // softmax
count_n_params(n_params, cur->type, PROFILER_LAYER_OUTPUT, ggml_nelements(cur));
break;
}
case 4: // "blk.0.attn_norm.weight"
case 12: // "blk.0.ffn_norm.weight"
{
std::string tensor_name(ggml_get_name(cur));
std::regex blk_regex("blk\\.\\d+\\.(.+)");
std::smatch match;

if (std::regex_match(tensor_name, match, blk_regex) && match.size() > 1) {
std::string blk_suffix = match[1].str();

if (blk_suffix == "attn_norm.weight" || blk_suffix == "ffn_norm.weight") {
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 4 * n_embd + 1); // rms norm
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_embd); // norm weights
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
case 5: { // "blk.0.attn_q.weight"
} else if (blk_suffix == "attn_q.weight") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_embd);
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2.5 * n_embd); // rope
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
case 6: { // "blk.0.attn_k.weight"
} else if (blk_suffix == "attn_k.weight") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_head_kv * n_embd_head_k);
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2.5 * n_embd_head_k * n_head_kv); // rope
count_n_flops (n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, 2 * n_embd_head_k * n_head * n_kv * n_head_kv); // compute kq with kvcache
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 7 * n_head * n_kv); // scale, mask, and softmax
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
case 7: { // "blk.0.attn_v.weight"
} else if (blk_suffix == "attn_v.weight") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_head_kv * n_embd_head_v);
count_n_flops (n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, 2 * n_embd_head_v * n_head * n_kv * n_head_kv); // compute kqv with kvcache
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
case 8: { // "blk.0.attn_output.weight"
} else if (blk_suffix == "attn_output.weight") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_embd);
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_embd); // shortcut
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
case 9: { // "blk.0.ffn_gate.weight"
} else if (blk_suffix == "ffn_gate.weight") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff);
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 8 * n_ff); // SiLU
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
case 10: { // "blk.0.ffn_down.weight"
} else if (blk_suffix == "ffn_down.weight") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff);
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_embd); // shortcut
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
case 11: { // "blk.0.ffn_up.weight"
} else if (blk_suffix == "ffn_up.weight") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff);
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_ff); // silu(gate(x)) * up(x)
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
case 13: { // rope_freqs.weight, has been counted in q and k
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
// optional: bias tensors
case 14: // "blk.0.attn_q.bias"
case 15: // "blk.0.attn_k.bias"
case 16: // "blk.0.attn_v.bias"
case 17: // "blk.0.attn_output.bias"
case 19: // "blk.0.ffn_down.bias"
{
} else if (blk_suffix == "attn_q.bias" || blk_suffix == "attn_k.bias" || blk_suffix == "attn_v.bias" || blk_suffix == "blk.0.attn_output.bias" || blk_suffix == "ffn_down.bias") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_embd);
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
case 18: // "blk.0.ffn_gate.bias"
case 20: // "blk.0.ffn_up.bias"
{
} else if (blk_suffix == "ffn_gate.bias" || blk_suffix == "ffn_up.bias") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_ff);
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
// optional: expert tensors
case 21: { // "blk.0.ffn_gate_inp.weight"
} else if (blk_suffix == "ffn_gate_inp.weight") { // optional: expert tensors
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_expert);
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
case 22: // "blk.0.ffn_gate_exps.weight"
case 23: // "blk.0.ffn_down_exps.weight"
case 24: // "blk.0.ffn_up_exps.weight"
{
} else if (blk_suffix == "ffn_gate_exps.weight" || blk_suffix == "ffn_down_exps.weight" || blk_suffix == "ffn_up_exps.weight") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff * n_expert);
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
} else {
LLAMA_LOG_INFO("Uncaught tensor\n");
return;
}
default:
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
count_n_bytes (n_bytes, PROFILER_LAYER_BACKEND, ggml_nbytes(cur));
} else {
if (tensor_name == "token_embd.weight") {
count_n_params(n_params, cur->type, PROFILER_LAYER_INPUT, ggml_nelements(cur));
count_n_bytes (n_bytes, PROFILER_LAYER_INPUT, ggml_nbytes(cur));
*inp_embd_dtype = cur->type;
} else if (tensor_name == "output_norm.weight") {
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 4 * n_embd + 1);
count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, n_embd);
count_n_params(n_params, cur->type, PROFILER_LAYER_OUTPUT, ggml_nelements(cur));
count_n_bytes (n_bytes, PROFILER_LAYER_OUTPUT, ggml_nbytes(cur));
} else if (tensor_name == "output.weight") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, 2 * n_embd * n_vocab);
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 5 * n_vocab); // softmax
count_n_params(n_params, cur->type, PROFILER_LAYER_OUTPUT, ggml_nelements(cur));
count_n_bytes (n_bytes, PROFILER_LAYER_OUTPUT, ggml_nbytes(cur));
} else if (tensor_name == "rope_freqs.weight") {
if (!rope_used) {
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
count_n_bytes (n_bytes, PROFILER_LAYER_BACKEND, ggml_nbytes(cur));
rope_used = true;
}
} else {
LLAMA_LOG_INFO("Uncaught tensor\n");
return;
}
}
}

// use average values instead of total values
n_flops->layer_f32_f32 = static_cast<int64_t>((double)n_flops->layer_f32_f32 / (double)n_layer);
n_flops->layer_f16_f32 = static_cast<int64_t>((double)n_flops->layer_f16_f32 / (double)n_layer);
n_flops->layer_q4k_f32 = static_cast<int64_t>((double)n_flops->layer_q4k_f32 / (double)n_layer);
n_flops->layer_q5k_f32 = static_cast<int64_t>((double)n_flops->layer_q5k_f32 / (double)n_layer);
n_flops->layer_q6k_f32 = static_cast<int64_t>((double)n_flops->layer_q6k_f32 / (double)n_layer);
n_flops->layer_q80_f32 = static_cast<int64_t>((double)n_flops->layer_q80_f32 / (double)n_layer);

n_params->layer_f32 = static_cast<int64_t>((double)n_params->layer_f32 / (double)n_layer);
n_params->layer_f16 = static_cast<int64_t>((double)n_params->layer_f16 / (double)n_layer);
n_params->layer_q4k = static_cast<int64_t>((double)n_params->layer_q4k / (double)n_layer);
n_params->layer_q5k = static_cast<int64_t>((double)n_params->layer_q5k / (double)n_layer);
n_params->layer_q6k = static_cast<int64_t>((double)n_params->layer_q6k / (double)n_layer);
n_params->layer_q80 = static_cast<int64_t>((double)n_params->layer_q80 / (double)n_layer);

n_bytes->nb_layer = static_cast<int64_t>((double)n_bytes->nb_layer / (double)n_layer);

// reset ml, model, and clear contexts
ml->n_created = 0;
ml->size_data = 0;
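The averaging right above is what lets nb_layer be multiplied back by the number of layers a device actually holds: bytes are accumulated over the blk.* tensors of every layer and then divided by the layer count. An illustrative sketch of that step with invented sizes (not repository code):

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int n_layer = 4;                                  // hypothetical tiny model
    std::vector<int64_t> blk_tensor_bytes = {               // made-up per-tensor sizes, all layers together
        1048576, 4194304, 1048576, 4194304,
        1048576, 4194304, 1048576, 4194304,
    };

    int64_t nb_layer = 0;
    for (int64_t b : blk_tensor_bytes) nb_layer += b;       // total over every blk.* tensor
    nb_layer = static_cast<int64_t>((double)nb_layer / (double)n_layer); // average bytes of one layer

    std::printf("average layer bytes: %lld\n", (long long)nb_layer);
    return 0;
}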