fix model bytes counter

Lizonghang 2024-12-10 14:57:48 +04:00
parent 2d79554694
commit 8e9ab45458
4 changed files with 152 additions and 190 deletions


@@ -1285,33 +1285,17 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c
// estimate the memory access delay, except for the input embedding because it has been considered in n_flops.inp_embd_ms
static float device_memory_access_delay(struct device_info & dev_info, struct llama_model * model, const struct llama_context_params cparams, int n_layers) {
struct model_params n_params = dev_info.model_params;
auto n_bytes = dev_info.model_bytes;
int n_gpu_layers = std::min(static_cast<int>(cparams.n_gpu_layers), n_layers);
int64_t layer_bytes =
n_params.layer_f32 * 4 +
n_params.layer_f16 * 2 +
n_params.layer_q4k * 4 / 8 +
n_params.layer_q5k * 5 / 8 +
n_params.layer_q6k * 6 / 8 +
n_params.layer_q80;
int64_t output_bytes =
n_params.output_f32 * 4 +
n_params.output_f16 * 2 +
n_params.output_q4k * 4 / 8 +
n_params.output_q5k * 5 / 8 +
n_params.output_q6k * 6 / 8 +
n_params.output_q80;
uint64_t cpu_kv_size;
uint64_t gpu_kv_size;
#if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, true);
int64_t vram_bytes = layer_bytes * n_gpu_layers + gpu_kv_size;
int64_t ram_bytes = layer_bytes * (n_layers - n_gpu_layers) + output_bytes + cpu_kv_size;
int64_t vram_bytes = n_bytes.nb_layer * n_gpu_layers + gpu_kv_size;
int64_t ram_bytes = n_bytes.nb_layer * (n_layers - n_gpu_layers) + n_bytes.nb_output + cpu_kv_size;
#ifdef GGML_USE_CUDA
double vram_access_delay = (double)(vram_bytes) / 1e6 / dev_info.gpu_props.cuda_read_vram_bw;
@@ -1327,53 +1311,33 @@ static float device_memory_access_delay(struct device_info & dev_info, struct ll
(void)n_gpu_layers;
(void)gpu_kv_size;
int64_t ram_bytes = layer_bytes * n_layers + output_bytes + cpu_kv_size;
int64_t ram_bytes = n_bytes.nb_layer * n_layers + n_bytes.nb_output + cpu_kv_size;
double ram_access_delay = (double)(ram_bytes) / 1e6 / dev_info.memory.cpu_read_ram_bw;
return static_cast<float>(ram_access_delay); // ms
#endif
}
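The delay formulas above convert bytes to milliseconds by dividing by 1e6 and then by a bandwidth in GB/s: a bandwidth of bw GB/s moves bw * 1e6 bytes per millisecond. A minimal standalone sketch, assuming bandwidths are reported in GB/s (as disk_props declares); the helper name and sample numbers are illustrative, not from the patch:
#include <cstdint>
#include <cstdio>
// Mirrors the conversion used in device_memory_access_delay:
// bytes / 1e6 / (GB/s) == milliseconds.
static double access_delay_ms(int64_t bytes, double bw_gb_per_s) {
    return (double) bytes / 1e6 / bw_gb_per_s;
}
int main() {
    // e.g. reading 4 GiB of weights at 50 GB/s takes roughly 85.9 ms
    printf("%.1f ms\n", access_delay_ms(4LL * 1024 * 1024 * 1024, 50.0));
    return 0;
}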
static float device_disk_access_delay(struct device_info & dev_info, struct llama_model * model, const struct llama_context_params cparams) {
auto n_params = dev_info.model_params;
auto n_bytes = dev_info.model_bytes;
int n_layers = llama_model_n_layers(model);
int n_gpu_layers = std::min(static_cast<int>(cparams.n_gpu_layers), n_layers);
int n_vocab = llama_n_vocab(model);
int64_t input_bytes = (
n_params.input_f32 * 4 +
n_params.input_f16 * 2 +
n_params.input_q4k * 4 / 8 +
n_params.input_q5k * 5 / 8 +
n_params.input_q6k * 6 / 8 +
n_params.input_q80) / n_vocab; // lookup table, retrieve only n_embd elements
int64_t cpu_total_bytes = input_bytes;
int64_t layer_bytes =
n_params.layer_f32 * 4 +
n_params.layer_f16 * 2 +
n_params.layer_q4k * 4 / 8 +
n_params.layer_q5k * 5 / 8 +
n_params.layer_q6k * 6 / 8 +
n_params.layer_q80;
int64_t cpu_total_bytes = 0;
int64_t input_bytes = n_bytes.nb_input / n_vocab; // lookup table, retrieve only n_embd elements
cpu_total_bytes += input_bytes;
#if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
cpu_total_bytes += layer_bytes * (n_layers - n_gpu_layers);
cpu_total_bytes += n_bytes.nb_layer * (n_layers - n_gpu_layers);
#if defined(GGML_USE_METAL)
int64_t gpu_total_bytes = layer_bytes * n_gpu_layers;
int64_t gpu_total_bytes = n_bytes.nb_layer * n_gpu_layers;
#endif
#else
(void)n_gpu_layers;
cpu_total_bytes += layer_bytes * n_layers;
cpu_total_bytes += n_bytes.nb_layer * n_layers;
#endif
cpu_total_bytes += (
n_params.output_f32 * 4 +
n_params.output_f16 * 2 +
n_params.output_q4k * 4 / 8 +
n_params.output_q5k * 5 / 8 +
n_params.output_q6k * 6 / 8 +
n_params.output_q80);
cpu_total_bytes += n_bytes.nb_output;
uint64_t cpu_kv_size;
uint64_t gpu_kv_size;
@@ -1850,6 +1814,18 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q80);
LOG_INF("\n");
LOG_INF("| Model bytes (input) ");
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_bytes.nb_input);
LOG_INF("\n");
LOG_INF("| Model bytes (layer) ");
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_bytes.nb_layer);
LOG_INF("\n");
LOG_INF("| Model bytes (output) ");
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_bytes.nb_output);
LOG_INF("\n");
// todo: calculate for each device, not only master
float latency = 0.0f;
int n_layers = llama_model_n_layers (model);


@@ -181,6 +181,17 @@ struct model_params {
layer_q80 (0) {}
};
struct model_bytes {
int64_t nb_input;
int64_t nb_layer;
int64_t nb_output;
model_bytes() :
nb_input (0),
nb_layer (0),
nb_output(0) {}
};
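A sketch of how these counters compose, not part of the patch: assuming nb_layer has already been normalized to a per-layer average (as llama_model_n_flops does below), the weight bytes held by a device follow from its layer assignment. device_weight_bytes is a hypothetical helper:
// Hypothetical helper: weight bytes for a device hosting n_layers transformer
// blocks plus the input embedding and output tensors.
static int64_t device_weight_bytes(const struct model_bytes & nb, int n_layers) {
    return nb.nb_input + nb.nb_layer * (int64_t) n_layers + nb.nb_output;
}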
struct disk_props {
float read_seq_bw; // in GB/s
float read_rnd_bw; // in GB/s
@@ -204,6 +215,7 @@ struct device_info {
struct gpu_props gpu_props;
struct model_flops model_flops;
struct model_params model_params;
struct model_bytes model_bytes;
device_info() :
rank(0),
@@ -214,7 +226,8 @@ struct device_info {
gpu_support(),
gpu_props(),
model_flops(),
model_params() {}
model_params(),
model_bytes() {}
};
enum profiler_backend_type {


@@ -563,6 +563,7 @@ extern "C" {
struct llama_model_loader * ml,
struct model_flops * n_flops,
struct model_params * n_params,
struct model_bytes * n_bytes,
const int64_t n_history,
const int64_t n_ctx,
enum ggml_type * inp_embd_dtype,


@@ -3589,9 +3589,10 @@ void llama_profile_device(
struct model_flops * n_flops = &dev_info->model_flops;
struct model_params * n_params = &dev_info->model_params;
struct model_bytes * n_bytes = &dev_info->model_bytes;
if (dev_info->rank == 0) {
enum ggml_type inp_embd_dtype = GGML_TYPE_F32;
llama_model_n_flops(model, ml, n_flops, n_params, n_predict, n_ctx, &inp_embd_dtype, flash_attn);
llama_model_n_flops(model, ml, n_flops, n_params, n_bytes, n_predict, n_ctx, &inp_embd_dtype, flash_attn);
n_flops->inp_embd_ms = device_inp_embd_delay(model, inp_embd_dtype, 1, n_threads);
}
@@ -20881,6 +20882,26 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
}
}
static void count_n_bytes(struct model_bytes * n_bytes, enum profiler_layer_type ltype, size_t n) {
int64_t n_i64t = static_cast<int64_t>(n);
switch (ltype) {
case PROFILER_LAYER_INPUT:
n_bytes->nb_input += n_i64t;
break;
case PROFILER_LAYER_OUTPUT:
n_bytes->nb_output += n_i64t;
break;
case PROFILER_LAYER_BACKEND:
n_bytes->nb_layer += n_i64t;
break;
default:
throw std::runtime_error("Unrecognized profiler layer type\n");
}
}
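The reason for tracking bytes directly (a rough illustration, not from the patch): the old counters multiplied parameter counts by nominal bits per weight, which ignores per-block metadata such as the scales stored by the k-quants, whereas ggml_row_size / ggml_nbytes report the exact stored size. The tensor shape below is illustrative:
#include "ggml.h"
#include <cstdio>
int main() {
    const int64_t n_embd = 4096;
    const int64_t ne = n_embd * n_embd;                             // one square weight matrix
    size_t exact = ggml_row_size(GGML_TYPE_Q4_K, n_embd) * n_embd;  // exact, incl. block scales
    size_t rough = (size_t) ne * 4 / 8;                             // old "4 bits per weight" estimate
    printf("exact: %zu bytes, old estimate: %zu bytes\n", exact, rough);
    return 0;
}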
void llama_model_compute_buf_size(
uint64_t * cpu_buf,
uint64_t * gpu_buf,
@@ -20977,6 +20998,7 @@ void llama_model_n_flops(
struct llama_model_loader * ml,
struct model_flops * n_flops,
struct model_params * n_params,
struct model_bytes * n_bytes,
const int64_t n_history,
const int64_t n_ctx,
enum ggml_type * inp_embd_dtype,
@@ -21052,151 +21074,101 @@ void llama_model_n_flops(
throw std::runtime_error("unsupported architecture\n");
}
std::unordered_map<std::string, int> tensor_name_map = {
{"token_embd.weight", 1},
{"output_norm.weight", 2},
{"output.weight", 3},
{"blk.0.attn_norm.weight", 4},
{"blk.0.attn_q.weight", 5},
{"blk.0.attn_k.weight", 6},
{"blk.0.attn_v.weight", 7},
{"blk.0.attn_output.weight", 8},
{"blk.0.ffn_gate.weight", 9},
{"blk.0.ffn_down.weight", 10},
{"blk.0.ffn_up.weight", 11},
{"blk.0.ffn_norm.weight", 12},
{"rope_freqs.weight", 13},
// optional: bias tensors
{"blk.0.attn_q.bias", 14},
{"blk.0.attn_k.bias", 15},
{"blk.0.attn_v.bias", 16},
{"blk.0.attn_output.bias", 17},
{"blk.0.ffn_gate.bias", 18},
{"blk.0.ffn_down.bias", 19},
{"blk.0.ffn_up.bias", 20},
// optional: expert tensors
{"blk.0.ffn_gate_inp.weight", 21},
{"blk.0.ffn_gate_exps.weight", 22},
{"blk.0.ffn_down_exps.weight", 23},
{"blk.0.ffn_up_exps.weight", 24},
};
bool rope_used = false;
for (auto * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
auto it = tensor_name_map.find(ggml_get_name(cur));
if (it != tensor_name_map.end()) {
switch (it->second) {
case 1: { // "token_embd.weight"
count_n_params(n_params, cur->type, PROFILER_LAYER_INPUT, ggml_nelements(cur));
*inp_embd_dtype = cur->type;
break;
}
case 2: { // "output_norm.weight"
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 4 * n_embd + 1); // rms_norm
count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, n_embd); // norm weights
count_n_params(n_params, cur->type, PROFILER_LAYER_OUTPUT, ggml_nelements(cur));
break;
}
case 3: { // "output.weight"
count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, 2 * n_embd * n_vocab);
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 5 * n_vocab); // softmax
count_n_params(n_params, cur->type, PROFILER_LAYER_OUTPUT, ggml_nelements(cur));
break;
}
case 4: // "blk.0.attn_norm.weight"
case 12: // "blk.0.ffn_norm.weight"
{
std::string tensor_name(ggml_get_name(cur));
std::regex blk_regex("blk\\.\\d+\\.(.+)");
std::smatch match;
if (std::regex_match(tensor_name, match, blk_regex) && match.size() > 1) {
std::string blk_suffix = match[1].str();
if (blk_suffix == "attn_norm.weight" || blk_suffix == "ffn_norm.weight") {
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 4 * n_embd + 1); // rms norm
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_embd); // norm weights
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
case 5: { // "blk.0.attn_q.weight"
} else if (blk_suffix == "attn_q.weight") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_embd);
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2.5 * n_embd); // rope
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
case 6: { // "blk.0.attn_k.weight"
} else if (blk_suffix == "attn_k.weight") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_head_kv * n_embd_head_k);
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2.5 * n_embd_head_k * n_head_kv); // rope
count_n_flops (n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, 2 * n_embd_head_k * n_head * n_kv * n_head_kv); // compute kq with kvcache
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 7 * n_head * n_kv); // scale, mask, and softmax
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
case 7: { // "blk.0.attn_v.weight"
} else if (blk_suffix == "attn_v.weight") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_head_kv * n_embd_head_v);
count_n_flops (n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, 2 * n_embd_head_v * n_head * n_kv * n_head_kv); // compute kqv with kvcache
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
case 8: { // "blk.0.attn_output.weight"
} else if (blk_suffix == "attn_output.weight") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_embd);
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_embd); // shortcut
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
case 9: { // "blk.0.ffn_gate.weight"
} else if (blk_suffix == "ffn_gate.weight") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff);
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 8 * n_ff); // SiLU
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
case 10: { // "blk.0.ffn_down.weight"
} else if (blk_suffix == "ffn_down.weight") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff);
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_embd); // shortcut
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
case 11: { // "blk.0.ffn_up.weight"
} else if (blk_suffix == "ffn_up.weight") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff);
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_ff); // silu(gate(x)) * up(x)
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
case 13: { // rope_freqs.weight, has been counted in q and k
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
// optional: bias tensors
case 14: // "blk.0.attn_q.bias"
case 15: // "blk.0.attn_k.bias"
case 16: // "blk.0.attn_v.bias"
case 17: // "blk.0.attn_output.bias"
case 19: // "blk.0.ffn_down.bias"
{
} else if (blk_suffix == "attn_q.bias" || blk_suffix == "attn_k.bias" || blk_suffix == "attn_v.bias" || blk_suffix == "blk.0.attn_output.bias" || blk_suffix == "ffn_down.bias") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_embd);
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
case 18: // "blk.0.ffn_gate.bias"
case 20: // "blk.0.ffn_up.bias"
{
} else if (blk_suffix == "ffn_gate.bias" || blk_suffix == "ffn_up.bias") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_ff);
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
// optional: expert tensors
case 21: { // "blk.0.ffn_gate_inp.weight"
} else if (blk_suffix == "ffn_gate_inp.weight") { // optional: expert tensors
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_expert);
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
case 22: // "blk.0.ffn_gate_exps.weight"
case 23: // "blk.0.ffn_down_exps.weight"
case 24: // "blk.0.ffn_up_exps.weight"
{
} else if (blk_suffix == "ffn_gate_exps.weight" || blk_suffix == "ffn_down_exps.weight" || blk_suffix == "ffn_up_exps.weight") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff * n_expert);
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
} else {
LLAMA_LOG_INFO("Uncaught tensor\n");
return;
}
default:
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
count_n_bytes (n_bytes, PROFILER_LAYER_BACKEND, ggml_nbytes(cur));
} else {
if (tensor_name == "token_embd.weight") {
count_n_params(n_params, cur->type, PROFILER_LAYER_INPUT, ggml_nelements(cur));
count_n_bytes (n_bytes, PROFILER_LAYER_INPUT, ggml_nbytes(cur));
*inp_embd_dtype = cur->type;
} else if (tensor_name == "output_norm.weight") {
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 4 * n_embd + 1);
count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, n_embd);
count_n_params(n_params, cur->type, PROFILER_LAYER_OUTPUT, ggml_nelements(cur));
count_n_bytes (n_bytes, PROFILER_LAYER_OUTPUT, ggml_nbytes(cur));
} else if (tensor_name == "output.weight") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, 2 * n_embd * n_vocab);
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 5 * n_vocab); // softmax
count_n_params(n_params, cur->type, PROFILER_LAYER_OUTPUT, ggml_nelements(cur));
count_n_bytes (n_bytes, PROFILER_LAYER_OUTPUT, ggml_nbytes(cur));
} else if (tensor_name == "rope_freqs.weight") {
if (!rope_used) {
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
count_n_bytes (n_bytes, PROFILER_LAYER_BACKEND, ggml_nbytes(cur));
rope_used = true;
}
} else {
LLAMA_LOG_INFO("Uncaught tensor\n");
return;
}
}
}
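A standalone sketch of the regex dispatch used in the loop above (illustrative only): instead of the old exact-name map that covered blk.0 only, any blk.N tensor is reduced to its per-block suffix, so the layer counters accumulate over all blocks and are averaged afterwards.
#include <cstdio>
#include <regex>
#include <string>
int main() {
    std::regex blk_regex("blk\\.\\d+\\.(.+)");
    std::smatch match;
    std::string name = "blk.17.ffn_up.weight";
    if (std::regex_match(name, match, blk_regex) && match.size() > 1) {
        printf("suffix: %s\n", match[1].str().c_str()); // prints "ffn_up.weight"
    }
    return 0;
}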
// use average values instead of total values
n_flops->layer_f32_f32 = static_cast<int64_t>((double)n_flops->layer_f32_f32 / (double)n_layer);
n_flops->layer_f16_f32 = static_cast<int64_t>((double)n_flops->layer_f16_f32 / (double)n_layer);
n_flops->layer_q4k_f32 = static_cast<int64_t>((double)n_flops->layer_q4k_f32 / (double)n_layer);
n_flops->layer_q5k_f32 = static_cast<int64_t>((double)n_flops->layer_q5k_f32 / (double)n_layer);
n_flops->layer_q6k_f32 = static_cast<int64_t>((double)n_flops->layer_q6k_f32 / (double)n_layer);
n_flops->layer_q80_f32 = static_cast<int64_t>((double)n_flops->layer_q80_f32 / (double)n_layer);
n_params->layer_f32 = static_cast<int64_t>((double)n_params->layer_f32 / (double)n_layer);
n_params->layer_f16 = static_cast<int64_t>((double)n_params->layer_f16 / (double)n_layer);
n_params->layer_q4k = static_cast<int64_t>((double)n_params->layer_q4k / (double)n_layer);
n_params->layer_q5k = static_cast<int64_t>((double)n_params->layer_q5k / (double)n_layer);
n_params->layer_q6k = static_cast<int64_t>((double)n_params->layer_q6k / (double)n_layer);
n_params->layer_q80 = static_cast<int64_t>((double)n_params->layer_q80 / (double)n_layer);
n_bytes->nb_layer = static_cast<int64_t>((double)n_bytes->nb_layer / (double)n_layer);
// reset ml, model, and clear contexts
ml->n_created = 0;
ml->size_data = 0;