mirror of
https://github.com/Lizonghang/prima.cpp.git
synced 2025-09-10 06:34:53 +00:00
add q80xf32 and count_n_params
This commit is contained in:
parent
3fe00a16a0
commit
a7a95b53fe
3 changed files with 297 additions and 69 deletions
171
src/llama.cpp
171
src/llama.cpp
|
@ -91,6 +91,7 @@
|
|||
#include <unordered_map>
|
||||
#include <chrono>
|
||||
#include <regex>
|
||||
#include <inttypes.h>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
|
@ -110,7 +111,7 @@ struct Timer {
|
|||
~Timer() {
|
||||
if (enable_timer) {
|
||||
int64_t end_time = ggml_time_us();
|
||||
LLAMA_LOG_INFO("Time to run %s: %lu ms\n", name, (end_time - start_time)/1000);
|
||||
LLAMA_LOG_INFO("Time to run %s: %" PRId64 " ms\n", name, (end_time - start_time) / 1000);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
@ -3553,6 +3554,7 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll
|
|||
dev_info->cpu_props.flops_f16_f32 = device_cpu_flops(model, GGML_TYPE_F16, GGML_TYPE_F32, n_threads);
|
||||
dev_info->cpu_props.flops_q4k_f32 = device_cpu_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads);
|
||||
dev_info->cpu_props.flops_q6k_f32 = device_cpu_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32, n_threads);
|
||||
dev_info->cpu_props.flops_q80_f32 = device_cpu_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32, n_threads);
|
||||
|
||||
dev_info->memory.total_physical = round(device_physical_memory(false) / (double)(1 << 30) * 100) / 100;
|
||||
dev_info->memory.available_physical = round(device_physical_memory(true) / (double)(1 << 30) * 100) / 100;
|
||||
|
@ -3586,10 +3588,12 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll
|
|||
dev_info->gpu_props.metal_flops_f16_f32 = device_metal_flops(model, GGML_TYPE_F16, GGML_TYPE_F32);
|
||||
dev_info->gpu_props.metal_flops_q4k_f32 = device_metal_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
|
||||
dev_info->gpu_props.metal_flops_q6k_f32 = device_metal_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32);
|
||||
dev_info->gpu_props.metal_flops_q80_f32 = device_metal_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32);
|
||||
dev_info->gpu_props.cuda_flops_f32_f32 = device_cuda_flops (model, GGML_TYPE_F32, GGML_TYPE_F32);
|
||||
dev_info->gpu_props.cuda_flops_f16_f32 = device_cuda_flops (model, GGML_TYPE_F16, GGML_TYPE_F32);
|
||||
dev_info->gpu_props.cuda_flops_q4k_f32 = device_cuda_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
|
||||
dev_info->gpu_props.cuda_flops_q6k_f32 = device_cuda_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32);
|
||||
dev_info->gpu_props.cuda_flops_q80_f32 = device_cuda_flops (model, GGML_TYPE_Q8_0, GGML_TYPE_F32);
|
||||
|
||||
if (dev_info->rank == 0) {
|
||||
struct model_flops * n_flops = &dev_info->model_flops;
|
||||
|
@ -20677,9 +20681,18 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en
|
|||
case GGML_TYPE_F32:
|
||||
n_flops->output_f32_f32 += n;
|
||||
break;
|
||||
case GGML_TYPE_F16:
|
||||
n_flops->output_f16_f32 += n;
|
||||
break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
n_flops->output_q4k_f32 += n;
|
||||
break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
n_flops->output_q6k_f32 += n;
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
n_flops->output_q80_f32 += n;
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
|
||||
}
|
||||
|
@ -20699,6 +20712,82 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en
|
|||
case GGML_TYPE_Q6_K:
|
||||
n_flops->layer_q6k_f32 += n;
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
n_flops->layer_q80_f32 += n;
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n");
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
throw std::runtime_error("Unrecognized profiler layer type\n");
|
||||
}
|
||||
}
|
||||
|
||||
static void count_n_params(struct model_params * n_params, enum ggml_type dtype, enum profiler_layer_type ltype, size_t n) {
|
||||
int64_t n_i64t = static_cast<int64_t>(n);
|
||||
switch (ltype) {
|
||||
case PROFILER_LAYER_INPUT:
|
||||
switch (dtype) {
|
||||
case GGML_TYPE_F32:
|
||||
n_params->input_f32 += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_F16:
|
||||
n_params->input_f16 += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
n_params->input_q4k += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
n_params->input_q6k += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
n_params->input_q80 += n_i64t;
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_INPUT\n"); // this switch handles PROFILER_LAYER_INPUT, so the error must name that layer
|
||||
}
|
||||
break;
|
||||
|
||||
case PROFILER_LAYER_OUTPUT:
|
||||
switch (dtype) {
|
||||
case GGML_TYPE_F32:
|
||||
n_params->output_f32 += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_F16:
|
||||
n_params->output_f16 += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
n_params->output_q4k += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
n_params->output_q6k += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
n_params->output_q80 += n_i64t;
|
||||
break; // required: without it Q8_0 falls through to the throwing default below
|
||||
default:
|
||||
throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
|
||||
}
|
||||
break;
|
||||
|
||||
case PROFILER_LAYER_BACKEND:
|
||||
switch (dtype) {
|
||||
case GGML_TYPE_F32:
|
||||
n_params->layer_f32 += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_F16:
|
||||
n_params->layer_f16 += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
n_params->layer_q4k += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
n_params->layer_q6k += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
n_params->layer_q80 += n_i64t;
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n");
|
||||
}
|
||||
|
@ -20814,73 +20903,73 @@ void llama_model_n_flops(struct llama_model * model, struct llama_model_loader *
|
|||
if (it != tensor_name_map.end()) {
|
||||
switch (it->second) {
|
||||
case 1: { // "token_embd.weight"
|
||||
n_params->input_params += static_cast<int64_t>(ggml_nelements(cur));
|
||||
count_n_params(n_params, cur->type, PROFILER_LAYER_INPUT, ggml_nelements(cur));
|
||||
break;
|
||||
}
|
||||
case 2: { // "output_norm.weight"
|
||||
count_n_flops(n_flops, cur->type, PROFILER_LAYER_OUTPUT, n_input * (4 * n_embd + 1));
|
||||
n_params->output_params += static_cast<int64_t>(ggml_nelements(cur));
|
||||
count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, n_input * (4 * n_embd + 1));
|
||||
count_n_params(n_params, cur->type, PROFILER_LAYER_OUTPUT, ggml_nelements(cur));
|
||||
break;
|
||||
}
|
||||
case 3: { // "output.weight"
|
||||
count_n_flops(n_flops, cur->type, PROFILER_LAYER_OUTPUT, 2 * n_input * n_embd * n_vocab);
|
||||
count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 5 * n_input * n_vocab); // softmax
|
||||
n_params->output_params += static_cast<int64_t>(ggml_nelements(cur));
|
||||
count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, 2 * n_input * n_embd * n_vocab);
|
||||
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 5 * n_input * n_vocab); // softmax
|
||||
count_n_params(n_params, cur->type, PROFILER_LAYER_OUTPUT, ggml_nelements(cur));
|
||||
break;
|
||||
}
|
||||
case 4: // "blk.0.attn_norm.weight"
|
||||
case 12: // "blk.0.ffn_norm.weight"
|
||||
{
|
||||
count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * (4 * n_embd + 1));
|
||||
n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
|
||||
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * (4 * n_embd + 1));
|
||||
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
|
||||
break;
|
||||
}
|
||||
case 5: { // "blk.0.attn_q.weight"
|
||||
count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_head_k));
|
||||
count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd_head_k); // rope
|
||||
n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
|
||||
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_head_k));
|
||||
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd_head_k); // rope
|
||||
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
|
||||
break;
|
||||
}
|
||||
case 6: { // "blk.0.attn_k.weight"
|
||||
count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_k_gqa));
|
||||
count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd_k_gqa); // rope
|
||||
count_n_flops(n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, 2 * n_input * (n_input + n_history) * n_embd_head_k * n_head); // compute kq with kvcache
|
||||
count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 7 * n_input * (n_input + n_history) * n_head); // scale, mask, and softmax
|
||||
n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
|
||||
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_k_gqa));
|
||||
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd_k_gqa); // rope
|
||||
count_n_flops (n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, 2 * n_input * (n_input + n_history) * n_embd_head_k * n_head); // compute kq with kvcache
|
||||
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 7 * n_input * (n_input + n_history) * n_head); // scale, mask, and softmax
|
||||
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
|
||||
break;
|
||||
}
|
||||
case 7: { // "blk.0.attn_v.weight"
|
||||
count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_v_gqa));
|
||||
count_n_flops(n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, n_input * (n_input + n_history) * n_embd_head_k * n_head); // compute kqv with kvcache
|
||||
n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
|
||||
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_v_gqa));
|
||||
count_n_flops (n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, n_input * (n_input + n_history) * n_embd_head_k * n_head); // compute kqv with kvcache
|
||||
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
|
||||
break;
|
||||
}
|
||||
case 8: { // "blk.0.attn_output.weight"
|
||||
count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * (n_head * n_embd_head_k) * n_embd);
|
||||
count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_embd); // shortcut
|
||||
n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
|
||||
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * (n_head * n_embd_head_k) * n_embd);
|
||||
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_embd); // shortcut
|
||||
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
|
||||
break;
|
||||
}
|
||||
case 9: { // "blk.0.ffn_gate.weight"
|
||||
count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
|
||||
count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 5 * n_input * n_ff); // SiLU
|
||||
n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
|
||||
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
|
||||
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 5 * n_input * n_ff); // SiLU
|
||||
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
|
||||
break;
|
||||
}
|
||||
case 10: { // "blk.0.ffn_down.weight"
|
||||
count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
|
||||
count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_embd); // shortcut
|
||||
n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
|
||||
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
|
||||
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_embd); // shortcut
|
||||
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
|
||||
break;
|
||||
}
|
||||
case 11: { // "blk.0.ffn_up.weight"
|
||||
count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
|
||||
count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_ff); // silu(gate(x)) * up(x)
|
||||
n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
|
||||
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
|
||||
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_ff); // silu(gate(x)) * up(x)
|
||||
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
|
||||
break;
|
||||
}
|
||||
case 13: { // rope_freqs.weight, has been counted in q and k
|
||||
n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
|
||||
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
|
||||
break;
|
||||
}
|
||||
// optional: bias tensors
|
||||
|
@ -20890,29 +20979,29 @@ void llama_model_n_flops(struct llama_model * model, struct llama_model_loader *
|
|||
case 17: // "blk.0.attn_output.bias"
|
||||
case 19: // "blk.0.ffn_down.bias"
|
||||
{
|
||||
count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * n_embd);
|
||||
n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
|
||||
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * n_embd);
|
||||
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
|
||||
break;
|
||||
}
|
||||
case 18: // "blk.0.ffn_gate.bias"
|
||||
case 20: // "blk.0.ffn_up.bias"
|
||||
{
|
||||
count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * n_ff);
|
||||
n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
|
||||
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * n_ff);
|
||||
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
|
||||
break;
|
||||
}
|
||||
// optional: expert tensors
|
||||
case 21: { // "blk.0.ffn_gate_inp.weight"
|
||||
count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_expert);
|
||||
n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
|
||||
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_expert);
|
||||
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
|
||||
break;
|
||||
}
|
||||
case 22: // "blk.0.ffn_gate_exps.weight"
|
||||
case 23: // "blk.0.ffn_down_exps.weight"
|
||||
case 24: // "blk.0.ffn_up_exps.weight"
|
||||
{
|
||||
count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff * n_expert);
|
||||
n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
|
||||
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff * n_expert);
|
||||
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
|
||||
break;
|
||||
}
|
||||
default:
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue