count model flops for f32xf32, f16xf32, q4kxf32, q6kxf32

Lizonghang 2024-11-24 13:13:32 +04:00
parent a5ba34169a
commit 3fe00a16a0
4 changed files with 188 additions and 119 deletions


@@ -465,15 +465,15 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     }
     LOG_INF("\n");

-    LOG_INF("| CPU flops (F32 x F32, GFLOPS)");
+    LOG_INF("| CPU flops (F32xF32, GFLOPS) ");
     for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_f32);
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_f32_f32);
     }
     LOG_INF("\n");

-    LOG_INF("| CPU flops (F16 x F16, GFLOPS)");
+    LOG_INF("| CPU flops (F16xF32, GFLOPS) ");
     for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_f16);
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_f16_f32);
     }
     LOG_INF("\n");
@@ -593,13 +593,13 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     LOG_INF("| Metal flops (F32xF32, GFLOPS)");
    for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_f32);
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_f32_f32);
     }
     LOG_INF("\n");

-    LOG_INF("| Metal flops (F16xF16, GFLOPS)");
+    LOG_INF("| Metal flops (F16xF32, GFLOPS)");
     for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_f16);
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_f16_f32);
     }
     LOG_INF("\n");
@@ -617,13 +617,13 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     LOG_INF("| CUDA flops (F32xF32, GFLOPS)");
     for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f32);
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f32_f32);
     }
     LOG_INF("\n");

-    LOG_INF("| CUDA flops (F16xF16, GFLOPS)");
+    LOG_INF("| CUDA flops (F16xF32, GFLOPS)");
     for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f16);
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f16_f32);
     }
     LOG_INF("\n");
@@ -639,33 +639,45 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     }
     LOG_INF("\n");

-    LOG_INF("| Model flops (input) ");
-    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.input_flops);
+    LOG_INF("| Model flops (output F32xF32) ");
+    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.output_f32_f32);
     LOG_INF("\n");

-    LOG_INF("| Model flops (each layer) ");
-    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.layer_flops);
+    LOG_INF("| Model flops (output Q6KxF32) ");
+    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.output_q6k_f32);
     LOG_INF("\n");

-    LOG_INF("| Model flops (output) ");
-    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.output_flops);
+    LOG_INF("| Model flops (layer F32xF32) ");
+    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.layer_f32_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (layer F16xF32) ");
+    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.layer_f16_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (layer Q4KxF32) ");
+    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.layer_q4k_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (layer Q6KxF32) ");
+    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.layer_q6k_f32);
     LOG_INF("\n");

     LOG_INF("| Model params (input) ");
-    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.input_params);
+    LOG_INF("| %-10lu ", dev_info_set[0].model_params.input_params);
     LOG_INF("\n");

     LOG_INF("| Model params (each layer) ");
-    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.layer_params);
+    LOG_INF("| %-10lu ", dev_info_set[0].model_params.layer_params);
     LOG_INF("\n");

     LOG_INF("| Model params (output) ");
-    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.output_params);
+    LOG_INF("| %-10lu ", dev_info_set[0].model_params.output_params);
     LOG_INF("\n");

     model_flops ffo = dev_info_set[0].model_flops;
-    int64_t total_flops = ffo.input_flops + ffo.output_flops + (ffo.layer_flops * llama_model_n_layers(model));
-    double cpu_flops_f16 = dev_info_set[0].cpu_props.flops_f16 * 1e9;
+    int64_t total_flops = ffo.output_f32_f32 + (ffo.layer_f32_f32 * llama_model_n_layers(model)); // todo
+    double cpu_flops_f16 = dev_info_set[0].cpu_props.flops_f16_f32 * 1e9;

     LOG_INF("| Token latency (ms) ");
     LOG_INF("| %-10.2f ", total_flops / cpu_flops_f16 * 1000);
@@ -739,10 +751,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->cpu_props.cores, sizeof(uint32_t));
     ptr += sizeof(uint32_t);

-    memcpy(ptr, &dev_info->cpu_props.flops_f32, sizeof(float));
+    memcpy(ptr, &dev_info->cpu_props.flops_f32_f32, sizeof(float));
     ptr += sizeof(float);

-    memcpy(ptr, &dev_info->cpu_props.flops_f16, sizeof(float));
+    memcpy(ptr, &dev_info->cpu_props.flops_f16_f32, sizeof(float));
     ptr += sizeof(float);

     memcpy(ptr, &dev_info->cpu_props.flops_q4k_f32, sizeof(float));
@@ -763,10 +775,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->gpu_props.memory_total, sizeof(float));
     ptr += sizeof(float);

-    memcpy(ptr, &dev_info->gpu_props.metal_flops_f32, sizeof(float));
+    memcpy(ptr, &dev_info->gpu_props.metal_flops_f32_f32, sizeof(float));
     ptr += sizeof(float);

-    memcpy(ptr, &dev_info->gpu_props.metal_flops_f16, sizeof(float));
+    memcpy(ptr, &dev_info->gpu_props.metal_flops_f16_f32, sizeof(float));
     ptr += sizeof(float);

     memcpy(ptr, &dev_info->gpu_props.metal_flops_q4k_f32, sizeof(float));
@@ -775,10 +787,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->gpu_props.metal_flops_q6k_f32, sizeof(float));
     ptr += sizeof(float);

-    memcpy(ptr, &dev_info->gpu_props.cuda_flops_f32, sizeof(float));
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_f32_f32, sizeof(float));
     ptr += sizeof(float);

-    memcpy(ptr, &dev_info->gpu_props.cuda_flops_f16, sizeof(float));
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_f16_f32, sizeof(float));
     ptr += sizeof(float);

     memcpy(ptr, &dev_info->gpu_props.cuda_flops_q4k_f32, sizeof(float));
@@ -786,7 +798,7 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->gpu_props.cuda_flops_q6k_f32, sizeof(float));

-    // no need to synchronize model flops
+    // no need to synchronize model flops and model params

     return total_size;
 }
@@ -844,10 +856,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->cpu_props.cores, ptr, sizeof(uint32_t));
     ptr += sizeof(uint32_t);

-    memcpy(&dev_info->cpu_props.flops_f32, ptr, sizeof(float));
+    memcpy(&dev_info->cpu_props.flops_f32_f32, ptr, sizeof(float));
     ptr += sizeof(float);

-    memcpy(&dev_info->cpu_props.flops_f16, ptr, sizeof(float));
+    memcpy(&dev_info->cpu_props.flops_f16_f32, ptr, sizeof(float));
     ptr += sizeof(float);

     memcpy(&dev_info->cpu_props.flops_q4k_f32, ptr, sizeof(float));
@@ -868,10 +880,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->gpu_props.memory_total, ptr, sizeof(float));
     ptr += sizeof(float);

-    memcpy(&dev_info->gpu_props.metal_flops_f32, ptr, sizeof(float));
+    memcpy(&dev_info->gpu_props.metal_flops_f32_f32, ptr, sizeof(float));
     ptr += sizeof(float);

-    memcpy(&dev_info->gpu_props.metal_flops_f16, ptr, sizeof(float));
+    memcpy(&dev_info->gpu_props.metal_flops_f16_f32, ptr, sizeof(float));
     ptr += sizeof(float);

     memcpy(&dev_info->gpu_props.metal_flops_q4k_f32, ptr, sizeof(float));
@@ -880,10 +892,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->gpu_props.metal_flops_q6k_f32, ptr, sizeof(float));
     ptr += sizeof(float);

-    memcpy(&dev_info->gpu_props.cuda_flops_f32, ptr, sizeof(float));
+    memcpy(&dev_info->gpu_props.cuda_flops_f32_f32, ptr, sizeof(float));
     ptr += sizeof(float);

-    memcpy(&dev_info->gpu_props.cuda_flops_f16, ptr, sizeof(float));
+    memcpy(&dev_info->gpu_props.cuda_flops_f16_f32, ptr, sizeof(float));
     ptr += sizeof(float);

     memcpy(&dev_info->gpu_props.cuda_flops_q4k_f32, ptr, sizeof(float));
@@ -891,5 +903,5 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->gpu_props.cuda_flops_q6k_f32, ptr, sizeof(float));

-    // no need to synchronize model flops
+    // no need to synchronize model flops and model params
 }
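
serialize() and deserialize() above must walk the exact same field order, so every field added to one has to be mirrored at the same offset in the other. A minimal sketch of the round-trip pattern, using a hypothetical two-field struct that is not part of this commit:

    #include <cstdint>
    #include <cstring>

    struct props_t { uint32_t cores; float flops; };

    size_t pack(const props_t * p, char * buf) {
        char * ptr = buf;
        memcpy(ptr, &p->cores, sizeof(uint32_t)); ptr += sizeof(uint32_t);
        memcpy(ptr, &p->flops, sizeof(float));    ptr += sizeof(float);
        return ptr - buf;  // bytes written
    }

    void unpack(const char * buf, props_t * p) {
        const char * ptr = buf;
        memcpy(&p->cores, ptr, sizeof(uint32_t)); ptr += sizeof(uint32_t);
        memcpy(&p->flops, ptr, sizeof(float));    // mirrors pack() exactly
    }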


@@ -8,8 +8,8 @@ struct cpu_props {
     const char * name;
     const char * description;
     uint32_t cores;
-    float flops_f32;     // in GFLOPS
-    float flops_f16;     // in GFLOPS
+    float flops_f32_f32; // in GFLOPS
+    float flops_f16_f32; // in GFLOPS
     float flops_q4k_f32; // in GFLOPS
     float flops_q6k_f32; // in GFLOPS
@@ -17,8 +17,8 @@ struct cpu_props {
         name(""),
         description(""),
         cores(0),
-        flops_f32    (0.0f),
-        flops_f16    (0.0f),
+        flops_f32_f32(0.0f),
+        flops_f16_f32(0.0f),
         flops_q4k_f32(0.0f),
         flops_q6k_f32(0.0f) {}
 };
@@ -62,12 +62,12 @@ struct gpu_props {
     const char * description;
     float memory_free;         // in GB
     float memory_total;        // in GB
-    float metal_flops_f32;     // in GFLOPS
-    float metal_flops_f16;     // in GFLOPS
+    float metal_flops_f32_f32; // in GFLOPS
+    float metal_flops_f16_f32; // in GFLOPS
     float metal_flops_q4k_f32; // in GFLOPS
     float metal_flops_q6k_f32; // in GFLOPS
-    float cuda_flops_f32;      // in GFLOPS
-    float cuda_flops_f16;      // in GFLOPS
+    float cuda_flops_f32_f32;  // in GFLOPS
+    float cuda_flops_f16_f32;  // in GFLOPS
     float cuda_flops_q4k_f32;  // in GFLOPS
     float cuda_flops_q6k_f32;  // in GFLOPS
@@ -76,31 +76,39 @@ struct gpu_props {
         description(""),
         memory_free (0.0f),
         memory_total (0.0f),
-        metal_flops_f32    (0.0f),
-        metal_flops_f16    (0.0f),
+        metal_flops_f32_f32(0.0f),
+        metal_flops_f16_f32(0.0f),
         metal_flops_q4k_f32(0.0f),
         metal_flops_q6k_f32(0.0f),
-        cuda_flops_f32     (0.0f),
-        cuda_flops_f16     (0.0f),
+        cuda_flops_f32_f32 (0.0f),
+        cuda_flops_f16_f32 (0.0f),
         cuda_flops_q4k_f32 (0.0f),
         cuda_flops_q6k_f32 (0.0f) {}
 };

 struct model_flops {
-    // model flops
-    int64_t input_flops;
-    int64_t output_flops;
-    int64_t layer_flops;
+    int64_t output_f32_f32;
+    int64_t output_q6k_f32;
+    int64_t layer_f32_f32;
+    int64_t layer_f16_f32;
+    int64_t layer_q4k_f32;
+    int64_t layer_q6k_f32;

-    // model params
+    model_flops() :
+        output_f32_f32(0),
+        output_q6k_f32(0),
+        layer_f32_f32 (0),
+        layer_f16_f32 (0),
+        layer_q4k_f32 (0),
+        layer_q6k_f32 (0) {}
+};
+
+struct model_params {
     int64_t input_params;
     int64_t output_params;
     int64_t layer_params;

-    model_flops() :
-        input_flops  (0),
-        output_flops (0),
-        layer_flops  (0),
+    model_params() :
         input_params (0),
         output_params(0),
         layer_params (0) {}
@@ -115,6 +123,7 @@ struct device_info {
     struct gpu_support gpu_support;
     struct gpu_props   gpu_props;
     struct model_flops model_flops;
+    struct model_params model_params;

     device_info() :
         rank(0),
@@ -124,7 +133,8 @@ struct device_info {
         memory(),
         gpu_support(),
         gpu_props(),
-        model_flops() {}
+        model_flops(),
+        model_params() {}
 };

 enum profiler_backend_type {
@@ -133,6 +143,12 @@ enum profiler_backend_type {
     PROFILER_BACKEND_TYPE_CUDA  = 2,
 };

+enum profiler_layer_type {
+    PROFILER_LAYER_INPUT   = 0,
+    PROFILER_LAYER_OUTPUT  = 1,
+    PROFILER_LAYER_BACKEND = 2,
+};
+
 const char * device_name(void);
 uint32_t device_cpu_cores (void);
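
Splitting the counters by operand type makes a per-type latency estimate possible: divide each FLOP bucket by the device's measured throughput for that same operand pair, rather than assuming one uniform speed. The total_flops computation in the first file still sums only the F32xF32 buckets and is marked // todo; a hedged sketch of what the full aggregation could look like, using the fields declared in this header:

    // Sketch only; assumes every profiled throughput below is non-zero.
    static double estimate_token_time_s(const model_flops & f, const cpu_props & p, int n_layer) {
        double t = 0.0;
        t += f.output_f32_f32 / (p.flops_f32_f32 * 1e9);  // GFLOPS -> FLOPS
        t += f.output_q6k_f32 / (p.flops_q6k_f32 * 1e9);
        t += n_layer * (f.layer_f32_f32 / (p.flops_f32_f32 * 1e9)
                      + f.layer_f16_f32 / (p.flops_f16_f32 * 1e9)
                      + f.layer_q4k_f32 / (p.flops_q4k_f32 * 1e9)
                      + f.layer_q6k_f32 / (p.flops_q6k_f32 * 1e9));
        return t;
    }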


@@ -528,7 +528,8 @@ extern "C" {
     LLAMA_API void llama_model_n_flops(
                 struct llama_model * model,
                 struct llama_model_loader * ml,
-                struct model_flops * ffo,
+                struct model_flops  * n_flops,
+                struct model_params * n_params,
                 const int64_t n_input,
                 const int64_t n_history);


@@ -3549,8 +3549,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
 void llama_profile_device(device_info * dev_info, struct llama_model * model, llama_model_loader * ml, const char * test_file, int n_threads) {
     dev_info->device_name = device_name();
     dev_info->cpu_props.cores = device_cpu_cores();
-    dev_info->cpu_props.flops_f32 = device_cpu_flops(model, GGML_TYPE_F32, GGML_TYPE_F32, n_threads);
-    dev_info->cpu_props.flops_f16 = device_cpu_flops(model, GGML_TYPE_F16, GGML_TYPE_F16, n_threads);
+    dev_info->cpu_props.flops_f32_f32 = device_cpu_flops(model, GGML_TYPE_F32, GGML_TYPE_F32, n_threads);
+    dev_info->cpu_props.flops_f16_f32 = device_cpu_flops(model, GGML_TYPE_F16, GGML_TYPE_F32, n_threads);
     dev_info->cpu_props.flops_q4k_f32 = device_cpu_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads);
     dev_info->cpu_props.flops_q6k_f32 = device_cpu_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32, n_threads);
@@ -3582,18 +3582,19 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll
     dev_info->gpu_props.description = gpu_props.description;
     dev_info->gpu_props.memory_free = round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100;
     dev_info->gpu_props.memory_total = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100;
-    dev_info->gpu_props.metal_flops_f32 = device_metal_flops(model, GGML_TYPE_F32, GGML_TYPE_F32);
-    dev_info->gpu_props.metal_flops_f16 = device_metal_flops(model, GGML_TYPE_F16, GGML_TYPE_F16);
+    dev_info->gpu_props.metal_flops_f32_f32 = device_metal_flops(model, GGML_TYPE_F32, GGML_TYPE_F32);
+    dev_info->gpu_props.metal_flops_f16_f32 = device_metal_flops(model, GGML_TYPE_F16, GGML_TYPE_F32);
     dev_info->gpu_props.metal_flops_q4k_f32 = device_metal_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
     dev_info->gpu_props.metal_flops_q6k_f32 = device_metal_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32);
-    dev_info->gpu_props.cuda_flops_f32 = device_cuda_flops (model, GGML_TYPE_F32, GGML_TYPE_F32);
-    dev_info->gpu_props.cuda_flops_f16 = device_cuda_flops (model, GGML_TYPE_F16, GGML_TYPE_F16);
+    dev_info->gpu_props.cuda_flops_f32_f32 = device_cuda_flops (model, GGML_TYPE_F32, GGML_TYPE_F32);
+    dev_info->gpu_props.cuda_flops_f16_f32 = device_cuda_flops (model, GGML_TYPE_F16, GGML_TYPE_F32);
     dev_info->gpu_props.cuda_flops_q4k_f32 = device_cuda_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
     dev_info->gpu_props.cuda_flops_q6k_f32 = device_cuda_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32);

     if (dev_info->rank == 0) {
-        struct model_flops * ffo = &dev_info->model_flops;
-        llama_model_n_flops(model, ml, ffo, 1, 10);
+        struct model_flops  * n_flops  = &dev_info->model_flops;
+        struct model_params * n_params = &dev_info->model_params;
+        llama_model_n_flops(model, ml, n_flops, n_params, 1, 10);
     }
 }
@@ -20669,7 +20670,46 @@ static void llama_model_reset_tensors(struct llama_model * model) {
     model->cls_out_b = nullptr;
 }

-void llama_model_n_flops(struct llama_model * model, struct llama_model_loader * ml, struct model_flops * ffo, const int64_t n_input, const int64_t n_history) {
+static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, enum profiler_layer_type ltype, int64_t n) {
+    switch (ltype) {
+        case PROFILER_LAYER_OUTPUT:
+            switch (dtype) {
+                case GGML_TYPE_F32:
+                    n_flops->output_f32_f32 += n;
+                    break;
+                case GGML_TYPE_Q6_K:
+                    n_flops->output_q6k_f32 += n;
+                    break;
+                default:
+                    throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
+            }
+            break;
+        case PROFILER_LAYER_BACKEND:
+            switch (dtype) {
+                case GGML_TYPE_F32:
+                    n_flops->layer_f32_f32 += n;
+                    break;
+                case GGML_TYPE_F16:
+                    n_flops->layer_f16_f32 += n;
+                    break;
+                case GGML_TYPE_Q4_K:
+                    n_flops->layer_q4k_f32 += n;
+                    break;
+                case GGML_TYPE_Q6_K:
+                    n_flops->layer_q6k_f32 += n;
+                    break;
+                default:
+                    throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n");
+            }
+            break;
+        default:
+            throw std::runtime_error("Unrecognized profiler layer type\n");
+    }
+}
+
+void llama_model_n_flops(struct llama_model * model, struct llama_model_loader * ml, struct model_flops * n_flops, struct model_params * n_params, const int64_t n_input, const int64_t n_history) {
     const llama_hparams hparams = model->hparams;
     const int64_t n_layer = hparams.n_layer;
     const int64_t n_vocab = hparams.n_vocab;
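
count_n_flops buckets each matmul's FLOPs by the weight tensor's storage type; attention-score matmuls against the (F16) KV cache are booked as F16xF32, and elementwise work (norms, rope, softmax, shortcuts) as F32xF32. The cases in the next hunks pass cur->type for weight matmuls and an explicit GGML_TYPE for everything else. For example, a Q4_K ffn_up projection ends up counted as:

    // 2 * n_input * n_embd * n_ff multiply-accumulates, accumulated into layer_q4k_f32:
    count_n_flops(n_flops, GGML_TYPE_Q4_K, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);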
@@ -20774,73 +20814,73 @@ void llama_model_n_flops(struct llama_model * model, struct llama_model_loader *
         if (it != tensor_name_map.end()) {
             switch (it->second) {
                 case 1: { // "token_embd.weight"
-                    ffo->input_flops += (2 * n_input * n_embd * n_vocab - n_input * n_embd);
-                    ffo->input_params += static_cast<int64_t>(ggml_nelements(cur));
+                    n_params->input_params += static_cast<int64_t>(ggml_nelements(cur));
                     break;
                 }
                 case 2: { // "output_norm.weight"
-                    ffo->output_flops += n_input * (8 * n_embd + 1);
-                    ffo->output_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_OUTPUT, n_input * (4 * n_embd + 1));
+                    n_params->output_params += static_cast<int64_t>(ggml_nelements(cur));
                     break;
                 }
                 case 3: { // "output.weight"
-                    ffo->output_flops += 2 * n_input * n_embd * n_vocab;
-                    ffo->output_flops += 5 * n_input * n_vocab;
-                    ffo->output_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_OUTPUT, 2 * n_input * n_embd * n_vocab);
+                    count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 5 * n_input * n_vocab); // softmax
+                    n_params->output_params += static_cast<int64_t>(ggml_nelements(cur));
                     break;
                 }
                 case 4:  // "blk.0.attn_norm.weight"
                 case 12: // "blk.0.ffn_norm.weight"
                 {
-                    ffo->layer_flops += n_input * (8 * n_embd + 1);
-                    ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * (4 * n_embd + 1));
+                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
                     break;
                 }
                 case 5: { // "blk.0.attn_q.weight"
-                    ffo->layer_flops += 2 * n_input * n_embd * (n_head * n_embd_head_k);
-                    ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_head_k));
+                    count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd_head_k); // rope
+                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
                     break;
                 }
                 case 6: { // "blk.0.attn_k.weight"
-                    ffo->layer_flops += 2 * n_input * n_embd * (n_head * n_embd_k_gqa);
-                    ffo->layer_flops += 2 * n_input * (n_input + n_history) * n_embd_head_k * n_head; // Q*K with KVCache
-                    ffo->layer_flops += 7 * n_input * (n_input + n_history) * n_head; // scale, mask, and softmax
-                    ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_k_gqa));
+                    count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd_k_gqa); // rope
+                    count_n_flops(n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, 2 * n_input * (n_input + n_history) * n_embd_head_k * n_head); // compute kq with kvcache
+                    count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 7 * n_input * (n_input + n_history) * n_head); // scale, mask, and softmax
+                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
                     break;
                 }
                 case 7: { // "blk.0.attn_v.weight"
-                    ffo->layer_flops += 2 * n_input * n_embd * (n_head * n_embd_v_gqa);
-                    ffo->layer_flops += n_input * (n_input + n_history) * n_embd_head_k * n_head; // QKV with KVCache
-                    ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_v_gqa));
+                    count_n_flops(n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, n_input * (n_input + n_history) * n_embd_head_k * n_head); // compute kqv with kvcache
+                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
                     break;
                 }
                 case 8: { // "blk.0.attn_output.weight"
-                    ffo->layer_flops += 2 * n_input * (n_head * n_embd_head_k) * n_embd;
-                    ffo->layer_flops += n_input * n_embd; // shortcut
-                    ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * (n_head * n_embd_head_k) * n_embd);
+                    count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_embd); // shortcut
+                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
                     break;
                 }
                 case 9: { // "blk.0.ffn_gate.weight"
-                    ffo->layer_flops += 2 * n_input * n_embd * n_ff;
-                    ffo->layer_flops += 5 * n_input * n_ff; // SiLU
-                    ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
+                    count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 5 * n_input * n_ff); // SiLU
+                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
                     break;
                 }
                 case 10: { // "blk.0.ffn_down.weight"
-                    ffo->layer_flops += 2 * n_input * n_embd * n_ff;
-                    ffo->layer_flops += n_input * n_embd; // shortcut
-                    ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
+                    count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_embd); // shortcut
+                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
                     break;
                 }
                 case 11: { // "blk.0.ffn_up.weight"
-                    ffo->layer_flops += 2 * n_input * n_embd * n_ff;
-                    ffo->layer_flops += n_input * n_ff; // silu(gate(x)) * up(x)
-                    ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
+                    count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_ff); // silu(gate(x)) * up(x)
+                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
                     break;
                 }
-                case 13: { // rope_freqs.weight, for Q and K
-                    ffo->layer_flops += 8 * n_input * n_head * n_embd_head_k;
-                    ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                case 13: { // rope_freqs.weight, has been counted in q and k
+                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
                     break;
                 }

                 // optional: bias tensors
@@ -20850,29 +20890,29 @@ void llama_model_n_flops(struct llama_model * model, struct llama_model_loader *
                 case 17: // "blk.0.attn_output.bias"
                 case 19: // "blk.0.ffn_down.bias"
                 {
-                    ffo->layer_flops += n_input * n_embd;
-                    ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * n_embd);
+                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
                     break;
                 }
                 case 18: // "blk.0.ffn_gate.bias"
                 case 20: // "blk.0.ffn_up.bias"
                 {
-                    ffo->layer_flops += n_input * n_ff;
-                    ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * n_ff);
+                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
                     break;
                 }

                 // optional: expert tensors
                 case 21: { // "blk.0.ffn_gate_inp.weight"
-                    ffo->layer_flops += 2 * n_input * n_embd * n_expert;
-                    ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_expert);
+                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
                     break;
                 }
                 case 22: // "blk.0.ffn_gate_exps.weight"
                 case 23: // "blk.0.ffn_down_exps.weight"
                 case 24: // "blk.0.ffn_up_exps.weight"
                 {
-                    ffo->layer_flops += 2 * n_input * n_embd * n_ff * n_expert;
-                    ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff * n_expert);
+                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
                     break;
                 }
                 default:
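
For scale, a worked example with assumed LLaMA-7B-like dimensions (n_embd = 4096, n_vocab = 32000) and n_input = 1: case 3 books 2 * 1 * 4096 * 32000 ≈ 0.26 GFLOPs for the output.weight matmul under the bucket matching that tensor's storage type (output_q6k_f32 for a Q6_K output head), plus 5 * 32000 = 160 kFLOPs of softmax under output_f32_f32.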