add model_flops

Zonghang Li 2024-11-21 20:06:16 +04:00
parent 80f6b72e71
commit 7ee1423006
5 changed files with 67 additions and 42 deletions

View file

@@ -914,7 +914,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info));
         dev_info_set[0] = dev_info;
         llama_gather_device_info(lctx, dev_info_set);
-        device_print_props(dev_info_set, n_world);
+        device_print_props(dev_info_set, n_world, model);
     } else {
         llama_send_device_info(lctx, &dev_info);
     }

View file

@@ -23,6 +23,7 @@
 #include "ggml-cuda.h"
 #endif
+#include <cmath>
 #include <chrono>
 #include <fstream>
 #include <string>
@@ -83,16 +84,10 @@ uint32_t device_cpu_cores() {
 static float device_flops(struct llama_model * model, enum ggml_type dtype, profiler_backend_type btype, int n_threads) {
     const int n_embd = llama_n_embd(model);
-    const int n_ff_hidden = llama_n_ff_hidden(model);
-    const int rows_A = n_embd, cols_A = n_ff_hidden;
-    const int rows_B = n_embd, cols_B = n_ff_hidden;
-    GGML_ASSERT(cols_A == cols_B);
-    std::vector<float> matrix_A(cols_A * rows_A, 1.0f);
-    std::vector<float> matrix_B(cols_B * rows_B, 1.0f / cols_B);
+    std::vector<float> matrix_A(n_embd * n_embd, 1.0f);
+    std::vector<float> matrix_B(n_embd * n_embd, 1.0f / n_embd);
     ggml_backend_t backend = NULL;
     switch (btype) {
         case PROFILER_BACKEND_TYPE_CPU:
             backend = ggml_backend_cpu_init();
@@ -124,8 +119,8 @@ static float device_flops(struct llama_model * model, enum ggml_type dtype, prof
     };
     struct ggml_context * ctx = ggml_init(params);
-    struct ggml_tensor * tensor_a = ggml_new_tensor_2d(ctx, dtype, cols_A, rows_A);
-    struct ggml_tensor * tensor_b = ggml_new_tensor_2d(ctx, dtype, cols_B, rows_B);
+    struct ggml_tensor * tensor_a = ggml_new_tensor_2d(ctx, dtype, n_embd, n_embd);
+    struct ggml_tensor * tensor_b = ggml_new_tensor_2d(ctx, dtype, n_embd, n_embd);
     ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
@@ -162,7 +157,7 @@ static float device_flops(struct llama_model * model, enum ggml_type dtype, prof
     const int64_t t_end = ggml_time_us();
     double elapsed_seconds = ((double)t_end - (double)t_start) / 1e6; // convert to seconds
-    double flops = (2.0 * (double)cols_A * (double)rows_A * (double)rows_B) / elapsed_seconds / 1e9; // convert to GFLOPS
+    double flops = (2.0 * (double)n_embd * (double)n_embd * (double)n_embd) / elapsed_seconds / 1e9; // convert to GFLOPS
     ggml_free(ctx_cgraph);
     ggml_gallocr_free(allocr);
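
For reference, the benchmark above now times a single n_embd x n_embd matrix multiplication and converts the elapsed time using the standard 2*n^3 FLOP count of a dense matmul. Below is a minimal standalone sketch of that conversion, with a naive CPU matmul standing in for the ggml compute graph; all names and sizes here are illustrative, not the profiler's API.

// Standalone sketch: time an n x n matmul and convert to GFLOPS,
// mirroring the 2*n^3 / elapsed / 1e9 formula used by device_flops().
// The naive triple loop stands in for the ggml backend graph.
#include <chrono>
#include <cstdio>
#include <vector>

static double matmul_gflops(int n) {
    std::vector<float> A(n * n, 1.0f), B(n * n, 1.0f / n), C(n * n, 0.0f);
    const auto t0 = std::chrono::steady_clock::now();
    for (int i = 0; i < n; ++i)
        for (int k = 0; k < n; ++k)
            for (int j = 0; j < n; ++j)
                C[i * n + j] += A[i * n + k] * B[k * n + j];
    const auto t1 = std::chrono::steady_clock::now();
    const double elapsed_seconds = std::chrono::duration<double>(t1 - t0).count();
    return (2.0 * n * n * n) / elapsed_seconds / 1e9; // 2*n^3 FLOPs -> GFLOPS
}

int main() {
    std::printf("~%.2f GFLOPS (naive single-threaded matmul)\n", matmul_gflops(512));
}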
@@ -435,7 +430,7 @@ void device_get_props(struct llama_model * model, int device, struct ggml_backen
     ggml_backend_dev_get_props(dev, props);
 }
-void device_print_props(struct device_info * dev_info_set, int n) {
+void device_print_props(struct device_info * dev_info_set, int n, struct llama_model * model) {
     LOG_INF("\n-------------------------------------------------------------------------------------------\n");
     LOG_INF("| Property ");
     for (int i = 0; i < n; ++i) {
@@ -612,6 +607,38 @@ void device_print_props(struct device_info * dev_info_set, int n) {
     }
     LOG_INF("\n");
+    LOG_INF("| Model flops (input) ");
+    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.input_flops);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (each layer) ");
+    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.layer_flops);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (output) ");
+    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.output_flops);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (input) ");
+    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.input_params);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (each layer) ");
+    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.layer_params);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (output) ");
+    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.output_params);
+    LOG_INF("\n");
+
+    model_flops ffo = dev_info_set[0].model_flops;
+    int64_t total_flops = ffo.input_flops + ffo.output_flops + (ffo.layer_flops * llama_model_n_layers(model));
+    double cpu_flops_f16 = dev_info_set[0].cpu_props.flops_f16 * 1e9;
+
+    LOG_INF("| Token latency (ms) ");
+    LOG_INF("| %-10.2f ", total_flops / cpu_flops_f16 * 1000);
+    LOG_INF("\n");
+
     LOG_INF("-------------------------------------------------------------------------------------------\n\n");
 }
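
The final rows derive a rough per-token latency: the input-stage and output-stage FLOPs plus the per-layer FLOPs scaled by the layer count, divided by the device's measured f16 throughput (stored in GFLOPS, hence the 1e9 factor). A self-contained sketch of the same arithmetic, with made-up placeholder numbers in place of profiler measurements:

// Sketch of the "Token latency (ms)" estimate above, using invented numbers;
// real values come from llama_model_n_flops() and the CPU f16 benchmark.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t input_flops  = 50LL  * 1000 * 1000;  // placeholder: FLOPs for the input stage
    const int64_t layer_flops  = 400LL * 1000 * 1000;  // placeholder: FLOPs per transformer layer
    const int64_t output_flops = 300LL * 1000 * 1000;  // placeholder: FLOPs for the output stage
    const int64_t n_layer      = 32;                   // placeholder: number of layers

    const double total_flops   = (double)input_flops + (double)output_flops
                               + (double)layer_flops * (double)n_layer;
    const double cpu_flops_f16 = 120.0 * 1e9;          // placeholder: 120 GFLOPS measured at f16

    // latency (ms) = work / throughput * 1000
    std::printf("estimated token latency: %.2f ms\n", total_flops / cpu_flops_f16 * 1000.0);
}

As in the printed row, this is a compute-only estimate: only FLOPs and the CPU f16 throughput enter the formula, so memory bandwidth and any GPU offload are not accounted for.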
@@ -711,6 +738,7 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->gpu_props.cuda_flops_q4k, sizeof(float));
+    // no need to synchronize model flops
     return total_size;
 }
@@ -799,4 +827,6 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     ptr += sizeof(float);
     memcpy(&dev_info->gpu_props.cuda_flops_q4k, ptr, sizeof(float));
+
+    // no need to synchronize model flops
 }

View file

@@ -54,20 +54,7 @@ struct gpu_props {
         : name(""), description(""), memory_free(0.0f), memory_total(0.0f), metal_flops(0.0f), cuda_flops_f32(0.0f), cuda_flops_f16(0.0f), cuda_flops_q8(0.0f), cuda_flops_q4k(0.0f) {}
 };
-struct device_info {
-    uint32_t rank;
-    const char * device_name;
-    float disk_read_bandwidth; // in GB/s
-    struct cpu_props cpu_props;
-    struct memory_info memory;
-    struct gpu_support gpu_support;
-    struct gpu_props gpu_props;
-
-    device_info()
-        : rank(0), device_name(""), disk_read_bandwidth(0.0f), cpu_props(), memory(), gpu_support(), gpu_props() {}
-};
-
-struct flops_info {
+struct model_flops {
     // model flops
     int64_t input_flops;
     int64_t output_flops;
@@ -78,10 +65,24 @@ struct flops_info {
     int64_t output_params;
     int64_t layer_params;
-    flops_info()
+    model_flops()
         : input_flops(0), output_flops(0), layer_flops(0), input_params(0), output_params(0), layer_params(0) {}
 };
+
+struct device_info {
+    uint32_t rank;
+    const char * device_name;
+    float disk_read_bandwidth; // in GB/s
+    struct cpu_props cpu_props;
+    struct memory_info memory;
+    struct gpu_support gpu_support;
+    struct gpu_props gpu_props;
+    struct model_flops model_flops;
+
+    device_info()
+        : rank(0), device_name(""), disk_read_bandwidth(0.0f), cpu_props(), memory(), gpu_support(), gpu_props(), model_flops() {}
+};
+
 enum profiler_backend_type {
     PROFILER_BACKEND_TYPE_CPU = 0,
     PROFILER_BACKEND_TYPE_METAL = 1,
@@ -99,7 +100,7 @@ uint64_t device_swap_memory (bool available);
 uint64_t device_disk_read_bw (const char * test_file, size_t buffer_size_mb);
 uint64_t device_memory_bw (size_t buffer_size_mb);
 void device_get_props (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
-void device_print_props (struct device_info * dev_info_set, int n);
+void device_print_props (struct device_info * dev_info_set, int n, struct llama_model * model);
 int device_has_metal (void);
 int device_has_cuda (void);

View file

@@ -528,7 +528,7 @@ extern "C" {
     LLAMA_API void llama_model_n_flops(
                 struct llama_model * model,
                 struct llama_model_loader * ml,
-               struct flops_info * ffo,
+               struct model_flops * ffo,
                 const int64_t n_input,
                 const int64_t n_history);
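
Based only on this declaration and the call site in llama_profile_device() below (which passes n_input = 1 and n_history = 10), a hedged usage fragment might look as follows. It assumes a loaded model and its llama_model_loader are already in scope, so it is not compilable on its own.

// Usage fragment (assumes `model` and `ml` already exist, as in llama_profile_device()).
struct model_flops ffo;                          // zeroed by its default constructor
llama_model_n_flops(model, ml, &ffo, /*n_input=*/1, /*n_history=*/10);

// Per-token work, aggregated the same way device_print_props() does:
int64_t total_flops = ffo.input_flops
                    + ffo.output_flops
                    + ffo.layer_flops * llama_model_n_layers(model);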

View file

@@ -110,7 +110,7 @@ struct Timer {
     ~Timer() {
         if (enable_timer) {
             int64_t end_time = ggml_time_us();
-            LLAMA_LOG_INFO("Time to run %s: %lld ms\n", name, (end_time - start_time)/1000);
+            LLAMA_LOG_INFO("Time to run %s: %lu ms\n", name, (end_time - start_time)/1000);
         }
     }
 };
@@ -3587,14 +3587,8 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll
     dev_info->gpu_props.cuda_flops_q4k = device_cuda_flops(model, GGML_TYPE_Q4_K);
     if (dev_info->rank == 0) {
-        struct flops_info ffo = flops_info{};
-        llama_model_n_flops(model, ml, &ffo, 1, 10);
-        LLAMA_LOG_INFO("input_flops: %llu\n", ffo.input_flops);
-        LLAMA_LOG_INFO("output_flops: %llu\n", ffo.output_flops);
-        LLAMA_LOG_INFO("layer_flops: %llu\n", ffo.layer_flops);
-        LLAMA_LOG_INFO("input_params: %llu\n", ffo.input_params);
-        LLAMA_LOG_INFO("output_params: %llu\n", ffo.output_params);
-        LLAMA_LOG_INFO("layer_params: %llu\n", ffo.layer_params);
+        struct model_flops * ffo = &dev_info->model_flops;
+        llama_model_n_flops(model, ml, ffo, 1, 10);
     }
 }
@@ -20668,7 +20662,7 @@ static void llama_model_reset_tensors(struct llama_model * model) {
     model->cls_out_b = nullptr;
 }
-void llama_model_n_flops(struct llama_model * model, struct llama_model_loader * ml, struct flops_info * ffo, const int64_t n_input, const int64_t n_history) {
+void llama_model_n_flops(struct llama_model * model, struct llama_model_loader * ml, struct model_flops * ffo, const int64_t n_input, const int64_t n_history) {
     const llama_hparams hparams = model->hparams;
     const int64_t n_layer = hparams.n_layer;
     const int64_t n_vocab = hparams.n_vocab;