diff --git a/common/common.cpp b/common/common.cpp
index 4c228626..1a62d260 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -914,7 +914,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info));
         dev_info_set[0] = dev_info;
         llama_gather_device_info(lctx, dev_info_set);
-        device_print_props(dev_info_set, n_world);
+        device_print_props(dev_info_set, n_world, model);
     } else {
         llama_send_device_info(lctx, &dev_info);
     }
diff --git a/common/profiler.cpp b/common/profiler.cpp
index 3bc44de2..05fb4ba0 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -23,6 +23,7 @@
 #include "ggml-cuda.h"
 #endif

+#include 
 #include 
 #include 
 #include 
@@ -82,17 +83,11 @@ uint32_t device_cpu_cores() {
 }

 static float device_flops(struct llama_model * model, enum ggml_type dtype, profiler_backend_type btype, int n_threads) {
-    const int n_embd      = llama_n_embd(model);
-    const int n_ff_hidden = llama_n_ff_hidden(model);
-    const int rows_A = n_embd, cols_A = n_ff_hidden;
-    const int rows_B = n_embd, cols_B = n_ff_hidden;
-    GGML_ASSERT(cols_A == cols_B);
-
-    std::vector<float> matrix_A(cols_A * rows_A, 1.0f);
-    std::vector<float> matrix_B(cols_B * rows_B, 1.0f / cols_B);
+    const int n_embd = llama_n_embd(model);
+    std::vector<float> matrix_A(n_embd * n_embd, 1.0f);
+    std::vector<float> matrix_B(n_embd * n_embd, 1.0f / n_embd);

     ggml_backend_t backend = NULL;
-
     switch (btype) {
         case PROFILER_BACKEND_TYPE_CPU:
             backend = ggml_backend_cpu_init();
@@ -124,15 +119,15 @@ static float device_flops(struct llama_model * model, enum ggml_type dtype, prof
     };
     struct ggml_context * ctx = ggml_init(params);

-    struct ggml_tensor * tensor_a = ggml_new_tensor_2d(ctx, dtype, cols_A, rows_A);
-    struct ggml_tensor * tensor_b = ggml_new_tensor_2d(ctx, dtype, cols_B, rows_B);
+    struct ggml_tensor * tensor_a = ggml_new_tensor_2d(ctx, dtype, n_embd, n_embd);
+    struct ggml_tensor * tensor_b = ggml_new_tensor_2d(ctx, dtype, n_embd, n_embd);

     ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);

     ggml_backend_tensor_set(tensor_a, matrix_A.data(), 0, ggml_nbytes(tensor_a));
     ggml_backend_tensor_set(tensor_b, matrix_B.data(), 0, ggml_nbytes(tensor_b));

-    struct ggml_cgraph * gf = NULL;
+    struct ggml_cgraph * gf = NULL;
     struct ggml_context * ctx_cgraph = NULL;
     {
         struct ggml_init_params params0 = {
@@ -162,7 +157,7 @@ static float device_flops(struct llama_model * model, enum ggml_type dtype, prof
     const int64_t t_end = ggml_time_us();

     double elapsed_seconds = ((double)t_end - (double)t_start) / 1e6; // convert to seconds
-    double flops = (2.0 * (double)cols_A * (double)rows_A * (double)rows_B) / elapsed_seconds / 1e9; // convert to GFLOPS
+    double flops = (2.0 * (double)n_embd * (double)n_embd * (double)n_embd) / elapsed_seconds / 1e9; // convert to GFLOPS

     ggml_free(ctx_cgraph);
     ggml_gallocr_free(allocr);
@@ -435,7 +430,7 @@ void device_get_props(struct llama_model * model, int device, struct ggml_backen
     ggml_backend_dev_get_props(dev, props);
 }

-void device_print_props(struct device_info * dev_info_set, int n) {
+void device_print_props(struct device_info * dev_info_set, int n, struct llama_model * model) {
     LOG_INF("\n-------------------------------------------------------------------------------------------\n");
     LOG_INF("| Property                   ");
     for (int i = 0; i < n; ++i) {
@@ -612,6 +607,38 @@ void device_print_props(struct device_info * dev_info_set, int n) {
     }
     LOG_INF("\n");

+    LOG_INF("| Model flops (input)        ");
+    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.input_flops);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (each layer)   ");
+    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.layer_flops);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (output)       ");
+    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.output_flops);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (input)       ");
+    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.input_params);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (each layer)  ");
+    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.layer_params);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (output)      ");
+    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.output_params);
+    LOG_INF("\n");
+
+    model_flops ffo = dev_info_set[0].model_flops;
+    int64_t total_flops   = ffo.input_flops + ffo.output_flops + (ffo.layer_flops * llama_model_n_layers(model));
+    double  cpu_flops_f16 = dev_info_set[0].cpu_props.flops_f16 * 1e9;
+
+    LOG_INF("| Token latency (ms)         ");
+    LOG_INF("| %-10.2f ", total_flops / cpu_flops_f16 * 1000);
+    LOG_INF("\n");
+
     LOG_INF("-------------------------------------------------------------------------------------------\n\n");
 }

@@ -711,6 +738,7 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {

     memcpy(ptr, &dev_info->gpu_props.cuda_flops_q4k, sizeof(float));

+    // no need to synchronize model flops
     return total_size;
 }

@@ -799,4 +827,6 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     ptr += sizeof(float);

     memcpy(&dev_info->gpu_props.cuda_flops_q4k, ptr, sizeof(float));
+
+    // no need to synchronize model flops
 }
diff --git a/common/profiler.h b/common/profiler.h
index 8d796769..c9f046a1 100644
--- a/common/profiler.h
+++ b/common/profiler.h
@@ -54,20 +54,7 @@ struct gpu_props {
         : name(""), description(""), memory_free(0.0f), memory_total(0.0f), metal_flops(0.0f), cuda_flops_f32(0.0f), cuda_flops_f16(0.0f), cuda_flops_q8(0.0f), cuda_flops_q4k(0.0f) {}
 };

-struct device_info {
-    uint32_t rank;
-    const char * device_name;
-    float disk_read_bandwidth; // in GB/s
-    struct cpu_props cpu_props;
-    struct memory_info memory;
-    struct gpu_support gpu_support;
-    struct gpu_props gpu_props;
-
-    device_info()
-        : rank(0), device_name(""), disk_read_bandwidth(0.0f), cpu_props(), memory(), gpu_support(), gpu_props() {}
-};
-
-struct flops_info {
+struct model_flops {
     // model flops
     int64_t input_flops;
     int64_t output_flops;
@@ -78,10 +65,24 @@
     int64_t output_params;
     int64_t layer_params;

-    flops_info()
+    model_flops()
         : input_flops(0), output_flops(0), layer_flops(0), input_params(0), output_params(0), layer_params(0) {}
 };

+struct device_info {
+    uint32_t rank;
+    const char * device_name;
+    float disk_read_bandwidth; // in GB/s
+    struct cpu_props cpu_props;
+    struct memory_info memory;
+    struct gpu_support gpu_support;
+    struct gpu_props gpu_props;
+    struct model_flops model_flops;
+
+    device_info()
+        : rank(0), device_name(""), disk_read_bandwidth(0.0f), cpu_props(), memory(), gpu_support(), gpu_props(), model_flops() {}
+};
+
 enum profiler_backend_type {
     PROFILER_BACKEND_TYPE_CPU   = 0,
     PROFILER_BACKEND_TYPE_METAL = 1,
@@ -99,7 +100,7 @@
 uint64_t device_swap_memory   (bool available);
 uint64_t device_disk_read_bw  (const char * test_file, size_t buffer_size_mb);
 uint64_t device_memory_bw     (size_t buffer_size_mb);
 void     device_get_props     (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
-void     device_print_props   (struct device_info * dev_info_set, int n);
+void     device_print_props   (struct device_info * dev_info_set, int n, struct llama_model * model);
 int      device_has_metal     (void);
 int      device_has_cuda      (void);
diff --git a/include/llama.h b/include/llama.h
index 5506ad0a..7fe18d39 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -528,7 +528,7 @@ extern "C" {
     LLAMA_API void llama_model_n_flops(
                 struct llama_model * model,
          struct llama_model_loader * ml,
-                 struct flops_info * ffo,
+                struct model_flops * ffo,
                      const int64_t   n_input,
                      const int64_t   n_history);

diff --git a/src/llama.cpp b/src/llama.cpp
index 5532585b..303f451d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -110,7 +110,7 @@ struct Timer {
     ~Timer() {
         if (enable_timer) {
             int64_t end_time = ggml_time_us();
-            LLAMA_LOG_INFO("Time to run %s: %lld ms\n", name, (end_time - start_time)/1000);
+            LLAMA_LOG_INFO("Time to run %s: %lu ms\n", name, (end_time - start_time)/1000);
         }
     }
 };
@@ -3587,14 +3587,8 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll
     dev_info->gpu_props.cuda_flops_q4k = device_cuda_flops(model, GGML_TYPE_Q4_K);

     if (dev_info->rank == 0) {
-        struct flops_info ffo = flops_info{};
-        llama_model_n_flops(model, ml, &ffo, 1, 10);
-        LLAMA_LOG_INFO("input_flops: %llu\n", ffo.input_flops);
-        LLAMA_LOG_INFO("output_flops: %llu\n", ffo.output_flops);
-        LLAMA_LOG_INFO("layer_flops: %llu\n", ffo.layer_flops);
-        LLAMA_LOG_INFO("input_params: %llu\n", ffo.input_params);
-        LLAMA_LOG_INFO("output_params: %llu\n", ffo.output_params);
-        LLAMA_LOG_INFO("layer_params: %llu\n", ffo.layer_params);
+        struct model_flops * ffo = &dev_info->model_flops;
+        llama_model_n_flops(model, ml, ffo, 1, 10);
     }
 }

@@ -20668,7 +20662,7 @@ static void llama_model_reset_tensors(struct llama_model * model) {
     model->cls_out_b = nullptr;
 }

-void llama_model_n_flops(struct llama_model * model, struct llama_model_loader * ml, struct flops_info * ffo, const int64_t n_input, const int64_t n_history) {
+void llama_model_n_flops(struct llama_model * model, struct llama_model_loader * ml, struct model_flops * ffo, const int64_t n_input, const int64_t n_history) {
     const llama_hparams hparams = model->hparams;
     const int64_t n_layer = hparams.n_layer;
     const int64_t n_vocab = hparams.n_vocab;
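
Note (not part of the patch): the token-latency row added to device_print_props() estimates latency as (input_flops + output_flops + n_layers * layer_flops) / (flops_f16 * 1e9) * 1000 ms, and the reworked device_flops() probe reports 2 * n_embd^3 / elapsed / 1e9 GFLOPS for its square matmul. Below is a minimal standalone sketch of both formulas; all numbers are made up, and model_flops_sketch / matmul_gflops are stand-ins for the real profiler types rather than code from this patch.

// sketch.cpp -- illustration only, not part of the patch above.
#include <cstdint>
#include <cstdio>

// Stand-in for struct model_flops in common/profiler.h (per-token FLOPs).
struct model_flops_sketch {
    int64_t input_flops;
    int64_t output_flops;
    int64_t layer_flops;
};

// GFLOPS figure reported by the reworked device_flops() probe: one
// n_embd x n_embd matmul costs 2 * n_embd^3 floating point operations.
static double matmul_gflops(int n_embd, double elapsed_seconds) {
    return 2.0 * n_embd * n_embd * n_embd / elapsed_seconds / 1e9;
}

int main() {
    // Hypothetical profiler results standing in for dev_info_set[0].
    model_flops_sketch ffo = { 50000000, 500000000, 800000000 };
    int    n_layers        = 32;     // would come from llama_model_n_layers(model)
    double cpu_gflops_f16  = 120.0;  // cpu_props.flops_f16 is stored in GFLOPS

    // Same arithmetic as the new table row: FLOPs / FLOPS -> seconds, * 1000 -> ms.
    double total_flops   = (double)ffo.input_flops + (double)ffo.output_flops
                         + (double)ffo.layer_flops * n_layers;
    double cpu_flops_f16 = cpu_gflops_f16 * 1e9;

    printf("estimated token latency: %.2f ms\n", total_flops / cpu_flops_f16 * 1000.0);
    printf("probe example: %.1f GFLOPS\n", matmul_gflops(4096, 0.9));
    return 0;
}

With these hypothetical inputs the sketch prints roughly 218 ms and 153 GFLOPS; the patch performs the same arithmetic on real profiler measurements.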