diff --git a/Makefile b/Makefile
index f2fe5c24..3ff09f78 100644
--- a/Makefile
+++ b/Makefile
@@ -1,45 +1,46 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = \
-	libllava.a \
-	llama-baby-llama \
-	llama-batched \
-	llama-batched-bench \
-	llama-bench \
-	llama-cli \
-	llama-convert-llama2c-to-ggml \
-	llama-embedding \
-	llama-eval-callback \
-	llama-export-lora \
-	llama-gbnf-validator \
-	llama-gguf \
-	llama-gguf-hash \
-	llama-gguf-split \
-	llama-gritlm \
-	llama-imatrix \
-	llama-infill \
-	llama-llava-cli \
-	llama-minicpmv-cli\
-	llama-lookahead \
-	llama-lookup \
-	llama-lookup-create \
-	llama-lookup-merge \
-	llama-lookup-stats \
-	llama-parallel \
-	llama-passkey \
-	llama-perplexity \
-	llama-q8dot \
-	llama-quantize \
-	llama-quantize-stats \
-	llama-retrieval \
-	llama-save-load-state \
-	llama-server \
-	llama-simple \
-	llama-speculative \
-	llama-tokenize \
-	llama-vdot \
-	llama-cvector-generator \
-	llama-gen-docs \
-	tests/test-c.o
+BUILD_TARGETS = llama-cli
+# BUILD_TARGETS = \
+#	libllava.a \
+#	llama-baby-llama \
+#	llama-batched \
+#	llama-batched-bench \
+#	llama-bench \
+#	llama-cli \
+#	llama-convert-llama2c-to-ggml \
+#	llama-embedding \
+#	llama-eval-callback \
+#	llama-export-lora \
+#	llama-gbnf-validator \
+#	llama-gguf \
+#	llama-gguf-hash \
+#	llama-gguf-split \
+#	llama-gritlm \
+#	llama-imatrix \
+#	llama-infill \
+#	llama-llava-cli \
+#	llama-minicpmv-cli\
+#	llama-lookahead \
+#	llama-lookup \
+#	llama-lookup-create \
+#	llama-lookup-merge \
+#	llama-lookup-stats \
+#	llama-parallel \
+#	llama-passkey \
+#	llama-perplexity \
+#	llama-q8dot \
+#	llama-quantize \
+#	llama-quantize-stats \
+#	llama-retrieval \
+#	llama-save-load-state \
+#	llama-server \
+#	llama-simple \
+#	llama-speculative \
+#	llama-tokenize \
+#	llama-vdot \
+#	llama-cvector-generator \
+#	llama-gen-docs \
+#	tests/test-c.o
 
 # Binaries only useful for tests
 TEST_TARGETS = \
diff --git a/common/common.cpp b/common/common.cpp
index a2f7b4cc..a2690a3c 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -890,9 +890,11 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     }
 
     // get device profile
+    LOG_INF("Start profiling this device, this may take a few seconds ...\n");
+
     device_info dev_info;
     dev_info.rank = params.rank;
-    llama_profile_device(&dev_info, model, params.model.c_str());
+    llama_profile_device(&dev_info, model, params.model.c_str(), params.cpuparams.n_threads);
 
     // create llama context
     struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
diff --git a/common/profiler.cpp b/common/profiler.cpp
index 44abfe6a..b4505c42 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -73,6 +73,69 @@ uint32_t device_cpu_cores() {
     return core_count;
 }
 
+float device_cpu_flops(struct llama_model * model, enum ggml_type dtype, int n_threads) {
+    // define matrix dimensions
+    const int n_embd = llama_n_embd(model);
+    const int n_ff_hidden = llama_n_ff_hidden(model);
+    const int rows_A = n_embd, cols_A = n_ff_hidden;
+    const int rows_B = n_embd, cols_B = n_ff_hidden;
+
+    // calculate memory size needed for ggml_context allocation
+    size_t ctx_size = 0;
+    ctx_size += rows_A * cols_A * ggml_type_size(dtype); // tensor a
+    ctx_size += rows_B * cols_B * ggml_type_size(dtype); // tensor b
+    ctx_size += rows_A * rows_B * ggml_type_size(dtype); // result
+    ctx_size += 3 * ggml_tensor_overhead(); // metadata for 3 tensors
+    ctx_size += ggml_graph_overhead(); // compute graph
+    ctx_size  = (size_t)(ctx_size * 1.2); // some overhead
+
+    // allocate ggml_context
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ ctx_size,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ false,
+    };
+    struct ggml_context * ctx = ggml_init(params);
+
+    // create tensors and set data
+    struct ggml_tensor * tensor_a = ggml_new_tensor_2d(ctx, dtype, cols_A, rows_A);
+    struct ggml_tensor * tensor_b = ggml_new_tensor_2d(ctx, dtype, cols_B, rows_B);
+
+    // fill tensors with random data
+    float * matrix_A = (float *)malloc(rows_A * cols_A * sizeof(float));
+    float * matrix_B = (float *)malloc(rows_B * cols_B * sizeof(float));
+
+    for (int i = 0; i < rows_A * cols_A; i++) {
+        matrix_A[i] = (float)(rand() % 100) / 10.0f; // random float between 0.0 and 10.0
+    }
+    for (int i = 0; i < rows_B * cols_B; i++) {
+        matrix_B[i] = (float)(rand() % 100) / 10.0f;
+    }
+
+    memcpy(tensor_a->data, matrix_A, ggml_nbytes(tensor_a));
+    memcpy(tensor_b->data, matrix_B, ggml_nbytes(tensor_b));
+
+    free(matrix_A);
+    free(matrix_B);
+
+    // create ggml_cgraph for multiplication
+    struct ggml_cgraph * gf = ggml_new_graph(ctx);
+    struct ggml_tensor * result = ggml_mul_mat(ctx, tensor_a, tensor_b);
+    ggml_build_forward_expand(gf, result);
+
+    // run the computation
+    int64_t start_time = ggml_time_us();
+    ggml_graph_compute_with_ctx(ctx, gf, n_threads);
+    int64_t end_time = ggml_time_us();
+
+    double elapsed_seconds = (end_time - start_time) / 1e6;
+    double flops = (2.0 * rows_A * rows_B * cols_A) / elapsed_seconds / 1e9; // 2*M*N*K for the (rows_A x rows_B) result with inner dim cols_A, in GFLOPS
+
+    // free memory
+    ggml_free(ctx);
+    return (float)flops;
+}
+
 uint64_t device_physical_memory(bool available) {
     uint64_t memory = 0;
 
@@ -344,6 +407,18 @@ void device_print_props(struct device_info * dev_info_set, int n) {
     }
     LOG_INF("\n");
 
+    LOG_INF("| CPU flops (F32, GFLOPS) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CPU flops (F16, GFLOPS) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_f16);
+    }
+    LOG_INF("\n");
+
     LOG_INF("| Physical Mem Total (GB) ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].memory.total_physical);
@@ -467,6 +542,7 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
         + gpu_description_len
         + sizeof(float) // disk_read_bandwidth
        + sizeof(uint32_t) // cpu_props.cores
+        + sizeof(float) * 2 // cpu_props.flops_f32 and cpu_props.flops_f16
         + sizeof(struct memory_info)
         + sizeof(struct gpu_support)
         + sizeof(float) * 2; // gpu_props.memory_free and gpu_props.memory_total
@@ -511,6 +587,12 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->cpu_props.cores, sizeof(uint32_t));
     ptr += sizeof(uint32_t);
 
+    memcpy(ptr, &dev_info->cpu_props.flops_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->cpu_props.flops_f16, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(ptr, &dev_info->memory, sizeof(struct memory_info));
     ptr += sizeof(struct memory_info);
 
@@ -579,6 +661,12 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->cpu_props.cores, ptr, sizeof(uint32_t));
     ptr += sizeof(uint32_t);
 
+    memcpy(&dev_info->cpu_props.flops_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->cpu_props.flops_f16, ptr, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(&dev_info->memory, ptr, sizeof(struct memory_info));
     ptr += sizeof(struct memory_info);
 
diff --git a/common/profiler.h b/common/profiler.h
index b768b7cc..fc6d1b8a 100644
--- a/common/profiler.h
+++ b/common/profiler.h
@@ -1,12 +1,15 @@
 #ifndef PROFILER_H
 #define PROFILER_H
 
+#include "ggml.h"
 #include "llama.h"
 
 struct cpu_props {
     const char * name;
     const char * description;
     uint32_t cores;
+    float flops_f32;
+    float flops_f16;
 };
 
 struct memory_info {
@@ -47,6 +50,7 @@ struct device_info {
 const char * device_name(void);
 
 uint32_t device_cpu_cores      (void);
+float    device_cpu_flops      (struct llama_model * model, enum ggml_type dtype, int n_threads);
 uint64_t device_physical_memory(bool available);
 uint64_t device_swap_memory    (bool available);
 uint64_t device_disk_read_bw   (const char * test_file, size_t buffer_size_mb);
diff --git a/include/llama.h b/include/llama.h
index ff7d1599..1c61f624 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -410,7 +410,7 @@ extern "C" {
     // Call once at the start of the program
     LLAMA_API void llama_backend_init(void);
 
-    LLAMA_API void llama_profile_device (struct device_info * dev_info, struct llama_model * model, const char * test_file);
+    LLAMA_API void llama_profile_device (struct device_info * dev_info, struct llama_model * model, const char * test_file, int n_threads);
     LLAMA_API ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device);
 
     //optional:
@@ -476,6 +476,7 @@ extern "C" {
     LLAMA_API int32_t llama_n_embd     (const struct llama_model * model);
     LLAMA_API int32_t llama_n_layer    (const struct llama_model * model);
     LLAMA_API int32_t llama_n_head     (const struct llama_model * model);
+    LLAMA_API int32_t llama_n_ff_hidden(const struct llama_model * model);
 
     LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
 
diff --git a/src/llama.cpp b/src/llama.cpp
index b77e7ca9..a68ecfbc 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3546,9 +3546,11 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
     GGML_UNUSED(model);
 }
 
-void llama_profile_device(device_info * dev_info, struct llama_model * model, const char * test_file) {
+void llama_profile_device(device_info * dev_info, struct llama_model * model, const char * test_file, int n_threads) {
     dev_info->device_name = device_name();
     dev_info->cpu_props.cores = device_cpu_cores();
+    dev_info->cpu_props.flops_f32 = device_cpu_flops(model, GGML_TYPE_F32, n_threads);
+    dev_info->cpu_props.flops_f16 = device_cpu_flops(model, GGML_TYPE_F16, n_threads);
 
     dev_info->memory.total_physical = round(device_physical_memory(false) / (double)(1 << 30) * 100) / 100;
     dev_info->memory.available_physical = round(device_physical_memory(true) / (double)(1 << 30) * 100) / 100;
@@ -20429,6 +20431,10 @@ int32_t llama_n_head(const struct llama_model * model) {
     return model->hparams.n_head();
 }
 
+int32_t llama_n_ff_hidden(const struct llama_model * model) {
+    return model->hparams.n_ff_arr[0];
+}
+
 const struct llama_model * llama_get_model(const struct llama_context * ctx) {
     return &ctx->model;
 }
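
Note (not part of the patch): the sketch below illustrates how the extended profiling API is intended to be used. device_cpu_flops() times a single n_embd x n_ff_hidden by n_embd x n_ff_hidden matmul (shared inner dimension n_ff_hidden), roughly 2 * n_embd * n_embd * n_ff_hidden floating-point operations, and llama_profile_device() stores the result in GFLOPS in cpu_props.flops_f32 / flops_f16, which serialize()/deserialize() then carry to other ranks. The model path, thread count, and the standalone main() are illustrative placeholders, error handling is omitted, and the loader calls are the stock llama.cpp entry points.

// Hypothetical caller, mirroring the common.cpp change above; not part of the patch.
#include "llama.h"
#include "profiler.h" // assumes common/ is on the include path, as for common.cpp

#include <cstdio>

int main() {
    llama_backend_init();

    // load any GGUF model; its n_embd / n_ff_hidden define the benchmark matrix shapes
    struct llama_model_params mparams = llama_model_default_params();
    struct llama_model * model = llama_load_model_from_file("/path/to/model.gguf", mparams);
    if (model == nullptr) {
        return 1;
    }

    struct device_info dev_info;
    dev_info.rank = 0; // rank of this node, normally taken from gpt_params as in common.cpp

    // the model path doubles as the test file for the disk read-bandwidth probe
    llama_profile_device(&dev_info, model, "/path/to/model.gguf", /*n_threads=*/4);

    printf("CPU flops (F32): %.1f GFLOPS\n", dev_info.cpu_props.flops_f32);
    printf("CPU flops (F16): %.1f GFLOPS\n", dev_info.cpu_props.flops_f16);

    llama_free_model(model);
    llama_backend_free();
    return 0;
}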