From 407c71ae525b8076f78d6b866af450af04424e70 Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Wed, 6 Nov 2024 20:42:28 +0400
Subject: [PATCH] add cpu and gpu profile

---
 Makefile            |  2 +-
 common/common.cpp   | 51 ++++++++++++++++++++++++++++++++++++++-------
 common/profiler.cpp | 44 ++++++++++++++++++++++++++++++++++++++
 common/profiler.h   | 22 ++++++++++++++-----
 include/llama.h     |  2 ++
 src/llama.cpp       |  8 +++++--
 6 files changed, 113 insertions(+), 16 deletions(-)

diff --git a/Makefile b/Makefile
index ba931f2b..3521b2b9 100644
--- a/Makefile
+++ b/Makefile
@@ -1175,7 +1175,7 @@ $(LIB_LLAMA_S): \
 
 common/profiler.o: \
 	common/profiler.cpp \
-	common/profiler.h
+	common/profiler.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 common/common.o: \
diff --git a/common/common.cpp b/common/common.cpp
index a491449c..0dce48f0 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -843,15 +843,50 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     uint64_t available_swap = profiler::device_swap_memory(true);
     uint64_t disk_read_bw   = profiler::device_disk_read_bw(params.model.c_str(), 500);
     uint64_t memory_bw      = profiler::device_memory_bw(500);
+    int has_metal   = profiler::device_has_metal();
+    int has_cuda    = profiler::device_has_cuda();
+    int has_vulkan  = profiler::device_has_vulkan();
+    int has_kompute = profiler::device_has_kompute();
+    int has_gpublas = profiler::device_has_gpublas();
+    int has_blas    = profiler::device_has_blas();
+    int has_sycl    = profiler::device_has_sycl();
+    ggml_backend_dev_props cpu_props;
+    ggml_backend_dev_props gpu_props;
+    profiler::device_get_props(model, -1, &cpu_props); // -1 for cpu
+    profiler::device_get_props(model,  0, &gpu_props); //  0 for gpu0
 
-    LOG_INF("Device Name: %s\n", dev_name);
-    LOG_INF("Number of CPU cores: %u\n", n_cpu_cores);
-    LOG_INF("Total Physical Memory: %.2f GB\n", total_memory / (double)(1 << 30));
-    LOG_INF("Available Physical Memory: %.2f GB\n", available_memory / (double)(1 << 30));
-    LOG_INF("Total Swap Memory: %.2f GB\n", total_swap / (double)(1 << 30));
-    LOG_INF("Available Swap Memory: %.2f GB\n", available_swap / (double)(1 << 30));
-    LOG_INF("Disk Read Bandwidth: %.2f GB/s\n", disk_read_bw / (double)(1 << 30));
-    LOG_INF("Memory Bandwidth: %.2f GB/s\n", memory_bw / (double)(1 << 30));
+    LOG_INF("\n");
+    LOG_INF("Device Info:\n");
+    LOG_INF("  Device Name         : %s\n", dev_name);
+    LOG_INF("  CPU Name            : %s\n", cpu_props.name);
+    LOG_INF("  CPU Description     : %s\n", cpu_props.description);
+    LOG_INF("  Number of CPU cores : %u\n", n_cpu_cores);
+    LOG_INF("  Disk Read Bandwidth : %.2f GB/s\n", disk_read_bw / (double)(1 << 30));
+    LOG_INF("\n");
+
+    LOG_INF("Memory Information:\n");
+    LOG_INF("  Physical Mem Total     : %.2f GB\n", total_memory / (double)(1 << 30));
+    LOG_INF("  Physical Mem Available : %.2f GB\n", available_memory / (double)(1 << 30));
+    LOG_INF("  Swap Memory Total      : %.2f GB\n", total_swap / (double)(1 << 30));
+    LOG_INF("  Swap Memory Available  : %.2f GB\n", available_swap / (double)(1 << 30));
+    LOG_INF("  Mem Bandwidth          : %.2f GB/s\n", memory_bw / (double)(1 << 30));
+    LOG_INF("\n");
+
+    LOG_INF("GPU Support:\n");
+    LOG_INF("  Metal    : %i\n", has_metal);
+    LOG_INF("  CUDA     : %i\n", has_cuda);
+    LOG_INF("  Vulkan   : %i\n", has_vulkan);
+    LOG_INF("  Kompute  : %i\n", has_kompute);
+    LOG_INF("  GPU BLAS : %i\n", has_gpublas);
+    LOG_INF("  BLAS     : %i\n", has_blas);
+    LOG_INF("  SYCL     : %i\n", has_sycl);
+    LOG_INF("\n");
+
+    LOG_INF("GPU Properties:\n");
+    LOG_INF("  GPU Name     : %s\n", gpu_props.name);
+    LOG_INF("  Description  : %s\n", gpu_props.description);
+    LOG_INF("  Memory Free  : %d MB\n", (int)(gpu_props.memory_free / (double)(1 << 20)));
+    LOG_INF("  Memory Total : %.2f GB\n", gpu_props.memory_total / (double)(1 << 30));
 
     if (model == NULL) {
         LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
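The logging hunk above converts raw byte counts with integer power-of-two divisors. A minimal standalone sketch of that idiom (to_gib and to_mib are illustrative helper names, not functions from this patch):

    // Sketch of the byte-count conversions used in the LOG_INF calls above.
    // to_gib/to_mib and the sample values are illustrative only.
    #include <cstdint>
    #include <cstdio>

    static double to_gib(uint64_t bytes) { return bytes / (double)(1 << 30); } // bytes -> GiB
    static double to_mib(uint64_t bytes) { return bytes / (double)(1 << 20); } // bytes -> MiB

    int main() {
        uint64_t total_memory = 16ULL * (1 << 30);  // suppose the profiler reports 16 GiB of RAM
        uint64_t memory_free  = 512ULL * (1 << 20); // and 512 MiB of free GPU memory
        std::printf("Physical Mem Total : %.2f GB\n", to_gib(total_memory));   // prints 16.00
        std::printf("Memory Free        : %d MB\n", (int)to_mib(memory_free)); // prints 512
        return 0;
    }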
diff --git a/common/profiler.cpp b/common/profiler.cpp
index 0cbf96e5..2bcf1661 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -1,5 +1,8 @@
 #include "log.h"
 #include "profiler.h"
+#include "ggml.h"
+#include "ggml-backend.h"
+#include "llama.h"
 
 #if defined(_WIN32) || defined(_WIN64)
 #include <windows.h>
@@ -271,4 +274,45 @@ uint64_t device_memory_bw(size_t buffer_size_mb) {
     return speed;
 }
 
+int device_has_metal(void) {
+    return ggml_cpu_has_metal();
+}
+
+int device_has_cuda(void) {
+    return ggml_cpu_has_cuda();
+}
+
+int device_has_vulkan(void) {
+    return ggml_cpu_has_vulkan();
+}
+
+int device_has_kompute(void) {
+    return ggml_cpu_has_kompute();
+}
+
+int device_has_gpublas(void) {
+    return ggml_cpu_has_gpublas();
+}
+
+int device_has_blas(void) {
+    return ggml_cpu_has_blas();
+}
+
+int device_has_sycl(void) {
+    return ggml_cpu_has_sycl();
+}
+
+// ggml_backend_buffer_type_t llama_dev_buffer_type(const llama_model * model, int device)
+
+void device_get_props(struct llama_model * model, int device, struct ggml_backend_dev_props * props) {
+    ggml_backend_buffer_type_t buft_type;
+    if (device == -1) { // type cpu
+        buft_type = ggml_backend_cpu_buffer_type();
+    } else { // type gpu
+        buft_type = llama_dev_buffer_type(model, device);
+    }
+    ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft_type);
+    ggml_backend_dev_get_props(dev, props);
+}
+
 } // namespace profiler
\ No newline at end of file
diff --git a/common/profiler.h b/common/profiler.h
index e81141ec..f9aa6942 100644
--- a/common/profiler.h
+++ b/common/profiler.h
@@ -1,17 +1,29 @@
 #ifndef PROFILER_H
 #define PROFILER_H
 
+#include "llama.h"
 #include <stdint.h>
 
 #define BUFFER_SIZE_MB 1024
 
 namespace profiler {
-    const char * device_name();
-    uint32_t device_cpu_cores();
+    const char * device_name(void);
+
+    uint32_t device_cpu_cores     (void);
     uint64_t device_physical_memory(bool available = true);
-    uint64_t device_swap_memory(bool available = true);
-    uint64_t device_disk_read_bw(const char * test_file, size_t buffer_size_mb = BUFFER_SIZE_MB);
-    uint64_t device_memory_bw(size_t buffer_size_mb = BUFFER_SIZE_MB);
+    uint64_t device_swap_memory   (bool available = true);
+    uint64_t device_disk_read_bw  (const char * test_file, size_t buffer_size_mb = BUFFER_SIZE_MB);
+    uint64_t device_memory_bw     (size_t buffer_size_mb = BUFFER_SIZE_MB);
+    void     device_get_props     (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
+
+    int device_has_metal  (void);
+    int device_has_cuda   (void);
+    int device_has_vulkan (void);
+    int device_has_kompute(void);
+    int device_has_gpublas(void);
+    int device_has_blas   (void);
+    int device_has_sycl   (void);
+
 } // namespace profiler
 
 #endif // PROFILER_H
\ No newline at end of file
diff --git a/include/llama.h b/include/llama.h
index 9913ce1b..b4821ee4 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -408,6 +408,8 @@ extern "C" {
     // Call once at the start of the program
     LLAMA_API void llama_backend_init(void);
 
+    ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device);
+
     //optional:
     LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
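For context, the ggml_cpu_has_* calls wrapped by the new profiler::device_has_* functions report compile-time backend support. A self-contained sketch of that pattern (sketch_has_cuda is an illustrative name; the GGML_USE_CUDA define is the usual build flag, assumed here rather than taken from this patch):

    // Illustrative backend-support probe driven by a build flag, mirroring the
    // behaviour of ggml's ggml_cpu_has_cuda()-style checks.
    #include <cstdio>

    static int sketch_has_cuda(void) {
    #if defined(GGML_USE_CUDA)   // typically defined by the build system when CUDA is enabled
        return 1;
    #else
        return 0;
    #endif
    }

    int main() {
        // Prints 1 only when compiled with -DGGML_USE_CUDA.
        std::printf("CUDA : %i\n", sketch_has_cuda());
        return 0;
    }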
diff --git a/src/llama.cpp b/src/llama.cpp
index 795273b7..ee835c47 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3544,6 +3544,10 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
     GGML_UNUSED(model);
 }
 
+ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device) {
+    return llama_default_buffer_type_offload(*model, device);
+}
+
 static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
     ggml_backend_buffer_type_t buft = nullptr;
 
@@ -17385,7 +17389,7 @@ static int llama_recv_meta(zmq::socket_t & socket, struct sync_meta * meta) {
     return 0;
 }
 
-static void llama_recv_tensors(zmq::socket_t & socket, struct llama_ubatch * ubatch, struct llama_context * lctx, const bool is_out_embd=false) {
+static void llama_recv_tensors(zmq::socket_t & socket, struct llama_ubatch * ubatch, const bool is_out_embd=false) {
     std::vector<zmq::message_t> recv_msgs;
     if (!zmq::recv_multipart(socket, std::back_inserter(recv_msgs))) {
         LLAMA_LOG_INFO("Failed to receive tensor data.\n");
@@ -17724,7 +17728,7 @@ static int llama_decode_internal(
         // receive data from other nodes
         if (n_world > 1 && !(my_rank == 0 && i == 0) && !(my_rank == 0 && is_last_l)) {
             const bool is_out_embd = my_rank == 0 && i == (size_t)gf.size() - 1;
-            llama_recv_tensors(*lctx.recv_socket, &ubatch, &lctx, is_out_embd);
+            llama_recv_tensors(*lctx.recv_socket, &ubatch, is_out_embd);
         }
 
         // ensure ggml_backend_tensor_get_async of the previous subgraph has finished
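The llama_recv_tensors change above only drops the now-unused lctx parameter; the receive path itself is the standard cppzmq multipart pattern. A self-contained sketch of that pattern (socket roles, the inproc endpoint, and the message contents are made up for the example, not taken from this patch):

    // Standalone cppzmq sketch of the recv_multipart pattern used by
    // llama_recv_tensors. Endpoint name and payloads are illustrative only.
    #include <zmq.hpp>
    #include <zmq_addon.hpp>
    #include <array>
    #include <iterator>
    #include <vector>
    #include <cstdio>

    int main() {
        zmq::context_t ctx;
        zmq::socket_t sender(ctx, zmq::socket_type::pair);
        zmq::socket_t receiver(ctx, zmq::socket_type::pair);
        receiver.bind("inproc://tensors");
        sender.connect("inproc://tensors");

        // Send a two-part message (e.g. a tensor name followed by its raw data).
        std::array<zmq::const_buffer, 2> parts = {
            zmq::str_buffer("tensor-name"),
            zmq::str_buffer("payload"),
        };
        zmq::send_multipart(sender, parts);

        // Receive every part into a vector, as llama_recv_tensors does.
        std::vector<zmq::message_t> recv_msgs;
        if (!zmq::recv_multipart(receiver, std::back_inserter(recv_msgs))) {
            std::fprintf(stderr, "Failed to receive tensor data.\n");
            return 1;
        }
        std::printf("received %zu parts\n", recv_msgs.size());
        return 0;
    }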