From 407c71ae525b8076f78d6b866af450af04424e70 Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Wed, 6 Nov 2024 20:42:28 +0400
Subject: [PATCH] add cpu and gpu profile

---
 Makefile            |  2 +-
 common/common.cpp   | 51 ++++++++++++++++++++++++++++++++++++++-------
 common/profiler.cpp | 44 ++++++++++++++++++++++++++++++++++++++
 common/profiler.h   | 22 ++++++++++++++-----
 include/llama.h     |  2 ++
 src/llama.cpp       |  8 +++++--
 6 files changed, 113 insertions(+), 16 deletions(-)

diff --git a/Makefile b/Makefile
index ba931f2b..3521b2b9 100644
--- a/Makefile
+++ b/Makefile
@@ -1175,7 +1175,7 @@ $(LIB_LLAMA_S): \
 
 common/profiler.o: \
 	common/profiler.cpp \
-	common/profiler.h
+	common/profiler.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 common/common.o: \
diff --git a/common/common.cpp b/common/common.cpp
index a491449c..0dce48f0 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -843,15 +843,50 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     uint64_t available_swap = profiler::device_swap_memory(true);
     uint64_t disk_read_bw   = profiler::device_disk_read_bw(params.model.c_str(), 500);
     uint64_t memory_bw      = profiler::device_memory_bw(500);
+    int has_metal   = profiler::device_has_metal();
+    int has_cuda    = profiler::device_has_cuda();
+    int has_vulkan  = profiler::device_has_vulkan();
+    int has_kompute = profiler::device_has_kompute();
+    int has_gpublas = profiler::device_has_gpublas();
+    int has_blas    = profiler::device_has_blas();
+    int has_sycl    = profiler::device_has_sycl();
+    ggml_backend_dev_props cpu_props;
+    ggml_backend_dev_props gpu_props;
+    profiler::device_get_props(model, -1, &cpu_props); // -1 for cpu
+    profiler::device_get_props(model,  0, &gpu_props); //  0 for gpu0
 
-    LOG_INF("Device Name: %s\n", dev_name);
-    LOG_INF("Number of CPU cores: %u\n", n_cpu_cores);
-    LOG_INF("Total Physical Memory: %.2f GB\n", total_memory / (double)(1 << 30));
-    LOG_INF("Available Physical Memory: %.2f GB\n", available_memory / (double)(1 << 30));
-    LOG_INF("Total Swap Memory: %.2f GB\n", total_swap / (double)(1 << 30));
-    LOG_INF("Available Swap Memory: %.2f GB\n", available_swap / (double)(1 << 30));
-    LOG_INF("Disk Read Bandwidth: %.2f GB/s\n", disk_read_bw / (double)(1 << 30));
-    LOG_INF("Memory Bandwidth: %.2f GB/s\n", memory_bw / (double)(1 << 30));
+    LOG_INF("\n");
+    LOG_INF("Device Info:\n");
+    LOG_INF("  Device Name         : %s\n", dev_name);
+    LOG_INF("  CPU Name            : %s\n", cpu_props.name);
+    LOG_INF("  CPU Description     : %s\n", cpu_props.description);
+    LOG_INF("  Number of CPU cores : %u\n", n_cpu_cores);
+    LOG_INF("  Disk Read Bandwidth : %.2f GB/s\n", disk_read_bw / (double)(1 << 30));
+    LOG_INF("\n");
+
+    LOG_INF("Memory Information:\n");
+    LOG_INF("  Physical Mem Total     : %.2f GB\n", total_memory / (double)(1 << 30));
+    LOG_INF("  Physical Mem Available : %.2f GB\n", available_memory / (double)(1 << 30));
+    LOG_INF("  Swap Memory Total      : %.2f GB\n", total_swap / (double)(1 << 30));
+    LOG_INF("  Swap Memory Available  : %.2f GB\n", available_swap / (double)(1 << 30));
+    LOG_INF("  Mem Bandwidth          : %.2f GB/s\n", memory_bw / (double)(1 << 30));
+    LOG_INF("\n");
+
+    LOG_INF("GPU Support:\n");
+    LOG_INF("  Metal    : %i\n", has_metal);
+    LOG_INF("  CUDA     : %i\n", has_cuda);
+    LOG_INF("  Vulkan   : %i\n", has_vulkan);
+    LOG_INF("  Kompute  : %i\n", has_kompute);
+    LOG_INF("  GPU BLAS : %i\n", has_gpublas);
+    LOG_INF("  BLAS     : %i\n", has_blas);
+    LOG_INF("  SYCL     : %i\n", has_sycl);
+    LOG_INF("\n");
+
+    LOG_INF("GPU Properties:\n");
+    LOG_INF("  GPU Name     : %s\n", gpu_props.name);
+    LOG_INF("  Description  : %s\n", gpu_props.description);
+    LOG_INF("  Memory Free  : %d MB\n", (int)(gpu_props.memory_free / (double)(1 << 20)));
+    LOG_INF("  Memory Total : %.2f GB\n", gpu_props.memory_total / (double)(1 << 30));
 
     if (model == NULL) {
         LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
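The logging hunk above converts raw byte counts with integer power-of-two divisors. A minimal standalone sketch of that idiom (to_gib and to_mib are illustrative helper names, not functions from this patch):

    // Sketch of the byte-count conversions used in the LOG_INF calls above.
    // to_gib/to_mib and the sample values are illustrative only.
    #include <cstdint>
    #include <cstdio>

    static double to_gib(uint64_t bytes) { return bytes / (double)(1 << 30); } // bytes -> GiB
    static double to_mib(uint64_t bytes) { return bytes / (double)(1 << 20); } // bytes -> MiB

    int main() {
        uint64_t total_memory = 16ULL * (1 << 30);  // suppose the profiler reports 16 GiB of RAM
        uint64_t memory_free  = 512ULL * (1 << 20); // and 512 MiB of free GPU memory
        std::printf("Physical Mem Total : %.2f GB\n", to_gib(total_memory));   // prints 16.00
        std::printf("Memory Free        : %d MB\n", (int)to_mib(memory_free)); // prints 512
        return 0;
    }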
diff --git a/common/profiler.cpp b/common/profiler.cpp
index 0cbf96e5..2bcf1661 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -1,5 +1,8 @@
 #include "log.h"
 #include "profiler.h"
+#include "ggml.h"
+#include "ggml-backend.h"
+#include "llama.h"
 
 #if defined(_WIN32) || defined(_WIN64)
 #include <windows.h>
@@ -271,4 +274,45 @@ uint64_t device_memory_bw(size_t buffer_size_mb) {
     return speed;
 }
 
+int device_has_metal(void) {
+    return ggml_cpu_has_metal();
+}
+
+int device_has_cuda(void) {
+    return ggml_cpu_has_cuda();
+}
+
+int device_has_vulkan(void) {
+    return ggml_cpu_has_vulkan();
+}
+
+int device_has_kompute(void) {
+    return ggml_cpu_has_kompute();
+}
+
+int device_has_gpublas(void) {
+    return ggml_cpu_has_gpublas();
+}
+
+int device_has_blas(void) {
+    return ggml_cpu_has_blas();
+}
+
+int device_has_sycl(void) {
+    return ggml_cpu_has_sycl();
+}
+
+// ggml_backend_buffer_type_t llama_dev_buffer_type(const llama_model * model, int device)
+
+void device_get_props(struct llama_model * model, int device, struct ggml_backend_dev_props * props) {
+    ggml_backend_buffer_type_t buft_type;
+    if (device == -1) { // type cpu
+        buft_type = ggml_backend_cpu_buffer_type();
+    } else { // type gpu
+        buft_type = llama_dev_buffer_type(model, device);
+    }
+    ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft_type);
+    ggml_backend_dev_get_props(dev, props);
+}
+
 } // namespace profiler
\ No newline at end of file
diff --git a/common/profiler.h b/common/profiler.h
index e81141ec..f9aa6942 100644
--- a/common/profiler.h
+++ b/common/profiler.h
@@ -1,17 +1,29 @@
 #ifndef PROFILER_H
 #define PROFILER_H
 
+#include "llama.h"
 #include <stdint.h>
 
 #define BUFFER_SIZE_MB 1024
 
 namespace profiler {
-    const char * device_name();
-    uint32_t device_cpu_cores();
+    const char * device_name(void);
+
+    uint32_t device_cpu_cores     (void);
     uint64_t device_physical_memory(bool available = true);
-    uint64_t device_swap_memory(bool available = true);
-    uint64_t device_disk_read_bw(const char * test_file, size_t buffer_size_mb = BUFFER_SIZE_MB);
-    uint64_t device_memory_bw(size_t buffer_size_mb = BUFFER_SIZE_MB);
+    uint64_t device_swap_memory   (bool available = true);
+    uint64_t device_disk_read_bw  (const char * test_file, size_t buffer_size_mb = BUFFER_SIZE_MB);
+    uint64_t device_memory_bw     (size_t buffer_size_mb = BUFFER_SIZE_MB);
+    void     device_get_props     (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
+
+    int device_has_metal  (void);
+    int device_has_cuda   (void);
+    int device_has_vulkan (void);
+    int device_has_kompute(void);
+    int device_has_gpublas(void);
+    int device_has_blas   (void);
+    int device_has_sycl   (void);
+
 } // namespace profiler
 
 #endif // PROFILER_H
\ No newline at end of file
diff --git a/include/llama.h b/include/llama.h
index 9913ce1b..b4821ee4 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -408,6 +408,8 @@ extern "C" {
     // Call once at the start of the program
     LLAMA_API void llama_backend_init(void);
 
+    ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device);
+
     //optional:
     LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
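For context, the ggml_cpu_has_* calls wrapped by the new profiler::device_has_* functions report compile-time backend support. A self-contained sketch of that pattern (sketch_has_cuda is an illustrative name; the GGML_USE_CUDA define is the usual build flag, assumed here rather than taken from this patch):

    // Illustrative backend-support probe driven by a build flag, mirroring the
    // behaviour of ggml's ggml_cpu_has_cuda()-style checks.
    #include <cstdio>

    static int sketch_has_cuda(void) {
    #if defined(GGML_USE_CUDA)   // typically defined by the build system when CUDA is enabled
        return 1;
    #else
        return 0;
    #endif
    }

    int main() {
        // Prints 1 only when compiled with -DGGML_USE_CUDA.
        std::printf("CUDA : %i\n", sketch_has_cuda());
        return 0;
    }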
diff --git a/src/llama.cpp b/src/llama.cpp
index 795273b7..ee835c47 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3544,6 +3544,10 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
     GGML_UNUSED(model);
 }
 
+ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device) {
+    return llama_default_buffer_type_offload(*model, device);
+}
+
 static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
     ggml_backend_buffer_type_t buft = nullptr;
 
@@ -17385,7 +17389,7 @@ static int llama_recv_meta(zmq::socket_t & socket, struct sync_meta * meta) {
     return 0;
 }
 
-static void llama_recv_tensors(zmq::socket_t & socket, struct llama_ubatch * ubatch, struct llama_context * lctx, const bool is_out_embd=false) {
+static void llama_recv_tensors(zmq::socket_t & socket, struct llama_ubatch * ubatch, const bool is_out_embd=false) {
     std::vector<zmq::message_t> recv_msgs;
     if (!zmq::recv_multipart(socket, std::back_inserter(recv_msgs))) {
         LLAMA_LOG_INFO("Failed to receive tensor data.\n");
@@ -17724,7 +17728,7 @@ static int llama_decode_internal(
         // receive data from other nodes
         if (n_world > 1 && !(my_rank == 0 && i == 0) && !(my_rank == 0 && is_last_l)) {
             const bool is_out_embd = my_rank == 0 && i == (size_t)gf.size() - 1;
-            llama_recv_tensors(*lctx.recv_socket, &ubatch, &lctx, is_out_embd);
+            llama_recv_tensors(*lctx.recv_socket, &ubatch, is_out_embd);
         }
 
         // ensure ggml_backend_tensor_get_async of the previous subgraph has finished
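The llama_recv_tensors change above only drops the now-unused lctx parameter; the receive path itself is the standard cppzmq multipart pattern. A self-contained sketch of that pattern (socket roles, the inproc endpoint, and the message contents are made up for the example, not taken from this patch):

    // Standalone cppzmq sketch of the recv_multipart pattern used by
    // llama_recv_tensors. Endpoint name and payloads are illustrative only.
    #include <zmq.hpp>
    #include <zmq_addon.hpp>
    #include <array>
    #include <iterator>
    #include <vector>
    #include <cstdio>

    int main() {
        zmq::context_t ctx;
        zmq::socket_t sender(ctx, zmq::socket_type::pair);
        zmq::socket_t receiver(ctx, zmq::socket_type::pair);
        receiver.bind("inproc://tensors");
        sender.connect("inproc://tensors");

        // Send a two-part message (e.g. a tensor name followed by its raw data).
        std::array<zmq::const_buffer, 2> parts = {
            zmq::str_buffer("tensor-name"),
            zmq::str_buffer("payload"),
        };
        zmq::send_multipart(sender, parts);

        // Receive every part into a vector, as llama_recv_tensors does.
        std::vector<zmq::message_t> recv_msgs;
        if (!zmq::recv_multipart(receiver, std::back_inserter(recv_msgs))) {
            std::fprintf(stderr, "Failed to receive tensor data.\n");
            return 1;
        }
        std::printf("received %zu parts\n", recv_msgs.size());
        return 0;
    }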