add LLAMA_API llama_profile_device

This commit is contained in:
Lizonghang 2024-11-07 09:30:39 +04:00
parent b922418cca
commit ef7fdf70cc
6 changed files with 131 additions and 80 deletions

View file

@@ -1185,7 +1185,6 @@ common/common.o: \
common/sampling.h \
common/json.hpp \
common/json-schema-to-grammar.h \
common/profiler.h \
include/llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@

View file

@@ -9,7 +9,6 @@
#include "json.hpp"
#include "json-schema-to-grammar.h"
#include "llama.h"
#include "profiler.h"
#include <algorithm>
#include <cinttypes>
@@ -824,7 +823,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
llama_init_result iparams;
auto mparams = llama_model_params_from_gpt_params(params);
llama_model * model = nullptr;
struct llama_model * model = nullptr;
if (!params.hf_repo.empty() && !params.hf_file.empty()) {
model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
@@ -835,58 +834,8 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
}
// profile devices and determine the best setup
const char * dev_name = profiler::device_name();
uint32_t n_cpu_cores = profiler::device_cpu_cores();
uint64_t total_memory = profiler::device_physical_memory(false);
uint64_t available_memory = profiler::device_physical_memory(true);
uint64_t total_swap = profiler::device_swap_memory(false);
uint64_t available_swap = profiler::device_swap_memory(true);
uint64_t disk_read_bw = profiler::device_disk_read_bw(params.model.c_str(), 500);
uint64_t memory_bw = profiler::device_memory_bw(500);
int has_metal = profiler::device_has_metal();
int has_cuda = profiler::device_has_cuda();
int has_vulkan = profiler::device_has_vulkan();
int has_kompute = profiler::device_has_kompute();
int has_gpublas = profiler::device_has_gpublas();
int has_blas = profiler::device_has_blas();
int has_sycl = profiler::device_has_sycl();
ggml_backend_dev_props cpu_props;
ggml_backend_dev_props gpu_props;
profiler::device_get_props(model, -1, &cpu_props); // -1 for cpu
profiler::device_get_props(model, 0, &gpu_props); // 0 for gpu0
LOG_INF("\n");
LOG_INF("Device Info:\n");
LOG_INF(" Device Name : %s\n", dev_name);
LOG_INF(" CPU Name : %s\n", cpu_props.name);
LOG_INF(" CPU Description : %s\n", cpu_props.description);
LOG_INF(" Number of CPU cores : %u\n", n_cpu_cores);
LOG_INF(" Disk Read Bandwidth : %.2f GB/s\n", disk_read_bw / (double)(1 << 30));
LOG_INF("\n");
LOG_INF("Memory Information:\n");
LOG_INF(" Physical Mem Total : %.2f GB\n", total_memory / (double)(1 << 30));
LOG_INF(" Physical Mem Available : %.2f GB\n", available_memory / (double)(1 << 30));
LOG_INF(" Swap Memory Total : %.2f GB\n", total_swap / (double)(1 << 30));
LOG_INF(" Swap Memory Available : %.2f GB\n", available_swap / (double)(1 << 30));
LOG_INF(" Mem Bandwidth : %.2f GB/s\n", memory_bw / (double)(1 << 30));
LOG_INF("\n");
LOG_INF("GPU Support:\n");
LOG_INF(" Metal : %i\n", has_metal);
LOG_INF(" CUDA : %i\n", has_cuda);
LOG_INF(" Vulkan : %i\n", has_vulkan);
LOG_INF(" Kompute : %i\n", has_kompute);
LOG_INF(" GPU BLAS : %i\n", has_gpublas);
LOG_INF(" BLAS : %i\n", has_blas);
LOG_INF(" SYCL : %i\n", has_sycl);
LOG_INF("\n");
LOG_INF("GPU Properties:\n");
LOG_INF(" GPU Name : %s\n", gpu_props.name);
LOG_INF(" Description : %s\n", gpu_props.description);
LOG_INF(" Memory Free : %.2f GB\n", gpu_props.memory_free / (double)(1 << 30));
LOG_INF(" Memory Total : %.2f GB\n", gpu_props.memory_total / (double)(1 << 30));
device_info dev_info;
llama_profile_device(&dev_info, model, params.model.c_str());
if (model == NULL) {
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());

View file

@@ -23,8 +23,6 @@
#include <sys/types.h>
#include <vector>
namespace profiler {
const char * device_name() {
static char device_name[256];
@@ -314,5 +312,3 @@ void device_get_props(struct llama_model * model, int device, struct ggml_backen
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft_type);
ggml_backend_dev_get_props(dev, props);
}
} // namespace profiler

View file

@@ -2,28 +2,62 @@
#define PROFILER_H
#include "llama.h"
#include <string>
#define BUFFER_SIZE_MB 1024
// Identification of the host CPU, filled in by llama_profile_device().
// NOTE(review): name/description are non-owning pointers obtained from the
// ggml backend props — assumes the backend keeps that storage alive; confirm.
struct cpu_props {
const char * name;        // CPU name reported by the ggml CPU backend
const char * description; // CPU description reported by the ggml CPU backend
uint32_t cores;           // number of CPU cores (from device_cpu_cores())
};
namespace profiler {
const char * device_name(void);
// Host memory statistics, filled in by llama_profile_device().
// All values are rounded to two decimal places for display.
struct memory_info {
float total_physical; // in GB
float available_physical; // in GB
float total_swap; // in GB
float available_swap; // in GB
float bandwidth; // in GB/s
};
uint32_t device_cpu_cores (void);
uint64_t device_physical_memory(bool available = true);
uint64_t device_swap_memory (bool available = true);
uint64_t device_disk_read_bw (const char * test_file, size_t buffer_size_mb = BUFFER_SIZE_MB);
uint64_t device_memory_bw (size_t buffer_size_mb = BUFFER_SIZE_MB);
void device_get_props (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
// Which GPU/accelerator backends this build of llama.cpp was compiled with
// (one flag per device_has_* probe in the profiler).
struct gpu_support {
bool metal;
bool cuda;
bool vulkan;
bool kompute;
bool gpublas;
bool blas;
bool sycl;
};
int device_has_metal(void);
int device_has_cuda(void);
int device_has_vulkan(void);
int device_has_kompute(void);
int device_has_gpublas(void);
int device_has_blas(void);
int device_has_sycl(void);
// Properties of the profiled GPU (device 0), filled in by llama_profile_device().
// NOTE(review): name/description are non-owning pointers from the ggml backend
// props — assumes the backend keeps that storage alive; confirm.
struct gpu_props {
const char * name;        // GPU device name
const char * description; // GPU device description
float memory_free; // in GB
float memory_total; // in GB
};
} // namespace profiler (removed: profiler symbols are now at global scope)
// Aggregate device profile produced by llama_profile_device(): host name,
// disk throughput, CPU/memory/GPU capabilities of the local machine.
struct device_info {
const char * device_name;  // host/device name (from device_name())
float disk_read_bandwidth; // in GB/s
struct cpu_props cpu_props;
struct memory_info memory;
struct gpu_support gpu_support;
struct gpu_props gpu_props;    // presumably describes GPU 0 — verify against device_get_props usage
};
// Host/device name; NOTE(review): implementation appears to return a static
// buffer — do not free, not thread-safe to mutate.
const char * device_name(void);
// Number of CPU cores on this host.
uint32_t device_cpu_cores (void);
// Physical RAM in bytes; `available` selects free vs. total.
uint64_t device_physical_memory(bool available);
// Swap space in bytes; `available` selects free vs. total.
uint64_t device_swap_memory (bool available);
// Benchmark sequential read bandwidth (bytes/s) using `test_file`,
// reading up to `buffer_size_mb` MiB.
uint64_t device_disk_read_bw (const char * test_file, size_t buffer_size_mb);
// Benchmark RAM bandwidth (bytes/s) using a `buffer_size_mb` MiB buffer.
uint64_t device_memory_bw (size_t buffer_size_mb);
// Query ggml backend properties; device -1 selects the CPU backend,
// 0..n select GPU devices.
void device_get_props (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
// Compile-time backend availability probes (1 if built with the backend).
int device_has_metal(void);
int device_has_cuda(void);
int device_has_vulkan(void);
int device_has_kompute(void);
int device_has_gpublas(void);
int device_has_blas(void);
int device_has_sycl(void);
#endif // PROFILER_H

View file

@@ -4,6 +4,8 @@
#include "ggml.h"
#include "ggml-backend.h"
#include "profiler.h"
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
@@ -408,7 +410,8 @@ extern "C" {
// Call once at the start of the program
LLAMA_API void llama_backend_init(void);
ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device);
LLAMA_API void llama_profile_device (struct device_info * dev_info, struct llama_model * model, const char * test_file);
LLAMA_API ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device);
//optional:
LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);

View file

@@ -10,6 +10,8 @@
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "profiler.h"
#ifdef GGML_USE_RPC
# include "ggml-rpc.h"
#endif
@@ -3544,6 +3546,74 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
GGML_UNUSED(model);
}
// Convert a byte count (or bytes/s) to GiB, rounded to two decimal places.
// Factored out: the original repeated `round(x / (double)(1 << 30) * 100) / 100`
// ten times inline.
static float profile_bytes_to_gib(double bytes) {
    return (float)(round(bytes / (double)(1 << 30) * 100) / 100);
}

// Profile the local device and fill `dev_info`: host name, CPU core count,
// physical/swap memory, memory and disk bandwidth (benchmarked), compiled-in
// GPU backend support, and CPU/GPU backend properties. The collected profile
// is also printed via LLAMA_LOG_INFO.
//
// dev_info   - out: profile results (must be non-NULL)
// model      - model used to resolve ggml backend buffer types for the probes
// test_file  - file read by the disk-bandwidth benchmark (typically the model path)
void llama_profile_device(device_info * dev_info, struct llama_model * model, const char * test_file) {
    // Buffer size (MiB) used by both bandwidth benchmarks; was a magic 500
    // repeated at each call site.
    const size_t bench_buffer_mb = 500;

    dev_info->device_name     = device_name();
    dev_info->cpu_props.cores = device_cpu_cores();

    dev_info->memory.total_physical     = profile_bytes_to_gib((double)device_physical_memory(false));
    dev_info->memory.available_physical = profile_bytes_to_gib((double)device_physical_memory(true));
    dev_info->memory.total_swap         = profile_bytes_to_gib((double)device_swap_memory(false));
    dev_info->memory.available_swap     = profile_bytes_to_gib((double)device_swap_memory(true));
    dev_info->memory.bandwidth          = profile_bytes_to_gib((double)device_memory_bw(bench_buffer_mb));
    dev_info->disk_read_bandwidth       = profile_bytes_to_gib((double)device_disk_read_bw(test_file, bench_buffer_mb));

    dev_info->gpu_support.metal   = device_has_metal();
    dev_info->gpu_support.cuda    = device_has_cuda();
    dev_info->gpu_support.vulkan  = device_has_vulkan();
    dev_info->gpu_support.kompute = device_has_kompute();
    dev_info->gpu_support.gpublas = device_has_gpublas();
    dev_info->gpu_support.blas    = device_has_blas();
    dev_info->gpu_support.sycl    = device_has_sycl();

    ggml_backend_dev_props cpu_props;
    ggml_backend_dev_props gpu_props;
    device_get_props(model, -1, &cpu_props); // -1 for cpu
    device_get_props(model,  0, &gpu_props); // 0 for gpu0

    // NOTE(review): copies the name/description pointers, not the strings —
    // assumes the ggml backend keeps that storage alive for the program's
    // lifetime; confirm before storing dev_info long-term.
    dev_info->cpu_props.name        = cpu_props.name;
    dev_info->cpu_props.description = cpu_props.description;
    dev_info->gpu_props.name        = gpu_props.name;
    dev_info->gpu_props.description = gpu_props.description;
    dev_info->gpu_props.memory_free  = profile_bytes_to_gib((double)gpu_props.memory_free);
    dev_info->gpu_props.memory_total = profile_bytes_to_gib((double)gpu_props.memory_total);

    LLAMA_LOG_INFO("\n");
    LLAMA_LOG_INFO("Device Info:\n");
    LLAMA_LOG_INFO(" Device Name : %s\n", dev_info->device_name);
    LLAMA_LOG_INFO(" CPU Name : %s\n", dev_info->cpu_props.name);
    LLAMA_LOG_INFO(" CPU Description : %s\n", dev_info->cpu_props.description);
    LLAMA_LOG_INFO(" Number of CPU cores : %u\n", dev_info->cpu_props.cores);
    LLAMA_LOG_INFO(" Disk Read Bandwidth : %.2f GB/s\n", dev_info->disk_read_bandwidth);
    LLAMA_LOG_INFO("\n");
    LLAMA_LOG_INFO("Memory Information:\n");
    LLAMA_LOG_INFO(" Physical Mem Total : %.2f GB\n", dev_info->memory.total_physical);
    LLAMA_LOG_INFO(" Physical Mem Available : %.2f GB\n", dev_info->memory.available_physical);
    LLAMA_LOG_INFO(" Swap Memory Total : %.2f GB\n", dev_info->memory.total_swap);
    LLAMA_LOG_INFO(" Swap Memory Available : %.2f GB\n", dev_info->memory.available_swap);
    LLAMA_LOG_INFO(" Mem Bandwidth : %.2f GB/s\n", dev_info->memory.bandwidth);
    LLAMA_LOG_INFO("\n");
    LLAMA_LOG_INFO("GPU Support:\n");
    LLAMA_LOG_INFO(" Metal : %i\n", dev_info->gpu_support.metal);
    LLAMA_LOG_INFO(" CUDA : %i\n", dev_info->gpu_support.cuda);
    LLAMA_LOG_INFO(" Vulkan : %i\n", dev_info->gpu_support.vulkan);
    LLAMA_LOG_INFO(" Kompute : %i\n", dev_info->gpu_support.kompute);
    LLAMA_LOG_INFO(" GPU BLAS : %i\n", dev_info->gpu_support.gpublas);
    LLAMA_LOG_INFO(" BLAS : %i\n", dev_info->gpu_support.blas);
    LLAMA_LOG_INFO(" SYCL : %i\n", dev_info->gpu_support.sycl);
    LLAMA_LOG_INFO("\n");
    LLAMA_LOG_INFO("GPU Properties:\n");
    LLAMA_LOG_INFO(" GPU Name : %s\n", dev_info->gpu_props.name);
    LLAMA_LOG_INFO(" Description : %s\n", dev_info->gpu_props.description);
    LLAMA_LOG_INFO(" Memory Free : %.2f GB\n", dev_info->gpu_props.memory_free);
    LLAMA_LOG_INFO(" Memory Total : %.2f GB\n", dev_info->gpu_props.memory_total);
}
// Public wrapper exposing the backend buffer type used to offload tensors to
// `device` for this model (forwards to the internal
// llama_default_buffer_type_offload; model must be non-NULL).
ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device) {
return llama_default_buffer_type_offload(*model, device);
}