Mirror of https://github.com/Lizonghang/prima.cpp.git (synced 2025-09-10 16:24:37 +00:00)

Commit ef7fdf70cc (parent b922418cca): add LLAMA_API llama_profile_device

6 changed files with 131 additions and 80 deletions
Makefile
@@ -1185,7 +1185,6 @@ common/common.o: \
 	common/sampling.h \
 	common/json.hpp \
 	common/json-schema-to-grammar.h \
-	common/profiler.h \
 	include/llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
common/common.cpp
@@ -9,7 +9,6 @@
 #include "json.hpp"
 #include "json-schema-to-grammar.h"
 #include "llama.h"
-#include "profiler.h"
 
 #include <algorithm>
 #include <cinttypes>
@@ -824,7 +823,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     llama_init_result iparams;
     auto mparams = llama_model_params_from_gpt_params(params);
 
-    llama_model * model = nullptr;
+    struct llama_model * model = nullptr;
 
     if (!params.hf_repo.empty() && !params.hf_file.empty()) {
         model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
@@ -835,58 +834,8 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     }
 
     // profile devices and determine the best setup
-    const char * dev_name = profiler::device_name();
-    uint32_t n_cpu_cores = profiler::device_cpu_cores();
-    uint64_t total_memory = profiler::device_physical_memory(false);
-    uint64_t available_memory = profiler::device_physical_memory(true);
-    uint64_t total_swap = profiler::device_swap_memory(false);
-    uint64_t available_swap = profiler::device_swap_memory(true);
-    uint64_t disk_read_bw = profiler::device_disk_read_bw(params.model.c_str(), 500);
-    uint64_t memory_bw = profiler::device_memory_bw(500);
-    int has_metal = profiler::device_has_metal();
-    int has_cuda = profiler::device_has_cuda();
-    int has_vulkan = profiler::device_has_vulkan();
-    int has_kompute = profiler::device_has_kompute();
-    int has_gpublas = profiler::device_has_gpublas();
-    int has_blas = profiler::device_has_blas();
-    int has_sycl = profiler::device_has_sycl();
-    ggml_backend_dev_props cpu_props;
-    ggml_backend_dev_props gpu_props;
-    profiler::device_get_props(model, -1, &cpu_props); // -1 for cpu
-    profiler::device_get_props(model, 0, &gpu_props);  // 0 for gpu0
-
-    LOG_INF("\n");
-    LOG_INF("Device Info:\n");
-    LOG_INF("  Device Name             : %s\n", dev_name);
-    LOG_INF("  CPU Name                : %s\n", cpu_props.name);
-    LOG_INF("  CPU Description         : %s\n", cpu_props.description);
-    LOG_INF("  Number of CPU cores     : %u\n", n_cpu_cores);
-    LOG_INF("  Disk Read Bandwidth     : %.2f GB/s\n", disk_read_bw / (double)(1 << 30));
-    LOG_INF("\n");
-
-    LOG_INF("Memory Information:\n");
-    LOG_INF("  Physical Mem Total      : %.2f GB\n", total_memory / (double)(1 << 30));
-    LOG_INF("  Physical Mem Available  : %.2f GB\n", available_memory / (double)(1 << 30));
-    LOG_INF("  Swap Memory Total       : %.2f GB\n", total_swap / (double)(1 << 30));
-    LOG_INF("  Swap Memory Available   : %.2f GB\n", available_swap / (double)(1 << 30));
-    LOG_INF("  Mem Bandwidth           : %.2f GB/s\n", memory_bw / (double)(1 << 30));
-    LOG_INF("\n");
-
-    LOG_INF("GPU Support:\n");
-    LOG_INF("  Metal                   : %i\n", has_metal);
-    LOG_INF("  CUDA                    : %i\n", has_cuda);
-    LOG_INF("  Vulkan                  : %i\n", has_vulkan);
-    LOG_INF("  Kompute                 : %i\n", has_kompute);
-    LOG_INF("  GPU BLAS                : %i\n", has_gpublas);
-    LOG_INF("  BLAS                    : %i\n", has_blas);
-    LOG_INF("  SYCL                    : %i\n", has_sycl);
-    LOG_INF("\n");
-
-    LOG_INF("GPU Properties:\n");
-    LOG_INF("  GPU Name                : %s\n", gpu_props.name);
-    LOG_INF("  Description             : %s\n", gpu_props.description);
-    LOG_INF("  Memory Free             : %.2f GB\n", gpu_props.memory_free / (double)(1 << 30));
-    LOG_INF("  Memory Total            : %.2f GB\n", gpu_props.memory_total / (double)(1 << 30));
+    device_info dev_info;
+    llama_profile_device(&dev_info, model, params.model.c_str());
 
     if (model == NULL) {
         LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
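After this change the common-side code boils down to two lines: declare a device_info on the stack and let llama_profile_device fill it. As a rough sketch of how the filled-in struct could feed the "determine the best setup" step mentioned in the comment (the helper name and the 0.5 GB-per-layer figure below are hypothetical placeholders, not part of this commit):

#include "llama.h"  // after this commit, llama.h pulls in profiler.h and struct device_info

// Hypothetical heuristic: offload as many layers as the free GPU memory allows.
// The per-layer memory cost is an invented constant, used for illustration only.
static int32_t pick_n_gpu_layers(const struct device_info & dev, float gb_per_layer = 0.5f) {
    const struct gpu_support & gs = dev.gpu_support;
    const bool has_gpu = gs.metal || gs.cuda || gs.vulkan || gs.kompute || gs.sycl;
    if (!has_gpu) {
        return 0;                                                    // stay on the CPU
    }
    return (int32_t) (dev.gpu_props.memory_free / gb_per_layer);    // memory_free is in GB
}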
common/profiler.cpp
@@ -23,8 +23,6 @@
 #include <sys/types.h>
 #include <vector>
 
-namespace profiler {
-
 const char * device_name() {
     static char device_name[256];
 
@@ -314,5 +312,3 @@ void device_get_props(struct llama_model * model, int device, struct ggml_backend_dev_props * props) {
     ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft_type);
     ggml_backend_dev_get_props(dev, props);
 }
-
-} // namespace profiler
common/profiler.h
@@ -2,28 +2,62 @@
 #define PROFILER_H
 
 #include "llama.h"
-#include <string>
 
-#define BUFFER_SIZE_MB 1024
-
-namespace profiler {
-const char * device_name(void);
-
-uint32_t device_cpu_cores      (void);
-uint64_t device_physical_memory(bool available = true);
-uint64_t device_swap_memory    (bool available = true);
-uint64_t device_disk_read_bw   (const char * test_file, size_t buffer_size_mb = BUFFER_SIZE_MB);
-uint64_t device_memory_bw      (size_t buffer_size_mb = BUFFER_SIZE_MB);
-void     device_get_props      (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
-
-int device_has_metal(void);
-int device_has_cuda(void);
-int device_has_vulkan(void);
-int device_has_kompute(void);
-int device_has_gpublas(void);
-int device_has_blas(void);
-int device_has_sycl(void);
-
-} // namespace profiler
+struct cpu_props {
+    const char * name;
+    const char * description;
+    uint32_t     cores;
+};
+
+struct memory_info {
+    float total_physical;     // in GB
+    float available_physical; // in GB
+    float total_swap;         // in GB
+    float available_swap;     // in GB
+    float bandwidth;          // in GB/s
+};
+
+struct gpu_support {
+    bool metal;
+    bool cuda;
+    bool vulkan;
+    bool kompute;
+    bool gpublas;
+    bool blas;
+    bool sycl;
+};
+
+struct gpu_props {
+    const char * name;
+    const char * description;
+    float memory_free;  // in GB
+    float memory_total; // in GB
+};
+
+struct device_info {
+    const char *       device_name;
+    float              disk_read_bandwidth; // in GB/s
+    struct cpu_props   cpu_props;
+    struct memory_info memory;
+    struct gpu_support gpu_support;
+    struct gpu_props   gpu_props;
+};
+
+const char * device_name(void);
+
+uint32_t device_cpu_cores      (void);
+uint64_t device_physical_memory(bool available);
+uint64_t device_swap_memory    (bool available);
+uint64_t device_disk_read_bw   (const char * test_file, size_t buffer_size_mb);
+uint64_t device_memory_bw      (size_t buffer_size_mb);
+void     device_get_props      (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
+
+int device_has_metal(void);
+int device_has_cuda(void);
+int device_has_vulkan(void);
+int device_has_kompute(void);
+int device_has_gpublas(void);
+int device_has_blas(void);
+int device_has_sycl(void);
 
 #endif // PROFILER_H
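The reworked header drops the profiler namespace and the BUFFER_SIZE_MB default arguments in favour of plain aggregates; memory figures are stored as floats in GB (GB/s for bandwidths). A small illustration of how the nested structs compose, with invented values only (real numbers come from llama_profile_device):

#include "profiler.h"

// Illustrative values only; in practice llama_profile_device() fills this in.
struct device_info example = {
    /*device_name         =*/ "my-laptop",
    /*disk_read_bandwidth =*/ 2.10f,                                       // GB/s
    /*cpu_props           =*/ { "Apple M2", "8-core CPU", 8 },
    /*memory              =*/ { 16.00f, 9.50f, 2.00f, 1.25f, 68.25f },     // GB, GB, GB, GB, GB/s
    /*gpu_support         =*/ { true, false, false, false, true, true, false },
    /*gpu_props           =*/ { "Apple M2", "Metal GPU", 10.50f, 10.67f }, // free/total in GB
};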
include/llama.h
@@ -4,6 +4,8 @@
 #include "ggml.h"
 #include "ggml-backend.h"
 
+#include "profiler.h"
+
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
@@ -408,7 +410,8 @@ extern "C" {
     // Call once at the start of the program
     LLAMA_API void llama_backend_init(void);
 
-    ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device);
+    LLAMA_API void llama_profile_device (struct device_info * dev_info, struct llama_model * model, const char * test_file);
+    LLAMA_API ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device);
 
     //optional:
     LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
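With the declaration exported from llama.h, an application can profile a device without going through common. A minimal sketch assuming a GGUF model path and the pre-existing llama.h entry points (llama_backend_init, llama_model_default_params, llama_load_model_from_file, llama_free_model, llama_backend_free):

// minimal_profile.cpp -- illustrative only, not shipped with this commit
#include "llama.h"
#include <cstdio>

int main(int argc, char ** argv) {
    const char * model_path = argc > 1 ? argv[1] : "model.gguf";

    llama_backend_init();

    struct llama_model_params mparams = llama_model_default_params();
    struct llama_model * model = llama_load_model_from_file(model_path, mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model '%s'\n", model_path);
        return 1;
    }

    struct device_info dev_info;
    llama_profile_device(&dev_info, model, model_path);  // also prints the report via LLAMA_LOG_INFO

    printf("cores=%u, mem=%.2f GB, metal=%d\n",
           dev_info.cpu_props.cores, dev_info.memory.total_physical, dev_info.gpu_support.metal);

    llama_free_model(model);
    llama_backend_free();
    return 0;
}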
src/llama.cpp
@@ -10,6 +10,8 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
+#include "profiler.h"
+
 #ifdef GGML_USE_RPC
 #  include "ggml-rpc.h"
 #endif
@@ -3544,6 +3546,74 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
     GGML_UNUSED(model);
 }
 
+void llama_profile_device(device_info * dev_info, struct llama_model * model, const char * test_file) {
+    dev_info->device_name     = device_name();
+    dev_info->cpu_props.cores = device_cpu_cores();
+
+    dev_info->memory.total_physical     = round(device_physical_memory(false) / (double)(1 << 30) * 100) / 100;
+    dev_info->memory.available_physical = round(device_physical_memory(true)  / (double)(1 << 30) * 100) / 100;
+    dev_info->memory.total_swap         = round(device_swap_memory(false)     / (double)(1 << 30) * 100) / 100;
+    dev_info->memory.available_swap     = round(device_swap_memory(true)      / (double)(1 << 30) * 100) / 100;
+    dev_info->memory.bandwidth          = round(device_memory_bw(500)         / (double)(1 << 30) * 100) / 100;
+
+    dev_info->disk_read_bandwidth = round(device_disk_read_bw(test_file, 500) / (double)(1 << 30) * 100) / 100;
+
+    dev_info->gpu_support.metal   = device_has_metal();
+    dev_info->gpu_support.cuda    = device_has_cuda();
+    dev_info->gpu_support.vulkan  = device_has_vulkan();
+    dev_info->gpu_support.kompute = device_has_kompute();
+    dev_info->gpu_support.gpublas = device_has_gpublas();
+    dev_info->gpu_support.blas    = device_has_blas();
+    dev_info->gpu_support.sycl    = device_has_sycl();
+
+    ggml_backend_dev_props cpu_props;
+    ggml_backend_dev_props gpu_props;
+    device_get_props(model, -1, &cpu_props); // -1 for cpu
+    device_get_props(model,  0, &gpu_props); //  0 for gpu0
+
+    dev_info->cpu_props.name        = cpu_props.name;
+    dev_info->cpu_props.description = cpu_props.description;
+
+    dev_info->gpu_props.name         = gpu_props.name;
+    dev_info->gpu_props.description  = gpu_props.description;
+    dev_info->gpu_props.memory_free  = round(gpu_props.memory_free  / (double)(1 << 30) * 100) / 100;
+    dev_info->gpu_props.memory_total = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100;
+
+    LLAMA_LOG_INFO("\n");
+    LLAMA_LOG_INFO("Device Info:\n");
+    LLAMA_LOG_INFO("  Device Name             : %s\n", dev_info->device_name);
+    LLAMA_LOG_INFO("  CPU Name                : %s\n", dev_info->cpu_props.name);
+    LLAMA_LOG_INFO("  CPU Description         : %s\n", dev_info->cpu_props.description);
+    LLAMA_LOG_INFO("  Number of CPU cores     : %u\n", dev_info->cpu_props.cores);
+    LLAMA_LOG_INFO("  Disk Read Bandwidth     : %.2f GB/s\n", dev_info->disk_read_bandwidth);
+    LLAMA_LOG_INFO("\n");
+
+    LLAMA_LOG_INFO("Memory Information:\n");
+    LLAMA_LOG_INFO("  Physical Mem Total      : %.2f GB\n", dev_info->memory.total_physical);
+    LLAMA_LOG_INFO("  Physical Mem Available  : %.2f GB\n", dev_info->memory.available_physical);
+    LLAMA_LOG_INFO("  Swap Memory Total       : %.2f GB\n", dev_info->memory.total_swap);
+    LLAMA_LOG_INFO("  Swap Memory Available   : %.2f GB\n", dev_info->memory.available_swap);
+    LLAMA_LOG_INFO("  Mem Bandwidth           : %.2f GB/s\n", dev_info->memory.bandwidth);
+    LLAMA_LOG_INFO("\n");
+
+    LLAMA_LOG_INFO("GPU Support:\n");
+    LLAMA_LOG_INFO("  Metal                   : %i\n", dev_info->gpu_support.metal);
+    LLAMA_LOG_INFO("  CUDA                    : %i\n", dev_info->gpu_support.cuda);
+    LLAMA_LOG_INFO("  Vulkan                  : %i\n", dev_info->gpu_support.vulkan);
+    LLAMA_LOG_INFO("  Kompute                 : %i\n", dev_info->gpu_support.kompute);
+    LLAMA_LOG_INFO("  GPU BLAS                : %i\n", dev_info->gpu_support.gpublas);
+    LLAMA_LOG_INFO("  BLAS                    : %i\n", dev_info->gpu_support.blas);
+    LLAMA_LOG_INFO("  SYCL                    : %i\n", dev_info->gpu_support.sycl);
+    LLAMA_LOG_INFO("\n");
+
+    LLAMA_LOG_INFO("GPU Properties:\n");
+    LLAMA_LOG_INFO("  GPU Name                : %s\n", dev_info->gpu_props.name);
+    LLAMA_LOG_INFO("  Description             : %s\n", dev_info->gpu_props.description);
+    LLAMA_LOG_INFO("  Memory Free             : %.2f GB\n", dev_info->gpu_props.memory_free);
+    LLAMA_LOG_INFO("  Memory Total            : %.2f GB\n", dev_info->gpu_props.memory_total);
+}
+
 ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device) {
     return llama_default_buffer_type_offload(*model, device);
 }
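A note on the recurring expression in the body above: round(x / (double)(1 << 30) * 100) / 100 converts a byte count to GiB and keeps two decimal places, matching the %.2f GB log format and the float fields of device_info. A quick worked check (standalone, not part of the commit):

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
    uint64_t bytes = 9126805504ULL;                          // 8.5 GiB reported by the profiler
    double   gb    = round(bytes / (double)(1 << 30) * 100) / 100;
    printf("%.2f GB\n", gb);                                 // prints "8.50 GB"
    return 0;
}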