add LLAMA_API llama_profile_device

This commit is contained in:
Lizonghang 2024-11-07 09:30:39 +04:00
parent b922418cca
commit ef7fdf70cc
6 changed files with 131 additions and 80 deletions

View file

@@ -1185,7 +1185,6 @@ common/common.o: \
common/sampling.h \
common/json.hpp \
common/json-schema-to-grammar.h \
common/profiler.h \
include/llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@

View file

@@ -9,7 +9,6 @@
#include "json.hpp"
#include "json-schema-to-grammar.h"
#include "llama.h"
#include "profiler.h"
#include <algorithm>
#include <cinttypes>
@@ -824,7 +823,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
llama_init_result iparams;
auto mparams = llama_model_params_from_gpt_params(params);
llama_model * model = nullptr;
struct llama_model * model = nullptr;
if (!params.hf_repo.empty() && !params.hf_file.empty()) {
model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
@@ -835,58 +834,8 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
}
// profile devices and determine the best setup
const char * dev_name = profiler::device_name();
uint32_t n_cpu_cores = profiler::device_cpu_cores();
uint64_t total_memory = profiler::device_physical_memory(false);
uint64_t available_memory = profiler::device_physical_memory(true);
uint64_t total_swap = profiler::device_swap_memory(false);
uint64_t available_swap = profiler::device_swap_memory(true);
uint64_t disk_read_bw = profiler::device_disk_read_bw(params.model.c_str(), 500);
uint64_t memory_bw = profiler::device_memory_bw(500);
int has_metal = profiler::device_has_metal();
int has_cuda = profiler::device_has_cuda();
int has_vulkan = profiler::device_has_vulkan();
int has_kompute = profiler::device_has_kompute();
int has_gpublas = profiler::device_has_gpublas();
int has_blas = profiler::device_has_blas();
int has_sycl = profiler::device_has_sycl();
ggml_backend_dev_props cpu_props;
ggml_backend_dev_props gpu_props;
profiler::device_get_props(model, -1, &cpu_props); // -1 for cpu
profiler::device_get_props(model, 0, &gpu_props); // 0 for gpu0
LOG_INF("\n");
LOG_INF("Device Info:\n");
LOG_INF(" Device Name : %s\n", dev_name);
LOG_INF(" CPU Name : %s\n", cpu_props.name);
LOG_INF(" CPU Description : %s\n", cpu_props.description);
LOG_INF(" Number of CPU cores : %u\n", n_cpu_cores);
LOG_INF(" Disk Read Bandwidth : %.2f GB/s\n", disk_read_bw / (double)(1 << 30));
LOG_INF("\n");
LOG_INF("Memory Information:\n");
LOG_INF(" Physical Mem Total : %.2f GB\n", total_memory / (double)(1 << 30));
LOG_INF(" Physical Mem Available : %.2f GB\n", available_memory / (double)(1 << 30));
LOG_INF(" Swap Memory Total : %.2f GB\n", total_swap / (double)(1 << 30));
LOG_INF(" Swap Memory Available : %.2f GB\n", available_swap / (double)(1 << 30));
LOG_INF(" Mem Bandwidth : %.2f GB/s\n", memory_bw / (double)(1 << 30));
LOG_INF("\n");
LOG_INF("GPU Support:\n");
LOG_INF(" Metal : %i\n", has_metal);
LOG_INF(" CUDA : %i\n", has_cuda);
LOG_INF(" Vulkan : %i\n", has_vulkan);
LOG_INF(" Kompute : %i\n", has_kompute);
LOG_INF(" GPU BLAS : %i\n", has_gpublas);
LOG_INF(" BLAS : %i\n", has_blas);
LOG_INF(" SYCL : %i\n", has_sycl);
LOG_INF("\n");
LOG_INF("GPU Properties:\n");
LOG_INF(" GPU Name : %s\n", gpu_props.name);
LOG_INF(" Description : %s\n", gpu_props.description);
LOG_INF(" Memory Free : %.2f GB\n", gpu_props.memory_free / (double)(1 << 30));
LOG_INF(" Memory Total : %.2f GB\n", gpu_props.memory_total / (double)(1 << 30));
device_info dev_info;
llama_profile_device(&dev_info, model, params.model.c_str());
if (model == NULL) {
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());

View file

@@ -23,8 +23,6 @@
#include <sys/types.h>
#include <vector>
namespace profiler {
const char * device_name() {
static char device_name[256];
@@ -314,5 +312,3 @@ void device_get_props(struct llama_model * model, int device, struct ggml_backen
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft_type);
ggml_backend_dev_get_props(dev, props);
}
} // namespace profiler

View file

@@ -2,28 +2,62 @@
#define PROFILER_H
#include "llama.h"
#include <string>
#define BUFFER_SIZE_MB 1024
// Identification of the host CPU, filled in by llama_profile_device().
// NOTE(review): name/description are non-owning pointers obtained from the
// ggml backend props — assumes the backend keeps that storage alive; confirm.
struct cpu_props {
const char * name;        // CPU name reported by the ggml CPU backend
const char * description; // CPU description reported by the ggml CPU backend
uint32_t cores;           // number of CPU cores (from device_cpu_cores())
};
namespace profiler {
const char * device_name(void);
// Host memory statistics, filled in by llama_profile_device().
// All values are rounded to two decimal places for display.
struct memory_info {
float total_physical; // in GB
float available_physical; // in GB
float total_swap; // in GB
float available_swap; // in GB
float bandwidth; // in GB/s
};
uint32_t device_cpu_cores (void);
uint64_t device_physical_memory(bool available = true);
uint64_t device_swap_memory (bool available = true);
uint64_t device_disk_read_bw (const char * test_file, size_t buffer_size_mb = BUFFER_SIZE_MB);
uint64_t device_memory_bw (size_t buffer_size_mb = BUFFER_SIZE_MB);
void device_get_props (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
// Which GPU/accelerator backends this build of llama.cpp was compiled with
// (one flag per device_has_* probe in the profiler).
struct gpu_support {
bool metal;
bool cuda;
bool vulkan;
bool kompute;
bool gpublas;
bool blas;
bool sycl;
};
int device_has_metal(void);
int device_has_cuda(void);
int device_has_vulkan(void);
int device_has_kompute(void);
int device_has_gpublas(void);
int device_has_blas(void);
int device_has_sycl(void);
// Properties of the profiled GPU (device 0), filled in by llama_profile_device().
// NOTE(review): name/description are non-owning pointers from the ggml backend
// props — assumes the backend keeps that storage alive; confirm.
struct gpu_props {
const char * name;        // GPU device name
const char * description; // GPU device description
float memory_free; // in GB
float memory_total; // in GB
};
} // namespace profiler (removed: profiler symbols are now at global scope)
// Aggregate device profile produced by llama_profile_device(): host name,
// disk throughput, CPU/memory/GPU capabilities of the local machine.
struct device_info {
const char * device_name;  // host/device name (from device_name())
float disk_read_bandwidth; // in GB/s
struct cpu_props cpu_props;
struct memory_info memory;
struct gpu_support gpu_support;
struct gpu_props gpu_props;    // presumably describes GPU 0 — verify against device_get_props usage
};
// Host/device name; NOTE(review): implementation appears to return a static
// buffer — do not free, not thread-safe to mutate.
const char * device_name(void);
// Number of CPU cores on this host.
uint32_t device_cpu_cores (void);
// Physical RAM in bytes; `available` selects free vs. total.
uint64_t device_physical_memory(bool available);
// Swap space in bytes; `available` selects free vs. total.
uint64_t device_swap_memory (bool available);
// Benchmark sequential read bandwidth (bytes/s) using `test_file`,
// reading up to `buffer_size_mb` MiB.
uint64_t device_disk_read_bw (const char * test_file, size_t buffer_size_mb);
// Benchmark RAM bandwidth (bytes/s) using a `buffer_size_mb` MiB buffer.
uint64_t device_memory_bw (size_t buffer_size_mb);
// Query ggml backend properties; device -1 selects the CPU backend,
// 0..n select GPU devices.
void device_get_props (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
// Compile-time backend availability probes (1 if built with the backend).
int device_has_metal(void);
int device_has_cuda(void);
int device_has_vulkan(void);
int device_has_kompute(void);
int device_has_gpublas(void);
int device_has_blas(void);
int device_has_sycl(void);
#endif // PROFILER_H

View file

@@ -4,6 +4,8 @@
#include "ggml.h"
#include "ggml-backend.h"
#include "profiler.h"
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
@@ -408,7 +410,8 @@ extern "C" {
// Call once at the start of the program
LLAMA_API void llama_backend_init(void);
ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device);
LLAMA_API void llama_profile_device (struct device_info * dev_info, struct llama_model * model, const char * test_file);
LLAMA_API ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device);
//optional:
LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);

View file

@@ -10,6 +10,8 @@
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "profiler.h"
#ifdef GGML_USE_RPC
# include "ggml-rpc.h"
#endif
@@ -3544,6 +3546,74 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
GGML_UNUSED(model);
}
// Convert a byte count (or bytes/s) to GiB, rounded to two decimal places.
// Factored out: the original repeated `round(x / (double)(1 << 30) * 100) / 100`
// ten times inline.
static float profile_bytes_to_gib(double bytes) {
    return (float)(round(bytes / (double)(1 << 30) * 100) / 100);
}

// Profile the local device and fill `dev_info`: host name, CPU core count,
// physical/swap memory, memory and disk bandwidth (benchmarked), compiled-in
// GPU backend support, and CPU/GPU backend properties. The collected profile
// is also printed via LLAMA_LOG_INFO.
//
// dev_info   - out: profile results (must be non-NULL)
// model      - model used to resolve ggml backend buffer types for the probes
// test_file  - file read by the disk-bandwidth benchmark (typically the model path)
void llama_profile_device(device_info * dev_info, struct llama_model * model, const char * test_file) {
    // Buffer size (MiB) used by both bandwidth benchmarks; was a magic 500
    // repeated at each call site.
    const size_t bench_buffer_mb = 500;

    dev_info->device_name     = device_name();
    dev_info->cpu_props.cores = device_cpu_cores();

    dev_info->memory.total_physical     = profile_bytes_to_gib((double)device_physical_memory(false));
    dev_info->memory.available_physical = profile_bytes_to_gib((double)device_physical_memory(true));
    dev_info->memory.total_swap         = profile_bytes_to_gib((double)device_swap_memory(false));
    dev_info->memory.available_swap     = profile_bytes_to_gib((double)device_swap_memory(true));
    dev_info->memory.bandwidth          = profile_bytes_to_gib((double)device_memory_bw(bench_buffer_mb));
    dev_info->disk_read_bandwidth       = profile_bytes_to_gib((double)device_disk_read_bw(test_file, bench_buffer_mb));

    dev_info->gpu_support.metal   = device_has_metal();
    dev_info->gpu_support.cuda    = device_has_cuda();
    dev_info->gpu_support.vulkan  = device_has_vulkan();
    dev_info->gpu_support.kompute = device_has_kompute();
    dev_info->gpu_support.gpublas = device_has_gpublas();
    dev_info->gpu_support.blas    = device_has_blas();
    dev_info->gpu_support.sycl    = device_has_sycl();

    ggml_backend_dev_props cpu_props;
    ggml_backend_dev_props gpu_props;
    device_get_props(model, -1, &cpu_props); // -1 for cpu
    device_get_props(model,  0, &gpu_props); // 0 for gpu0

    // NOTE(review): copies the name/description pointers, not the strings —
    // assumes the ggml backend keeps that storage alive for the program's
    // lifetime; confirm before storing dev_info long-term.
    dev_info->cpu_props.name        = cpu_props.name;
    dev_info->cpu_props.description = cpu_props.description;
    dev_info->gpu_props.name        = gpu_props.name;
    dev_info->gpu_props.description = gpu_props.description;
    dev_info->gpu_props.memory_free  = profile_bytes_to_gib((double)gpu_props.memory_free);
    dev_info->gpu_props.memory_total = profile_bytes_to_gib((double)gpu_props.memory_total);

    LLAMA_LOG_INFO("\n");
    LLAMA_LOG_INFO("Device Info:\n");
    LLAMA_LOG_INFO(" Device Name : %s\n", dev_info->device_name);
    LLAMA_LOG_INFO(" CPU Name : %s\n", dev_info->cpu_props.name);
    LLAMA_LOG_INFO(" CPU Description : %s\n", dev_info->cpu_props.description);
    LLAMA_LOG_INFO(" Number of CPU cores : %u\n", dev_info->cpu_props.cores);
    LLAMA_LOG_INFO(" Disk Read Bandwidth : %.2f GB/s\n", dev_info->disk_read_bandwidth);
    LLAMA_LOG_INFO("\n");
    LLAMA_LOG_INFO("Memory Information:\n");
    LLAMA_LOG_INFO(" Physical Mem Total : %.2f GB\n", dev_info->memory.total_physical);
    LLAMA_LOG_INFO(" Physical Mem Available : %.2f GB\n", dev_info->memory.available_physical);
    LLAMA_LOG_INFO(" Swap Memory Total : %.2f GB\n", dev_info->memory.total_swap);
    LLAMA_LOG_INFO(" Swap Memory Available : %.2f GB\n", dev_info->memory.available_swap);
    LLAMA_LOG_INFO(" Mem Bandwidth : %.2f GB/s\n", dev_info->memory.bandwidth);
    LLAMA_LOG_INFO("\n");
    LLAMA_LOG_INFO("GPU Support:\n");
    LLAMA_LOG_INFO(" Metal : %i\n", dev_info->gpu_support.metal);
    LLAMA_LOG_INFO(" CUDA : %i\n", dev_info->gpu_support.cuda);
    LLAMA_LOG_INFO(" Vulkan : %i\n", dev_info->gpu_support.vulkan);
    LLAMA_LOG_INFO(" Kompute : %i\n", dev_info->gpu_support.kompute);
    LLAMA_LOG_INFO(" GPU BLAS : %i\n", dev_info->gpu_support.gpublas);
    LLAMA_LOG_INFO(" BLAS : %i\n", dev_info->gpu_support.blas);
    LLAMA_LOG_INFO(" SYCL : %i\n", dev_info->gpu_support.sycl);
    LLAMA_LOG_INFO("\n");
    LLAMA_LOG_INFO("GPU Properties:\n");
    LLAMA_LOG_INFO(" GPU Name : %s\n", dev_info->gpu_props.name);
    LLAMA_LOG_INFO(" Description : %s\n", dev_info->gpu_props.description);
    LLAMA_LOG_INFO(" Memory Free : %.2f GB\n", dev_info->gpu_props.memory_free);
    LLAMA_LOG_INFO(" Memory Total : %.2f GB\n", dev_info->gpu_props.memory_total);
}
// Public wrapper exposing the backend buffer type used to offload tensors to
// `device` for this model (forwards to the internal
// llama_default_buffer_type_offload; model must be non-NULL).
ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device) {
return llama_default_buffer_type_offload(*model, device);
}