add CPU and GPU profiling

Lizonghang 2024-11-06 20:42:28 +04:00
parent 4e1be1065d
commit 407c71ae52
6 changed files with 113 additions and 16 deletions

@@ -843,15 +843,50 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     uint64_t available_swap = profiler::device_swap_memory(true);
     uint64_t disk_read_bw = profiler::device_disk_read_bw(params.model.c_str(), 500);
     uint64_t memory_bw = profiler::device_memory_bw(500);
+    int has_metal = profiler::device_has_metal();
+    int has_cuda = profiler::device_has_cuda();
+    int has_vulkan = profiler::device_has_vulkan();
+    int has_kompute = profiler::device_has_kompute();
+    int has_gpublas = profiler::device_has_gpublas();
+    int has_blas = profiler::device_has_blas();
+    int has_sycl = profiler::device_has_sycl();
+    ggml_backend_dev_props cpu_props;
+    ggml_backend_dev_props gpu_props;
+    profiler::device_get_props(model, -1, &cpu_props); // -1 for cpu
+    profiler::device_get_props(model, 0, &gpu_props); // 0 for gpu0
 
-    LOG_INF("Device Name: %s\n", dev_name);
-    LOG_INF("Number of CPU cores: %u\n", n_cpu_cores);
-    LOG_INF("Total Physical Memory: %.2f GB\n", total_memory / (double)(1 << 30));
-    LOG_INF("Available Physical Memory: %.2f GB\n", available_memory / (double)(1 << 30));
-    LOG_INF("Total Swap Memory: %.2f GB\n", total_swap / (double)(1 << 30));
-    LOG_INF("Available Swap Memory: %.2f GB\n", available_swap / (double)(1 << 30));
-    LOG_INF("Disk Read Bandwidth: %.2f GB/s\n", disk_read_bw / (double)(1 << 30));
-    LOG_INF("Memory Bandwidth: %.2f GB/s\n", memory_bw / (double)(1 << 30));
+    LOG_INF("\n");
+    LOG_INF("Device Info:\n");
+    LOG_INF(" Device Name : %s\n", dev_name);
+    LOG_INF(" CPU Name : %s\n", cpu_props.name);
+    LOG_INF(" CPU Description : %s\n", cpu_props.description);
+    LOG_INF(" Number of CPU cores : %u\n", n_cpu_cores);
+    LOG_INF(" Disk Read Bandwidth : %.2f GB/s\n", disk_read_bw / (double)(1 << 30));
+    LOG_INF("\n");
+    LOG_INF("Memory Information:\n");
+    LOG_INF(" Physical Mem Total : %.2f GB\n", total_memory / (double)(1 << 30));
+    LOG_INF(" Physical Mem Available : %.2f GB\n", available_memory / (double)(1 << 30));
+    LOG_INF(" Swap Memory Total : %.2f GB\n", total_swap / (double)(1 << 30));
+    LOG_INF(" Swap Memory Available : %.2f GB\n", available_swap / (double)(1 << 30));
+    LOG_INF(" Mem Bandwidth : %.2f GB/s\n", memory_bw / (double)(1 << 30));
+    LOG_INF("\n");
+    LOG_INF("GPU Support:\n");
+    LOG_INF(" Metal : %i\n", has_metal);
+    LOG_INF(" CUDA : %i\n", has_cuda);
+    LOG_INF(" Vulkan : %i\n", has_vulkan);
+    LOG_INF(" Kompute : %i\n", has_kompute);
+    LOG_INF(" GPU BLAS : %i\n", has_gpublas);
+    LOG_INF(" BLAS : %i\n", has_blas);
+    LOG_INF(" SYCL : %i\n", has_sycl);
+    LOG_INF("\n");
+    LOG_INF("GPU Properties:\n");
+    LOG_INF(" GPU Name : %s\n", gpu_props.name);
+    LOG_INF(" Description : %s\n", gpu_props.description);
+    LOG_INF(" Memory Free : %d MB\n", (int)(gpu_props.memory_free / (double)(1 << 20)));
+    LOG_INF(" Memory Total : %.2f GB\n", gpu_props.memory_total / (double)(1 << 30));
 
     if (model == NULL) {
         LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());

@@ -1,5 +1,8 @@
 #include "log.h"
 #include "profiler.h"
+#include "ggml.h"
+#include "ggml-backend.h"
+#include "llama.h"
 
 #if defined(_WIN32) || defined(_WIN64)
 #include <windows.h>
@@ -271,4 +274,45 @@ uint64_t device_memory_bw(size_t buffer_size_mb) {
     return speed;
 }
 
+int device_has_metal(void) {
+    return ggml_cpu_has_metal();
+}
+
+int device_has_cuda(void) {
+    return ggml_cpu_has_cuda();
+}
+
+int device_has_vulkan(void) {
+    return ggml_cpu_has_vulkan();
+}
+
+int device_has_kompute(void) {
+    return ggml_cpu_has_kompute();
+}
+
+int device_has_gpublas(void) {
+    return ggml_cpu_has_gpublas();
+}
+
+int device_has_blas(void) {
+    return ggml_cpu_has_blas();
+}
+
+int device_has_sycl(void) {
+    return ggml_cpu_has_sycl();
+}
+
+// ggml_backend_buffer_type_t llama_dev_buffer_type(const llama_model * model, int device)
+void device_get_props(struct llama_model * model, int device, struct ggml_backend_dev_props * props) {
+    ggml_backend_buffer_type_t buft_type;
+    if (device == -1) { // type cpu
+        buft_type = ggml_backend_cpu_buffer_type();
+    } else { // type gpu
+        buft_type = llama_dev_buffer_type(model, device);
+    }
+    ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft_type);
+    ggml_backend_dev_get_props(dev, props);
+}
+
 } // namespace profiler
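The new device_get_props helper picks a backend buffer type (the CPU buffer type when device == -1, otherwise the buffer type of the requested GPU via llama_dev_buffer_type) and then queries that backend's properties. A hedged usage sketch mirroring the call sites in llama_init_from_gpt_params above (it assumes a llama_model * already loaded, e.g. via llama_load_model_from_file, and that a GPU backend is compiled in):

#include "profiler.h"
#include "ggml-backend.h"
#include <cstdio>

static void print_backend_props(struct llama_model * model) {
    struct ggml_backend_dev_props cpu_props;
    struct ggml_backend_dev_props gpu_props;

    profiler::device_get_props(model, -1, &cpu_props); // -1 selects the CPU buffer type
    profiler::device_get_props(model,  0, &gpu_props); //  0 selects the first GPU

    printf("CPU : %s (%s)\n", cpu_props.name, cpu_props.description);
    printf("GPU : %s, %.2f GB total, %.2f GB free\n",
           gpu_props.name,
           gpu_props.memory_total / (double)(1 << 30),
           gpu_props.memory_free / (double)(1 << 30));
}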

@@ -1,17 +1,29 @@
 #ifndef PROFILER_H
 #define PROFILER_H
 
+#include "llama.h"
+
 #include <string>
 
 #define BUFFER_SIZE_MB 1024
 
 namespace profiler {
 
-const char * device_name();
-uint32_t device_cpu_cores();
+const char * device_name(void);
+uint32_t device_cpu_cores (void);
 uint64_t device_physical_memory(bool available = true);
-uint64_t device_swap_memory(bool available = true);
-uint64_t device_disk_read_bw(const char * test_file, size_t buffer_size_mb = BUFFER_SIZE_MB);
-uint64_t device_memory_bw(size_t buffer_size_mb = BUFFER_SIZE_MB);
+uint64_t device_swap_memory (bool available = true);
+uint64_t device_disk_read_bw (const char * test_file, size_t buffer_size_mb = BUFFER_SIZE_MB);
+uint64_t device_memory_bw (size_t buffer_size_mb = BUFFER_SIZE_MB);
+void device_get_props (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
+
+int device_has_metal(void);
+int device_has_cuda(void);
+int device_has_vulkan(void);
+int device_has_kompute(void);
+int device_has_gpublas(void);
+int device_has_blas(void);
+int device_has_sycl(void);
 
 } // namespace profiler
 
 #endif // PROFILER_H

@@ -408,6 +408,8 @@ extern "C" {
     // Call once at the start of the program
     LLAMA_API void llama_backend_init(void);
 
+    ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device);
+
     //optional:
     LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
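llama_dev_buffer_type is the hook the profiler uses to reach a device's buffer type from outside llama's internals. As a brief hedged illustration (hypothetical snippet, assuming a loaded model), the returned buffer type can also be inspected directly with the existing ggml-backend API:

// which buffer type backs GPU 0 for this model?
ggml_backend_buffer_type_t buft = llama_dev_buffer_type(model, 0);
printf("device 0 buffer type: %s\n", ggml_backend_buft_name(buft));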

@@ -3544,6 +3544,10 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
     GGML_UNUSED(model);
 }
 
+ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device) {
+    return llama_default_buffer_type_offload(*model, device);
+}
+
 static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
     ggml_backend_buffer_type_t buft = nullptr;

@@ -17385,7 +17389,7 @@ static int llama_recv_meta(zmq::socket_t & socket, struct sync_meta * meta) {
     return 0;
 }
 
-static void llama_recv_tensors(zmq::socket_t & socket, struct llama_ubatch * ubatch, struct llama_context * lctx, const bool is_out_embd=false) {
+static void llama_recv_tensors(zmq::socket_t & socket, struct llama_ubatch * ubatch, const bool is_out_embd=false) {
     std::vector<zmq::message_t> recv_msgs;
     if (!zmq::recv_multipart(socket, std::back_inserter(recv_msgs))) {
         LLAMA_LOG_INFO("Failed to receive tensor data.\n");

@@ -17724,7 +17728,7 @@ static int llama_decode_internal(
     // receive data from other nodes
     if (n_world > 1 && !(my_rank == 0 && i == 0) && !(my_rank == 0 && is_last_l)) {
         const bool is_out_embd = my_rank == 0 && i == (size_t)gf.size() - 1;
-        llama_recv_tensors(*lctx.recv_socket, &ubatch, &lctx, is_out_embd);
+        llama_recv_tensors(*lctx.recv_socket, &ubatch, is_out_embd);
     }
 
     // ensure ggml_backend_tensor_get_async of the previous subgraph has finished