Mirror of https://github.com/Lizonghang/prima.cpp.git (synced 2025-09-10 16:24:37 +00:00)

Commit ef7fdf70cc (parent b922418cca): add LLAMA_API llama_profile_device

6 changed files with 131 additions and 80 deletions
Makefile
@@ -1185,7 +1185,6 @@ common/common.o: \
 	common/sampling.h \
 	common/json.hpp \
 	common/json-schema-to-grammar.h \
-	common/profiler.h \
 	include/llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
common/common.cpp
@@ -9,7 +9,6 @@
 #include "json.hpp"
 #include "json-schema-to-grammar.h"
 #include "llama.h"
-#include "profiler.h"
 
 #include <algorithm>
 #include <cinttypes>
@@ -824,7 +823,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     llama_init_result iparams;
     auto mparams = llama_model_params_from_gpt_params(params);
 
-    llama_model * model = nullptr;
+    struct llama_model * model = nullptr;
 
     if (!params.hf_repo.empty() && !params.hf_file.empty()) {
         model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
@@ -835,58 +834,8 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     }
 
     // profile devices and determine the best setup
-    const char * dev_name = profiler::device_name();
-    uint32_t n_cpu_cores = profiler::device_cpu_cores();
-    uint64_t total_memory = profiler::device_physical_memory(false);
-    uint64_t available_memory = profiler::device_physical_memory(true);
-    uint64_t total_swap = profiler::device_swap_memory(false);
-    uint64_t available_swap = profiler::device_swap_memory(true);
-    uint64_t disk_read_bw = profiler::device_disk_read_bw(params.model.c_str(), 500);
-    uint64_t memory_bw = profiler::device_memory_bw(500);
-    int has_metal = profiler::device_has_metal();
-    int has_cuda = profiler::device_has_cuda();
-    int has_vulkan = profiler::device_has_vulkan();
-    int has_kompute = profiler::device_has_kompute();
-    int has_gpublas = profiler::device_has_gpublas();
-    int has_blas = profiler::device_has_blas();
-    int has_sycl = profiler::device_has_sycl();
-    ggml_backend_dev_props cpu_props;
-    ggml_backend_dev_props gpu_props;
-    profiler::device_get_props(model, -1, &cpu_props); // -1 for cpu
-    profiler::device_get_props(model, 0, &gpu_props);  // 0 for gpu0
-
-    LOG_INF("\n");
-    LOG_INF("Device Info:\n");
-    LOG_INF("  Device Name             : %s\n", dev_name);
-    LOG_INF("  CPU Name                : %s\n", cpu_props.name);
-    LOG_INF("  CPU Description         : %s\n", cpu_props.description);
-    LOG_INF("  Number of CPU cores     : %u\n", n_cpu_cores);
-    LOG_INF("  Disk Read Bandwidth     : %.2f GB/s\n", disk_read_bw / (double)(1 << 30));
-    LOG_INF("\n");
-
-    LOG_INF("Memory Information:\n");
-    LOG_INF("  Physical Mem Total      : %.2f GB\n", total_memory / (double)(1 << 30));
-    LOG_INF("  Physical Mem Available  : %.2f GB\n", available_memory / (double)(1 << 30));
-    LOG_INF("  Swap Memory Total       : %.2f GB\n", total_swap / (double)(1 << 30));
-    LOG_INF("  Swap Memory Available   : %.2f GB\n", available_swap / (double)(1 << 30));
-    LOG_INF("  Mem Bandwidth           : %.2f GB/s\n", memory_bw / (double)(1 << 30));
-    LOG_INF("\n");
-
-    LOG_INF("GPU Support:\n");
-    LOG_INF("  Metal                   : %i\n", has_metal);
-    LOG_INF("  CUDA                    : %i\n", has_cuda);
-    LOG_INF("  Vulkan                  : %i\n", has_vulkan);
-    LOG_INF("  Kompute                 : %i\n", has_kompute);
-    LOG_INF("  GPU BLAS                : %i\n", has_gpublas);
-    LOG_INF("  BLAS                    : %i\n", has_blas);
-    LOG_INF("  SYCL                    : %i\n", has_sycl);
-    LOG_INF("\n");
-
-    LOG_INF("GPU Properties:\n");
-    LOG_INF("  GPU Name                : %s\n", gpu_props.name);
-    LOG_INF("  Description             : %s\n", gpu_props.description);
-    LOG_INF("  Memory Free             : %.2f GB\n", gpu_props.memory_free / (double)(1 << 30));
-    LOG_INF("  Memory Total            : %.2f GB\n", gpu_props.memory_total / (double)(1 << 30));
+    device_info dev_info;
+    llama_profile_device(&dev_info, model, params.model.c_str());
 
     if (model == NULL) {
         LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
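After this change the common-side code boils down to two lines: declare a device_info on the stack and let llama_profile_device fill it. As a rough sketch of how the filled-in struct could feed the "determine the best setup" step mentioned in the comment (the helper name and the 0.5 GB-per-layer figure below are hypothetical placeholders, not part of this commit):

#include "llama.h"  // after this commit, llama.h pulls in profiler.h and struct device_info

// Hypothetical heuristic: offload as many layers as the free GPU memory allows.
// The per-layer memory cost is an invented constant, used for illustration only.
static int32_t pick_n_gpu_layers(const struct device_info & dev, float gb_per_layer = 0.5f) {
    const struct gpu_support & gs = dev.gpu_support;
    const bool has_gpu = gs.metal || gs.cuda || gs.vulkan || gs.kompute || gs.sycl;
    if (!has_gpu) {
        return 0;                                                    // stay on the CPU
    }
    return (int32_t) (dev.gpu_props.memory_free / gb_per_layer);    // memory_free is in GB
}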
common/profiler.cpp
@@ -23,8 +23,6 @@
 #include <sys/types.h>
 #include <vector>
 
-namespace profiler {
-
 const char * device_name() {
     static char device_name[256];
 
@@ -314,5 +312,3 @@ void device_get_props(struct llama_model * model, int device, struct ggml_backend_dev_props * props) {
     ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft_type);
     ggml_backend_dev_get_props(dev, props);
 }
-
-} // namespace profiler
common/profiler.h
@@ -2,28 +2,62 @@
 #define PROFILER_H
 
 #include "llama.h"
-#include <string>
 
-#define BUFFER_SIZE_MB 1024
-
-namespace profiler {
-const char * device_name(void);
-
-uint32_t device_cpu_cores      (void);
-uint64_t device_physical_memory(bool available = true);
-uint64_t device_swap_memory    (bool available = true);
-uint64_t device_disk_read_bw   (const char * test_file, size_t buffer_size_mb = BUFFER_SIZE_MB);
-uint64_t device_memory_bw      (size_t buffer_size_mb = BUFFER_SIZE_MB);
-void     device_get_props      (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
-
-int device_has_metal(void);
-int device_has_cuda(void);
-int device_has_vulkan(void);
-int device_has_kompute(void);
-int device_has_gpublas(void);
-int device_has_blas(void);
-int device_has_sycl(void);
-
-} // namespace profiler
+struct cpu_props {
+    const char * name;
+    const char * description;
+    uint32_t     cores;
+};
+
+struct memory_info {
+    float total_physical;     // in GB
+    float available_physical; // in GB
+    float total_swap;         // in GB
+    float available_swap;     // in GB
+    float bandwidth;          // in GB/s
+};
+
+struct gpu_support {
+    bool metal;
+    bool cuda;
+    bool vulkan;
+    bool kompute;
+    bool gpublas;
+    bool blas;
+    bool sycl;
+};
+
+struct gpu_props {
+    const char * name;
+    const char * description;
+    float memory_free;  // in GB
+    float memory_total; // in GB
+};
+
+struct device_info {
+    const char *       device_name;
+    float              disk_read_bandwidth; // in GB/s
+    struct cpu_props   cpu_props;
+    struct memory_info memory;
+    struct gpu_support gpu_support;
+    struct gpu_props   gpu_props;
+};
+
+const char * device_name(void);
+
+uint32_t device_cpu_cores      (void);
+uint64_t device_physical_memory(bool available);
+uint64_t device_swap_memory    (bool available);
+uint64_t device_disk_read_bw   (const char * test_file, size_t buffer_size_mb);
+uint64_t device_memory_bw      (size_t buffer_size_mb);
+void     device_get_props      (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
+
+int device_has_metal(void);
+int device_has_cuda(void);
+int device_has_vulkan(void);
+int device_has_kompute(void);
+int device_has_gpublas(void);
+int device_has_blas(void);
+int device_has_sycl(void);
 
 #endif // PROFILER_H
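The reworked header drops the profiler namespace and the BUFFER_SIZE_MB default arguments in favour of plain aggregates; memory figures are stored as floats in GB (GB/s for bandwidths). A small illustration of how the nested structs compose, with invented values only (real numbers come from llama_profile_device):

#include "profiler.h"

// Illustrative values only; in practice llama_profile_device() fills this in.
struct device_info example = {
    /*device_name         =*/ "my-laptop",
    /*disk_read_bandwidth =*/ 2.10f,                                       // GB/s
    /*cpu_props           =*/ { "Apple M2", "8-core CPU", 8 },
    /*memory              =*/ { 16.00f, 9.50f, 2.00f, 1.25f, 68.25f },     // GB, GB, GB, GB, GB/s
    /*gpu_support         =*/ { true, false, false, false, true, true, false },
    /*gpu_props           =*/ { "Apple M2", "Metal GPU", 10.50f, 10.67f }, // free/total in GB
};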
include/llama.h
@@ -4,6 +4,8 @@
 #include "ggml.h"
 #include "ggml-backend.h"
 
+#include "profiler.h"
+
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
@@ -408,7 +410,8 @@ extern "C" {
     // Call once at the start of the program
     LLAMA_API void llama_backend_init(void);
 
-    ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device);
+    LLAMA_API void llama_profile_device (struct device_info * dev_info, struct llama_model * model, const char * test_file);
+    LLAMA_API ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device);
 
     //optional:
     LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
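With the declaration exported from llama.h, an application can profile a device without going through common. A minimal sketch assuming a GGUF model path and the pre-existing llama.h entry points (llama_backend_init, llama_model_default_params, llama_load_model_from_file, llama_free_model, llama_backend_free):

// minimal_profile.cpp -- illustrative only, not shipped with this commit
#include "llama.h"
#include <cstdio>

int main(int argc, char ** argv) {
    const char * model_path = argc > 1 ? argv[1] : "model.gguf";

    llama_backend_init();

    struct llama_model_params mparams = llama_model_default_params();
    struct llama_model * model = llama_load_model_from_file(model_path, mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model '%s'\n", model_path);
        return 1;
    }

    struct device_info dev_info;
    llama_profile_device(&dev_info, model, model_path);  // also prints the report via LLAMA_LOG_INFO

    printf("cores=%u, mem=%.2f GB, metal=%d\n",
           dev_info.cpu_props.cores, dev_info.memory.total_physical, dev_info.gpu_support.metal);

    llama_free_model(model);
    llama_backend_free();
    return 0;
}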
src/llama.cpp
@@ -10,6 +10,8 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
+#include "profiler.h"
+
 #ifdef GGML_USE_RPC
 #  include "ggml-rpc.h"
 #endif
@@ -3544,6 +3546,74 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
     GGML_UNUSED(model);
 }
 
+void llama_profile_device(device_info * dev_info, struct llama_model * model, const char * test_file) {
+    dev_info->device_name     = device_name();
+    dev_info->cpu_props.cores = device_cpu_cores();
+
+    dev_info->memory.total_physical     = round(device_physical_memory(false) / (double)(1 << 30) * 100) / 100;
+    dev_info->memory.available_physical = round(device_physical_memory(true)  / (double)(1 << 30) * 100) / 100;
+    dev_info->memory.total_swap         = round(device_swap_memory(false)     / (double)(1 << 30) * 100) / 100;
+    dev_info->memory.available_swap     = round(device_swap_memory(true)      / (double)(1 << 30) * 100) / 100;
+    dev_info->memory.bandwidth          = round(device_memory_bw(500)         / (double)(1 << 30) * 100) / 100;
+
+    dev_info->disk_read_bandwidth = round(device_disk_read_bw(test_file, 500) / (double)(1 << 30) * 100) / 100;
+
+    dev_info->gpu_support.metal   = device_has_metal();
+    dev_info->gpu_support.cuda    = device_has_cuda();
+    dev_info->gpu_support.vulkan  = device_has_vulkan();
+    dev_info->gpu_support.kompute = device_has_kompute();
+    dev_info->gpu_support.gpublas = device_has_gpublas();
+    dev_info->gpu_support.blas    = device_has_blas();
+    dev_info->gpu_support.sycl    = device_has_sycl();
+
+    ggml_backend_dev_props cpu_props;
+    ggml_backend_dev_props gpu_props;
+    device_get_props(model, -1, &cpu_props); // -1 for cpu
+    device_get_props(model,  0, &gpu_props); //  0 for gpu0
+
+    dev_info->cpu_props.name        = cpu_props.name;
+    dev_info->cpu_props.description = cpu_props.description;
+
+    dev_info->gpu_props.name         = gpu_props.name;
+    dev_info->gpu_props.description  = gpu_props.description;
+    dev_info->gpu_props.memory_free  = round(gpu_props.memory_free  / (double)(1 << 30) * 100) / 100;
+    dev_info->gpu_props.memory_total = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100;
+
+    LLAMA_LOG_INFO("\n");
+    LLAMA_LOG_INFO("Device Info:\n");
+    LLAMA_LOG_INFO("  Device Name             : %s\n", dev_info->device_name);
+    LLAMA_LOG_INFO("  CPU Name                : %s\n", dev_info->cpu_props.name);
+    LLAMA_LOG_INFO("  CPU Description         : %s\n", dev_info->cpu_props.description);
+    LLAMA_LOG_INFO("  Number of CPU cores     : %u\n", dev_info->cpu_props.cores);
+    LLAMA_LOG_INFO("  Disk Read Bandwidth     : %.2f GB/s\n", dev_info->disk_read_bandwidth);
+    LLAMA_LOG_INFO("\n");
+
+    LLAMA_LOG_INFO("Memory Information:\n");
+    LLAMA_LOG_INFO("  Physical Mem Total      : %.2f GB\n", dev_info->memory.total_physical);
+    LLAMA_LOG_INFO("  Physical Mem Available  : %.2f GB\n", dev_info->memory.available_physical);
+    LLAMA_LOG_INFO("  Swap Memory Total       : %.2f GB\n", dev_info->memory.total_swap);
+    LLAMA_LOG_INFO("  Swap Memory Available   : %.2f GB\n", dev_info->memory.available_swap);
+    LLAMA_LOG_INFO("  Mem Bandwidth           : %.2f GB/s\n", dev_info->memory.bandwidth);
+    LLAMA_LOG_INFO("\n");
+
+    LLAMA_LOG_INFO("GPU Support:\n");
+    LLAMA_LOG_INFO("  Metal                   : %i\n", dev_info->gpu_support.metal);
+    LLAMA_LOG_INFO("  CUDA                    : %i\n", dev_info->gpu_support.cuda);
+    LLAMA_LOG_INFO("  Vulkan                  : %i\n", dev_info->gpu_support.vulkan);
+    LLAMA_LOG_INFO("  Kompute                 : %i\n", dev_info->gpu_support.kompute);
+    LLAMA_LOG_INFO("  GPU BLAS                : %i\n", dev_info->gpu_support.gpublas);
+    LLAMA_LOG_INFO("  BLAS                    : %i\n", dev_info->gpu_support.blas);
+    LLAMA_LOG_INFO("  SYCL                    : %i\n", dev_info->gpu_support.sycl);
+    LLAMA_LOG_INFO("\n");
+
+    LLAMA_LOG_INFO("GPU Properties:\n");
+    LLAMA_LOG_INFO("  GPU Name                : %s\n", dev_info->gpu_props.name);
+    LLAMA_LOG_INFO("  Description             : %s\n", dev_info->gpu_props.description);
+    LLAMA_LOG_INFO("  Memory Free             : %.2f GB\n", dev_info->gpu_props.memory_free);
+    LLAMA_LOG_INFO("  Memory Total            : %.2f GB\n", dev_info->gpu_props.memory_total);
+}
+
 ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device) {
     return llama_default_buffer_type_offload(*model, device);
 }
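A note on the recurring expression in the body above: round(x / (double)(1 << 30) * 100) / 100 converts a byte count to GiB and keeps two decimal places, matching the %.2f GB log format and the float fields of device_info. A quick worked check (standalone, not part of the commit):

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
    uint64_t bytes = 9126805504ULL;                          // 8.5 GiB reported by the profiler
    double   gb    = round(bytes / (double)(1 << 30) * 100) / 100;
    printf("%.2f GB\n", gb);                                 // prints "8.50 GB"
    return 0;
}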