mirror of
https://github.com/Lizonghang/prima.cpp.git
synced 2025-09-06 11:59:02 +00:00
add cpu_read_ram_bw, metal_read_vram_bw, cuda_read_vram_bw
This commit is contained in:
parent
0a6ffe68e0
commit
68ecabc8c3
5 changed files with 139 additions and 48 deletions
|
@ -1117,6 +1117,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
||||||
cparams.n_world = params.n_world;
|
cparams.n_world = params.n_world;
|
||||||
cparams.rank = params.rank;
|
cparams.rank = params.rank;
|
||||||
cparams.unload = params.unload;
|
cparams.unload = params.unload;
|
||||||
|
cparams.n_gpu_layers = params.n_gpu_layers;
|
||||||
std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);
|
std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);
|
||||||
|
|
||||||
if (cparams.master_ip != nullptr) {
|
if (cparams.master_ip != nullptr) {
|
||||||
|
|
|
@ -43,6 +43,26 @@
|
||||||
#include <thread>
|
#include <thread>
|
||||||
#include <random>
|
#include <random>
|
||||||
#include <regex>
|
#include <regex>
|
||||||
|
#include <fcntl.h>
|
||||||
|
|
||||||
|
static int disable_log() {
|
||||||
|
int stdout_fd = dup(STDOUT_FILENO);
|
||||||
|
int null_fd = open("/dev/null", O_WRONLY);
|
||||||
|
if (null_fd == -1) {
|
||||||
|
LOG_INF("Failed to open /dev/null\n");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
dup2(null_fd, STDOUT_FILENO);
|
||||||
|
close(null_fd);
|
||||||
|
return stdout_fd;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void enable_log(int stdout_fd) {
|
||||||
|
if (stdout_fd != -1) {
|
||||||
|
dup2(stdout_fd, STDOUT_FILENO);
|
||||||
|
close(stdout_fd);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const char * device_name() {
|
const char * device_name() {
|
||||||
static char device_name[256];
|
static char device_name[256];
|
||||||
|
@ -94,7 +114,7 @@ uint32_t device_cpu_cores() {
|
||||||
return core_count;
|
return core_count;
|
||||||
}
|
}
|
||||||
|
|
||||||
static float device_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, profiler_backend_type btype, int n_threads) {
|
static float device_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, enum profiler_backend_type btype, int n_threads) {
|
||||||
const int n_repeat = 1;
|
const int n_repeat = 1;
|
||||||
const int n_embd = llama_n_embd(model);
|
const int n_embd = llama_n_embd(model);
|
||||||
std::vector<float> matrix_A(n_embd * n_embd, 1.0f);
|
std::vector<float> matrix_A(n_embd * n_embd, 1.0f);
|
||||||
|
@ -188,7 +208,9 @@ float device_cpu_flops(struct llama_model * model, enum ggml_type src0t, enum gg
|
||||||
|
|
||||||
float device_metal_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t) {
|
float device_metal_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t) {
|
||||||
#ifdef GGML_USE_METAL
|
#ifdef GGML_USE_METAL
|
||||||
|
int fd = disable_log();
|
||||||
return device_flops(model, src0t, src1t, PROFILER_BACKEND_TYPE_METAL, 4);
|
return device_flops(model, src0t, src1t, PROFILER_BACKEND_TYPE_METAL, 4);
|
||||||
|
enable_log(fd);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
(void)model;
|
(void)model;
|
||||||
|
@ -199,7 +221,10 @@ float device_metal_flops(struct llama_model * model, enum ggml_type src0t, enum
|
||||||
|
|
||||||
float device_cuda_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t) {
|
float device_cuda_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t) {
|
||||||
#ifdef GGML_USE_CUDA
|
#ifdef GGML_USE_CUDA
|
||||||
return device_flops(model, src0t, src1t, PROFILER_BACKEND_TYPE_CUDA, 4);
|
int fd = disable_log();
|
||||||
|
float ret = device_flops(model, src0t, src1t, PROFILER_BACKEND_TYPE_CUDA, 4)
|
||||||
|
enable_log(fd);
|
||||||
|
return ret;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
(void)model;
|
(void)model;
|
||||||
|
@ -712,12 +737,26 @@ float device_memory_bw(int n_thread) {
|
||||||
return static_cast<float>(bandwidth);
|
return static_cast<float>(bandwidth);
|
||||||
}
|
}
|
||||||
|
|
||||||
float device_cuda_memory_bw(struct llama_model * model) {
|
static float device_read_vram_bw(struct llama_model * model, enum profiler_backend_type btype) {
|
||||||
#ifdef GGML_USE_CUDA
|
|
||||||
const int n_embd = llama_n_embd(model) * 2;
|
const int n_embd = llama_n_embd(model) * 2;
|
||||||
std::vector<float> matrix_A(n_embd * n_embd, 1.0f);
|
std::vector<float> matrix_A(n_embd * n_embd, 1.0f);
|
||||||
|
|
||||||
ggml_backend_t backend = ggml_backend_cuda_init(0);
|
ggml_backend_t backend = NULL;
|
||||||
|
switch (btype) {
|
||||||
|
case PROFILER_BACKEND_TYPE_METAL:
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
backend = ggml_backend_metal_init();
|
||||||
|
#endif
|
||||||
|
break;
|
||||||
|
case PROFILER_BACKEND_TYPE_CUDA:
|
||||||
|
#ifdef GGML_USE_CUDA
|
||||||
|
backend = ggml_backend_cuda_init(0);
|
||||||
|
#endif
|
||||||
|
break;
|
||||||
|
case PROFILER_BACKEND_TYPE_CPU:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
if (!backend) {
|
if (!backend) {
|
||||||
LOG_INF("%s: ggml backend init failed\n", __func__);
|
LOG_INF("%s: ggml backend init failed\n", __func__);
|
||||||
return 0.0f;
|
return 0.0f;
|
||||||
|
@ -769,10 +808,28 @@ float device_cuda_memory_bw(struct llama_model * model) {
|
||||||
ggml_backend_free(backend);
|
ggml_backend_free(backend);
|
||||||
|
|
||||||
return bandwidth;
|
return bandwidth;
|
||||||
#else
|
}
|
||||||
|
|
||||||
|
// Measures VRAM read bandwidth through the Metal backend.
//
// When built with GGML_USE_METAL, stdout is muted via disable_log() for the
// duration of the measurement and restored via enable_log() before returning
// the value produced by device_read_vram_bw(). Without GGML_USE_METAL the
// function does no work and returns 0.0f.
float device_metal_read_vram_bw(struct llama_model * model) {
#ifdef GGML_USE_METAL
    const int fd = disable_log();
    // Capture the result first: returning directly from the call would skip
    // enable_log() and leave stdout redirected (and the saved descriptor open).
    const float bandwidth = device_read_vram_bw(model, PROFILER_BACKEND_TYPE_METAL);
    enable_log(fd);
    return bandwidth;
#else
    (void)model;
    return 0.0f;
#endif
}
|
||||||
|
|
||||||
|
// Measures VRAM read bandwidth through the CUDA backend.
//
// When built with GGML_USE_CUDA, stdout is muted via disable_log() for the
// duration of the measurement and restored via enable_log() before returning
// the value produced by device_read_vram_bw(). Without GGML_USE_CUDA the
// function does no work and returns 0.0f.
float device_cuda_read_vram_bw(struct llama_model * model) {
#ifdef GGML_USE_CUDA
    const int fd = disable_log();
    // Capture the result first: returning directly from the call would skip
    // enable_log() and leave stdout redirected (and the saved descriptor open).
    const float bandwidth = device_read_vram_bw(model, PROFILER_BACKEND_TYPE_CUDA);
    enable_log(fd);
    return bandwidth;
#else
    (void)model;
    return 0.0f;
#endif
}
|
||||||
|
|
||||||
int device_has_metal(void) {
|
int device_has_metal(void) {
|
||||||
|
@ -827,6 +884,14 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers) {
|
||||||
total_latency += (double)n_flops.layer_q4k_f32 / (double)gpu.cuda_flops_q4k_f32 / 1e9;
|
total_latency += (double)n_flops.layer_q4k_f32 / (double)gpu.cuda_flops_q4k_f32 / 1e9;
|
||||||
total_latency += (double)n_flops.layer_q6k_f32 / (double)gpu.cuda_flops_q6k_f32 / 1e9;
|
total_latency += (double)n_flops.layer_q6k_f32 / (double)gpu.cuda_flops_q6k_f32 / 1e9;
|
||||||
total_latency += (double)n_flops.layer_q80_f32 / (double)gpu.cuda_flops_q80_f32 / 1e9;
|
total_latency += (double)n_flops.layer_q80_f32 / (double)gpu.cuda_flops_q80_f32 / 1e9;
|
||||||
|
#elif GGML_USE_METAL
|
||||||
|
struct gpu_props gpu = dev_info.gpu_props;
|
||||||
|
|
||||||
|
total_latency += (double)n_flops.layer_f32_f32 / (double)gpu.metal_flops_f32_f32 / 1e9;
|
||||||
|
total_latency += (double)n_flops.layer_f16_f32 / (double)gpu.metal_flops_f16_f32 / 1e9;
|
||||||
|
total_latency += (double)n_flops.layer_q4k_f32 / (double)gpu.metal_flops_q4k_f32 / 1e9;
|
||||||
|
total_latency += (double)n_flops.layer_q6k_f32 / (double)gpu.metal_flops_q6k_f32 / 1e9;
|
||||||
|
total_latency += (double)n_flops.layer_q80_f32 / (double)gpu.metal_flops_q80_f32 / 1e9;
|
||||||
#else
|
#else
|
||||||
total_latency += (double)n_flops.layer_f32_f32 / (double)cpu.flops_f32_f32 / 1e9;
|
total_latency += (double)n_flops.layer_f32_f32 / (double)cpu.flops_f32_f32 / 1e9;
|
||||||
total_latency += (double)n_flops.layer_f16_f32 / (double)cpu.flops_f16_f32 / 1e9;
|
total_latency += (double)n_flops.layer_f16_f32 / (double)cpu.flops_f16_f32 / 1e9;
|
||||||
|
@ -870,15 +935,18 @@ static float device_memory_access_delay(struct device_info & dev_info, int n_lay
|
||||||
n_params.output_q80;
|
n_params.output_q80;
|
||||||
|
|
||||||
#ifdef GGML_USE_CUDA
|
#ifdef GGML_USE_CUDA
|
||||||
return (double)total_bytes / 1e6 / dev_info.gpu_props.read_bandwidth; // ms
|
return (double)total_bytes / 1e6 / dev_info.gpu_props.cuda_read_vram_bw; // ms
|
||||||
|
#elif GGML_USE_METAL
|
||||||
|
return (double)total_bytes / 1e6 / dev_info.gpu_props.metal_read_vram_bw; // ms
|
||||||
#else
|
#else
|
||||||
return (double)total_bytes / 1e6 / dev_info.memory.read_bandwidth; // ms
|
return (double)total_bytes / 1e6 / dev_info.memory.cpu_read_ram_bw; // ms
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static float device_disk_access_delay(struct device_info & dev_info, struct llama_model * model, const struct llama_context_params cparams) {
|
static float device_disk_access_delay(struct device_info & dev_info, struct llama_model * model, const struct llama_context_params cparams) {
|
||||||
auto n_params = dev_info.model_params;
|
auto n_params = dev_info.model_params;
|
||||||
int n_layers = llama_model_n_layers(model);
|
int n_layers = llama_model_n_layers(model);
|
||||||
|
int n_gpu_layers = cparams.n_gpu_layers;
|
||||||
double kv_size_gb = static_cast<double>(llama_model_kvcache_size(model, cparams)) / 1e9; // convert to GB
|
double kv_size_gb = static_cast<double>(llama_model_kvcache_size(model, cparams)) / 1e9; // convert to GB
|
||||||
double compute_buf_gb = static_cast<double>(llama_model_compute_buf_size(model, cparams, false)) / 1e9; // convert to GB
|
double compute_buf_gb = static_cast<double>(llama_model_compute_buf_size(model, cparams, false)) / 1e9; // convert to GB
|
||||||
|
|
||||||
|
@ -1005,7 +1073,7 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
|
||||||
|
|
||||||
LOG_INF("| Mem Read Bandwidth (GB/s) ");
|
LOG_INF("| Mem Read Bandwidth (GB/s) ");
|
||||||
for (int i = 0; i < n; ++i) {
|
for (int i = 0; i < n; ++i) {
|
||||||
LOG_INF("| %-10.2f ", dev_info_set[i].memory.read_bandwidth);
|
LOG_INF("| %-10.2f ", dev_info_set[i].memory.cpu_read_ram_bw);
|
||||||
}
|
}
|
||||||
LOG_INF("\n");
|
LOG_INF("\n");
|
||||||
|
|
||||||
|
@ -1099,9 +1167,9 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
|
||||||
}
|
}
|
||||||
LOG_INF("\n");
|
LOG_INF("\n");
|
||||||
|
|
||||||
LOG_INF("| VRAM Read Bandwidth (GB/s) ");
|
LOG_INF("| Metal VRAM Read BW (GB/s) ");
|
||||||
for (int i = 0; i < n; ++i) {
|
for (int i = 0; i < n; ++i) {
|
||||||
LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.read_bandwidth);
|
LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.metal_read_vram_bw);
|
||||||
}
|
}
|
||||||
LOG_INF("\n");
|
LOG_INF("\n");
|
||||||
|
|
||||||
|
@ -1135,31 +1203,37 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
|
||||||
}
|
}
|
||||||
LOG_INF("\n");
|
LOG_INF("\n");
|
||||||
|
|
||||||
LOG_INF("| CUDA flops (F32xF32, GFLOPS)");
|
LOG_INF("| CUDA VRAM Read BW (GB/s) ");
|
||||||
|
for (int i = 0; i < n; ++i) {
|
||||||
|
LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.cuda_read_vram_bw);
|
||||||
|
}
|
||||||
|
LOG_INF("\n");
|
||||||
|
|
||||||
|
LOG_INF("| CUDA flops (F32xF32, GFLOPS) ");
|
||||||
for (int i = 0; i < n; ++i) {
|
for (int i = 0; i < n; ++i) {
|
||||||
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f32_f32);
|
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f32_f32);
|
||||||
}
|
}
|
||||||
LOG_INF("\n");
|
LOG_INF("\n");
|
||||||
|
|
||||||
LOG_INF("| CUDA flops (F16xF32, GFLOPS)");
|
LOG_INF("| CUDA flops (F16xF32, GFLOPS) ");
|
||||||
for (int i = 0; i < n; ++i) {
|
for (int i = 0; i < n; ++i) {
|
||||||
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f16_f32);
|
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f16_f32);
|
||||||
}
|
}
|
||||||
LOG_INF("\n");
|
LOG_INF("\n");
|
||||||
|
|
||||||
LOG_INF("| CUDA flops (Q4KxF32, GFLOPS)");
|
LOG_INF("| CUDA flops (Q4KxF32, GFLOPS) ");
|
||||||
for (int i = 0; i < n; ++i) {
|
for (int i = 0; i < n; ++i) {
|
||||||
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q4k_f32);
|
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q4k_f32);
|
||||||
}
|
}
|
||||||
LOG_INF("\n");
|
LOG_INF("\n");
|
||||||
|
|
||||||
LOG_INF("| CUDA flops (Q6KxF32, GFLOPS)");
|
LOG_INF("| CUDA flops (Q6KxF32, GFLOPS) ");
|
||||||
for (int i = 0; i < n; ++i) {
|
for (int i = 0; i < n; ++i) {
|
||||||
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q6k_f32);
|
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q6k_f32);
|
||||||
}
|
}
|
||||||
LOG_INF("\n");
|
LOG_INF("\n");
|
||||||
|
|
||||||
LOG_INF("| CUDA flops (Q80xF32, GFLOPS)");
|
LOG_INF("| CUDA flops (Q80xF32, GFLOPS) ");
|
||||||
for (int i = 0; i < n; ++i) {
|
for (int i = 0; i < n; ++i) {
|
||||||
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q80_f32);
|
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q80_f32);
|
||||||
}
|
}
|
||||||
|
@ -1269,7 +1343,9 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
|
||||||
float latency = 0.0f;
|
float latency = 0.0f;
|
||||||
int n_layers = llama_model_n_layers(model);
|
int n_layers = llama_model_n_layers(model);
|
||||||
latency += device_compute_delay(dev_info_set[0], n_layers);
|
latency += device_compute_delay(dev_info_set[0], n_layers);
|
||||||
|
LOG_INF("latency: %.2f\n", latency);
|
||||||
latency += device_memory_access_delay(dev_info_set[0], n_layers);
|
latency += device_memory_access_delay(dev_info_set[0], n_layers);
|
||||||
|
LOG_INF("latency: %.2f\n", latency);
|
||||||
latency += device_disk_access_delay(dev_info_set[0], model, cparams); // if physical memory is not enough, some tensor weights will be released from memory and reloaded by mmap later
|
latency += device_disk_access_delay(dev_info_set[0], model, cparams); // if physical memory is not enough, some tensor weights will be released from memory and reloaded by mmap later
|
||||||
|
|
||||||
LOG_INF("| Token latency (ms) ");
|
LOG_INF("| Token latency (ms) ");
|
||||||
|
@ -1300,7 +1376,7 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
|
||||||
+ sizeof(float) * 5 // cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q6k_f32, cpu_props.flops_q80_f32
|
+ sizeof(float) * 5 // cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q6k_f32, cpu_props.flops_q80_f32
|
||||||
+ sizeof(struct memory_info)
|
+ sizeof(struct memory_info)
|
||||||
+ sizeof(struct gpu_support)
|
+ sizeof(struct gpu_support)
|
||||||
+ sizeof(float) * 13; // gpu_props.memory_free, gpu_props.memory_total, gpu_props.read_bandwidth,
|
+ sizeof(float) * 14; // gpu_props.memory_free, gpu_props.memory_total, gpu_props.metal_read_vram_bw, gpu_props.cuda_read_vram_bw,
|
||||||
// gpu_props.metal_flops_f32_f32, gpu_props.metal_flops_f16_f32, gpu_props.metal_flops_q4k_f32, gpu_props.metal_flops_q6k_f32, gpu_props.metal_flops_q80_f32,
|
// gpu_props.metal_flops_f32_f32, gpu_props.metal_flops_f16_f32, gpu_props.metal_flops_q4k_f32, gpu_props.metal_flops_q6k_f32, gpu_props.metal_flops_q80_f32,
|
||||||
// gpu_props.cuda_flops_f32_f32, gpu_props.cuda_flops_f16_f32, gpu_props.cuda_flops_q4k_f32, gpu_props.cuda_flops_q6k_f32, gpu_props.cuda_flops_q80_f32
|
// gpu_props.cuda_flops_f32_f32, gpu_props.cuda_flops_f16_f32, gpu_props.cuda_flops_q4k_f32, gpu_props.cuda_flops_q6k_f32, gpu_props.cuda_flops_q80_f32
|
||||||
|
|
||||||
|
@ -1371,7 +1447,7 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
|
||||||
memcpy(ptr, &dev_info->gpu_props.memory_total, sizeof(float));
|
memcpy(ptr, &dev_info->gpu_props.memory_total, sizeof(float));
|
||||||
ptr += sizeof(float);
|
ptr += sizeof(float);
|
||||||
|
|
||||||
memcpy(ptr, &dev_info->gpu_props.read_bandwidth, sizeof(float));
|
memcpy(ptr, &dev_info->gpu_props.metal_read_vram_bw, sizeof(float));
|
||||||
ptr += sizeof(float);
|
ptr += sizeof(float);
|
||||||
|
|
||||||
memcpy(ptr, &dev_info->gpu_props.metal_flops_f32_f32, sizeof(float));
|
memcpy(ptr, &dev_info->gpu_props.metal_flops_f32_f32, sizeof(float));
|
||||||
|
@ -1389,6 +1465,9 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
|
||||||
memcpy(ptr, &dev_info->gpu_props.metal_flops_q80_f32, sizeof(float));
|
memcpy(ptr, &dev_info->gpu_props.metal_flops_q80_f32, sizeof(float));
|
||||||
ptr += sizeof(float);
|
ptr += sizeof(float);
|
||||||
|
|
||||||
|
memcpy(ptr, &dev_info->gpu_props.cuda_read_vram_bw, sizeof(float));
|
||||||
|
ptr += sizeof(float);
|
||||||
|
|
||||||
memcpy(ptr, &dev_info->gpu_props.cuda_flops_f32_f32, sizeof(float));
|
memcpy(ptr, &dev_info->gpu_props.cuda_flops_f32_f32, sizeof(float));
|
||||||
ptr += sizeof(float);
|
ptr += sizeof(float);
|
||||||
|
|
||||||
|
@ -1488,7 +1567,7 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
|
||||||
memcpy(&dev_info->gpu_props.memory_total, ptr, sizeof(float));
|
memcpy(&dev_info->gpu_props.memory_total, ptr, sizeof(float));
|
||||||
ptr += sizeof(float);
|
ptr += sizeof(float);
|
||||||
|
|
||||||
memcpy(&dev_info->gpu_props.read_bandwidth, ptr, sizeof(float));
|
memcpy(&dev_info->gpu_props.metal_read_vram_bw, ptr, sizeof(float));
|
||||||
ptr += sizeof(float);
|
ptr += sizeof(float);
|
||||||
|
|
||||||
memcpy(&dev_info->gpu_props.metal_flops_f32_f32, ptr, sizeof(float));
|
memcpy(&dev_info->gpu_props.metal_flops_f32_f32, ptr, sizeof(float));
|
||||||
|
@ -1506,6 +1585,9 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
|
||||||
memcpy(&dev_info->gpu_props.metal_flops_q80_f32, ptr, sizeof(float));
|
memcpy(&dev_info->gpu_props.metal_flops_q80_f32, ptr, sizeof(float));
|
||||||
ptr += sizeof(float);
|
ptr += sizeof(float);
|
||||||
|
|
||||||
|
memcpy(&dev_info->gpu_props.cuda_read_vram_bw, ptr, sizeof(float));
|
||||||
|
ptr += sizeof(float);
|
||||||
|
|
||||||
memcpy(&dev_info->gpu_props.cuda_flops_f32_f32, ptr, sizeof(float));
|
memcpy(&dev_info->gpu_props.cuda_flops_f32_f32, ptr, sizeof(float));
|
||||||
ptr += sizeof(float);
|
ptr += sizeof(float);
|
||||||
|
|
||||||
|
|
|
@ -35,14 +35,14 @@ struct memory_info {
|
||||||
float available_physical; // in GiB
|
float available_physical; // in GiB
|
||||||
float total_swap; // in GiB
|
float total_swap; // in GiB
|
||||||
float available_swap; // in GiB
|
float available_swap; // in GiB
|
||||||
float read_bandwidth; // in GB/s
|
float cpu_read_ram_bw; // in GB/s
|
||||||
|
|
||||||
memory_info() :
|
memory_info() :
|
||||||
total_physical (0.0f),
|
total_physical (0.0f),
|
||||||
available_physical(0.0f),
|
available_physical(0.0f),
|
||||||
total_swap (0.0f),
|
total_swap (0.0f),
|
||||||
available_swap (0.0f),
|
available_swap (0.0f),
|
||||||
read_bandwidth (0.0f) {}
|
cpu_read_ram_bw (0.0f) {}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct gpu_support {
|
struct gpu_support {
|
||||||
|
@ -69,12 +69,13 @@ struct gpu_props {
|
||||||
const char * description;
|
const char * description;
|
||||||
float memory_free; // in GiB
|
float memory_free; // in GiB
|
||||||
float memory_total; // in GiB
|
float memory_total; // in GiB
|
||||||
float read_bandwidth; // in GB/s
|
float metal_read_vram_bw; // in GB/s
|
||||||
float metal_flops_f32_f32; // in GFLOPS
|
float metal_flops_f32_f32; // in GFLOPS
|
||||||
float metal_flops_f16_f32; // in GFLOPS
|
float metal_flops_f16_f32; // in GFLOPS
|
||||||
float metal_flops_q4k_f32; // in GFLOPS
|
float metal_flops_q4k_f32; // in GFLOPS
|
||||||
float metal_flops_q6k_f32; // in GFLOPS
|
float metal_flops_q6k_f32; // in GFLOPS
|
||||||
float metal_flops_q80_f32; // in GFLOPS
|
float metal_flops_q80_f32; // in GFLOPS
|
||||||
|
float cuda_read_vram_bw; // in GB/s
|
||||||
float cuda_flops_f32_f32; // in GFLOPS
|
float cuda_flops_f32_f32; // in GFLOPS
|
||||||
float cuda_flops_f16_f32; // in GFLOPS
|
float cuda_flops_f16_f32; // in GFLOPS
|
||||||
float cuda_flops_q4k_f32; // in GFLOPS
|
float cuda_flops_q4k_f32; // in GFLOPS
|
||||||
|
@ -86,12 +87,13 @@ struct gpu_props {
|
||||||
description(""),
|
description(""),
|
||||||
memory_free (0.0f),
|
memory_free (0.0f),
|
||||||
memory_total (0.0f),
|
memory_total (0.0f),
|
||||||
read_bandwidth (0.0f),
|
metal_read_vram_bw (0.0f),
|
||||||
metal_flops_f32_f32(0.0f),
|
metal_flops_f32_f32(0.0f),
|
||||||
metal_flops_f16_f32(0.0f),
|
metal_flops_f16_f32(0.0f),
|
||||||
metal_flops_q4k_f32(0.0f),
|
metal_flops_q4k_f32(0.0f),
|
||||||
metal_flops_q6k_f32(0.0f),
|
metal_flops_q6k_f32(0.0f),
|
||||||
metal_flops_q80_f32(0.0f),
|
metal_flops_q80_f32(0.0f),
|
||||||
|
cuda_read_vram_bw (0.0f),
|
||||||
cuda_flops_f32_f32 (0.0f),
|
cuda_flops_f32_f32 (0.0f),
|
||||||
cuda_flops_f16_f32 (0.0f),
|
cuda_flops_f16_f32 (0.0f),
|
||||||
cuda_flops_q4k_f32 (0.0f),
|
cuda_flops_q4k_f32 (0.0f),
|
||||||
|
@ -221,7 +223,8 @@ uint64_t device_swap_memory (bool available);
|
||||||
void device_disk_seq_bw (float * read_seq_bw, float * write_seq_bw, int n_threads);
|
void device_disk_seq_bw (float * read_seq_bw, float * write_seq_bw, int n_threads);
|
||||||
void device_disk_rnd_bw (float * read_rnd_bw, float * write_rnd_bw, int n_threads);
|
void device_disk_rnd_bw (float * read_rnd_bw, float * write_rnd_bw, int n_threads);
|
||||||
float device_memory_bw (int n_thread);
|
float device_memory_bw (int n_thread);
|
||||||
float device_cuda_memory_bw (struct llama_model * model);
|
float device_metal_read_vram_bw(struct llama_model * model);
|
||||||
|
float device_cuda_read_vram_bw (struct llama_model * model);
|
||||||
void device_get_props (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
|
void device_get_props (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
|
||||||
void device_print_props (struct device_info * dev_info_set, int n, struct llama_model * model, const struct llama_context_params cparams);
|
void device_print_props (struct device_info * dev_info_set, int n, struct llama_model * model, const struct llama_context_params cparams);
|
||||||
|
|
||||||
|
|
|
@ -320,6 +320,7 @@ extern "C" {
|
||||||
uint32_t n_world; // world size
|
uint32_t n_world; // world size
|
||||||
uint32_t rank; // my rank
|
uint32_t rank; // my rank
|
||||||
uint32_t n_layer_window[32];// number of layers to process in each compute
|
uint32_t n_layer_window[32];// number of layers to process in each compute
|
||||||
|
uint32_t n_gpu_layers; // number of layers to process on GPU
|
||||||
bool unload; // whether to unload layer weights after use
|
bool unload; // whether to unload layer weights after use
|
||||||
char * master_ip; // ip address of the master node
|
char * master_ip; // ip address of the master node
|
||||||
char * next_node_ip; // ip address of the next node
|
char * next_node_ip; // ip address of the next node
|
||||||
|
|
|
@ -3555,17 +3555,17 @@ void llama_perf_context_sync(struct llama_context * ctx, const struct llama_mode
|
||||||
void llama_profile_device(device_info * dev_info, struct llama_model * model, llama_model_loader * ml, int n_threads) {
|
void llama_profile_device(device_info * dev_info, struct llama_model * model, llama_model_loader * ml, int n_threads) {
|
||||||
dev_info->device_name = device_name();
|
dev_info->device_name = device_name();
|
||||||
dev_info->cpu_props.cores = device_cpu_cores();
|
dev_info->cpu_props.cores = device_cpu_cores();
|
||||||
dev_info->cpu_props.flops_f32_f32 = device_cpu_flops(model, GGML_TYPE_F32, GGML_TYPE_F32, n_threads);
|
// dev_info->cpu_props.flops_f32_f32 = device_cpu_flops(model, GGML_TYPE_F32, GGML_TYPE_F32, n_threads);
|
||||||
dev_info->cpu_props.flops_f16_f32 = device_cpu_flops(model, GGML_TYPE_F16, GGML_TYPE_F32, n_threads);
|
// dev_info->cpu_props.flops_f16_f32 = device_cpu_flops(model, GGML_TYPE_F16, GGML_TYPE_F32, n_threads);
|
||||||
dev_info->cpu_props.flops_q4k_f32 = device_cpu_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads);
|
// dev_info->cpu_props.flops_q4k_f32 = device_cpu_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads);
|
||||||
dev_info->cpu_props.flops_q6k_f32 = device_cpu_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32, n_threads);
|
// dev_info->cpu_props.flops_q6k_f32 = device_cpu_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32, n_threads);
|
||||||
dev_info->cpu_props.flops_q80_f32 = device_cpu_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32, n_threads);
|
// dev_info->cpu_props.flops_q80_f32 = device_cpu_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32, n_threads);
|
||||||
|
|
||||||
dev_info->memory.total_physical = round(device_physical_memory(false) / (double)(1 << 30) * 100) / 100;
|
dev_info->memory.total_physical = round(device_physical_memory(false) / (double)(1 << 30) * 100) / 100;
|
||||||
dev_info->memory.available_physical = round(device_physical_memory(true) / (double)(1 << 30) * 100) / 100;
|
dev_info->memory.available_physical = round(device_physical_memory(true) / (double)(1 << 30) * 100) / 100;
|
||||||
dev_info->memory.total_swap = round(device_swap_memory(false) / (double)(1 << 30) * 100) / 100;
|
dev_info->memory.total_swap = round(device_swap_memory(false) / (double)(1 << 30) * 100) / 100;
|
||||||
dev_info->memory.available_swap = round(device_swap_memory(true) / (double)(1 << 30) * 100) / 100;
|
dev_info->memory.available_swap = round(device_swap_memory(true) / (double)(1 << 30) * 100) / 100;
|
||||||
dev_info->memory.read_bandwidth = device_memory_bw(n_threads);
|
dev_info->memory.cpu_read_ram_bw = device_memory_bw(n_threads);
|
||||||
|
|
||||||
device_disk_seq_bw(&dev_info->disk.read_seq_bw, &dev_info->disk.write_seq_bw, n_threads);
|
device_disk_seq_bw(&dev_info->disk.read_seq_bw, &dev_info->disk.write_seq_bw, n_threads);
|
||||||
device_disk_rnd_bw(&dev_info->disk.read_rnd_bw, &dev_info->disk.write_rnd_bw, n_threads);
|
device_disk_rnd_bw(&dev_info->disk.read_rnd_bw, &dev_info->disk.write_rnd_bw, n_threads);
|
||||||
|
@ -3590,12 +3590,13 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll
|
||||||
dev_info->gpu_props.description = gpu_props.description;
|
dev_info->gpu_props.description = gpu_props.description;
|
||||||
dev_info->gpu_props.memory_free = round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100;
|
dev_info->gpu_props.memory_free = round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100;
|
||||||
dev_info->gpu_props.memory_total = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100;
|
dev_info->gpu_props.memory_total = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100;
|
||||||
dev_info->gpu_props.read_bandwidth = device_cuda_memory_bw(model);
|
dev_info->gpu_props.metal_read_vram_bw = device_metal_read_vram_bw(model);
|
||||||
dev_info->gpu_props.metal_flops_f32_f32 = device_metal_flops(model, GGML_TYPE_F32, GGML_TYPE_F32);
|
dev_info->gpu_props.metal_flops_f32_f32 = device_metal_flops(model, GGML_TYPE_F32, GGML_TYPE_F32);
|
||||||
dev_info->gpu_props.metal_flops_f16_f32 = device_metal_flops(model, GGML_TYPE_F16, GGML_TYPE_F32);
|
dev_info->gpu_props.metal_flops_f16_f32 = device_metal_flops(model, GGML_TYPE_F16, GGML_TYPE_F32);
|
||||||
dev_info->gpu_props.metal_flops_q4k_f32 = device_metal_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
|
dev_info->gpu_props.metal_flops_q4k_f32 = device_metal_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
|
||||||
dev_info->gpu_props.metal_flops_q6k_f32 = device_metal_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32);
|
dev_info->gpu_props.metal_flops_q6k_f32 = device_metal_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32);
|
||||||
dev_info->gpu_props.metal_flops_q80_f32 = device_metal_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32);
|
dev_info->gpu_props.metal_flops_q80_f32 = device_metal_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32);
|
||||||
|
dev_info->gpu_props.cuda_read_vram_bw = device_cuda_read_vram_bw(model);
|
||||||
dev_info->gpu_props.cuda_flops_f32_f32 = device_cuda_flops (model, GGML_TYPE_F32, GGML_TYPE_F32);
|
dev_info->gpu_props.cuda_flops_f32_f32 = device_cuda_flops (model, GGML_TYPE_F32, GGML_TYPE_F32);
|
||||||
dev_info->gpu_props.cuda_flops_f16_f32 = device_cuda_flops (model, GGML_TYPE_F16, GGML_TYPE_F32);
|
dev_info->gpu_props.cuda_flops_f16_f32 = device_cuda_flops (model, GGML_TYPE_F16, GGML_TYPE_F32);
|
||||||
dev_info->gpu_props.cuda_flops_q4k_f32 = device_cuda_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
|
dev_info->gpu_props.cuda_flops_q4k_f32 = device_cuda_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
|
||||||
|
@ -19623,6 +19624,7 @@ struct llama_context_params llama_context_default_params() {
|
||||||
/*.n_world =*/ 1,
|
/*.n_world =*/ 1,
|
||||||
/*.rank =*/ 0,
|
/*.rank =*/ 0,
|
||||||
/*.n_layer_window =*/ {32},
|
/*.n_layer_window =*/ {32},
|
||||||
|
/*.n_gpu_layers =*/ 0,
|
||||||
/*.unload =*/ false,
|
/*.unload =*/ false,
|
||||||
/*.master_ip =*/ nullptr,
|
/*.master_ip =*/ nullptr,
|
||||||
/*.next_node_ip =*/ nullptr,
|
/*.next_node_ip =*/ nullptr,
|
||||||
|
@ -20829,17 +20831,19 @@ uint64_t llama_model_compute_buf_size(const struct llama_model * model, const st
|
||||||
const uint64_t n_output = hparams.n_vocab * cparams.n_ubatch;
|
const uint64_t n_output = hparams.n_vocab * cparams.n_ubatch;
|
||||||
|
|
||||||
// compute buffer size for input, each layer, and output
|
// compute buffer size for input, each layer, and output
|
||||||
// const uint64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32); // do not consider memory compression
|
const uint64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32); // do not consider memory compression
|
||||||
const uint64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32) / 2; // consider compressed memory with ratio 2:1
|
|
||||||
const uint64_t n_buf_act = (n_bak_embd + n_inp_pos + n_kq_mask +
|
const uint64_t n_buf_act = (n_bak_embd + n_inp_pos + n_kq_mask +
|
||||||
n_inp_out_ids + n_norm + n_qcur + n_kq
|
n_inp_out_ids + n_norm + n_qcur + n_kq
|
||||||
) * ggml_type_size(GGML_TYPE_F32);
|
) * ggml_type_size(GGML_TYPE_F32);
|
||||||
// const uint64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32); // do not consider memory compression
|
const uint64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32); // do not consider memory compression
|
||||||
const uint64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32) / 2; // consider compressed memory with ratio 2:1
|
|
||||||
|
|
||||||
uint64_t n_buf_total = 0;
|
uint64_t n_buf_total = 0;
|
||||||
if (cparams.rank == 0) {
|
if (cparams.rank == 0) {
|
||||||
|
if (compress_memory) {
|
||||||
|
n_buf_total = n_buf_inp / 2 + n_buf_act + n_buf_out / 2; // consider compressed memory with ratio 2:1
|
||||||
|
} else {
|
||||||
n_buf_total = n_buf_inp + n_buf_act + n_buf_out;
|
n_buf_total = n_buf_inp + n_buf_act + n_buf_out;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
n_buf_total = n_buf_act;
|
n_buf_total = n_buf_act;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Reference in a new issue