Mirror of https://github.com/Lizonghang/prima.cpp.git (synced 2025-09-06 19:09:04 +00:00)

Commit f7507ec20b (parent 9cd22177d0): fix disk r/w test, add disk access latency, and correct units (GB, GiB)

3 changed files with 206 additions and 78 deletions
@@ -34,6 +34,8 @@
 #include <vector>
 #include <inttypes.h>
 #include <thread>
+#include <random>
+#include <regex>
 
 const char * device_name() {
     static char device_name[256];
@@ -439,49 +441,104 @@ uint64_t device_swap_memory(bool available) {
     return swap_memory;
 }
 
-uint64_t device_disk_read_bw(const char * test_file, size_t buffer_size_mb) {
-    uint64_t speed = 0;
-    size_t buffer_size = buffer_size_mb * 1024 * 1024; // buffer size in bytes
+static void external_fio_impl(float * read_bw, float * write_bw, bool op_rand) {
+    const char * test_file = "fio_test";
+    const char * fio_conf_template = R"(
+[global]
+ioengine=posixaio
+direct=1
+time_based=1
+runtime=2
+size=500M
+group_reporting=1
 
-    try {
-        // open a file for reading
-        std::ifstream file(test_file, std::ios::binary | std::ios::in);
-        if (!file) {
-            LOG_ERR("Unable to open the file at path: %s\n", test_file);
-            return speed;
-        }
+[read-job]
+rw=%s
+bs=%s
+filename=%s
 
-        // prepare buffer for reading
-        std::vector<char> buffer(buffer_size);
+[write-job]
+rw=%s
+bs=%s
+filename=%s
+)";
 
-        auto start_time = std::chrono::high_resolution_clock::now();
+    const char * read_type = op_rand ? "randread" : "read";
+    const char * write_type = op_rand ? "randwrite" : "write";
+    const char * block_size = op_rand ? "4k" : "1M";
 
-        // read file into buffer
-        file.read(buffer.data(), buffer.size());
-        if (!file) {
-            LOG_ERR("Failed to read enough data from the test file\n");
-            return speed;
-        }
+    // write config to a file
+    char fio_conf[1024];
+    snprintf(fio_conf, sizeof(fio_conf), fio_conf_template,
+             read_type, block_size, test_file,
+             write_type, block_size, test_file);
+    const char * conf_file = "config.fio";
+    std::ofstream conf(conf_file);
+    if (!conf) {
+        LOG_INF("Error: Unable to create configuration file\n");
+        return;
+    }
+    conf << fio_conf;
+    conf.close();
 
-        auto end_time = std::chrono::high_resolution_clock::now();
-        std::chrono::duration<double> elapsed_time = end_time - start_time;
-
-        // speed in bytes per second
-        if (elapsed_time.count() > 0) {
-            speed = static_cast<uint64_t>(buffer.size() / elapsed_time.count());
-        }
-
-        buffer.clear();
-        buffer.shrink_to_fit();
-    } catch (const std::exception &e) {
-        LOG_ERR("Exception while calculating disk read speed: %s\n", e.what());
+    // run fio and redirect output to a file
+    const char * output_file = "fio_output.log";
+    std::string command = "fio " + std::string(conf_file) + " > " + std::string(output_file);
+    if (std::system(command.c_str()) != 0) {
+        LOG_INF("Error: Failed to run fio\n");
+        return;
     }
 
-    return speed;
+    // parse fio output
+    std::ifstream result(output_file);
+    if (!result) {
+        LOG_INF("Error: Failed to open fio output file\n");
+        return;
+    }
+    *read_bw = 0.0f;
+    *write_bw = 0.0f;
+
+    std::string line;
+    std::regex read_regex(R"(READ: bw=([0-9.]+)([a-zA-Z/]+))");
+    std::regex write_regex(R"(WRITE: bw=([0-9.]+)([a-zA-Z/]+))");
+    std::smatch match;
+
+    while (std::getline(result, line)) {
+        if (std::regex_search(line, match, read_regex)) {
+            float value = std::stof(match[1]);
+            std::string unit = match[2];
+            if (unit == "MiB/s") {
+                *read_bw = value * 1024.0f * 1024.0f / 1e9; // convert MiB/s to GB/s
+            } else if (unit == "MB/s") {
+                *read_bw = value / 1000.0f; // convert MB/s to GB/s
+            }
+        } else if (std::regex_search(line, match, write_regex)) {
+            float value = std::stof(match[1]);
+            std::string unit = match[2];
+            if (unit == "MiB/s") {
+                *write_bw = value * 1024.0f * 1024.0f / 1e9; // convert MiB/s to GB/s
+            } else if (unit == "MB/s") {
+                *write_bw = value / 1000.0f; // convert MB/s to GB/s
+            }
+        }
+    }
+
+    // clean up temporary files
+    std::remove(test_file);
+    std::remove(conf_file);
+    std::remove(output_file);
 }
 
+void device_disk_rnd_bw(float * read_rnd_bw, float * write_rnd_bw) {
+    external_fio_impl(read_rnd_bw, write_rnd_bw, true);
+}
+
+void device_disk_seq_bw(float * read_seq_bw, float * write_seq_bw) {
+    external_fio_impl(read_seq_bw, write_seq_bw, false);
+}
+
 float device_memory_bw(int n_thread) {
-    size_t buffer_size = 5L * 1024 * 1024; // 5m
+    size_t buffer_size = 5L * 1024 * 1024; // 5 MiB
     std::vector<std::thread> thread_pool;
     std::vector<double> results(n_thread);
    std::vector<char *> buffers(n_thread);
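Note on the parser above: fio prints one aggregate summary line per job group, and the regex captures the numeric value plus its unit; both MiB/s (2^20 bytes/s) and MB/s (10^6 bytes/s) are normalized to decimal GB/s, so 2400 MiB/s is stored as roughly 2.52 GB/s while 2400 MB/s is stored as 2.40 GB/s. A minimal standalone sketch, not part of the commit, that runs the same regex and conversion on a made-up summary line:

    // sketch only: exercises the same regex and MiB/s / MB/s -> GB/s conversion as above
    #include <cstdio>
    #include <regex>
    #include <string>

    int main() {
        // illustrative line in the shape of fio's group summary (values are made up)
        std::string line = "   READ: bw=2400MiB/s (2517MB/s), ...";

        std::regex read_regex(R"(READ: bw=([0-9.]+)([a-zA-Z/]+))");
        std::smatch match;
        if (std::regex_search(line, match, read_regex)) {
            float value      = std::stof(match[1]);  // 2400
            std::string unit = match[2];             // "MiB/s" (first match on the line wins)
            float gbps = 0.0f;
            if (unit == "MiB/s") {
                gbps = value * 1024.0f * 1024.0f / 1e9f;  // -> ~2.52 GB/s
            } else if (unit == "MB/s") {
                gbps = value / 1000.0f;                   // -> 2.40 GB/s
            }
            std::printf("%.2f GB/s\n", gbps);
        }
        return 0;
    }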
@@ -579,6 +636,7 @@ float device_cuda_memory_bw(struct llama_model * model) {
 
     return bandwidth;
 #else
+    (void)model;
     return 0.0f;
 #endif
 }
@@ -662,28 +720,58 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers) {
 static float device_memory_access_delay(struct device_info & dev_info, int n_layers) {
     struct model_params n_params = dev_info.model_params;
 
-    int64_t total_params = 0;
-    total_params += n_params.layer_f32 * 4 +
-                    n_params.layer_f16 * 2 +
-                    n_params.layer_q4k / 2 +
-                    n_params.layer_q6k * 3 / 8 +
-                    n_params.layer_q80;
+    int64_t total_bytes = 0;
+    total_bytes += n_params.layer_f32 * 4 +
+                   n_params.layer_f16 * 2 +
+                   n_params.layer_q4k / 2 +
+                   n_params.layer_q6k * 3 / 8 +
+                   n_params.layer_q80;
 
-    total_params *= n_layers;
+    total_bytes *= n_layers;
 
-    total_params += n_params.output_f32 * 4 +
-                    n_params.output_f16 * 2 +
-                    n_params.output_q4k / 2 +
-                    n_params.output_q6k * 3 / 8 +
-                    n_params.output_q80;
+    total_bytes += n_params.output_f32 * 4 +
+                   n_params.output_f16 * 2 +
+                   n_params.output_q4k / 2 +
+                   n_params.output_q6k * 3 / 8 +
+                   n_params.output_q80;
 
 #ifdef GGML_USE_CUDA
-    return (double)total_params / 1e6 / dev_info.gpu_props.read_bandwidth;
+    return (double)total_bytes / 1e6 / dev_info.gpu_props.read_bandwidth; // ms
 #else
-    return (double)total_params / 1e6 / dev_info.memory.read_bandwidth;
+    return (double)total_bytes / 1e6 / dev_info.memory.read_bandwidth; // ms
 #endif
 }
 
+static float device_disk_access_delay(struct device_info & dev_info, int n_layers) {
+    struct model_params n_params = dev_info.model_params;
+
+    int64_t total_bytes = 0;
+    total_bytes += n_params.layer_f32 * 4 +
+                   n_params.layer_f16 * 2 +
+                   n_params.layer_q4k / 2 +
+                   n_params.layer_q6k * 3 / 8 +
+                   n_params.layer_q80;
+
+    total_bytes *= n_layers;
+
+    total_bytes += n_params.output_f32 * 4 +
+                   n_params.output_f16 * 2 +
+                   n_params.output_q4k / 2 +
+                   n_params.output_q6k * 3 / 8 +
+                   n_params.output_q80;
+
+    float total_gbytes = (double)total_bytes / 1e9; // convert to GB
+    float mem_avail = dev_info.memory.available_physical * 1024.0f * 1024.0f * 1024.0f / 1e9; // convert to GB
+    float disk_read_bw = dev_info.disk.read_seq_bw; // GB/s
+    return std::max(0.0, static_cast<double>(total_gbytes - mem_avail) / disk_read_bw * 1000); // convert to ms
+}
+
+static float device_swap_access_delay(struct device_info & dev_info, int n_layers) {
+    (void)dev_info;
+    (void)n_layers;
+    return 0.0f;
+}
+
 void device_print_props(struct device_info * dev_info_set, int n, struct llama_model * model) {
     LOG_INF("\n-------------------------------------------------------------------------------------------\n");
     LOG_INF("| Property ");
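For reference on the unit bookkeeping in the two return statements of device_memory_access_delay: with a bandwidth in GB/s (1 GB/s = 1e9 bytes/s), bytes / 1e6 / bandwidth yields milliseconds. device_disk_access_delay then charges only the part of the weights that does not fit in available physical memory, again in milliseconds. A standalone sketch with made-up numbers, not from the commit:

    #include <algorithm>
    #include <cstdio>

    int main() {
        // hypothetical per-token totals for one device (illustrative values only)
        double total_bytes  = 4.5e9;  // ~4.5 GB of weights touched per token
        double mem_bw_gbps  = 20.0;   // memory.read_bandwidth, GB/s
        double mem_avail_gb = 3.0;    // available physical memory, GB
        double disk_bw_gbps = 2.0;    // disk.read_seq_bw, GB/s

        // bytes / 1e6 / (GB/s) == bytes / (bw * 1e9 B/s) * 1000 -> milliseconds
        double mem_ms  = total_bytes / 1e6 / mem_bw_gbps;                                          // 225 ms
        // only the overflow beyond available RAM is re-read from disk
        double disk_ms = std::max(0.0, (total_bytes / 1e9 - mem_avail_gb) / disk_bw_gbps * 1000);  // 750 ms

        std::printf("memory access: %.1f ms, disk access: %.1f ms\n", mem_ms, disk_ms);
        return 0;
    }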
@@ -771,15 +859,33 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_model * model) {
     }
     LOG_INF("\n");
 
-    LOG_INF("| Mem Read Bandwidth (GB/s) ");
+    LOG_INF("| Mem Read Bandwidth (GB/s) ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].memory.read_bandwidth);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Disk Read Bandwidth (GB/s) ");
+    LOG_INF("| Disk Read Seq Speed (GB/s) ");
     for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.2f ", dev_info_set[i].disk_read_bandwidth);
+        LOG_INF("| %-10.2f ", dev_info_set[i].disk.read_seq_bw);
     }
     LOG_INF("\n");
 
+    LOG_INF("| Disk Write Seq Speed (GB/s) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.2f ", dev_info_set[i].disk.write_seq_bw);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| Disk Read Rnd Speed (GB/s) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.2f ", dev_info_set[i].disk.read_rnd_bw);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| Disk Write Rnd Speed (GB/s) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.2f ", dev_info_set[i].disk.write_rnd_bw);
+    }
+    LOG_INF("\n");
+
@@ -1015,10 +1121,13 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_model * model) {
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q80);
     LOG_INF("\n");
 
+    // todo: calculate for each device, not only master
     float latency = 0.0f;
     int n_layers = llama_model_n_layers(model);
     latency += device_compute_delay(dev_info_set[0], n_layers);
     latency += device_memory_access_delay(dev_info_set[0], n_layers);
+    latency += device_disk_access_delay(dev_info_set[0], n_layers); // if physical memory is not enough, some tensor weights will be released from memory and reloaded by mmap later
+    latency += device_swap_access_delay(dev_info_set[0], n_layers); // if physical memory is not enough, activations will be stored in swap, which causes additional disk io with random access
 
     LOG_INF("| Token latency (ms) ");
     LOG_INF("| %-10.2f ", latency);
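With the two terms added above, the value printed as "Token latency (ms)" is simply t_compute + t_memory_access + t_disk_access + t_swap_access, all in milliseconds; the disk term is clamped to zero whenever the model weights fit in available physical memory, and device_swap_access_delay is currently a stub that returns 0.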
@@ -1043,7 +1152,7 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
         + cpu_description_len
         + gpu_name_len
         + gpu_description_len
-        + sizeof(float) // disk_read_bandwidth
+        + sizeof(struct disk_props)
         + sizeof(uint32_t) // cpu_props.cores
         + sizeof(float) * 5 // cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q6k_f32, cpu_props.flops_q80_f32
         + sizeof(struct memory_info)

@@ -1086,8 +1195,8 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     ptr += gpu_description_len;
 
     // copy the non-string members
-    memcpy(ptr, &dev_info->disk_read_bandwidth, sizeof(float));
-    ptr += sizeof(float);
+    memcpy(ptr, &dev_info->disk, sizeof(struct disk_props));
+    ptr += sizeof(struct disk_props);
 
     memcpy(ptr, &dev_info->cpu_props.cores, sizeof(uint32_t));
     ptr += sizeof(uint32_t);

@@ -1203,8 +1312,8 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     ptr += gpu_description_len;
 
     // other non-string members
-    memcpy(&dev_info->disk_read_bandwidth, ptr, sizeof(float));
-    ptr += sizeof(float);
+    memcpy(&dev_info->disk, ptr, sizeof(struct disk_props));
+    ptr += sizeof(struct disk_props);
 
     memcpy(&dev_info->cpu_props.cores, ptr, sizeof(uint32_t));
     ptr += sizeof(uint32_t);
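Because disk_props is plain old data (four floats, see the header changes below), serialize() and deserialize() can move it with a single memcpy as long as the writer and reader agree on the struct layout; the size bookkeeping correspondingly swaps sizeof(float) for sizeof(struct disk_props). A self-contained sketch of the same round-trip pattern, not from the commit (the struct here is a stand-in, not the repo's type):

    #include <cstdio>
    #include <cstring>

    struct disk_props_example {   // stand-in with the same shape as struct disk_props
        float read_seq_bw;
        float read_rnd_bw;
        float write_seq_bw;
        float write_rnd_bw;
    };

    int main() {
        disk_props_example in = {2.1f, 0.4f, 1.8f, 0.3f};
        char buffer[sizeof(disk_props_example)];

        std::memcpy(buffer, &in, sizeof(in));    // serialize: struct -> bytes
        disk_props_example out;
        std::memcpy(&out, buffer, sizeof(out));  // deserialize: bytes -> struct

        std::printf("read_seq_bw=%.1f GB/s\n", out.read_seq_bw);
        return 0;
    }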
@@ -4,6 +4,10 @@
 #include "ggml.h"
 #include "llama.h"
 
+#define DISK_TEST_TOTAL_BYTE 500L * 1024 * 1024
+#define DISK_TEST_SEQ_BLOCK 100L * 1024 * 1024
+#define DISK_TEST_RND_BLOCK 4096
+
 struct cpu_props {
     const char * name;
     const char * description;

@@ -26,10 +30,10 @@ struct cpu_props {
 };
 
 struct memory_info {
-    float total_physical;     // in GB
-    float available_physical; // in GB
-    float total_swap;         // in GB
-    float available_swap;     // in GB
+    float total_physical;     // in GiB
+    float available_physical; // in GiB
+    float total_swap;         // in GiB
+    float available_swap;     // in GiB
     float read_bandwidth;     // in GB/s
 
     memory_info() :

@@ -62,8 +66,8 @@ struct gpu_support {
 struct gpu_props {
     const char * name;
     const char * description;
-    float memory_free;         // in GB
-    float memory_total;        // in GB
+    float memory_free;         // in GiB
+    float memory_total;        // in GiB
     float read_bandwidth;      // in GB/s
     float metal_flops_f32_f32; // in GFLOPS
     float metal_flops_f16_f32; // in GFLOPS

@@ -81,7 +85,7 @@ struct gpu_props {
         description(""),
         memory_free (0.0f),
         memory_total (0.0f),
-        read_bandwidth (1.0f),
+        read_bandwidth (0.0f),
         metal_flops_f32_f32(0.0f),
         metal_flops_f16_f32(0.0f),
         metal_flops_q4k_f32(0.0f),

@@ -156,10 +160,23 @@ struct model_params {
         layer_q80 (0) {}
 };
 
+struct disk_props {
+    float read_seq_bw;  // in GB/s
+    float read_rnd_bw;  // in GB/s
+    float write_seq_bw; // in GB/s
+    float write_rnd_bw; // in GB/s
+
+    disk_props() :
+        read_seq_bw (0.0f),
+        read_rnd_bw (0.0f),
+        write_seq_bw(0.0f),
+        write_rnd_bw(0.0f) {}
+};
+
 struct device_info {
     uint32_t rank;
     const char * device_name;
-    float disk_read_bandwidth; // in GB/s
+    struct disk_props disk;
     struct cpu_props cpu_props;
     struct memory_info memory;
     struct gpu_support gpu_support;
@@ -170,7 +187,7 @@ struct device_info {
     device_info() :
         rank(0),
         device_name(""),
-        disk_read_bandwidth(0.0f),
+        disk(),
         cpu_props(),
         memory(),
         gpu_support(),

@@ -193,18 +210,19 @@ enum profiler_layer_type {
 
 const char * device_name(void);
 
-uint32_t device_cpu_cores (void);
-float device_cpu_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, int n_threads);
-float device_metal_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t);
-float device_cuda_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t);
-float device_inp_embd_delay (struct llama_model * model, enum ggml_type src0t, int n_tokens, int n_threads);
-uint64_t device_physical_memory(bool available);
-uint64_t device_swap_memory (bool available);
-uint64_t device_disk_read_bw (const char * test_file, size_t buffer_size_mb);
-float device_memory_bw (int n_thread);
-float device_cuda_memory_bw (struct llama_model * model);
-void device_get_props (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
-void device_print_props (struct device_info * dev_info_set, int n, struct llama_model * model);
+uint32_t device_cpu_cores (void);
+float device_cpu_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, int n_threads);
+float device_metal_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t);
+float device_cuda_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t);
+float device_inp_embd_delay (struct llama_model * model, enum ggml_type src0t, int n_tokens, int n_threads);
+uint64_t device_physical_memory (bool available);
+uint64_t device_swap_memory (bool available);
+void device_disk_seq_bw (float * read_seq_bw, float * write_seq_bw);
+void device_disk_rnd_bw (float * read_rnd_bw, float * write_rnd_bw);
+float device_memory_bw (int n_thread);
+float device_cuda_memory_bw (struct llama_model * model);
+void device_get_props (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
+void device_print_props (struct device_info * dev_info_set, int n, struct llama_model * model);
 
 int device_has_metal (void);
 int device_has_cuda (void);
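A hypothetical caller, not part of the commit, showing how the two new declarations are meant to be used together with struct disk_props (the header name in the include is an assumption, since the diff does not show file names):

    #include <cstdio>
    #include "profiler.h" // assumed header name for the declarations shown above

    static void profile_disk_example(struct disk_props * disk) {
        // sequential pass: external_fio_impl runs fio with rw=read / rw=write and bs=1M
        device_disk_seq_bw(&disk->read_seq_bw, &disk->write_seq_bw);
        // random pass: external_fio_impl runs fio with rw=randread / rw=randwrite and bs=4k
        device_disk_rnd_bw(&disk->read_rnd_bw, &disk->write_rnd_bw);

        std::printf("seq read %.2f GB/s, seq write %.2f GB/s, rnd read %.2f GB/s, rnd write %.2f GB/s\n",
                    disk->read_seq_bw, disk->write_seq_bw, disk->read_rnd_bw, disk->write_rnd_bw);
    }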
@@ -3547,7 +3547,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
     GGML_UNUSED(model);
 }
 
-void llama_profile_device(device_info * dev_info, struct llama_model * model, llama_model_loader * ml, const char * test_file, int n_threads) {
+void llama_profile_device(device_info * dev_info, struct llama_model * model, llama_model_loader * ml, int n_threads) {
     dev_info->device_name = device_name();
     dev_info->cpu_props.cores = device_cpu_cores();
     dev_info->cpu_props.flops_f32_f32 = device_cpu_flops(model, GGML_TYPE_F32, GGML_TYPE_F32, n_threads);

@@ -3562,7 +3562,8 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll
     dev_info->memory.available_swap = round(device_swap_memory(true) / (double)(1 << 30) * 100) / 100;
     dev_info->memory.read_bandwidth = device_memory_bw(n_threads);
 
-    dev_info->disk_read_bandwidth = round(device_disk_read_bw(test_file, 500) / (double)(1 << 30) * 100) / 100;
+    device_disk_seq_bw(&dev_info->disk.read_seq_bw, &dev_info->disk.write_seq_bw);
+    device_disk_rnd_bw(&dev_info->disk.read_rnd_bw, &dev_info->disk.write_rnd_bw);
 
     dev_info->gpu_support.metal = device_has_metal();
     dev_info->gpu_support.cuda = device_has_cuda();
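On the unit correction in the commit title: memory and swap sizes above are stored in GiB (dividing by 2^30 and rounding to two decimals), while the fio-derived disk figures are already decimal GB/s and are stored unchanged. A small worked example of the rounding, not from the commit:

    #include <cmath>
    #include <cstdio>

    int main() {
        unsigned long long bytes = 17179869184ULL;                      // 16 GiB of RAM
        double gib = std::round(bytes / (double)(1 << 30) * 100) / 100; // -> 16.00
        std::printf("%.2f GiB\n", gib);
        return 0;
    }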