Fix disk r/w test, add disk access latency, and correct units (GB vs. GiB)

This commit is contained in:
Lizonghang 2024-11-27 21:36:12 +04:00
parent 9cd22177d0
commit f7507ec20b
3 changed files with 206 additions and 78 deletions

View file

@ -34,6 +34,8 @@
#include <vector> #include <vector>
#include <inttypes.h> #include <inttypes.h>
#include <thread> #include <thread>
#include <random>
#include <regex>
const char * device_name() { const char * device_name() {
static char device_name[256]; static char device_name[256];
@ -439,49 +441,104 @@ uint64_t device_swap_memory(bool available) {
return swap_memory; return swap_memory;
} }
uint64_t device_disk_read_bw(const char * test_file, size_t buffer_size_mb) { static void external_fio_impl(float * read_bw, float * write_bw, bool op_rand) {
uint64_t speed = 0; const char * test_file = "fio_test";
size_t buffer_size = buffer_size_mb * 1024 * 1024; // buffer size in bytes const char * fio_conf_template = R"(
[global]
ioengine=posixaio
direct=1
time_based=1
runtime=2
size=500M
group_reporting=1
try { [read-job]
// open a file for reading rw=%s
std::ifstream file(test_file, std::ios::binary | std::ios::in); bs=%s
if (!file) { filename=%s
LOG_ERR("Unable to open the file at path: %s\n", test_file);
return speed;
}
// prepare buffer for reading [write-job]
std::vector<char> buffer(buffer_size); rw=%s
bs=%s
filename=%s
)";
auto start_time = std::chrono::high_resolution_clock::now(); const char * read_type = op_rand ? "randread" : "read";
const char * write_type = op_rand ? "randwrite" : "write";
const char * block_size = op_rand ? "4k" : "1M";
// read file into buffer // write config to a file
file.read(buffer.data(), buffer.size()); char fio_conf[1024];
if (!file) { snprintf(fio_conf, sizeof(fio_conf), fio_conf_template,
LOG_ERR("Failed to read enough data from the test file\n"); read_type, block_size, test_file,
return speed; write_type, block_size, test_file);
} const char * conf_file = "config.fio";
std::ofstream conf(conf_file);
if (!conf) {
LOG_INF("Error: Unable to create configuration file\n");
return;
}
conf << fio_conf;
conf.close();
auto end_time = std::chrono::high_resolution_clock::now(); // run fio and redirect output to a file
std::chrono::duration<double> elapsed_time = end_time - start_time; const char * output_file = "fio_output.log";
std::string command = "fio " + std::string(conf_file) + " > " + std::string(output_file);
// speed in bytes per second if (std::system(command.c_str()) != 0) {
if (elapsed_time.count() > 0) { LOG_INF("Error: Failed to run fio\n");
speed = static_cast<uint64_t>(buffer.size() / elapsed_time.count()); return;
}
buffer.clear();
buffer.shrink_to_fit();
} catch (const std::exception &e) {
LOG_ERR("Exception while calculating disk read speed: %s\n", e.what());
} }
return speed; // parse fio output
std::ifstream result(output_file);
if (!result) {
LOG_INF("Error: Failed to open fio output file\n");
return;
}
*read_bw = 0.0f;
*write_bw = 0.0f;
std::string line;
std::regex read_regex(R"(READ: bw=([0-9.]+)([a-zA-Z/]+))");
std::regex write_regex(R"(WRITE: bw=([0-9.]+)([a-zA-Z/]+))");
std::smatch match;
while (std::getline(result, line)) {
if (std::regex_search(line, match, read_regex)) {
float value = std::stof(match[1]);
std::string unit = match[2];
if (unit == "MiB/s") {
*read_bw = value * 1024.0f * 1024.0f / 1e9; // convert MiB/s to GB/s
} else if (unit == "MB/s") {
*read_bw = value / 1000.0f; // convert MB/s to GB/s
}
} else if (std::regex_search(line, match, write_regex)) {
float value = std::stof(match[1]);
std::string unit = match[2];
if (unit == "MiB/s") {
*write_bw = value * 1024.0f * 1024.0f / 1e9; // convert MiB/s to GB/s
} else if (unit == "MB/s") {
*write_bw = value / 1000.0f; // convert MB/s to GB/s
}
}
}
// clean up temporary files
std::remove(test_file);
std::remove(conf_file);
std::remove(output_file);
}
// Measure random-access disk bandwidth with fio (4k blocks, randread/randwrite).
// On success the results are stored in *read_rnd_bw / *write_rnd_bw in GB/s;
// if fio cannot be run, the out-parameters may be left unmodified.
void device_disk_rnd_bw(float * read_rnd_bw, float * write_rnd_bw) {
    const bool use_random_ops = true;
    external_fio_impl(read_rnd_bw, write_rnd_bw, use_random_ops);
}
// Measure sequential disk bandwidth with fio (1M blocks, read/write jobs).
// On success the results are stored in *read_seq_bw / *write_seq_bw in GB/s;
// if fio cannot be run, the out-parameters may be left unmodified.
void device_disk_seq_bw(float * read_seq_bw, float * write_seq_bw) {
external_fio_impl(read_seq_bw, write_seq_bw, false);
} } // NOTE(review): the second "}" looks like diff-render residue — confirm against the original file
float device_memory_bw(int n_thread) { float device_memory_bw(int n_thread) {
size_t buffer_size = 5L * 1024 * 1024; // 5m size_t buffer_size = 5L * 1024 * 1024; // 5 MiB
std::vector<std::thread> thread_pool; std::vector<std::thread> thread_pool;
std::vector<double> results(n_thread); std::vector<double> results(n_thread);
std::vector<char *> buffers(n_thread); std::vector<char *> buffers(n_thread);
@ -579,6 +636,7 @@ float device_cuda_memory_bw(struct llama_model * model) {
return bandwidth; return bandwidth;
#else #else
(void)model;
return 0.0f; return 0.0f;
#endif #endif
} }
@ -662,28 +720,58 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers) {
static float device_memory_access_delay(struct device_info & dev_info, int n_layers) { static float device_memory_access_delay(struct device_info & dev_info, int n_layers) {
struct model_params n_params = dev_info.model_params; struct model_params n_params = dev_info.model_params;
int64_t total_params = 0; int64_t total_bytes = 0;
total_params += n_params.layer_f32 * 4 + total_bytes += n_params.layer_f32 * 4 +
n_params.layer_f16 * 2 + n_params.layer_f16 * 2 +
n_params.layer_q4k / 2 + n_params.layer_q4k / 2 +
n_params.layer_q6k * 3 / 8 + n_params.layer_q6k * 3 / 8 +
n_params.layer_q80; n_params.layer_q80;
total_params *= n_layers; total_bytes *= n_layers;
total_params += n_params.output_f32 * 4 + total_bytes += n_params.output_f32 * 4 +
n_params.output_f16 * 2 + n_params.output_f16 * 2 +
n_params.output_q4k / 2 + n_params.output_q4k / 2 +
n_params.output_q6k * 3 / 8 + n_params.output_q6k * 3 / 8 +
n_params.output_q80; n_params.output_q80;
#ifdef GGML_USE_CUDA #ifdef GGML_USE_CUDA
return (double)total_params / 1e6 / dev_info.gpu_props.read_bandwidth; return (double)total_bytes / 1e6 / dev_info.gpu_props.read_bandwidth; // ms
#else #else
return (double)total_params / 1e6 / dev_info.memory.read_bandwidth; return (double)total_bytes / 1e6 / dev_info.memory.read_bandwidth; // ms
#endif #endif
} }
// Estimate the extra per-token delay (in ms) from re-reading tensor weights
// off disk when the model does not fit in available physical memory.
//
// dev_info - profiled device properties (model params, memory, disk bandwidth)
// n_layers - number of repeating transformer layers in the model
//
// Returns 0 when the weights fit in memory, or when the sequential disk read
// bandwidth is unknown (e.g. fio missing/failed, leaving it at 0) — the
// original unguarded division would have returned +inf in that case.
static float device_disk_access_delay(struct device_info & dev_info, int n_layers) {
    struct model_params n_params = dev_info.model_params;
    int64_t total_bytes = 0;

    // Per-layer weight bytes: 4 B/param (f32), 2 B/param (f16), 1/2 B (q4k),
    // 3/8 B (q6k) and 1 B (q80).
    // NOTE(review): q6k is 6.5625 bits/weight and q8_0 is 8.5 bits/weight —
    // confirm these byte factors against device_memory_access_delay.
    total_bytes += n_params.layer_f32 * 4 +
                   n_params.layer_f16 * 2 +
                   n_params.layer_q4k / 2 +
                   n_params.layer_q6k * 3 / 8 +
                   n_params.layer_q80;
    total_bytes *= n_layers;

    // Output tensors are counted once, independent of n_layers.
    total_bytes += n_params.output_f32 * 4 +
                   n_params.output_f16 * 2 +
                   n_params.output_q4k / 2 +
                   n_params.output_q6k * 3 / 8 +
                   n_params.output_q80;

    float total_gbytes  = (double)total_bytes / 1e9; // convert to GB
    // memory.available_physical is in GiB; convert to GB to match total_gbytes
    float mem_avail     = dev_info.memory.available_physical * 1024.0f * 1024.0f * 1024.0f / 1e9;
    float disk_read_bw  = dev_info.disk.read_seq_bw; // GB/s

    // No spill to disk, or bandwidth not measured -> no modeled disk delay.
    if (total_gbytes <= mem_avail || disk_read_bw <= 0.0f) {
        return 0.0f;
    }
    return (total_gbytes - mem_avail) / disk_read_bw * 1000; // convert s to ms
}
// Placeholder for the extra per-token delay (in ms) caused by swap traffic.
// Not modeled yet — always returns 0. The parameters are kept (and explicitly
// voided) so the signature matches the other device_*_delay estimators.
// TODO: model random-access disk I/O when activations spill to swap.
static float device_swap_access_delay(struct device_info & dev_info, int n_layers) {
(void)dev_info;
(void)n_layers;
return 0.0f;
}
void device_print_props(struct device_info * dev_info_set, int n, struct llama_model * model) { void device_print_props(struct device_info * dev_info_set, int n, struct llama_model * model) {
LOG_INF("\n-------------------------------------------------------------------------------------------\n"); LOG_INF("\n-------------------------------------------------------------------------------------------\n");
LOG_INF("| Property "); LOG_INF("| Property ");
@ -771,15 +859,33 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
} }
LOG_INF("\n"); LOG_INF("\n");
LOG_INF("| Mem Read Bandwidth (GB/s) "); LOG_INF("| Mem Read Bandwidth (GB/s) ");
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
LOG_INF("| %-10.2f ", dev_info_set[i].memory.read_bandwidth); LOG_INF("| %-10.2f ", dev_info_set[i].memory.read_bandwidth);
} }
LOG_INF("\n"); LOG_INF("\n");
LOG_INF("| Disk Read Bandwidth (GB/s) "); LOG_INF("| Disk Read Seq Speed (GB/s) ");
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
LOG_INF("| %-10.2f ", dev_info_set[i].disk_read_bandwidth); LOG_INF("| %-10.2f ", dev_info_set[i].disk.read_seq_bw);
}
LOG_INF("\n");
LOG_INF("| Disk Write Seq Speed (GB/s) ");
for (int i = 0; i < n; ++i) {
LOG_INF("| %-10.2f ", dev_info_set[i].disk.write_seq_bw);
}
LOG_INF("\n");
LOG_INF("| Disk Read Rnd Speed (GB/s) ");
for (int i = 0; i < n; ++i) {
LOG_INF("| %-10.2f ", dev_info_set[i].disk.read_rnd_bw);
}
LOG_INF("\n");
LOG_INF("| Disk Write Rnd Speed (GB/s) ");
for (int i = 0; i < n; ++i) {
LOG_INF("| %-10.2f ", dev_info_set[i].disk.write_rnd_bw);
} }
LOG_INF("\n"); LOG_INF("\n");
@ -1015,10 +1121,13 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q80); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q80);
LOG_INF("\n"); LOG_INF("\n");
// todo: calculate for each device, not only master
float latency = 0.0f; float latency = 0.0f;
int n_layers = llama_model_n_layers(model); int n_layers = llama_model_n_layers(model);
latency += device_compute_delay(dev_info_set[0], n_layers); latency += device_compute_delay(dev_info_set[0], n_layers);
latency += device_memory_access_delay(dev_info_set[0], n_layers); latency += device_memory_access_delay(dev_info_set[0], n_layers);
latency += device_disk_access_delay(dev_info_set[0], n_layers); // if physical memory is not enough, some tensor weights will be released from memory and reloaded by mmap later
latency += device_swap_access_delay(dev_info_set[0], n_layers); // if physical memory is not enough, activations will be stored in swap, which causes additional disk io with random access
LOG_INF("| Token latency (ms) "); LOG_INF("| Token latency (ms) ");
LOG_INF("| %-10.2f ", latency); LOG_INF("| %-10.2f ", latency);
@ -1043,7 +1152,7 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
+ cpu_description_len + cpu_description_len
+ gpu_name_len + gpu_name_len
+ gpu_description_len + gpu_description_len
+ sizeof(float) // disk_read_bandwidth + sizeof(struct disk_props)
+ sizeof(uint32_t) // cpu_props.cores + sizeof(uint32_t) // cpu_props.cores
+ sizeof(float) * 5 // cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q6k_f32, cpu_props.flops_q80_f32 + sizeof(float) * 5 // cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q6k_f32, cpu_props.flops_q80_f32
+ sizeof(struct memory_info) + sizeof(struct memory_info)
@ -1086,8 +1195,8 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
ptr += gpu_description_len; ptr += gpu_description_len;
// copy the non-string members // copy the non-string members
memcpy(ptr, &dev_info->disk_read_bandwidth, sizeof(float)); memcpy(ptr, &dev_info->disk, sizeof(struct disk_props));
ptr += sizeof(float); ptr += sizeof(struct disk_props);
memcpy(ptr, &dev_info->cpu_props.cores, sizeof(uint32_t)); memcpy(ptr, &dev_info->cpu_props.cores, sizeof(uint32_t));
ptr += sizeof(uint32_t); ptr += sizeof(uint32_t);
@ -1203,8 +1312,8 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
ptr += gpu_description_len; ptr += gpu_description_len;
// other non-string members // other non-string members
memcpy(&dev_info->disk_read_bandwidth, ptr, sizeof(float)); memcpy(&dev_info->disk, ptr, sizeof(struct disk_props));
ptr += sizeof(float); ptr += sizeof(struct disk_props);
memcpy(&dev_info->cpu_props.cores, ptr, sizeof(uint32_t)); memcpy(&dev_info->cpu_props.cores, ptr, sizeof(uint32_t));
ptr += sizeof(uint32_t); ptr += sizeof(uint32_t);

View file

@ -4,6 +4,10 @@
#include "ggml.h" #include "ggml.h"
#include "llama.h" #include "llama.h"
// Disk benchmark parameters.
// Replacement lists containing operators are parenthesized so expressions like
// `x % DISK_TEST_TOTAL_BYTE` expand correctly (the unparenthesized originals
// would bind `%` to the first factor only).
#define DISK_TEST_TOTAL_BYTE (500L * 1024 * 1024) // total bytes per test (500 MiB)
#define DISK_TEST_SEQ_BLOCK  (100L * 1024 * 1024) // sequential r/w block size (100 MiB)
#define DISK_TEST_RND_BLOCK  4096                 // random r/w block size (4 KiB)
struct cpu_props { struct cpu_props {
const char * name; const char * name;
const char * description; const char * description;
@ -26,10 +30,10 @@ struct cpu_props {
}; };
struct memory_info { struct memory_info {
float total_physical; // in GB float total_physical; // in GiB
float available_physical; // in GB float available_physical; // in GiB
float total_swap; // in GB float total_swap; // in GiB
float available_swap; // in GB float available_swap; // in GiB
float read_bandwidth; // in GB/s float read_bandwidth; // in GB/s
memory_info() : memory_info() :
@ -62,8 +66,8 @@ struct gpu_support {
struct gpu_props { struct gpu_props {
const char * name; const char * name;
const char * description; const char * description;
float memory_free; // in GB float memory_free; // in GiB
float memory_total; // in GB float memory_total; // in GiB
float read_bandwidth; // in GB/s float read_bandwidth; // in GB/s
float metal_flops_f32_f32; // in GFLOPS float metal_flops_f32_f32; // in GFLOPS
float metal_flops_f16_f32; // in GFLOPS float metal_flops_f16_f32; // in GFLOPS
@ -81,7 +85,7 @@ struct gpu_props {
description(""), description(""),
memory_free (0.0f), memory_free (0.0f),
memory_total (0.0f), memory_total (0.0f),
read_bandwidth (1.0f), read_bandwidth (0.0f),
metal_flops_f32_f32(0.0f), metal_flops_f32_f32(0.0f),
metal_flops_f16_f32(0.0f), metal_flops_f16_f32(0.0f),
metal_flops_q4k_f32(0.0f), metal_flops_q4k_f32(0.0f),
@ -156,10 +160,23 @@ struct model_params {
layer_q80 (0) {} layer_q80 (0) {}
}; };
// Disk bandwidth measurements collected via fio, all in GB/s.
// Zero means "not measured yet" (or fio unavailable).
struct disk_props {
    float read_seq_bw  = 0.0f; // sequential read
    float read_rnd_bw  = 0.0f; // random read (4k blocks)
    float write_seq_bw = 0.0f; // sequential write
    float write_rnd_bw = 0.0f; // random write (4k blocks)
};
struct device_info { struct device_info {
uint32_t rank; uint32_t rank;
const char * device_name; const char * device_name;
float disk_read_bandwidth; // in GB/s struct disk_props disk;
struct cpu_props cpu_props; struct cpu_props cpu_props;
struct memory_info memory; struct memory_info memory;
struct gpu_support gpu_support; struct gpu_support gpu_support;
@ -170,7 +187,7 @@ struct device_info {
device_info() : device_info() :
rank(0), rank(0),
device_name(""), device_name(""),
disk_read_bandwidth(0.0f), disk(),
cpu_props(), cpu_props(),
memory(), memory(),
gpu_support(), gpu_support(),
@ -193,18 +210,19 @@ enum profiler_layer_type {
const char * device_name(void); const char * device_name(void);
uint32_t device_cpu_cores (void); uint32_t device_cpu_cores (void);
float device_cpu_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, int n_threads); float device_cpu_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, int n_threads);
float device_metal_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t); float device_metal_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t);
float device_cuda_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t); float device_cuda_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t);
float device_inp_embd_delay (struct llama_model * model, enum ggml_type src0t, int n_tokens, int n_threads); float device_inp_embd_delay (struct llama_model * model, enum ggml_type src0t, int n_tokens, int n_threads);
uint64_t device_physical_memory(bool available); uint64_t device_physical_memory (bool available);
uint64_t device_swap_memory (bool available); uint64_t device_swap_memory (bool available);
uint64_t device_disk_read_bw (const char * test_file, size_t buffer_size_mb); void device_disk_seq_bw (float * read_seq_bw, float * write_seq_bw);
float device_memory_bw (int n_thread); void device_disk_rnd_bw (float * read_rnd_bw, float * write_rnd_bw);
float device_cuda_memory_bw (struct llama_model * model); float device_memory_bw (int n_thread);
void device_get_props (struct llama_model * model, int device, struct ggml_backend_dev_props * props); float device_cuda_memory_bw (struct llama_model * model);
void device_print_props (struct device_info * dev_info_set, int n, struct llama_model * model); void device_get_props (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
void device_print_props (struct device_info * dev_info_set, int n, struct llama_model * model);
int device_has_metal (void); int device_has_metal (void);
int device_has_cuda (void); int device_has_cuda (void);

View file

@ -3547,7 +3547,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
GGML_UNUSED(model); GGML_UNUSED(model);
} }
void llama_profile_device(device_info * dev_info, struct llama_model * model, llama_model_loader * ml, const char * test_file, int n_threads) { void llama_profile_device(device_info * dev_info, struct llama_model * model, llama_model_loader * ml, int n_threads) {
dev_info->device_name = device_name(); dev_info->device_name = device_name();
dev_info->cpu_props.cores = device_cpu_cores(); dev_info->cpu_props.cores = device_cpu_cores();
dev_info->cpu_props.flops_f32_f32 = device_cpu_flops(model, GGML_TYPE_F32, GGML_TYPE_F32, n_threads); dev_info->cpu_props.flops_f32_f32 = device_cpu_flops(model, GGML_TYPE_F32, GGML_TYPE_F32, n_threads);
@ -3562,7 +3562,8 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll
dev_info->memory.available_swap = round(device_swap_memory(true) / (double)(1 << 30) * 100) / 100; dev_info->memory.available_swap = round(device_swap_memory(true) / (double)(1 << 30) * 100) / 100;
dev_info->memory.read_bandwidth = device_memory_bw(n_threads); dev_info->memory.read_bandwidth = device_memory_bw(n_threads);
dev_info->disk_read_bandwidth = round(device_disk_read_bw(test_file, 500) / (double)(1 << 30) * 100) / 100; device_disk_seq_bw(&dev_info->disk.read_seq_bw, &dev_info->disk.write_seq_bw);
device_disk_rnd_bw(&dev_info->disk.read_rnd_bw, &dev_info->disk.write_rnd_bw);
dev_info->gpu_support.metal = device_has_metal(); dev_info->gpu_support.metal = device_has_metal();
dev_info->gpu_support.cuda = device_has_cuda(); dev_info->gpu_support.cuda = device_has_cuda();