diff --git a/common/profiler.cpp b/common/profiler.cpp index 5efcf459..9e7094b0 100644 --- a/common/profiler.cpp +++ b/common/profiler.cpp @@ -34,6 +34,8 @@ #include #include #include +#include +#include const char * device_name() { static char device_name[256]; @@ -439,49 +441,104 @@ uint64_t device_swap_memory(bool available) { return swap_memory; } -uint64_t device_disk_read_bw(const char * test_file, size_t buffer_size_mb) { - uint64_t speed = 0; - size_t buffer_size = buffer_size_mb * 1024 * 1024; // buffer size in bytes +static void external_fio_impl(float * read_bw, float * write_bw, bool op_rand) { + const char * test_file = "fio_test"; + const char * fio_conf_template = R"( +[global] +ioengine=posixaio +direct=1 +time_based=1 +runtime=2 +size=500M +group_reporting=1 - try { - // open a file for reading - std::ifstream file(test_file, std::ios::binary | std::ios::in); - if (!file) { - LOG_ERR("Unable to open the file at path: %s\n", test_file); - return speed; - } +[read-job] +rw=%s +bs=%s +filename=%s - // prepare buffer for reading - std::vector buffer(buffer_size); +[write-job] +rw=%s +bs=%s +filename=%s +)"; - auto start_time = std::chrono::high_resolution_clock::now(); + const char * read_type = op_rand ? "randread" : "read"; + const char * write_type = op_rand ? "randwrite" : "write"; + const char * block_size = op_rand ? 
"4k" : "1M"; - // read file into buffer - file.read(buffer.data(), buffer.size()); - if (!file) { - LOG_ERR("Failed to read enough data from the test file\n"); - return speed; - } + // write config to a file + char fio_conf[1024]; + snprintf(fio_conf, sizeof(fio_conf), fio_conf_template, + read_type, block_size, test_file, + write_type, block_size, test_file); + const char * conf_file = "config.fio"; + std::ofstream conf(conf_file); + if (!conf) { + LOG_INF("Error: Unable to create configuration file\n"); + return; + } + conf << fio_conf; + conf.close(); - auto end_time = std::chrono::high_resolution_clock::now(); - std::chrono::duration elapsed_time = end_time - start_time; - - // speed in bytes per second - if (elapsed_time.count() > 0) { - speed = static_cast(buffer.size() / elapsed_time.count()); - } - - buffer.clear(); - buffer.shrink_to_fit(); - } catch (const std::exception &e) { - LOG_ERR("Exception while calculating disk read speed: %s\n", e.what()); + // run fio and redirect output to a file + const char * output_file = "fio_output.log"; + std::string command = "fio " + std::string(conf_file) + " > " + std::string(output_file); + if (std::system(command.c_str()) != 0) { + LOG_INF("Error: Failed to run fio\n"); + return; } - return speed; + // parse fio output + std::ifstream result(output_file); + if (!result) { + LOG_INF("Error: Failed to open fio output file\n"); + return; + } + *read_bw = 0.0f; + *write_bw = 0.0f; + + std::string line; + std::regex read_regex(R"(READ: bw=([0-9.]+)([a-zA-Z/]+))"); + std::regex write_regex(R"(WRITE: bw=([0-9.]+)([a-zA-Z/]+))"); + std::smatch match; + + while (std::getline(result, line)) { + if (std::regex_search(line, match, read_regex)) { + float value = std::stof(match[1]); + std::string unit = match[2]; + if (unit == "MiB/s") { + *read_bw = value * 1024.0f * 1024.0f / 1e9; // convert MiB/s to GB/s + } else if (unit == "MB/s") { + *read_bw = value / 1000.0f; // convert MB/s to GB/s + } + } else if 
(std::regex_search(line, match, write_regex)) { + float value = std::stof(match[1]); + std::string unit = match[2]; + if (unit == "MiB/s") { + *write_bw = value * 1024.0f * 1024.0f / 1e9; // convert MiB/s to GB/s + } else if (unit == "MB/s") { + *write_bw = value / 1000.0f; // convert MB/s to GB/s + } + } + } + + // clean up temporary files + std::remove(test_file); + std::remove(conf_file); + std::remove(output_file); +} + +void device_disk_rnd_bw(float * read_rnd_bw, float * write_rnd_bw) { + external_fio_impl(read_rnd_bw, write_rnd_bw, true); +} + +void device_disk_seq_bw(float * read_seq_bw, float * write_seq_bw) { + external_fio_impl(read_seq_bw, write_seq_bw, false); } float device_memory_bw(int n_thread) { - size_t buffer_size = 5L * 1024 * 1024; // 5m + size_t buffer_size = 5L * 1024 * 1024; // 5 MiB std::vector thread_pool; std::vector results(n_thread); std::vector buffers(n_thread); @@ -579,6 +636,7 @@ float device_cuda_memory_bw(struct llama_model * model) { return bandwidth; #else + (void)model; return 0.0f; #endif } @@ -662,28 +720,58 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers) { static float device_memory_access_delay(struct device_info & dev_info, int n_layers) { struct model_params n_params = dev_info.model_params; - int64_t total_params = 0; - total_params += n_params.layer_f32 * 4 + - n_params.layer_f16 * 2 + - n_params.layer_q4k / 2 + - n_params.layer_q6k * 3 / 8 + - n_params.layer_q80; + int64_t total_bytes = 0; + total_bytes += n_params.layer_f32 * 4 + + n_params.layer_f16 * 2 + + n_params.layer_q4k / 2 + + n_params.layer_q6k * 3 / 8 + + n_params.layer_q80; - total_params *= n_layers; + total_bytes *= n_layers; - total_params += n_params.output_f32 * 4 + - n_params.output_f16 * 2 + - n_params.output_q4k / 2 + - n_params.output_q6k * 3 / 8 + - n_params.output_q80; + total_bytes += n_params.output_f32 * 4 + + n_params.output_f16 * 2 + + n_params.output_q4k / 2 + + n_params.output_q6k * 3 / 8 + + 
n_params.output_q80; #ifdef GGML_USE_CUDA - return (double)total_params / 1e6 / dev_info.gpu_props.read_bandwidth; + return (double)total_bytes / 1e6 / dev_info.gpu_props.read_bandwidth; // ms #else - return (double)total_params / 1e6 / dev_info.memory.read_bandwidth; + return (double)total_bytes / 1e6 / dev_info.memory.read_bandwidth; // ms #endif } +static float device_disk_access_delay(struct device_info & dev_info, int n_layers) { + struct model_params n_params = dev_info.model_params; + + int64_t total_bytes = 0; + total_bytes += n_params.layer_f32 * 4 + + n_params.layer_f16 * 2 + + n_params.layer_q4k / 2 + + n_params.layer_q6k * 3 / 8 + + n_params.layer_q80; + + total_bytes *= n_layers; + + total_bytes += n_params.output_f32 * 4 + + n_params.output_f16 * 2 + + n_params.output_q4k / 2 + + n_params.output_q6k * 3 / 8 + + n_params.output_q80; + + float total_gbytes = (double)total_bytes / 1e9; // convert to GB + float mem_avail = dev_info.memory.available_physical * 1024.0f * 1024.0f * 1024.0f / 1e9; // convert to GB + float disk_read_bw = dev_info.disk.read_seq_bw; // GB/s + return std::max(0.0, static_cast(total_gbytes - mem_avail) / disk_read_bw * 1000); // convert to ms +} + +static float device_swap_access_delay(struct device_info & dev_info, int n_layers) { + (void)dev_info; + (void)n_layers; + return 0.0f; +} + void device_print_props(struct device_info * dev_info_set, int n, struct llama_model * model) { LOG_INF("\n-------------------------------------------------------------------------------------------\n"); LOG_INF("| Property "); @@ -771,15 +859,33 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); - LOG_INF("| Mem Read Bandwidth (GB/s) "); + LOG_INF("| Mem Read Bandwidth (GB/s) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].memory.read_bandwidth); } LOG_INF("\n"); - LOG_INF("| Disk Read Bandwidth (GB/s) "); + LOG_INF("| Disk Read Seq Speed (GB/s) "); for (int i = 0; i < n; 
++i) { - LOG_INF("| %-10.2f ", dev_info_set[i].disk_read_bandwidth); + LOG_INF("| %-10.2f ", dev_info_set[i].disk.read_seq_bw); + } + LOG_INF("\n"); + + LOG_INF("| Disk Write Seq Speed (GB/s) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.2f ", dev_info_set[i].disk.write_seq_bw); + } + LOG_INF("\n"); + + LOG_INF("| Disk Read Rnd Speed (GB/s) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.2f ", dev_info_set[i].disk.read_rnd_bw); + } + LOG_INF("\n"); + + LOG_INF("| Disk Write Rnd Speed (GB/s) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.2f ", dev_info_set[i].disk.write_rnd_bw); } LOG_INF("\n"); @@ -1015,10 +1121,13 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q80); LOG_INF("\n"); + // todo: calculate for each device, not only master float latency = 0.0f; int n_layers = llama_model_n_layers(model); latency += device_compute_delay(dev_info_set[0], n_layers); latency += device_memory_access_delay(dev_info_set[0], n_layers); + latency += device_disk_access_delay(dev_info_set[0], n_layers); // if physical memory is not enough, some tensor weights will be released from memory and reloaded by mmap later + latency += device_swap_access_delay(dev_info_set[0], n_layers); // if physical memory is not enough, activations will be stored in swap, which causes additional disk io with random access LOG_INF("| Token latency (ms) "); LOG_INF("| %-10.2f ", latency); @@ -1043,7 +1152,7 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { + cpu_description_len + gpu_name_len + gpu_description_len - + sizeof(float) // disk_read_bandwidth + + sizeof(struct disk_props) + sizeof(uint32_t) // cpu_props.cores + sizeof(float) * 5 // cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q6k_f32, cpu_props.flops_q80_f32 + sizeof(struct memory_info) @@ -1086,8 +1195,8 @@ size_t serialize(const struct device_info * 
// Sizes, in bytes, for the disk benchmark. The products are parenthesized so that the
// macros expand correctly inside larger expressions (e.g. `x / DISK_TEST_TOTAL_BYTE`).
#define DISK_TEST_TOTAL_BYTE (500L * 1024 * 1024)
#define DISK_TEST_SEQ_BLOCK  (100L * 1024 * 1024)
#define DISK_TEST_RND_BLOCK  4096
// Disk throughput figures for one device. Every field is in GB/s and starts at zero.
struct disk_props {
    float read_seq_bw  = 0.0f; // sequential read,  in GB/s
    float read_rnd_bw  = 0.0f; // random read,      in GB/s
    float write_seq_bw = 0.0f; // sequential write, in GB/s
    float write_rnd_bw = 0.0f; // random write,     in GB/s

    // kept user-provided, as before; the member initializers above supply the zeros
    disk_props() {}
};
(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t); +float device_cuda_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t); +float device_inp_embd_delay (struct llama_model * model, enum ggml_type src0t, int n_tokens, int n_threads); +uint64_t device_physical_memory (bool available); +uint64_t device_swap_memory (bool available); +void device_disk_seq_bw (float * read_seq_bw, float * write_seq_bw); +void device_disk_rnd_bw (float * read_rnd_bw, float * write_rnd_bw); +float device_memory_bw (int n_thread); +float device_cuda_memory_bw (struct llama_model * model); +void device_get_props (struct llama_model * model, int device, struct ggml_backend_dev_props * props); +void device_print_props (struct device_info * dev_info_set, int n, struct llama_model * model); int device_has_metal (void); int device_has_cuda (void); diff --git a/src/llama.cpp b/src/llama.cpp index 67e8b21a..c05621f6 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3547,7 +3547,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_ GGML_UNUSED(model); } -void llama_profile_device(device_info * dev_info, struct llama_model * model, llama_model_loader * ml, const char * test_file, int n_threads) { +void llama_profile_device(device_info * dev_info, struct llama_model * model, llama_model_loader * ml, int n_threads) { dev_info->device_name = device_name(); dev_info->cpu_props.cores = device_cpu_cores(); dev_info->cpu_props.flops_f32_f32 = device_cpu_flops(model, GGML_TYPE_F32, GGML_TYPE_F32, n_threads); @@ -3562,7 +3562,8 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll dev_info->memory.available_swap = round(device_swap_memory(true) / (double)(1 << 30) * 100) / 100; dev_info->memory.read_bandwidth = device_memory_bw(n_threads); - dev_info->disk_read_bandwidth = round(device_disk_read_bw(test_file, 500) / (double)(1 << 30) * 100) / 100; + device_disk_seq_bw(&dev_info->disk.read_seq_bw, 
&dev_info->disk.write_seq_bw); + device_disk_rnd_bw(&dev_info->disk.read_rnd_bw, &dev_info->disk.write_rnd_bw); dev_info->gpu_support.metal = device_has_metal(); dev_info->gpu_support.cuda = device_has_cuda();