Fix disk r/w test, add disk access latency, and correct units (GB vs. GiB)

This commit is contained in:
Lizonghang 2024-11-27 21:36:12 +04:00
parent 9cd22177d0
commit f7507ec20b
3 changed files with 206 additions and 78 deletions

View file

@ -34,6 +34,8 @@
#include <vector> #include <vector>
#include <inttypes.h> #include <inttypes.h>
#include <thread> #include <thread>
#include <random>
#include <regex>
const char * device_name() { const char * device_name() {
static char device_name[256]; static char device_name[256];
@ -439,49 +441,104 @@ uint64_t device_swap_memory(bool available) {
return swap_memory; return swap_memory;
} }
uint64_t device_disk_read_bw(const char * test_file, size_t buffer_size_mb) { static void external_fio_impl(float * read_bw, float * write_bw, bool op_rand) {
uint64_t speed = 0; const char * test_file = "fio_test";
size_t buffer_size = buffer_size_mb * 1024 * 1024; // buffer size in bytes const char * fio_conf_template = R"(
[global]
ioengine=posixaio
direct=1
time_based=1
runtime=2
size=500M
group_reporting=1
try { [read-job]
// open a file for reading rw=%s
std::ifstream file(test_file, std::ios::binary | std::ios::in); bs=%s
if (!file) { filename=%s
LOG_ERR("Unable to open the file at path: %s\n", test_file);
return speed;
}
// prepare buffer for reading [write-job]
std::vector<char> buffer(buffer_size); rw=%s
bs=%s
filename=%s
)";
auto start_time = std::chrono::high_resolution_clock::now(); const char * read_type = op_rand ? "randread" : "read";
const char * write_type = op_rand ? "randwrite" : "write";
const char * block_size = op_rand ? "4k" : "1M";
// read file into buffer // write config to a file
file.read(buffer.data(), buffer.size()); char fio_conf[1024];
if (!file) { snprintf(fio_conf, sizeof(fio_conf), fio_conf_template,
LOG_ERR("Failed to read enough data from the test file\n"); read_type, block_size, test_file,
return speed; write_type, block_size, test_file);
} const char * conf_file = "config.fio";
std::ofstream conf(conf_file);
if (!conf) {
LOG_INF("Error: Unable to create configuration file\n");
return;
}
conf << fio_conf;
conf.close();
auto end_time = std::chrono::high_resolution_clock::now(); // run fio and redirect output to a file
std::chrono::duration<double> elapsed_time = end_time - start_time; const char * output_file = "fio_output.log";
std::string command = "fio " + std::string(conf_file) + " > " + std::string(output_file);
// speed in bytes per second if (std::system(command.c_str()) != 0) {
if (elapsed_time.count() > 0) { LOG_INF("Error: Failed to run fio\n");
speed = static_cast<uint64_t>(buffer.size() / elapsed_time.count()); return;
}
buffer.clear();
buffer.shrink_to_fit();
} catch (const std::exception &e) {
LOG_ERR("Exception while calculating disk read speed: %s\n", e.what());
} }
return speed; // parse fio output
std::ifstream result(output_file);
if (!result) {
LOG_INF("Error: Failed to open fio output file\n");
return;
}
*read_bw = 0.0f;
*write_bw = 0.0f;
std::string line;
std::regex read_regex(R"(READ: bw=([0-9.]+)([a-zA-Z/]+))");
std::regex write_regex(R"(WRITE: bw=([0-9.]+)([a-zA-Z/]+))");
std::smatch match;
while (std::getline(result, line)) {
if (std::regex_search(line, match, read_regex)) {
float value = std::stof(match[1]);
std::string unit = match[2];
if (unit == "MiB/s") {
*read_bw = value * 1024.0f * 1024.0f / 1e9; // convert MiB/s to GB/s
} else if (unit == "MB/s") {
*read_bw = value / 1000.0f; // convert MB/s to GB/s
}
} else if (std::regex_search(line, match, write_regex)) {
float value = std::stof(match[1]);
std::string unit = match[2];
if (unit == "MiB/s") {
*write_bw = value * 1024.0f * 1024.0f / 1e9; // convert MiB/s to GB/s
} else if (unit == "MB/s") {
*write_bw = value / 1000.0f; // convert MB/s to GB/s
}
}
}
// clean up temporary files
std::remove(test_file);
std::remove(conf_file);
std::remove(output_file);
}
// Measure random-access disk bandwidth with fio (4k blocks, randread/randwrite).
// On success the results are stored in *read_rnd_bw / *write_rnd_bw in GB/s;
// if fio cannot be run, the out-parameters may be left unmodified.
void device_disk_rnd_bw(float * read_rnd_bw, float * write_rnd_bw) {
    const bool use_random_ops = true;
    external_fio_impl(read_rnd_bw, write_rnd_bw, use_random_ops);
}
// Measure sequential disk bandwidth with fio (1M blocks, read/write jobs).
// On success the results are stored in *read_seq_bw / *write_seq_bw in GB/s;
// if fio cannot be run, the out-parameters may be left unmodified.
void device_disk_seq_bw(float * read_seq_bw, float * write_seq_bw) {
external_fio_impl(read_seq_bw, write_seq_bw, false);
} } // NOTE(review): the second "}" looks like diff-render residue — confirm against the original file
float device_memory_bw(int n_thread) { float device_memory_bw(int n_thread) {
size_t buffer_size = 5L * 1024 * 1024; // 5m size_t buffer_size = 5L * 1024 * 1024; // 5 MiB
std::vector<std::thread> thread_pool; std::vector<std::thread> thread_pool;
std::vector<double> results(n_thread); std::vector<double> results(n_thread);
std::vector<char *> buffers(n_thread); std::vector<char *> buffers(n_thread);
@ -579,6 +636,7 @@ float device_cuda_memory_bw(struct llama_model * model) {
return bandwidth; return bandwidth;
#else #else
(void)model;
return 0.0f; return 0.0f;
#endif #endif
} }
@ -662,28 +720,58 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers) {
static float device_memory_access_delay(struct device_info & dev_info, int n_layers) { static float device_memory_access_delay(struct device_info & dev_info, int n_layers) {
struct model_params n_params = dev_info.model_params; struct model_params n_params = dev_info.model_params;
int64_t total_params = 0; int64_t total_bytes = 0;
total_params += n_params.layer_f32 * 4 + total_bytes += n_params.layer_f32 * 4 +
n_params.layer_f16 * 2 + n_params.layer_f16 * 2 +
n_params.layer_q4k / 2 + n_params.layer_q4k / 2 +
n_params.layer_q6k * 3 / 8 + n_params.layer_q6k * 3 / 8 +
n_params.layer_q80; n_params.layer_q80;
total_params *= n_layers; total_bytes *= n_layers;
total_params += n_params.output_f32 * 4 + total_bytes += n_params.output_f32 * 4 +
n_params.output_f16 * 2 + n_params.output_f16 * 2 +
n_params.output_q4k / 2 + n_params.output_q4k / 2 +
n_params.output_q6k * 3 / 8 + n_params.output_q6k * 3 / 8 +
n_params.output_q80; n_params.output_q80;
#ifdef GGML_USE_CUDA #ifdef GGML_USE_CUDA
return (double)total_params / 1e6 / dev_info.gpu_props.read_bandwidth; return (double)total_bytes / 1e6 / dev_info.gpu_props.read_bandwidth; // ms
#else #else
return (double)total_params / 1e6 / dev_info.memory.read_bandwidth; return (double)total_bytes / 1e6 / dev_info.memory.read_bandwidth; // ms
#endif #endif
} }
// Estimate the extra per-token delay (in ms) from re-reading tensor weights
// off disk when the model does not fit in available physical memory.
//
// dev_info - profiled device properties (model params, memory, disk bandwidth)
// n_layers - number of repeating transformer layers in the model
//
// Returns 0 when the weights fit in memory, or when the sequential disk read
// bandwidth is unknown (e.g. fio missing/failed, leaving it at 0) — the
// original unguarded division would have returned +inf in that case.
static float device_disk_access_delay(struct device_info & dev_info, int n_layers) {
    struct model_params n_params = dev_info.model_params;
    int64_t total_bytes = 0;

    // Per-layer weight bytes: 4 B/param (f32), 2 B/param (f16), 1/2 B (q4k),
    // 3/8 B (q6k) and 1 B (q80).
    // NOTE(review): q6k is 6.5625 bits/weight and q8_0 is 8.5 bits/weight —
    // confirm these byte factors against device_memory_access_delay.
    total_bytes += n_params.layer_f32 * 4 +
                   n_params.layer_f16 * 2 +
                   n_params.layer_q4k / 2 +
                   n_params.layer_q6k * 3 / 8 +
                   n_params.layer_q80;
    total_bytes *= n_layers;

    // Output tensors are counted once, independent of n_layers.
    total_bytes += n_params.output_f32 * 4 +
                   n_params.output_f16 * 2 +
                   n_params.output_q4k / 2 +
                   n_params.output_q6k * 3 / 8 +
                   n_params.output_q80;

    float total_gbytes  = (double)total_bytes / 1e9; // convert to GB
    // memory.available_physical is in GiB; convert to GB to match total_gbytes
    float mem_avail     = dev_info.memory.available_physical * 1024.0f * 1024.0f * 1024.0f / 1e9;
    float disk_read_bw  = dev_info.disk.read_seq_bw; // GB/s

    // No spill to disk, or bandwidth not measured -> no modeled disk delay.
    if (total_gbytes <= mem_avail || disk_read_bw <= 0.0f) {
        return 0.0f;
    }
    return (total_gbytes - mem_avail) / disk_read_bw * 1000; // convert s to ms
}
// Placeholder for the extra per-token delay (in ms) caused by swap traffic.
// Not modeled yet — always returns 0. The parameters are kept (and explicitly
// voided) so the signature matches the other device_*_delay estimators.
// TODO: model random-access disk I/O when activations spill to swap.
static float device_swap_access_delay(struct device_info & dev_info, int n_layers) {
(void)dev_info;
(void)n_layers;
return 0.0f;
}
void device_print_props(struct device_info * dev_info_set, int n, struct llama_model * model) { void device_print_props(struct device_info * dev_info_set, int n, struct llama_model * model) {
LOG_INF("\n-------------------------------------------------------------------------------------------\n"); LOG_INF("\n-------------------------------------------------------------------------------------------\n");
LOG_INF("| Property "); LOG_INF("| Property ");
@ -771,15 +859,33 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
} }
LOG_INF("\n"); LOG_INF("\n");
LOG_INF("| Mem Read Bandwidth (GB/s) "); LOG_INF("| Mem Read Bandwidth (GB/s) ");
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
LOG_INF("| %-10.2f ", dev_info_set[i].memory.read_bandwidth); LOG_INF("| %-10.2f ", dev_info_set[i].memory.read_bandwidth);
} }
LOG_INF("\n"); LOG_INF("\n");
LOG_INF("| Disk Read Bandwidth (GB/s) "); LOG_INF("| Disk Read Seq Speed (GB/s) ");
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
LOG_INF("| %-10.2f ", dev_info_set[i].disk_read_bandwidth); LOG_INF("| %-10.2f ", dev_info_set[i].disk.read_seq_bw);
}
LOG_INF("\n");
LOG_INF("| Disk Write Seq Speed (GB/s) ");
for (int i = 0; i < n; ++i) {
LOG_INF("| %-10.2f ", dev_info_set[i].disk.write_seq_bw);
}
LOG_INF("\n");
LOG_INF("| Disk Read Rnd Speed (GB/s) ");
for (int i = 0; i < n; ++i) {
LOG_INF("| %-10.2f ", dev_info_set[i].disk.read_rnd_bw);
}
LOG_INF("\n");
LOG_INF("| Disk Write Rnd Speed (GB/s) ");
for (int i = 0; i < n; ++i) {
LOG_INF("| %-10.2f ", dev_info_set[i].disk.write_rnd_bw);
} }
LOG_INF("\n"); LOG_INF("\n");
@ -1015,10 +1121,13 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q80); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q80);
LOG_INF("\n"); LOG_INF("\n");
// todo: calculate for each device, not only master
float latency = 0.0f; float latency = 0.0f;
int n_layers = llama_model_n_layers(model); int n_layers = llama_model_n_layers(model);
latency += device_compute_delay(dev_info_set[0], n_layers); latency += device_compute_delay(dev_info_set[0], n_layers);
latency += device_memory_access_delay(dev_info_set[0], n_layers); latency += device_memory_access_delay(dev_info_set[0], n_layers);
latency += device_disk_access_delay(dev_info_set[0], n_layers); // if physical memory is not enough, some tensor weights will be released from memory and reloaded by mmap later
latency += device_swap_access_delay(dev_info_set[0], n_layers); // if physical memory is not enough, activations will be stored in swap, which causes additional disk io with random access
LOG_INF("| Token latency (ms) "); LOG_INF("| Token latency (ms) ");
LOG_INF("| %-10.2f ", latency); LOG_INF("| %-10.2f ", latency);
@ -1043,7 +1152,7 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
+ cpu_description_len + cpu_description_len
+ gpu_name_len + gpu_name_len
+ gpu_description_len + gpu_description_len
+ sizeof(float) // disk_read_bandwidth + sizeof(struct disk_props)
+ sizeof(uint32_t) // cpu_props.cores + sizeof(uint32_t) // cpu_props.cores
+ sizeof(float) * 5 // cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q6k_f32, cpu_props.flops_q80_f32 + sizeof(float) * 5 // cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q6k_f32, cpu_props.flops_q80_f32
+ sizeof(struct memory_info) + sizeof(struct memory_info)
@ -1086,8 +1195,8 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
ptr += gpu_description_len; ptr += gpu_description_len;
// copy the non-string members // copy the non-string members
memcpy(ptr, &dev_info->disk_read_bandwidth, sizeof(float)); memcpy(ptr, &dev_info->disk, sizeof(struct disk_props));
ptr += sizeof(float); ptr += sizeof(struct disk_props);
memcpy(ptr, &dev_info->cpu_props.cores, sizeof(uint32_t)); memcpy(ptr, &dev_info->cpu_props.cores, sizeof(uint32_t));
ptr += sizeof(uint32_t); ptr += sizeof(uint32_t);
@ -1203,8 +1312,8 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
ptr += gpu_description_len; ptr += gpu_description_len;
// other non-string members // other non-string members
memcpy(&dev_info->disk_read_bandwidth, ptr, sizeof(float)); memcpy(&dev_info->disk, ptr, sizeof(struct disk_props));
ptr += sizeof(float); ptr += sizeof(struct disk_props);
memcpy(&dev_info->cpu_props.cores, ptr, sizeof(uint32_t)); memcpy(&dev_info->cpu_props.cores, ptr, sizeof(uint32_t));
ptr += sizeof(uint32_t); ptr += sizeof(uint32_t);

View file

@ -4,6 +4,10 @@
#include "ggml.h" #include "ggml.h"
#include "llama.h" #include "llama.h"
// Disk benchmark parameters.
// Replacement lists containing operators are parenthesized so expressions like
// `x % DISK_TEST_TOTAL_BYTE` expand correctly (the unparenthesized originals
// would bind `%` to the first factor only).
#define DISK_TEST_TOTAL_BYTE (500L * 1024 * 1024) // total bytes per test (500 MiB)
#define DISK_TEST_SEQ_BLOCK  (100L * 1024 * 1024) // sequential r/w block size (100 MiB)
#define DISK_TEST_RND_BLOCK  4096                 // random r/w block size (4 KiB)
struct cpu_props { struct cpu_props {
const char * name; const char * name;
const char * description; const char * description;
@ -26,10 +30,10 @@ struct cpu_props {
}; };
struct memory_info { struct memory_info {
float total_physical; // in GB float total_physical; // in GiB
float available_physical; // in GB float available_physical; // in GiB
float total_swap; // in GB float total_swap; // in GiB
float available_swap; // in GB float available_swap; // in GiB
float read_bandwidth; // in GB/s float read_bandwidth; // in GB/s
memory_info() : memory_info() :
@ -62,8 +66,8 @@ struct gpu_support {
struct gpu_props { struct gpu_props {
const char * name; const char * name;
const char * description; const char * description;
float memory_free; // in GB float memory_free; // in GiB
float memory_total; // in GB float memory_total; // in GiB
float read_bandwidth; // in GB/s float read_bandwidth; // in GB/s
float metal_flops_f32_f32; // in GFLOPS float metal_flops_f32_f32; // in GFLOPS
float metal_flops_f16_f32; // in GFLOPS float metal_flops_f16_f32; // in GFLOPS
@ -81,7 +85,7 @@ struct gpu_props {
description(""), description(""),
memory_free (0.0f), memory_free (0.0f),
memory_total (0.0f), memory_total (0.0f),
read_bandwidth (1.0f), read_bandwidth (0.0f),
metal_flops_f32_f32(0.0f), metal_flops_f32_f32(0.0f),
metal_flops_f16_f32(0.0f), metal_flops_f16_f32(0.0f),
metal_flops_q4k_f32(0.0f), metal_flops_q4k_f32(0.0f),
@ -156,10 +160,23 @@ struct model_params {
layer_q80 (0) {} layer_q80 (0) {}
}; };
// Disk bandwidth measurements collected via fio, all in GB/s.
// Zero means "not measured yet" (or fio unavailable).
struct disk_props {
    float read_seq_bw  = 0.0f; // sequential read
    float read_rnd_bw  = 0.0f; // random read (4k blocks)
    float write_seq_bw = 0.0f; // sequential write
    float write_rnd_bw = 0.0f; // random write (4k blocks)
};
struct device_info { struct device_info {
uint32_t rank; uint32_t rank;
const char * device_name; const char * device_name;
float disk_read_bandwidth; // in GB/s struct disk_props disk;
struct cpu_props cpu_props; struct cpu_props cpu_props;
struct memory_info memory; struct memory_info memory;
struct gpu_support gpu_support; struct gpu_support gpu_support;
@ -170,7 +187,7 @@ struct device_info {
device_info() : device_info() :
rank(0), rank(0),
device_name(""), device_name(""),
disk_read_bandwidth(0.0f), disk(),
cpu_props(), cpu_props(),
memory(), memory(),
gpu_support(), gpu_support(),
@ -193,18 +210,19 @@ enum profiler_layer_type {
const char * device_name(void); const char * device_name(void);
uint32_t device_cpu_cores (void); uint32_t device_cpu_cores (void);
float device_cpu_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, int n_threads); float device_cpu_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, int n_threads);
float device_metal_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t); float device_metal_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t);
float device_cuda_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t); float device_cuda_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t);
float device_inp_embd_delay (struct llama_model * model, enum ggml_type src0t, int n_tokens, int n_threads); float device_inp_embd_delay (struct llama_model * model, enum ggml_type src0t, int n_tokens, int n_threads);
uint64_t device_physical_memory(bool available); uint64_t device_physical_memory (bool available);
uint64_t device_swap_memory (bool available); uint64_t device_swap_memory (bool available);
uint64_t device_disk_read_bw (const char * test_file, size_t buffer_size_mb); void device_disk_seq_bw (float * read_seq_bw, float * write_seq_bw);
float device_memory_bw (int n_thread); void device_disk_rnd_bw (float * read_rnd_bw, float * write_rnd_bw);
float device_cuda_memory_bw (struct llama_model * model); float device_memory_bw (int n_thread);
void device_get_props (struct llama_model * model, int device, struct ggml_backend_dev_props * props); float device_cuda_memory_bw (struct llama_model * model);
void device_print_props (struct device_info * dev_info_set, int n, struct llama_model * model); void device_get_props (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
void device_print_props (struct device_info * dev_info_set, int n, struct llama_model * model);
int device_has_metal (void); int device_has_metal (void);
int device_has_cuda (void); int device_has_cuda (void);

View file

@ -3547,7 +3547,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
GGML_UNUSED(model); GGML_UNUSED(model);
} }
void llama_profile_device(device_info * dev_info, struct llama_model * model, llama_model_loader * ml, const char * test_file, int n_threads) { void llama_profile_device(device_info * dev_info, struct llama_model * model, llama_model_loader * ml, int n_threads) {
dev_info->device_name = device_name(); dev_info->device_name = device_name();
dev_info->cpu_props.cores = device_cpu_cores(); dev_info->cpu_props.cores = device_cpu_cores();
dev_info->cpu_props.flops_f32_f32 = device_cpu_flops(model, GGML_TYPE_F32, GGML_TYPE_F32, n_threads); dev_info->cpu_props.flops_f32_f32 = device_cpu_flops(model, GGML_TYPE_F32, GGML_TYPE_F32, n_threads);
@ -3562,7 +3562,8 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll
dev_info->memory.available_swap = round(device_swap_memory(true) / (double)(1 << 30) * 100) / 100; dev_info->memory.available_swap = round(device_swap_memory(true) / (double)(1 << 30) * 100) / 100;
dev_info->memory.read_bandwidth = device_memory_bw(n_threads); dev_info->memory.read_bandwidth = device_memory_bw(n_threads);
dev_info->disk_read_bandwidth = round(device_disk_read_bw(test_file, 500) / (double)(1 << 30) * 100) / 100; device_disk_seq_bw(&dev_info->disk.read_seq_bw, &dev_info->disk.write_seq_bw);
device_disk_rnd_bw(&dev_info->disk.read_rnd_bw, &dev_info->disk.write_rnd_bw);
dev_info->gpu_support.metal = device_has_metal(); dev_info->gpu_support.metal = device_has_metal();
dev_info->gpu_support.cuda = device_has_cuda(); dev_info->gpu_support.cuda = device_has_cuda();