device_memory_bw: simulate cache-friendly block access and multi-threading

2025-09-06 23:39:03 +00:00 · 2024-12-04 15:36:59 +04:00 · 2024-12-04 15:36:59 +04:00 · 7521e532c4
commit 7521e532c4
parent 44b4718c8b
2 changed files with 33 additions and 24 deletions
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -360,8 +360,9 @@ static int is_uma_arch() {
    }

    return is_arm64;
-#endif
+#else
    return 0;
+#endif
 }

 static uint64_t device_host_physical_memory(bool available) {
@@ -841,43 +842,50 @@ void device_disk_seq_bw(float * read_seq_bw, float * write_seq_bw, int n_threads
 }

 float device_memory_bw(int n_thread) {
-    size_t buffer_size = 5L * 1024 * 1024; // 5 MiB
-    std::vector<std::thread> thread_pool;
+    // simulate large model weights, set to 100 MiB
+    size_t buffer_size = 100L * 1024 * 1024;
+    std::vector<char> data(buffer_size);
+    std::fill(data.begin(), data.end(), 1); // initialize data to avoid lazy loading
+
    std::vector<double> results(n_thread);
-    std::vector<char *> buffers(n_thread);

-    for (int i = 0; i < n_thread; ++i) {
-        buffers[i] = new char[buffer_size];
-    }
-
-    auto memory_bw_test = [](char * buffer, size_t size, double & result) {
-        // read test
-        volatile char temp = 0;
+    // memory bandwidth test function
+    auto memory_bw_test = [](char * data, size_t total_size, size_t block_size, double & result) {
+        size_t n_iters = total_size / block_size; 
+        volatile char temp = 0; // volatile to prevent compiler optimization
        auto start = std::chrono::high_resolution_clock::now();
-        for (size_t i = 0; i < size; i += 64) {
-            temp += buffer[i];
+        
+        for (size_t i = 0; i < n_iters; i++) {
+            // simulate block-wise sequential access
+            size_t offset = i * block_size;
+            for (size_t j = 0; j < block_size; j += 64) {
+                temp += data[offset + j];
+            }
        }
+
        auto end = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> elapsed = end - start;
-        result = size / elapsed.count() / 1e9; // GB/s
+        result = total_size / elapsed.count() / 1e9; // GB/s
+
+        (void)temp;
    };

+    std::vector<std::thread> thread_pool;
    for (int i = 0; i < n_thread; ++i) {
-        thread_pool.emplace_back(memory_bw_test, buffers[i], buffer_size, std::ref(results[i]));
+        thread_pool.emplace_back(
+            memory_bw_test, 
+            data.data(), 
+            buffer_size / n_thread, 
+            MEM_TEST_BLOCK_SIZE, 
+            std::ref(results[i])
+        );
    }
+
    for (auto & t : thread_pool) {
        t.join();
    }

-    double bandwidth = 0.0f;
-    for (double result : results) {
-        bandwidth += result;
-    }
-
-    for (int i = 0; i < n_thread; ++i) {
-        delete[] buffers[i];
-    }
-
+    double bandwidth = std::accumulate(results.begin(), results.end(), 0.0);
    return static_cast<float>(bandwidth);
 }

--- a/common/profiler.h
+++ b/common/profiler.h
@@ -7,6 +7,7 @@
 #define DISK_TEST_TOTAL_BYTE 500L * 1024 * 1024
 #define DISK_TEST_SEQ_BLOCK  100L * 1024 * 1024
 #define DISK_TEST_RND_BLOCK  4096
+#define MEM_TEST_BLOCK_SIZE  64 * 1024


 struct cpu_props {