From 7521e532c49047b79a57f1657fb7cdfd30323270 Mon Sep 17 00:00:00 2001 From: Zonghang Li Date: Wed, 4 Dec 2024 15:36:59 +0400 Subject: [PATCH] device_memory_bw: simulate cache-friendly block access and multi-threading --- common/profiler.cpp | 56 ++++++++++++++++++++++++++------------------- common/profiler.h | 1 + 2 files changed, 33 insertions(+), 24 deletions(-) diff --git a/common/profiler.cpp b/common/profiler.cpp index 63da8c62..51e0f300 100644 --- a/common/profiler.cpp +++ b/common/profiler.cpp @@ -360,8 +360,9 @@ static int is_uma_arch() { } return is_arm64; -#endif +#else return 0; +#endif } static uint64_t device_host_physical_memory(bool available) { @@ -841,43 +842,50 @@ void device_disk_seq_bw(float * read_seq_bw, float * write_seq_bw, int n_threads } float device_memory_bw(int n_thread) { - size_t buffer_size = 5L * 1024 * 1024; // 5 MiB - std::vector thread_pool; + // simulate large model weights, set to 100 MiB + size_t buffer_size = 100L * 1024 * 1024; + std::vector data(buffer_size); + std::fill(data.begin(), data.end(), 1); // initialize data to avoid lazy loading + std::vector results(n_thread); - std::vector buffers(n_thread); - for (int i = 0; i < n_thread; ++i) { - buffers[i] = new char[buffer_size]; - } - - auto memory_bw_test = [](char * buffer, size_t size, double & result) { - // read test - volatile char temp = 0; + // memory bandwidth test function + auto memory_bw_test = [](char * data, size_t total_size, size_t block_size, double & result) { + size_t n_iters = total_size / block_size; + volatile char temp = 0; // volatile to prevent compiler optimization auto start = std::chrono::high_resolution_clock::now(); - for (size_t i = 0; i < size; i += 64) { - temp += buffer[i]; + + for (size_t i = 0; i < n_iters; i++) { + // simulate block-wise sequential access + size_t offset = i * block_size; + for (size_t j = 0; j < block_size; j += 64) { + temp += data[offset + j]; + } } + auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration elapsed = end - start; - result = size / elapsed.count() / 1e9; // GB/s + result = total_size / elapsed.count() / 1e9; // GB/s + + (void)temp; }; + std::vector thread_pool; for (int i = 0; i < n_thread; ++i) { - thread_pool.emplace_back(memory_bw_test, buffers[i], buffer_size, std::ref(results[i])); + thread_pool.emplace_back( + memory_bw_test, + data.data(), + buffer_size / n_thread, + MEM_TEST_BLOCK_SIZE, + std::ref(results[i]) + ); } + for (auto & t : thread_pool) { t.join(); } - double bandwidth = 0.0f; - for (double result : results) { - bandwidth += result; - } - - for (int i = 0; i < n_thread; ++i) { - delete[] buffers[i]; - } - + double bandwidth = std::accumulate(results.begin(), results.end(), 0.0); return static_cast(bandwidth); } diff --git a/common/profiler.h b/common/profiler.h index 2e182380..a0a15b49 100644 --- a/common/profiler.h +++ b/common/profiler.h @@ -7,6 +7,7 @@ #define DISK_TEST_TOTAL_BYTE 500L * 1024 * 1024 #define DISK_TEST_SEQ_BLOCK 100L * 1024 * 1024 #define DISK_TEST_RND_BLOCK 4096 +#define MEM_TEST_BLOCK_SIZE 64 * 1024 struct cpu_props {