mirror of
https://github.com/Lizonghang/prima.cpp.git
synced 2025-09-07 21:20:42 +00:00
device_memory_bw: simulate cache-friendly block access and multi-threading
This commit is contained in:
parent
44b4718c8b
commit
7521e532c4
2 changed files with 33 additions and 24 deletions
|
@ -360,8 +360,9 @@ static int is_uma_arch() {
|
||||||
}
|
}
|
||||||
|
|
||||||
return is_arm64;
|
return is_arm64;
|
||||||
#endif
|
#else
|
||||||
return 0;
|
return 0;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static uint64_t device_host_physical_memory(bool available) {
|
static uint64_t device_host_physical_memory(bool available) {
|
||||||
|
@ -841,43 +842,50 @@ void device_disk_seq_bw(float * read_seq_bw, float * write_seq_bw, int n_threads
|
||||||
}
|
}
|
||||||
|
|
||||||
float device_memory_bw(int n_thread) {
|
float device_memory_bw(int n_thread) {
|
||||||
size_t buffer_size = 5L * 1024 * 1024; // 5 MiB
|
// simulate large model weights, set to 100 MiB
|
||||||
std::vector<std::thread> thread_pool;
|
size_t buffer_size = 100L * 1024 * 1024;
|
||||||
|
std::vector<char> data(buffer_size);
|
||||||
|
std::fill(data.begin(), data.end(), 1); // initialize data to avoid lazy loading
|
||||||
|
|
||||||
std::vector<double> results(n_thread);
|
std::vector<double> results(n_thread);
|
||||||
std::vector<char *> buffers(n_thread);
|
|
||||||
|
|
||||||
for (int i = 0; i < n_thread; ++i) {
|
// memory bandwidth test function
|
||||||
buffers[i] = new char[buffer_size];
|
auto memory_bw_test = [](char * data, size_t total_size, size_t block_size, double & result) {
|
||||||
}
|
size_t n_iters = total_size / block_size;
|
||||||
|
volatile char temp = 0; // volatile to prevent compiler optimization
|
||||||
auto memory_bw_test = [](char * buffer, size_t size, double & result) {
|
|
||||||
// read test
|
|
||||||
volatile char temp = 0;
|
|
||||||
auto start = std::chrono::high_resolution_clock::now();
|
auto start = std::chrono::high_resolution_clock::now();
|
||||||
for (size_t i = 0; i < size; i += 64) {
|
|
||||||
temp += buffer[i];
|
for (size_t i = 0; i < n_iters; i++) {
|
||||||
|
// simulate block-wise sequential access
|
||||||
|
size_t offset = i * block_size;
|
||||||
|
for (size_t j = 0; j < block_size; j += 64) {
|
||||||
|
temp += data[offset + j];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
auto end = std::chrono::high_resolution_clock::now();
|
auto end = std::chrono::high_resolution_clock::now();
|
||||||
std::chrono::duration<double> elapsed = end - start;
|
std::chrono::duration<double> elapsed = end - start;
|
||||||
result = size / elapsed.count() / 1e9; // GB/s
|
result = total_size / elapsed.count() / 1e9; // GB/s
|
||||||
|
|
||||||
|
(void)temp;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
std::vector<std::thread> thread_pool;
|
||||||
for (int i = 0; i < n_thread; ++i) {
|
for (int i = 0; i < n_thread; ++i) {
|
||||||
thread_pool.emplace_back(memory_bw_test, buffers[i], buffer_size, std::ref(results[i]));
|
thread_pool.emplace_back(
|
||||||
|
memory_bw_test,
|
||||||
|
data.data(),
|
||||||
|
buffer_size / n_thread,
|
||||||
|
MEM_TEST_BLOCK_SIZE,
|
||||||
|
std::ref(results[i])
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (auto & t : thread_pool) {
|
for (auto & t : thread_pool) {
|
||||||
t.join();
|
t.join();
|
||||||
}
|
}
|
||||||
|
|
||||||
double bandwidth = 0.0f;
|
double bandwidth = std::accumulate(results.begin(), results.end(), 0.0);
|
||||||
for (double result : results) {
|
|
||||||
bandwidth += result;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < n_thread; ++i) {
|
|
||||||
delete[] buffers[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
return static_cast<float>(bandwidth);
|
return static_cast<float>(bandwidth);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -7,6 +7,7 @@
|
||||||
// Benchmark sizes. Each expansion is parenthesized so the macro behaves as a
// single value inside larger expressions (e.g. `x / MEM_TEST_BLOCK_SIZE`).
#define DISK_TEST_TOTAL_BYTE (500L * 1024 * 1024) // 500 MiB; presumably the total bytes moved by the disk test - confirm against callers
#define DISK_TEST_SEQ_BLOCK  (100L * 1024 * 1024) // 100 MiB; presumably the block size for sequential disk access - confirm
#define DISK_TEST_RND_BLOCK  4096                 // 4096 bytes; presumably the block size for random disk access - confirm
#define MEM_TEST_BLOCK_SIZE  (64 * 1024)          // 64 KiB; block size passed to the memory bandwidth test in device_memory_bw
|
||||||
|
|
||||||
|
|
||||||
struct cpu_props {
|
struct cpu_props {
|
||||||
|
|
Loading…
Add table
Reference in a new issue