From e6f4c009abc8e99370b7871cd867b975c556bf4b Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Thu, 5 Dec 2024 20:38:28 +0400
Subject: [PATCH] device_disk_access_delay: add delay for loading one row from lookup table

---
 common/profiler.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/common/profiler.cpp b/common/profiler.cpp
index 32aba4ad..aa5f388b 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -1161,13 +1161,15 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
     int n_gpu_layers = std::min(static_cast<int>(cparams.n_gpu_layers), n_layers);
     int n_vocab = llama_n_vocab(model);
 
-    int64_t cpu_total_bytes = (
+    int64_t input_bytes = (
         n_params.input_f32 * 4 +
         n_params.input_f16 * 2 +
         n_params.input_q4k / 2 +
         n_params.input_q6k * 3 / 8 +
         n_params.input_q80) / n_vocab; // lookup table, retrieve only n_embd elements
 
+    int64_t cpu_total_bytes = input_bytes;
+
     int64_t layer_bytes =
         n_params.layer_f32 * 4 +
         n_params.layer_f16 * 2 +
@@ -1214,7 +1216,9 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
     double total_kv_size_gib = cpu_kv_size_gib + gpu_kv_size_gib;
     double total_compute_buf_gib = cpu_compute_buf_gib + gpu_compute_buf_gib;
     if (total_bytes_gib + total_kv_size_gib + total_compute_buf_gib < dev_info.memory.total_physical) {
-        return 0.0f;
+        float disk_read_bw = dev_info.disk.read_rnd_bw;
+        // each time one new row of lookup table will be loaded
+        return static_cast<double>(input_bytes) / 1e9 / disk_read_bw * 1000; // convert to ms
     } else {
         LOG_INF("swap occurs, not allowed\n");
         // double need_swap_gib = total_bytes_gib + total_kv_size_gib + total_compute_buf_gib - dev_info.memory.total_physical;