diff --git a/common/common.cpp b/common/common.cpp
index 80d0c16d..ee33f351 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -914,7 +914,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info));
         dev_info_set[0] = dev_info;
         llama_gather_device_info(lctx, dev_info_set);
-        device_print_props(dev_info_set, n_world, model);
+        device_print_props(dev_info_set, n_world, model, cparams);
     } else {
         llama_send_device_info(lctx, &dev_info);
     }
diff --git a/common/profiler.cpp b/common/profiler.cpp
index 59dd6624..642f793d 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -876,8 +876,10 @@ static float device_memory_access_delay(struct device_info & dev_info, int n_lay
 #endif
 }
 
-static float device_disk_access_delay(struct device_info & dev_info, int n_layers) {
-    struct model_params n_params = dev_info.model_params;
+static float device_disk_access_delay(struct device_info & dev_info, struct llama_model * model, const struct llama_context_params cparams) {
+    auto n_params = dev_info.model_params;
+    int n_layers = llama_model_n_layers(model);
+    double kv_size_gb = static_cast<double>(llama_model_kvcache_size(model, cparams)) / 1e9; // convert to GB
 
     int64_t total_bytes = 0;
     total_bytes += n_params.layer_f32 * 4 +
@@ -895,13 +897,18 @@ static float device_disk_access_delay(struct device_info & dev_info, int n_layer
                    n_params.output_q80;
 
     float total_gbytes = (double)total_bytes / 1e9; // convert to GB
-    float mem_avail = dev_info.memory.available_physical * 1024.0f * 1024.0f * 1024.0f / 1e9; // convert to GB
+    float mem_avail   = dev_info.memory.available_physical * 1024.0f * 1024.0f * 1024.0f / 1e9; // convert to GB
+    mem_avail        -= static_cast<float>(kv_size_gb); // todo: consider activations, which also consume available memory
 
+#ifdef __linux__
+    float disk_read_bw = dev_info.disk.read_seq_bw; // GB/s
+#else
     float disk_read_bw = dev_info.disk.read_rnd_bw; // GB/s
+#endif
     return std::max(0.0, static_cast<double>(total_gbytes - mem_avail) / disk_read_bw * 1000); // convert to ms
 }
 
-void device_print_props(struct device_info * dev_info_set, int n, struct llama_model * model) {
+void device_print_props(struct device_info * dev_info_set, int n, struct llama_model * model, const struct llama_context_params cparams) {
     LOG_INF("\n-------------------------------------------------------------------------------------------\n");
     LOG_INF("| Property              ");
     for (int i = 0; i < n; ++i) {
@@ -1255,7 +1262,7 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     int n_layers = llama_model_n_layers(model);
     latency += device_compute_delay(dev_info_set[0], n_layers);
     latency += device_memory_access_delay(dev_info_set[0], n_layers);
-    latency += device_disk_access_delay(dev_info_set[0], n_layers); // if physical memory is not enough, some tensor weights will be released from memory and reloaded by mmap later
+    latency += device_disk_access_delay(dev_info_set[0], model, cparams); // if physical memory is not enough, some tensor weights will be released from memory and reloaded by mmap later
 
     LOG_INF("| Token latency (ms)      ");
     LOG_INF("| %-10.2f   ", latency);
diff --git a/common/profiler.h b/common/profiler.h
index 43a5fc81..286fccd1 100644
--- a/common/profiler.h
+++ b/common/profiler.h
@@ -8,6 +8,7 @@
 #define DISK_TEST_SEQ_BLOCK 100L * 1024 * 1024
 #define DISK_TEST_RND_BLOCK 4096
 
+
 struct cpu_props {
     const char * name;
     const char * description;
@@ -222,7 +223,7 @@ void  device_disk_rnd_bw      (float * read_rnd_bw, float * write_rnd_bw, int
 float device_memory_bw        (int n_thread);
 float device_cuda_memory_bw   (struct llama_model * model);
 void  device_get_props        (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
-void  device_print_props      (struct device_info * dev_info_set, int n, struct llama_model * model);
+void  device_print_props      (struct device_info * dev_info_set, int n, struct llama_model * model, const struct llama_context_params cparams);
 
 int   device_has_metal        (void);
 int   device_has_cuda         (void);
diff --git a/include/llama.h b/include/llama.h
index 7ae1d702..27c322f3 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -523,6 +523,9 @@ extern "C" {
     // Returns the total number of parameters in the model
     LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
 
+    // Return the size of the KV cache in the model
+    LLAMA_API uint64_t llama_model_kvcache_size(const struct llama_model * model, const struct llama_context_params cparams);
+
     // Return the total number of float operations in the model
     LLAMA_API void llama_model_n_flops(
             struct llama_model * model,
diff --git a/src/llama.cpp b/src/llama.cpp
index f4bb64d7..851d1d69 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -20808,6 +20808,13 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
     }
 }
 
+uint64_t llama_model_kvcache_size(const struct llama_model * model, const struct llama_context_params cparams) {
+    const llama_hparams & hparams = model->hparams;
+    uint64_t ne_k = static_cast<uint64_t>(hparams.n_embd_k_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_k);
+    uint64_t ne_v = static_cast<uint64_t>(hparams.n_embd_v_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_v);
+    return (ne_k + ne_v) * llama_model_n_layers(model);
+}
+
 void llama_model_n_flops(
         struct llama_model * model,
         struct llama_model_loader * ml,
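
Note for reviewers: below is a minimal standalone sketch of the estimate these changes compute (KV cache reserved out of available memory, and the spilled weight bytes charged against disk bandwidth). All numeric values and the names weights_gb, mem_avail_gb, and disk_bw_gbps are illustrative stand-ins, not taken from the patch.

// Illustrative sketch of the KV-cache-aware disk access delay estimate.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical stand-ins for the hparams/cparams values queried in the patch.
    const uint64_t n_embd_k_gqa = 1024;   // per-layer K width
    const uint64_t n_embd_v_gqa = 1024;   // per-layer V width
    const uint64_t n_ctx        = 4096;   // context length
    const uint64_t type_size    = 2;      // bytes per element, e.g. f16 K/V
    const uint64_t n_layers     = 32;

    // Mirrors llama_model_kvcache_size: (K + V) elements per token, times context, times layers.
    const uint64_t kv_bytes = (n_embd_k_gqa + n_embd_v_gqa) * n_ctx * type_size * n_layers;
    const double   kv_gb    = kv_bytes / 1e9;

    // Mirrors device_disk_access_delay: weights that no longer fit in RAM after
    // reserving the KV cache are assumed to be re-read from disk.
    const double weights_gb   = 8.0;          // total weight size in GB (illustrative)
    const double mem_avail_gb = 6.0 - kv_gb;  // available RAM minus KV cache (illustrative)
    const double disk_bw_gbps = 2.0;          // measured read bandwidth in GB/s (illustrative)

    const double delay_ms = std::max(0.0, (weights_gb - mem_avail_gb) / disk_bw_gbps * 1000.0);
    printf("kv cache: %.2f GB, extra disk delay: %.2f ms per token\n", kv_gb, delay_ms);
    return 0;
}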