diff --git a/common/common.cpp b/common/common.cpp index ee8003fd..55221fe8 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1434,6 +1434,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info)); dev_info_set[0] = dev_info; llama_gather_device_info(lctx, dev_info_set); + device_print_props(dev_info_set, n_world, model, cparams); } else { llama_send_device_info(lctx, &dev_info); } @@ -1458,10 +1459,6 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { cparams.n_gpu_layers = n_gpu_layers[my_rank]; mparams.n_gpu_layers = n_gpu_layers[my_rank]; llama_model_set_n_gpu_layers(model, n_gpu_layers[my_rank]); - -#ifdef LLAMA_DEBUG - device_print_props(dev_info_set, n_world, model, cparams); -#endif } LOG_INF("\nUsing window size: %d, GPU layers: %d\n\n", cparams.n_layer_window[my_rank], cparams.n_gpu_layers); diff --git a/common/profiler.cpp b/common/profiler.cpp index 8f0d21ee..a0763dc9 100644 --- a/common/profiler.cpp +++ b/common/profiler.cpp @@ -1966,17 +1966,16 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_bytes.nb_output); LOG_INF("\n"); - // todo: calculate for each device, not only master - float latency = 0.0f; - int n_layers = llama_model_n_layers (model); - latency += device_compute_delay (dev_info_set[0], n_layers, cparams); - latency += device_memory_access_delay(dev_info_set[0], model, cparams, n_layers); - latency += device_disk_access_delay (dev_info_set[0], model, cparams); // if physical memory is not enough, some mapped data will be released and reloaded later - latency += device_mem_copy_delay (dev_info_set[0], model, cparams); // memory copy delay in kvcache + // float latency = 0.0f; + // int n_layers = llama_model_n_layers (model); + // latency += device_compute_delay (dev_info_set[0], n_layers, cparams); + // latency += device_memory_access_delay(dev_info_set[0], model, cparams, n_layers); + // latency += device_disk_access_delay (dev_info_set[0], model, cparams); // if physical memory is not enough, some mapped data will be released and reloaded later + // latency += device_mem_copy_delay (dev_info_set[0], model, cparams); // memory copy delay in kvcache - LOG_INF("| Token latency (ms) "); - LOG_INF("| %-10.2f ", latency); - LOG_INF("\n"); + // LOG_INF("| Token latency (ms) "); + // LOG_INF("| %-10.2f ", latency); + // LOG_INF("\n"); LOG_INF("-------------------------------------------------------------------------------------------\n\n"); }