diff --git a/common/common.cpp b/common/common.cpp
index 851d7ebf..ff565163 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1482,7 +1482,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     device_info dev_info;
     uint32_t n_world = params.n_world;
     uint32_t my_rank = params.rank;
-    bool auto_schedule = n_world > 1 && params.n_layer_window[0] == 0;
+    bool auto_schedule = params.n_layer_window[0] == 0;
 
     if (auto_schedule) {
         // get device profile
@@ -1495,33 +1495,52 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
     llama_context * lctx = llama_new_context_with_model(model, cparams);
 
-    // initialize sockets
-    llama_init_sockets(lctx, n_world, my_rank);
-
-    if (auto_schedule) {
-        // sychronize device profile to the master node
-        struct device_info * dev_info_set = nullptr;
-        if (my_rank == 0) {
-            dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info));
-            dev_info_set[0] = dev_info;
-            llama_gather_device_info(lctx, dev_info_set);
-            device_print_props(dev_info_set, n_world, model, cparams);
-        } else {
-            llama_send_device_info(lctx, &dev_info);
-        }
+    if (n_world == 1) {
+        uint32_t n_layers = llama_model_n_layers(model);
+        params.n_layer_window[0] = n_layers;
+        cparams.n_layer_window[0] = n_layers;
+        mparams.n_layer_window[0] = n_layers;
+        llama_context_n_layer_window(lctx)[0] = n_layers;
+    } else {
+        // initialize sockets
+        llama_init_sockets(lctx, n_world, my_rank);
 
         uint32_t n_layer_window[32] = {0}, n_gpu_layers[32] = {0};
-        if (my_rank == 0) {
-            // automatically determine n_layer_window and n_gpu_layers
-            if (!assign_layers_to_device(n_world, my_rank, dev_info_set, n_layer_window, n_gpu_layers, model, cparams)) {
-                LOG_ERR("%s: Invalid allocation by HiGHS solver\n", __func__);
-                llama_free(lctx);
-                llama_free_model(model);
-                return iparams;
+
+        if (auto_schedule) {
+            // synchronize device profile to the master node
+            struct device_info * dev_info_set = nullptr;
+            if (my_rank == 0) {
+                dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info));
+                dev_info_set[0] = dev_info;
+                llama_gather_device_info(lctx, dev_info_set);
+                device_print_props(dev_info_set, n_world, model, cparams);
+            } else {
+                llama_send_device_info(lctx, &dev_info);
+            }
+
+            if (my_rank == 0) {
+                // automatically determine n_layer_window and n_gpu_layers
+                if (!assign_layers_to_device(n_world, my_rank, dev_info_set, n_layer_window, n_gpu_layers, model, cparams)) {
+                    LOG_ERR("%s: Invalid allocation by HiGHS solver\n", __func__);
+                    llama_free(lctx);
+                    llama_free_model(model);
+                    return iparams;
+                }
+            }
+        } else {
+            if (my_rank == 0) {
+                // use the user-defined n_layer_window
+                std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), n_layer_window);
+            }
+        }
+
+        if (my_rank == 0) {
+            if (auto_schedule) {
+                llama_bcast_layer_setup(lctx, n_layer_window, n_gpu_layers);
+            } else {
+                llama_bcast_layer_setup(lctx, n_layer_window, nullptr);
             }
-
-            // synchronize the new n_layer_window and n_gpu_layers to other nodes
-            llama_bcast_layer_setup(lctx, n_layer_window, n_gpu_layers);
         } else {
             llama_recv_layer_setup(lctx, n_layer_window, n_gpu_layers);
         }
@@ -1532,16 +1551,12 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         std::copy(std::begin(n_layer_window), std::end(n_layer_window), mparams.n_layer_window);
         std::copy(std::begin(n_layer_window), std::end(n_layer_window), llama_context_n_layer_window(lctx));
 
-        params.n_gpu_layers = n_gpu_layers[my_rank];
-        cparams.n_gpu_layers = n_gpu_layers[my_rank];
-        mparams.n_gpu_layers = n_gpu_layers[my_rank];
-        llama_model_set_n_gpu_layers(model, n_gpu_layers[my_rank]);
-    } else if (n_world == 1) {
-        uint32_t n_layers = llama_model_n_layers(model);
-        params.n_layer_window[0] = n_layers;
-        cparams.n_layer_window[0] = n_layers;
-        mparams.n_layer_window[0] = n_layers;
-        llama_context_n_layer_window(lctx)[0] = n_layers;
+        if (params.n_gpu_layers > 0) {
+            params.n_gpu_layers = n_gpu_layers[my_rank];
+            cparams.n_gpu_layers = n_gpu_layers[my_rank];
+            mparams.n_gpu_layers = n_gpu_layers[my_rank];
+            llama_model_set_n_gpu_layers(model, n_gpu_layers[my_rank]);
+        }
     }
 
     LOG_INF("\nUsing window size: %d, GPU layers: %d\n\n", cparams.n_layer_window[my_rank], cparams.n_gpu_layers);
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index 9f890578..635bb346 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -3259,7 +3259,7 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
 #ifndef GGML_METAL_NDEBUG
 #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
     if (@available(macOS 10.12, iOS 16.0, *)) {
-        GGML_LOG_DEBUG("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n",
+        GGML_LOG_WARN("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n",
                 __func__,
                 size_aligned / 1024.0 / 1024.0,
                 device.currentAllocatedSize / 1024.0 / 1024.0,
diff --git a/src/llama.cpp b/src/llama.cpp
index 0c3715a0..6d5eff70 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -104,10 +104,10 @@
 #define timer(name) auto _timer_##name = Timer(#name)
 
 struct Timer {
-    const char* name;
+    const char * name;
     int64_t start_time;
-    bool enable_timer = false;
-    Timer(const char* name) : name(name), start_time(ggml_time_us()) {}
+    bool enable_timer = true;
+    Timer(const char * name) : name(name), start_time(ggml_time_us()) {}
     ~Timer() {
         if (enable_timer) {
             int64_t end_time = ggml_time_us();
@@ -20093,8 +20093,10 @@ int llama_bcast_layer_setup(struct llama_context * ctx, uint32_t * n_layer_windo
         send_msgs.emplace_back("n_layer_window", strlen("n_layer_window"));
         send_msgs.emplace_back(n_layer_window, sizeof(uint32_t) * 32);
 
-        send_msgs.emplace_back("n_gpu_layers", strlen("n_gpu_layers"));
-        send_msgs.emplace_back(n_gpu_layers, sizeof(uint32_t) * 32);
+        if (n_gpu_layers != nullptr) {
+            send_msgs.emplace_back("n_gpu_layers", strlen("n_gpu_layers"));
+            send_msgs.emplace_back(n_gpu_layers, sizeof(uint32_t) * 32);
+        }
 
         zmq::send_multipart(*ctx->send_socket, send_msgs);
     } catch (const zmq::error_t& e) {
@@ -20114,20 +20116,15 @@ int llama_recv_layer_setup(struct llama_context * ctx, uint32_t * n_layer_window
         return -1;
     }
 
-    if (recv_msgs.size() != 4) { // expecting n_layer_windows and n_gpu_layers
-        LLAMA_LOG_INFO("Unexpected number of messages received: %zu\n", recv_msgs.size());
-        return -1;
-    }
-
-    if (recv_msgs[0].to_string() != "n_layer_window" || recv_msgs[2].to_string() != "n_gpu_layers") {
-        LLAMA_LOG_INFO("Unexpected message received\n");
-        return -1;
-    }
-
+    GGML_ASSERT(recv_msgs[0].to_string() == "n_layer_window");
     GGML_ASSERT(recv_msgs[1].size() == sizeof(uint32_t) * 32);
-    GGML_ASSERT(recv_msgs[3].size() == sizeof(uint32_t) * 32);
     memcpy(n_layer_window, recv_msgs[1].data(), sizeof(uint32_t) * 32);
-    memcpy(n_gpu_layers, recv_msgs[3].data(), sizeof(uint32_t) * 32);
+
+    if (recv_msgs.size() > 2) {
+        GGML_ASSERT(recv_msgs[2].to_string() == "n_gpu_layers");
+        GGML_ASSERT(recv_msgs[3].size() == sizeof(uint32_t) * 32);
+        memcpy(n_gpu_layers, recv_msgs[3].data(), sizeof(uint32_t) * 32);
+    }
 
     if (my_rank != n_world - 1) {
         try {