This commit is contained in:
Lizonghang 2025-01-28 16:36:47 +04:00
parent 36f353e374
commit 631daadd92
3 changed files with 65 additions and 53 deletions

View file

@ -1482,7 +1482,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
device_info dev_info; device_info dev_info;
uint32_t n_world = params.n_world; uint32_t n_world = params.n_world;
uint32_t my_rank = params.rank; uint32_t my_rank = params.rank;
bool auto_schedule = n_world > 1 && params.n_layer_window[0] == 0; bool auto_schedule = params.n_layer_window[0] == 0;
if (auto_schedule) { if (auto_schedule) {
// get device profile // get device profile
@ -1495,33 +1495,52 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
struct llama_context_params cparams = llama_context_params_from_gpt_params(params); struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
llama_context * lctx = llama_new_context_with_model(model, cparams); llama_context * lctx = llama_new_context_with_model(model, cparams);
// initialize sockets if (n_world == 1) {
llama_init_sockets(lctx, n_world, my_rank); uint32_t n_layers = llama_model_n_layers(model);
params.n_layer_window[0] = n_layers;
if (auto_schedule) { cparams.n_layer_window[0] = n_layers;
// synchronize device profile to the master node mparams.n_layer_window[0] = n_layers;
struct device_info * dev_info_set = nullptr; llama_context_n_layer_window(lctx)[0] = n_layers;
if (my_rank == 0) { } else {
dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info)); // initialize sockets
dev_info_set[0] = dev_info; llama_init_sockets(lctx, n_world, my_rank);
llama_gather_device_info(lctx, dev_info_set);
device_print_props(dev_info_set, n_world, model, cparams);
} else {
llama_send_device_info(lctx, &dev_info);
}
uint32_t n_layer_window[32] = {0}, n_gpu_layers[32] = {0}; uint32_t n_layer_window[32] = {0}, n_gpu_layers[32] = {0};
if (my_rank == 0) {
// automatically determine n_layer_window and n_gpu_layers if (auto_schedule) {
if (!assign_layers_to_device(n_world, my_rank, dev_info_set, n_layer_window, n_gpu_layers, model, cparams)) { // synchronize device profile to the master node
LOG_ERR("%s: Invalid allocation by HiGHS solver\n", __func__); struct device_info * dev_info_set = nullptr;
llama_free(lctx); if (my_rank == 0) {
llama_free_model(model); dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info));
return iparams; dev_info_set[0] = dev_info;
llama_gather_device_info(lctx, dev_info_set);
device_print_props(dev_info_set, n_world, model, cparams);
} else {
llama_send_device_info(lctx, &dev_info);
}
if (my_rank == 0) {
// automatically determine n_layer_window and n_gpu_layers
if (!assign_layers_to_device(n_world, my_rank, dev_info_set, n_layer_window, n_gpu_layers, model, cparams)) {
LOG_ERR("%s: Invalid allocation by HiGHS solver\n", __func__);
llama_free(lctx);
llama_free_model(model);
return iparams;
}
}
} else {
if (my_rank == 0) {
// use the user-defined n_layer_window
std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), n_layer_window);
}
}
if (my_rank == 0) {
if (auto_schedule) {
llama_bcast_layer_setup(lctx, n_layer_window, n_gpu_layers);
} else {
llama_bcast_layer_setup(lctx, n_layer_window, nullptr);
} }
// synchronize the new n_layer_window and n_gpu_layers to other nodes
llama_bcast_layer_setup(lctx, n_layer_window, n_gpu_layers);
} else { } else {
llama_recv_layer_setup(lctx, n_layer_window, n_gpu_layers); llama_recv_layer_setup(lctx, n_layer_window, n_gpu_layers);
} }
@ -1532,16 +1551,12 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
std::copy(std::begin(n_layer_window), std::end(n_layer_window), mparams.n_layer_window); std::copy(std::begin(n_layer_window), std::end(n_layer_window), mparams.n_layer_window);
std::copy(std::begin(n_layer_window), std::end(n_layer_window), llama_context_n_layer_window(lctx)); std::copy(std::begin(n_layer_window), std::end(n_layer_window), llama_context_n_layer_window(lctx));
params.n_gpu_layers = n_gpu_layers[my_rank]; if (params.n_gpu_layers > 0) {
cparams.n_gpu_layers = n_gpu_layers[my_rank]; params.n_gpu_layers = n_gpu_layers[my_rank];
mparams.n_gpu_layers = n_gpu_layers[my_rank]; cparams.n_gpu_layers = n_gpu_layers[my_rank];
llama_model_set_n_gpu_layers(model, n_gpu_layers[my_rank]); mparams.n_gpu_layers = n_gpu_layers[my_rank];
} else if (n_world == 1) { llama_model_set_n_gpu_layers(model, n_gpu_layers[my_rank]);
uint32_t n_layers = llama_model_n_layers(model); }
params.n_layer_window[0] = n_layers;
cparams.n_layer_window[0] = n_layers;
mparams.n_layer_window[0] = n_layers;
llama_context_n_layer_window(lctx)[0] = n_layers;
} }
LOG_INF("\nUsing window size: %d, GPU layers: %d\n\n", cparams.n_layer_window[my_rank], cparams.n_gpu_layers); LOG_INF("\nUsing window size: %d, GPU layers: %d\n\n", cparams.n_layer_window[my_rank], cparams.n_gpu_layers);

View file

@ -3259,7 +3259,7 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
#ifndef GGML_METAL_NDEBUG #ifndef GGML_METAL_NDEBUG
#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
if (@available(macOS 10.12, iOS 16.0, *)) { if (@available(macOS 10.12, iOS 16.0, *)) {
GGML_LOG_DEBUG("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n", GGML_LOG_WARN("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n",
__func__, __func__,
size_aligned / 1024.0 / 1024.0, size_aligned / 1024.0 / 1024.0,
device.currentAllocatedSize / 1024.0 / 1024.0, device.currentAllocatedSize / 1024.0 / 1024.0,

View file

@ -104,10 +104,10 @@
#define timer(name) auto _timer_##name = Timer(#name) #define timer(name) auto _timer_##name = Timer(#name)
struct Timer { struct Timer {
const char* name; const char * name;
int64_t start_time; int64_t start_time;
bool enable_timer = false; bool enable_timer = true;
Timer(const char* name) : name(name), start_time(ggml_time_us()) {} Timer(const char * name) : name(name), start_time(ggml_time_us()) {}
~Timer() { ~Timer() {
if (enable_timer) { if (enable_timer) {
int64_t end_time = ggml_time_us(); int64_t end_time = ggml_time_us();
@ -20093,8 +20093,10 @@ int llama_bcast_layer_setup(struct llama_context * ctx, uint32_t * n_layer_windo
send_msgs.emplace_back("n_layer_window", strlen("n_layer_window")); send_msgs.emplace_back("n_layer_window", strlen("n_layer_window"));
send_msgs.emplace_back(n_layer_window, sizeof(uint32_t) * 32); send_msgs.emplace_back(n_layer_window, sizeof(uint32_t) * 32);
send_msgs.emplace_back("n_gpu_layers", strlen("n_gpu_layers")); if (n_gpu_layers != nullptr) {
send_msgs.emplace_back(n_gpu_layers, sizeof(uint32_t) * 32); send_msgs.emplace_back("n_gpu_layers", strlen("n_gpu_layers"));
send_msgs.emplace_back(n_gpu_layers, sizeof(uint32_t) * 32);
}
zmq::send_multipart(*ctx->send_socket, send_msgs); zmq::send_multipart(*ctx->send_socket, send_msgs);
} catch (const zmq::error_t& e) { } catch (const zmq::error_t& e) {
@ -20114,20 +20116,15 @@ int llama_recv_layer_setup(struct llama_context * ctx, uint32_t * n_layer_window
return -1; return -1;
} }
if (recv_msgs.size() != 4) { // expecting n_layer_windows and n_gpu_layers GGML_ASSERT(recv_msgs[0].to_string() == "n_layer_window");
LLAMA_LOG_INFO("Unexpected number of messages received: %zu\n", recv_msgs.size());
return -1;
}
if (recv_msgs[0].to_string() != "n_layer_window" || recv_msgs[2].to_string() != "n_gpu_layers") {
LLAMA_LOG_INFO("Unexpected message received\n");
return -1;
}
GGML_ASSERT(recv_msgs[1].size() == sizeof(uint32_t) * 32); GGML_ASSERT(recv_msgs[1].size() == sizeof(uint32_t) * 32);
GGML_ASSERT(recv_msgs[3].size() == sizeof(uint32_t) * 32);
memcpy(n_layer_window, recv_msgs[1].data(), sizeof(uint32_t) * 32); memcpy(n_layer_window, recv_msgs[1].data(), sizeof(uint32_t) * 32);
memcpy(n_gpu_layers, recv_msgs[3].data(), sizeof(uint32_t) * 32);
if (recv_msgs.size() > 2) {
GGML_ASSERT(recv_msgs[2].to_string() == "n_gpu_layers");
GGML_ASSERT(recv_msgs[3].size() == sizeof(uint32_t) * 32);
memcpy(n_gpu_layers, recv_msgs[3].data(), sizeof(uint32_t) * 32);
}
if (my_rank != n_world - 1) { if (my_rank != n_world - 1) {
try { try {