diff --git a/common/common.cpp b/common/common.cpp
index 6438daae..a2f7b4cc 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -28,6 +28,8 @@
 #include
 #include
 
+#define DEFAULT_N_LAYER_WINDOW 4
+
 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/types.h>
 #include <sys/sysctl.h>
@@ -362,6 +364,11 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
     return true;
 }
 
+template <size_t N>
+void copy_n_layer_window(const uint32_t (&source)[N], uint32_t * destination) {
+    std::copy(std::begin(source), std::end(source), destination);
+}
+
 void gpt_init() {
     llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
         if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_thold) {
@@ -819,6 +826,24 @@ std::string fs_get_cache_file(const std::string & filename) {
 //
 // Model utils
 //
+static void llama_assign_n_layer_window(
+        uint32_t n_world,
+        uint32_t my_rank,
+        const device_info * dev_info_set,
+        uint32_t * n_layer_window,
+        struct llama_model * model) {
+    GGML_ASSERT(dev_info_set != nullptr);
+    GGML_ASSERT(n_layer_window != nullptr);
+
+    uint32_t n_layer = llama_model_n_layers(model);
+    if (n_world == 1) {
+        n_layer_window[0] = n_layer;
+        return;
+    }
+
+    std::fill_n(n_layer_window, n_world, DEFAULT_N_LAYER_WINDOW);
+}
+
 struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     llama_init_result iparams;
     auto mparams = llama_model_params_from_gpt_params(params);
@@ -838,6 +863,8 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         return iparams;
     }
 
+    llama_model_loader * ml = llama_model_load(params.model.c_str(), model, &mparams);
+
     if (params.reranking) {
         bool ok = true;
 
@@ -871,21 +898,49 @@
     struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
     llama_context * lctx = llama_new_context_with_model(model, cparams);
 
+    uint32_t n_world = cparams.n_world;
+    uint32_t my_rank = cparams.rank;
+
     // initialize sockets
-    llama_init_sockets(lctx, cparams.n_world, cparams.rank);
+    llama_init_sockets(lctx, n_world, my_rank);
 
     // sychronize device profile to the master node
     struct device_info * dev_info_set = nullptr;
-    if (params.rank == 0) {
-        dev_info_set = (struct device_info *)malloc(cparams.n_world * sizeof(struct device_info));
+    if (my_rank == 0) {
+        dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info));
         dev_info_set[0] = dev_info;
-        llama_collect_device_info(dev_info_set, lctx);
-        device_print_props(dev_info_set, cparams.n_world);
+        llama_gather_device_info(lctx, dev_info_set);
+        device_print_props(dev_info_set, n_world);
     } else {
-        llama_send_device_info(&dev_info, lctx);
+        llama_send_device_info(lctx, &dev_info);
     }
 
-    if (llama_context_setup_backend(lctx) == nullptr) {
+    uint32_t n_layer_window[32] = {0};
+    if (my_rank == 0) {
+        if (n_world == 1 || params.n_layer_window[0] == 0) {
+            llama_assign_n_layer_window(n_world, my_rank, dev_info_set, n_layer_window, model);
+        } else {
+            copy_n_layer_window(params.n_layer_window, n_layer_window);
+        }
+
+        // synchronize the new n_layer_window to other nodes
+        llama_broadcast_n_layer_window(lctx, n_layer_window);
+    } else {
+        llama_recv_n_layer_window(lctx, n_layer_window);
+    }
+
+    // update n_layer_window
+    copy_n_layer_window(n_layer_window, params.n_layer_window);
+    copy_n_layer_window(n_layer_window, cparams.n_layer_window);
+    copy_n_layer_window(n_layer_window, mparams.n_layer_window);
+    copy_n_layer_window(n_layer_window, llama_context_n_layer_window(lctx));
+
+    if (!mparams.vocab_only && llm_load_tensors(ml, model, mparams) < 0) {
+        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
+        return iparams;
+    }
+
+    if (llama_context_setup_backend(model, cparams, lctx) == nullptr) {
         LOG_ERR("%s: failed to setup context with model '%s'\n", __func__, params.model.c_str());
         llama_free_model(model);
         return iparams;
diff --git a/common/common.h b/common/common.h
index c074106a..f04691db 100644
--- a/common/common.h
+++ b/common/common.h
@@ -144,7 +144,7 @@ struct gpt_sampler_params {
 struct gpt_params {
     int32_t n_world = 1; // number of devices to use
     int32_t rank = 0;    // my rank for distributed inference
-    uint32_t n_layer_window[32] = {32}; // layer window size on each node
+    uint32_t n_layer_window[32] = {0};  // layer window size on each node
     std::string master_ip = "localhost";    // ip address of the master node
     std::string next_node_ip = "localhost"; // ip address of my next node
     bool unload = false; // unload layer weights after use or not
diff --git a/common/profiler.cpp b/common/profiler.cpp
index 487f3379..44abfe6a 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -316,7 +316,7 @@ void device_print_props(struct device_info * dev_info_set, int n) {
     LOG_INF("| Property ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| Rank %-8d", i);
-        GGML_ASSERT(dev_info_set[i].rank == i);
+        GGML_ASSERT((int)dev_info_set[i].rank == i);
     }
 
     LOG_INF("\n-------------------------------------------------------------------------------------------\n");
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 931c9b4a..39d4b60c 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -142,21 +142,24 @@ int main(int argc, char ** argv) {
     if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
         return 1;
     }
+
     const uint32_t n_world = params.n_world;
     const uint32_t my_rank = params.rank;
+    GGML_ASSERT(!(n_world == 1 && my_rank > 0));
 
     // check if --n-layer-window and --world is matched
-    uint32_t non_zero_count = 0;
-    size_t size = sizeof(params.n_layer_window) / sizeof(params.n_layer_window[0]);
-    for (size_t i = 0; i < size; ++i) {
-        if (params.n_layer_window[i] != 0) {
-            ++non_zero_count;
+    if (my_rank == 0) {
+        uint32_t non_zero_count = 0;
+        size_t size = sizeof(params.n_layer_window) / sizeof(params.n_layer_window[0]);
+        for (size_t i = 0; i < size; ++i) {
+            if (params.n_layer_window[i] != 0) {
+                ++non_zero_count;
+            }
         }
+        GGML_ASSERT((non_zero_count == 0 || non_zero_count == n_world) \
+            && "Number of non-zero values in --n-layer-window must equal --world");
     }
-    GGML_ASSERT(!(n_world == 1 && my_rank > 0));
-    GGML_ASSERT(non_zero_count == n_world && "Number of non-zero values in --n-layer-window must equal --world");
-
     gpt_init();
 
     auto & sparams = params.sparams;
diff --git a/include/llama.h b/include/llama.h
index e3890666..ff7d1599 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -434,14 +434,26 @@
     LLAMA_API void llama_init_sockets (struct llama_context * ctx, uint32_t n_world, uint32_t my_rank);
     LLAMA_API void llama_free_sockets (struct llama_context * ctx, char ** msg);
 
-    LLAMA_API int llama_collect_device_info(struct device_info * dev_info_set, struct llama_context * ctx);
-    LLAMA_API int llama_send_device_info (struct device_info * dev_info, struct llama_context * ctx);
+    LLAMA_API int llama_gather_device_info (struct llama_context * ctx, struct device_info * dev_info_set);
+    LLAMA_API int llama_send_device_info (struct llama_context * ctx, struct device_info * dev_info);
+    LLAMA_API int llama_broadcast_n_layer_window(struct llama_context * ctx, uint32_t * n_layer_window);
+    LLAMA_API int llama_recv_n_layer_window(struct llama_context * ctx, uint32_t * n_layer_window);
+
+    LLAMA_API int llm_load_tensors(
+            struct llama_model_loader * ml,
+            struct llama_model * model,
+            struct llama_model_params params);
 
-    // TODO: rename to llama_init_from_model
     LLAMA_API struct llama_context * llama_new_context_with_model(
             struct llama_model * model,
             struct llama_context_params params);
-    LLAMA_API void * llama_context_setup_backend(struct llama_context * ctx);
+
+    LLAMA_API void * llama_context_setup_backend(
+            struct llama_model * model,
+            struct llama_context_params params,
+            struct llama_context * ctx);
+
+    LLAMA_API uint32_t * llama_context_n_layer_window(struct llama_context * ctx);
 
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
@@ -491,12 +503,17 @@
     // Get metadata value as a string by index
     LLAMA_API int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
 
+    LLAMA_API struct llama_model_loader * llama_model_load(const char * fname, struct llama_model * model, struct llama_model_params * params);
+
     // Get a string describing the model type
     LLAMA_API int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
 
     // Returns the total size of all the tensors in the model in bytes
     LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
 
+    // Returns the number of layers in the model
+    LLAMA_API uint32_t llama_model_n_layers(const struct llama_model * model);
+
     // Returns the total number of parameters in the model
     LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
diff --git a/src/llama.cpp b/src/llama.cpp
index 9a552ee2..b77e7ca9 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2572,8 +2572,6 @@ struct llama_cparams {
     uint32_t n_layer_window[32];
     bool unload;
     uint32_t n_ctx; // context size used during inference
-    ggml_type type_k;
-    ggml_type type_v;
     uint32_t n_batch;
     uint32_t n_ubatch;
     uint32_t n_seq_max;
@@ -7137,7 +7135,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 }
 
 // Returns false if cancelled by progress_callback
-static bool llm_load_tensors(
+static bool llm_load_tensors_impl(
         llama_model_loader & ml,
         llama_model & model,
         uint32_t n_world,
@@ -9159,43 +9157,58 @@
     return true;
 }
 
-// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
-static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
-    model.t_start_us = ggml_time_us();
+int llm_load_tensors(
+        struct llama_model_loader * ml,
+        struct llama_model * model,
+        struct llama_model_params params) {
+    model->t_start_us = ggml_time_us();
 
     try {
-        llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
+        if (!llm_load_tensors_impl(
+            *ml, *model, params.n_world, params.rank, params.n_layer_window, params.n_gpu_layers, params.split_mode,
+            params.main_gpu, params.use_mlock, params.progress_callback, params.progress_callback_user_data
+        )) {
+            return -2;
+        }
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
+        return -1;
+    }
+
+    model->t_load_us = ggml_time_us() - model->t_start_us;
+    return 0;
+}
+
+// Returns the llama_model_loader on success, throws std::runtime_error on failure
+static llama_model_loader * llama_model_load_impl(const std::string & fname, llama_model & model, llama_model_params & params) {
+    try {
+        llama_model_loader * ml = new llama_model_loader(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
 
         model.hparams.vocab_only = params.vocab_only;
 
         try {
-            llm_load_arch(ml, model);
+            llm_load_arch(*ml, model);
         } catch(const std::exception & e) {
             throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
         }
         try {
-            llm_load_hparams(ml, model);
+            llm_load_hparams(*ml, model);
         } catch(const std::exception & e) {
             throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
         }
         try {
-            llm_load_vocab(ml, model);
+            llm_load_vocab(*ml, model);
         } catch(const std::exception & e) {
             throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
         }
 
-        llm_load_print_meta(ml, model);
+        llm_load_print_meta(*ml, model);
 
         if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
             model.hparams.n_vocab != model.vocab.id_to_token.size()) {
             throw std::runtime_error("vocab size mismatch");
         }
 
-        if (params.vocab_only) {
-            LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
-            return 0;
-        }
-
 #ifdef GGML_USE_KOMPUTE
         if (params.n_gpu_layers > 0 && (
             !(model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON)
@@ -9213,22 +9226,14 @@
         }
 #endif
 
-        if (!llm_load_tensors(
-            ml, model, params.n_world, params.rank, params.n_layer_window, params.n_gpu_layers, params.split_mode,
-            params.main_gpu, params.use_mlock, params.progress_callback, params.progress_callback_user_data
-        )) {
-            return -2;
-        }
+        return ml;
     } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
-        return -1;
+        throw std::runtime_error("error loading model: " + std::string(err.what()));
     }
+}
 
-    // loading time will be recalculate after the first eval, so
-    // we take page faults deferred by mmap() into consideration
-    model.t_load_us = ggml_time_us() - model.t_start_us;
-
-    return 0;
+struct llama_model_loader * llama_model_load(const char * fname, struct llama_model * model, struct llama_model_params * params) {
+    return llama_model_load_impl(std::string(fname), *model, *params);
 }
 
 //
@@ -17383,6 +17388,28 @@ static void llama_send_meta(zmq::socket_t & socket, struct sync_meta * meta) {
     }
 }
 
+static int llama_recv_meta(zmq::socket_t & socket, struct sync_meta * meta) {
+    socket.set(zmq::sockopt::rcvtimeo, 1000);
+
+    std::vector<zmq::message_t> recv_msgs;
+    if (!zmq::recv_multipart(socket, std::back_inserter(recv_msgs))) {
+        return -1;
+    }
+
+    socket.set(zmq::sockopt::rcvtimeo, -1);
+
+    for (size_t i = 0; i < recv_msgs.size(); i += 2) {
+        std::string key = recv_msgs[i].to_string();
+        zmq::message_t & data_msg = recv_msgs[i + 1];
+
+        if (key == "n_tokens") {
+            GGML_ASSERT(data_msg.size() == sizeof(meta->n_tokens));
+            std::memcpy(&(meta->n_tokens), data_msg.data(), sizeof(meta->n_tokens));
+        }
+    }
+    return 0;
+}
+
 static void llama_send_tensors(zmq::socket_t & socket, struct llama_ubatch * ubatch, struct input_tensors * tensors) {
     try {
         std::vector<zmq::message_t> send_msgs;
@@ -17406,28 +17433,6 @@
     }
 }
 
-static int llama_recv_meta(zmq::socket_t & socket, struct sync_meta * meta) {
-    socket.set(zmq::sockopt::rcvtimeo, 1000);
-
-    std::vector<zmq::message_t> recv_msgs;
-    if (!zmq::recv_multipart(socket, std::back_inserter(recv_msgs))) {
-        return -1;
-    }
-
-    socket.set(zmq::sockopt::rcvtimeo, -1);
-
-    for (size_t i = 0; i < recv_msgs.size(); i += 2) {
-        std::string key = recv_msgs[i].to_string();
-        zmq::message_t & data_msg = recv_msgs[i + 1];
-
-        if (key == "n_tokens") {
-            GGML_ASSERT(data_msg.size() == sizeof(meta->n_tokens));
-            std::memcpy(&(meta->n_tokens), data_msg.data(), sizeof(meta->n_tokens));
-        }
-    }
-    return 0;
-}
-
 static void llama_recv_tensors(zmq::socket_t & socket, struct llama_ubatch * ubatch, const bool is_out_embd=false) {
     std::vector<zmq::message_t> recv_msgs;
     if (!zmq::recv_multipart(socket, std::back_inserter(recv_msgs))) {
@@ -19523,7 +19528,7 @@ struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
         /*.n_world        =*/ 1,
         /*.rank           =*/ 0,
-        /*.n_layer_window =*/ {32},
+        /*.n_layer_window =*/ {0},
         /*.n_gpu_layers   =*/ 0,
         /*.split_mode     =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu       =*/ 0,
@@ -19726,17 +19731,7 @@ struct llama_model * llama_load_model_from_file(const char * path_model, struct
         }
     }
 
-    int status = llama_model_load(path_model, *model, params);
-    GGML_ASSERT(status <= 0);
-    if (status < 0) {
-        if (status == -1) {
-            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
-        } else if (status == -2) {
-            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
-        }
-        delete model;
-        return nullptr;
-    }
+    (void)path_model;
 
     return model;
 }
@@ -19784,7 +19779,7 @@
     }
 }
 
-int llama_collect_device_info(struct device_info * dev_info_set, struct llama_context * ctx) {
+int llama_gather_device_info(struct llama_context * ctx, struct device_info * dev_info_set) {
     uint32_t n_world = ctx->cparams.n_world;
     if (n_world == 1) {
         return 0;
@@ -19818,7 +19813,7 @@
     return 0;
 }
 
-int llama_send_device_info(struct device_info * dev_info, struct llama_context * ctx) {
+int llama_send_device_info(struct llama_context * ctx, struct device_info * dev_info) {
     std::vector<zmq::message_t> recv_msgs;
     if (!zmq::recv_multipart(*ctx->recv_socket, std::back_inserter(recv_msgs))) {
         return -1;
@@ -19841,6 +19836,59 @@
     }
 }
 
+int llama_broadcast_n_layer_window(struct llama_context * ctx, uint32_t * n_layer_window) {
+    uint32_t n_world = ctx->cparams.n_world;
+    if (n_world == 1) {
+        return 0;
+    }
+
+    GGML_ASSERT(ctx != nullptr && ctx->send_socket != nullptr);
+    try {
+        std::vector<zmq::message_t> send_msgs;
+
+        send_msgs.emplace_back("n_layer_window", strlen("n_layer_window"));
+        send_msgs.emplace_back(n_layer_window, sizeof(uint32_t) * 32);
+
+        zmq::send_multipart(*ctx->send_socket, send_msgs);
+    } catch (const zmq::error_t& e) {
+        LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
+        return -1;
+    }
+
+    return 0;
+}
+
+int llama_recv_n_layer_window(struct llama_context * ctx, uint32_t * n_layer_window) {
+    uint32_t n_world = ctx->cparams.n_world;
+    uint32_t my_rank = ctx->cparams.rank;
+
+    std::vector<zmq::message_t> recv_msgs;
+    if (!zmq::recv_multipart(*ctx->recv_socket, std::back_inserter(recv_msgs))) {
+        return -1;
+    }
+
+    std::string key = recv_msgs[0].to_string();
+    if (key != "n_layer_window") {
+        LLAMA_LOG_INFO("Unexpected message received: %s\n", key.c_str());
+        return -1;
+    }
+
+    zmq::message_t & data_msg = recv_msgs[1];
+    GGML_ASSERT(data_msg.size() == sizeof(uint32_t) * 32);
+    memcpy(n_layer_window, data_msg.data(), sizeof(uint32_t) * 32);
+
+    if (my_rank != n_world - 1) {
+        try {
+            zmq::send_multipart(*ctx->send_socket, recv_msgs);
+        } catch (const zmq::error_t& e) {
+            LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
 void llama_free_sockets(struct llama_context * ctx, char ** msg) {
     const uint32_t n_world = ctx->cparams.n_world;
     const uint32_t my_rank = ctx->cparams.rank;
@@ -19873,6 +19921,25 @@
 struct llama_context * llama_new_context_with_model(
             struct llama_model * model,
             struct llama_context_params params) {
+
+    if (!model) {
+        LLAMA_LOG_ERROR("%s: model cannot be NULL\n", __func__);
+        return nullptr;
+    }
+
+    llama_context * ctx = new llama_context(*model);
+
+    ctx->master_ip = params.master_ip;
+    ctx->next_node_ip = params.next_node_ip;
+    ctx->cparams.n_world = params.n_world;
+    ctx->cparams.rank = params.rank;
+    return ctx;
+}
+
+void * llama_context_setup_backend(
+        struct llama_model * model,
+        struct llama_context_params params,
+        struct llama_context * ctx) {
 
     if (!model) {
         LLAMA_LOG_ERROR("%s: model cannot be NULL\n", __func__);
@@ -19904,13 +19971,9 @@
         return nullptr;
     }
 
-    llama_context * ctx = new llama_context(*model);
-
     const auto & hparams = model->hparams;
     auto & cparams = ctx->cparams;
 
-    cparams.n_world = params.n_world;
-    cparams.rank = params.rank;
     std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);
     cparams.unload = params.unload;
     cparams.n_seq_max = std::max(1u, params.n_seq_max);
@@ -19927,9 +19990,7 @@
     cparams.no_perf = params.no_perf;
     cparams.pooling_type = params.pooling_type;
 
-    cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
-    cparams.type_k = params.type_k;
-    cparams.type_v = params.type_v;
+    cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
     cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
     cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
@@ -19985,10 +20046,6 @@
         cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
     }
 
-    ctx->master_ip = params.master_ip;
-    ctx->next_node_ip = params.next_node_ip;
-
-    LLAMA_LOG_INFO("\n");
     LLAMA_LOG_INFO("%s: n_world      = %u\n", __func__, cparams.n_world);
     LLAMA_LOG_INFO("%s: rank         = %u\n", __func__, cparams.rank);
    LLAMA_LOG_INFO("%s: win_size     = %u\n", __func__, cparams.n_layer_window[cparams.rank]);
@@ -19998,8 +20055,8 @@
     LLAMA_LOG_INFO("%s: flash_attn   = %d\n", __func__, cparams.flash_attn);
     LLAMA_LOG_INFO("%s: freq_base    = %.1f\n", __func__, cparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale   = %g\n", __func__, cparams.rope_freq_scale);
-    LLAMA_LOG_INFO("%s: master_ip    = %s\n", __func__, ctx->master_ip.c_str());
-    LLAMA_LOG_INFO("%s: next_node_ip = %s\n", __func__, ctx->next_node_ip.c_str());
+    LLAMA_LOG_INFO("%s: master_ip    = %s\n", __func__, ctx->master_ip.c_str());
+    LLAMA_LOG_INFO("%s: next_node_ip = %s\n", __func__, ctx->next_node_ip.c_str());
 
     ctx->abort_callback = params.abort_callback;
     ctx->abort_callback_data = params.abort_callback_data;
@@ -20009,18 +20066,9 @@
     // build worst-case graph for encoder if a model contains encoder
     ctx->is_encoding = llama_model_has_encoder(model);
 
-    return ctx;
-}
-
-void * llama_context_setup_backend(struct llama_context * ctx) {
-    GGML_ASSERT(ctx != nullptr);
-    const auto * model = &ctx->model;
-    const auto & hparams = ctx->model.hparams;
-    const auto & cparams = ctx->cparams;
-
     uint32_t kv_size = cparams.n_ctx;
-    ggml_type type_k = cparams.type_k;
-    ggml_type type_v = cparams.type_v;
+    ggml_type type_k = params.type_k;
+    ggml_type type_v = params.type_v;
 
     // Mamba only needs a constant number of KV cache cells per sequence
     if (llama_model_is_recurrent(model)) {
@@ -20333,6 +20381,10 @@
     return ctx;
 }
 
+uint32_t * llama_context_n_layer_window(struct llama_context * ctx) {
+    return ctx->cparams.n_layer_window;
+}
+
 void llama_free(struct llama_context * ctx) {
     delete ctx;
 }
@@ -20511,6 +20563,10 @@
     return size;
 }
 
+uint32_t llama_model_n_layers(const struct llama_model * model) {
+    return model->hparams.n_layer;
+}
+
 uint64_t llama_model_n_params(const struct llama_model * model) {
     uint64_t nparams = 0;
     for (const auto & it : model->tensors_by_name) {
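
For reference, the sketch below (not part of the patch) illustrates the per-node initialization order that these changes establish, using only the APIs declared in include/llama.h above. The helper name init_node, the profiler.h header name, and the way dev_info is obtained are assumptions for illustration; error handling and the actual window-assignment policy are elided.

// Sketch only: mirrors the reordered flow of llama_init_from_gpt_params() after this patch.
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include "llama.h"
#include "profiler.h" // assumed location of struct device_info in this fork

static void init_node(const char * model_path, uint32_t n_world, uint32_t my_rank) {
    llama_model_params   mparams = llama_model_default_params();
    llama_context_params cparams = llama_context_default_params();
    mparams.n_world = n_world; mparams.rank = my_rank;
    cparams.n_world = n_world; cparams.rank = my_rank;

    // 1. create the model object and load metadata only (arch, hparams, vocab);
    //    tensor loading is deferred until the layer windows are known
    llama_model        * model = llama_load_model_from_file(model_path, mparams);
    llama_model_loader * ml    = llama_model_load(model_path, model, &mparams);

    // 2. create the context and bring up the ring sockets before any tensor work
    llama_context * ctx = llama_new_context_with_model(model, cparams);
    llama_init_sockets(ctx, n_world, my_rank);

    // 3. rank 0 gathers device profiles, decides n_layer_window, and broadcasts it;
    //    other ranks send their profile and wait for the decision
    uint32_t n_layer_window[32] = {0};
    struct device_info dev_info = {}; // this node's profile (filled by the profiler in the real code)
    if (my_rank == 0) {
        device_info * dev_info_set = (device_info *) malloc(n_world * sizeof(device_info));
        dev_info_set[0] = dev_info;
        llama_gather_device_info(ctx, dev_info_set);
        // ... pick the windows here, e.g. all layers on rank 0 when n_world == 1 ...
        llama_broadcast_n_layer_window(ctx, n_layer_window);
        free(dev_info_set);
    } else {
        llama_send_device_info(ctx, &dev_info);
        llama_recv_n_layer_window(ctx, n_layer_window);
    }
    std::copy(n_layer_window, n_layer_window + 32, mparams.n_layer_window);
    std::copy(n_layer_window, n_layer_window + 32, llama_context_n_layer_window(ctx));

    // 4. only now load this node's tensor window and finish setting up the backend
    if (!mparams.vocab_only && llm_load_tensors(ml, model, mparams) < 0) { /* handle error */ }
    if (llama_context_setup_backend(model, cparams, ctx) == nullptr)     { /* handle error */ }
}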