add automatic layer window size assignment workflow

This commit is contained in:
Lizonghang 2024-11-08 18:21:03 +04:00
parent 53cb3a6069
commit 2bd4d03aa8
6 changed files with 241 additions and 110 deletions

View file

@ -28,6 +28,8 @@
#include <vector>
#include <thread>
#define DEFAULT_N_LAYER_WINDOW 4
#if defined(__APPLE__) && defined(__MACH__)
#include <sys/types.h>
#include <sys/sysctl.h>
@ -362,6 +364,11 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
return true;
}
// Copies all N entries of the fixed-size array `source` into `destination`.
// `destination` must point to storage for at least N uint32_t values; no
// bounds check is performed on it.
template <size_t N>
void copy_n_layer_window(const uint32_t (&source)[N], uint32_t * destination) {
    std::copy_n(source, N, destination);
}
void gpt_init() {
llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_thold) {
@ -819,6 +826,24 @@ std::string fs_get_cache_file(const std::string & filename) {
//
// Model utils
//
// Fills `n_layer_window` with a layer window size for each node.
//   - n_world == 1: slot 0 receives the model's full layer count.
//   - otherwise:    the first n_world slots each receive DEFAULT_N_LAYER_WINDOW.
// `dev_info_set` and `n_layer_window` must be non-null. `my_rank` and the
// contents of `dev_info_set` are not consulted by the current implementation.
// NOTE(review): `n_layer_window` must have room for n_world entries; the
// caller visible in this file passes a 32-entry array - confirm n_world is bounded.
static void llama_assign_n_layer_window(
        uint32_t n_world,
        uint32_t my_rank,
        const device_info * dev_info_set,
        uint32_t * n_layer_window,
        struct llama_model * model) {
    GGML_ASSERT(dev_info_set != nullptr);
    GGML_ASSERT(n_layer_window != nullptr);

    const uint32_t n_layer = llama_model_n_layers(model);

    if (n_world != 1) {
        // multi-node: every node starts from the same default window size
        for (uint32_t i = 0; i < n_world; ++i) {
            n_layer_window[i] = DEFAULT_N_LAYER_WINDOW;
        }
        return;
    }

    // single node: it owns every layer of the model
    n_layer_window[0] = n_layer;
}
struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
llama_init_result iparams;
auto mparams = llama_model_params_from_gpt_params(params);
@ -838,6 +863,8 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
return iparams;
}
llama_model_loader * ml = llama_model_load(params.model.c_str(), model, &mparams);
if (params.reranking) {
bool ok = true;
@ -871,21 +898,49 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
llama_context * lctx = llama_new_context_with_model(model, cparams);
uint32_t n_world = cparams.n_world;
uint32_t my_rank = cparams.rank;
// initialize sockets
llama_init_sockets(lctx, cparams.n_world, cparams.rank);
llama_init_sockets(lctx, n_world, my_rank);
// synchronize device profiles to the master node
struct device_info * dev_info_set = nullptr;
if (params.rank == 0) {
dev_info_set = (struct device_info *)malloc(cparams.n_world * sizeof(struct device_info));
if (my_rank == 0) {
dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info));
dev_info_set[0] = dev_info;
llama_collect_device_info(dev_info_set, lctx);
device_print_props(dev_info_set, cparams.n_world);
llama_gather_device_info(lctx, dev_info_set);
device_print_props(dev_info_set, n_world);
} else {
llama_send_device_info(&dev_info, lctx);
llama_send_device_info(lctx, &dev_info);
}
if (llama_context_setup_backend(lctx) == nullptr) {
uint32_t n_layer_window[32] = {0};
if (my_rank == 0) {
if (n_world == 1 || params.n_layer_window[0] == 0) {
llama_assign_n_layer_window(n_world, my_rank, dev_info_set, n_layer_window, model);
} else {
copy_n_layer_window(params.n_layer_window, n_layer_window);
}
// synchronize the new n_layer_window to other nodes
llama_broadcast_n_layer_window(lctx, n_layer_window);
} else {
llama_recv_n_layer_window(lctx, n_layer_window);
}
// update n_layer_window
copy_n_layer_window(n_layer_window, params.n_layer_window);
copy_n_layer_window(n_layer_window, cparams.n_layer_window);
copy_n_layer_window(n_layer_window, mparams.n_layer_window);
copy_n_layer_window(n_layer_window, llama_context_n_layer_window(lctx));
if (!mparams.vocab_only && llm_load_tensors(ml, model, mparams) < 0) {
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
return iparams;
}
if (llama_context_setup_backend(model, cparams, lctx) == nullptr) {
LOG_ERR("%s: failed to setup context with model '%s'\n", __func__, params.model.c_str());
llama_free_model(model);
return iparams;

View file

@ -144,7 +144,7 @@ struct gpt_sampler_params {
struct gpt_params {
int32_t n_world = 1; // number of devices to use
int32_t rank = 0; // my rank for distributed inference
uint32_t n_layer_window[32] = {32}; // layer window size on each node
uint32_t n_layer_window[32] = {0}; // layer window size on each node
std::string master_ip = "localhost"; // ip address of the master node
std::string next_node_ip = "localhost"; // ip address of my next node
bool unload = false; // unload layer weights after use or not

View file

@ -316,7 +316,7 @@ void device_print_props(struct device_info * dev_info_set, int n) {
LOG_INF("| Property ");
for (int i = 0; i < n; ++i) {
LOG_INF("| Rank %-8d", i);
GGML_ASSERT(dev_info_set[i].rank == i);
GGML_ASSERT((int)dev_info_set[i].rank == i);
}
LOG_INF("\n-------------------------------------------------------------------------------------------\n");

View file

@ -142,21 +142,24 @@ int main(int argc, char ** argv) {
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
return 1;
}
const uint32_t n_world = params.n_world;
const uint32_t my_rank = params.rank;
GGML_ASSERT(!(n_world == 1 && my_rank > 0));
// check that --n-layer-window and --world match
uint32_t non_zero_count = 0;
size_t size = sizeof(params.n_layer_window) / sizeof(params.n_layer_window[0]);
for (size_t i = 0; i < size; ++i) {
if (params.n_layer_window[i] != 0) {
++non_zero_count;
if (my_rank == 0) {
uint32_t non_zero_count = 0;
size_t size = sizeof(params.n_layer_window) / sizeof(params.n_layer_window[0]);
for (size_t i = 0; i < size; ++i) {
if (params.n_layer_window[i] != 0) {
++non_zero_count;
}
}
GGML_ASSERT((non_zero_count == 0 || non_zero_count == n_world) \
&& "Number of non-zero values in --n-layer-window must equal --world");
}
GGML_ASSERT(!(n_world == 1 && my_rank > 0));
GGML_ASSERT(non_zero_count == n_world && "Number of non-zero values in --n-layer-window must equal --world");
gpt_init();
auto & sparams = params.sparams;

View file

@ -434,14 +434,26 @@ extern "C" {
LLAMA_API void llama_init_sockets (struct llama_context * ctx, uint32_t n_world, uint32_t my_rank);
LLAMA_API void llama_free_sockets (struct llama_context * ctx, char ** msg);
LLAMA_API int llama_collect_device_info(struct device_info * dev_info_set, struct llama_context * ctx);
LLAMA_API int llama_send_device_info (struct device_info * dev_info, struct llama_context * ctx);
LLAMA_API int llama_gather_device_info (struct llama_context * ctx, struct device_info * dev_info_set);
LLAMA_API int llama_send_device_info (struct llama_context * ctx, struct device_info * dev_info);
LLAMA_API int llama_broadcast_n_layer_window(struct llama_context * ctx, uint32_t * n_layer_window);
LLAMA_API int llama_recv_n_layer_window(struct llama_context * ctx, uint32_t * n_layer_window);
LLAMA_API int llm_load_tensors(
struct llama_model_loader * ml,
struct llama_model * model,
struct llama_model_params params);
// TODO: rename to llama_init_from_model
LLAMA_API struct llama_context * llama_new_context_with_model(
struct llama_model * model,
struct llama_context_params params);
LLAMA_API void * llama_context_setup_backend(struct llama_context * ctx);
LLAMA_API void * llama_context_setup_backend(
struct llama_model * model,
struct llama_context_params params,
struct llama_context * ctx);
LLAMA_API uint32_t * llama_context_n_layer_window(struct llama_context * ctx);
// Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx);
@ -491,12 +503,17 @@ extern "C" {
// Get metadata value as a string by index
LLAMA_API int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
LLAMA_API struct llama_model_loader * llama_model_load(const char * fname, struct llama_model * model, struct llama_model_params * params);
// Get a string describing the model type
LLAMA_API int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
// Returns the total size of all the tensors in the model in bytes
LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
// Returns the number of model layers in the model
LLAMA_API uint32_t llama_model_n_layers(const struct llama_model * model);
// Returns the total number of parameters in the model
LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);

View file

@ -2572,8 +2572,6 @@ struct llama_cparams {
uint32_t n_layer_window[32];
bool unload;
uint32_t n_ctx; // context size used during inference
ggml_type type_k;
ggml_type type_v;
uint32_t n_batch;
uint32_t n_ubatch;
uint32_t n_seq_max;
@ -7137,7 +7135,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
}
// Returns false if cancelled by progress_callback
static bool llm_load_tensors(
static bool llm_load_tensors_impl(
llama_model_loader & ml,
llama_model & model,
uint32_t n_world,
@ -9159,43 +9157,58 @@ static bool llm_load_tensors(
return true;
}
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
model.t_start_us = ggml_time_us();
int llm_load_tensors(
struct llama_model_loader * ml,
struct llama_model * model,
struct llama_model_params params) {
model->t_start_us = ggml_time_us();
try {
llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
if (!llm_load_tensors_impl(
*ml, *model, params.n_world, params.rank, params.n_layer_window, params.n_gpu_layers, params.split_mode,
params.main_gpu, params.use_mlock, params.progress_callback, params.progress_callback_user_data
)) {
return -2;
}
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
return -1;
}
model->t_load_us = ggml_time_us() - model->t_start_us;
return 0;
}
// Creates the model loader and loads arch, hparams and vocab; returns the loader, throws std::runtime_error on failure
static llama_model_loader * llama_model_load_impl(const std::string & fname, llama_model & model, llama_model_params & params) {
try {
llama_model_loader * ml = new llama_model_loader(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
model.hparams.vocab_only = params.vocab_only;
try {
llm_load_arch(ml, model);
llm_load_arch(*ml, model);
} catch(const std::exception & e) {
throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
}
try {
llm_load_hparams(ml, model);
llm_load_hparams(*ml, model);
} catch(const std::exception & e) {
throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
}
try {
llm_load_vocab(ml, model);
llm_load_vocab(*ml, model);
} catch(const std::exception & e) {
throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
}
llm_load_print_meta(ml, model);
llm_load_print_meta(*ml, model);
if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
model.hparams.n_vocab != model.vocab.id_to_token.size()) {
throw std::runtime_error("vocab size mismatch");
}
if (params.vocab_only) {
LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
return 0;
}
#ifdef GGML_USE_KOMPUTE
if (params.n_gpu_layers > 0 && (
!(model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON)
@ -9213,22 +9226,14 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
}
#endif
if (!llm_load_tensors(
ml, model, params.n_world, params.rank, params.n_layer_window, params.n_gpu_layers, params.split_mode,
params.main_gpu, params.use_mlock, params.progress_callback, params.progress_callback_user_data
)) {
return -2;
}
return ml;
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
return -1;
throw std::runtime_error("error loading model: " + std::string(err.what()));
}
}
// loading time will be recalculated after the first eval, so
// we take page faults deferred by mmap() into consideration
model.t_load_us = ggml_time_us() - model.t_start_us;
return 0;
// Public entry point that forwards to llama_model_load_impl, converting the
// C-string path to std::string and the pointer arguments to references.
// `fname`, `model` and `params` are dereferenced unconditionally and must be non-null.
struct llama_model_loader * llama_model_load(const char * fname, struct llama_model * model, struct llama_model_params * params) {
    const std::string path(fname);
    return llama_model_load_impl(path, *model, *params);
}
//
@ -17383,6 +17388,28 @@ static void llama_send_meta(zmq::socket_t & socket, struct sync_meta * meta) {
}
}
// Receives one multipart message from `socket` and fills `meta` from it.
// The message is read as consecutive (key, data) part pairs; only the
// "n_tokens" key is handled here, any other key is ignored.
// Returns 0 when a message was received, -1 when recv_multipart yields nothing.
static int llama_recv_meta(zmq::socket_t & socket, struct sync_meta * meta) {
    // bound the wait for this message (rcvtimeo = 1000)
    socket.set(zmq::sockopt::rcvtimeo, 1000);

    std::vector<zmq::message_t> recv_msgs;
    if (!zmq::recv_multipart(socket, std::back_inserter(recv_msgs))) {
        // NOTE(review): rcvtimeo stays at 1000 on this path - confirm that is intended
        return -1;
    }

    // message arrived: set rcvtimeo back to -1
    socket.set(zmq::sockopt::rcvtimeo, -1);

    // Require a complete (key, data) pair: a trailing key with no data part
    // is skipped instead of indexing one past the end of recv_msgs.
    for (size_t i = 0; i + 1 < recv_msgs.size(); i += 2) {
        std::string key = recv_msgs[i].to_string();
        zmq::message_t & data_msg = recv_msgs[i + 1];

        if (key == "n_tokens") {
            GGML_ASSERT(data_msg.size() == sizeof(meta->n_tokens));
            std::memcpy(&(meta->n_tokens), data_msg.data(), sizeof(meta->n_tokens));
        }
    }

    return 0;
}
static void llama_send_tensors(zmq::socket_t & socket, struct llama_ubatch * ubatch, struct input_tensors * tensors) {
try {
std::vector<zmq::message_t> send_msgs;
@ -17406,28 +17433,6 @@ static void llama_send_tensors(zmq::socket_t & socket, struct llama_ubatch * uba
}
}
// Receives one multipart message from `socket` and fills `meta` from it.
// The message is read as consecutive (key, data) part pairs; only the
// "n_tokens" key is handled here, any other key is ignored.
// Returns 0 when a message was received, -1 when recv_multipart yields nothing.
static int llama_recv_meta(zmq::socket_t & socket, struct sync_meta * meta) {
    // bound the wait for this message (rcvtimeo = 1000)
    socket.set(zmq::sockopt::rcvtimeo, 1000);

    std::vector<zmq::message_t> recv_msgs;
    if (!zmq::recv_multipart(socket, std::back_inserter(recv_msgs))) {
        // NOTE(review): rcvtimeo stays at 1000 on this path - confirm that is intended
        return -1;
    }

    // message arrived: set rcvtimeo back to -1
    socket.set(zmq::sockopt::rcvtimeo, -1);

    // Require a complete (key, data) pair: a trailing key with no data part
    // is skipped instead of indexing one past the end of recv_msgs.
    for (size_t i = 0; i + 1 < recv_msgs.size(); i += 2) {
        std::string key = recv_msgs[i].to_string();
        zmq::message_t & data_msg = recv_msgs[i + 1];

        if (key == "n_tokens") {
            GGML_ASSERT(data_msg.size() == sizeof(meta->n_tokens));
            std::memcpy(&(meta->n_tokens), data_msg.data(), sizeof(meta->n_tokens));
        }
    }

    return 0;
}
static void llama_recv_tensors(zmq::socket_t & socket, struct llama_ubatch * ubatch, const bool is_out_embd=false) {
std::vector<zmq::message_t> recv_msgs;
if (!zmq::recv_multipart(socket, std::back_inserter(recv_msgs))) {
@ -19523,7 +19528,7 @@ struct llama_model_params llama_model_default_params() {
struct llama_model_params result = {
/*.n_world =*/ 1,
/*.rank =*/ 0,
/*.n_layer_window =*/ {32},
/*.n_layer_window =*/ {0},
/*.n_gpu_layers =*/ 0,
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
/*.main_gpu =*/ 0,
@ -19726,17 +19731,7 @@ struct llama_model * llama_load_model_from_file(const char * path_model, struct
}
}
int status = llama_model_load(path_model, *model, params);
GGML_ASSERT(status <= 0);
if (status < 0) {
if (status == -1) {
LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
} else if (status == -2) {
LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
}
delete model;
return nullptr;
}
(void)path_model;
return model;
}
@ -19784,7 +19779,7 @@ void llama_init_sockets(struct llama_context * ctx, uint32_t n_world, uint32_t m
}
}
int llama_collect_device_info(struct device_info * dev_info_set, struct llama_context * ctx) {
int llama_gather_device_info(struct llama_context * ctx, struct device_info * dev_info_set) {
uint32_t n_world = ctx->cparams.n_world;
if (n_world == 1) {
return 0;
@ -19818,7 +19813,7 @@ int llama_collect_device_info(struct device_info * dev_info_set, struct llama_co
return 0;
}
int llama_send_device_info(struct device_info * dev_info, struct llama_context * ctx) {
int llama_send_device_info(struct llama_context * ctx, struct device_info * dev_info) {
std::vector<zmq::message_t> recv_msgs;
if (!zmq::recv_multipart(*ctx->recv_socket, std::back_inserter(recv_msgs))) {
return -1;
@ -19841,6 +19836,59 @@ int llama_send_device_info(struct device_info * dev_info, struct llama_context *
}
}
// Sends the layer-window table to ctx->send_socket as a two-part message:
// the key "n_layer_window" followed by 32 raw uint32_t values read from
// `n_layer_window` (which must therefore hold at least 32 entries).
// Returns 0 on success or when n_world == 1 (nothing is sent), and -1 if the
// zmq send throws.
int llama_broadcast_n_layer_window(struct llama_context * ctx, uint32_t * n_layer_window) {
    // validate ctx before the first dereference below
    GGML_ASSERT(ctx != nullptr);

    uint32_t n_world = ctx->cparams.n_world;
    if (n_world == 1) {
        return 0;
    }

    GGML_ASSERT(ctx->send_socket != nullptr);
    try {
        std::vector<zmq::message_t> send_msgs;

        send_msgs.emplace_back("n_layer_window", strlen("n_layer_window"));
        send_msgs.emplace_back(n_layer_window, sizeof(uint32_t) * 32);

        zmq::send_multipart(*ctx->send_socket, send_msgs);
    } catch (const zmq::error_t& e) {
        LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
        return -1;
    }

    return 0;
}
int llama_recv_n_layer_window(struct llama_context * ctx, uint32_t * n_layer_window) {
uint32_t n_world = ctx->cparams.n_world;
uint32_t my_rank = ctx->cparams.rank;
std::vector<zmq::message_t> recv_msgs;
if (!zmq::recv_multipart(*ctx->recv_socket, std::back_inserter(recv_msgs))) {
return -1;
}
std::string key = recv_msgs[0].to_string();
if (key != "n_layer_window") {
LLAMA_LOG_INFO("Unexpected message received: %s\n", key.c_str());
return -1;
}
zmq::message_t & data_msg = recv_msgs[1];
GGML_ASSERT(data_msg.size() == sizeof(uint32_t) * 32);
memcpy(n_layer_window, data_msg.data(), sizeof(uint32_t) * 32);
if (my_rank != n_world - 1) {
try {
zmq::send_multipart(*ctx->send_socket, recv_msgs);
} catch (const zmq::error_t& e) {
LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
return -1;
}
}
return 0;
}
void llama_free_sockets(struct llama_context * ctx, char ** msg) {
const uint32_t n_world = ctx->cparams.n_world;
const uint32_t my_rank = ctx->cparams.rank;
@ -19873,6 +19921,25 @@ void llama_free_sockets(struct llama_context * ctx, char ** msg) {
// Allocates a llama_context bound to `model` and records only the
// distributed-run settings from `params`: n_world, rank, master_ip and
// next_node_ip. Returns nullptr (after logging an error) when `model` is null.
// The returned context is heap-allocated with `new`.
struct llama_context * llama_new_context_with_model(
        struct llama_model * model,
        struct llama_context_params params) {
    if (model == nullptr) {
        LLAMA_LOG_ERROR("%s: model cannot be NULL\n", __func__);
        return nullptr;
    }

    llama_context * ctx = new llama_context(*model);

    // topology of the distributed run
    auto & cparams  = ctx->cparams;
    cparams.n_world = params.n_world;
    cparams.rank    = params.rank;

    // peer addresses
    ctx->master_ip    = params.master_ip;
    ctx->next_node_ip = params.next_node_ip;

    return ctx;
}
void * llama_context_setup_backend(
struct llama_model * model,
struct llama_context_params params,
struct llama_context * ctx) {
if (!model) {
LLAMA_LOG_ERROR("%s: model cannot be NULL\n", __func__);
@ -19904,13 +19971,9 @@ struct llama_context * llama_new_context_with_model(
return nullptr;
}
llama_context * ctx = new llama_context(*model);
const auto & hparams = model->hparams;
auto & cparams = ctx->cparams;
cparams.n_world = params.n_world;
cparams.rank = params.rank;
std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);
cparams.unload = params.unload;
cparams.n_seq_max = std::max(1u, params.n_seq_max);
@ -19927,9 +19990,7 @@ struct llama_context * llama_new_context_with_model(
cparams.no_perf = params.no_perf;
cparams.pooling_type = params.pooling_type;
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
cparams.type_k = params.type_k;
cparams.type_v = params.type_v;
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
@ -19985,10 +20046,6 @@ struct llama_context * llama_new_context_with_model(
cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
}
ctx->master_ip = params.master_ip;
ctx->next_node_ip = params.next_node_ip;
LLAMA_LOG_INFO("\n");
LLAMA_LOG_INFO("%s: n_world = %u\n", __func__, cparams.n_world);
LLAMA_LOG_INFO("%s: rank = %u\n", __func__, cparams.rank);
LLAMA_LOG_INFO("%s: win_size = %u\n", __func__, cparams.n_layer_window[cparams.rank]);
@ -19998,8 +20055,8 @@ struct llama_context * llama_new_context_with_model(
LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
LLAMA_LOG_INFO("%s: master_ip = %s\n", __func__, ctx->master_ip.c_str());
LLAMA_LOG_INFO("%s: next_node_ip = %s\n", __func__, ctx->next_node_ip.c_str());
LLAMA_LOG_INFO("%s: master_ip = %s\n", __func__, ctx->master_ip.c_str());
LLAMA_LOG_INFO("%s: next_node_ip = %s\n", __func__, ctx->next_node_ip.c_str());
ctx->abort_callback = params.abort_callback;
ctx->abort_callback_data = params.abort_callback_data;
@ -20009,18 +20066,9 @@ struct llama_context * llama_new_context_with_model(
// build worst-case graph for encoder if a model contains encoder
ctx->is_encoding = llama_model_has_encoder(model);
return ctx;
}
void * llama_context_setup_backend(struct llama_context * ctx) {
GGML_ASSERT(ctx != nullptr);
const auto * model = &ctx->model;
const auto & hparams = ctx->model.hparams;
const auto & cparams = ctx->cparams;
uint32_t kv_size = cparams.n_ctx;
ggml_type type_k = cparams.type_k;
ggml_type type_v = cparams.type_v;
ggml_type type_k = params.type_k;
ggml_type type_v = params.type_v;
// Mamba only needs a constant number of KV cache cells per sequence
if (llama_model_is_recurrent(model)) {
@ -20333,6 +20381,10 @@ void * llama_context_setup_backend(struct llama_context * ctx) {
return ctx;
}
// Returns a pointer to the first element of the context's n_layer_window
// array (stored in ctx->cparams), so callers read and write it in place.
// `ctx` must be non-null.
uint32_t * llama_context_n_layer_window(struct llama_context * ctx) {
    auto & cparams = ctx->cparams;
    return cparams.n_layer_window;
}
void llama_free(struct llama_context * ctx) {
delete ctx;
}
@ -20511,6 +20563,10 @@ uint64_t llama_model_size(const struct llama_model * model) {
return size;
}
// Returns the model's layer count as stored in its hparams (hparams.n_layer).
// `model` must be non-null.
uint32_t llama_model_n_layers(const struct llama_model * model) {
    const auto & hparams = model->hparams;
    return hparams.n_layer;
}
uint64_t llama_model_n_params(const struct llama_model * model) {
uint64_t nparams = 0;
for (const auto & it : model->tensors_by_name) {