diff --git a/common/common.cpp b/common/common.cpp
index 6438daae..a2f7b4cc 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -28,6 +28,8 @@
 #include
 #include
 
+#define DEFAULT_N_LAYER_WINDOW 4
+
 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/types.h>
 #include <sys/sysctl.h>
@@ -362,6 +364,11 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
     return true;
 }
 
+template <size_t N>
+void copy_n_layer_window(const uint32_t (&source)[N], uint32_t * destination) {
+    std::copy(std::begin(source), std::end(source), destination);
+}
+
 void gpt_init() {
     llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
         if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_thold) {
@@ -819,6 +826,24 @@ std::string fs_get_cache_file(const std::string & filename) {
 //
 // Model utils
 //
+static void llama_assign_n_layer_window(
+        uint32_t n_world,
+        uint32_t my_rank,
+        const device_info * dev_info_set,
+        uint32_t * n_layer_window,
+        struct llama_model * model) {
+    GGML_ASSERT(dev_info_set != nullptr);
+    GGML_ASSERT(n_layer_window != nullptr);
+
+    uint32_t n_layer = llama_model_n_layers(model);
+    if (n_world == 1) {
+        n_layer_window[0] = n_layer;
+        return;
+    }
+
+    std::fill_n(n_layer_window, n_world, DEFAULT_N_LAYER_WINDOW);
+}
+
 struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     llama_init_result iparams;
     auto mparams = llama_model_params_from_gpt_params(params);
@@ -838,6 +863,8 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         return iparams;
     }
 
+    llama_model_loader * ml = llama_model_load(params.model.c_str(), model, &mparams);
+
     if (params.reranking) {
         bool ok = true;
 
@@ -871,21 +898,49 @@
     struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
     llama_context * lctx = llama_new_context_with_model(model, cparams);
 
+    uint32_t n_world = cparams.n_world;
+    uint32_t my_rank = cparams.rank;
+
     // initialize sockets
-    llama_init_sockets(lctx, cparams.n_world, cparams.rank);
+    llama_init_sockets(lctx, n_world, my_rank);
 
     // sychronize device profile to the master node
     struct device_info * dev_info_set = nullptr;
-    if (params.rank == 0) {
-        dev_info_set = (struct device_info *)malloc(cparams.n_world * sizeof(struct device_info));
+    if (my_rank == 0) {
+        dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info));
         dev_info_set[0] = dev_info;
-        llama_collect_device_info(dev_info_set, lctx);
-        device_print_props(dev_info_set, cparams.n_world);
+        llama_gather_device_info(lctx, dev_info_set);
+        device_print_props(dev_info_set, n_world);
     } else {
-        llama_send_device_info(&dev_info, lctx);
+        llama_send_device_info(lctx, &dev_info);
     }
 
-    if (llama_context_setup_backend(lctx) == nullptr) {
+    uint32_t n_layer_window[32] = {0};
+    if (my_rank == 0) {
+        if (n_world == 1 || params.n_layer_window[0] == 0) {
+            llama_assign_n_layer_window(n_world, my_rank, dev_info_set, n_layer_window, model);
+        } else {
+            copy_n_layer_window(params.n_layer_window, n_layer_window);
+        }
+
+        // synchronize the new n_layer_window to other nodes
+        llama_broadcast_n_layer_window(lctx, n_layer_window);
+    } else {
+        llama_recv_n_layer_window(lctx, n_layer_window);
+    }
+
+    // update n_layer_window
+    copy_n_layer_window(n_layer_window, params.n_layer_window);
+    copy_n_layer_window(n_layer_window, cparams.n_layer_window);
+    copy_n_layer_window(n_layer_window, mparams.n_layer_window);
+    copy_n_layer_window(n_layer_window, llama_context_n_layer_window(lctx));
+
+    if (!mparams.vocab_only && llm_load_tensors(ml, model, mparams) < 0) {
+        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
+        return iparams;
+    }
+
+    if (llama_context_setup_backend(model, cparams, lctx) == nullptr) {
         LOG_ERR("%s: failed to setup context with model '%s'\n", __func__, params.model.c_str());
         llama_free_model(model);
         return iparams;
diff --git a/common/common.h b/common/common.h
index c074106a..f04691db 100644
--- a/common/common.h
+++ b/common/common.h
@@ -144,7 +144,7 @@ struct gpt_sampler_params {
 struct gpt_params {
     int32_t n_world = 1; // number of devices to use
     int32_t rank = 0;    // my rank for distributed inference
-    uint32_t n_layer_window[32] = {32}; // layer window size on each node
+    uint32_t n_layer_window[32] = {0};  // layer window size on each node
     std::string master_ip = "localhost";    // ip address of the master node
     std::string next_node_ip = "localhost"; // ip address of my next node
     bool unload = false; // unload layer weights after use or not
diff --git a/common/profiler.cpp b/common/profiler.cpp
index 487f3379..44abfe6a 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -316,7 +316,7 @@ void device_print_props(struct device_info * dev_info_set, int n) {
     LOG_INF("| Property ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| Rank %-8d", i);
-        GGML_ASSERT(dev_info_set[i].rank == i);
+        GGML_ASSERT((int)dev_info_set[i].rank == i);
     }
 
     LOG_INF("\n-------------------------------------------------------------------------------------------\n");
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 931c9b4a..39d4b60c 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -142,21 +142,24 @@ int main(int argc, char ** argv) {
     if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
         return 1;
     }
+
     const uint32_t n_world = params.n_world;
     const uint32_t my_rank = params.rank;
+    GGML_ASSERT(!(n_world == 1 && my_rank > 0));
 
     // check if --n-layer-window and --world is matched
-    uint32_t non_zero_count = 0;
-    size_t size = sizeof(params.n_layer_window) / sizeof(params.n_layer_window[0]);
-    for (size_t i = 0; i < size; ++i) {
-        if (params.n_layer_window[i] != 0) {
-            ++non_zero_count;
+    if (my_rank == 0) {
+        uint32_t non_zero_count = 0;
+        size_t size = sizeof(params.n_layer_window) / sizeof(params.n_layer_window[0]);
+        for (size_t i = 0; i < size; ++i) {
+            if (params.n_layer_window[i] != 0) {
+                ++non_zero_count;
+            }
         }
+        GGML_ASSERT((non_zero_count == 0 || non_zero_count == n_world) \
+            && "Number of non-zero values in --n-layer-window must equal --world");
     }
-    GGML_ASSERT(!(n_world == 1 && my_rank > 0));
-    GGML_ASSERT(non_zero_count == n_world && "Number of non-zero values in --n-layer-window must equal --world");
-
     gpt_init();
 
     auto & sparams = params.sparams;
diff --git a/include/llama.h b/include/llama.h
index e3890666..ff7d1599 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -434,14 +434,26 @@
     LLAMA_API void llama_init_sockets (struct llama_context * ctx, uint32_t n_world, uint32_t my_rank);
     LLAMA_API void llama_free_sockets (struct llama_context * ctx, char ** msg);
 
-    LLAMA_API int llama_collect_device_info(struct device_info * dev_info_set, struct llama_context * ctx);
-    LLAMA_API int llama_send_device_info (struct device_info * dev_info, struct llama_context * ctx);
+    LLAMA_API int llama_gather_device_info (struct llama_context * ctx, struct device_info * dev_info_set);
+    LLAMA_API int llama_send_device_info (struct llama_context * ctx, struct device_info * dev_info);
+    LLAMA_API int llama_broadcast_n_layer_window(struct llama_context * ctx, uint32_t * n_layer_window);
+    LLAMA_API int llama_recv_n_layer_window(struct llama_context * ctx, uint32_t * n_layer_window);
+
+    LLAMA_API int llm_load_tensors(
+            struct llama_model_loader * ml,
+            struct llama_model * model,
+            struct llama_model_params params);
 
-    // TODO: rename to llama_init_from_model
     LLAMA_API struct llama_context * llama_new_context_with_model(
             struct llama_model * model,
             struct llama_context_params params);
-    LLAMA_API void * llama_context_setup_backend(struct llama_context * ctx);
+
+    LLAMA_API void * llama_context_setup_backend(
+            struct llama_model * model,
+            struct llama_context_params params,
+            struct llama_context * ctx);
+
+    LLAMA_API uint32_t * llama_context_n_layer_window(struct llama_context * ctx);
 
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
@@ -491,12 +503,17 @@
     // Get metadata value as a string by index
     LLAMA_API int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
 
+    LLAMA_API struct llama_model_loader * llama_model_load(const char * fname, struct llama_model * model, struct llama_model_params * params);
+
     // Get a string describing the model type
     LLAMA_API int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
 
     // Returns the total size of all the tensors in the model in bytes
     LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
 
+    // Returns the number of layers in the model
+    LLAMA_API uint32_t llama_model_n_layers(const struct llama_model * model);
+
     // Returns the total number of parameters in the model
     LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
diff --git a/src/llama.cpp b/src/llama.cpp
index 9a552ee2..b77e7ca9 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2572,8 +2572,6 @@ struct llama_cparams {
     uint32_t n_layer_window[32];
     bool unload;
     uint32_t n_ctx; // context size used during inference
-    ggml_type type_k;
-    ggml_type type_v;
     uint32_t n_batch;
     uint32_t n_ubatch;
     uint32_t n_seq_max;
@@ -7137,7 +7135,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 }
 
 // Returns false if cancelled by progress_callback
-static bool llm_load_tensors(
+static bool llm_load_tensors_impl(
         llama_model_loader & ml,
         llama_model & model,
         uint32_t n_world,
@@ -9159,43 +9157,58 @@
     return true;
 }
 
-// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
-static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
-    model.t_start_us = ggml_time_us();
+int llm_load_tensors(
+        struct llama_model_loader * ml,
+        struct llama_model * model,
+        struct llama_model_params params) {
+    model->t_start_us = ggml_time_us();
 
     try {
-        llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
+        if (!llm_load_tensors_impl(
+            *ml, *model, params.n_world, params.rank, params.n_layer_window, params.n_gpu_layers, params.split_mode,
+            params.main_gpu, params.use_mlock, params.progress_callback, params.progress_callback_user_data
+        )) {
+            return -2;
+        }
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
+        return -1;
+    }
+
+    model->t_load_us = ggml_time_us() - model->t_start_us;
+    return 0;
+}
+
+// Returns the llama_model_loader on success, throws std::runtime_error on failure
+static llama_model_loader * llama_model_load_impl(const std::string & fname, llama_model & model, llama_model_params & params) {
+    try {
+        llama_model_loader * ml = new llama_model_loader(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
 
         model.hparams.vocab_only = params.vocab_only;
 
         try {
-            llm_load_arch(ml, model);
+            llm_load_arch(*ml, model);
         } catch(const std::exception & e) {
             throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
         }
         try {
-            llm_load_hparams(ml, model);
+            llm_load_hparams(*ml, model);
         } catch(const std::exception & e) {
             throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
         }
         try {
-            llm_load_vocab(ml, model);
+            llm_load_vocab(*ml, model);
         } catch(const std::exception & e) {
             throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
         }
 
-        llm_load_print_meta(ml, model);
+        llm_load_print_meta(*ml, model);
 
         if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
             model.hparams.n_vocab != model.vocab.id_to_token.size()) {
             throw std::runtime_error("vocab size mismatch");
         }
 
-        if (params.vocab_only) {
-            LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
-            return 0;
-        }
-
 #ifdef GGML_USE_KOMPUTE
         if (params.n_gpu_layers > 0 && (
             !(model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON)
@@ -9213,22 +9226,14 @@
         }
 #endif
 
-        if (!llm_load_tensors(
-            ml, model, params.n_world, params.rank, params.n_layer_window, params.n_gpu_layers, params.split_mode,
-            params.main_gpu, params.use_mlock, params.progress_callback, params.progress_callback_user_data
-        )) {
-            return -2;
-        }
+        return ml;
     } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
-        return -1;
+        throw std::runtime_error("error loading model: " + std::string(err.what()));
     }
+}
 
-    // loading time will be recalculate after the first eval, so
-    // we take page faults deferred by mmap() into consideration
-    model.t_load_us = ggml_time_us() - model.t_start_us;
-
-    return 0;
+struct llama_model_loader * llama_model_load(const char * fname, struct llama_model * model, struct llama_model_params * params) {
+    return llama_model_load_impl(std::string(fname), *model, *params);
 }
 
 //
@@ -17383,6 +17388,28 @@ static void llama_send_meta(zmq::socket_t & socket, struct sync_meta * meta) {
     }
 }
 
+static int llama_recv_meta(zmq::socket_t & socket, struct sync_meta * meta) {
+    socket.set(zmq::sockopt::rcvtimeo, 1000);
+
+    std::vector<zmq::message_t> recv_msgs;
+    if (!zmq::recv_multipart(socket, std::back_inserter(recv_msgs))) {
+        return -1;
+    }
+
+    socket.set(zmq::sockopt::rcvtimeo, -1);
+
+    for (size_t i = 0; i < recv_msgs.size(); i += 2) {
+        std::string key = recv_msgs[i].to_string();
+        zmq::message_t & data_msg = recv_msgs[i + 1];
+
+        if (key == "n_tokens") {
+            GGML_ASSERT(data_msg.size() == sizeof(meta->n_tokens));
+            std::memcpy(&(meta->n_tokens), data_msg.data(), sizeof(meta->n_tokens));
+        }
+    }
+    return 0;
+}
+
 static void llama_send_tensors(zmq::socket_t & socket, struct llama_ubatch * ubatch, struct input_tensors * tensors) {
     try {
         std::vector<zmq::message_t> send_msgs;
@@ -17406,28 +17433,6 @@
     }
 }
 
-static int llama_recv_meta(zmq::socket_t & socket, struct sync_meta * meta) {
-    socket.set(zmq::sockopt::rcvtimeo, 1000);
-
-    std::vector<zmq::message_t> recv_msgs;
-    if (!zmq::recv_multipart(socket, std::back_inserter(recv_msgs))) {
-        return -1;
-    }
-
-    socket.set(zmq::sockopt::rcvtimeo, -1);
-
-    for (size_t i = 0; i < recv_msgs.size(); i += 2) {
-        std::string key = recv_msgs[i].to_string();
-        zmq::message_t & data_msg = recv_msgs[i + 1];
-
-        if (key == "n_tokens") {
-            GGML_ASSERT(data_msg.size() == sizeof(meta->n_tokens));
-            std::memcpy(&(meta->n_tokens), data_msg.data(), sizeof(meta->n_tokens));
-        }
-    }
-    return 0;
-}
-
 static void llama_recv_tensors(zmq::socket_t & socket, struct llama_ubatch * ubatch, const bool is_out_embd=false) {
     std::vector<zmq::message_t> recv_msgs;
     if (!zmq::recv_multipart(socket, std::back_inserter(recv_msgs))) {
@@ -19523,7 +19528,7 @@ struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
         /*.n_world        =*/ 1,
         /*.rank           =*/ 0,
-        /*.n_layer_window =*/ {32},
+        /*.n_layer_window =*/ {0},
         /*.n_gpu_layers   =*/ 0,
         /*.split_mode     =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu       =*/ 0,
@@ -19726,17 +19731,7 @@ struct llama_model * llama_load_model_from_file(const char * path_model, struct
         }
     }
 
-    int status = llama_model_load(path_model, *model, params);
-    GGML_ASSERT(status <= 0);
-    if (status < 0) {
-        if (status == -1) {
-            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
-        } else if (status == -2) {
-            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
-        }
-        delete model;
-        return nullptr;
-    }
+    (void)path_model;
 
     return model;
 }
@@ -19784,7 +19779,7 @@
     }
 }
 
-int llama_collect_device_info(struct device_info * dev_info_set, struct llama_context * ctx) {
+int llama_gather_device_info(struct llama_context * ctx, struct device_info * dev_info_set) {
     uint32_t n_world = ctx->cparams.n_world;
     if (n_world == 1) {
         return 0;
@@ -19818,7 +19813,7 @@
     return 0;
 }
 
-int llama_send_device_info(struct device_info * dev_info, struct llama_context * ctx) {
+int llama_send_device_info(struct llama_context * ctx, struct device_info * dev_info) {
     std::vector<zmq::message_t> recv_msgs;
     if (!zmq::recv_multipart(*ctx->recv_socket, std::back_inserter(recv_msgs))) {
         return -1;
@@ -19841,6 +19836,59 @@
     }
 }
 
+int llama_broadcast_n_layer_window(struct llama_context * ctx, uint32_t * n_layer_window) {
+    uint32_t n_world = ctx->cparams.n_world;
+    if (n_world == 1) {
+        return 0;
+    }
+
+    GGML_ASSERT(ctx != nullptr && ctx->send_socket != nullptr);
+    try {
+        std::vector<zmq::message_t> send_msgs;
+
+        send_msgs.emplace_back("n_layer_window", strlen("n_layer_window"));
+        send_msgs.emplace_back(n_layer_window, sizeof(uint32_t) * 32);
+
+        zmq::send_multipart(*ctx->send_socket, send_msgs);
+    } catch (const zmq::error_t& e) {
+        LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
+        return -1;
+    }
+
+    return 0;
+}
+
+int llama_recv_n_layer_window(struct llama_context * ctx, uint32_t * n_layer_window) {
+    uint32_t n_world = ctx->cparams.n_world;
+    uint32_t my_rank = ctx->cparams.rank;
+
+    std::vector<zmq::message_t> recv_msgs;
+    if (!zmq::recv_multipart(*ctx->recv_socket, std::back_inserter(recv_msgs))) {
+        return -1;
+    }
+
+    std::string key = recv_msgs[0].to_string();
+    if (key != "n_layer_window") {
+        LLAMA_LOG_INFO("Unexpected message received: %s\n", key.c_str());
+        return -1;
+    }
+
+    zmq::message_t & data_msg = recv_msgs[1];
+    GGML_ASSERT(data_msg.size() == sizeof(uint32_t) * 32);
+    memcpy(n_layer_window, data_msg.data(), sizeof(uint32_t) * 32);
+
+    if (my_rank != n_world - 1) {
+        try {
+            zmq::send_multipart(*ctx->send_socket, recv_msgs);
+        } catch (const zmq::error_t& e) {
+            LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
 void llama_free_sockets(struct llama_context * ctx, char ** msg) {
     const uint32_t n_world = ctx->cparams.n_world;
     const uint32_t my_rank = ctx->cparams.rank;
@@ -19873,6 +19921,25 @@
 struct llama_context * llama_new_context_with_model(
             struct llama_model * model,
             struct llama_context_params params) {
+
+    if (!model) {
+        LLAMA_LOG_ERROR("%s: model cannot be NULL\n", __func__);
+        return nullptr;
+    }
+
+    llama_context * ctx = new llama_context(*model);
+
+    ctx->master_ip = params.master_ip;
+    ctx->next_node_ip = params.next_node_ip;
+    ctx->cparams.n_world = params.n_world;
+    ctx->cparams.rank = params.rank;
+    return ctx;
+}
+
+void * llama_context_setup_backend(
+        struct llama_model * model,
+        struct llama_context_params params,
+        struct llama_context * ctx) {
 
     if (!model) {
         LLAMA_LOG_ERROR("%s: model cannot be NULL\n", __func__);
@@ -19904,13 +19971,9 @@
         return nullptr;
     }
 
-    llama_context * ctx = new llama_context(*model);
-
     const auto & hparams = model->hparams;
     auto & cparams = ctx->cparams;
 
-    cparams.n_world = params.n_world;
-    cparams.rank = params.rank;
     std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);
     cparams.unload = params.unload;
     cparams.n_seq_max = std::max(1u, params.n_seq_max);
@@ -19927,9 +19990,7 @@
     cparams.no_perf = params.no_perf;
     cparams.pooling_type = params.pooling_type;
 
-    cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
-    cparams.type_k = params.type_k;
-    cparams.type_v = params.type_v;
+    cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
     cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
     cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
@@ -19985,10 +20046,6 @@
         cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
     }
 
-    ctx->master_ip = params.master_ip;
-    ctx->next_node_ip = params.next_node_ip;
-
-    LLAMA_LOG_INFO("\n");
     LLAMA_LOG_INFO("%s: n_world      = %u\n", __func__, cparams.n_world);
     LLAMA_LOG_INFO("%s: rank         = %u\n", __func__, cparams.rank);
    LLAMA_LOG_INFO("%s: win_size     = %u\n", __func__, cparams.n_layer_window[cparams.rank]);
@@ -19998,8 +20055,8 @@
     LLAMA_LOG_INFO("%s: flash_attn   = %d\n", __func__, cparams.flash_attn);
     LLAMA_LOG_INFO("%s: freq_base    = %.1f\n", __func__, cparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale   = %g\n", __func__, cparams.rope_freq_scale);
-    LLAMA_LOG_INFO("%s: master_ip    = %s\n", __func__, ctx->master_ip.c_str());
-    LLAMA_LOG_INFO("%s: next_node_ip = %s\n", __func__, ctx->next_node_ip.c_str());
+    LLAMA_LOG_INFO("%s: master_ip    = %s\n", __func__, ctx->master_ip.c_str());
+    LLAMA_LOG_INFO("%s: next_node_ip = %s\n", __func__, ctx->next_node_ip.c_str());
 
     ctx->abort_callback = params.abort_callback;
     ctx->abort_callback_data = params.abort_callback_data;
@@ -20009,18 +20066,9 @@
     // build worst-case graph for encoder if a model contains encoder
     ctx->is_encoding = llama_model_has_encoder(model);
 
-    return ctx;
-}
-
-void * llama_context_setup_backend(struct llama_context * ctx) {
-    GGML_ASSERT(ctx != nullptr);
-    const auto * model = &ctx->model;
-    const auto & hparams = ctx->model.hparams;
-    const auto & cparams = ctx->cparams;
-
     uint32_t kv_size = cparams.n_ctx;
-    ggml_type type_k = cparams.type_k;
-    ggml_type type_v = cparams.type_v;
+    ggml_type type_k = params.type_k;
+    ggml_type type_v = params.type_v;
 
     // Mamba only needs a constant number of KV cache cells per sequence
     if (llama_model_is_recurrent(model)) {
@@ -20333,6 +20381,10 @@
     return ctx;
 }
 
+uint32_t * llama_context_n_layer_window(struct llama_context * ctx) {
+    return ctx->cparams.n_layer_window;
+}
+
 void llama_free(struct llama_context * ctx) {
     delete ctx;
 }
@@ -20511,6 +20563,10 @@
     return size;
 }
 
+uint32_t llama_model_n_layers(const struct llama_model * model) {
+    return model->hparams.n_layer;
+}
+
 uint64_t llama_model_n_params(const struct llama_model * model) {
     uint64_t nparams = 0;
     for (const auto & it : model->tensors_by_name) {
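
For reference, the sketch below (not part of the patch) illustrates the per-node initialization order that these changes establish, using only the APIs declared in include/llama.h above. The helper name init_node, the profiler.h header name, and the way dev_info is obtained are assumptions for illustration; error handling and the actual window-assignment policy are elided.

// Sketch only: mirrors the reordered flow of llama_init_from_gpt_params() after this patch.
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include "llama.h"
#include "profiler.h" // assumed location of struct device_info in this fork

static void init_node(const char * model_path, uint32_t n_world, uint32_t my_rank) {
    llama_model_params   mparams = llama_model_default_params();
    llama_context_params cparams = llama_context_default_params();
    mparams.n_world = n_world; mparams.rank = my_rank;
    cparams.n_world = n_world; cparams.rank = my_rank;

    // 1. create the model object and load metadata only (arch, hparams, vocab);
    //    tensor loading is deferred until the layer windows are known
    llama_model        * model = llama_load_model_from_file(model_path, mparams);
    llama_model_loader * ml    = llama_model_load(model_path, model, &mparams);

    // 2. create the context and bring up the ring sockets before any tensor work
    llama_context * ctx = llama_new_context_with_model(model, cparams);
    llama_init_sockets(ctx, n_world, my_rank);

    // 3. rank 0 gathers device profiles, decides n_layer_window, and broadcasts it;
    //    other ranks send their profile and wait for the decision
    uint32_t n_layer_window[32] = {0};
    struct device_info dev_info = {}; // this node's profile (filled by the profiler in the real code)
    if (my_rank == 0) {
        device_info * dev_info_set = (device_info *) malloc(n_world * sizeof(device_info));
        dev_info_set[0] = dev_info;
        llama_gather_device_info(ctx, dev_info_set);
        // ... pick the windows here, e.g. all layers on rank 0 when n_world == 1 ...
        llama_broadcast_n_layer_window(ctx, n_layer_window);
        free(dev_info_set);
    } else {
        llama_send_device_info(ctx, &dev_info);
        llama_recv_n_layer_window(ctx, n_layer_window);
    }
    std::copy(n_layer_window, n_layer_window + 32, mparams.n_layer_window);
    std::copy(n_layer_window, n_layer_window + 32, llama_context_n_layer_window(ctx));

    // 4. only now load this node's tensor window and finish setting up the backend
    if (!mparams.vocab_only && llm_load_tensors(ml, model, mparams) < 0) { /* handle error */ }
    if (llama_context_setup_backend(model, cparams, ctx) == nullptr)     { /* handle error */ }
}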