From 76a7fc75275c5385bff5e383bae074b57beb3d8a Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Sat, 26 Oct 2024 12:34:14 +0400
Subject: [PATCH] support different window sizes

---
 common/arg.cpp         |  33 +++++-
 common/common.cpp      |  11 +-
 common/common.h        |   2 +-
 examples/main/main.cpp |  15 ++-
 include/llama.h        |   4 +-
 src/llama.cpp          | 262 +++++++++++++++++++++++------------
 6 files changed, 200 insertions(+), 127 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index b0904165..794ebfbf 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -677,9 +677,36 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
     ).set_env("LLAMA_ARG_RANK"));
     add_opt(llama_arg(
         {"-lw", "--layer-window", "--n-layer-window"}, "N",
-        format("number of layers to process in each compute (default: %d)", params.n_layer_window),
-        [](gpt_params & params, int value) {
-            params.n_layer_window = value;
+        format("number of layers to process in each compute (e.g., 16,16)"),
+        [](gpt_params & params, const std::string & value) {
+            uint32_t result[32] = {0};
+            size_t index = 0;
+            std::stringstream ss(value);
+            std::string item;
+
+            while (std::getline(ss, item, ',')) {
+                try {
+                    int num = std::stoi(item);
+
+                    if (num <= 0) {
+                        throw std::runtime_error("All values in --n-layer-window must be non-zero positive integers");
+                    }
+
+                    if (index >= 32) {
+                        throw std::runtime_error("Too many values in --n-layer-window (maximum is 32)");
+                    }
+
+                    result[index++] = static_cast<uint32_t>(num);
+                } catch (const std::invalid_argument &) {
+                    throw std::runtime_error("Non-integer value found in --n-layer-window");
+                }
+            }
+
+            if (index == 0) {
+                throw std::runtime_error("Input cannot be empty");
+            }
+
+            std::copy(std::begin(result), std::end(result), params.n_layer_window);
         }
     ).set_env("LLAMA_ARG_N_LAYER_WINDOW"));
     add_opt(llama_arg(
diff --git a/common/common.cpp b/common/common.cpp
index b08f51bc..e72e07ee 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -858,7 +858,6 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 
         if (!ok) {
             llama_free_model(model);
-            return iparams;
         }
     }
 
@@ -986,7 +985,6 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     }
     mparams.n_world = params.n_world;
     mparams.rank = params.rank;
-    mparams.n_layer_window = params.n_layer_window;
    mparams.rpc_servers = params.rpc_servers.c_str();
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
@@ -994,6 +992,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     mparams.use_mmap = params.use_mmap;
     mparams.use_mlock = params.use_mlock;
     mparams.check_tensors = params.check_tensors;
+    std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), mparams.n_layer_window);
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
     } else {
@@ -1036,10 +1035,10 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
 
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
     auto cparams = llama_context_default_params();
-    cparams.n_world = params.n_world;
-    cparams.rank = params.rank;
-    cparams.n_layer_window = params.n_layer_window;
-    cparams.unload = params.unload;
+    cparams.n_world = params.n_world;
+    cparams.rank    = params.rank;
+    cparams.unload  = params.unload;
+    std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);
 
     if (cparams.master_ip != nullptr) {
         delete[] cparams.master_ip;
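
The handler above replaces the old single-integer flag: --n-layer-window now takes a comma-separated list with one window size per node (for example, --n-layer-window 16,16 alongside --world 2), and the full 32-slot array is copied into the model and context params so unset trailing slots stay zero. A minimal standalone sketch of the parsing behavior; the helper name and test values are illustrative, not part of the patch:

    // Sketch: mirrors the --n-layer-window handler above. "16,8,8" parses
    // to {16, 8, 8, 0, ..., 0}; trailing zeros mark ranks that do not exist.
    #include <cassert>
    #include <cstdint>
    #include <sstream>
    #include <stdexcept>
    #include <string>

    static void parse_layer_window(const std::string & value, uint32_t (&out)[32]) {
        size_t index = 0;
        std::stringstream ss(value);
        std::string item;
        while (std::getline(ss, item, ',')) {
            int num = std::stoi(item);                    // throws std::invalid_argument on junk
            if (num <= 0)    { throw std::runtime_error("window sizes must be positive"); }
            if (index >= 32) { throw std::runtime_error("at most 32 window sizes"); }
            out[index++] = static_cast<uint32_t>(num);
        }
        if (index == 0) { throw std::runtime_error("window list cannot be empty"); }
    }

    int main() {
        uint32_t win[32] = {0};
        parse_layer_window("16,8,8", win);
        assert(win[0] == 16 && win[1] == 8 && win[2] == 8 && win[3] == 0);
        return 0;
    }
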
diff --git a/common/common.h b/common/common.h
index 42db03c0..c074106a 100644
--- a/common/common.h
+++ b/common/common.h
@@ -144,7 +144,7 @@ struct gpt_sampler_params {
 struct gpt_params {
     int32_t n_world = 1; // number of devices to use
     int32_t rank = 0;    // my rank for distributed inference
-    int32_t n_layer_window = 32;        // number of layers to process in each compute
+    uint32_t n_layer_window[32] = {32}; // layer window size on each node
     std::string master_ip = "localhost";    // ip address of the master node
     std::string next_node_ip = "localhost"; // ip address of my next node
     bool unload = false; // unload layer weights after use or not
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index d85a7bb0..931c9b4a 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -142,9 +142,20 @@ int main(int argc, char ** argv) {
     if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
         return 1;
     }
-    const uint32_t n_world = params.n_world;
-    const uint32_t my_rank = params.rank;
+    const uint32_t n_world  = params.n_world;
+    const uint32_t my_rank  = params.rank;
+
+    // check that --n-layer-window and --world match
+    uint32_t non_zero_count = 0;
+    size_t size = sizeof(params.n_layer_window) / sizeof(params.n_layer_window[0]);
+    for (size_t i = 0; i < size; ++i) {
+        if (params.n_layer_window[i] != 0) {
+            ++non_zero_count;
+        }
+    }
+
     GGML_ASSERT(!(n_world == 1 && my_rank > 0));
+    GGML_ASSERT(non_zero_count == n_world && "Number of non-zero values in --n-layer-window must equal --world");
 
     gpt_init();
diff --git a/include/llama.h b/include/llama.h
index e1cad355..9913ce1b 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -278,7 +278,7 @@ extern "C" {
     struct llama_model_params {
         uint32_t n_world;            // number of nodes
         uint32_t rank;               // my node rank
-        uint32_t n_layer_window;     // number of layers to kept each time
+        uint32_t n_layer_window[32]; // number of layers to keep each time
         int32_t n_gpu_layers;        // number of layers to store in VRAM
 
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
@@ -317,7 +317,7 @@ extern "C" {
     struct llama_context_params {
         uint32_t n_world;            // world size
         uint32_t rank;               // my rank
-        uint32_t n_layer_window;     // number of layers to process in each compute
+        uint32_t n_layer_window[32]; // number of layers to process in each compute
         bool unload;                 // whether to unload layer weights after use
         char * master_ip;            // ip address of the master node
         char * next_node_ip;         // ip address of the next node
diff --git a/src/llama.cpp b/src/llama.cpp
index 0b4c12ae..39d1cf5b 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2565,7 +2565,7 @@ static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams m
 struct llama_cparams {
     uint32_t n_world;
     uint32_t rank;
-    uint32_t n_layer_window;
+    uint32_t n_layer_window[32];
     bool unload;
     uint32_t n_ctx; // context size used during inference
     uint32_t n_batch;
@@ -3619,25 +3619,52 @@ static bool this_layer_is_mine(
     uint32_t layer_id,
     uint32_t n_world,
     uint32_t my_rank,
-    uint32_t n_layer_window) {
-    return (layer_id / n_layer_window) % n_world == my_rank;
+    const uint32_t * n_layer_window) {
+    uint32_t cumulative_layers = 0;
+    uint32_t rank = 0;
+    while (true) {
+        cumulative_layers += n_layer_window[rank];
+        if (layer_id < cumulative_layers) {
+            return rank == my_rank;
+        }
+        rank = (rank + 1) % n_world;
+    }
 }
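
The rewritten ownership test deals layers out in contiguous per-rank windows instead of equal fixed-size blocks, cycling through the ranks until every layer is covered. A standalone sketch of that rule; the window sizes {16, 8, 8} are assumed example values:

    // Duplicates the ownership rule above for illustration. With windows
    // {16, 8, 8}: rank 0 owns layers 0-15, rank 1 owns 16-23, rank 2 owns
    // 24-31, then the pattern repeats from layer 32.
    #include <cassert>
    #include <cstdint>

    static bool this_layer_is_mine(uint32_t layer_id, uint32_t n_world,
                                   uint32_t my_rank, const uint32_t * w) {
        uint32_t cum = 0, rank = 0;
        while (true) {
            cum += w[rank];
            if (layer_id < cum) return rank == my_rank;
            rank = (rank + 1) % n_world;
        }
    }

    int main() {
        const uint32_t w[3] = {16, 8, 8};
        assert( this_layer_is_mine(0,  3, 0, w));   // layers 0-15  -> rank 0
        assert( this_layer_is_mine(20, 3, 1, w));   // layers 16-23 -> rank 1
        assert( this_layer_is_mine(30, 3, 2, w));   // layers 24-31 -> rank 2
        assert( this_layer_is_mine(32, 3, 0, w));   // cycle repeats at 32
        return 0;
    }
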
 
-static int64_t map_layer_to_local_id(
-    int64_t layer_id,
-    uint32_t n_world,
-    uint32_t my_rank,
-    uint32_t n_layer_window) {
+static int32_t map_layer_to_local_id(
+    uint32_t layer_id,
+    uint32_t n_world,
+    uint32_t my_rank,
+    const uint32_t * n_layer_window)
+{
     if (!this_layer_is_mine(layer_id, n_world, my_rank, n_layer_window)) {
         return -1;
     }
-    // map layer_id to kvcache_id.
-    // example: For n_world=2 and n_layer_window=4, rank 0 handles layers 0-3, 8-11, 16-19, while rank 1 handles layers 4-7, 12-15, 20-23.
-    // on rank 0, layer_id should map to kvcache_id as follows: 0-3 -> 0-3, 8-11 -> 4-7, 16-19 -> 8-11.
-    // on rank 1, layer_id should map to kvcache_id as follows: 4-7 -> 0-3, 12-15 -> 4-7, 20-23 -> 8-11.
-    int64_t cycle_size = n_world * n_layer_window;
-    int64_t local_offset = (layer_id / cycle_size) * n_layer_window;
-    return (layer_id % cycle_size) % n_layer_window + local_offset;
+
+    uint32_t cycle_size = 0;
+    for (uint32_t i = 0; i < n_world; ++i) {
+        cycle_size += n_layer_window[i];
+    }
+
+    uint32_t cycle_offset      = layer_id % cycle_size;
+    uint32_t cumulative_layers = 0;
+    uint32_t local_offset      = (layer_id / cycle_size) * n_layer_window[my_rank];
+
+    for (uint32_t rank = 0; rank < n_world; ++rank) {
+        uint32_t window_size = n_layer_window[rank];
+
+        if (cycle_offset < cumulative_layers + window_size) {
+            if (rank == my_rank) {
+                return cycle_offset - cumulative_layers + local_offset;
+            } else {
+                return -1;
+            }
+        }
+
+        cumulative_layers += window_size;
+    }
+
+    return -1;
 }
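
map_layer_to_local_id compresses each rank's scattered global layer ids into a dense local index for the KV cache and layer arrays. A worked standalone example of the same arithmetic; the windows {4, 2} are assumed for illustration:

    // With w = {4, 2} (cycle_size = 6): rank 0 owns global layers 0-3 and
    // 6-9, mapped to local slots 0-3 and 4-7; rank 1 owns 4-5 and 10-11,
    // mapped to local slots 0-1 and 2-3.
    #include <cassert>
    #include <cstdint>

    static int32_t map_local(uint32_t layer, uint32_t n_world, uint32_t my_rank,
                             const uint32_t * w) {
        uint32_t cycle = 0;
        for (uint32_t r = 0; r < n_world; ++r) cycle += w[r];
        uint32_t off  = layer % cycle;                 // position inside this cycle
        uint32_t base = (layer / cycle) * w[my_rank];  // local slots used by earlier cycles
        uint32_t cum  = 0;
        for (uint32_t r = 0; r < n_world; ++r) {
            if (off < cum + w[r]) {
                return r == my_rank ? (int32_t)(off - cum + base) : -1;
            }
            cum += w[r];
        }
        return -1;
    }

    int main() {
        const uint32_t w[2] = {4, 2};
        assert(map_local(0,  2, 0, w) == 0);   // rank 0: 0-3   -> 0-3
        assert(map_local(8,  2, 0, w) == 6);   // rank 0: 6-9   -> 4-7, so 8 -> 6
        assert(map_local(4,  2, 1, w) == 0);   // rank 1: 4-5   -> 0-1
        assert(map_local(10, 2, 1, w) == 2);   // rank 1: 10-11 -> 2-3
        assert(map_local(4,  2, 0, w) == -1);  // layer 4 is not owned by rank 0
        return 0;
    }
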
 
 //
@@ -3657,7 +3684,7 @@ static bool llama_kv_cache_init(
     const int64_t  n_layer = hparams.n_layer;
     const uint32_t n_world = cparams.n_world;
     const uint32_t my_rank = cparams.rank;
-    const uint32_t n_layer_window = cparams.n_layer_window;
+    const uint32_t * n_layer_window = cparams.n_layer_window;
 
     cache.has_shift = false;
     cache.recurrent = llama_model_is_recurrent(&model);
@@ -3672,20 +3699,24 @@ static bool llama_kv_cache_init(
 
     // count used buffer types
     std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
-    int64_t local_i;
+    int32_t local_i;
     uint32_t my_layers = 0;
     for (int64_t i = 0; i < n_layer; ++i) {
         if (!this_layer_is_mine(i, n_world, my_rank, n_layer_window)) {
             continue;
         }
-        my_layers++;
+        local_i = map_layer_to_local_id(i, n_world, my_rank, n_layer_window);
+        GGML_ASSERT(local_i != -1);
+
         if (offload) {
             buft_layer_count[model.buft_layer[local_i].buft]++;
         } else {
             buft_layer_count[llama_default_buffer_type_cpu(model, true)]++;
         }
+
+        my_layers++;
     }
 
     // create a context for each buffer type
@@ -3714,13 +3745,14 @@ static bool llama_kv_cache_init(
             continue;
         }
         int local_i = map_layer_to_local_id(i, n_world, my_rank, n_layer_window);
+        GGML_ASSERT(local_i != -1);
 
         const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
         const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
 
         struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[local_i].buft) : cache.ctxs.front();
-        ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
-        ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
+        ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa * kv_size);
+        ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa * kv_size);
         ggml_format_name(k, "cache_k_l%d", i);
         ggml_format_name(v, "cache_v_l%d", i);
         cache.k_l.push_back(k);
@@ -5102,11 +5134,11 @@ struct llama_model_loader {
 
     // Returns false if cancelled by progress_callback
     bool load_all_data(
-            struct ggml_context * ctx,
-            llama_buf_map & bufs,
-            llama_mlocks * lmlocks,
+            struct ggml_context   * ctx,
+            llama_buf_map         & bufs,
+            llama_mlocks          * lmlocks,
             llama_progress_callback progress_callback,
-            void * progress_callback_user_data) {
+            void                  * progress_callback_user_data) {
         GGML_ASSERT(size_data != 0 && "call init_mappings() first");
 
         std::vector<no_init<uint8_t>> read_buf;
@@ -5134,7 +5166,7 @@ struct llama_model_loader {
             }
 
             auto * buft = ggml_backend_buffer_get_type(buf);
-            auto * dev = ggml_backend_buft_get_device(buft);
+            auto * dev  = ggml_backend_buft_get_device(buft);
             if (!dev) {
                 LLAMA_LOG_DEBUG("%s: no device found for buffer type %s for async uploads\n", fn,
                     ggml_backend_buft_name(buft));
@@ -7022,17 +7054,17 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 
 // Returns false if cancelled by progress_callback
 static bool llm_load_tensors(
-        llama_model_loader & ml,
-        llama_model & model,
-        uint32_t n_world,
-        uint32_t my_rank,
-        uint32_t n_layer_window,
-        int n_gpu_layers,
-        enum llama_split_mode split_mode,
-        int main_gpu,
-        bool use_mlock,
+        llama_model_loader    & ml,
+        llama_model           & model,
+        uint32_t                n_world,
+        uint32_t                my_rank,
+        const uint32_t        * n_layer_window,
+        int                     n_gpu_layers,
+        enum llama_split_mode   split_mode,
+        int                     main_gpu,
+        bool                    use_mlock,
         llama_progress_callback progress_callback,
-        void * progress_callback_user_data) {
+        void                  * progress_callback_user_data) {
     auto & hparams = model.hparams;
 
     // check if the value of main_gpu is valid
@@ -7045,13 +7077,9 @@ static bool llm_load_tensors(
     model.split_mode   = split_mode;
     model.main_gpu     = main_gpu;
     model.n_gpu_layers = n_gpu_layers;
-
-    const int n_layer = hparams.n_layer;
+    int n_layer = hparams.n_layer;
     bool use_mmap_buffer = true;
 
-    // there is very little benefit to offloading the input layer, so always keep it on the CPU
-    model.buft_input = llama_default_buffer_type_cpu(model, true);
-
     int my_layers = 0;
     for (int i = 0; i < n_layer; ++i) {
         if (this_layer_is_mine(i, n_world, my_rank, n_layer_window)) {
             my_layers++;
         }
     }
@@ -7062,8 +7090,11 @@ static bool llm_load_tensors(
 
     for (int i = 0; i < n_layer; ++i) {
         if (this_layer_is_mine(i, n_world, my_rank, n_layer_window)) {
-            int local_i = map_layer_to_local_id(i, n_world, my_rank, n_layer_window);
-            if (i % (int)n_layer_window >= (int)n_layer_window - n_gpu_layers) {
+            int32_t local_i     = map_layer_to_local_id(i, n_world, my_rank, n_layer_window);
+            int32_t window_size = static_cast<int32_t>(n_layer_window[my_rank]);
+            GGML_ASSERT(local_i != -1);
+
+            if (local_i % window_size >= window_size - n_gpu_layers) {
                 LLAMA_LOG_INFO("Layer %i assigned to gpu (cache index %i)\n", i, local_i);
                 model.buft_layer[local_i] = llama_default_buffer_type_offload(model, main_gpu);
             } else {
@@ -7073,13 +7104,18 @@ static bool llm_load_tensors(
                 LLAMA_LOG_INFO("Layer %i assigned to cpu (cache index %i)\n", i, local_i);
                 model.buft_layer[local_i] = llama_default_buffer_type_cpu(model, true);
             }
         }
     }
 
-    // assign the output layer
-    if (my_rank == 0 && n_gpu_layers > (int)n_layer_window) {
-        LLAMA_LOG_INFO("Layer output assigned to gpu\n");
-        model.buft_output = llama_default_buffer_type_offload(model, main_gpu);
-    } else {
-        LLAMA_LOG_INFO("Layer output assigned to cpu\n");
-        model.buft_output = llama_default_buffer_type_cpu(model, true);
+    // assign the output layer (locate on node 0 only)
+    if (my_rank == 0) {
+        // there is very little benefit to offloading the input layer, so always keep it on the CPU
+        model.buft_input = llama_default_buffer_type_cpu(model, true);
+
+        if (n_gpu_layers > (int)n_layer_window[0]) {
+            LLAMA_LOG_INFO("Layer output assigned to gpu\n");
+            model.buft_output = llama_default_buffer_type_offload(model, main_gpu);
+        } else {
+            LLAMA_LOG_INFO("Layer output assigned to cpu\n");
+            model.buft_output = llama_default_buffer_type_cpu(model, true);
+        }
     }
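
Two numeric rules are at work above: within a rank's window the last n_gpu_layers local layers go to the GPU, and on rank 0 the output layer is offloaded only when n_gpu_layers exceeds rank 0's own window. A quick self-check of the per-window predicate; window size 8 and 3 GPU layers are assumed example values:

    // Per-window offload rule: local layers with
    // local_i % window_size >= window_size - n_gpu_layers land on the GPU.
    // With window_size = 8 and n_gpu_layers = 3, that is local layers 5-7.
    #include <cassert>
    #include <cstdint>

    int main() {
        const int32_t window_size  = 8;
        const int32_t n_gpu_layers = 3;
        for (int32_t local_i = 0; local_i < window_size; ++local_i) {
            bool on_gpu = local_i % window_size >= window_size - n_gpu_layers;
            assert(on_gpu == (local_i >= 5));
        }
        return 0;
    }
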
 
     // count used buffer types
@@ -7099,7 +7135,7 @@ static bool llm_load_tensors(
     size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
 
     // for moe merged tensors
-    ctx_size += ggml_tensor_overhead()*n_layer*3;
+    ctx_size += ggml_tensor_overhead() * my_layers * 3;
 
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
     for (auto & it : buft_layer_count) {
@@ -7149,8 +7185,8 @@ static bool llm_load_tensors(
         ctx_output_split = ctx_map.at(model.buft_output.buft_matrix);
     }
 
-    auto ctx_for_layer       = [&](int local_i) { return ctx_map.at(model.buft_layer[local_i].buft); };
-    auto ctx_for_layer_split = [&](int local_i) { return ctx_map.at(model.buft_layer[local_i].buft_matrix); };
+    auto ctx_for_layer       = [&](int i) { return ctx_map.at(model.buft_layer[i].buft); };
+    auto ctx_for_layer_split = [&](int i) { return ctx_map.at(model.buft_layer[i].buft_matrix); };
 
     model.layers.resize(my_layers);
@@ -7201,25 +7237,23 @@ static bool llm_load_tensors(
                     layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i),   {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-
+                    layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
                     layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
 
                     if (n_expert == 0) {
                         layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
                         layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
 
                         // optional MLP bias
-                        layer.ffn_gate_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_gate_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff},   llama_model_loader::TENSOR_NOT_REQUIRED);
                         layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff},   llama_model_loader::TENSOR_NOT_REQUIRED);
                     } else {
-                        layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
-
-                        layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_gate_inp  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert});
+                        layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
                         if (layer.ffn_gate_exps) {
-                            layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
+                            layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert});
                             layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff, n_expert});
                         } else {
                             // merge split expert into a single tensor for compatibility with older models
@@ -7231,7 +7265,7 @@ static bool llm_load_tensors(
                             ggml_type type_up   = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
 
                             layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
-                            layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
+                            layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff,   n_embd, n_expert);
                             layer.ffn_up_exps   = ggml_new_tensor_3d(ctx_split, type_up,   n_embd, n_ff, n_expert);
 
                             ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
@@ -7240,9 +7274,9 @@ static bool llm_load_tensors(
 
                             for (uint32_t x = 0; x < n_expert; ++x) {
                                 // the individual experts are loaded into a view of the merged tensor
-                                ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
-                                ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
-                                ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps,   tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
+                                ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, layer.ffn_gate_exps->nb[2] * x);
+                                ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), {n_ff, n_embd}, layer.ffn_down_exps->nb[2] * x);
+                                ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps,   tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, x), {n_embd, n_ff}, layer.ffn_up_exps->nb[2] * x);
                             }
                         }
                     }
@@ -9031,8 +9065,8 @@ static bool llm_load_tensors(
 
     // load tensor data
     for (auto & it : ctx_bufs) {
-        ggml_context * ctx = it.first;
-        auto & bufs = it.second;
+        ggml_context * ctx  = it.first;
+        auto         & bufs = it.second;
         if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
             return false;
         }
@@ -9188,13 +9222,13 @@ static void llm_build_kv_store(
         int32_t kv_head,
         const llm_build_cb & cb,
         int il) {
-    const int64_t n_ctx = cparams.n_ctx;
-    const uint32_t n_world = cparams.n_world;
-    const uint32_t my_rank = cparams.rank;
-    const uint32_t n_layer_window = cparams.n_layer_window;
-    const int local_il = map_layer_to_local_id(il, n_world, my_rank, n_layer_window);
-    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
-    const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+    const int64_t    n_ctx          = cparams.n_ctx;
+    const uint32_t   n_world        = cparams.n_world;
+    const uint32_t   my_rank        = cparams.rank;
+    const uint32_t * n_layer_window = cparams.n_layer_window;
+    const int        local_il       = map_layer_to_local_id(il, n_world, my_rank, n_layer_window);
+    const int64_t    n_embd_k_gqa   = hparams.n_embd_k_gqa(il);
+    const int64_t    n_embd_v_gqa   = hparams.n_embd_v_gqa(il);
 
     GGML_ASSERT(kv.size == n_ctx);
@@ -9554,7 +9588,7 @@ static struct ggml_tensor * llm_build_kqv(
     const llama_cparams & cparams = lctx.cparams;
     const uint32_t n_world = cparams.n_world;
     const uint32_t my_rank = cparams.rank;
-    const uint32_t n_layer_window = cparams.n_layer_window;
+    const uint32_t * n_layer_window = cparams.n_layer_window;
     const int local_il = map_layer_to_local_id(il, n_world, my_rank, n_layer_window);
 
     const int64_t n_ctx = cparams.n_ctx;
@@ -10513,9 +10547,9 @@ struct llm_build_context {
         struct ggml_tensor * inpL = nullptr;
         struct ggml_tensor * inpB = nullptr;
 
-        const uint32_t n_world = this->cparams.n_world;
-        const uint32_t my_rank = this->cparams.rank;
-        const uint32_t n_layer_window = this->cparams.n_layer_window;
+        const uint32_t   n_world        = this->cparams.n_world;
+        const uint32_t   my_rank        = this->cparams.rank;
+        const uint32_t * n_layer_window = this->cparams.n_layer_window;
 
         if (my_rank == 0) {
             // inp_embd - contains the input embedding
@@ -16365,34 +16399,35 @@ static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
     return result;
 }
 
-static uint32_t map_layer_to_subgf_id(uint32_t i, uint32_t my_rank, uint32_t n_world, uint32_t n_layer, uint32_t n_layer_window) {
-    uint32_t global_layer_offset = my_rank * n_layer_window;
-    uint32_t step = n_world * n_layer_window;
-    if (i < n_layer) {
-        uint32_t relative_layer = i % step;
-        if (relative_layer >= global_layer_offset && relative_layer < global_layer_offset + n_layer_window) {
-            return i / step;
-        }
+static int32_t map_layer_to_subgf_id(uint32_t i, uint32_t my_rank, uint32_t n_world, const uint32_t * n_layer_window) {
+    if (!this_layer_is_mine(i, n_world, my_rank, n_layer_window)) {
+        return -1;
     }
-    return -1;
+
+    uint32_t total_window_size = 0;
+    for (uint32_t rank = 0; rank < n_world; ++rank) {
+        total_window_size += n_layer_window[rank];
+    }
+    return i / total_window_size;
 }
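
map_layer_to_subgf_id now advances the subgraph id once per full cycle of all windows, so every layer a rank owns within one cycle lands in the same subgraph. A small sanity check of that division; the windows {4, 2} are assumed example values:

    // With windows {4, 2} the cycle length is 6: rank 0 owns layers 0-3
    // (cycle 0) and 6-9 (cycle 1); rank 1 owns 4-5 and 10-11. Owned layers
    // map to sub-graph i / 6.
    #include <cassert>
    #include <cstdint>

    int main() {
        const uint32_t total_window_size = 4 + 2;
        assert(3u  / total_window_size == 0);   // rank 0, layer 3  -> sub-graph 0
        assert(9u  / total_window_size == 1);   // rank 0, layer 9  -> sub-graph 1
        assert(10u / total_window_size == 1);   // rank 1, layer 10 -> sub-graph 1
        return 0;
    }
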
 
 static std::vector<struct ggml_cgraph *> llama_build_graph(
     llama_context & lctx,
     const llama_ubatch & batch,
     bool worst_case) {
-    const auto & model = lctx.model;
-    const uint32_t n_world = lctx.cparams.n_world;
-    const uint32_t my_rank = lctx.cparams.rank;
-    const uint32_t n_layer_window = lctx.cparams.n_layer_window;
-    const uint32_t n_layer = lctx.model.hparams.n_layer;
+    const auto &     model          = lctx.model;
+    const uint32_t   n_world        = lctx.cparams.n_world;
+    const uint32_t   my_rank        = lctx.cparams.rank;
+    const uint32_t * n_layer_window = lctx.cparams.n_layer_window;
+    const uint32_t   n_layer        = lctx.model.hparams.n_layer;
 
     // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
     llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
         int sub_gf_id = 0;
         if (il >= 0) {
             ggml_format_name(cur, "%s-%d", name, il);
-            sub_gf_id = map_layer_to_subgf_id(il, my_rank, n_world, n_layer, n_layer_window);
+            sub_gf_id = map_layer_to_subgf_id(il, my_rank, n_world, n_layer_window);
+            GGML_ASSERT(sub_gf_id != -1);
         } else {
             ggml_set_name(cur, name);
         }
@@ -16406,7 +16441,7 @@ static std::vector<struct ggml_cgraph *> llama_build_graph(
 
             // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
             // FIXME: fix in ggml_backend_sched
-            const bool full_offload = lctx.model.n_gpu_layers > (int)n_layer_window;
+            const bool full_offload = lctx.model.n_gpu_layers > (int)n_layer_window[0];
             if (batch.n_tokens < 32 || full_offload) {
                 if (il != -1 && strcmp(name, "norm") == 0) {
                     for (auto * backend : lctx.backends) {
@@ -18250,6 +18285,8 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
 
     // apply K-shift if needed
     if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
+        throw std::runtime_error("shift not supported\n");
+
         if (lctx.model.arch == LLM_ARCH_DEEPSEEK2) { // not supported due to MLA
             GGML_ABORT("Deepseek2 does not support K-shift");
         }
@@ -18290,6 +18327,8 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
 
     // reserve a worst case graph again
     if (need_reserve) {
+        throw std::runtime_error("reserve not supported\n");
+
         // TODO: extract to a function
         // build worst-case graph
         uint32_t n_seqs = 1; // TODO: worst-case number of sequences
@@ -18299,7 +18338,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
         std::vector<struct ggml_cgraph *> gf = llama_build_graph(lctx, ubatch, true);
 
         // initialize scheduler with the worst-case graph
-        ggml_backend_sched_reset(lctx.sched.at(0)); // todo.
+        ggml_backend_sched_reset(lctx.sched[0]); // todo.
         bool ok = true;
         GGML_ASSERT(lctx.sched.size() == gf.size());
 
@@ -19405,7 +19444,7 @@ struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
         /*.n_world                     =*/ 1,
         /*.rank                        =*/ 0,
-        /*.n_layer_window              =*/ 32,
+        /*.n_layer_window              =*/ {32},
         /*.n_gpu_layers                =*/ 0,
         /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu                    =*/ 0,
@@ -19432,7 +19471,7 @@ struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.n_world                     =*/ 1,
         /*.rank                        =*/ 0,
-        /*.n_layer_window              =*/ 32,
+        /*.n_layer_window              =*/ {32},
         /*.unload                      =*/ false,
         /*.master_ip                   =*/ nullptr,
         /*.next_node_ip                =*/ nullptr,
@@ -19736,7 +19775,7 @@ struct llama_context * llama_new_context_with_model(
 
     cparams.n_world = params.n_world;
     cparams.rank = params.rank;
-    cparams.n_layer_window = params.n_layer_window;
+    std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);
     cparams.unload = params.unload;
     cparams.n_seq_max = std::max(1u, params.n_seq_max);
     cparams.n_threads = params.n_threads;
@@ -19808,21 +19847,19 @@ struct llama_context * llama_new_context_with_model(
         cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
     }
 
-    LLAMA_LOG_INFO("\n");
-    LLAMA_LOG_INFO("%s: n_world    = %u\n",   __func__, cparams.n_world);
-    LLAMA_LOG_INFO("%s: rank       = %u\n",   __func__, cparams.rank);
-    LLAMA_LOG_INFO("%s: n_layer_win= %u\n",   __func__, cparams.n_layer_window);
-    LLAMA_LOG_INFO("%s: n_ctx      = %u\n",   __func__, cparams.n_ctx);
-    LLAMA_LOG_INFO("%s: n_batch    = %u\n",   __func__, cparams.n_batch);
-    LLAMA_LOG_INFO("%s: n_ubatch   = %u\n",   __func__, cparams.n_ubatch);
-    LLAMA_LOG_INFO("%s: flash_attn = %d\n",   __func__, cparams.flash_attn);
-    LLAMA_LOG_INFO("%s: freq_base  = %.1f\n", __func__, cparams.rope_freq_base);
-    LLAMA_LOG_INFO("%s: freq_scale = %g\n",   __func__, cparams.rope_freq_scale);
-
     ctx->master_ip = params.master_ip;
     ctx->next_node_ip = params.next_node_ip;
 
     LLAMA_LOG_INFO("\n");
+    LLAMA_LOG_INFO("%s: n_world      = %u\n",   __func__, cparams.n_world);
+    LLAMA_LOG_INFO("%s: rank         = %u\n",   __func__, cparams.rank);
+    LLAMA_LOG_INFO("%s: win_size     = %u\n",   __func__, cparams.n_layer_window[cparams.rank]);
+    LLAMA_LOG_INFO("%s: n_ctx        = %u\n",   __func__, cparams.n_ctx);
+    LLAMA_LOG_INFO("%s: n_batch      = %u\n",   __func__, cparams.n_batch);
+    LLAMA_LOG_INFO("%s: n_ubatch     = %u\n",   __func__, cparams.n_ubatch);
+    LLAMA_LOG_INFO("%s: flash_attn   = %d\n",   __func__, cparams.flash_attn);
+    LLAMA_LOG_INFO("%s: freq_base    = %.1f\n", __func__, cparams.rope_freq_base);
+    LLAMA_LOG_INFO("%s: freq_scale   = %g\n",   __func__, cparams.rope_freq_scale);
     LLAMA_LOG_INFO("%s: master_ip    = %s\n",   __func__, ctx->master_ip.c_str());
     LLAMA_LOG_INFO("%s: next_node_ip = %s\n",   __func__, ctx->next_node_ip.c_str());
@@ -20041,8 +20078,7 @@ struct llama_context * llama_new_context_with_model(
         }
 
         // graph outputs buffer, reserve for rank 0 only
-        const uint32_t my_rank = params.rank;
-        if (my_rank == 0) {
+        if (params.rank == 0) {
             // resized during inference when a batch uses more outputs
             if (llama_output_reserve(*ctx, params.n_seq_max) < params.n_seq_max) {
                 LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__);
@@ -20118,7 +20154,7 @@ struct llama_context * llama_new_context_with_model(
         ctx->sched.resize(gf.size());
 
         // prefetch the first subgraph weights
-        manage_graph_tensors(gf.front(), POSIX_MADV_WILLNEED, true);
+        manage_graph_tensors(gf.front(), POSIX_MADV_WILLNEED, false);
 
         // initialize scheduler with the worst-case graph
         bool ok = true;
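
A C++ detail worth noting in the two llama_*_default_params() hunks above: for the array member, the initializer {32} sets only the first element and zero-fills the other 31, so the single-node default gives rank 0 the whole 32-layer window:

    // Aggregate initialization, same form as the new defaults:
    // {32} -> element 0 is 32, elements 1..31 are value-initialized to 0.
    #include <cassert>
    #include <cstdint>

    int main() {
        uint32_t n_layer_window[32] = {32};
        assert(n_layer_window[0] == 32);   // rank 0 gets the whole model
        assert(n_layer_window[1] == 0);    // unused ranks stay zero
        assert(n_layer_window[31] == 0);
        return 0;
    }
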
@@ -20138,7 +20174,7 @@ struct llama_context * llama_new_context_with_model(
 
             size_t total_size = 0;
             for (size_t j = 0; j < ctx->sched.size(); j++) {
-                total_size += ggml_backend_sched_get_buffer_size(ctx->sched.at(j), backend);
+                total_size += ggml_backend_sched_get_buffer_size(ctx->sched[j], backend);
             }
             if (total_size > 1) {
                 LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB (in total)\n", __func__,