diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index a415745f..c7f6adb5 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -2297,7 +2297,6 @@ bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgra
     ggml_backend_sched_split_graph(sched, graph);
-
     if (!ggml_backend_sched_alloc_splits(sched)) {
         return false;
     }
 
     sched->is_alloc = true;
diff --git a/src/llama.cpp b/src/llama.cpp
index e3b16d21..fc683237 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3313,7 +3313,7 @@ struct llama_context {
         ggml_backend_buffer_free(buf_output);
     }
 
-        const struct llama_model & model;
+    const struct llama_model & model;
 
     struct llama_cparams cparams;
     struct llama_sbatch sbatch;
@@ -3399,15 +3399,15 @@ struct llama_context {
     struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
 
     // sockets
-        std::string master_ip = "localhost";
-        std::string next_node_ip = "localhost";
-        uint32_t data_port = 9000;
-        uint32_t signal_port = 10000;
-        zmq::context_t * sock_context = nullptr;
-        zmq::socket_t * send_socket = nullptr;
-        zmq::socket_t * recv_socket = nullptr;
-        zmq::socket_t * master_socket = nullptr;
-        zmq::socket_t * signal_socket = nullptr;
+    std::string master_ip = "localhost";
+    std::string next_node_ip = "localhost";
+    uint32_t data_port = 9000;
+    uint32_t signal_port = 10000;
+    zmq::context_t * sock_context = nullptr;
+    zmq::socket_t * send_socket = nullptr;
+    zmq::socket_t * recv_socket = nullptr;
+    zmq::socket_t * master_socket = nullptr;
+    zmq::socket_t * signal_socket = nullptr;
 };
 
 struct llama_lora_weight {
@@ -17343,18 +17343,18 @@ static void llama_send_meta(zmq::socket_t & socket, struct sync_meta * meta) {
 static void llama_send_tensors(zmq::socket_t & socket, struct input_tensors * tensors) {
     try {
         std::vector<zmq::message_t> send_msgs;
-        size_t buff_size = 0;
+        size_t buf_size = 0;
 
         send_msgs.emplace_back("sub_gf_out", strlen("sub_gf_out"));
         send_msgs.emplace_back(tensors->sub_gf_out->ne, sizeof(tensors->sub_gf_out->ne));
-        buff_size = tensors->sub_gf_out->ne[0] * tensors->sub_gf_out->ne[1] * sizeof(float);
-        send_msgs.emplace_back(tensors->sub_gf_out->data, buff_size);
+        buf_size = tensors->sub_gf_out->ne[0] * tensors->sub_gf_out->ne[1] * sizeof(float);
+        send_msgs.emplace_back(tensors->sub_gf_out->data, buf_size);
 
         if (tensors->inp_pos) {
             send_msgs.emplace_back("inp_pos", strlen("inp_pos"));
             send_msgs.emplace_back(tensors->inp_pos->ne, sizeof(tensors->inp_pos->ne[0]));
-            buff_size = tensors->inp_pos->ne[0] * sizeof(int32_t);
-            send_msgs.emplace_back(tensors->inp_pos->data, buff_size);
+            buf_size = tensors->inp_pos->ne[0] * sizeof(int32_t);
+            send_msgs.emplace_back(tensors->inp_pos->data, buf_size);
         }
 
         zmq::send_multipart(socket, send_msgs);
@@ -17398,17 +17398,17 @@ static void llama_recv_tensors(zmq::socket_t & socket, input_tensors * tensors)
         if (key == "sub_gf_out" && tensors->sub_gf_out) {
             int64_t * dims = static_cast<int64_t *>(dims_msg.data());
-            size_t buff_size = dims[0] * dims[1] * sizeof(float);
+            size_t buf_size = dims[0] * dims[1] * sizeof(float);
 
             GGML_ASSERT(dims[0] == tensors->sub_gf_out->ne[0]);
             GGML_ASSERT(dims[1] == tensors->sub_gf_out->ne[1]);
-            GGML_ASSERT(data_msg.size() == buff_size);
-            std::memcpy(tensors->sub_gf_out->data, data_msg.data(), buff_size);
+            GGML_ASSERT(data_msg.size() == buf_size);
+            std::memcpy(tensors->sub_gf_out->data, data_msg.data(), buf_size);
         } else if (key == "inp_pos" && tensors->inp_pos) {
             int64_t * dims = static_cast<int64_t *>(dims_msg.data());
-            size_t buff_size = dims[0] * sizeof(int32_t);
+            size_t buf_size = dims[0] * sizeof(int32_t);
             GGML_ASSERT(dims[0] == tensors->inp_pos->ne[0]);
-            GGML_ASSERT(data_msg.size() == buff_size);
-            std::memcpy(tensors->inp_pos->data, data_msg.data(), buff_size);
+            GGML_ASSERT(data_msg.size() == buf_size);
+            std::memcpy(tensors->inp_pos->data, data_msg.data(), buf_size);
         }
     }
 }
@@ -17734,15 +17734,15 @@ static int llama_decode_internal(
             llama_recv_tensors(*lctx.recv_socket, &tensors);
             is_last_l = my_rank == 0 && i == (size_t)gf.size() - 1;
 
-            size_t buff_size = tensors.sub_gf_out->ne[0] * tensors.sub_gf_out->ne[1] * ggml_element_size(tensors.sub_gf_out);
+            size_t buf_size = tensors.sub_gf_out->ne[0] * tensors.sub_gf_out->ne[1] * ggml_element_size(tensors.sub_gf_out);
             if (!is_last_l) {
-                memcpy(ubatch.backend_embd, tensors.sub_gf_out->data, buff_size);
+                memcpy(ubatch.backend_embd, tensors.sub_gf_out->data, buf_size);
             } else {
-                memcpy(ubatch.out_embd, tensors.sub_gf_out->data, buff_size);
+                memcpy(ubatch.out_embd, tensors.sub_gf_out->data, buf_size);
             }
             if (my_rank != 0 && i == 0) {
-                buff_size = tensors.inp_pos->ne[0] * ggml_element_size(tensors.inp_pos);
-                memcpy(ubatch.pos, tensors.inp_pos->data, buff_size);
+                buf_size = tensors.inp_pos->ne[0] * ggml_element_size(tensors.inp_pos);
+                memcpy(ubatch.pos, tensors.inp_pos->data, buf_size);
             }
         }
 
@@ -17771,14 +17771,14 @@ static int llama_decode_internal(
         if (n_world == 1 || (my_rank == 0 && is_last_l)) {
             size_t buf_size = sub_gf_out->ne[0]*sub_gf_out->ne[1]*sizeof(float);
             float * embd_buf = is_last_l ? ubatch.out_embd : ubatch.backend_embd;
-            memcpy(embd_buf, sub_gf_out->data, buf_size);
+            ggml_backend_tensor_get(sub_gf_out, embd_buf, 0, buf_size);
         } else {
             input_tensors tensors;
             tensors.sub_gf_out = sub_gf_out;
             if (i == 0 && !is_last_l && my_rank != n_world - 1) {
                 tensors.inp_pos = lctx.inp_pos;
-                const size_t buff_size = ubatch.n_tokens * ggml_element_size(tensors.inp_pos);
-                memcpy(tensors.inp_pos->data, ubatch.pos, buff_size);
+                const size_t buf_size = ubatch.n_tokens * ggml_element_size(tensors.inp_pos);
+                memcpy(tensors.inp_pos->data, ubatch.pos, buf_size);
             }
             const bool is_to_master = my_rank != 0 && is_last_l;
             zmq::socket_t * s = is_to_master ? lctx.master_socket : lctx.send_socket;
@@ -19812,7 +19812,7 @@ struct llama_context * llama_new_context_with_model(
         return nullptr;
     }
 
-        llama_context * ctx = new llama_context(*model);
+    llama_context * ctx = new llama_context(*model);
 
     const auto & hparams = model->hparams;
     auto & cparams = ctx->cparams;
@@ -19937,6 +19937,7 @@ struct llama_context * llama_new_context_with_model(
 
     // with registry
     if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+        throw std::runtime_error("only LLAMA_SPLIT_MODE_LAYER is supported\n");
         if (main_gpu >= 0 && main_gpu < (int)model->devices.size()) {
            ggml_backend_dev_t main_dev = model->devices[main_gpu];
            ggml_backend_t backend = ggml_backend_dev_init(main_dev, nullptr);
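---
Reviewer notes (not part of the patch):

1. The one behavioral change in the llama_decode_internal output path is the
swap from memcpy to ggml_backend_tensor_get in the @@ -17771 hunk. A raw
memcpy from sub_gf_out->data is only valid while the tensor lives in a
host-resident buffer; once the graph output is allocated in a device buffer
(e.g. Metal or CUDA), that pointer is not dereferenceable on the CPU and the
copy has to go through the backend. A minimal sketch of the pattern, using
ggml's public backend API (the helper name read_rows_f32 is illustrative,
not from the patch):

    #include "ggml.h"
    #include "ggml-backend.h"
    #include <vector>

    // Copy an F32 matrix [ne0, ne1] out of whatever buffer backs `t`.
    static std::vector<float> read_rows_f32(const struct ggml_tensor * t) {
        std::vector<float> host(t->ne[0] * t->ne[1]);
        // Works for host and device buffers alike: the backend performs the
        // (possibly device-to-host) copy. A memcpy from t->data would only
        // be correct for host buffers.
        ggml_backend_tensor_get(t, host.data(), 0, host.size() * sizeof(float));
        return host;
    }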
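2. The buff_size -> buf_size renames touch the ZeroMQ framing used by
llama_send_tensors / llama_recv_tensors: each tensor travels as three frames
(name key, ne[] dims, raw payload), and the receiver validates the dims frame
against the payload size. A self-contained sketch of that wire format,
assuming cppzmq's zmq_addon.hpp helpers; the inproc endpoint and the
"toy_out" name are made up for illustration:

    #include <zmq.hpp>
    #include <zmq_addon.hpp>
    #include <cassert>
    #include <cstdint>
    #include <cstring>
    #include <iterator>
    #include <vector>

    int main() {
        zmq::context_t ctx;
        zmq::socket_t tx(ctx, zmq::socket_type::pair);
        zmq::socket_t rx(ctx, zmq::socket_type::pair);
        rx.bind("inproc://tensors");   // illustrative endpoint
        tx.connect("inproc://tensors");

        // Sender: [name | dims | data], mirroring llama_send_tensors.
        int64_t ne[2]   = {4, 1};
        float   data[4] = {0.f, 1.f, 2.f, 3.f};
        std::vector<zmq::message_t> frames;
        frames.emplace_back("toy_out", strlen("toy_out"));
        frames.emplace_back(ne, sizeof(ne));
        frames.emplace_back(data, ne[0] * ne[1] * sizeof(float));
        zmq::send_multipart(tx, frames);

        // Receiver: recv_multipart blocks until every frame has arrived;
        // the dims frame is then checked against the payload size, like the
        // GGML_ASSERTs in llama_recv_tensors.
        std::vector<zmq::message_t> in;
        zmq::recv_multipart(rx, std::back_inserter(in));
        const int64_t * dims = static_cast<const int64_t *>(in[1].data());
        assert(in[2].size() == (size_t)(dims[0] * dims[1]) * sizeof(float));
        return 0;
    }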