From 53cb3a606990ebed2cdf9aa4dfa90e4fc42c9ee5 Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Thu, 7 Nov 2024 22:02:01 +0400
Subject: [PATCH] synchronize device info

---
 common/common.cpp   |  39 ++++++-
 common/profiler.cpp | 283 +++++++++++++++++++++++++++++++++++++++++++-
 common/profiler.h   |  17 ++-
 include/llama.h     |   7 +-
 src/llama.cpp       | 145 ++++++++++++++--------
 5 files changed, 418 insertions(+), 73 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index a31a2e6e..6438daae 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -833,10 +833,6 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         model = llama_load_model_from_file(params.model.c_str(), mparams);
     }
 
-    // profile devices and determine the best setup
-    device_info dev_info;
-    llama_profile_device(&dev_info, model, params.model.c_str());
-
     if (model == NULL) {
         LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
         return iparams;
@@ -866,17 +862,40 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         }
     }
 
-    auto cparams = llama_context_params_from_gpt_params(params);
+    // get device profile
+    device_info dev_info;
+    dev_info.rank = params.rank;
+    llama_profile_device(&dev_info, model, params.model.c_str());
 
-    llama_context * lctx = llama_new_context_with_model(model, cparams);
-    if (lctx == NULL) {
-        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
+    // create llama context
+    struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
+    llama_context * lctx = llama_new_context_with_model(model, cparams);
+    if (lctx == NULL) {
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
+        llama_free_model(model);
+        return iparams;
+    }
+
+    // initialize sockets
+    llama_init_sockets(lctx, cparams.n_world, cparams.rank);
+
+    // synchronize device profile to the master node
+    struct device_info * dev_info_set = nullptr;
+    if (params.rank == 0) {
+        dev_info_set = (struct device_info *)malloc(cparams.n_world * sizeof(struct device_info));
+        dev_info_set[0] = dev_info;
+        llama_collect_device_info(dev_info_set, lctx);
+        device_print_props(dev_info_set, cparams.n_world);
+    } else {
+        llama_send_device_info(&dev_info, lctx);
+    }
+
+    if (llama_context_setup_backend(lctx) == nullptr) {
+        LOG_ERR("%s: failed to setup context with model '%s'\n", __func__, params.model.c_str());
         llama_free_model(model);
         return iparams;
     }
 
-    llama_init_sockets(lctx, cparams.n_world, cparams.rank);
-
     if (!params.control_vectors.empty()) {
         if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
         if (params.control_vector_layer_end   <= 0) params.control_vector_layer_end   = llama_n_layer(model);
diff --git a/common/profiler.cpp b/common/profiler.cpp
index 25a93ef2..487f3379 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -300,8 +300,6 @@ int device_has_sycl(void) {
     return ggml_cpu_has_sycl();
 }
 
-// ggml_backend_buffer_type_t llama_dev_buffer_type(const llama_model * model, int device)
-
 void device_get_props(struct llama_model * model, int device, struct ggml_backend_dev_props * props) {
     ggml_backend_buffer_type_t buft_type;
     if (device == -1) { // type cpu
@@ -311,4 +309,283 @@ void device_get_props(struct llama_model * model, int device, struct ggml_backen
     }
     ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft_type);
     ggml_backend_dev_get_props(dev, props);
-}
\ No newline at end of file
+}
+
+void device_print_props(struct device_info * dev_info_set, int n) {
+    LOG_INF("\n-------------------------------------------------------------------------------------------\n");
+    LOG_INF("| 
Property "); + for (int i = 0; i < n; ++i) { + LOG_INF("| Rank %-8d", i); + GGML_ASSERT(dev_info_set[i].rank == i); + } + LOG_INF("\n-------------------------------------------------------------------------------------------\n"); + + LOG_INF("| Device Name "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.10s ", dev_info_set[i].device_name); + } + LOG_INF("\n"); + + LOG_INF("| CPU Name "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.10s ", dev_info_set[i].cpu_props.name); + } + LOG_INF("\n"); + + LOG_INF("| CPU Description "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.10s ", dev_info_set[i].cpu_props.description); + } + LOG_INF("\n"); + + LOG_INF("| Number of CPU cores "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10u ", dev_info_set[i].cpu_props.cores); + } + LOG_INF("\n"); + + LOG_INF("| Physical Mem Total (GB) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.2f ", dev_info_set[i].memory.total_physical); + } + LOG_INF("\n"); + + LOG_INF("| Physical Mem Available (GB) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.2f ", dev_info_set[i].memory.available_physical); + } + LOG_INF("\n"); + + LOG_INF("| Swap Mem Total (GB) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.2f ", dev_info_set[i].memory.total_swap); + } + LOG_INF("\n"); + + LOG_INF("| Swap Mem Available (GB) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.2f ", dev_info_set[i].memory.available_swap); + } + LOG_INF("\n"); + + LOG_INF("| Mem Bandwidth (GB/s) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.2f ", dev_info_set[i].memory.bandwidth); + } + LOG_INF("\n"); + + LOG_INF("| Disk Read Bandwidth (GB/s) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.2f ", dev_info_set[i].disk_read_bandwidth); + } + LOG_INF("\n"); + + LOG_INF("| GPU Metal "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10d ", dev_info_set[i].gpu_support.metal); + } + LOG_INF("\n"); + + LOG_INF("| GPU CUDA "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10d ", dev_info_set[i].gpu_support.cuda); + } + LOG_INF("\n"); + + LOG_INF("| GPU Vulkan "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10d ", dev_info_set[i].gpu_support.vulkan); + } + LOG_INF("\n"); + + LOG_INF("| GPU Kompute "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10d ", dev_info_set[i].gpu_support.kompute); + } + LOG_INF("\n"); + + LOG_INF("| GPU BLAS "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10d ", dev_info_set[i].gpu_support.gpublas); + } + LOG_INF("\n"); + + LOG_INF("| BLAS "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10d ", dev_info_set[i].gpu_support.blas); + } + LOG_INF("\n"); + + LOG_INF("| SYCL "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10d ", dev_info_set[i].gpu_support.sycl); + } + LOG_INF("\n"); + + LOG_INF("| GPU Name "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.10s ", dev_info_set[i].gpu_props.name); + } + LOG_INF("\n"); + + LOG_INF("| GPU Description "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.10s ", dev_info_set[i].gpu_props.description); + } + LOG_INF("\n"); + + LOG_INF("| GPU Mem Free (GB) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.memory_free); + } + LOG_INF("\n"); + + LOG_INF("| GPU Mem Total (GB) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.memory_total); + } + LOG_INF("\n"); + + LOG_INF("-------------------------------------------------------------------------------------------\n\n"); +} + + +size_t serialize(const struct device_info * dev_info, char ** buffer) { + // calculate total size 
for serialized buffer + size_t device_name_len = strlen(dev_info->device_name) + 1; + size_t cpu_name_len = strlen(dev_info->cpu_props.name) + 1; + size_t cpu_description_len = strlen(dev_info->cpu_props.description) + 1; + size_t gpu_name_len = strlen(dev_info->gpu_props.name) + 1; + size_t gpu_description_len = strlen(dev_info->gpu_props.description) + 1; + + size_t total_size = sizeof(uint32_t) + + sizeof(size_t) * 5 // for lengths of strings + + device_name_len + + cpu_name_len + + cpu_description_len + + gpu_name_len + + gpu_description_len + + sizeof(float) // disk_read_bandwidth + + sizeof(uint32_t) // cpu_props.cores + + sizeof(struct memory_info) + + sizeof(struct gpu_support) + + sizeof(float) * 2; // gpu_props.memory_free and gpu_props.memory_total + + *buffer = (char *)malloc(total_size); + char * ptr = *buffer; + + // rank + memcpy(ptr, &dev_info->rank, sizeof(uint32_t)); + ptr += sizeof(uint32_t); + + // copy string lengths and string data + memcpy(ptr, &device_name_len, sizeof(size_t)); + ptr += sizeof(size_t); + memcpy(ptr, dev_info->device_name, device_name_len); + ptr += device_name_len; + + memcpy(ptr, &cpu_name_len, sizeof(size_t)); + ptr += sizeof(size_t); + memcpy(ptr, dev_info->cpu_props.name, cpu_name_len); + ptr += cpu_name_len; + + memcpy(ptr, &cpu_description_len, sizeof(size_t)); + ptr += sizeof(size_t); + memcpy(ptr, dev_info->cpu_props.description, cpu_description_len); + ptr += cpu_description_len; + + memcpy(ptr, &gpu_name_len, sizeof(size_t)); + ptr += sizeof(size_t); + memcpy(ptr, dev_info->gpu_props.name, gpu_name_len); + ptr += gpu_name_len; + + memcpy(ptr, &gpu_description_len, sizeof(size_t)); + ptr += sizeof(size_t); + memcpy(ptr, dev_info->gpu_props.description, gpu_description_len); + ptr += gpu_description_len; + + // copy the non-string members + memcpy(ptr, &dev_info->disk_read_bandwidth, sizeof(float)); + ptr += sizeof(float); + + memcpy(ptr, &dev_info->cpu_props.cores, sizeof(uint32_t)); + ptr += sizeof(uint32_t); + + memcpy(ptr, &dev_info->memory, sizeof(struct memory_info)); + ptr += sizeof(struct memory_info); + + memcpy(ptr, &dev_info->gpu_support, sizeof(struct gpu_support)); + ptr += sizeof(struct gpu_support); + + memcpy(ptr, &dev_info->gpu_props.memory_free, sizeof(float)); + ptr += sizeof(float); + + memcpy(ptr, &dev_info->gpu_props.memory_total, sizeof(float)); + + return total_size; +} + +void deserialize(const char * buffer, struct device_info * dev_info) { + const char * ptr = buffer; + + // rank + memcpy(&dev_info->rank, ptr, sizeof(uint32_t)); + ptr += sizeof(uint32_t); + + // device_name + size_t device_name_len; + memcpy(&device_name_len, ptr, sizeof(size_t)); + ptr += sizeof(size_t); + dev_info->device_name = (char *)malloc(device_name_len); + memcpy((void *)dev_info->device_name, ptr, device_name_len); + ptr += device_name_len; + + // cpu_props.name + size_t cpu_name_len; + memcpy(&cpu_name_len, ptr, sizeof(size_t)); + ptr += sizeof(size_t); + dev_info->cpu_props.name = (char *)malloc(cpu_name_len); + memcpy((void *)dev_info->cpu_props.name, ptr, cpu_name_len); + ptr += cpu_name_len; + + // cpu_props.description + size_t cpu_description_len; + memcpy(&cpu_description_len, ptr, sizeof(size_t)); + ptr += sizeof(size_t); + dev_info->cpu_props.description = (char *)malloc(cpu_description_len); + memcpy((void *)dev_info->cpu_props.description, ptr, cpu_description_len); + ptr += cpu_description_len; + + // gpu_props.name + size_t gpu_name_len; + memcpy(&gpu_name_len, ptr, sizeof(size_t)); + ptr += sizeof(size_t); + 
dev_info->gpu_props.name = (char *)malloc(gpu_name_len); + memcpy((void *)dev_info->gpu_props.name, ptr, gpu_name_len); + ptr += gpu_name_len; + + // gpu_props.description + size_t gpu_description_len; + memcpy(&gpu_description_len, ptr, sizeof(size_t)); + ptr += sizeof(size_t); + dev_info->gpu_props.description = (char *)malloc(gpu_description_len); + memcpy((void *)dev_info->gpu_props.description, ptr, gpu_description_len); + ptr += gpu_description_len; + + // other non-string members + memcpy(&dev_info->disk_read_bandwidth, ptr, sizeof(float)); + ptr += sizeof(float); + + memcpy(&dev_info->cpu_props.cores, ptr, sizeof(uint32_t)); + ptr += sizeof(uint32_t); + + memcpy(&dev_info->memory, ptr, sizeof(struct memory_info)); + ptr += sizeof(struct memory_info); + + memcpy(&dev_info->gpu_support, ptr, sizeof(struct gpu_support)); + ptr += sizeof(struct gpu_support); + + memcpy(&dev_info->gpu_props.memory_free, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->gpu_props.memory_total, ptr, sizeof(float)); +} diff --git a/common/profiler.h b/common/profiler.h index 4ff0daca..b768b7cc 100644 --- a/common/profiler.h +++ b/common/profiler.h @@ -35,6 +35,7 @@ struct gpu_props { }; struct device_info { + uint32_t rank; const char * device_name; float disk_read_bandwidth; // in GB/s struct cpu_props cpu_props; @@ -50,14 +51,18 @@ uint64_t device_physical_memory(bool available); uint64_t device_swap_memory (bool available); uint64_t device_disk_read_bw (const char * test_file, size_t buffer_size_mb); uint64_t device_memory_bw (size_t buffer_size_mb); -void device_get_props (struct llama_model * model, int device, struct ggml_backend_dev_props * props); +void device_get_props (struct llama_model * model, int device, struct ggml_backend_dev_props * props); +void device_print_props (struct device_info * dev_info_set, int n); -int device_has_metal(void); -int device_has_cuda(void); -int device_has_vulkan(void); +int device_has_metal (void); +int device_has_cuda (void); +int device_has_vulkan (void); int device_has_kompute(void); int device_has_gpublas(void); -int device_has_blas(void); -int device_has_sycl(void); +int device_has_blas (void); +int device_has_sycl (void); + +size_t serialize (const struct device_info * dev_info, char ** buffer); +void deserialize(const char * buffer, struct device_info * dev_info); #endif // PROFILER_H diff --git a/include/llama.h b/include/llama.h index 4fb38e40..e3890666 100644 --- a/include/llama.h +++ b/include/llama.h @@ -432,13 +432,16 @@ extern "C" { LLAMA_API void llama_free_model(struct llama_model * model); - LLAMA_API void llama_init_sockets(struct llama_context * ctx, uint32_t n_world, uint32_t my_rank); - LLAMA_API void llama_free_sockets(struct llama_context * ctx, char ** msg); + LLAMA_API void llama_init_sockets (struct llama_context * ctx, uint32_t n_world, uint32_t my_rank); + LLAMA_API void llama_free_sockets (struct llama_context * ctx, char ** msg); + LLAMA_API int llama_collect_device_info(struct device_info * dev_info_set, struct llama_context * ctx); + LLAMA_API int llama_send_device_info (struct device_info * dev_info, struct llama_context * ctx); // TODO: rename to llama_init_from_model LLAMA_API struct llama_context * llama_new_context_with_model( struct llama_model * model, struct llama_context_params params); + LLAMA_API void * llama_context_setup_backend(struct llama_context * ctx); // Frees all allocated memory LLAMA_API void llama_free(struct llama_context * ctx); diff --git a/src/llama.cpp b/src/llama.cpp index 
bb326b38..9a552ee2 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2567,16 +2567,18 @@ struct llama_hparams {
 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
 
 struct llama_cparams {
-    uint32_t n_world;
-    uint32_t rank;
-    uint32_t n_layer_window[32];
-    bool     unload;
-    uint32_t n_ctx;           // context size used during inference
-    uint32_t n_batch;
-    uint32_t n_ubatch;
-    uint32_t n_seq_max;
-    int      n_threads;       // number of threads to use for generation
-    int      n_threads_batch; // number of threads to use for batch processing
+    uint32_t  n_world;
+    uint32_t  rank;
+    uint32_t  n_layer_window[32];
+    bool      unload;
+    uint32_t  n_ctx;            // context size used during inference
+    ggml_type type_k;
+    ggml_type type_v;
+    uint32_t  n_batch;
+    uint32_t  n_ubatch;
+    uint32_t  n_seq_max;
+    int       n_threads;        // number of threads to use for generation
+    int       n_threads_batch;  // number of threads to use for batch processing
 
     float rope_freq_base;
     float rope_freq_scale;
@@ -3579,39 +3581,6 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, co
     dev_info->gpu_props.description  = gpu_props.description;
     dev_info->gpu_props.memory_free  = round(gpu_props.memory_free  / (double)(1 << 30) * 100) / 100;
     dev_info->gpu_props.memory_total = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100;
-
-    LLAMA_LOG_INFO("\n");
-    LLAMA_LOG_INFO("Device Info:\n");
-    LLAMA_LOG_INFO("  Device Name             : %s\n",        dev_info->device_name);
-    LLAMA_LOG_INFO("  CPU Name                : %s\n",        dev_info->cpu_props.name);
-    LLAMA_LOG_INFO("  CPU Description         : %s\n",        dev_info->cpu_props.description);
-    LLAMA_LOG_INFO("  Number of CPU cores     : %u\n",        dev_info->cpu_props.cores);
-    LLAMA_LOG_INFO("  Disk Read Bandwidth     : %.2f GB/s\n", dev_info->disk_read_bandwidth);
-    LLAMA_LOG_INFO("\n");
-
-    LLAMA_LOG_INFO("Memory Information:\n");
-    LLAMA_LOG_INFO("  Physical Mem Total      : %.2f GB\n",   dev_info->memory.total_physical);
-    LLAMA_LOG_INFO("  Physical Mem Available  : %.2f GB\n",   dev_info->memory.available_physical);
-    LLAMA_LOG_INFO("  Swap Memory Total       : %.2f GB\n",   dev_info->memory.total_swap);
-    LLAMA_LOG_INFO("  Swap Memory Available   : %.2f GB\n",   dev_info->memory.available_swap);
-    LLAMA_LOG_INFO("  Mem Bandwidth           : %.2f GB/s\n", dev_info->memory.bandwidth);
-    LLAMA_LOG_INFO("\n");
-
-    LLAMA_LOG_INFO("GPU Support:\n");
-    LLAMA_LOG_INFO("  Metal                   : %i\n",        dev_info->gpu_support.metal);
-    LLAMA_LOG_INFO("  CUDA                    : %i\n",        dev_info->gpu_support.cuda);
-    LLAMA_LOG_INFO("  Vulkan                  : %i\n",        dev_info->gpu_support.vulkan);
-    LLAMA_LOG_INFO("  Kompute                 : %i\n",        dev_info->gpu_support.kompute);
-    LLAMA_LOG_INFO("  GPU BLAS                : %i\n",        dev_info->gpu_support.gpublas);
-    LLAMA_LOG_INFO("  BLAS                    : %i\n",        dev_info->gpu_support.blas);
-    LLAMA_LOG_INFO("  SYCL                    : %i\n",        dev_info->gpu_support.sycl);
-    LLAMA_LOG_INFO("\n");
-
-    LLAMA_LOG_INFO("GPU Properties:\n");
-    LLAMA_LOG_INFO("  GPU Name                : %s\n",        dev_info->gpu_props.name);
-    LLAMA_LOG_INFO("  Description             : %s\n",        dev_info->gpu_props.description);
-    LLAMA_LOG_INFO("  Memory Free             : %.2f GB\n",   dev_info->gpu_props.memory_free);
-    LLAMA_LOG_INFO("  Memory Total            : %.2f GB\n",   dev_info->gpu_props.memory_total);
 }
 
 ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device) {
@@ -19815,6 +19784,68 @@ void llama_init_sockets(struct llama_context * ctx, uint32_t n_world, uint32_t m
     }
 }
 
+int llama_collect_device_info(struct device_info * dev_info_set, struct llama_context * ctx) {
+    uint32_t n_world = ctx->cparams.n_world;
+    if (n_world == 1) {
+        return 0;
+    }
+
+    GGML_ASSERT(dev_info_set != nullptr);
+    GGML_ASSERT(ctx != nullptr && ctx->send_socket != nullptr);
+    // ring exchange: rank 0 sends its own record to the next rank, then waits for it
+    // to come back with every worker's record appended, in rank order
+    try {
+        char * buffer = nullptr;
+        size_t buffer_size = serialize(&dev_info_set[0], &buffer);
+
+        std::vector<zmq::message_t> send_msgs;
+        send_msgs.emplace_back(buffer, buffer_size);
+        zmq::send_multipart(*ctx->send_socket, send_msgs);
+
+        free(buffer);
+    } catch (const zmq::error_t& e) {
+        LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
+        return -1;
+    }
+
+    std::vector<zmq::message_t> recv_msgs;
+    if (!zmq::recv_multipart(*ctx->recv_socket, std::back_inserter(recv_msgs))) {
+        return -1;
+    }
+    GGML_ASSERT(recv_msgs.size() == n_world);
+
+    for (size_t i = 0; i < recv_msgs.size(); i++) {
+        deserialize((const char *)recv_msgs[i].data(), &dev_info_set[i]);
+    }
+    return 0;
+}
+
+int llama_send_device_info(struct device_info * dev_info, struct llama_context * ctx) {
+    // wait for the records accumulated by the previous ranks in the ring
+    std::vector<zmq::message_t> recv_msgs;
+    if (!zmq::recv_multipart(*ctx->recv_socket, std::back_inserter(recv_msgs))) {
+        return -1;
+    }
+
+    GGML_ASSERT(dev_info != nullptr);
+    GGML_ASSERT(ctx != nullptr && ctx->send_socket != nullptr);
+
+    // append our own record and forward the whole batch to the next rank
+    try {
+        char * buffer = nullptr;
+        size_t buffer_size = serialize(dev_info, &buffer);
+
+        recv_msgs.emplace_back(buffer, buffer_size);
+        zmq::send_multipart(*ctx->send_socket, recv_msgs);
+
+        free(buffer);
+    } catch (const zmq::error_t& e) {
+        LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
+        return -1;
+    }
+    return 0;
+}
+
 void llama_free_sockets(struct llama_context * ctx, char ** msg) {
     const uint32_t n_world = ctx->cparams.n_world;
     const uint32_t my_rank = ctx->cparams.rank;
@@ -19902,6 +19928,8 @@ struct llama_context * llama_new_context_with_model(
     cparams.pooling_type     = params.pooling_type;
 
     cparams.n_ctx            = params.n_ctx            == 0    ? hparams.n_ctx_train            : params.n_ctx;
+    cparams.type_k           = params.type_k;
+    cparams.type_v           = params.type_v;
     cparams.rope_freq_base   = params.rope_freq_base   == 0.0f ? hparams.rope_freq_base_train   : params.rope_freq_base;
     cparams.rope_freq_scale  = params.rope_freq_scale  == 0.0f ?
hparams.rope_freq_scale_train : params.rope_freq_scale; @@ -19981,19 +20009,27 @@ struct llama_context * llama_new_context_with_model( // build worst-case graph for encoder if a model contains encoder ctx->is_encoding = llama_model_has_encoder(model); + return ctx; +} + +void * llama_context_setup_backend(struct llama_context * ctx) { + GGML_ASSERT(ctx != nullptr); + const auto * model = &ctx->model; + const auto & hparams = ctx->model.hparams; + const auto & cparams = ctx->cparams; + uint32_t kv_size = cparams.n_ctx; - ggml_type type_k = params.type_k; - ggml_type type_v = params.type_v; + ggml_type type_k = cparams.type_k; + ggml_type type_v = cparams.type_v; // Mamba only needs a constant number of KV cache cells per sequence if (llama_model_is_recurrent(model)) { // Mamba needs at least as many KV cells as there are sequences kept at any time - kv_size = std::max((uint32_t) 1, params.n_seq_max); + kv_size = std::max((uint32_t) 1, cparams.n_seq_max); // it's probably best to keep as much precision as possible for the states - type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states - type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states + type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states + type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states } - GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0); GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); @@ -20189,9 +20225,9 @@ struct llama_context * llama_new_context_with_model( } // graph outputs buffer, reserve for rank 0 only - if (params.rank == 0) { + if (cparams.rank == 0) { // resized during inference when a batch uses more outputs - if (llama_output_reserve(*ctx, params.n_seq_max) < params.n_seq_max) { + if (llama_output_reserve(*ctx, cparams.n_seq_max) < cparams.n_seq_max) { LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__); llama_free(ctx); return nullptr; @@ -20226,7 +20262,7 @@ struct llama_context * llama_new_context_with_model( llama_get_device_count(*model) > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER && - params.offload_kqv; + cparams.offload_kqv; // pipeline parallelism requires support for async compute and events in all devices if (pipeline_parallel) {
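
Note: a quick round-trip check for the new serialize()/deserialize() pair can be
built against common/profiler.h. This is only a sketch (it assumes nothing beyond
the device_info layout added above; the field values are made up for illustration):

    // round_trip.cpp: sanity-check that deserialize(serialize(x)) reproduces x
    #include <cassert>
    #include <cstdlib>
    #include <cstring>
    #include "profiler.h"

    int main() {
        struct device_info in = {};            // zero-init all fields
        in.rank                   = 1;         // illustrative values only
        in.device_name            = "worker-1";
        in.disk_read_bandwidth    = 1.25f;
        in.cpu_props.name         = "TestCPU";
        in.cpu_props.description  = "8-core test part";
        in.cpu_props.cores        = 8;
        in.gpu_props.name         = "TestGPU";
        in.gpu_props.description  = "integrated";
        in.gpu_props.memory_free  = 4.0f;
        in.gpu_props.memory_total = 8.0f;

        char * buf = nullptr;
        size_t n = serialize(&in, &buf);       // allocates buf internally
        assert(n > 0);

        struct device_info out = {};
        deserialize(buf, &out);                // allocates the string fields in out

        assert(out.rank == in.rank);
        assert(strcmp(out.device_name, in.device_name) == 0);
        assert(out.cpu_props.cores == in.cpu_props.cores);
        assert(out.gpu_props.memory_total == in.gpu_props.memory_total);

        free(buf);
        return 0;
    }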