From 53cb3a606990ebed2cdf9aa4dfa90e4fc42c9ee5 Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Thu, 7 Nov 2024 22:02:01 +0400
Subject: [PATCH] synchronize device info

---
 common/common.cpp   |  39 ++++++-
 common/profiler.cpp | 283 +++++++++++++++++++++++++++++++++++++++++++-
 common/profiler.h   |  17 ++-
 include/llama.h     |   7 +-
 src/llama.cpp       | 145 ++++++++++++++--------
 5 files changed, 418 insertions(+), 73 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index a31a2e6e..6438daae 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -833,10 +833,6 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         model = llama_load_model_from_file(params.model.c_str(), mparams);
     }
 
-    // profile devices and determine the best setup
-    device_info dev_info;
-    llama_profile_device(&dev_info, model, params.model.c_str());
-
     if (model == NULL) {
         LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
         return iparams;
@@ -866,17 +862,40 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         }
     }
 
-    auto cparams = llama_context_params_from_gpt_params(params);
+    // get device profile
+    device_info dev_info;
+    dev_info.rank = params.rank;
+    llama_profile_device(&dev_info, model, params.model.c_str());
 
-    llama_context * lctx = llama_new_context_with_model(model, cparams);
-    if (lctx == NULL) {
-        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
+    // create llama context
+    struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
+    llama_context * lctx = llama_new_context_with_model(model, cparams);
+    if (lctx == NULL) {
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
+        llama_free_model(model);
+        return iparams;
+    }
+
+    // initialize sockets
+    llama_init_sockets(lctx, cparams.n_world, cparams.rank);
+
+    // synchronize device profile to the master node
+    struct device_info * dev_info_set = nullptr;
+    if (params.rank == 0) {
+        dev_info_set = (struct device_info *)malloc(cparams.n_world * sizeof(struct device_info));
+        dev_info_set[0] = dev_info;
+        llama_collect_device_info(dev_info_set, lctx);
+        device_print_props(dev_info_set, cparams.n_world);
+    } else {
+        llama_send_device_info(&dev_info, lctx);
+    }
+
+    if (llama_context_setup_backend(lctx) == nullptr) {
+        LOG_ERR("%s: failed to setup context with model '%s'\n", __func__, params.model.c_str());
         llama_free_model(model);
         return iparams;
     }
 
-    llama_init_sockets(lctx, cparams.n_world, cparams.rank);
-
     if (!params.control_vectors.empty()) {
         if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
         if (params.control_vector_layer_end   <= 0) params.control_vector_layer_end   = llama_n_layer(model);
diff --git a/common/profiler.cpp b/common/profiler.cpp
index 25a93ef2..487f3379 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -300,8 +300,6 @@ int device_has_sycl(void) {
     return ggml_cpu_has_sycl();
 }
 
-// ggml_backend_buffer_type_t llama_dev_buffer_type(const llama_model * model, int device)
-
 void device_get_props(struct llama_model * model, int device, struct ggml_backend_dev_props * props) {
     ggml_backend_buffer_type_t buft_type;
     if (device == -1) { // type cpu
@@ -311,4 +309,283 @@ void device_get_props(struct llama_model * model, int device, struct ggml_backen
     }
     ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft_type);
     ggml_backend_dev_get_props(dev, props);
-}
\ No newline at end of file
+}
+
+void device_print_props(struct device_info * dev_info_set, int n) {
+    LOG_INF("\n-------------------------------------------------------------------------------------------\n");
+    LOG_INF("| 
Property "); + for (int i = 0; i < n; ++i) { + LOG_INF("| Rank %-8d", i); + GGML_ASSERT(dev_info_set[i].rank == i); + } + LOG_INF("\n-------------------------------------------------------------------------------------------\n"); + + LOG_INF("| Device Name "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.10s ", dev_info_set[i].device_name); + } + LOG_INF("\n"); + + LOG_INF("| CPU Name "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.10s ", dev_info_set[i].cpu_props.name); + } + LOG_INF("\n"); + + LOG_INF("| CPU Description "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.10s ", dev_info_set[i].cpu_props.description); + } + LOG_INF("\n"); + + LOG_INF("| Number of CPU cores "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10u ", dev_info_set[i].cpu_props.cores); + } + LOG_INF("\n"); + + LOG_INF("| Physical Mem Total (GB) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.2f ", dev_info_set[i].memory.total_physical); + } + LOG_INF("\n"); + + LOG_INF("| Physical Mem Available (GB) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.2f ", dev_info_set[i].memory.available_physical); + } + LOG_INF("\n"); + + LOG_INF("| Swap Mem Total (GB) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.2f ", dev_info_set[i].memory.total_swap); + } + LOG_INF("\n"); + + LOG_INF("| Swap Mem Available (GB) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.2f ", dev_info_set[i].memory.available_swap); + } + LOG_INF("\n"); + + LOG_INF("| Mem Bandwidth (GB/s) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.2f ", dev_info_set[i].memory.bandwidth); + } + LOG_INF("\n"); + + LOG_INF("| Disk Read Bandwidth (GB/s) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.2f ", dev_info_set[i].disk_read_bandwidth); + } + LOG_INF("\n"); + + LOG_INF("| GPU Metal "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10d ", dev_info_set[i].gpu_support.metal); + } + LOG_INF("\n"); + + LOG_INF("| GPU CUDA "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10d ", dev_info_set[i].gpu_support.cuda); + } + LOG_INF("\n"); + + LOG_INF("| GPU Vulkan "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10d ", dev_info_set[i].gpu_support.vulkan); + } + LOG_INF("\n"); + + LOG_INF("| GPU Kompute "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10d ", dev_info_set[i].gpu_support.kompute); + } + LOG_INF("\n"); + + LOG_INF("| GPU BLAS "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10d ", dev_info_set[i].gpu_support.gpublas); + } + LOG_INF("\n"); + + LOG_INF("| BLAS "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10d ", dev_info_set[i].gpu_support.blas); + } + LOG_INF("\n"); + + LOG_INF("| SYCL "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10d ", dev_info_set[i].gpu_support.sycl); + } + LOG_INF("\n"); + + LOG_INF("| GPU Name "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.10s ", dev_info_set[i].gpu_props.name); + } + LOG_INF("\n"); + + LOG_INF("| GPU Description "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.10s ", dev_info_set[i].gpu_props.description); + } + LOG_INF("\n"); + + LOG_INF("| GPU Mem Free (GB) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.memory_free); + } + LOG_INF("\n"); + + LOG_INF("| GPU Mem Total (GB) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.memory_total); + } + LOG_INF("\n"); + + LOG_INF("-------------------------------------------------------------------------------------------\n\n"); +} + + +size_t serialize(const struct device_info * dev_info, char ** buffer) { + // calculate total size 
for serialized buffer + size_t device_name_len = strlen(dev_info->device_name) + 1; + size_t cpu_name_len = strlen(dev_info->cpu_props.name) + 1; + size_t cpu_description_len = strlen(dev_info->cpu_props.description) + 1; + size_t gpu_name_len = strlen(dev_info->gpu_props.name) + 1; + size_t gpu_description_len = strlen(dev_info->gpu_props.description) + 1; + + size_t total_size = sizeof(uint32_t) + + sizeof(size_t) * 5 // for lengths of strings + + device_name_len + + cpu_name_len + + cpu_description_len + + gpu_name_len + + gpu_description_len + + sizeof(float) // disk_read_bandwidth + + sizeof(uint32_t) // cpu_props.cores + + sizeof(struct memory_info) + + sizeof(struct gpu_support) + + sizeof(float) * 2; // gpu_props.memory_free and gpu_props.memory_total + + *buffer = (char *)malloc(total_size); + char * ptr = *buffer; + + // rank + memcpy(ptr, &dev_info->rank, sizeof(uint32_t)); + ptr += sizeof(uint32_t); + + // copy string lengths and string data + memcpy(ptr, &device_name_len, sizeof(size_t)); + ptr += sizeof(size_t); + memcpy(ptr, dev_info->device_name, device_name_len); + ptr += device_name_len; + + memcpy(ptr, &cpu_name_len, sizeof(size_t)); + ptr += sizeof(size_t); + memcpy(ptr, dev_info->cpu_props.name, cpu_name_len); + ptr += cpu_name_len; + + memcpy(ptr, &cpu_description_len, sizeof(size_t)); + ptr += sizeof(size_t); + memcpy(ptr, dev_info->cpu_props.description, cpu_description_len); + ptr += cpu_description_len; + + memcpy(ptr, &gpu_name_len, sizeof(size_t)); + ptr += sizeof(size_t); + memcpy(ptr, dev_info->gpu_props.name, gpu_name_len); + ptr += gpu_name_len; + + memcpy(ptr, &gpu_description_len, sizeof(size_t)); + ptr += sizeof(size_t); + memcpy(ptr, dev_info->gpu_props.description, gpu_description_len); + ptr += gpu_description_len; + + // copy the non-string members + memcpy(ptr, &dev_info->disk_read_bandwidth, sizeof(float)); + ptr += sizeof(float); + + memcpy(ptr, &dev_info->cpu_props.cores, sizeof(uint32_t)); + ptr += sizeof(uint32_t); + + memcpy(ptr, &dev_info->memory, sizeof(struct memory_info)); + ptr += sizeof(struct memory_info); + + memcpy(ptr, &dev_info->gpu_support, sizeof(struct gpu_support)); + ptr += sizeof(struct gpu_support); + + memcpy(ptr, &dev_info->gpu_props.memory_free, sizeof(float)); + ptr += sizeof(float); + + memcpy(ptr, &dev_info->gpu_props.memory_total, sizeof(float)); + + return total_size; +} + +void deserialize(const char * buffer, struct device_info * dev_info) { + const char * ptr = buffer; + + // rank + memcpy(&dev_info->rank, ptr, sizeof(uint32_t)); + ptr += sizeof(uint32_t); + + // device_name + size_t device_name_len; + memcpy(&device_name_len, ptr, sizeof(size_t)); + ptr += sizeof(size_t); + dev_info->device_name = (char *)malloc(device_name_len); + memcpy((void *)dev_info->device_name, ptr, device_name_len); + ptr += device_name_len; + + // cpu_props.name + size_t cpu_name_len; + memcpy(&cpu_name_len, ptr, sizeof(size_t)); + ptr += sizeof(size_t); + dev_info->cpu_props.name = (char *)malloc(cpu_name_len); + memcpy((void *)dev_info->cpu_props.name, ptr, cpu_name_len); + ptr += cpu_name_len; + + // cpu_props.description + size_t cpu_description_len; + memcpy(&cpu_description_len, ptr, sizeof(size_t)); + ptr += sizeof(size_t); + dev_info->cpu_props.description = (char *)malloc(cpu_description_len); + memcpy((void *)dev_info->cpu_props.description, ptr, cpu_description_len); + ptr += cpu_description_len; + + // gpu_props.name + size_t gpu_name_len; + memcpy(&gpu_name_len, ptr, sizeof(size_t)); + ptr += sizeof(size_t); + 
dev_info->gpu_props.name = (char *)malloc(gpu_name_len); + memcpy((void *)dev_info->gpu_props.name, ptr, gpu_name_len); + ptr += gpu_name_len; + + // gpu_props.description + size_t gpu_description_len; + memcpy(&gpu_description_len, ptr, sizeof(size_t)); + ptr += sizeof(size_t); + dev_info->gpu_props.description = (char *)malloc(gpu_description_len); + memcpy((void *)dev_info->gpu_props.description, ptr, gpu_description_len); + ptr += gpu_description_len; + + // other non-string members + memcpy(&dev_info->disk_read_bandwidth, ptr, sizeof(float)); + ptr += sizeof(float); + + memcpy(&dev_info->cpu_props.cores, ptr, sizeof(uint32_t)); + ptr += sizeof(uint32_t); + + memcpy(&dev_info->memory, ptr, sizeof(struct memory_info)); + ptr += sizeof(struct memory_info); + + memcpy(&dev_info->gpu_support, ptr, sizeof(struct gpu_support)); + ptr += sizeof(struct gpu_support); + + memcpy(&dev_info->gpu_props.memory_free, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->gpu_props.memory_total, ptr, sizeof(float)); +} diff --git a/common/profiler.h b/common/profiler.h index 4ff0daca..b768b7cc 100644 --- a/common/profiler.h +++ b/common/profiler.h @@ -35,6 +35,7 @@ struct gpu_props { }; struct device_info { + uint32_t rank; const char * device_name; float disk_read_bandwidth; // in GB/s struct cpu_props cpu_props; @@ -50,14 +51,18 @@ uint64_t device_physical_memory(bool available); uint64_t device_swap_memory (bool available); uint64_t device_disk_read_bw (const char * test_file, size_t buffer_size_mb); uint64_t device_memory_bw (size_t buffer_size_mb); -void device_get_props (struct llama_model * model, int device, struct ggml_backend_dev_props * props); +void device_get_props (struct llama_model * model, int device, struct ggml_backend_dev_props * props); +void device_print_props (struct device_info * dev_info_set, int n); -int device_has_metal(void); -int device_has_cuda(void); -int device_has_vulkan(void); +int device_has_metal (void); +int device_has_cuda (void); +int device_has_vulkan (void); int device_has_kompute(void); int device_has_gpublas(void); -int device_has_blas(void); -int device_has_sycl(void); +int device_has_blas (void); +int device_has_sycl (void); + +size_t serialize (const struct device_info * dev_info, char ** buffer); +void deserialize(const char * buffer, struct device_info * dev_info); #endif // PROFILER_H diff --git a/include/llama.h b/include/llama.h index 4fb38e40..e3890666 100644 --- a/include/llama.h +++ b/include/llama.h @@ -432,13 +432,16 @@ extern "C" { LLAMA_API void llama_free_model(struct llama_model * model); - LLAMA_API void llama_init_sockets(struct llama_context * ctx, uint32_t n_world, uint32_t my_rank); - LLAMA_API void llama_free_sockets(struct llama_context * ctx, char ** msg); + LLAMA_API void llama_init_sockets (struct llama_context * ctx, uint32_t n_world, uint32_t my_rank); + LLAMA_API void llama_free_sockets (struct llama_context * ctx, char ** msg); + LLAMA_API int llama_collect_device_info(struct device_info * dev_info_set, struct llama_context * ctx); + LLAMA_API int llama_send_device_info (struct device_info * dev_info, struct llama_context * ctx); // TODO: rename to llama_init_from_model LLAMA_API struct llama_context * llama_new_context_with_model( struct llama_model * model, struct llama_context_params params); + LLAMA_API void * llama_context_setup_backend(struct llama_context * ctx); // Frees all allocated memory LLAMA_API void llama_free(struct llama_context * ctx); diff --git a/src/llama.cpp b/src/llama.cpp index 
bb326b38..9a552ee2 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2567,16 +2567,18 @@ struct llama_hparams {
 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
 
 struct llama_cparams {
-    uint32_t n_world;
-    uint32_t rank;
-    uint32_t n_layer_window[32];
-    bool     unload;
-    uint32_t n_ctx;           // context size used during inference
-    uint32_t n_batch;
-    uint32_t n_ubatch;
-    uint32_t n_seq_max;
-    int      n_threads;       // number of threads to use for generation
-    int      n_threads_batch; // number of threads to use for batch processing
+    uint32_t  n_world;
+    uint32_t  rank;
+    uint32_t  n_layer_window[32];
+    bool      unload;
+    uint32_t  n_ctx;            // context size used during inference
+    ggml_type type_k;
+    ggml_type type_v;
+    uint32_t  n_batch;
+    uint32_t  n_ubatch;
+    uint32_t  n_seq_max;
+    int       n_threads;        // number of threads to use for generation
+    int       n_threads_batch;  // number of threads to use for batch processing
 
     float rope_freq_base;
     float rope_freq_scale;
@@ -3579,39 +3581,6 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, co
     dev_info->gpu_props.description  = gpu_props.description;
     dev_info->gpu_props.memory_free  = round(gpu_props.memory_free  / (double)(1 << 30) * 100) / 100;
     dev_info->gpu_props.memory_total = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100;
-
-    LLAMA_LOG_INFO("\n");
-    LLAMA_LOG_INFO("Device Info:\n");
-    LLAMA_LOG_INFO("  Device Name             : %s\n",        dev_info->device_name);
-    LLAMA_LOG_INFO("  CPU Name                : %s\n",        dev_info->cpu_props.name);
-    LLAMA_LOG_INFO("  CPU Description         : %s\n",        dev_info->cpu_props.description);
-    LLAMA_LOG_INFO("  Number of CPU cores     : %u\n",        dev_info->cpu_props.cores);
-    LLAMA_LOG_INFO("  Disk Read Bandwidth     : %.2f GB/s\n", dev_info->disk_read_bandwidth);
-    LLAMA_LOG_INFO("\n");
-
-    LLAMA_LOG_INFO("Memory Information:\n");
-    LLAMA_LOG_INFO("  Physical Mem Total      : %.2f GB\n",   dev_info->memory.total_physical);
-    LLAMA_LOG_INFO("  Physical Mem Available  : %.2f GB\n",   dev_info->memory.available_physical);
-    LLAMA_LOG_INFO("  Swap Memory Total       : %.2f GB\n",   dev_info->memory.total_swap);
-    LLAMA_LOG_INFO("  Swap Memory Available   : %.2f GB\n",   dev_info->memory.available_swap);
-    LLAMA_LOG_INFO("  Mem Bandwidth           : %.2f GB/s\n", dev_info->memory.bandwidth);
-    LLAMA_LOG_INFO("\n");
-
-    LLAMA_LOG_INFO("GPU Support:\n");
-    LLAMA_LOG_INFO("  Metal                   : %i\n",        dev_info->gpu_support.metal);
-    LLAMA_LOG_INFO("  CUDA                    : %i\n",        dev_info->gpu_support.cuda);
-    LLAMA_LOG_INFO("  Vulkan                  : %i\n",        dev_info->gpu_support.vulkan);
-    LLAMA_LOG_INFO("  Kompute                 : %i\n",        dev_info->gpu_support.kompute);
-    LLAMA_LOG_INFO("  GPU BLAS                : %i\n",        dev_info->gpu_support.gpublas);
-    LLAMA_LOG_INFO("  BLAS                    : %i\n",        dev_info->gpu_support.blas);
-    LLAMA_LOG_INFO("  SYCL                    : %i\n",        dev_info->gpu_support.sycl);
-    LLAMA_LOG_INFO("\n");
-
-    LLAMA_LOG_INFO("GPU Properties:\n");
-    LLAMA_LOG_INFO("  GPU Name                : %s\n",        dev_info->gpu_props.name);
-    LLAMA_LOG_INFO("  Description             : %s\n",        dev_info->gpu_props.description);
-    LLAMA_LOG_INFO("  Memory Free             : %.2f GB\n",   dev_info->gpu_props.memory_free);
-    LLAMA_LOG_INFO("  Memory Total            : %.2f GB\n",   dev_info->gpu_props.memory_total);
 }
 
 ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device) {
@@ -19815,6 +19784,68 @@ void llama_init_sockets(struct llama_context * ctx, uint32_t n_world, uint32_t m
     }
 }
 
+int llama_collect_device_info(struct device_info * dev_info_set, struct llama_context * ctx) {
+    uint32_t n_world = ctx->cparams.n_world;
+    if (n_world == 1) {
+        return 0;
+    }
+
+    GGML_ASSERT(dev_info_set != nullptr);
+    GGML_ASSERT(ctx != nullptr && ctx->send_socket != nullptr);
+    // ring exchange: rank 0 sends its own record to the next rank, then waits for it
+    // to come back with every worker's record appended, in rank order
+    try {
+        char * buffer = nullptr;
+        size_t buffer_size = serialize(&dev_info_set[0], &buffer);
+
+        std::vector<zmq::message_t> send_msgs;
+        send_msgs.emplace_back(buffer, buffer_size);
+        zmq::send_multipart(*ctx->send_socket, send_msgs);
+
+        free(buffer);
+    } catch (const zmq::error_t& e) {
+        LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
+        return -1;
+    }
+
+    std::vector<zmq::message_t> recv_msgs;
+    if (!zmq::recv_multipart(*ctx->recv_socket, std::back_inserter(recv_msgs))) {
+        return -1;
+    }
+    GGML_ASSERT(recv_msgs.size() == n_world);
+
+    for (size_t i = 0; i < recv_msgs.size(); i++) {
+        deserialize((const char *)recv_msgs[i].data(), &dev_info_set[i]);
+    }
+    return 0;
+}
+
+int llama_send_device_info(struct device_info * dev_info, struct llama_context * ctx) {
+    // wait for the records accumulated by the previous ranks in the ring
+    std::vector<zmq::message_t> recv_msgs;
+    if (!zmq::recv_multipart(*ctx->recv_socket, std::back_inserter(recv_msgs))) {
+        return -1;
+    }
+
+    GGML_ASSERT(dev_info != nullptr);
+    GGML_ASSERT(ctx != nullptr && ctx->send_socket != nullptr);
+
+    // append our own record and forward the whole batch to the next rank
+    try {
+        char * buffer = nullptr;
+        size_t buffer_size = serialize(dev_info, &buffer);
+
+        recv_msgs.emplace_back(buffer, buffer_size);
+        zmq::send_multipart(*ctx->send_socket, recv_msgs);
+
+        free(buffer);
+    } catch (const zmq::error_t& e) {
+        LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
+        return -1;
+    }
+    return 0;
+}
+
 void llama_free_sockets(struct llama_context * ctx, char ** msg) {
     const uint32_t n_world = ctx->cparams.n_world;
     const uint32_t my_rank = ctx->cparams.rank;
@@ -19902,6 +19928,8 @@ struct llama_context * llama_new_context_with_model(
     cparams.pooling_type     = params.pooling_type;
 
     cparams.n_ctx            = params.n_ctx            == 0    ? hparams.n_ctx_train            : params.n_ctx;
+    cparams.type_k           = params.type_k;
+    cparams.type_v           = params.type_v;
     cparams.rope_freq_base   = params.rope_freq_base   == 0.0f ? hparams.rope_freq_base_train   : params.rope_freq_base;
     cparams.rope_freq_scale  = params.rope_freq_scale  == 0.0f ?
hparams.rope_freq_scale_train : params.rope_freq_scale; @@ -19981,19 +20009,27 @@ struct llama_context * llama_new_context_with_model( // build worst-case graph for encoder if a model contains encoder ctx->is_encoding = llama_model_has_encoder(model); + return ctx; +} + +void * llama_context_setup_backend(struct llama_context * ctx) { + GGML_ASSERT(ctx != nullptr); + const auto * model = &ctx->model; + const auto & hparams = ctx->model.hparams; + const auto & cparams = ctx->cparams; + uint32_t kv_size = cparams.n_ctx; - ggml_type type_k = params.type_k; - ggml_type type_v = params.type_v; + ggml_type type_k = cparams.type_k; + ggml_type type_v = cparams.type_v; // Mamba only needs a constant number of KV cache cells per sequence if (llama_model_is_recurrent(model)) { // Mamba needs at least as many KV cells as there are sequences kept at any time - kv_size = std::max((uint32_t) 1, params.n_seq_max); + kv_size = std::max((uint32_t) 1, cparams.n_seq_max); // it's probably best to keep as much precision as possible for the states - type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states - type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states + type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states + type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states } - GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0); GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); @@ -20189,9 +20225,9 @@ struct llama_context * llama_new_context_with_model( } // graph outputs buffer, reserve for rank 0 only - if (params.rank == 0) { + if (cparams.rank == 0) { // resized during inference when a batch uses more outputs - if (llama_output_reserve(*ctx, params.n_seq_max) < params.n_seq_max) { + if (llama_output_reserve(*ctx, cparams.n_seq_max) < cparams.n_seq_max) { LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__); llama_free(ctx); return nullptr; @@ -20226,7 +20262,7 @@ struct llama_context * llama_new_context_with_model( llama_get_device_count(*model) > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER && - params.offload_kqv; + cparams.offload_kqv; // pipeline parallelism requires support for async compute and events in all devices if (pipeline_parallel) {
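
Note: a quick round-trip check for the new serialize()/deserialize() pair can be
built against common/profiler.h. This is only a sketch (it assumes nothing beyond
the device_info layout added above; the field values are made up for illustration):

    // round_trip.cpp: sanity-check that deserialize(serialize(x)) reproduces x
    #include <cassert>
    #include <cstdlib>
    #include <cstring>
    #include "profiler.h"

    int main() {
        struct device_info in = {};            // zero-init all fields
        in.rank                   = 1;         // illustrative values only
        in.device_name            = "worker-1";
        in.disk_read_bandwidth    = 1.25f;
        in.cpu_props.name         = "TestCPU";
        in.cpu_props.description  = "8-core test part";
        in.cpu_props.cores        = 8;
        in.gpu_props.name         = "TestGPU";
        in.gpu_props.description  = "integrated";
        in.gpu_props.memory_free  = 4.0f;
        in.gpu_props.memory_total = 8.0f;

        char * buf = nullptr;
        size_t n = serialize(&in, &buf);       // allocates buf internally
        assert(n > 0);

        struct device_info out = {};
        deserialize(buf, &out);                // allocates the string fields in out

        assert(out.rank == in.rank);
        assert(strcmp(out.device_name, in.device_name) == 0);
        assert(out.cpu_props.cores == in.cpu_props.cores);
        assert(out.gpu_props.memory_total == in.gpu_props.memory_total);

        free(buf);
        return 0;
    }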