diff --git a/common/common.cpp b/common/common.cpp
index 55807f78..fd02664d 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1527,6 +1527,12 @@ static bool assign_layers_to_device(
 
 //
 struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
+
+#if !(defined(GGML_USE_METAL) || defined(GGML_USE_CUDA))
+    // reset n_gpu_layers to 0 if GPU is not used
+    params.n_gpu_layers = 0;
+#endif
+
     llama_init_result iparams;
     auto mparams = llama_model_params_from_gpt_params(params);
 
@@ -1582,6 +1588,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 
     if (n_world == 1) {
         uint32_t n_layers = llama_model_n_layers(model);
+        // assign all layers to this device
         params.n_layer_window[0]  = n_layers;
         cparams.n_layer_window[0] = n_layers;
         mparams.n_layer_window[0] = n_layers;
@@ -1594,7 +1601,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 
     // broadcast startup args
     struct startup_args args;
-    if (my_rank==0){
+    if (my_rank == 0){
         args.should_profile = auto_schedule;
     }
     llama_bcast_startup_args(lctx, my_rank, &args);
diff --git a/common/profiler.cpp b/common/profiler.cpp
index 18b345a9..461cc3b9 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -350,7 +350,6 @@ float device_inp_embd_delay(struct llama_model * model, enum ggml_type src0t, in
         return 0.0f;
     }
 
-    size_t QK_K = 0;
     switch (src0t) {
         case GGML_TYPE_F32: {
             matrix_B = malloc(embd_size * sizeof(float));
diff --git a/include/llama.h b/include/llama.h
index 9f3da708..fd4fec40 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -453,7 +453,7 @@ extern "C" {
     LLAMA_API void llama_free_sockets (struct llama_context * ctx, char ** msg);
     LLAMA_API int llama_gather_device_info(struct llama_context * ctx, struct device_info * dev_info_set);
     LLAMA_API int llama_send_device_info (struct llama_context * ctx, struct device_info * dev_info);
-    LLAMA_API int  llama_bcast_startup_args(struct llama_context * ctx, uint32_t rank, struct startup_args * args);
+    LLAMA_API int llama_bcast_startup_args(struct llama_context * ctx, uint32_t rank, struct startup_args * args);
     LLAMA_API int llama_bcast_layer_setup (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers);
     LLAMA_API int llama_recv_layer_setup (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers);
 
diff --git a/src/llama.cpp b/src/llama.cpp
index 87ae83ac..718327e0 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -20262,13 +20262,13 @@ int llama_send_device_info(struct llama_context * ctx, struct device_info * dev_
     return 0;
 }
 
-LLAMA_API int llama_bcast_startup_args(llama_context *ctx, uint32_t rank, startup_args *args) {
+int llama_bcast_startup_args(llama_context * ctx, uint32_t rank, startup_args * args) {
     int32_t n_world = ctx->cparams.n_world;
     if (n_world == 1) {
         return 0;
     }
     GGML_ASSERT(ctx != nullptr && ctx->send_socket != nullptr);
-    if (rank==0){
+    if (rank == 0){
         // send
         try {
             std::vector<zmq::message_t> send_msgs;
@@ -20289,7 +20289,7 @@ LLAMA_API int llama_bcast_startup_args(llama_context *ctx, uint32_t rank, startu
             GGML_ASSERT(recv_msgs[1].size() == sizeof(bool));
             bool should_profile = *static_cast<bool *>(recv_msgs[1].data());
             args->should_profile = should_profile;
-            if (rank != n_world-1){
+            if ((int)rank != (int)n_world - 1){
                 // send
                 try {
                     zmq::send_multipart(*ctx->send_socket, recv_msgs);