From 5b4a63abb81f3f1d0853839613a70afded2cb4d7 Mon Sep 17 00:00:00 2001
From: DeEMO
Date: Fri, 27 Jun 2025 06:30:57 +0000
Subject: [PATCH] fix: load draft model first

---
 examples/server/server.cpp | 66 ++++++++++++++++++++------------------
 1 file changed, 35 insertions(+), 31 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 93a9cf33..5b970e4f 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -700,6 +700,40 @@ struct server_context {
         // dedicate one sequence to the system prompt
         params.n_parallel += 1;
+
+        // load draft model first
+        llama_init_result llama_init_dft;
+        if (!params.speculative.model.empty()) {
+            SRV_INF("loading draft model '%s'\n", params.speculative.model.c_str());
+
+            auto params_dft = params;
+
+            params_dft.model        = params.speculative.model;
+            params_dft.n_ctx        = params.speculative.n_ctx;
+            params_dft.n_gpu_layers = params.speculative.n_gpu_layers;
+            params_dft.use_mlock    = true;
+            params_dft.n_world      = 1; // do not split the draft model across devices
+            params_dft.rank         = 0; // always load the draft model on the head device
+
+            std::fill_n(params_dft.n_layer_window, params.n_world, 0);
+
+            llama_init_dft = llama_init_from_gpt_params(params_dft);
+
+            model_dft = llama_init_dft.model;
+
+            if (model_dft == nullptr) {
+                SRV_ERR("failed to load draft model, '%s'\n", params.speculative.model.c_str());
+                return false;
+            }
+
+            cparams_dft = llama_context_params_from_gpt_params(params);
+            cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context);
+            cparams_dft.n_world = 1;
+            cparams_dft.rank    = 0;
+            std::fill_n(cparams_dft.n_layer_window, 32, 0);
+            cparams_dft.n_layer_window[0] = llama_n_layer(model_dft);
+            cparams_dft.n_gpu_layers      = params.speculative.n_gpu_layers;
+        }
 
         llama_init_result llama_init = llama_init_from_gpt_params(params);
@@ -719,29 +753,7 @@ struct server_context {
         add_bos_token = llama_add_bos_token(model);
         has_eos_token = !llama_add_eos_token(model);
 
-        if (!params.speculative.model.empty()) {
-            SRV_INF("loading draft model '%s'\n", params.speculative.model.c_str());
-
-            auto params_dft = params;
-
-            params_dft.model        = params.speculative.model;
-            params_dft.n_ctx        = params.speculative.n_ctx;
-            params_dft.n_gpu_layers = params.speculative.n_gpu_layers;
-            params_dft.use_mlock    = true;
-            params_dft.n_world      = 1; // do not split the draft model across devices
-            params_dft.rank         = 0; // always load the draft model on the head device
-
-            std::fill_n(params_dft.n_layer_window, params.n_world, 0);
-
-            llama_init_result llama_init_dft = llama_init_from_gpt_params(params_dft);
-
-            model_dft = llama_init_dft.model;
-
-            if (model_dft == nullptr) {
-                SRV_ERR("failed to load draft model, '%s'\n", params.speculative.model.c_str());
-                return false;
-            }
-
+        if (!params.speculative.model.empty()) {
             if (!common_speculative_are_compatible(ctx, llama_init_dft.context)) {
                 SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n",
                         params.speculative.model.c_str(), params.model.c_str());
@@ -750,14 +762,6 @@ struct server_context {
                 return false;
             }
-
-            cparams_dft = llama_context_params_from_gpt_params(params);
-            cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context);
-            cparams_dft.n_world = 1;
-            cparams_dft.rank    = 0;
-            std::fill_n(cparams_dft.n_layer_window, 32, 0);
-            cparams_dft.n_layer_window[0] = llama_n_layer(model_dft);
-            cparams_dft.n_gpu_layers      = params.speculative.n_gpu_layers;
 
             // the context is not needed - we will create one for each slot
             llama_free(llama_init_dft.context);
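
Reviewer note: the net effect of the move is easier to see outside the diff. The
sketch below is a minimal model of the post-patch load order, not the server's
actual API: stub types stand in for the llama.cpp structs, and load_model()
stands in for llama_init_from_gpt_params(); all names and paths here are
illustrative.

    // sketch.cpp - post-patch load order: draft model first, then target,
    // then the compatibility check, then the draft context is dropped
    #include <cstdio>
    #include <memory>
    #include <string>

    struct model   { std::string path; };
    struct context { int n_ctx = 4096; };

    struct init_result {
        std::unique_ptr<model>   m;
        std::unique_ptr<context> ctx;
    };

    // stand-in loader: returns an empty result on failure
    static init_result load_model(const std::string & path) {
        if (path.empty()) {
            return {};
        }
        return { std::make_unique<model>(model{path}), std::make_unique<context>() };
    }

    static bool load(const std::string & target, const std::string & draft) {
        // 1. load the draft model first - this is what the patch changes
        init_result init_dft;
        if (!draft.empty()) {
            init_dft = load_model(draft);
            if (!init_dft.m) {
                std::fprintf(stderr, "failed to load draft model '%s'\n", draft.c_str());
                return false;
            }
            // cparams_dft is derived here, pinned to a single device
            // (n_world = 1, rank = 0)
        }

        // 2. only then load the (potentially distributed) target model
        init_result init = load_model(target);
        if (!init.m) {
            std::fprintf(stderr, "failed to load model '%s'\n", target.c_str());
            return false;
        }

        // 3. with both models loaded, the compatibility check runs,
        //    then the draft context is discarded (one is created per slot)
        if (init_dft.m) {
            init_dft.ctx.reset();
        }
        return true;
    }

    int main() {
        return load("target.gguf", "draft.gguf") ? 0 : 1;
    }

The ordering is the whole point: the draft model now fails fast on the head
device before the target model is brought up, while the compatibility check and
the per-slot context cleanup still happen exactly where they did before.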