mirror of
https://github.com/Lizonghang/prima.cpp.git
synced 2025-09-05 22:09:03 +00:00
fix: load draft model first
This commit is contained in:
parent
0cdf99828c
commit
5b4a63abb8
1 changed files with 35 additions and 31 deletions
|
@ -700,6 +700,40 @@ struct server_context {
|
|||
|
||||
// dedicate one sequence to the system prompt
|
||||
params.n_parallel += 1;
|
||||
|
||||
// load draft model first
|
||||
llama_init_result llama_init_dft;
|
||||
if (!params.speculative.model.empty()) {
|
||||
SRV_INF("loading draft model '%s'\n", params.speculative.model.c_str());
|
||||
|
||||
auto params_dft = params;
|
||||
|
||||
params_dft.model = params.speculative.model;
|
||||
params_dft.n_ctx = params.speculative.n_ctx;
|
||||
params_dft.n_gpu_layers = params.speculative.n_gpu_layers;
|
||||
params_dft.use_mlock = true;
|
||||
params_dft.n_world = 1; // do not split the draft model across devicesAdd commentMore actions
|
||||
params_dft.rank = 0; // always load the draft model on the head device
|
||||
|
||||
std::fill_n(params_dft.n_layer_window, params.n_world, 0);
|
||||
|
||||
llama_init_dft = llama_init_from_gpt_params(params_dft);
|
||||
|
||||
model_dft = llama_init_dft.model;
|
||||
|
||||
if (model_dft == nullptr) {
|
||||
SRV_ERR("failed to load draft model, '%s'\n", params.speculative.model.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
cparams_dft = llama_context_params_from_gpt_params(params);
|
||||
cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context);
|
||||
cparams_dft.n_world = 1;
|
||||
cparams_dft.rank = 0;
|
||||
std::fill_n(cparams_dft.n_layer_window, 32, 0);
|
||||
cparams_dft.n_layer_window[0] = llama_n_layer(model_dft);
|
||||
cparams_dft.n_gpu_layers = params.speculative.n_gpu_layers;
|
||||
}
|
||||
|
||||
llama_init_result llama_init = llama_init_from_gpt_params(params);
|
||||
|
||||
|
@ -719,29 +753,7 @@ struct server_context {
|
|||
add_bos_token = llama_add_bos_token(model);
|
||||
has_eos_token = !llama_add_eos_token(model);
|
||||
|
||||
if (!params.speculative.model.empty()) {
|
||||
SRV_INF("loading draft model '%s'\n", params.speculative.model.c_str());
|
||||
|
||||
auto params_dft = params;
|
||||
|
||||
params_dft.model = params.speculative.model;
|
||||
params_dft.n_ctx = params.speculative.n_ctx;
|
||||
params_dft.n_gpu_layers = params.speculative.n_gpu_layers;
|
||||
params_dft.use_mlock = true;
|
||||
params_dft.n_world = 1; // do not split the draft model across devicesAdd commentMore actions
|
||||
params_dft.rank = 0; // always load the draft model on the head device
|
||||
|
||||
std::fill_n(params_dft.n_layer_window, params.n_world, 0);
|
||||
|
||||
llama_init_result llama_init_dft = llama_init_from_gpt_params(params_dft);
|
||||
|
||||
model_dft = llama_init_dft.model;
|
||||
|
||||
if (model_dft == nullptr) {
|
||||
SRV_ERR("failed to load draft model, '%s'\n", params.speculative.model.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!params.speculative.model.empty()){
|
||||
if (!common_speculative_are_compatible(ctx, llama_init_dft.context)) {
|
||||
SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params.speculative.model.c_str(), params.model.c_str());
|
||||
|
||||
|
@ -750,14 +762,6 @@ struct server_context {
|
|||
|
||||
return false;
|
||||
}
|
||||
|
||||
cparams_dft = llama_context_params_from_gpt_params(params);
|
||||
cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context);
|
||||
cparams_dft.n_world = 1;
|
||||
cparams_dft.rank = 0;
|
||||
std::fill_n(cparams_dft.n_layer_window, 32, 0);
|
||||
cparams_dft.n_layer_window[0] = llama_n_layer(model_dft);
|
||||
cparams_dft.n_gpu_layers = params.speculative.n_gpu_layers;
|
||||
|
||||
// the context is not needed - we will create one for each slot
|
||||
llama_free(llama_init_dft.context);
|
||||
|
|
Loading…
Add table
Reference in a new issue