mirror of
https://github.com/Lizonghang/prima.cpp.git
synced 2025-09-10 06:25:07 +00:00
fix: some fields in cparams_draft
This commit is contained in:
parent
2e8e42a5ad
commit
d248f3c40e
2 changed files with 13 additions and 6 deletions
|
@ -1555,13 +1555,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
||||||
{"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
|
{"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
|
||||||
"number of layers to store in VRAM for the draft model",
|
"number of layers to store in VRAM for the draft model",
|
||||||
[](gpt_params & params, int value) {
|
[](gpt_params & params, int value) {
|
||||||
params.n_gpu_layers_draft = value;
|
params.n_gpu_layers_draft = value; // TODO: remove
|
||||||
|
params.speculative.n_gpu_layers = value;
|
||||||
if (!llama_supports_gpu_offload()) {
|
if (!llama_supports_gpu_offload()) {
|
||||||
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
|
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
|
||||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
||||||
add_opt(llama_arg(
|
add_opt(llama_arg(
|
||||||
{"-sm", "--split-mode"}, "{none,layer,row}",
|
{"-sm", "--split-mode"}, "{none,layer,row}",
|
||||||
"how to split the model across multiple GPUs, one of:\n"
|
"how to split the model across multiple GPUs, one of:\n"
|
||||||
|
|
|
@ -727,6 +727,7 @@ struct server_context {
|
||||||
params_dft.model = params.speculative.model;
|
params_dft.model = params.speculative.model;
|
||||||
params_dft.n_ctx = params.speculative.n_ctx;
|
params_dft.n_ctx = params.speculative.n_ctx;
|
||||||
params_dft.n_gpu_layers = params.speculative.n_gpu_layers;
|
params_dft.n_gpu_layers = params.speculative.n_gpu_layers;
|
||||||
|
params_dft.use_mlock = true;
|
||||||
params_dft.n_world = 1; // do not split the draft model across devices
|
params_dft.n_world = 1; // do not split the draft model across devices
|
||||||
params_dft.rank = 0; // always load the draft model on the head device
|
params_dft.rank = 0; // always load the draft model on the head device
|
||||||
|
|
||||||
|
@ -749,9 +750,14 @@ struct server_context {
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
cparams_dft = llama_context_params_from_gpt_params(params);
|
cparams_dft = llama_context_params_from_gpt_params(params);
|
||||||
cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context);
|
cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context);
|
||||||
|
cparams_dft.n_world = 1;
|
||||||
|
cparams_dft.rank = 0;
|
||||||
|
std::fill_n(cparams_dft.n_layer_window, 32, 0);
|
||||||
|
cparams_dft.n_layer_window[0] = llama_n_layer(model_dft);
|
||||||
|
cparams_dft.n_gpu_layers = params.speculative.n_gpu_layers;
|
||||||
|
|
||||||
// the context is not needed - we will create one for each slot
|
// the context is not needed - we will create one for each slot
|
||||||
llama_free(llama_init_dft.context);
|
llama_free(llama_init_dft.context);
|
||||||
|
@ -785,10 +791,10 @@ struct server_context {
|
||||||
|
|
||||||
slot.ctx_dft = llama_new_context_with_model(model_dft, cparams_dft);
|
slot.ctx_dft = llama_new_context_with_model(model_dft, cparams_dft);
|
||||||
|
|
||||||
if (llama_context_setup_backend(model, cparams_dft, slot.ctx_dft) == nullptr) {
|
if (llama_context_setup_backend(model_dft, cparams_dft, slot.ctx_dft) == nullptr) {
|
||||||
SRV_ERR("%s: failed to setup context with model '%s'\n", __func__, params.model.c_str());
|
SRV_ERR("%s: failed to setup context with model '%s'\n", __func__, params.speculative.model.c_str());
|
||||||
llama_free(slot.ctx_dft);
|
llama_free(slot.ctx_dft);
|
||||||
llama_free_model(model);
|
llama_free_model(model_dft);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue