diff --git a/common/arg.cpp b/common/arg.cpp
index 813f87e8..2f999dcc 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1555,13 +1555,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
         "number of layers to store in VRAM for the draft model",
         [](gpt_params & params, int value) {
-            params.n_gpu_layers_draft = value;
+            params.n_gpu_layers_draft = value; // TODO: remove
+            params.speculative.n_gpu_layers = value;
             if (!llama_supports_gpu_offload()) {
                 fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
                 fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(llama_arg(
         {"-sm", "--split-mode"}, "{none,layer,row}",
         "how to split the model across multiple GPUs, one of:\n"
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index d086eaf3..93a9cf33 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -727,6 +727,7 @@ struct server_context {
             params_dft.model        = params.speculative.model;
             params_dft.n_ctx        = params.speculative.n_ctx;
             params_dft.n_gpu_layers = params.speculative.n_gpu_layers;
+            params_dft.use_mlock    = true;
             params_dft.n_world      = 1; // do not split the draft model across devices
             params_dft.rank         = 0; // always load the draft model on the head device
@@ -749,9 +750,14 @@ struct server_context {
                 return false;
             }
-
+
             cparams_dft = llama_context_params_from_gpt_params(params);
             cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context);
+            cparams_dft.n_world = 1;
+            cparams_dft.rank    = 0;
+            std::fill_n(cparams_dft.n_layer_window, 32, 0);
+            cparams_dft.n_layer_window[0] = llama_n_layer(model_dft);
+            cparams_dft.n_gpu_layers = params.speculative.n_gpu_layers;
 
             // the context is not needed - we will create one for each slot
             llama_free(llama_init_dft.context);
@@ -785,10 +791,10 @@ struct server_context {
                 slot.ctx_dft = llama_new_context_with_model(model_dft, cparams_dft);
 
-                if (llama_context_setup_backend(model, cparams_dft, slot.ctx_dft) == nullptr) {
-                    SRV_ERR("%s: failed to setup context with model '%s'\n", __func__, params.model.c_str());
+                if (llama_context_setup_backend(model_dft, cparams_dft, slot.ctx_dft) == nullptr) {
+                    SRV_ERR("%s: failed to setup context with model '%s'\n", __func__, params.speculative.model.c_str());
                     llama_free(slot.ctx_dft);
-                    llama_free_model(model);
+                    llama_free_model(model_dft);
                     return;
                 }