diff --git a/common/arg.cpp b/common/arg.cpp
index 813f87e8..2f999dcc 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1555,13 +1555,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
         "number of layers to store in VRAM for the draft model",
         [](gpt_params & params, int value) {
-            params.n_gpu_layers_draft = value;
+            params.n_gpu_layers_draft = value; // TODO: remove
+            params.speculative.n_gpu_layers = value;
             if (!llama_supports_gpu_offload()) {
                 fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
                 fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(llama_arg(
         {"-sm", "--split-mode"}, "{none,layer,row}",
         "how to split the model across multiple GPUs, one of:\n"
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index d086eaf3..93a9cf33 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -727,6 +727,7 @@ struct server_context {
             params_dft.model        = params.speculative.model;
             params_dft.n_ctx        = params.speculative.n_ctx;
             params_dft.n_gpu_layers = params.speculative.n_gpu_layers;
+            params_dft.use_mlock    = true;
             params_dft.n_world      = 1; // do not split the draft model across devices
             params_dft.rank         = 0; // always load the draft model on the head device
@@ -749,9 +750,14 @@ struct server_context {
                 return false;
             }
-
+
             cparams_dft = llama_context_params_from_gpt_params(params);
             cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context);
+            cparams_dft.n_world = 1;
+            cparams_dft.rank    = 0;
+            std::fill_n(cparams_dft.n_layer_window, 32, 0);
+            cparams_dft.n_layer_window[0] = llama_n_layer(model_dft);
+            cparams_dft.n_gpu_layers = params.speculative.n_gpu_layers;
 
             // the context is not needed - we will create one for each slot
             llama_free(llama_init_dft.context);
@@ -785,10 +791,10 @@ struct server_context {
                 slot.ctx_dft = llama_new_context_with_model(model_dft, cparams_dft);
 
-                if (llama_context_setup_backend(model, cparams_dft, slot.ctx_dft) == nullptr) {
-                    SRV_ERR("%s: failed to setup context with model '%s'\n", __func__, params.model.c_str());
+                if (llama_context_setup_backend(model_dft, cparams_dft, slot.ctx_dft) == nullptr) {
+                    SRV_ERR("%s: failed to setup context with model '%s'\n", __func__, params.speculative.model.c_str());
                     llama_free(slot.ctx_dft);
-                    llama_free_model(model);
+                    llama_free_model(model_dft);
                     return;
                 }