From cfcdfd69bd7f4ee32851d5ad3652cc48b3958f76 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Sat, 7 Jun 2025 10:14:00 +0800
Subject: [PATCH] allow embeddings models to use mmap

---
 expose.h                         | 1 +
 koboldcpp.py                     | 3 ++-
 otherarch/embeddings_adapter.cpp | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/expose.h b/expose.h
index e3dd3f6a3..aafc66fc3 100644
--- a/expose.h
+++ b/expose.h
@@ -256,6 +256,7 @@ struct embeddings_load_model_inputs
     const char * vulkan_info = nullptr;
     const int gpulayers = 0;
     const bool flash_attention = false;
+    const bool use_mmap = false;
     const bool quiet = false;
     const int debugmode = 0;
 };
diff --git a/koboldcpp.py b/koboldcpp.py
index 80fc2ddc9..6d56601f6 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -346,6 +346,7 @@ class embeddings_load_model_inputs(ctypes.Structure):
                 ("vulkan_info", ctypes.c_char_p),
                 ("gpulayers", ctypes.c_int),
                 ("flash_attention", ctypes.c_bool),
+                ("use_mmap", ctypes.c_bool),
                 ("quiet", ctypes.c_bool),
                 ("debugmode", ctypes.c_int)]
 
@@ -1213,7 +1214,6 @@ def load_model(model_filename):
     inputs.lora_multiplier = args.loramult
     if args.lora:
         inputs.lora_filename = args.lora[0].encode("UTF-8")
-    inputs.use_mmap = False
 
     inputs.draftmodel_filename = args.draftmodel.encode("UTF-8") if args.draftmodel else "".encode("UTF-8")
     inputs.draft_amount = args.draftamount
@@ -1741,6 +1741,7 @@ def embeddings_load_model(model_filename):
     inputs.gpulayers = 0
     inputs.flash_attention = False
     inputs.threads = args.threads
+    inputs.use_mmap = args.usemmap
     inputs = set_backend_props(inputs)
     ret = handle.embeddings_load_model(inputs)
     return ret
diff --git a/otherarch/embeddings_adapter.cpp b/otherarch/embeddings_adapter.cpp
index 0a71a06f8..9bbc0f1f0 100644
--- a/otherarch/embeddings_adapter.cpp
+++ b/otherarch/embeddings_adapter.cpp
@@ -117,7 +117,7 @@ bool embeddingstype_load_model(const embeddings_load_model_inputs inputs)
     llama_model_params model_params = llama_model_default_params();
     llama_context_params ctx_params = llama_context_default_params();
     const int nthreads = inputs.threads;
-    model_params.use_mmap = false;
+    model_params.use_mmap = inputs.use_mmap;
     model_params.use_mlock = false;
     model_params.n_gpu_layers = inputs.gpulayers; //offload if possible
     model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER;
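
Note (illustration, not part of the patch): the reason the expose.h and koboldcpp.py
hunks must change together is that ctypes lays out _fields_ strictly in declaration
order. The new use_mmap member has to be inserted between flash_attention and quiet on
the Python side exactly as in the C struct, or every later field (quiet, debugmode)
would be read at the wrong offset. A minimal runnable sketch, with a made-up class
name embeddings_inputs_tail and the struct trimmed to its last four fields:

    import ctypes

    # Trimmed mirror of the tail of embeddings_load_model_inputs (expose.h):
    #     const bool flash_attention;
    #     const bool use_mmap;    <- field added by this patch
    #     const bool quiet;
    #     const int debugmode;
    class embeddings_inputs_tail(ctypes.Structure):
        _fields_ = [("flash_attention", ctypes.c_bool),
                    ("use_mmap", ctypes.c_bool),
                    ("quiet", ctypes.c_bool),
                    ("debugmode", ctypes.c_int)]

    inputs = embeddings_inputs_tail()
    inputs.use_mmap = True  # mirrors `inputs.use_mmap = args.usemmap` in the patch
    print(inputs.use_mmap, inputs.quiet, inputs.debugmode)  # True False 0

Making the flag configurable matters because mmap lets the OS page model weights in
from the GGUF file on demand rather than copying them into anonymous memory up front,
which is why the previously hard-coded use_mmap = false is now wired through to the
--usemmap setting.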