mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-10 17:14:36 +00:00
allow embeddings models to use mmap
This commit is contained in:
parent
abc272d89f
commit
cfcdfd69bd
3 changed files with 4 additions and 2 deletions
1
expose.h
1
expose.h
|
@ -256,6 +256,7 @@ struct embeddings_load_model_inputs
|
||||||
const char * vulkan_info = nullptr;
|
const char * vulkan_info = nullptr;
|
||||||
const int gpulayers = 0;
|
const int gpulayers = 0;
|
||||||
const bool flash_attention = false;
|
const bool flash_attention = false;
|
||||||
|
const bool use_mmap = false;
|
||||||
const bool quiet = false;
|
const bool quiet = false;
|
||||||
const int debugmode = 0;
|
const int debugmode = 0;
|
||||||
};
|
};
|
||||||
|
|
|
@ -346,6 +346,7 @@ class embeddings_load_model_inputs(ctypes.Structure):
|
||||||
("vulkan_info", ctypes.c_char_p),
|
("vulkan_info", ctypes.c_char_p),
|
||||||
("gpulayers", ctypes.c_int),
|
("gpulayers", ctypes.c_int),
|
||||||
("flash_attention", ctypes.c_bool),
|
("flash_attention", ctypes.c_bool),
|
||||||
|
("use_mmap", ctypes.c_bool),
|
||||||
("quiet", ctypes.c_bool),
|
("quiet", ctypes.c_bool),
|
||||||
("debugmode", ctypes.c_int)]
|
("debugmode", ctypes.c_int)]
|
||||||
|
|
||||||
|
@ -1213,7 +1214,6 @@ def load_model(model_filename):
|
||||||
inputs.lora_multiplier = args.loramult
|
inputs.lora_multiplier = args.loramult
|
||||||
if args.lora:
|
if args.lora:
|
||||||
inputs.lora_filename = args.lora[0].encode("UTF-8")
|
inputs.lora_filename = args.lora[0].encode("UTF-8")
|
||||||
inputs.use_mmap = False
|
|
||||||
|
|
||||||
inputs.draftmodel_filename = args.draftmodel.encode("UTF-8") if args.draftmodel else "".encode("UTF-8")
|
inputs.draftmodel_filename = args.draftmodel.encode("UTF-8") if args.draftmodel else "".encode("UTF-8")
|
||||||
inputs.draft_amount = args.draftamount
|
inputs.draft_amount = args.draftamount
|
||||||
|
@ -1741,6 +1741,7 @@ def embeddings_load_model(model_filename):
|
||||||
inputs.gpulayers = 0
|
inputs.gpulayers = 0
|
||||||
inputs.flash_attention = False
|
inputs.flash_attention = False
|
||||||
inputs.threads = args.threads
|
inputs.threads = args.threads
|
||||||
|
inputs.use_mmap = args.usemmap
|
||||||
inputs = set_backend_props(inputs)
|
inputs = set_backend_props(inputs)
|
||||||
ret = handle.embeddings_load_model(inputs)
|
ret = handle.embeddings_load_model(inputs)
|
||||||
return ret
|
return ret
|
||||||
|
|
|
@ -117,7 +117,7 @@ bool embeddingstype_load_model(const embeddings_load_model_inputs inputs)
|
||||||
llama_model_params model_params = llama_model_default_params();
|
llama_model_params model_params = llama_model_default_params();
|
||||||
llama_context_params ctx_params = llama_context_default_params();
|
llama_context_params ctx_params = llama_context_default_params();
|
||||||
const int nthreads = inputs.threads;
|
const int nthreads = inputs.threads;
|
||||||
model_params.use_mmap = false;
|
model_params.use_mmap = inputs.use_mmap;
|
||||||
model_params.use_mlock = false;
|
model_params.use_mlock = false;
|
||||||
model_params.n_gpu_layers = inputs.gpulayers; //offload if possible
|
model_params.n_gpu_layers = inputs.gpulayers; //offload if possible
|
||||||
model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER;
|
model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue