more consistency fixes

Concedo 2025-08-13 19:28:53 +08:00
parent 955cf66bbc
commit 4b2ca1169c
3 changed files with 5 additions and 1 deletion


@@ -1898,7 +1898,7 @@ def embeddings_load_model(model_filename):
     inputs = embeddings_load_model_inputs()
     inputs.model_filename = model_filename.encode("UTF-8")
     inputs.gpulayers = (999 if args.embeddingsgpu else 0)
-    inputs.flash_attention = False
+    inputs.flash_attention = args.flashattention
     inputs.threads = args.threads
     inputs.use_mmap = args.usemmap
     inputs.embeddingsmaxctx = (args.embeddingsmaxctx if args.embeddingsmaxctx else args.contextsize) # for us to clamp to contextsize if embeddingsmaxctx unspecified
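The Python hunk stops hardcoding flash attention off for the embeddings model and forwards the user's --flashattention setting instead. A minimal standalone sketch of what that flag-threading amounts to, with stand-in types (the real embeddings_load_model_inputs layout and the backend params it ultimately feeds are not part of this diff):

    // Standalone sketch with stand-in types, not koboldcpp's real structs.
    // The point: the flag is now threaded through instead of pinned false.
    #include <iostream>

    struct load_inputs {       // stand-in for embeddings_load_model_inputs
        bool flash_attention;  // set from args.flashattention in the Python layer
    };

    struct ctx_params {        // stand-in for the backend's context params
        bool flash_attn = false;
    };

    static ctx_params make_ctx_params(const load_inputs &in) {
        ctx_params p;
        p.flash_attn = in.flash_attention; // previously always false here
        return p;
    }

    int main() {
        load_inputs in{true}; // as if the user passed --flashattention
        std::cout << make_ctx_params(in).flash_attn << "\n"; // prints 1
        return 0;
    }

The embeddings endpoint thereby follows the same flash-attention choice as the rest of the program, which fits the commit's consistency theme.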


@@ -120,6 +120,8 @@ bool embeddingstype_load_model(const embeddings_load_model_inputs inputs)
     model_params.use_mmap = inputs.use_mmap;
     model_params.use_mlock = false;
     model_params.n_gpu_layers = inputs.gpulayers; //offload if possible
+    int kcpp_parseinfo_maindevice = inputs.kcpp_main_gpu<=0?0:inputs.kcpp_main_gpu;
+    model_params.main_gpu = kcpp_parseinfo_maindevice;
     model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER;
     llama_model * embeddingsmodel = llama_model_load_from_file(modelfile.c_str(), model_params);


@@ -532,6 +532,8 @@ bool ttstype_load_model(const tts_load_model_inputs inputs)
     tts_model_params.use_mlock = false;
     tts_model_params.n_gpu_layers = inputs.gpulayers; //offload if possible
     tts_model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER;
+    int kcpp_parseinfo_maindevice = inputs.kcpp_main_gpu<=0?0:inputs.kcpp_main_gpu;
+    tts_model_params.main_gpu = kcpp_parseinfo_maindevice;
     tts_ctx_params.n_ctx = 8192;
     tts_ctx_params.offload_kqv = true;
     tts_ctx_params.n_batch = 8192;
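Both C++ hunks add the same main-GPU selection before loading: a requested index of zero or below falls back to device 0, while any positive index is honored. A minimal sketch of that clamp in isolation (pick_main_device is a hypothetical helper name; the diff inlines the ternary into kcpp_parseinfo_maindevice):

    // Sketch of the shared main-GPU clamp from both hunks above.
    #include <cassert>

    static int pick_main_device(int requested_gpu) {
        // <= 0 covers both "unspecified" and invalid negative indices,
        // so the loader defaults to device 0 in those cases.
        return requested_gpu <= 0 ? 0 : requested_gpu;
    }

    int main() {
        assert(pick_main_device(-1) == 0); // unset -> default device
        assert(pick_main_device(0) == 0);  // explicit 0 stays 0
        assert(pick_main_device(2) == 2);  // explicit index honored
        return 0;
    }

Since the embeddings and TTS loaders now repeat this ternary verbatim, factoring it into one shared helper would be a natural follow-up, though the diff keeps the logic inline.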