From 4b2ca1169c8a3fb0891e0807030327db60093b15 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Wed, 13 Aug 2025 19:28:53 +0800
Subject: [PATCH] more consistency fixes

---
 koboldcpp.py                     | 2 +-
 otherarch/embeddings_adapter.cpp | 2 ++
 otherarch/tts_adapter.cpp        | 2 ++
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/koboldcpp.py b/koboldcpp.py
index 79d9547e2..d63bfe0eb 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -1898,7 +1898,7 @@ def embeddings_load_model(model_filename):
     inputs = embeddings_load_model_inputs()
     inputs.model_filename = model_filename.encode("UTF-8")
     inputs.gpulayers = (999 if args.embeddingsgpu else 0)
-    inputs.flash_attention = False
+    inputs.flash_attention = args.flashattention
     inputs.threads = args.threads
     inputs.use_mmap = args.usemmap
     inputs.embeddingsmaxctx = (args.embeddingsmaxctx if args.embeddingsmaxctx else args.contextsize) # for us to clamp to contextsize if embeddingsmaxctx unspecified
diff --git a/otherarch/embeddings_adapter.cpp b/otherarch/embeddings_adapter.cpp
index daad79ab6..e9a57e031 100644
--- a/otherarch/embeddings_adapter.cpp
+++ b/otherarch/embeddings_adapter.cpp
@@ -120,6 +120,8 @@ bool embeddingstype_load_model(const embeddings_load_model_inputs inputs)
     model_params.use_mmap = inputs.use_mmap;
     model_params.use_mlock = false;
     model_params.n_gpu_layers = inputs.gpulayers; //offload if possible
+    int kcpp_parseinfo_maindevice = inputs.kcpp_main_gpu<=0?0:inputs.kcpp_main_gpu;
+    model_params.main_gpu = kcpp_parseinfo_maindevice;
     model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER;
 
     llama_model * embeddingsmodel = llama_model_load_from_file(modelfile.c_str(), model_params);
diff --git a/otherarch/tts_adapter.cpp b/otherarch/tts_adapter.cpp
index 56aa605b6..86caf67e5 100644
--- a/otherarch/tts_adapter.cpp
+++ b/otherarch/tts_adapter.cpp
@@ -532,6 +532,8 @@ bool ttstype_load_model(const tts_load_model_inputs inputs)
     tts_model_params.use_mlock = false;
     tts_model_params.n_gpu_layers = inputs.gpulayers; //offload if possible
     tts_model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER;
+    int kcpp_parseinfo_maindevice = inputs.kcpp_main_gpu<=0?0:inputs.kcpp_main_gpu;
+    tts_model_params.main_gpu = kcpp_parseinfo_maindevice;
     tts_ctx_params.n_ctx = 8192;
     tts_ctx_params.offload_kqv = true;
     tts_ctx_params.n_batch = 8192;
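
Note: the two C++ hunks add the same main-device selection to both adapters: a non-positive kcpp_main_gpu (i.e. unset or default) falls back to device 0, and any positive value is used as the device index directly. A minimal standalone sketch of that clamp follows; the helper name kcpp_pick_main_device and the test values are illustrative only, not part of the patch:

    #include <cassert>

    // Mirrors the clamp used in both adapters above: a non-positive request
    // (unset/default) selects device 0, a positive request selects that
    // device index directly.
    static int kcpp_pick_main_device(int kcpp_main_gpu) {
        return kcpp_main_gpu <= 0 ? 0 : kcpp_main_gpu;
    }

    int main() {
        assert(kcpp_pick_main_device(-1) == 0); // unset -> default device 0
        assert(kcpp_pick_main_device(0)  == 0); // explicit 0 -> device 0
        assert(kcpp_pick_main_device(2)  == 2); // explicit index passes through
        return 0;
    }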