autofit will clear moecpu and overridetensors

Concedo 2026-03-18 21:20:57 +08:00
parent f8aa711e5c
commit 15e86010d8
3 changed files with 35 additions and 35 deletions

README.md

@@ -66,7 +66,8 @@ Finally, obtain and load a GGUF model. See [here](#Obtaining-a-GGUF-model)
 ## Obtaining a GGUF model
 - KoboldCpp uses GGUF models. They are not included with KoboldCpp, but you can download GGUF files from other places such as [Bartowski's Huggingface](https://huggingface.co/bartowski). Search for "GGUF" on huggingface.co for plenty of compatible models in the `.gguf` format.
-- For beginners, we recommend the models [L3-8B-Stheno-v3.2](https://huggingface.co/bartowski/L3-8B-Stheno-v3.2-GGUF/resolve/main/L3-8B-Stheno-v3.2-Q4_K_S.gguf) (smaller and weaker) or [Tiefighter 13B](https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf) (old but very versatile model) or [Gemma-3-27B Abliterated](https://huggingface.co/mlabonne/gemma-3-27b-it-abliterated-GGUF/resolve/main/gemma-3-27b-it-abliterated.q4_k_m.gguf) (largest and most powerful)
+- For beginners, we recommend [Qwen3-VL-8B](https://huggingface.co/unsloth/Qwen3-VL-8B-Instruct-GGUF/resolve/main/Qwen3-VL-8B-Instruct-Q4_K_S.gguf) **(Most Recommended, best all-rounder model)**
+- For creative writing and roleplay, you can try [L3-8B-Stheno-v3.2](https://huggingface.co/bartowski/L3-8B-Stheno-v3.2-GGUF/resolve/main/L3-8B-Stheno-v3.2-Q4_K_S.gguf) (old, smaller and weaker) or [Tiefighter 13B](https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf) (old but very versatile model).
 - [Alternatively, you can download the tools to convert models to the GGUF format yourself here](https://kcpptools.concedo.workers.dev). Run `convert-hf-to-gguf.py` to convert them, then `quantize_gguf.exe` to quantize the result; a rough sketch of this pipeline follows below.
 - Other models for Whisper (speech recognition), Image Generation, Text to Speech or Image Recognition [can be found on the Wiki](https://github.com/LostRuins/koboldcpp/wiki#what-models-does-koboldcpp-support-what-architectures-are-supported)
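As a rough illustration of that two-step convert-then-quantize pipeline, driven from Python: the exact flags of `convert-hf-to-gguf.py` and the argument order of `quantize_gguf.exe` are assumptions here, so check each tool's own help output, and the file paths are placeholders.

    import subprocess

    # 1. Convert a HuggingFace model folder to a full-precision GGUF file
    #    (the --outfile flag is an assumption; see the script's --help).
    subprocess.run(["python3", "convert-hf-to-gguf.py", "path/to/hf-model",
                    "--outfile", "model-f16.gguf"], check=True)
    # 2. Quantize the converted file down to a smaller format such as Q4_K_S
    #    (input, output, quant-type argument order is assumed).
    subprocess.run(["quantize_gguf.exe", "model-f16.gguf",
                    "model-Q4_K_S.gguf", "Q4_K_S"], check=True)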
@@ -132,7 +133,7 @@ when you can't use the precompiled binary directly, we provide an automated buil
 - OpenBSD's default datasize limit may prevent compilation; `ulimit -d 8388608` should work
 - compile using `gmake LLAMA_VULKAN=1`
 - After all binaries are built, you can run the python script with the command `python3 koboldcpp.py --model [ggml_model.gguf]`
 ### Compiling on Android (Termux Installation)
 - [First, install and run Termux from F-Droid](https://f-droid.org/en/packages/com.termux/)
+## Termux Quick Setup Script (Easy Setup)

klite.embd

@@ -6664,6 +6664,19 @@ Current version indicated by LITEVER below.
 	}
 	return header;
 }
+function get_oai_header(apikey)
+{
+	let oaiheaders = {'Content-Type': 'application/json'};
+	if (apikey!="" && apikey!=dummy_api_key) {
+		oaiheaders['Authorization'] = 'Bearer ' + apikey;
+	}
+	if(uses_cors_proxy && localsettings.proxy_disable_stream)
+	{
+		oaiheaders['nostream'] = true;
+	}
+	return oaiheaders;
+}
 //synchronous and SSE kai and openAI requests
 function trigger_abort_controller() //triggers an abort of an in-progress gen http request
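The added helper centralizes the header construction that the hunks below previously repeated inline at each call site. A minimal Python rendering of the same logic, for illustration only, with the Lite globals (`dummy_api_key`, `uses_cors_proxy`, `localsettings.proxy_disable_stream`) folded into plain parameters:

    def get_oai_header(apikey, dummy_api_key="", proxy_nostream=False):
        # Every OpenAI-compatible request sends JSON.
        headers = {"Content-Type": "application/json"}
        # Attach Authorization only when a real (non-placeholder) key exists.
        if apikey and apikey != dummy_api_key:
            headers["Authorization"] = "Bearer " + apikey
        # proxy_nostream stands in for uses_cors_proxy and
        # localsettings.proxy_disable_stream: it marks the request so the
        # CORS proxy does not stream the response back.
        if proxy_nostream:
            headers["nostream"] = True
        return headers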
@@ -13333,10 +13346,8 @@ Current version indicated by LITEVER below.
 	let desired_oai_ep = document.getElementById("custom_oai_endpoint").value.trim();
 	desired_oai_ep = transform_oai_ep(desired_oai_ep);
-	let oaiheaders = {};
-	if(desired_oai_key!="" && desired_oai_key!=dummy_api_key){
-		oaiheaders["Authorization"] = "Bearer " + desired_oai_key;
-	};
+	let oaiheaders = get_oai_header(desired_oai_key);
 	if (desired_oai_ep.toLowerCase().includes("api.mistral.ai") || desired_oai_ep.toLowerCase().includes("api.x.ai")) {
 		if(desired_oai_key=="" || desired_oai_key==dummy_api_key)
 		{
@@ -13344,10 +13355,6 @@ Current version indicated by LITEVER below.
 			return;
 		}
 	}
-	if(uses_cors_proxy && localsettings.proxy_disable_stream)
-	{
-		oaiheaders['nostream'] = true;
-	}
 	autofetch_attempt_dict[desired_oai_ep] = true;
@@ -13438,11 +13445,7 @@ Current version indicated by LITEVER below.
 function fetch_openrouter_balance()
 {
 	let desired_oai_key = document.getElementById("custom_oai_key").value.trim();
-	let oaiheaders = {};
-	if(desired_oai_key!="" && desired_oai_key!=dummy_api_key){
-		oaiheaders["Authorization"] = "Bearer " + desired_oai_key;
-	};
+	let oaiheaders = get_oai_header(desired_oai_key);
 	let fetchurl = default_openrouter_base + openrouter_credits_endpoint;
@@ -20896,18 +20899,7 @@ Current version indicated by LITEVER below.
 	last_request_timestamp = get_current_timestamp();
 	last_response_obj = null;
 	last_response_streamlog = "";
-	let oaiheaders = {
-		'Content-Type': 'application/json'
-	};
-	if (custom_oai_key!="" && custom_oai_key!=dummy_api_key) {
-		oaiheaders['Authorization'] = 'Bearer ' + custom_oai_key;
-	}
-	if(uses_cors_proxy && localsettings.proxy_disable_stream)
-	{
-		oaiheaders['nostream'] = true;
-	}
+	let oaiheaders = get_oai_header(custom_oai_key);
 	if(targetep.toLowerCase().includes("api.novita.ai"))
 	{
@@ -28477,12 +28469,7 @@ Current version indicated by LITEVER below.
 //queries the decensoring prefix from a second OAI compatible endpoint and returns a string to add to our main request
 const FetchDecensoredPrefix = asyncRunner(function* (submit_payload, endpoint_url, modelused, num_tokens)
 {
-	let oaiheaders = {
-		'Content-Type': 'application/json'
-	};
-	if (custom_oai_key!="" && custom_oai_key!=dummy_api_key) {
-		oaiheaders['Authorization'] = 'Bearer ' + custom_oai_key;
-	}
+	let oaiheaders = get_oai_header(custom_oai_key);
 	let scaled_rep_pen = 0;
 	if(submit_payload.params.presence_penalty > 0)

koboldcpp.py

@@ -6851,9 +6851,17 @@ def show_gui():
         quick_gpu_layers_entry.grid_remove()
         autofit_padding_label.grid(row=6, column=0, padx=8, pady=1, stick="nw")
         autofit_padding_entry.grid(row=6, column=0, padx=160, pady=1, stick="nw")
+        moecpu_box.grid_remove()
+        tenos_box.grid_remove()
+        moecpu_box_lbl.grid_remove()
+        tenos_box_lbl.grid_remove()
     else:
         autofit_padding_label.grid_remove()
         autofit_padding_entry.grid_remove()
+        moecpu_box.grid()
+        tenos_box.grid()
+        moecpu_box_lbl.grid()
+        tenos_box_lbl.grid()
     changed_gpulayers_estimate()
     changed_gpu_choice_var()
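For context on this hunk: Tkinter's grid_remove() hides a widget but remembers its grid options, so a later bare grid() call restores it in the same cell; that is why the else branch can re-show the boxes without repeating row/column arguments. A standalone sketch of the pattern (the widget here is illustrative, not the GUI's real one):

    import tkinter as tk

    root = tk.Tk()
    moecpu_box_lbl = tk.Label(root, text="MoE CPU Layers:")
    moecpu_box_lbl.grid(row=0, column=0)   # grid options are recorded here
    moecpu_box_lbl.grid_remove()           # hidden, but options are kept
    moecpu_box_lbl.grid()                  # reappears at row=0, column=0
    root.mainloop()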
@@ -7020,9 +7028,9 @@ def show_gui():
     jinjatoolsbox = makecheckbox(context_tab, "Jinja for Tools", jinja_tools_var, row=45 ,padx=(140), tooltiptxt="Allows jinja even with tool calls. If unchecked, jinja will be disabled when tools are used.")
     jinja_var.trace_add("write", togglejinja)
     makelabelentry(context_tab, "MoE Experts:", moeexperts_var, row=55, padx=(120), singleline=True, tooltip="Override number of MoE experts.")
-    makelabelentry(context_tab, "MoE CPU Layers:", moecpu_var, row=55, padx=(320), singleline=True, tooltip="Force Mixture of Experts (MoE) weights of the first N layers to the CPU.\nSetting it higher than GPU layers has no effect.", labelpadx=(210))
+    moecpu_box,moecpu_box_lbl = makelabelentry(context_tab, "MoE CPU Layers:", moecpu_var, row=55, padx=(320), singleline=True, tooltip="Force Mixture of Experts (MoE) weights of the first N layers to the CPU.\nSetting it higher than GPU layers has no effect.", labelpadx=(210))
     makelabelentry(context_tab, "Override KV:", override_kv_var, row=57, padx=(120), singleline=True, width=150, tooltip="Override metadata value by key. Separate multiple values with commas. Format is name=type:value. Types: int, float, bool, str")
-    makelabelentry(context_tab, "Override Tensors:", override_tensors_var, row=59, padx=(120), singleline=True, width=150, tooltip="Override selected backend for specific tensors matching tensor_name_regex_pattern=buffer_type, same as in llama.cpp.")
+    tenos_box,tenos_box_lbl = makelabelentry(context_tab, "Override Tensors:", override_tensors_var, row=59, padx=(120), singleline=True, width=150, tooltip="Override selected backend for specific tensors matching tensor_name_regex_pattern=buffer_type, same as in llama.cpp.")
     # Model Tab
     model_tab = tabcontent["Loaded Files"]
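makelabelentry is koboldcpp's own GUI helper; the change above only captures its return value so the entry widget and its label can be hidden or shown as a pair. A hypothetical reduction of such a helper, purely to show why it returns the entry first and the label second (the real signature in koboldcpp.py takes more options):

    import tkinter as tk

    def makelabelentry(parent, text, var, row):
        # Build a label/entry pair and return both so callers can
        # grid()/grid_remove() them together, e.g.
        # box, lbl = makelabelentry(tab, "MoE CPU Layers:", some_var, 55)
        lbl = tk.Label(parent, text=text)
        lbl.grid(row=row, column=0, sticky="nw")
        box = tk.Entry(parent, textvariable=var)
        box.grid(row=row, column=1, sticky="nw")
        return box, lbl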
@@ -9289,6 +9297,10 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
     if MaxMemory[0] == 0: #try to get gpu vram for cuda if not picked yet
         fetch_gpu_properties(True,True)
         pass
+    if args.autofit:
+        print("Forced autofit is selected, moecpu and overridetensors will be set automatically.")
+        args.overridetensors = ""
+        args.moecpu = 0
     if args.gpulayers==-1:
         if (not args.usecpu) and ((args.usecuda is not None) or (args.usevulkan is not None) or sys.platform=="darwin"):
             if MaxMemory[0] > 0:
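A hedged illustration of the precedence rule this hunk introduces: with autofit forced, any user-supplied values for the two flags are discarded before layer fitting runs. The Namespace fields and sample values below are stand-ins for the real parsed arguments:

    from argparse import Namespace

    # Hypothetical parsed args; field names follow the flags shown above.
    args = Namespace(autofit=True, moecpu=10, overridetensors="ffn.*=CPU")
    if args.autofit:
        # Same clearing as in kcpp_main_process: autofit decides these itself.
        args.overridetensors = ""
        args.moecpu = 0
    print(args.moecpu, repr(args.overridetensors))  # -> 0 ''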