autofit will clear moecpu and overridetensors

Concedo 2026-03-18 21:20:57 +08:00
parent f8aa711e5c
commit 15e86010d8
3 changed files with 35 additions and 35 deletions

README.md

@@ -66,7 +66,8 @@ Finally, obtain and load a GGUF model. See [here](#Obtaining-a-GGUF-model)
 ## Obtaining a GGUF model
 - KoboldCpp uses GGUF models. They are not included with KoboldCpp, but you can download GGUF files from other places such as [Bartowski's Huggingface](https://huggingface.co/bartowski). Search for "GGUF" on huggingface.co for plenty of compatible models in the `.gguf` format.
-- For beginners, we recommend the models [L3-8B-Stheno-v3.2](https://huggingface.co/bartowski/L3-8B-Stheno-v3.2-GGUF/resolve/main/L3-8B-Stheno-v3.2-Q4_K_S.gguf) (smaller and weaker) or [Tiefighter 13B](https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf) (old but very versatile model) or [Gemma-3-27B Abliterated](https://huggingface.co/mlabonne/gemma-3-27b-it-abliterated-GGUF/resolve/main/gemma-3-27b-it-abliterated.q4_k_m.gguf) (largest and most powerful)
+- For beginners, we recommend [Qwen3-VL-8B](https://huggingface.co/unsloth/Qwen3-VL-8B-Instruct-GGUF/resolve/main/Qwen3-VL-8B-Instruct-Q4_K_S.gguf) **(Most Recommended, best all-rounder model)**
+- For creative writing and roleplay, you can try [L3-8B-Stheno-v3.2](https://huggingface.co/bartowski/L3-8B-Stheno-v3.2-GGUF/resolve/main/L3-8B-Stheno-v3.2-Q4_K_S.gguf) (old, smaller and weaker) or [Tiefighter 13B](https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf) (old but very versatile model).
 - [Alternatively, you can download the tools to convert models to the GGUF format yourself here](https://kcpptools.concedo.workers.dev). Run `convert-hf-to-gguf.py` to convert them, then `quantize_gguf.exe` to quantize the result; a rough sketch of this pipeline follows below.
 - Other models for Whisper (speech recognition), Image Generation, Text to Speech or Image Recognition [can be found on the Wiki](https://github.com/LostRuins/koboldcpp/wiki#what-models-does-koboldcpp-support-what-architectures-are-supported)
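As a rough illustration of that two-step convert-then-quantize pipeline, driven from Python: the exact flags of `convert-hf-to-gguf.py` and the argument order of `quantize_gguf.exe` are assumptions here, so check each tool's own help output, and the file paths are placeholders.

    import subprocess

    # 1. Convert a HuggingFace model folder to a full-precision GGUF file
    #    (the --outfile flag is an assumption; see the script's --help).
    subprocess.run(["python3", "convert-hf-to-gguf.py", "path/to/hf-model",
                    "--outfile", "model-f16.gguf"], check=True)
    # 2. Quantize the converted file down to a smaller format such as Q4_K_S
    #    (input, output, quant-type argument order is assumed).
    subprocess.run(["quantize_gguf.exe", "model-f16.gguf",
                    "model-Q4_K_S.gguf", "Q4_K_S"], check=True)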
@@ -132,7 +133,7 @@ when you can't use the precompiled binary directly, we provide an automated buil
 - OpenBSD's default datasize limit may prevent compilation; `ulimit -d 8388608` should work
 - compile using `gmake LLAMA_VULKAN=1`
 - After all binaries are built, you can run the python script with the command `python3 koboldcpp.py --model [ggml_model.gguf]`
 ### Compiling on Android (Termux Installation)
 - [First, install and run Termux from F-Droid](https://f-droid.org/en/packages/com.termux/)
+## Termux Quick Setup Script (Easy Setup)

klite.embd

@@ -6664,6 +6664,19 @@ Current version indicated by LITEVER below.
 	}
 	return header;
 }
+function get_oai_header(apikey)
+{
+	let oaiheaders = {'Content-Type': 'application/json'};
+	if (apikey!="" && apikey!=dummy_api_key) {
+		oaiheaders['Authorization'] = 'Bearer ' + apikey;
+	}
+	if(uses_cors_proxy && localsettings.proxy_disable_stream)
+	{
+		oaiheaders['nostream'] = true;
+	}
+	return oaiheaders;
+}
 //synchronous and SSE kai and openAI requests
 function trigger_abort_controller() //triggers an abort of an in-progress gen http request
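The added helper centralizes the header construction that the hunks below previously repeated inline at each call site. A minimal Python rendering of the same logic, for illustration only, with the Lite globals (`dummy_api_key`, `uses_cors_proxy`, `localsettings.proxy_disable_stream`) folded into plain parameters:

    def get_oai_header(apikey, dummy_api_key="", proxy_nostream=False):
        # Every OpenAI-compatible request sends JSON.
        headers = {"Content-Type": "application/json"}
        # Attach Authorization only when a real (non-placeholder) key exists.
        if apikey and apikey != dummy_api_key:
            headers["Authorization"] = "Bearer " + apikey
        # proxy_nostream stands in for uses_cors_proxy and
        # localsettings.proxy_disable_stream: it marks the request so the
        # CORS proxy does not stream the response back.
        if proxy_nostream:
            headers["nostream"] = True
        return headers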
@@ -13333,10 +13346,8 @@ Current version indicated by LITEVER below.
 	let desired_oai_ep = document.getElementById("custom_oai_endpoint").value.trim();
 	desired_oai_ep = transform_oai_ep(desired_oai_ep);
-	let oaiheaders = {};
-	if(desired_oai_key!="" && desired_oai_key!=dummy_api_key){
-		oaiheaders["Authorization"] = "Bearer " + desired_oai_key;
-	};
+	let oaiheaders = get_oai_header(desired_oai_key);
 	if (desired_oai_ep.toLowerCase().includes("api.mistral.ai") || desired_oai_ep.toLowerCase().includes("api.x.ai")) {
 		if(desired_oai_key=="" || desired_oai_key==dummy_api_key)
 		{
@@ -13344,10 +13355,6 @@ Current version indicated by LITEVER below.
 			return;
 		}
 	}
-	if(uses_cors_proxy && localsettings.proxy_disable_stream)
-	{
-		oaiheaders['nostream'] = true;
-	}
 	autofetch_attempt_dict[desired_oai_ep] = true;
@@ -13438,11 +13445,7 @@ Current version indicated by LITEVER below.
 function fetch_openrouter_balance()
 {
 	let desired_oai_key = document.getElementById("custom_oai_key").value.trim();
-	let oaiheaders = {};
-	if(desired_oai_key!="" && desired_oai_key!=dummy_api_key){
-		oaiheaders["Authorization"] = "Bearer " + desired_oai_key;
-	};
+	let oaiheaders = get_oai_header(desired_oai_key);
 	let fetchurl = default_openrouter_base + openrouter_credits_endpoint;
@@ -20896,18 +20899,7 @@ Current version indicated by LITEVER below.
 	last_request_timestamp = get_current_timestamp();
 	last_response_obj = null;
 	last_response_streamlog = "";
-	let oaiheaders = {
-		'Content-Type': 'application/json'
-	};
-	if (custom_oai_key!="" && custom_oai_key!=dummy_api_key) {
-		oaiheaders['Authorization'] = 'Bearer ' + custom_oai_key;
-	}
-	if(uses_cors_proxy && localsettings.proxy_disable_stream)
-	{
-		oaiheaders['nostream'] = true;
-	}
+	let oaiheaders = get_oai_header(custom_oai_key);
 	if(targetep.toLowerCase().includes("api.novita.ai"))
 	{
@@ -28477,12 +28469,7 @@ Current version indicated by LITEVER below.
 //queries the decensoring prefix from a second OAI compatible endpoint and returns a string to add to our main request
 const FetchDecensoredPrefix = asyncRunner(function* (submit_payload, endpoint_url, modelused, num_tokens)
 {
-	let oaiheaders = {
-		'Content-Type': 'application/json'
-	};
-	if (custom_oai_key!="" && custom_oai_key!=dummy_api_key) {
-		oaiheaders['Authorization'] = 'Bearer ' + custom_oai_key;
-	}
+	let oaiheaders = get_oai_header(custom_oai_key);
 	let scaled_rep_pen = 0;
 	if(submit_payload.params.presence_penalty > 0)

koboldcpp.py

@@ -6851,9 +6851,17 @@ def show_gui():
         quick_gpu_layers_entry.grid_remove()
         autofit_padding_label.grid(row=6, column=0, padx=8, pady=1, stick="nw")
         autofit_padding_entry.grid(row=6, column=0, padx=160, pady=1, stick="nw")
+        moecpu_box.grid_remove()
+        tenos_box.grid_remove()
+        moecpu_box_lbl.grid_remove()
+        tenos_box_lbl.grid_remove()
     else:
         autofit_padding_label.grid_remove()
         autofit_padding_entry.grid_remove()
+        moecpu_box.grid()
+        tenos_box.grid()
+        moecpu_box_lbl.grid()
+        tenos_box_lbl.grid()
     changed_gpulayers_estimate()
     changed_gpu_choice_var()
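For context on this hunk: Tkinter's grid_remove() hides a widget but remembers its grid options, so a later bare grid() call restores it in the same cell; that is why the else branch can re-show the boxes without repeating row/column arguments. A standalone sketch of the pattern (the widget here is illustrative, not the GUI's real one):

    import tkinter as tk

    root = tk.Tk()
    moecpu_box_lbl = tk.Label(root, text="MoE CPU Layers:")
    moecpu_box_lbl.grid(row=0, column=0)   # grid options are recorded here
    moecpu_box_lbl.grid_remove()           # hidden, but options are kept
    moecpu_box_lbl.grid()                  # reappears at row=0, column=0
    root.mainloop()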
@@ -7020,9 +7028,9 @@ def show_gui():
     jinjatoolsbox = makecheckbox(context_tab, "Jinja for Tools", jinja_tools_var, row=45 ,padx=(140), tooltiptxt="Allows jinja even with tool calls. If unchecked, jinja will be disabled when tools are used.")
     jinja_var.trace_add("write", togglejinja)
     makelabelentry(context_tab, "MoE Experts:", moeexperts_var, row=55, padx=(120), singleline=True, tooltip="Override number of MoE experts.")
-    makelabelentry(context_tab, "MoE CPU Layers:", moecpu_var, row=55, padx=(320), singleline=True, tooltip="Force Mixture of Experts (MoE) weights of the first N layers to the CPU.\nSetting it higher than GPU layers has no effect.", labelpadx=(210))
+    moecpu_box,moecpu_box_lbl = makelabelentry(context_tab, "MoE CPU Layers:", moecpu_var, row=55, padx=(320), singleline=True, tooltip="Force Mixture of Experts (MoE) weights of the first N layers to the CPU.\nSetting it higher than GPU layers has no effect.", labelpadx=(210))
     makelabelentry(context_tab, "Override KV:", override_kv_var, row=57, padx=(120), singleline=True, width=150, tooltip="Override metadata value by key. Separate multiple values with commas. Format is name=type:value. Types: int, float, bool, str")
-    makelabelentry(context_tab, "Override Tensors:", override_tensors_var, row=59, padx=(120), singleline=True, width=150, tooltip="Override selected backend for specific tensors matching tensor_name_regex_pattern=buffer_type, same as in llama.cpp.")
+    tenos_box,tenos_box_lbl = makelabelentry(context_tab, "Override Tensors:", override_tensors_var, row=59, padx=(120), singleline=True, width=150, tooltip="Override selected backend for specific tensors matching tensor_name_regex_pattern=buffer_type, same as in llama.cpp.")
     # Model Tab
     model_tab = tabcontent["Loaded Files"]
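makelabelentry is koboldcpp's own GUI helper; the change above only captures its return value so the entry widget and its label can be hidden or shown as a pair. A hypothetical reduction of such a helper, purely to show why it returns the entry first and the label second (the real signature in koboldcpp.py takes more options):

    import tkinter as tk

    def makelabelentry(parent, text, var, row):
        # Build a label/entry pair and return both so callers can
        # grid()/grid_remove() them together, e.g.
        # box, lbl = makelabelentry(tab, "MoE CPU Layers:", some_var, 55)
        lbl = tk.Label(parent, text=text)
        lbl.grid(row=row, column=0, sticky="nw")
        box = tk.Entry(parent, textvariable=var)
        box.grid(row=row, column=1, sticky="nw")
        return box, lbl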
@@ -9289,6 +9297,10 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
     if MaxMemory[0] == 0: #try to get gpu vram for cuda if not picked yet
         fetch_gpu_properties(True,True)
         pass
+    if args.autofit:
+        print("Forced autofit is selected, moecpu and overridetensors will be set automatically.")
+        args.overridetensors = ""
+        args.moecpu = 0
     if args.gpulayers==-1:
         if (not args.usecpu) and ((args.usecuda is not None) or (args.usevulkan is not None) or sys.platform=="darwin"):
             if MaxMemory[0] > 0:
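A hedged illustration of the precedence rule this hunk introduces: with autofit forced, any user-supplied values for the two flags are discarded before layer fitting runs. The Namespace fields and sample values below are stand-ins for the real parsed arguments:

    from argparse import Namespace

    # Hypothetical parsed args; field names follow the flags shown above.
    args = Namespace(autofit=True, moecpu=10, overridetensors="ffn.*=CPU")
    if args.autofit:
        # Same clearing as in kcpp_main_process: autofit decides these itself.
        args.overridetensors = ""
        args.moecpu = 0
    print(args.moecpu, repr(args.overridetensors))  # -> 0 ''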