diff --git a/README.md b/README.md
index 6ea03f3f8..4e15aacb8 100644
--- a/README.md
+++ b/README.md
@@ -66,7 +66,8 @@ Finally, obtain and load a GGUF model. See [here](#Obtaining-a-GGUF-model)
 ## Obtaining a GGUF model
 - KoboldCpp uses GGUF models. They are not included with KoboldCpp, but you can download GGUF files from other places such as [Bartowski's Huggingface](https://huggingface.co/bartowski). Search for "GGUF" on huggingface.co for plenty of compatible models in the `.gguf` format.
-- For beginners, we recommend the models [L3-8B-Stheno-v3.2](https://huggingface.co/bartowski/L3-8B-Stheno-v3.2-GGUF/resolve/main/L3-8B-Stheno-v3.2-Q4_K_S.gguf) (smaller and weaker) or [Tiefighter 13B](https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf) (old but very versatile model) or [Gemma-3-27B Abliterated](https://huggingface.co/mlabonne/gemma-3-27b-it-abliterated-GGUF/resolve/main/gemma-3-27b-it-abliterated.q4_k_m.gguf) (largest and most powerful)
+- For beginners, we recommend [Qwen3-VL-8B](https://huggingface.co/unsloth/Qwen3-VL-8B-Instruct-GGUF/resolve/main/Qwen3-VL-8B-Instruct-Q4_K_S.gguf) **(most recommended, the best all-rounder model)**.
+- For creative writing and roleplay, you can try [L3-8B-Stheno-v3.2](https://huggingface.co/bartowski/L3-8B-Stheno-v3.2-GGUF/resolve/main/L3-8B-Stheno-v3.2-Q4_K_S.gguf) (old, smaller and weaker) or [Tiefighter 13B](https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf) (old but very versatile).
 - [Alternatively, you can download the tools to convert models to the GGUF format yourself here](https://kcpptools.concedo.workers.dev). Run `convert-hf-to-gguf.py` to convert them, then `quantize_gguf.exe` to quantize the result.
 - Other models for Whisper (speech recognition), Image Generation, Text to Speech or Image Recognition [can be found on the Wiki](https://github.com/LostRuins/koboldcpp/wiki#what-models-does-koboldcpp-support-what-architectures-are-supported)
@@ -132,7 +133,7 @@ when you can't use the precompiled binary directly, we provide an automated buil
 - OpenBSD's default datasize limit may prevent compilation; `ulimit -d 8388608` should work
 - compile using `gmake LLAMA_VULKAN=1`
 - After all binaries are built, you can run the python script with the command `python3 koboldcpp.py --model [ggml_model.gguf]`
-
+
 ### Compiling on Android (Termux Installation)
 - [First, install and run Termux from F-Droid](https://f-droid.org/en/packages/com.termux/)
 ## Termux Quick Setup Script (Easy Setup)
diff --git a/embd_res/klite.embd b/embd_res/klite.embd
index f1ff74dab..7459d4249 100644
--- a/embd_res/klite.embd
+++ b/embd_res/klite.embd
@@ -6664,6 +6664,19 @@ Current version indicated by LITEVER below.
        }
        return header;
    }
+   function get_oai_header(apikey)
+   {
+       let oaiheaders = {'Content-Type': 'application/json'};
+
+       if (apikey!="" && apikey!=dummy_api_key) {
+           oaiheaders['Authorization'] = 'Bearer ' + apikey;
+       }
+       if(uses_cors_proxy && localsettings.proxy_disable_stream)
+       {
+           oaiheaders['nostream'] = true;
+       }
+       return oaiheaders;
+   }
    //synchronous and SSE kai and openAI requests
    function trigger_abort_controller() //triggers an abort of an in-progress gen http request
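The new `get_oai_header` helper consolidates header-building logic that was previously copy-pasted at four call sites, which the remaining hunks below replace one by one. For reference, here is a minimal Python sketch of the same logic; the real implementation is the JavaScript above, and the parameters standing in for Lite's globals (`dummy_api_key`, `uses_cors_proxy`, `proxy_disable_stream`) are illustrative only:

```python
# Illustrative Python sketch of the get_oai_header logic added above.
# Not part of koboldcpp.py; parameter defaults are hypothetical stand-ins
# for globals that live in klite.embd.
def get_oai_header(apikey, dummy_api_key="", uses_cors_proxy=False,
                   proxy_disable_stream=False):
    headers = {"Content-Type": "application/json"}
    # Only attach auth when a real key (not the placeholder) was entered
    if apikey != "" and apikey != dummy_api_key:
        headers["Authorization"] = "Bearer " + apikey
    if uses_cors_proxy and proxy_disable_stream:
        # Custom header read by Lite's CORS proxy to turn off SSE streaming
        headers["nostream"] = "true"
    return headers
```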
@@ -13333,10 +13346,8 @@ Current version indicated by LITEVER below.
        let desired_oai_ep = document.getElementById("custom_oai_endpoint").value.trim();
        desired_oai_ep = transform_oai_ep(desired_oai_ep);
-       let oaiheaders = {};
-       if(desired_oai_key!="" && desired_oai_key!=dummy_api_key){
-           oaiheaders["Authorization"] = "Bearer " + desired_oai_key;
-       };
+       let oaiheaders = get_oai_header(desired_oai_key);
+
        if (desired_oai_ep.toLowerCase().includes("api.mistral.ai") || desired_oai_ep.toLowerCase().includes("api.x.ai"))
        {
            if(desired_oai_key=="" || desired_oai_key==dummy_api_key)
            {
@@ -13344,10 +13355,6 @@ Current version indicated by LITEVER below.
                return;
            }
        }
-       if(uses_cors_proxy && localsettings.proxy_disable_stream)
-       {
-           oaiheaders['nostream'] = true;
-       }

        autofetch_attempt_dict[desired_oai_ep] = true;
@@ -13438,11 +13445,7 @@ Current version indicated by LITEVER below.
    function fetch_openrouter_balance()
    {
        let desired_oai_key = document.getElementById("custom_oai_key").value.trim();
-
-       let oaiheaders = {};
-       if(desired_oai_key!="" && desired_oai_key!=dummy_api_key){
-           oaiheaders["Authorization"] = "Bearer " + desired_oai_key;
-       };
+       let oaiheaders = get_oai_header(desired_oai_key);

        let fetchurl = default_openrouter_base + openrouter_credits_endpoint;
@@ -20896,18 +20899,7 @@ Current version indicated by LITEVER below.
        last_request_timestamp = get_current_timestamp();
        last_response_obj = null;
        last_response_streamlog = "";
-       let oaiheaders = {
-           'Content-Type': 'application/json'
-       };
-
-       if (custom_oai_key!="" && custom_oai_key!=dummy_api_key) {
-           oaiheaders['Authorization'] = 'Bearer ' + custom_oai_key;
-       }
-
-       if(uses_cors_proxy && localsettings.proxy_disable_stream)
-       {
-           oaiheaders['nostream'] = true;
-       }
+       let oaiheaders = get_oai_header(custom_oai_key);

        if(targetep.toLowerCase().includes("api.novita.ai"))
        {
@@ -28477,12 +28469,7 @@ Current version indicated by LITEVER below.
    //queries the decensoring prefix from a second OAI compatible endpoint and returns a string to add to our main request
    const FetchDecensoredPrefix = asyncRunner(function* (submit_payload, endpoint_url, modelused, num_tokens) {
-       let oaiheaders = {
-           'Content-Type': 'application/json'
-       };
-       if (custom_oai_key!="" && custom_oai_key!=dummy_api_key) {
-           oaiheaders['Authorization'] = 'Bearer ' + custom_oai_key;
-       }
+       let oaiheaders = get_oai_header(custom_oai_key);

        let scaled_rep_pen = 0;
        if(submit_payload.params.presence_penalty > 0)
diff --git a/koboldcpp.py b/koboldcpp.py
index e7b59ca17..a2c92e365 100755
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -6851,9 +6851,17 @@ def show_gui():
            quick_gpu_layers_entry.grid_remove()
            autofit_padding_label.grid(row=6, column=0, padx=8, pady=1, stick="nw")
            autofit_padding_entry.grid(row=6, column=0, padx=160, pady=1, stick="nw")
+           moecpu_box.grid_remove()
+           tenos_box.grid_remove()
+           moecpu_box_lbl.grid_remove()
+           tenos_box_lbl.grid_remove()
        else:
            autofit_padding_label.grid_remove()
            autofit_padding_entry.grid_remove()
+           moecpu_box.grid()
+           tenos_box.grid()
+           moecpu_box_lbl.grid()
+           tenos_box_lbl.grid()
        changed_gpulayers_estimate()
        changed_gpu_choice_var()
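The GUI hunk above relies on Tkinter's behavior that `grid_remove()` hides a widget while remembering its grid options, so a later bare `grid()` call restores it in the same cell with no options re-specified. A self-contained sketch of that pattern (the widget here is illustrative, not one of koboldcpp.py's):

```python
import tkinter as tk

root = tk.Tk()
entry = tk.Entry(root)
entry.grid(row=0, column=0, padx=8, pady=1, sticky="nw")

def set_visible(show):
    if show:
        entry.grid()         # re-maps with the previously remembered options
    else:
        entry.grid_remove()  # unmaps but keeps row/column/padding settings

set_visible(False)  # hidden, e.g. while autofit manages these fields
set_visible(True)   # restored to the same cell
root.mainloop()
```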
@@ -7020,9 +7028,9 @@ def show_gui():
    jinjatoolsbox = makecheckbox(context_tab, "Jinja for Tools", jinja_tools_var, row=45, padx=(140), tooltiptxt="Allows jinja even with tool calls.\nIf unchecked, jinja will be disabled when tools are used.")
    jinja_var.trace_add("write", togglejinja)
    makelabelentry(context_tab, "MoE Experts:", moeexperts_var, row=55, padx=(120), singleline=True, tooltip="Override number of MoE experts.")
-   makelabelentry(context_tab, "MoE CPU Layers:", moecpu_var, row=55, padx=(320), singleline=True, tooltip="Force Mixture of Experts (MoE) weights of the first N layers to the CPU.\nSetting it higher than GPU layers has no effect.", labelpadx=(210))
+   moecpu_box,moecpu_box_lbl = makelabelentry(context_tab, "MoE CPU Layers:", moecpu_var, row=55, padx=(320), singleline=True, tooltip="Force Mixture of Experts (MoE) weights of the first N layers to the CPU.\nSetting it higher than GPU layers has no effect.", labelpadx=(210))
    makelabelentry(context_tab, "Override KV:", override_kv_var, row=57, padx=(120), singleline=True, width=150, tooltip="Override metadata value by key. Separate multiple values with commas. Format is name=type:value. Types: int, float, bool, str")
-   makelabelentry(context_tab, "Override Tensors:", override_tensors_var, row=59, padx=(120), singleline=True, width=150, tooltip="Override selected backend for specific tensors matching tensor_name_regex_pattern=buffer_type, same as in llama.cpp.")
+   tenos_box,tenos_box_lbl = makelabelentry(context_tab, "Override Tensors:", override_tensors_var, row=59, padx=(120), singleline=True, width=150, tooltip="Override selected backend for specific tensors matching tensor_name_regex_pattern=buffer_type, same as in llama.cpp.")

    # Model Tab
    model_tab = tabcontent["Loaded Files"]
@@ -9289,6 +9297,10 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
    if MaxMemory[0] == 0: #try to get gpu vram for cuda if not picked yet
        fetch_gpu_properties(True,True)
        pass
+   if args.autofit:
+       print("Forced autofit is selected, moecpu and overridetensors will be set automatically.")
+       args.overridetensors = ""
+       args.moecpu = 0
    if args.gpulayers==-1:
        if (not args.usecpu) and ((args.usecuda is not None) or (args.usevulkan is not None) or sys.platform=="darwin"):
            if MaxMemory[0] > 0:
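Taken together, the koboldcpp.py changes make autofit authoritative: manual `moecpu` and `overridetensors` values are cleared before the automatic GPU layer estimate runs, and the matching GUI fields are hidden so the two mechanisms cannot conflict. A condensed sketch of that ordering; the wrapper function and `estimate_layers` callback are illustrative, not koboldcpp.py's real internals:

```python
def resolve_gpu_settings(args, estimate_layers):
    # With autofit forced, drop manual placement flags first (mirroring
    # the hunk above), so the estimator owns all placement decisions.
    if args.autofit:
        print("Forced autofit is selected, moecpu and overridetensors will be set automatically.")
        args.overridetensors = ""
        args.moecpu = 0
    # Only afterwards is the automatic layer count computed.
    if args.gpulayers == -1:
        args.gpulayers = estimate_layers(args)
    return args
```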