mirror of https://github.com/LostRuins/koboldcpp.git
synced 2025-09-10 17:14:36 +00:00

added toggle for guidance

This commit is contained in:
parent 41142ad67a
commit f59b5eb561

4 changed files with 54 additions and 21 deletions
expose.h (1 change)

@@ -62,6 +62,7 @@ struct load_model_inputs
     const float rope_freq_base = 10000.0f;
     const int moe_experts = -1;
     const bool no_bos_token = false;
+    const bool load_guidance = false;
     const char * override_kv = nullptr;
     const char * override_tensors = nullptr;
     const bool flash_attention = false;
gpttype_adapter.cpp

@@ -98,6 +98,7 @@ static llama_v2_context * llama_ctx_v2 = nullptr;
 static llama_v3_context * llama_ctx_v3 = nullptr;
 static llama_context * llama_ctx_v4 = nullptr;
 static llama_context * draft_ctx = nullptr; //will remain null if speculative is unused
+static llama_context * guidance_ctx = nullptr; //for classifier free guidance, will be null if unused
 static clip_ctx * clp_ctx = nullptr; //for llava
 static clip_image_u8 * clp_img_data = nullptr; //most recent image

@@ -134,6 +135,7 @@ static std::string concat_output_reader_copy_poll = ""; //for streaming
 static std::string concat_output_reader_copy_res = ""; //for gen response
 static std::vector<logit_bias> logit_biases;
 static bool add_bos_token = true; // if set to false, mmproj handling breaks. dont disable unless you know what you're doing
+static bool load_guidance = false; //whether to enable cfg for negative prompts
 static int delayed_generated_tokens_limit = 0;
 std::deque<std::string> delayed_generated_tokens; //for use with antislop sampling

@@ -1898,6 +1900,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 kcpp_data->use_fastforward = inputs.use_fastforward;
 debugmode = inputs.debugmode;
 draft_ctx = nullptr;
+guidance_ctx = nullptr;

 auto clamped_max_context_length = inputs.max_context_length;

@@ -1923,6 +1926,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 kcpp_data->n_ctx = clamped_max_context_length;
 max_context_limit_at_load = clamped_max_context_length;
 add_bos_token = !inputs.no_bos_token;
+load_guidance = inputs.load_guidance;

 if(!add_bos_token)
 {

@@ -2303,6 +2307,10 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 llama_ctx_params.type_k = (inputs.quant_k>1?GGML_TYPE_Q4_0:(inputs.quant_k==1?GGML_TYPE_Q8_0:GGML_TYPE_F16));
 llama_ctx_params.type_v = (inputs.quant_v>1?GGML_TYPE_Q4_0:(inputs.quant_v==1?GGML_TYPE_Q8_0:GGML_TYPE_F16));
 llama_ctx_v4 = llama_init_from_model(llamamodel, llama_ctx_params);
+if(load_guidance)
+{
+    guidance_ctx = llama_init_from_model(llamamodel, llama_ctx_params);
+}

 if (llama_ctx_v4 == NULL)
 {

@@ -3450,6 +3458,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
 }
 }
 }
+if(guidance_ctx)
+{
+    llama_kv_self_clear(guidance_ctx);
+}

 bool blasmode = (embd_inp.size() >= 32 && kcpp_cpu_has_blas() && kcpp_data->n_batch>=32);
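The hunks above only allocate the second context (built from the same model and context params as the main one) and clear its KV cache before each generation; the logit mixing itself is not shown in this commit. As background, classifier-free guidance evaluates the model on both the normal prompt and the negative prompt, then blends the two logit vectors. A minimal Python sketch of that blend, illustrative only and not the sampler code in gpttype_adapter.cpp:

```python
def cfg_blend(cond_logits, neg_logits, scale):
    # Classifier-free guidance: push the conditional logits away from the
    # negative-prompt logits by `scale`; scale == 1.0 leaves them unchanged.
    return [n + scale * (c - n) for c, n in zip(cond_logits, neg_logits)]

# Toy 4-token vocabulary: cond comes from the main context, neg from guidance_ctx.
cond = [2.0, 0.5, -1.0, 0.0]
neg = [1.0, 1.5, -1.0, 0.0]
print(cfg_blend(cond, neg, scale=2.0))  # [3.0, -0.5, -1.0, 0.0]
```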
klite.embd (14 changes)

@@ -9615,7 +9615,7 @@ Current version indicated by LITEVER below.
 let is_local = is_local_url(desired_oai_ep);
 desired_oai_ep = (is_local?"http://":"https://") + desired_oai_ep;
 }
-if (document.getElementById("oaiaddversion").checked && !desired_oai_ep.toLowerCase().includes("pollinations.ai"))
+if (document.getElementById("oaiaddversion").checked && !desired_oai_ep.toLowerCase().includes("text.pollinations.ai"))
 {
 //fix incorrect paths
 if(desired_oai_ep!="" && desired_oai_ep.toLowerCase().endsWith("/chat/completions")) {

@@ -14990,6 +14990,14 @@ Current version indicated by LITEVER below.
 oai_payload.messages.push({ "role": "assistant", "content": mainoaibody, "prefix":true });
 oaiemulatecompletionscontent = mainoaibody;
 }
+
+if(targetep.toLowerCase().includes("text.pollinations.ai"))
+{
+    if(localsettings.opmode==1)
+    {
+        oai_payload.messages.unshift({ "role": "system", "content": "Please continue this story directly from where it stopped. Just respond with a direct partial continuation of the story immediately from the latest word." });
+    }
+}
 }
 else
 {

@@ -19168,7 +19176,7 @@ Current version indicated by LITEVER below.
 }else if(custom_oai_endpoint.toLowerCase().includes("api.x.ai"))
 {
 localsettings.prev_custom_endpoint_type = 9;
-}else if(custom_oai_endpoint.toLowerCase().includes("pollinations.ai"))
+}else if(custom_oai_endpoint.toLowerCase().includes("text.pollinations.ai"))
 {
 localsettings.prev_custom_endpoint_type = 10;
 }

@@ -22712,7 +22720,7 @@ Current version indicated by LITEVER below.
 </span>
 <span id="pollinationsdesc" class="hidden">
 Pollinations.ai API is free to use without any key required.<br><br>
-Note that KoboldAI Lite takes no responsibility for your usage or consequences of this feature.<br>Only Temperature, Top-P, Top-K and Repetition Penalty samplers are used.<br><br>
+Note that KoboldAI Lite takes no responsibility for your usage or consequences of this feature. This service is ad driven, ads may appear in the output.<br>Only Temperature, Top-P, Top-K and Repetition Penalty samplers are used.<br><br>
 <span class="color_green" style="font-weight: bold;">No Key Required.</span><br><br>
 </span>
koboldcpp.py (48 changes)

@@ -183,6 +183,7 @@ class load_model_inputs(ctypes.Structure):
 ("rope_freq_base", ctypes.c_float),
 ("moe_experts", ctypes.c_int),
 ("no_bos_token", ctypes.c_bool),
+("load_guidance", ctypes.c_bool),
 ("override_kv", ctypes.c_char_p),
 ("override_tensors", ctypes.c_char_p),
 ("flash_attention", ctypes.c_bool),
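This field mirrors the `const bool load_guidance` member added to `load_model_inputs` in expose.h. Because ctypes lays structure fields out by position rather than by name, the `_fields_` entry has to sit at the same relative spot as the C member. A reduced, hypothetical stand-in for the structure to show the pattern:

```python
import ctypes

# Reduced stand-in for load_model_inputs: field order must match the C struct
# declaration order in expose.h, since ctypes maps fields positionally.
class demo_inputs(ctypes.Structure):
    _fields_ = [("no_bos_token", ctypes.c_bool),
                ("load_guidance", ctypes.c_bool),   # new toggle, same position as in the C struct
                ("override_kv", ctypes.c_char_p)]

inputs = demo_inputs()
inputs.load_guidance = True
print(inputs.load_guidance)  # True
```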
@@ -1230,6 +1231,7 @@ def load_model(model_filename):

 inputs.moe_experts = args.moeexperts
 inputs.no_bos_token = args.nobostoken
+inputs.load_guidance = args.enableguidance
 inputs.override_kv = args.overridekv.encode("UTF-8") if args.overridekv else "".encode("UTF-8")
 inputs.override_tensors = args.overridetensors.encode("UTF-8") if args.overridetensors else "".encode("UTF-8")
 inputs = set_backend_props(inputs)

@@ -1238,21 +1240,23 @@ def load_model(model_filename):

 def generate(genparams, stream_flag=False):
 global maxctx, args, currentusergenkey, totalgens, pendingabortkey
+default_adapter = {} if chatcompl_adapter is None else chatcompl_adapter
+adapter_obj = genparams.get('adapter', default_adapter)

 prompt = genparams.get('prompt', "")
 memory = genparams.get('memory', "")
 images = genparams.get('images', [])
 max_context_length = tryparseint(genparams.get('max_context_length', maxctx),maxctx)
 max_length = tryparseint(genparams.get('max_length', args.defaultgenamt),args.defaultgenamt)
-temperature = tryparsefloat(genparams.get('temperature', 0.75),0.75)
+temperature = tryparsefloat(genparams.get('temperature', adapter_obj.get("temperature", 0.75)),0.75)
-top_k = tryparseint(genparams.get('top_k', 100),100)
+top_k = tryparseint(genparams.get('top_k', adapter_obj.get("top_k", 100)),100)
 top_a = tryparsefloat(genparams.get('top_a', 0.0),0.0)
-top_p = tryparsefloat(genparams.get('top_p', 0.92),0.92)
+top_p = tryparsefloat(genparams.get('top_p', adapter_obj.get("top_p", 0.92)),0.92)
-min_p = tryparsefloat(genparams.get('min_p', 0.0),0.0)
+min_p = tryparsefloat(genparams.get('min_p', adapter_obj.get("min_p", 0.0)),0.0)
 typical_p = tryparsefloat(genparams.get('typical', 1.0),1.0)
 tfs = tryparsefloat(genparams.get('tfs', 1.0),1.0)
 nsigma = tryparsefloat(genparams.get('nsigma', 0.0),0.0)
-rep_pen = tryparsefloat(genparams.get('rep_pen', 1.0),1.0)
+rep_pen = tryparsefloat(genparams.get('rep_pen', adapter_obj.get("rep_pen", 1.0)),1.0)
 rep_pen_range = tryparseint(genparams.get('rep_pen_range', 320),320)
 rep_pen_slope = tryparsefloat(genparams.get('rep_pen_slope', 1.0),1.0)
 presence_penalty = tryparsefloat(genparams.get('presence_penalty', 0.0),0.0)
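The changed sampler lines all follow one precedence: a value in the request wins, otherwise the chat-completions adapter's default is used, and only then the hard-coded fallback. A small illustrative sketch of that order, with made-up values rather than a real adapter file:

```python
# Precedence sketch: request value -> adapter default -> built-in default (values illustrative).
adapter_obj = {"temperature": 0.6, "top_p": 0.9}   # e.g. loaded from a chat completions adapter
genparams = {"top_p": 0.5}                         # e.g. from an incoming API request

temperature = genparams.get('temperature', adapter_obj.get("temperature", 0.75))  # 0.6, adapter default
top_p = genparams.get('top_p', adapter_obj.get("top_p", 0.92))                    # 0.5, from the request
top_k = genparams.get('top_k', adapter_obj.get("top_k", 100))                     # 100, built-in fallback
```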
@@ -1268,7 +1272,8 @@ def generate(genparams, stream_flag=False):
 xtc_probability = tryparsefloat(genparams.get('xtc_probability', 0),0)
 sampler_order = genparams.get('sampler_order', [6, 0, 1, 3, 4, 2, 5])
 seed = tryparseint(genparams.get('sampler_seed', -1),-1)
-stop_sequence = genparams.get('stop_sequence', [])
+stop_sequence = (genparams.get('stop_sequence', []) if genparams.get('stop_sequence', []) is not None else [])
+stop_sequence = stop_sequence[:stop_token_max]
 ban_eos_token = genparams.get('ban_eos_token', False)
 stream_sse = stream_flag
 grammar = genparams.get('grammar', '')

@@ -1306,6 +1311,11 @@ def generate(genparams, stream_flag=False):
 memory = memory.replace("{{[INPUT]}}", assistant_message_end + user_message_start)
 memory = memory.replace("{{[OUTPUT]}}", user_message_end + assistant_message_start)
 memory = memory.replace("{{[SYSTEM]}}", system_message_start)
+for i in range(len(stop_sequence)):
+    if stop_sequence[i] == "{{[INPUT]}}":
+        stop_sequence[i] = user_message_start
+    elif stop_sequence[i] == "{{[OUTPUT]}}":
+        stop_sequence[i] = assistant_message_start

 for tok in custom_token_bans.split(','):
 tok = tok.strip() # Remove leading/trailing whitespace
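The new loop lets clients pass the instruct placeholders as stop strings and have them resolved to the active template's tags. A quick worked example with invented tag values (not a real template in the repo):

```python
# Worked example of the placeholder substitution above; tag strings are illustrative.
user_message_start = "\n### Instruction:\n"
assistant_message_start = "\n### Response:\n"
stop_sequence = ["{{[INPUT]}}", "{{[OUTPUT]}}", "###"]

for i in range(len(stop_sequence)):
    if stop_sequence[i] == "{{[INPUT]}}":
        stop_sequence[i] = user_message_start
    elif stop_sequence[i] == "{{[OUTPUT]}}":
        stop_sequence[i] = assistant_message_start

print(stop_sequence)  # ['\n### Instruction:\n', '\n### Response:\n', '###']
```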
@@ -1402,9 +1412,6 @@ def generate(genparams, stream_flag=False):
 print("ERROR: sampler_order must be a list of integers: " + str(e))
 inputs.seed = seed

-if stop_sequence is None:
-    stop_sequence = []
-stop_sequence = stop_sequence[:stop_token_max]
 inputs.stop_sequence_len = len(stop_sequence)
 inputs.stop_sequence = (ctypes.c_char_p * inputs.stop_sequence_len)()

@@ -3819,7 +3826,7 @@ def show_gui():
 import customtkinter as ctk
 nextstate = 0 #0=exit, 1=launch
 original_windowwidth = 580
-original_windowheight = 560
+original_windowheight = 580
 windowwidth = original_windowwidth
 windowheight = original_windowheight
 ctk.set_appearance_mode("dark")

@@ -3966,6 +3973,7 @@ def show_gui():
 nobostoken_var = ctk.IntVar(value=0)
 override_kv_var = ctk.StringVar(value="")
 override_tensors_var = ctk.StringVar(value="")
+enableguidance_var = ctk.IntVar(value=0)

 model_var = ctk.StringVar()
 lora_var = ctk.StringVar()
@@ -4056,11 +4064,11 @@ def show_gui():
 quick_tab = tabcontent["Quick Launch"]

 # helper functions
-def makecheckbox(parent, text, variable=None, row=0, column=0, command=None, onvalue=1, offvalue=0,tooltiptxt=""):
+def makecheckbox(parent, text, variable=None, row=0, column=0, command=None, padx=8,tooltiptxt=""):
-    temp = ctk.CTkCheckBox(parent, text=text,variable=variable, onvalue=onvalue, offvalue=offvalue)
+    temp = ctk.CTkCheckBox(parent, text=text,variable=variable, onvalue=1, offvalue=0)
     if command is not None and variable is not None:
         variable.trace("w", command)
-    temp.grid(row=row,column=column, padx=8, pady=1, stick="nw")
+    temp.grid(row=row,column=column, padx=padx, pady=1, stick="nw")
     if tooltiptxt!="":
         temp.bind("<Enter>", lambda event: show_tooltip(event, tooltiptxt))
         temp.bind("<Leave>", hide_tooltip)

@@ -4577,16 +4585,17 @@ def show_gui():
 item.grid_remove()
 makecheckbox(tokens_tab, "Custom RoPE Config", variable=customrope_var, row=22, command=togglerope,tooltiptxt="Override the default RoPE configuration with custom RoPE scaling.")
 use_flashattn = makecheckbox(tokens_tab, "Use FlashAttention", flashattention, 28, command=toggleflashattn, tooltiptxt="Enable flash attention for GGUF models.")
-noqkvlabel = makelabel(tokens_tab,"QuantKV works best with flash attention enabled",33,0,"WARNING: NOT RECOMMENDED.\nOnly K cache can be quantized, and performance can suffer.\nIn some cases, it might even use more VRAM when doing a full offload.")
+noqkvlabel = makelabel(tokens_tab,"(Note: QuantKV works best with flash attention)",28,0,"Only K cache can be quantized, and performance can suffer.\nIn some cases, it might even use more VRAM when doing a full offload.",padx=160)
 noqkvlabel.configure(text_color="#ff5555")
-avoidfalabel = makelabel(tokens_tab,"Flash attention discouraged with Vulkan GPU offload!",35,0,"FlashAttention is discouraged when using Vulkan GPU offload.")
+avoidfalabel = makelabel(tokens_tab,"(Note: Flash attention may be slow on Vulkan)",28,0,"FlashAttention is discouraged when using Vulkan GPU offload.",padx=160)
 avoidfalabel.configure(text_color="#ff5555")
 qkvslider,qkvlabel,qkvtitle = makeslider(tokens_tab, "Quantize KV Cache:", quantkv_text, quantkv_var, 0, 2, 30, set=0,tooltip="Enable quantization of KV cache.\nRequires FlashAttention for full effect, otherwise only K cache is quantized.")
 quantkv_var.trace("w", toggleflashattn)
 makecheckbox(tokens_tab, "No BOS Token", nobostoken_var, 43, tooltiptxt="Prevents BOS token from being added at the start of any prompt. Usually NOT recommended for most models.")
-makelabelentry(tokens_tab, "MoE Experts:", moeexperts_var, row=45, padx=120, singleline=True, tooltip="Override number of MoE experts.")
-makelabelentry(tokens_tab, "Override KV:", override_kv_var, row=47, padx=120, singleline=True, width=150, tooltip="Advanced option to override model metadata by key, same as in llama.cpp. Mainly for debugging, not intended for general use. Types: int, float, bool, str")
-makelabelentry(tokens_tab, "Override Tensors:", override_tensors_var, row=49, padx=120, singleline=True, width=150, tooltip="Advanced option to override tensor backend selection, same as in llama.cpp.")
+makecheckbox(tokens_tab, "Enable Guidance", enableguidance_var, 43,padx=140, tooltiptxt="Enables the use of Classifier-Free-Guidance, which allows the use of negative prompts. Has performance and memory impact.")
+makelabelentry(tokens_tab, "MoE Experts:", moeexperts_var, row=55, padx=120, singleline=True, tooltip="Override number of MoE experts.")
+makelabelentry(tokens_tab, "Override KV:", override_kv_var, row=57, padx=120, singleline=True, width=150, tooltip="Advanced option to override model metadata by key, same as in llama.cpp. Mainly for debugging, not intended for general use. Types: int, float, bool, str")
+makelabelentry(tokens_tab, "Override Tensors:", override_tensors_var, row=59, padx=120, singleline=True, width=150, tooltip="Advanced option to override tensor backend selection, same as in llama.cpp.")

 # Model Tab
 model_tab = tabcontent["Loaded Files"]
@@ -4862,6 +4871,7 @@ def show_gui():
 args.moeexperts = int(moeexperts_var.get()) if moeexperts_var.get()!="" else -1
 args.defaultgenamt = int(defaultgenamt_var.get()) if defaultgenamt_var.get()!="" else 512
 args.nobostoken = (nobostoken_var.get()==1)
+args.enableguidance = (enableguidance_var.get()==1)
 args.overridekv = None if override_kv_var.get() == "" else override_kv_var.get()
 args.overridetensors = None if override_tensors_var.get() == "" else override_tensors_var.get()
 args.chatcompletionsadapter = None if chatcompletionsadapter_var.get() == "" else chatcompletionsadapter_var.get()

@@ -5057,6 +5067,7 @@ def show_gui():
 if "defaultgenamt" in dict and dict["defaultgenamt"]:
     defaultgenamt_var.set(dict["defaultgenamt"])
 nobostoken_var.set(dict["nobostoken"] if ("nobostoken" in dict) else 0)
+enableguidance_var.set(dict["enableguidance"] if ("enableguidance" in dict) else 0)
 if "overridekv" in dict and dict["overridekv"]:
     override_kv_var.set(dict["overridekv"])
 if "overridetensors" in dict and dict["overridetensors"]:
@@ -6801,6 +6812,7 @@ if __name__ == '__main__':
 advparser.add_argument("--moeexperts", metavar=('[num of experts]'), help="How many experts to use for MoE models (default=follow gguf)", type=int, default=-1)
 advparser.add_argument("--defaultgenamt", help="How many tokens to generate by default, if not specified. Must be smaller than context size. Usually, your frontend GUI will override this.", type=check_range(int,64,4096), default=512)
 advparser.add_argument("--nobostoken", help="Prevents BOS token from being added at the start of any prompt. Usually NOT recommended for most models.", action='store_true')
+advparser.add_argument("--enableguidance", help="Enables the use of Classifier-Free-Guidance, which allows the use of negative prompts. Has performance and memory impact.", action='store_true')
 advparser.add_argument("--maxrequestsize", metavar=('[size in MB]'), help="Specify a max request payload size. Any requests to the server larger than this size will be dropped. Do not change if unsure.", type=int, default=32)
 advparser.add_argument("--overridekv", metavar=('[name=type:value]'), help="Advanced option to override a metadata by key, same as in llama.cpp. Mainly for debugging, not intended for general use. Types: int, float, bool, str", default="")
 advparser.add_argument("--overridetensors", metavar=('[tensor name pattern=buffer type]'), help="Advanced option to override tensor backend selection, same as in llama.cpp.", default="")