Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-10)
Based on occam and henky's advice, disabled flash attention entirely on Vulkan.
Commit 5908f2ca19 (parent 7a7bdeab6d)
1 changed file with 16 additions and 11 deletions:

 koboldcpp.py | 27 ++++++++++++++++-----------
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -4134,8 +4134,10 @@ def show_gui():
             tensor_split_label.grid(row=8, column=0, padx = 8, pady=1, stick="nw")
             tensor_split_entry.grid(row=8, column=1, padx=8, pady=1, stick="nw")
             quick_use_flashattn.grid_remove()
+            use_flashattn.grid_remove()
         else:
             quick_use_flashattn.grid(row=22, column=1, padx=8, pady=1, stick="nw")
+            use_flashattn.grid(row=28, column=0, padx=8, pady=1, stick="nw")

         if index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Old CPU)" or index == "Use CLBlast (Older CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
             gpu_layers_label.grid(row=6, column=0, padx = 8, pady=1, stick="nw")
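The GUI half of the change leans on a Tkinter detail worth spelling out: grid_remove() hides a widget but remembers its grid options, so a later grid() call puts it back in place. Below is a minimal, self-contained sketch of this show/hide-per-backend pattern; the widget and variable names are illustrative, not KoboldCPP's real helpers.

import tkinter as tk

root = tk.Tk()
backend_var = tk.StringVar(value="Use CuBLAS")
flash_chk = tk.Checkbutton(root, text="Use FlashAttention")
flash_chk.grid(row=1, column=0, sticky="nw")

def on_backend_change(*_):
    # Hide the FlashAttention checkbox when a Vulkan backend is selected.
    if "Vulkan" in backend_var.get():
        flash_chk.grid_remove()
    else:
        flash_chk.grid()  # restores the remembered row/column placement

backend_var.trace("w", on_backend_change)
tk.OptionMenu(root, backend_var, "Use CuBLAS", "Use Vulkan").grid(row=0, column=0, sticky="nw")
on_backend_change()
root.mainloop()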
@@ -4256,11 +4258,6 @@ def show_gui():
     ctk.CTkButton(hardware_tab , text = "Run Benchmark", command = guibench ).grid(row=110,column=0, stick="se", padx= 0, pady=2)


-    runopts_var.trace('w', changerunmode)
-    changerunmode(1,1,1)
-    global runmode_untouched
-    runmode_untouched = True
-
     # Tokens Tab
     tokens_tab = tabcontent["Tokens"]
     # tokens checkboxes
@@ -4283,7 +4280,7 @@ def show_gui():
             else:
                 item.grid_remove()
     makecheckbox(tokens_tab, "Custom RoPE Config", variable=customrope_var, row=22, command=togglerope,tooltiptxt="Override the default RoPE configuration with custom RoPE scaling.")
-    makecheckbox(tokens_tab, "Use FlashAttention", flashattention, 28, command=toggleflashattn, tooltiptxt="Enable flash attention for GGUF models.")
+    use_flashattn = makecheckbox(tokens_tab, "Use FlashAttention", flashattention, 28, command=toggleflashattn, tooltiptxt="Enable flash attention for GGUF models.")
     noqkvlabel = makelabel(tokens_tab,"QuantKV works best with flash attention enabled",33,0,"WARNING: NOT RECOMMENDED.\nOnly K cache can be quantized, and performance can suffer.\nIn some cases, it might even use more VRAM when doing a full offload.")
     noqkvlabel.configure(text_color="#ff5555")
     qkvslider,qkvlabel,qkvtitle = makeslider(tokens_tab, "Quantize KV Cache:", quantkv_text, quantkv_var, 0, 2, 30, set=0,tooltip="Enable quantization of KV cache.\nRequires FlashAttention for full effect, otherwise only K cache is quantized.")
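The one-line change in this hunk is what makes the hunk-1 calls possible: makecheckbox() returns the widget it creates, and that handle must be kept for changerunmode() to show or hide the checkbox later. A reduced sketch of the idea follows; the real makecheckbox takes more parameters (command, tooltip), so treat this signature as an assumption.

import tkinter as tk

def makecheckbox(parent, text, variable, row):
    chk = tk.Checkbutton(parent, text=text, variable=variable)
    chk.grid(row=row, column=0, sticky="nw")
    return chk  # keeping the handle lets callers grid()/grid_remove() it later

root = tk.Tk()
flashattention = tk.IntVar(value=0)
use_flashattn = makecheckbox(root, "Use FlashAttention", flashattention, 28)
use_flashattn.grid_remove()  # possible only because the return value was stored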
@@ -4291,10 +4288,6 @@ def show_gui():
     makecheckbox(tokens_tab, "No BOS Token", nobostoken_var, 43, tooltiptxt="Prevents BOS token from being added at the start of any prompt. Usually NOT recommended for most models.")
     makelabelentry(tokens_tab, "MoE Experts:", moeexperts_var, row=45, padx=100, singleline=True, tooltip="Override number of MoE experts.")

-    togglerope(1,1,1)
-    toggleflashattn(1,1,1)
-    togglectxshift(1,1,1)
-
     # Model Tab
     model_tab = tabcontent["Loaded Files"]

@@ -4370,7 +4363,6 @@ def show_gui():
         horde_name_var.set(sanitize_string(os.path.splitext(basefile)[0]))

     makecheckbox(horde_tab, "Configure for Horde", usehorde_var, 19, command=togglehorde,tooltiptxt="Enable the embedded AI Horde worker.")
-    togglehorde(1,1,1)

     # Image Gen Tab

@@ -4471,6 +4463,16 @@ def show_gui():
     makecheckbox(extra_tab, "Use Classic FilePicker", nozenity_var, 20, tooltiptxt="Use the classic TKinter file picker instead.")
     nozenity_var.trace("w", togglezenity)

+    # refresh
+    runopts_var.trace('w', changerunmode)
+    changerunmode(1,1,1)
+    global runmode_untouched
+    runmode_untouched = True
+    togglerope(1,1,1)
+    toggleflashattn(1,1,1)
+    togglectxshift(1,1,1)
+    togglehorde(1,1,1)
+
     # launch
     def guilaunch():
         if model_var.get() == "" and sd_model_var.get() == "" and whisper_model_var.get() == "" and tts_model_var.get() == "" and embeddings_model_var.get() == "" and nomodel.get()!=1:
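This hunk explains the deletions in the three hunks above: the trace registration and the initial toggle calls used to run partway through show_gui(), and are now consolidated into one "# refresh" block after every widget exists. The ordering matters because a traced callback fires as soon as its variable changes; if changerunmode() touches the new use_flashattn checkbox before that widget is created, the callback fails. A hypothetical reduction of the hazard (not KoboldCPP code):

import tkinter as tk

root = tk.Tk()
mode = tk.StringVar(value="cpu")

def on_mode_change(*_):
    flash_chk.grid_remove()  # flash_chk does not exist yet at first fire

mode.trace("w", on_mode_change)  # wired too early
mode.set("vulkan")               # fires the callback now -> error, flash_chk undefined
flash_chk = tk.Checkbutton(root, text="Use FlashAttention")  # created too late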
@@ -6008,6 +6010,9 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):

     if not args.blasthreads or args.blasthreads <= 0:
         args.blasthreads = args.threads
+    if args.flashattention and (args.usevulkan is not None):
+        print("FlashAttention should not be used with Vulkan as it is not fully implemented. Disabling flash attention.")
+        args.flashattention = False

     modelname = os.path.abspath(args.model_param)
     print(args)
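The runtime guard added here is the enforcement half of the commit: even if a saved config or command line requests both, Vulkan wins and flash attention is switched off. Below it is factored into a standalone function for illustration (a hypothetical refactor, not code from the commit; it assumes args.usevulkan defaults to None when the flag is absent, which is what the `is not None` test implies).

from types import SimpleNamespace

def sanitize_flashattention(args):
    # Mirrors the commit's check: any Vulkan request disables flash attention.
    if args.flashattention and (args.usevulkan is not None):
        print("FlashAttention should not be used with Vulkan as it is not fully implemented. Disabling flash attention.")
        args.flashattention = False
    return args

args = SimpleNamespace(flashattention=True, usevulkan=[0])   # Vulkan, device 0
assert sanitize_flashattention(args).flashattention is False

args = SimpleNamespace(flashattention=True, usevulkan=None)  # no Vulkan requested
assert sanitize_flashattention(args).flashattention is True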