Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 01:24:36 +00:00)
remove fa restrictions for vulkan
commit c4a0b323f0
parent 0874cd231a
1 changed file with 2 additions and 19 deletions

 koboldcpp.py | 21 ++-------------------
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -4297,13 +4297,6 @@ def show_gui():
     num_backends_built.grid(row=1, column=1, padx=205, pady=0)
     num_backends_built.configure(text_color="#00ff00")
 
-    def vulkan_fa_lbl():
-        if flashattention.get()!=0 and (runopts_var.get() == "Use Vulkan" or runopts_var.get() == "Use Vulkan (Old CPU)") and (gpulayers_var.get()!="" and int(gpulayers_var.get())):
-            avoidfalabel.grid()
-        else:
-            avoidfalabel.grid_remove()
-
-
     def gui_changed_modelfile(*args):
         global importvars_in_progress
         if not importvars_in_progress:
@@ -4339,7 +4332,6 @@ def show_gui():
         else:
             layercounter_label.grid_remove()
             quick_layercounter_label.grid_remove()
-        vulkan_fa_lbl()
 
     def changed_gpu_choice_var(*args):
         global exitcounter
@@ -4394,7 +4386,6 @@ def show_gui():
             noqkvlabel.grid()
         else:
             noqkvlabel.grid_remove()
-        vulkan_fa_lbl()
         changed_gpulayers_estimate()
 
     def guibench():
@@ -4451,11 +4442,6 @@ def show_gui():
         if index == "Use Vulkan" or index == "Use Vulkan (Old CPU)":
             tensor_split_label.grid(row=8, column=0, padx = 8, pady=1, stick="nw")
             tensor_split_entry.grid(row=8, column=1, padx=8, pady=1, stick="nw")
-            quick_use_flashattn.grid_remove()
-            use_flashattn.grid(row=28, column=0, padx=8, pady=1, stick="nw")
-        else:
-            quick_use_flashattn.grid(row=22, column=1, padx=8, pady=1, stick="nw")
-            use_flashattn.grid(row=28, column=0, padx=8, pady=1, stick="nw")
 
         if index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Old CPU)" or index == "Use CLBlast (Older CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
             gpu_layers_label.grid(row=6, column=0, padx = 8, pady=1, stick="nw")
@@ -4474,7 +4460,6 @@ def show_gui():
             quick_gpu_layers_entry.grid_remove()
         changed_gpulayers_estimate()
         changed_gpu_choice_var()
-        vulkan_fa_lbl()
 
     # presets selector
     makelabel(quick_tab, "Presets:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.")
@@ -4511,7 +4496,7 @@ def show_gui():
     for idx, (name, properties) in enumerate(quick_boxes.items()):
         makecheckbox(quick_tab, name, properties[0], int(idx/2) + 20, idx % 2, tooltiptxt=properties[1])
 
-    quick_use_flashattn = makecheckbox(quick_tab, "Use FlashAttention", flashattention, 22, 1, tooltiptxt="Enable flash attention for GGUF models.")
+    makecheckbox(quick_tab, "Use FlashAttention", flashattention, 22, 1, tooltiptxt="Enable flash attention for GGUF models.")
 
     # context size
     makeslider(quick_tab, "Context Size:", contextsize_text, context_var, 0, len(contextsize_text)-1, 30, width=280, set=5,tooltip="What is the maximum context size to support. Model specific. You cannot exceed it.\nLarger contexts require more memory, and not all models support it.")
@@ -4599,11 +4584,9 @@ def show_gui():
         else:
             item.grid_remove()
     makecheckbox(tokens_tab, "Custom RoPE Config", variable=customrope_var, row=22, command=togglerope,tooltiptxt="Override the default RoPE configuration with custom RoPE scaling.")
-    use_flashattn = makecheckbox(tokens_tab, "Use FlashAttention", flashattention, 28, command=toggleflashattn, tooltiptxt="Enable flash attention for GGUF models.")
+    makecheckbox(tokens_tab, "Use FlashAttention", flashattention, 28, command=toggleflashattn, tooltiptxt="Enable flash attention for GGUF models.")
     noqkvlabel = makelabel(tokens_tab,"(Note: QuantKV works best with flash attention)",28,0,"Only K cache can be quantized, and performance can suffer.\nIn some cases, it might even use more VRAM when doing a full offload.",padx=160)
     noqkvlabel.configure(text_color="#ff5555")
-    avoidfalabel = makelabel(tokens_tab,"(Note: Flash attention may be slow on Vulkan)",28,0,"FlashAttention is discouraged when using Vulkan GPU offload.",padx=160)
-    avoidfalabel.configure(text_color="#ff5555")
     qkvslider,qkvlabel,qkvtitle = makeslider(tokens_tab, "Quantize KV Cache:", quantkv_text, quantkv_var, 0, 2, 30, set=0,tooltip="Enable quantization of KV cache.\nRequires FlashAttention for full effect, otherwise only K cache is quantized.")
     quantkv_var.trace("w", toggleflashattn)
    makecheckbox(tokens_tab, "No BOS Token", nobostoken_var, 43, tooltiptxt="Prevents BOS token from being added at the start of any prompt. Usually NOT recommended for most models.")
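
For reviewers who want the removed gating logic in isolation, here is a minimal, self-contained sketch of the vulkan_fa_lbl() rule this commit deletes. The real code runs inside show_gui() against customtkinter widgets; FakeVar and FakeLabel below are illustrative stand-ins (assumptions), not koboldcpp or tkinter APIs.

# Self-contained sketch of the pre-commit warning rule removed by this commit.
# FakeVar / FakeLabel stand in for the Tk variables and the customtkinter
# label used in koboldcpp.py (illustrative only).

class FakeVar:
    """Stand-in for a tkinter IntVar/StringVar."""
    def __init__(self, value):
        self._value = value

    def get(self):
        return self._value

class FakeLabel:
    """Stand-in for the avoidfalabel widget; records visibility."""
    def __init__(self):
        self.visible = False

    def grid(self):  # showing a gridded widget
        self.visible = True

    def grid_remove(self):  # hiding it while keeping its grid options
        self.visible = False

flashattention = FakeVar(1)           # FlashAttention checkbox ticked
runopts_var = FakeVar("Use Vulkan")   # Vulkan backend selected
gpulayers_var = FakeVar("20")         # 20 layers offloaded to GPU
avoidfalabel = FakeLabel()

def vulkan_fa_lbl():
    # Pre-commit rule: show the red warning only when FlashAttention is on,
    # a Vulkan backend is chosen, and at least one GPU layer is offloaded.
    using_vulkan = runopts_var.get() in ("Use Vulkan", "Use Vulkan (Old CPU)")
    offloading = gpulayers_var.get() != "" and int(gpulayers_var.get()) != 0
    if flashattention.get() != 0 and using_vulkan and offloading:
        avoidfalabel.grid()
    else:
        avoidfalabel.grid_remove()

vulkan_fa_lbl()
print(avoidfalabel.visible)  # True: the Vulkan FA warning would have been shown

With this commit, that label and the Vulkan-specific hiding of the quick_use_flashattn checkbox are gone, so the FlashAttention checkbox appears unconditionally for every backend.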