Mirror of https://github.com/LostRuins/koboldcpp.git
Synced 2025-09-11 09:34:37 +00:00
discourage but dont prevent vulkan FA (it's occasionally still useful)
This commit is contained in:
parent b42fa821d8
commit 6302709fbb
1 changed file with 15 additions and 8 deletions
koboldcpp.py (+15 −8)
@@ -3986,6 +3986,13 @@ def show_gui():
         num_backends_built.grid(row=1, column=1, padx=205, pady=0)
         num_backends_built.configure(text_color="#00ff00")
 
+    def vulkan_fa_lbl():
+        if flashattention.get()!=0 and (runopts_var.get() == "Use Vulkan" or runopts_var.get() == "Use Vulkan (Old CPU)") and (gpulayers_var.get()!="" and int(gpulayers_var.get())):
+            avoidfalabel.grid()
+        else:
+            avoidfalabel.grid_remove()
+
+
     def gui_changed_modelfile(*args):
         global importvars_in_progress
         if not importvars_in_progress:
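The new vulkan_fa_lbl() helper relies on how Tk geometry management works: grid_remove() hides a widget but remembers its grid options, so a later bare grid() call restores it in the same cell. Below is a minimal self-contained sketch of that show/hide pattern, using plain tkinter rather than the customtkinter widgets koboldcpp actually uses; all names in it are illustrative, not from koboldcpp.py.

import tkinter as tk

root = tk.Tk()
flash_on = tk.BooleanVar(value=False)

# Place the warning label once; grid_remove() will remember row/column.
warn = tk.Label(root, text="Flash attention discouraged!", fg="#ff5555")
warn.grid(row=1, column=0)
warn.grid_remove()  # start hidden

def refresh_warning(*_):
    # Same shape as vulkan_fa_lbl(): show the label only while the
    # discouraged combination is active, hide it otherwise.
    if flash_on.get():
        warn.grid()  # reappears at row=1, column=0; no options needed
    else:
        warn.grid_remove()

tk.Checkbutton(root, text="Use FlashAttention", variable=flash_on,
               command=refresh_warning).grid(row=0, column=0)
root.mainloop()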
@@ -4021,7 +4028,7 @@ def show_gui():
         else:
             layercounter_label.grid_remove()
             quick_layercounter_label.grid_remove()
-        pass
+        vulkan_fa_lbl()
 
     def changed_gpu_choice_var(*args):
         global exitcounter
@@ -4076,8 +4083,7 @@ def show_gui():
             noqkvlabel.grid()
         else:
             noqkvlabel.grid_remove()
-
-
+        vulkan_fa_lbl()
 
     def guibench():
         args.benchmark = "stdout"
@@ -4134,7 +4140,7 @@ def show_gui():
             tensor_split_label.grid(row=8, column=0, padx = 8, pady=1, stick="nw")
             tensor_split_entry.grid(row=8, column=1, padx=8, pady=1, stick="nw")
             quick_use_flashattn.grid_remove()
-            use_flashattn.grid_remove()
+            use_flashattn.grid(row=28, column=0, padx=8, pady=1, stick="nw")
         else:
             quick_use_flashattn.grid(row=22, column=1, padx=8, pady=1, stick="nw")
             use_flashattn.grid(row=28, column=0, padx=8, pady=1, stick="nw")
@@ -4156,7 +4162,7 @@ def show_gui():
            quick_gpu_layers_entry.grid_remove()
        changed_gpulayers_estimate()
        changed_gpu_choice_var()
-
+        vulkan_fa_lbl()
 
    # presets selector
    makelabel(quick_tab, "Presets:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.")
@@ -4283,6 +4289,8 @@ def show_gui():
     use_flashattn = makecheckbox(tokens_tab, "Use FlashAttention", flashattention, 28, command=toggleflashattn, tooltiptxt="Enable flash attention for GGUF models.")
     noqkvlabel = makelabel(tokens_tab,"QuantKV works best with flash attention enabled",33,0,"WARNING: NOT RECOMMENDED.\nOnly K cache can be quantized, and performance can suffer.\nIn some cases, it might even use more VRAM when doing a full offload.")
     noqkvlabel.configure(text_color="#ff5555")
+    avoidfalabel = makelabel(tokens_tab,"Flash attention discouraged with Vulkan GPU offload!",35,0,"FlashAttention is discouraged when using Vulkan GPU offload.")
+    avoidfalabel.configure(text_color="#ff5555")
     qkvslider,qkvlabel,qkvtitle = makeslider(tokens_tab, "Quantize KV Cache:", quantkv_text, quantkv_var, 0, 2, 30, set=0,tooltip="Enable quantization of KV cache.\nRequires FlashAttention for full effect, otherwise only K cache is quantized.")
     quantkv_var.trace("w", toggleflashattn)
     makecheckbox(tokens_tab, "No BOS Token", nobostoken_var, 43, tooltiptxt="Prevents BOS token from being added at the start of any prompt. Usually NOT recommended for most models.")
@@ -6010,9 +6018,8 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
 
     if not args.blasthreads or args.blasthreads <= 0:
         args.blasthreads = args.threads
-    if args.flashattention and (args.usevulkan is not None):
-        print("FlashAttention should not be used with Vulkan as it is not fully implemented. Disabling flash attention.")
-        args.flashattention = False
+    if args.flashattention and (args.usevulkan is not None) and args.gpulayers!=0:
+        print("\nWARNING: FlashAttention is strongly discouraged when using Vulkan GPU offload as it is extremely slow!\n")
 
     modelname = os.path.abspath(args.model_param)
     print(args)
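Net effect of this last hunk: the launcher used to silently override the user's flag by setting args.flashattention = False; now it only prints a warning, and only when layers are actually offloaded (args.gpulayers != 0). A minimal sketch of the new gate, extracted from the diff; the helper name check_vulkan_fa is hypothetical, the args fields mirror the real flags.

def check_vulkan_fa(args):
    # New behavior: warn but respect the user's choice.
    # Old behavior additionally set args.flashattention = False here.
    if args.flashattention and (args.usevulkan is not None) and args.gpulayers != 0:
        print("\nWARNING: FlashAttention is strongly discouraged when using Vulkan GPU offload as it is extremely slow!\n")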