remove fa restrictions for vulkan

Concedo 2025-05-09 17:34:14 +08:00
parent 0874cd231a
commit c4a0b323f0

@@ -4297,13 +4297,6 @@ def show_gui():
     num_backends_built.grid(row=1, column=1, padx=205, pady=0)
     num_backends_built.configure(text_color="#00ff00")
-    def vulkan_fa_lbl():
-        if flashattention.get()!=0 and (runopts_var.get() == "Use Vulkan" or runopts_var.get() == "Use Vulkan (Old CPU)") and (gpulayers_var.get()!="" and int(gpulayers_var.get())):
-            avoidfalabel.grid()
-        else:
-            avoidfalabel.grid_remove()
     def gui_changed_modelfile(*args):
         global importvars_in_progress
         if not importvars_in_progress:
@@ -4339,7 +4332,6 @@ def show_gui():
         else:
             layercounter_label.grid_remove()
             quick_layercounter_label.grid_remove()
-        vulkan_fa_lbl()
 
     def changed_gpu_choice_var(*args):
         global exitcounter
@@ -4394,7 +4386,6 @@ def show_gui():
             noqkvlabel.grid()
         else:
             noqkvlabel.grid_remove()
-        vulkan_fa_lbl()
         changed_gpulayers_estimate()
 
     def guibench():
@@ -4451,11 +4442,6 @@ def show_gui():
         if index == "Use Vulkan" or index == "Use Vulkan (Old CPU)":
             tensor_split_label.grid(row=8, column=0, padx = 8, pady=1, stick="nw")
             tensor_split_entry.grid(row=8, column=1, padx=8, pady=1, stick="nw")
-            quick_use_flashattn.grid_remove()
-            use_flashattn.grid(row=28, column=0, padx=8, pady=1, stick="nw")
-        else:
-            quick_use_flashattn.grid(row=22, column=1, padx=8, pady=1, stick="nw")
-            use_flashattn.grid(row=28, column=0, padx=8, pady=1, stick="nw")
 
         if index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Old CPU)" or index == "Use CLBlast (Older CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
             gpu_layers_label.grid(row=6, column=0, padx = 8, pady=1, stick="nw")
@@ -4474,7 +4460,6 @@ def show_gui():
             quick_gpu_layers_entry.grid_remove()
         changed_gpulayers_estimate()
         changed_gpu_choice_var()
-        vulkan_fa_lbl()
 
     # presets selector
     makelabel(quick_tab, "Presets:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.")
@@ -4511,7 +4496,7 @@ def show_gui():
     for idx, (name, properties) in enumerate(quick_boxes.items()):
         makecheckbox(quick_tab, name, properties[0], int(idx/2) + 20, idx % 2, tooltiptxt=properties[1])
-    quick_use_flashattn = makecheckbox(quick_tab, "Use FlashAttention", flashattention, 22, 1, tooltiptxt="Enable flash attention for GGUF models.")
+    makecheckbox(quick_tab, "Use FlashAttention", flashattention, 22, 1, tooltiptxt="Enable flash attention for GGUF models.")
 
     # context size
     makeslider(quick_tab, "Context Size:", contextsize_text, context_var, 0, len(contextsize_text)-1, 30, width=280, set=5,tooltip="What is the maximum context size to support. Model specific. You cannot exceed it.\nLarger contexts require more memory, and not all models support it.")
@@ -4599,11 +4584,9 @@ def show_gui():
             else:
                 item.grid_remove()
     makecheckbox(tokens_tab, "Custom RoPE Config", variable=customrope_var, row=22, command=togglerope,tooltiptxt="Override the default RoPE configuration with custom RoPE scaling.")
-    use_flashattn = makecheckbox(tokens_tab, "Use FlashAttention", flashattention, 28, command=toggleflashattn, tooltiptxt="Enable flash attention for GGUF models.")
+    makecheckbox(tokens_tab, "Use FlashAttention", flashattention, 28, command=toggleflashattn, tooltiptxt="Enable flash attention for GGUF models.")
     noqkvlabel = makelabel(tokens_tab,"(Note: QuantKV works best with flash attention)",28,0,"Only K cache can be quantized, and performance can suffer.\nIn some cases, it might even use more VRAM when doing a full offload.",padx=160)
     noqkvlabel.configure(text_color="#ff5555")
-    avoidfalabel = makelabel(tokens_tab,"(Note: Flash attention may be slow on Vulkan)",28,0,"FlashAttention is discouraged when using Vulkan GPU offload.",padx=160)
-    avoidfalabel.configure(text_color="#ff5555")
     qkvslider,qkvlabel,qkvtitle = makeslider(tokens_tab, "Quantize KV Cache:", quantkv_text, quantkv_var, 0, 2, 30, set=0,tooltip="Enable quantization of KV cache.\nRequires FlashAttention for full effect, otherwise only K cache is quantized.")
     quantkv_var.trace("w", toggleflashattn)
     makecheckbox(tokens_tab, "No BOS Token", nobostoken_var, 43, tooltiptxt="Prevents BOS token from being added at the start of any prompt. Usually NOT recommended for most models.")
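
Note: the deleted vulkan_fa_lbl() helper relied on Tkinter's grid()/grid_remove() pair. grid_remove() hides a widget while remembering its grid options, so a later bare grid() call restores it in the same cell. Below is a minimal, self-contained sketch of that show/hide pattern; the names and layout are hypothetical, and plain tkinter stands in for KoboldCpp's customtkinter wrappers.

import tkinter as tk

# Hypothetical stand-ins for the GUI's backend selector and FA checkbox state.
root = tk.Tk()
backend_var = tk.StringVar(value="Use Vulkan")
flashattention = tk.IntVar(value=1)

warning = tk.Label(root, text="(Note: Flash attention may be slow on Vulkan)", fg="#ff5555")
warning.grid(row=1, column=0, padx=8, pady=1)  # grid once so grid_remove() can hide/restore it

def refresh_warning(*_):
    # grid_remove() hides the widget but keeps its grid options,
    # so the bare grid() call below restores it in the same cell.
    if flashattention.get() != 0 and backend_var.get().startswith("Use Vulkan"):
        warning.grid()
    else:
        warning.grid_remove()

# Re-check whenever either setting changes, mirroring the trace/command
# hooks the real GUI wires up for its widgets.
backend_var.trace_add("write", refresh_warning)
flashattention.trace_add("write", refresh_warning)
refresh_warning()
root.mainloop()

With the helper, its three call sites, and the avoidfalabel widget removed, the FlashAttention checkboxes are created unconditionally (their quick_use_flashattn/use_flashattn handles are no longer needed) and no Vulkan-specific warning is shown.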