remove fa restrictions for vulkan

Concedo 2025-05-09 17:34:14 +08:00
parent 0874cd231a
commit c4a0b323f0

@@ -4297,13 +4297,6 @@ def show_gui():
     num_backends_built.grid(row=1, column=1, padx=205, pady=0)
     num_backends_built.configure(text_color="#00ff00")
-    def vulkan_fa_lbl():
-        if flashattention.get()!=0 and (runopts_var.get() == "Use Vulkan" or runopts_var.get() == "Use Vulkan (Old CPU)") and (gpulayers_var.get()!="" and int(gpulayers_var.get())):
-            avoidfalabel.grid()
-        else:
-            avoidfalabel.grid_remove()
     def gui_changed_modelfile(*args):
         global importvars_in_progress
         if not importvars_in_progress:
@@ -4339,7 +4332,6 @@ def show_gui():
         else:
             layercounter_label.grid_remove()
             quick_layercounter_label.grid_remove()
-        vulkan_fa_lbl()
 
     def changed_gpu_choice_var(*args):
         global exitcounter
@@ -4394,7 +4386,6 @@ def show_gui():
             noqkvlabel.grid()
         else:
             noqkvlabel.grid_remove()
-        vulkan_fa_lbl()
         changed_gpulayers_estimate()
 
     def guibench():
@@ -4451,11 +4442,6 @@ def show_gui():
         if index == "Use Vulkan" or index == "Use Vulkan (Old CPU)":
             tensor_split_label.grid(row=8, column=0, padx = 8, pady=1, stick="nw")
             tensor_split_entry.grid(row=8, column=1, padx=8, pady=1, stick="nw")
-            quick_use_flashattn.grid_remove()
-            use_flashattn.grid(row=28, column=0, padx=8, pady=1, stick="nw")
-        else:
-            quick_use_flashattn.grid(row=22, column=1, padx=8, pady=1, stick="nw")
-            use_flashattn.grid(row=28, column=0, padx=8, pady=1, stick="nw")
 
         if index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Old CPU)" or index == "Use CLBlast (Older CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
             gpu_layers_label.grid(row=6, column=0, padx = 8, pady=1, stick="nw")
@@ -4474,7 +4460,6 @@ def show_gui():
             quick_gpu_layers_entry.grid_remove()
         changed_gpulayers_estimate()
         changed_gpu_choice_var()
-        vulkan_fa_lbl()
 
     # presets selector
     makelabel(quick_tab, "Presets:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.")
@@ -4511,7 +4496,7 @@ def show_gui():
     for idx, (name, properties) in enumerate(quick_boxes.items()):
         makecheckbox(quick_tab, name, properties[0], int(idx/2) + 20, idx % 2, tooltiptxt=properties[1])
-    quick_use_flashattn = makecheckbox(quick_tab, "Use FlashAttention", flashattention, 22, 1, tooltiptxt="Enable flash attention for GGUF models.")
+    makecheckbox(quick_tab, "Use FlashAttention", flashattention, 22, 1, tooltiptxt="Enable flash attention for GGUF models.")
 
     # context size
     makeslider(quick_tab, "Context Size:", contextsize_text, context_var, 0, len(contextsize_text)-1, 30, width=280, set=5,tooltip="What is the maximum context size to support. Model specific. You cannot exceed it.\nLarger contexts require more memory, and not all models support it.")
@@ -4599,11 +4584,9 @@ def show_gui():
             else:
                 item.grid_remove()
     makecheckbox(tokens_tab, "Custom RoPE Config", variable=customrope_var, row=22, command=togglerope,tooltiptxt="Override the default RoPE configuration with custom RoPE scaling.")
-    use_flashattn = makecheckbox(tokens_tab, "Use FlashAttention", flashattention, 28, command=toggleflashattn, tooltiptxt="Enable flash attention for GGUF models.")
+    makecheckbox(tokens_tab, "Use FlashAttention", flashattention, 28, command=toggleflashattn, tooltiptxt="Enable flash attention for GGUF models.")
     noqkvlabel = makelabel(tokens_tab,"(Note: QuantKV works best with flash attention)",28,0,"Only K cache can be quantized, and performance can suffer.\nIn some cases, it might even use more VRAM when doing a full offload.",padx=160)
     noqkvlabel.configure(text_color="#ff5555")
-    avoidfalabel = makelabel(tokens_tab,"(Note: Flash attention may be slow on Vulkan)",28,0,"FlashAttention is discouraged when using Vulkan GPU offload.",padx=160)
-    avoidfalabel.configure(text_color="#ff5555")
     qkvslider,qkvlabel,qkvtitle = makeslider(tokens_tab, "Quantize KV Cache:", quantkv_text, quantkv_var, 0, 2, 30, set=0,tooltip="Enable quantization of KV cache.\nRequires FlashAttention for full effect, otherwise only K cache is quantized.")
     quantkv_var.trace("w", toggleflashattn)
     makecheckbox(tokens_tab, "No BOS Token", nobostoken_var, 43, tooltiptxt="Prevents BOS token from being added at the start of any prompt. Usually NOT recommended for most models.")
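
Note: the deleted vulkan_fa_lbl() helper relied on Tkinter's grid()/grid_remove() pair. grid_remove() hides a widget while remembering its grid options, so a later bare grid() call restores it in the same cell. Below is a minimal, self-contained sketch of that show/hide pattern; the names and layout are hypothetical, and plain tkinter stands in for KoboldCpp's customtkinter wrappers.

import tkinter as tk

# Hypothetical stand-ins for the GUI's backend selector and FA checkbox state.
root = tk.Tk()
backend_var = tk.StringVar(value="Use Vulkan")
flashattention = tk.IntVar(value=1)

warning = tk.Label(root, text="(Note: Flash attention may be slow on Vulkan)", fg="#ff5555")
warning.grid(row=1, column=0, padx=8, pady=1)  # grid once so grid_remove() can hide/restore it

def refresh_warning(*_):
    # grid_remove() hides the widget but keeps its grid options,
    # so the bare grid() call below restores it in the same cell.
    if flashattention.get() != 0 and backend_var.get().startswith("Use Vulkan"):
        warning.grid()
    else:
        warning.grid_remove()

# Re-check whenever either setting changes, mirroring the trace/command
# hooks the real GUI wires up for its widgets.
backend_var.trace_add("write", refresh_warning)
flashattention.trace_add("write", refresh_warning)
refresh_warning()
root.mainloop()

With the helper, its three call sites, and the avoidfalabel widget removed, the FlashAttention checkboxes are created unconditionally (their quick_use_flashattn/use_flashattn handles are no longer needed) and no Vulkan-specific warning is shown.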