support bf16 quantkv cache type

2026-05-19 08:00:25 +00:00 · 2026-03-28 00:01:17 +08:00 · 2026-03-28 00:01:17 +08:00 · 0c2b679ea3
commit 0c2b679ea3
parent 326542f480
2 changed files with 8 additions and 8 deletions
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@ -2673,8 +2673,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in

        llama_ctx_params.flash_attn_type = (kcpp_data->flash_attn?LLAMA_FLASH_ATTN_TYPE_ENABLED:LLAMA_FLASH_ATTN_TYPE_DISABLED);
        llama_ctx_params.swa_full = kcpp_data->swa_full;
-        llama_ctx_params.type_k = (inputs.quant_k>1?GGML_TYPE_Q4_0:(inputs.quant_k==1?GGML_TYPE_Q8_0:GGML_TYPE_F16));
-        llama_ctx_params.type_v = (inputs.quant_v>1?GGML_TYPE_Q4_0:(inputs.quant_v==1?GGML_TYPE_Q8_0:GGML_TYPE_F16));
+        llama_ctx_params.type_k = (inputs.quant_k==2?GGML_TYPE_Q4_0:(inputs.quant_k==1?GGML_TYPE_Q8_0:(inputs.quant_k==3?GGML_TYPE_BF16:GGML_TYPE_F16)));
+        llama_ctx_params.type_v = (inputs.quant_v==2?GGML_TYPE_Q4_0:(inputs.quant_v==1?GGML_TYPE_Q8_0:(inputs.quant_v==3?GGML_TYPE_BF16:GGML_TYPE_F16)));

        llama_ctx_v4 = llama_init_from_model(llamamodel, llama_ctx_params);
        if(load_guidance)
--- a/koboldcpp.py
+++ b/koboldcpp.py
@ -1764,7 +1764,7 @@ def load_model(model_filename):
    if args.quantkv>0:
        if args.noflashattention:
            inputs.quant_k = args.quantkv
-            inputs.quant_v = 0
+            inputs.quant_v = 0 if args.quantkv!=3 else args.quantkv
            print("\nWarning: Quantized KV was used without flash attention! This is NOT RECOMMENDED!\nOnly K cache can be quantized, and performance can suffer.\nIn some cases, it might even use more VRAM when doing a full offload.\nYou are strongly encouraged to use flash attention if you want to use quantkv.")
        else:
            inputs.quant_k = inputs.quant_v = args.quantkv
@ -6316,7 +6316,7 @@ def show_gui():
    batchsize_values = ["-1","16","32","64","128","256","512","1024","2048","4096"]
    batchsize_text = ["Don't Batch","16","32","64","128","256","512","1024","2048","4096"]
    contextsize_text = ["256", "512", "1024", "2048", "3072", "4096", "6144", "8192", "10240", "12288", "14336", "16384", "20480", "24576", "28672", "32768", "40960", "49152", "57344", "65536", "81920", "98304", "114688", "131072"]
-    quantkv_text = ["F16 (Off)","8-Bit","4-Bit"]
+    quantkv_text = ["F16 (Off)","8-Bit","4-Bit","BF16"]

    if not any(runopts):
        exitcounter = 999
@ -6856,7 +6856,7 @@ def show_gui():
            smartcontextbox.grid_remove()
        qkvslider.grid()
        qkvlabel.grid()
-        if flashattention_var.get()==0 and quantkv_var.get()>0:
+        if flashattention_var.get()==0 and (quantkv_var.get()>0 and quantkv_var.get()!=3):
            noqkvlabel.grid()
        else:
            noqkvlabel.grid_remove()
@ -6865,7 +6865,7 @@ def show_gui():
    def toggleflashattn(a,b,c):
        qkvslider.grid()
        qkvlabel.grid()
-        if flashattention_var.get()==0 and quantkv_var.get()>0:
+        if flashattention_var.get()==0 and (quantkv_var.get()>0 and quantkv_var.get()!=3):
            noqkvlabel.grid()
        else:
            noqkvlabel.grid_remove()
@ -7103,7 +7103,7 @@ def show_gui():
    makecheckbox(context_tab, "Custom RoPE Config", variable=customrope_var, row=22, command=togglerope,tooltiptxt="Override the default RoPE configuration with custom RoPE scaling.")
    noqkvlabel = makelabel(context_tab,"(Note: QuantKV works best with flash attention)",30,0,"Only K cache can be quantized, and performance can suffer.\nIn some cases, it might even use more VRAM when doing a full offload.",padx=160)
    noqkvlabel.configure(text_color="#ff5555")
-    qkvslider,qkvlabel,qkvtitle = makeslider(context_tab, "Quantize KV Cache:", quantkv_text, quantkv_var, 0, 2, 30, set=0,tooltip="Enable quantization of KV cache.\nRequires Flash Attention for full effect, otherwise only K cache is quantized.")
+    qkvslider,qkvlabel,qkvtitle = makeslider(context_tab, "Quantize KV Cache:", quantkv_text, quantkv_var, 0, 3, 30, set=0,tooltip="Enable quantization of KV cache.\nRequires Flash Attention for full effect, otherwise only K cache is quantized.")
    quantkv_var.trace_add("write", toggleflashattn)
    makecheckbox(context_tab, "No BOS Token", nobostoken_var, 43, tooltiptxt="Prevents BOS token from being added at the start of any prompt. Usually NOT recommended for most models.")
    makecheckbox(context_tab, "Enable Guidance", enableguidance_var, 43,padx=(140), tooltiptxt="Enables the use of Classifier-Free-Guidance, which allows the use of negative prompts. Has performance and memory impact.")
@ -10137,7 +10137,7 @@ if __name__ == '__main__':
    advparser.add_argument("--jinja_kwargs","--jinja-kwargs","--jinjakwargs","--chat-template-kwargs", metavar=('{"parameter":"value",...}'), help="Set additiona fields for Jinja JSON template parser, must be a valid JSON object.", default="")
    advparser.add_argument("--noflashattention","--no-flash-attn","-nofa", help="Disables flash attention.", action='store_true')
    advparser.add_argument("--lowvram","-nkvo","--no-kv-offload", help="If supported by the backend, do not offload KV to GPU (lowvram mode). Not recommended, will be slow.", action='store_true')
-    advparser.add_argument("--quantkv", help="Sets the KV cache data type quantization, 0=f16, 1=q8, 2=q4. Requires Flash Attention for full effect, otherwise only K cache is quantized.",metavar=('[quantization level 0/1/2]'), type=int, choices=[0,1,2], default=0)
+    advparser.add_argument("--quantkv", help="Sets the KV cache data type quantization, 0=f16, 1=q8, 2=q4, 3=bf16. Requires Flash Attention for full effect, otherwise only K cache is quantized.",metavar=('[quantization level 0/1/2/3]'), type=int, choices=[0,1,2,3], default=0)
    advparser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently. Outdated. Not recommended.", action='store_true')
    advparser.add_argument("--unpack", help="Extracts the file contents of the KoboldCpp binary into a target directory.", metavar=('destination'), type=str, default="")
    advparser.add_argument("--exportconfig", help="Exports the current selected arguments as a .kcpps settings file", metavar=('[filename]'), type=str, default="")