support q5_1 kv

Concedo 2026-04-17 17:06:15 +08:00
parent e074939c17
commit 9a38091207
6 changed files with 52 additions and 25 deletions

View file

@@ -107,6 +107,7 @@ if (LLAMA_CUBLAS)
list(APPEND GGML_SOURCES_CUDA
ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu
ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu
ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu
ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu
ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu
)
@@ -195,6 +196,7 @@ if (LLAMA_HIPBLAS)
list(APPEND GGML_SOURCES_ROCM
ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu
ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu
ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu
ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu
ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu
)

View file

@@ -206,6 +206,7 @@ OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/templat
OBJS_CUDA_TEMP_INST += \
ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.o \
ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.o \
ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.o \
ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.o \
ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.o

View file

@@ -288,6 +288,7 @@ static void ggml_cuda_flash_attn_ext_vec(ggml_backend_cuda_context & ctx, ggml_t
#else
FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_F16)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q5_1)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_BF16, GGML_TYPE_BF16)
#endif // GGML_CUDA_FA_ALL_QUANTS

View file

@@ -2528,8 +2528,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
//set some ctx params early so autofit can use them.
llama_ctx_params.flash_attn_type = (kcpp_data->flash_attn?LLAMA_FLASH_ATTN_TYPE_ENABLED:LLAMA_FLASH_ATTN_TYPE_DISABLED);
llama_ctx_params.swa_full = kcpp_data->swa_full;
llama_ctx_params.type_k = (inputs.quant_k==2?GGML_TYPE_Q4_0:(inputs.quant_k==1?GGML_TYPE_Q8_0:(inputs.quant_k==3?GGML_TYPE_BF16:GGML_TYPE_F16)));
llama_ctx_params.type_v = (inputs.quant_v==2?GGML_TYPE_Q4_0:(inputs.quant_v==1?GGML_TYPE_Q8_0:(inputs.quant_v==3?GGML_TYPE_BF16:GGML_TYPE_F16)));
llama_ctx_params.type_k = (inputs.quant_k==4?GGML_TYPE_Q4_0:(inputs.quant_k==3?GGML_TYPE_Q5_1:(inputs.quant_k==2?GGML_TYPE_Q8_0:(inputs.quant_k==1?GGML_TYPE_BF16:GGML_TYPE_F16))));
llama_ctx_params.type_v = (inputs.quant_v==4?GGML_TYPE_Q4_0:(inputs.quant_v==3?GGML_TYPE_Q5_1:(inputs.quant_v==2?GGML_TYPE_Q8_0:(inputs.quant_v==1?GGML_TYPE_BF16:GGML_TYPE_F16))));
//apply overrides from autofit
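For clarity, the nested ternaries above define the new KV quantization index scheme passed in from the Python side: 0 = f16 (off), 1 = bf16, 2 = q8_0, 3 = q5_1, 4 = q4_0. A minimal Python sketch of that mapping follows, purely as an illustration of the scheme implied by this hunk; the names KV_TYPE_MAP and kv_type_for_level are hypothetical and do not exist in the codebase.

# Hypothetical illustration of the quant_k/quant_v index scheme used above.
KV_TYPE_MAP = {
    0: "GGML_TYPE_F16",   # off / default
    1: "GGML_TYPE_BF16",
    2: "GGML_TYPE_Q8_0",
    3: "GGML_TYPE_Q5_1",  # newly supported by this commit
    4: "GGML_TYPE_Q4_0",
}

def kv_type_for_level(quant_level: int) -> str:
    # Unknown levels fall back to f16, matching the final else branch of the ternary.
    return KV_TYPE_MAP.get(quant_level, "GGML_TYPE_F16")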

View file

@@ -1408,7 +1408,7 @@ def dump_gguf_metadata(file_path): #if you're gonna copy this into your own proj
data = None
fptr = 0
dt_table = ["u8","i8","u16","i16","u32","i32","f32","bool","str","arr","u64","i64","f64"] #13 types, else error
tt_table = ["f32","f16","q4_0","q4_1","q4_2","q4_3","q5_0","q5_1","q8_0","q8_1","q2_k","q3_k","q4_k","q5_k","q6_k","q8_k","iq2_xxs","iq2_xs","iq3_xxs","iq1_s","iq4_nl","iq3_s","iq2_s","iq4_xs","i8","i16","i32","i64","f64","iq1_m","bf16","q4_0_4_4","q4_0_4_8","q4_0_8_8","tq1_0","tq2_0","iq4_nl_4_4","unknown","unknown","unknown","unknown","unknown"]
tt_table = ["f32","f16","q4_0","q4_1","q4_2","q4_3","q5_0","q5_1","q8_0","q8_1","q2_k","q3_k","q4_k","q5_k","q6_k","q8_k","iq2_xxs","iq2_xs","iq3_xxs","iq1_s","iq4_nl","iq3_s","iq2_s","iq4_xs","i8","i16","i32","i64","f64","iq1_m","bf16","q4_0_4_4","q4_0_4_8","q4_0_8_8","tq1_0","tq2_0","iq4_nl_4_4","iq4_nl_4_8","iq4_nl_8_8","mxfp4","nvfp4","q1_0","unknown","unknown","unknown","unknown"]
def read_data(datatype):
nonlocal fptr, data, dt_table
if datatype=="u32":
@@ -1597,7 +1597,7 @@ def extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,
except Exception:
modelfile_extracted_meta = None
def autoset_gpu_layers(ctxsize, sdquanted, bbs, qkv_level, musiclowvram): #shitty algo to determine how many layers to use
def autoset_gpu_layers(ctxsize, sdquanted, bbs, musiclowvram): #shitty algo to determine how many layers to use
global showusedmemwarning, showmultigpuwarning, modelfile_extracted_meta, calulated_gpu_overhead # reference cached values instead
gpumem = MaxMemory[0]
usedmem = 0
@@ -1670,7 +1670,7 @@ def autoset_gpu_layers(ctxsize, sdquanted, bbs, qkv_level, musiclowvram): #shitt
ratio = (mem-usedmem)/(fsize*csmul*1.6*(1.0 if bbs <= 512 else 1.2))
if headcount > 0:
# rubbish random formula. apply batchsize calculations if over 512
fattn_discount = 1.0/(3.2 if qkv_level==2 else (1.6 if qkv_level==1 else 1.0))
fattn_discount = 1.0
mem1 = layers*(4 if bbs <= 512 else (bbs/128))*headkvlen*cs*fattn_discount*4*1.45
mem2 = layers*headcount*headkvlen*cs*fattn_discount*4*1.15
ratio = max(ratio,(mem - reservedmem - mem1) / (fsize + mem2))
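Note that fattn_discount is now fixed at 1.0, so the auto layer estimate no longer shrinks the KV-cache terms when quantized KV is selected. The short sketch below just evaluates the two memory terms from the hunk with made-up example values to show their rough shape; every number in it is illustrative, not taken from a real model.

# Illustrative evaluation of the two KV-cache memory terms from the estimator above.
# All inputs are made-up example values, not real model parameters.
layers, headcount, headkvlen = 32, 8, 128
cs, bbs = 8192, 512                 # context size and batch size
fattn_discount = 1.0                # fixed at 1.0 after this change

mem1 = layers * (4 if bbs <= 512 else (bbs / 128)) * headkvlen * cs * fattn_discount * 4 * 1.45
mem2 = layers * headcount * headkvlen * cs * fattn_discount * 4 * 1.15
print(f"mem1 = {mem1 / 1024**3:.2f} GiB, mem2 = {mem2 / 1024**3:.2f} GiB")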
@@ -1895,13 +1895,24 @@ def load_model(model_filename):
inputs.use_contextshift = (0 if args.noshift else 1)
inputs.use_fastforward = (0 if args.nofastforward else 1)
inputs.flash_attention = (False if args.noflashattention else True)
if args.quantkv>0:
if args.quantkv:
qkvstr = str(args.quantkv).lower()
qkvval = 0
if qkvstr=="bf16" or qkvstr=="3": #migration for old index based values
qkvval = 1
elif qkvstr=="q8_0" or qkvstr=="1":
qkvval = 2
elif qkvstr=="q5_1":
qkvval = 3
elif qkvstr=="q4_0" or qkvstr=="2":
qkvval = 4
if args.noflashattention:
inputs.quant_k = args.quantkv
inputs.quant_v = 0 if args.quantkv!=3 else args.quantkv
print("\nWarning: Quantized KV was used without flash attention! This is NOT RECOMMENDED!\nOnly K cache can be quantized, and performance can suffer.\nIn some cases, it might even use more VRAM when doing a full offload.\nYou are strongly encouraged to use flash attention if you want to use quantkv.")
inputs.quant_k = qkvval
inputs.quant_v = 0 if qkvval!=1 else qkvval
if qkvval>1:
print("\nWarning: Quantized KV was used without flash attention! This is NOT RECOMMENDED!\nOnly K cache can be quantized, and performance can suffer.\nIn some cases, it might even use more VRAM when doing a full offload.\nYou are strongly encouraged to use flash attention if you want to use quantkv.")
else:
inputs.quant_k = inputs.quant_v = args.quantkv
inputs.quant_k = inputs.quant_v = qkvval
else:
inputs.quant_k = inputs.quant_v = 0
inputs.batchsize = args.batchsize
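The same string-or-legacy-index migration reappears in the GUI settings import later in this diff. A standalone sketch of that mapping is shown here only to make the value scheme explicit; the helper name parse_quantkv is hypothetical and not part of the codebase.

def parse_quantkv(value) -> int:
    # Map a --quantkv setting to the internal level used for quant_k/quant_v.
    # Accepts the new names (f16/bf16/q8_0/q5_1/q4_0) as well as the old
    # index-based values 1/2/3 kept for migration; mirrors the inline logic above.
    s = str(value).lower()
    if s in ("bf16", "3"):   # old index 3 meant bf16
        return 1
    if s in ("q8_0", "1"):   # old index 1 meant q8_0
        return 2
    if s == "q5_1":          # new option added by this commit
        return 3
    if s in ("q4_0", "2"):   # old index 2 meant q4_0
        return 4
    return 0                 # f16 / off

# Example: the new name and the legacy index resolve to the same level.
assert parse_quantkv("q8_0") == parse_quantkv(1) == 2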
@@ -7210,7 +7221,7 @@ def show_gui():
batchsize_values = ["-1","16","32","64","128","256","512","1024","2048","4096"]
batchsize_text = ["Don't Batch","16","32","64","128","256","512","1024","2048","4096"]
contextsize_text = ["256", "512", "1024", "2048", "3072", "4096", "6144", "8192", "10240", "12288", "14336", "16384", "20480", "24576", "28672", "32768", "40960", "49152", "57344", "65536", "81920", "98304", "114688", "131072","163840","196608","229376","262144"]
quantkv_text = ["F16 (Off)","8-Bit","4-Bit","BF16"]
quantkv_text = ["f16","bf16","q8_0","q5_1","q4_0"]
if not any(runopts):
exitcounter = 999
@@ -7423,10 +7434,11 @@ def show_gui():
temp.bind("<Leave>", hide_tooltip)
return temp
def makeslider(parent, label, options, var, from_ , to, row=0, width=160, height=10, set=0, tooltip=""):
def makeslider(parent, label, options, var, row=0, width=160, height=10, set=0, tooltip=""):
sliderLabel = makelabel(parent, options[set], row + 1, 0, columnspan=2, padx=(width+12))
titleLabel = makelabel(parent, label, row,0,tooltip)
from_ = 0
to = len(options)-1
def sliderUpdate(a,b,c):
sliderLabel.configure(text = options[int(var.get())])
var.trace_add("write", sliderUpdate)
@@ -7687,7 +7699,7 @@ def show_gui():
changed_gpulayers_estimate()
def changed_gpulayers_estimate(*args):
autoset_gpu_layers(int(contextsize_text[context_var.get()]),sd_quant_option(sd_quant_var.get()),int(batchsize_values[int(blas_size_var.get())]),(quantkv_var.get() if flashattention_var.get()==1 else 0),musiclowvram_var.get()==1)
autoset_gpu_layers(int(contextsize_text[context_var.get()]),sd_quant_option(sd_quant_var.get()),int(batchsize_values[int(blas_size_var.get())]),musiclowvram_var.get()==1)
max_gpu_layers = (f"{modelfile_extracted_meta[1][0]+1}" if (modelfile_extracted_meta and modelfile_extracted_meta[1] and modelfile_extracted_meta[1][0]!=0) else "")
index = runopts_var.get()
gpu_be = (index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use Vulkan (Older CPU)" or index == "Use CUDA" or index == "Use hipBLAS (ROCm)")
@@ -7761,7 +7773,7 @@ def show_gui():
smartcontextbox.grid_remove()
qkvslider.grid()
qkvlabel.grid()
if flashattention_var.get()==0 and (quantkv_var.get()>0 and quantkv_var.get()!=3):
if flashattention_var.get()==0 and (quantkv_var.get()>1):
noqkvlabel.grid()
else:
noqkvlabel.grid_remove()
@@ -7770,7 +7782,7 @@ def show_gui():
def toggleflashattn(a,b,c):
qkvslider.grid()
qkvlabel.grid()
if flashattention_var.get()==0 and (quantkv_var.get()>0 and quantkv_var.get()!=3):
if flashattention_var.get()==0 and (quantkv_var.get()>1):
noqkvlabel.grid()
else:
noqkvlabel.grid_remove()
@@ -7898,7 +7910,7 @@ def show_gui():
makecheckbox(quick_tab, name, properties[0], int(idx/2) + 20, idx % 2, tooltiptxt=properties[1])
# context size
makeslider(quick_tab, "Context Size:", contextsize_text, context_var, 0, len(contextsize_text)-1, 40, width=280, set=7,tooltip="What is the maximum context size to support. Model specific. You cannot exceed it.\nLarger contexts require more memory, and not all models support it.")
makeslider(quick_tab, "Context Size:", contextsize_text, context_var, 40, width=280, set=7,tooltip="What is the maximum context size to support. Model specific. You cannot exceed it.\nLarger contexts require more memory, and not all models support it.")
# load model
makefileentry(quick_tab, "GGUF Text Model:", "Select GGUF or GGML Model File", model_var, 50, 280, onchoosefile=on_picked_model_file,tooltiptxt="Select a GGUF or GGML model file on disk to be loaded.")
@@ -7954,7 +7966,7 @@ def show_gui():
makecheckbox(hardware_tab, name, properties[0], int(idx/2) + 30, 0, padx=(160 if idx % 2 else 8), tooltiptxt=properties[1])
# blas batch size
makeslider(hardware_tab, "Batch Size:", batchsize_text, blas_size_var, 0, len(batchsize_values)-1, 16,width=200, set=6,tooltip="How many tokens to process at once per batch.\nLarger values use more memory.")
makeslider(hardware_tab, "Batch Size:", batchsize_text, blas_size_var, 16,width=200, set=6,tooltip="How many tokens to process at once per batch.\nLarger values use more memory.")
blas_size_var.trace_add("write", changed_gpulayers_estimate)
makecheckbox(hardware_tab, "Use FlashAttention", flashattention_var, 100, command=toggleflashattn, tooltiptxt="Enable flash attention for GGUF models.")
@@ -7975,7 +7987,7 @@ def show_gui():
makelabelentry(context_tab, "CacheSlots:", smartcacheslots_var, row=5, padx=(300), singleline=True, tooltip="Number of slots for smartcache",labelpadx=(220))
# context size
makeslider(context_tab, "Context Size:",contextsize_text, context_var, 0, len(contextsize_text)-1, 18, width=280, set=7,tooltip="What is the maximum context size to support. Model specific. You cannot exceed it.\nLarger contexts require more memory, and not all models support it.")
makeslider(context_tab, "Context Size:",contextsize_text, context_var, 18, width=280, set=7,tooltip="What is the maximum context size to support. Model specific. You cannot exceed it.\nLarger contexts require more memory, and not all models support it.")
context_var.trace_add("write", changed_gpulayers_estimate)
makelabelentry(context_tab, "Default Gen Amt:", defaultgenamt_var, row=20, padx=(120), singleline=True, tooltip="How many tokens to generate by default, if not specified. Must be smaller than context size. Usually, your frontend GUI will override this.")
makelabelentry(context_tab, "Prompt Limit:", genlimit_var, row=20, padx=(300), singleline=True, tooltip="If set, restricts max output tokens to this limit regardless of API request. Set to 0 to disable.",labelpadx=(210))
@@ -8009,7 +8021,7 @@ def show_gui():
makecheckbox(context_tab, "Custom RoPE Config", variable=customrope_var, row=22, command=togglerope,tooltiptxt="Override the default RoPE configuration with custom RoPE scaling.")
noqkvlabel = makelabel(context_tab,"(Note: QuantKV works best with flash attention)",30,0,"Only K cache can be quantized, and performance can suffer.\nIn some cases, it might even use more VRAM when doing a full offload.",padx=160)
noqkvlabel.configure(text_color="#ff5555")
qkvslider,qkvlabel,qkvtitle = makeslider(context_tab, "Quantize KV Cache:", quantkv_text, quantkv_var, 0, 3, 30, set=0,tooltip="Enable quantization of KV cache.\nRequires Flash Attention for full effect, otherwise only K cache is quantized.")
qkvslider,qkvlabel,qkvtitle = makeslider(context_tab, "Quantize KV Cache:", quantkv_text, quantkv_var, 30, set=0,tooltip="Enable quantization of KV cache.\nRequires Flash Attention for full effect, otherwise only K cache is quantized.")
quantkv_var.trace_add("write", toggleflashattn)
makecheckbox(context_tab, "No BOS Token", nobostoken_var, 43, tooltiptxt="Prevents BOS token from being added at the start of any prompt. Usually NOT recommended for most models.")
makecheckbox(context_tab, "Enable Guidance", enableguidance_var, 43,padx=(140), tooltiptxt="Enables the use of Classifier-Free-Guidance, which allows the use of negative prompts. Has performance and memory impact.")
@@ -8320,7 +8332,7 @@ def show_gui():
args.quiet = quietmode.get()==1
args.nocertify = nocertifymode.get()==1
args.nomodel = nomodel.get()==1
args.quantkv = quantkv_var.get()
qkvopt = quantkv_text[quantkv_var.get()].lower() if (quantkv_var.get()>=0 and quantkv_var.get() < len(quantkv_text)) else "f16"
args.quantkv = qkvopt
args.lowvram = lowvram_var.get()==1
gpuchoiceidx = 0
@@ -8573,7 +8586,17 @@ def show_gui():
nomodel.set(1 if "nomodel" in mydict and mydict["nomodel"] else 0)
lowvram_var.set(1 if "lowvram" in mydict and mydict["lowvram"] else 0)
if "quantkv" in mydict:
quantkv_var.set(mydict["quantkv"])
qkvstr = str(mydict["quantkv"]).lower()
qkvval = 0
if qkvstr=="bf16" or qkvstr=="3": #migration for old index based values
qkvval = 1
elif qkvstr=="q8_0" or qkvstr=="1":
qkvval = 2
elif qkvstr=="q5_1":
qkvval = 3
elif qkvstr=="q4_0" or qkvstr=="2":
qkvval = 4
quantkv_var.set(qkvval)
if "usecuda" in mydict and mydict["usecuda"]:
if cublas_option is not None or hipblas_option is not None:
if cublas_option:
@@ -10495,7 +10518,7 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
if (not args.usecpu) and ((args.usecuda is not None) or (args.usevulkan is not None) or sys.platform=="darwin"):
if MaxMemory[0] > 0:
extract_modelfile_params(args.model_param,args.sdmodel,args.whispermodel,args.mmproj,args.draftmodel,args.ttsmodel if args.ttsgpu else "",args.embeddingsmodel if args.embeddingsgpu else "", args.musicllm, args.musicdiffusion)
layeramt = autoset_gpu_layers(args.contextsize,args.sdquant,args.batchsize,(0 if args.noflashattention else args.quantkv),args.musiclowvram)
layeramt = autoset_gpu_layers(args.contextsize,args.sdquant,args.batchsize,args.musiclowvram)
print(f"Auto Recommended GPU Layers: {layeramt}")
args.gpulayers = layeramt
else:
@@ -11189,7 +11212,7 @@ if __name__ == '__main__':
advparser.add_argument("--jinja_kwargs","--jinja-kwargs","--jinjakwargs","--chat-template-kwargs", metavar=('{"parameter":"value",...}'), help="Set additiona fields for Jinja JSON template parser, must be a valid JSON object.", default="")
advparser.add_argument("--noflashattention","--no-flash-attn","-nofa", help="Disables flash attention.", action='store_true')
advparser.add_argument("--lowvram","-nkvo","--no-kv-offload", help="If supported by the backend, do not offload KV to GPU (lowvram mode). Not recommended, will be slow.", action='store_true')
advparser.add_argument("--quantkv", help="Sets the KV cache data type quantization, 0=f16, 1=q8, 2=q4, 3=bf16. Requires Flash Attention for full effect, otherwise only K cache is quantized.",metavar=('[quantization level 0/1/2]'), type=int, choices=[0,1,2,3], default=0)
advparser.add_argument("--quantkv", help="Sets the KV cache data type quantization, options are f16/bf16/q8_0/q5_1/q4_0. Requires Flash Attention for full effect, otherwise only K cache is quantized.",metavar=('[quantization level f16/bf16/q8_0/q5_1/q4_0]'), type=str, choices=["f16","bf16","q8_0","q5_1","q4_0","0","1","2","3"], default="f16")
advparser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently. Outdated. Not recommended.", action='store_true')
advparser.add_argument("--unpack", help="Extracts the file contents of the KoboldCpp binary into a target directory.", metavar=('destination'), type=str, default="")
advparser.add_argument("--exportconfig", help="Exports the current selected arguments as a .kcpps settings file", metavar=('[filename]'), type=str, default="")

View file

@@ -245,7 +245,7 @@ llama_kv_cache::llama_kv_cache(
map_layer_ids[il] = map_layer_ids[il_reuse];
LLAMA_LOG_DEBUG("%s: - layer %3d: reuse layer %d, is_swa = %d\n", __func__, il, il_reuse, hparams.is_swa(il));
// LLAMA_LOG_DEBUG("%s: - layer %3d: reuse layer %d, is_swa = %d\n", __func__, il, il_reuse, hparams.is_swa(il));
}
}