mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-19 08:00:25 +00:00
support bf16 quantkv cache type
This commit is contained in:
parent
326542f480
commit
0c2b679ea3
2 changed files with 8 additions and 8 deletions
|
|
@ -2673,8 +2673,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|||
|
||||
llama_ctx_params.flash_attn_type = (kcpp_data->flash_attn?LLAMA_FLASH_ATTN_TYPE_ENABLED:LLAMA_FLASH_ATTN_TYPE_DISABLED);
|
||||
llama_ctx_params.swa_full = kcpp_data->swa_full;
|
||||
llama_ctx_params.type_k = (inputs.quant_k>1?GGML_TYPE_Q4_0:(inputs.quant_k==1?GGML_TYPE_Q8_0:GGML_TYPE_F16));
|
||||
llama_ctx_params.type_v = (inputs.quant_v>1?GGML_TYPE_Q4_0:(inputs.quant_v==1?GGML_TYPE_Q8_0:GGML_TYPE_F16));
|
||||
llama_ctx_params.type_k = (inputs.quant_k==2?GGML_TYPE_Q4_0:(inputs.quant_k==1?GGML_TYPE_Q8_0:(inputs.quant_k==3?GGML_TYPE_BF16:GGML_TYPE_F16)));
|
||||
llama_ctx_params.type_v = (inputs.quant_v==2?GGML_TYPE_Q4_0:(inputs.quant_v==1?GGML_TYPE_Q8_0:(inputs.quant_v==3?GGML_TYPE_BF16:GGML_TYPE_F16)));
|
||||
|
||||
llama_ctx_v4 = llama_init_from_model(llamamodel, llama_ctx_params);
|
||||
if(load_guidance)
|
||||
|
|
|
|||
12
koboldcpp.py
12
koboldcpp.py
|
|
@ -1764,7 +1764,7 @@ def load_model(model_filename):
|
|||
if args.quantkv>0:
|
||||
if args.noflashattention:
|
||||
inputs.quant_k = args.quantkv
|
||||
inputs.quant_v = 0
|
||||
inputs.quant_v = 0 if args.quantkv!=3 else args.quantkv
|
||||
print("\nWarning: Quantized KV was used without flash attention! This is NOT RECOMMENDED!\nOnly K cache can be quantized, and performance can suffer.\nIn some cases, it might even use more VRAM when doing a full offload.\nYou are strongly encouraged to use flash attention if you want to use quantkv.")
|
||||
else:
|
||||
inputs.quant_k = inputs.quant_v = args.quantkv
|
||||
|
|
@ -6316,7 +6316,7 @@ def show_gui():
|
|||
batchsize_values = ["-1","16","32","64","128","256","512","1024","2048","4096"]
|
||||
batchsize_text = ["Don't Batch","16","32","64","128","256","512","1024","2048","4096"]
|
||||
contextsize_text = ["256", "512", "1024", "2048", "3072", "4096", "6144", "8192", "10240", "12288", "14336", "16384", "20480", "24576", "28672", "32768", "40960", "49152", "57344", "65536", "81920", "98304", "114688", "131072"]
|
||||
quantkv_text = ["F16 (Off)","8-Bit","4-Bit"]
|
||||
quantkv_text = ["F16 (Off)","8-Bit","4-Bit","BF16"]
|
||||
|
||||
if not any(runopts):
|
||||
exitcounter = 999
|
||||
|
|
@ -6856,7 +6856,7 @@ def show_gui():
|
|||
smartcontextbox.grid_remove()
|
||||
qkvslider.grid()
|
||||
qkvlabel.grid()
|
||||
if flashattention_var.get()==0 and quantkv_var.get()>0:
|
||||
if flashattention_var.get()==0 and (quantkv_var.get()>0 and quantkv_var.get()!=3):
|
||||
noqkvlabel.grid()
|
||||
else:
|
||||
noqkvlabel.grid_remove()
|
||||
|
|
@ -6865,7 +6865,7 @@ def show_gui():
|
|||
def toggleflashattn(a,b,c):
|
||||
qkvslider.grid()
|
||||
qkvlabel.grid()
|
||||
if flashattention_var.get()==0 and quantkv_var.get()>0:
|
||||
if flashattention_var.get()==0 and (quantkv_var.get()>0 and quantkv_var.get()!=3):
|
||||
noqkvlabel.grid()
|
||||
else:
|
||||
noqkvlabel.grid_remove()
|
||||
|
|
@ -7103,7 +7103,7 @@ def show_gui():
|
|||
makecheckbox(context_tab, "Custom RoPE Config", variable=customrope_var, row=22, command=togglerope,tooltiptxt="Override the default RoPE configuration with custom RoPE scaling.")
|
||||
noqkvlabel = makelabel(context_tab,"(Note: QuantKV works best with flash attention)",30,0,"Only K cache can be quantized, and performance can suffer.\nIn some cases, it might even use more VRAM when doing a full offload.",padx=160)
|
||||
noqkvlabel.configure(text_color="#ff5555")
|
||||
qkvslider,qkvlabel,qkvtitle = makeslider(context_tab, "Quantize KV Cache:", quantkv_text, quantkv_var, 0, 2, 30, set=0,tooltip="Enable quantization of KV cache.\nRequires Flash Attention for full effect, otherwise only K cache is quantized.")
|
||||
qkvslider,qkvlabel,qkvtitle = makeslider(context_tab, "Quantize KV Cache:", quantkv_text, quantkv_var, 0, 3, 30, set=0,tooltip="Enable quantization of KV cache.\nRequires Flash Attention for full effect, otherwise only K cache is quantized.")
|
||||
quantkv_var.trace_add("write", toggleflashattn)
|
||||
makecheckbox(context_tab, "No BOS Token", nobostoken_var, 43, tooltiptxt="Prevents BOS token from being added at the start of any prompt. Usually NOT recommended for most models.")
|
||||
makecheckbox(context_tab, "Enable Guidance", enableguidance_var, 43,padx=(140), tooltiptxt="Enables the use of Classifier-Free-Guidance, which allows the use of negative prompts. Has performance and memory impact.")
|
||||
|
|
@ -10137,7 +10137,7 @@ if __name__ == '__main__':
|
|||
advparser.add_argument("--jinja_kwargs","--jinja-kwargs","--jinjakwargs","--chat-template-kwargs", metavar=('{"parameter":"value",...}'), help="Set additional fields for Jinja JSON template parser, must be a valid JSON object.", default="")
|
||||
advparser.add_argument("--noflashattention","--no-flash-attn","-nofa", help="Disables flash attention.", action='store_true')
|
||||
advparser.add_argument("--lowvram","-nkvo","--no-kv-offload", help="If supported by the backend, do not offload KV to GPU (lowvram mode). Not recommended, will be slow.", action='store_true')
|
||||
advparser.add_argument("--quantkv", help="Sets the KV cache data type quantization, 0=f16, 1=q8, 2=q4. Requires Flash Attention for full effect, otherwise only K cache is quantized.",metavar=('[quantization level 0/1/2]'), type=int, choices=[0,1,2], default=0)
|
||||
advparser.add_argument("--quantkv", help="Sets the KV cache data type quantization, 0=f16, 1=q8, 2=q4, 3=bf16. Requires Flash Attention for full effect, otherwise only K cache is quantized.",metavar=('[quantization level 0/1/2/3]'), type=int, choices=[0,1,2,3], default=0)
|
||||
advparser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently. Outdated. Not recommended.", action='store_true')
|
||||
advparser.add_argument("--unpack", help="Extracts the file contents of the KoboldCpp binary into a target directory.", metavar=('destination'), type=str, default="")
|
||||
advparser.add_argument("--exportconfig", help="Exports the current selected arguments as a .kcpps settings file", metavar=('[filename]'), type=str, default="")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue