From 5a921a40f90edf3a399b46d9651729c5a38433e3 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Thu, 14 Aug 2025 22:54:45 +0800
Subject: [PATCH] add overridenativecontext flag, stop nagging me

---
 expose.h            |  1 +
 gpttype_adapter.cpp | 15 ++++++++--
 koboldcpp.py        | 70 ++++++++++++++++++++++++++++++++++++---------
 3 files changed, 69 insertions(+), 17 deletions(-)

diff --git a/expose.h b/expose.h
index 0621abe15..575b3306e 100644
--- a/expose.h
+++ b/expose.h
@@ -60,6 +60,7 @@ struct load_model_inputs
     const int gpulayers = 0;
     const float rope_freq_scale = 1.0f;
     const float rope_freq_base = 10000.0f;
+    const int overridenativecontext = 0;
     const int moe_experts = -1;
     const int moecpu = 0;
     const bool no_bos_token = false;
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index db258c6e3..d2b8c832c 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -2021,7 +2021,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     float rope_freq_scale = 1.0f;
     float rope_freq_base = 10000.0f;
     bool overwriteRope = false;
-    if(inputs.rope_freq_scale>0.0f)
+    if(inputs.rope_freq_scale>0.0f && inputs.overridenativecontext==0)
     {
         rope_freq_scale = inputs.rope_freq_scale;
         rope_freq_base = inputs.rope_freq_base;
@@ -2030,8 +2030,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     }
     else
     {
+        const int maxctxtrain = (inputs.overridenativecontext>0?inputs.overridenativecontext:2048);
         //Set freq base for all, including non GGUF. If we are using GGUF, this will be overwritten with more accurate values later.
-        rope_freq_base = CalcGradientAIRopeFreqBase(10000.0f,2048,kcpp_data->n_ctx, GGUFArch::ARCH_DEFAULT);
+        rope_freq_base = CalcGradientAIRopeFreqBase(10000.0f,maxctxtrain,kcpp_data->n_ctx, GGUFArch::ARCH_DEFAULT);
         if(file_format==FileFormat::GGUF_GENERIC)
         {
             printf("Using automatic RoPE scaling for GGUF. If the model has custom RoPE settings, they'll be used directly instead!\n");
@@ -2369,7 +2370,15 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         {
             //if the model modifies rope in any way, or uses yarn, use the model values. Otherwise, use our automatic ones
             //special exception for llama, which uses auto scale
-            if((llamamodel->hparams.rope_freq_base_train!=10000.0f && llamamodel->hparams.rope_freq_base_train!=500000.0f) ||
+            if(inputs.overridenativecontext > 0)
+            {
+                printf("Automatic RoPE Scaling: Adjusting based on override train context of %d.\n",inputs.overridenativecontext);
+                rope_freq_base = CalcGradientAIRopeFreqBase(llamamodel->hparams.rope_freq_base_train, inputs.overridenativecontext, kcpp_data->n_ctx, file_format_meta.model_architecture);
+                llama_ctx_params.rope_freq_base = rope_freq_base;
+                llama_ctx_params.rope_freq_scale = rope_freq_scale;
+                printf("Automatic RoPE Scaling: Using (scale:%.3f, base:%.1f).\n", rope_freq_scale, rope_freq_base);
+            }
+            else if((llamamodel->hparams.rope_freq_base_train!=10000.0f && llamamodel->hparams.rope_freq_base_train!=500000.0f) ||
             llamamodel->hparams.rope_freq_scale_train!=1.0f ||
             llamamodel->hparams.rope_scaling_type_train==2)
             {
diff --git a/koboldcpp.py b/koboldcpp.py
index d63bfe0eb..52da00dc0 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -54,6 +54,7 @@
 default_visionmaxres = 1024
 net_save_slots = 12
 savestate_limit = 3 #3 savestate slots
 default_vae_tile_threshold = 768
+default_native_ctx = 16384
 # abuse prevention
 stop_token_max = 256
@@ -194,6 +195,7 @@ class load_model_inputs(ctypes.Structure):
                 ("gpulayers", ctypes.c_int),
                 ("rope_freq_scale", ctypes.c_float),
                 ("rope_freq_base", ctypes.c_float),
+                ("overridenativecontext", ctypes.c_int),
                 ("moe_experts", ctypes.c_int),
                 ("moecpu", ctypes.c_int),
                 ("no_bos_token", ctypes.c_bool),
@@ -1381,11 +1383,17 @@ def load_model(model_filename):
     inputs.blasbatchsize = args.blasbatchsize
     inputs.forceversion = args.forceversion
     inputs.gpulayers = args.gpulayers
-    inputs.rope_freq_scale = args.ropeconfig[0]
-    if len(args.ropeconfig)>1:
-        inputs.rope_freq_base = args.ropeconfig[1]
-    else:
+    if args.overridenativecontext and args.overridenativecontext>0:
+        inputs.overridenativecontext = args.overridenativecontext
+        inputs.rope_freq_scale = 0
         inputs.rope_freq_base = 10000
+    else:
+        inputs.overridenativecontext = 0
+        inputs.rope_freq_scale = args.ropeconfig[0]
+        if len(args.ropeconfig)>1:
+            inputs.rope_freq_base = args.ropeconfig[1]
+        else:
+            inputs.rope_freq_base = 10000
 
     for n in range(tensor_split_max):
         if args.tensor_split and n < len(args.tensor_split):
@@ -4507,8 +4515,10 @@ def show_gui():
     flashattention_var = ctk.IntVar(value=0)
     context_var = ctk.IntVar()
     customrope_var = ctk.IntVar()
+    manualrope_var = ctk.IntVar()
     customrope_scale = ctk.StringVar(value="1.0")
     customrope_base = ctk.StringVar(value="10000")
+    customrope_nativectx = ctk.StringVar(value=str(default_native_ctx))
     chatcompletionsadapter_var = ctk.StringVar(value="AutoGuess")
     moeexperts_var = ctk.StringVar(value=str(-1))
     moecpu_var = ctk.StringVar(value=str(0))
@@ -5168,16 +5178,31 @@ def show_gui():
     context_var.trace_add("write", changed_gpulayers_estimate)
     makelabelentry(tokens_tab, "Default Gen Amt:", defaultgenamt_var, row=20, padx=120, singleline=True, tooltip="How many tokens to generate by default, if not specified. Must be smaller than context size. Usually, your frontend GUI will override this.")
 
+    nativectx_entry, nativectx_label = makelabelentry(tokens_tab, "Override Native Context:", customrope_nativectx, row=23, padx=146, singleline=True, tooltip="Overrides the native trained context of the loaded model with a custom value to be used for RoPE scaling.")
     customrope_scale_entry, customrope_scale_label = makelabelentry(tokens_tab, "RoPE Scale:", customrope_scale, row=23, padx=100, singleline=True, tooltip="For Linear RoPE scaling. RoPE frequency scale.")
     customrope_base_entry, customrope_base_label = makelabelentry(tokens_tab, "RoPE Base:", customrope_base, row=24, padx=100, singleline=True, tooltip="For NTK Aware Scaling. RoPE frequency base.")
     def togglerope(a,b,c):
-        items = [customrope_scale_label, customrope_scale_entry,customrope_base_label, customrope_base_entry]
-        for idx, item in enumerate(items):
-            if customrope_var.get() == 1:
-                item.grid()
-            else:
+        if customrope_var.get() == 1:
+            manualropebox.grid()
+            enabled_items = [customrope_scale_label, customrope_scale_entry,customrope_base_label, customrope_base_entry]
+            disabled_items = [nativectx_entry,nativectx_label]
+            for idx, item in enumerate(enabled_items):
+                if manualrope_var.get() == 1:
+                    item.grid()
+                else:
+                    item.grid_remove()
+            for idx, item in enumerate(disabled_items):
+                if manualrope_var.get() == 0:
+                    item.grid()
+                else:
+                    item.grid_remove()
+        else:
+            disabled_items = [manualropebox, nativectx_entry,nativectx_label, customrope_scale_label, customrope_scale_entry, customrope_base_label, customrope_base_entry]
+            for idx, item in enumerate(disabled_items):
                 item.grid_remove()
-    makecheckbox(tokens_tab, "Custom RoPE Config", variable=customrope_var, row=22, command=togglerope,tooltiptxt="Override the default RoPE configuration with custom RoPE scaling.")
+    manualropebox = makecheckbox(tokens_tab, "Manual RoPE Scale", variable=manualrope_var, row=22, command=togglerope, padx=166, tooltiptxt="Set RoPE base and scale manually.")
+
+    makecheckbox(tokens_tab, "Custom RoPE Config", variable=customrope_var, row=22, command=togglerope,tooltiptxt="Override the default RoPE configuration with custom RoPE scaling.")
     makecheckbox(tokens_tab, "Use FlashAttention", flashattention_var, 28, command=toggleflashattn, tooltiptxt="Enable flash attention for GGUF models.")
     noqkvlabel = makelabel(tokens_tab,"(Note: QuantKV works best with flash attention)",28,0,"Only K cache can be quantized, and performance can suffer.\nIn some cases, it might even use more VRAM when doing a full offload.",padx=160)
     noqkvlabel.configure(text_color="#ff5555")
@@ -5474,9 +5499,15 @@ def show_gui():
         args.forceversion = 0 if version_var.get()=="" else int(version_var.get())
         args.contextsize = int(contextsize_text[context_var.get()])
         if customrope_var.get()==1:
-            args.ropeconfig = [float(customrope_scale.get()),float(customrope_base.get())]
+            if manualrope_var.get()==1:
+                args.ropeconfig = [float(customrope_scale.get()),float(customrope_base.get())]
+                args.overridenativecontext = 0
+            else:
+                args.ropeconfig = [0.0, 10000.0]
+                args.overridenativecontext = int(customrope_nativectx.get())
         else:
             args.ropeconfig = [0.0, 10000.0]
+            args.overridenativecontext = 0
         args.moeexperts = int(moeexperts_var.get()) if moeexperts_var.get()!="" else -1
         args.moecpu = int(moecpu_var.get()) if moecpu_var.get()!="" else 0
         args.defaultgenamt = int(defaultgenamt_var.get()) if defaultgenamt_var.get()!="" else 512
@@ -5679,13 +5710,24 @@ def show_gui():
             blas_threads_var.set("")
         if "contextsize" in dict and dict["contextsize"]:
             context_var.set(contextsize_text.index(str(dict["contextsize"])))
-        if "ropeconfig" in dict and dict["ropeconfig"] and len(dict["ropeconfig"])>1:
+        if "overridenativecontext" in dict and dict["overridenativecontext"]>0:
+            customrope_var.set(1)
+            manualrope_var.set(0)
+            customrope_nativectx.set(str(dict["overridenativecontext"]))
+        elif "ropeconfig" in dict and dict["ropeconfig"] and len(dict["ropeconfig"])>1:
+            customrope_nativectx.set(default_native_ctx)
             if dict["ropeconfig"][0]>0:
                 customrope_var.set(1)
+                manualrope_var.set(1)
                 customrope_scale.set(str(dict["ropeconfig"][0]))
                 customrope_base.set(str(dict["ropeconfig"][1]))
             else:
                 customrope_var.set(0)
+                manualrope_var.set(0)
+        else:
+            customrope_nativectx.set(default_native_ctx)
+            customrope_var.set(0)
+            manualrope_var.set(0)
         if "moeexperts" in dict and dict["moeexperts"]:
             moeexperts_var.set(dict["moeexperts"])
         if "moecpu" in dict and dict["moecpu"]:
@@ -7462,7 +7504,6 @@ if __name__ == '__main__':
     parser.add_argument("--host", metavar=('[ipaddr]'), help="Host IP to listen on. If this flag is not set, all routable interfaces are accepted.", default="")
     parser.add_argument("--launch", help="Launches a web browser when load is completed.", action='store_true')
     parser.add_argument("--config", metavar=('[filename]'), help="Load settings from a .kcpps file. Other arguments will be ignored", type=str, nargs=1)
-    parser.add_argument("--threads", metavar=('[threads]'), help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=get_default_threads())
 
     compatgroup = parser.add_mutually_exclusive_group()
     compatgroup.add_argument("--usecuda", "--usecublas", "--usehipblas", help="Use CUDA for GPU Acceleration. Requires CUDA. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq|nommq] [rowsplit]'), choices=['normal', 'lowvram', '0', '1', '2', '3', 'all', 'mmq', 'nommq', 'rowsplit'])
@@ -7478,7 +7519,6 @@ if __name__ == '__main__':
     advparser.add_argument("--version", help="Prints version and exits.", action='store_true')
     advparser.add_argument("--analyze", metavar=('[filename]'), help="Reads the metadata, weight types and tensor names in any GGUF file.", default="")
     advparser.add_argument("--maingpu", help="Only used in a multi-gpu setup. Sets the index of the main GPU that will be used.",metavar=('[Device ID]'), type=int, default=-1)
-    advparser.add_argument("--ropeconfig", help="If set, uses customized RoPE scaling from configured frequency scale and frequency base (e.g. --ropeconfig 0.25 10000). Otherwise, uses NTK-Aware scaling set automatically based on context size. For linear rope, simply set the freq-scale and ignore the freq-base",metavar=('[rope-freq-scale]', '[rope-freq-base]'), default=[0.0, 10000.0], type=float, nargs='+')
     advparser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,16,32,64,128,256,512,1024,2048], default=512)
     advparser.add_argument("--blasthreads", help="Use a different number of threads during BLAS if specified. Otherwise, has the same value as --threads",metavar=('[threads]'), type=int, default=0)
     advparser.add_argument("--lora", help="GGUF models only, applies a lora file on top of model.", metavar=('[lora_filename]'), nargs='+')
@@ -7486,6 +7526,8 @@ if __name__ == '__main__':
     advparser.add_argument("--noshift", help="If set, do not attempt to Trim and Shift the GGUF context.", action='store_true')
     advparser.add_argument("--nofastforward", help="If set, do not attempt to fast forward GGUF context (always reprocess). Will also enable noshift", action='store_true')
     advparser.add_argument("--useswa", help="If set, allows Sliding Window Attention (SWA) KV Cache, which saves memory but cannot be used with context shifting.", action='store_true')
+    advparser.add_argument("--ropeconfig", help="If set, uses customized RoPE scaling from configured frequency scale and frequency base (e.g. --ropeconfig 0.25 10000). Otherwise, uses NTK-Aware scaling set automatically based on context size. For linear rope, simply set the freq-scale and ignore the freq-base",metavar=('[rope-freq-scale]', '[rope-freq-base]'), default=[0.0, 10000.0], type=float, nargs='+')
+    advparser.add_argument("--overridenativecontext", help="Overrides the native trained context of the loaded model with a custom value to be used for RoPE scaling.",metavar=('[trained context]'), type=int, default=0)
     compatgroup3 = advparser.add_mutually_exclusive_group()
     compatgroup3.add_argument("--usemmap", help="If set, uses mmap to load model.", action='store_true')
     advparser.add_argument("--usemlock", help="Enables mlock, preventing the RAM used to load the model from being paged out. Not usually recommended.", action='store_true')
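
Reviewer sketch (not part of the patch): with the new flag, CalcGradientAIRopeFreqBase receives the
overridden native context as its assumed training length, so the automatic NTK-aware base adjustment
starts from the model's true native window instead of the hard-coded 2048 fallback. The Python helper
below is a rough illustration of that kind of adjustment, assuming a GradientAI-style power formula;
the function name and exact constants are illustrative, not the shipped C++ implementation.

    import math

    def approx_auto_rope_base(base: float, n_ctx_train: int, n_ctx: int) -> float:
        # Illustrative only: raise the trained frequency base once the target
        # context exceeds the (possibly overridden) native training context.
        if n_ctx <= n_ctx_train:
            return base
        chi_train = n_ctx_train / (2 * math.pi)
        chi_target = n_ctx / (2 * math.pi)
        return base ** (math.log(chi_target) / math.log(chi_train))

    print(approx_auto_rope_base(10000.0, 2048, 32768))   # old 2048 assumption: large base jump
    print(approx_auto_rope_base(10000.0, 16384, 32768))  # with the override: far gentler scaling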
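Usage note: only --overridenativecontext is added by this patch; the other flags in the example
below already exist in koboldcpp, and the exact invocation is hypothetical. For a model natively
trained at 16K but run at 32K, the override lets the automatic scaler work from the true native
window instead of hand-tuned --ropeconfig values:

    python koboldcpp.py --model model.gguf --contextsize 32768 --overridenativecontext 16384

The GUI equivalent is ticking "Custom RoPE Config", leaving "Manual RoPE Scale" unticked, and
filling in "Override Native Context".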