From 57e1d9c8220acec6213833a2838707a89809f183 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Fri, 24 Oct 2025 18:16:54 +0800 Subject: [PATCH] rename blasbatchsize to batchsize --- expose.h | 2 +- gpttype_adapter.cpp | 2 +- koboldcpp.py | 28 +++++++++++++++------------- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/expose.h b/expose.h index fa3a9f7d1..085d31adb 100644 --- a/expose.h +++ b/expose.h @@ -56,7 +56,7 @@ struct load_model_inputs const int clblast_info = 0; const int kcpp_main_gpu = 0; const char * vulkan_info = nullptr; - const int blasbatchsize = 512; + const int batchsize = 512; const int forceversion = 0; const int gpulayers = 0; const float rope_freq_scale = 1.0f; diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index c01a1e969..50fa405a2 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -1972,7 +1972,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in kcpp_data->n_threads = inputs.threads; kcpp_data->n_blasthreads = inputs.blasthreads; bool isGguf = (file_format == FileFormat::GGUF_GENERIC); - kcpp_data->n_batch = GetBatchSize(inputs.blasbatchsize, in_file_format); + kcpp_data->n_batch = GetBatchSize(inputs.batchsize, in_file_format); kcpp_data->n_ubatch = kcpp_data->n_batch; kcpp_data->flash_attn = inputs.flash_attention; kcpp_data->model_filename = inputs.model_filename; diff --git a/koboldcpp.py b/koboldcpp.py index 8152ed6ab..2108be202 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -192,7 +192,7 @@ class load_model_inputs(ctypes.Structure): ("clblast_info", ctypes.c_int), ("kcpp_main_gpu", ctypes.c_int), ("vulkan_info", ctypes.c_char_p), - ("blasbatchsize", ctypes.c_int), + ("batchsize", ctypes.c_int), ("forceversion", ctypes.c_int), ("gpulayers", ctypes.c_int), ("rope_freq_scale", ctypes.c_float), @@ -1177,7 +1177,7 @@ def autoset_gpu_layers(ctxsize, sdquanted, bbs, qkv_level): #shitty algo to dete headkvlen = (ggufmeta[2] if ggufmeta[2] > 0 else 128) ratio = (mem-usedmem)/(fsize*csmul*1.6*(1.0 if bbs <= 512 else 1.2)) if headcount > 0: - # rubbish random formula. apply blasbatchsize calculations if over 512 + # rubbish random formula. apply batchsize calculations if over 512 fattn_discount = 1.0/(3.2 if qkv_level==2 else (1.6 if qkv_level==1 else 1.0)) mem1 = layers*(4 if bbs <= 512 else (bbs/128))*headkvlen*cs*fattn_discount*4*1.45 mem2 = layers*headcount*headkvlen*cs*fattn_discount*4*1.15 @@ -1423,7 +1423,7 @@ def load_model(model_filename): print("\nWarning: quantkv was used without flashattention! This is NOT RECOMMENDED!\nOnly K cache can be quantized, and performance can suffer.\nIn some cases, it might even use more VRAM when doing a full offload.\nYou are strongly encouraged to use flashattention if you want to use quantkv.") else: inputs.quant_k = inputs.quant_v = 0 - inputs.blasbatchsize = args.blasbatchsize + inputs.batchsize = args.batchsize inputs.forceversion = args.forceversion inputs.gpulayers = args.gpulayers if args.overridenativecontext and args.overridenativecontext>0: @@ -4702,8 +4702,8 @@ def show_gui(): tabcontent = {} # slider data - blasbatchsize_values = ["-1","16","32","64","128","256","512","1024","2048","4096"] - blasbatchsize_text = ["Don't Batch","16","32","64","128","256","512","1024","2048","4096"] + batchsize_values = ["-1","16","32","64","128","256","512","1024","2048","4096"] + batchsize_text = ["Don't Batch","16","32","64","128","256","512","1024","2048","4096"] contextsize_text = ["256", "512", "1024", "2048", "3072", "4096", "6144", "8192", "10240", "12288", "14336", "16384", "20480", "24576", "28672", "32768", "40960", "49152", "57344", "65536", "81920", "98304", "114688", "131072"] antirunopts = [opt.replace("Use ", "") for lib, opt in lib_option_pairs if opt not in runopts] quantkv_text = ["F16 (Off)","8-Bit","4-Bit"] @@ -5153,7 +5153,7 @@ def show_gui(): pass def changed_gpulayers_estimate(*args): - predicted_gpu_layers = autoset_gpu_layers(int(contextsize_text[context_var.get()]),sd_quant_option(sd_quant_var.get()),int(blasbatchsize_values[int(blas_size_var.get())]),(quantkv_var.get() if flashattention_var.get()==1 else 0)) + predicted_gpu_layers = autoset_gpu_layers(int(contextsize_text[context_var.get()]),sd_quant_option(sd_quant_var.get()),int(batchsize_values[int(blas_size_var.get())]),(quantkv_var.get() if flashattention_var.get()==1 else 0)) max_gpu_layers = (f"/{modelfile_extracted_meta[1][0]+1}" if (modelfile_extracted_meta and modelfile_extracted_meta[1] and modelfile_extracted_meta[1][0]!=0) else "") index = runopts_var.get() gpu_be = (index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Old CPU)" or index == "Use CLBlast (Older CPU)" or index == "Use CUDA" or index == "Use hipBLAS (ROCm)") @@ -5409,7 +5409,7 @@ def show_gui(): # blas thread specifier makelabelentry(hardware_tab, "Batch Threads:" , blas_threads_var, 14, 50,tooltip="How many threads to use during batched processing.\nIf left blank, uses same value as regular thread count.") # blas batch size - makeslider(hardware_tab, "Batch Size:", blasbatchsize_text, blas_size_var, 0, len(blasbatchsize_values)-1, 16,width=200, set=6,tooltip="How many tokens to process at once per batch.\nLarger values use more memory.") + makeslider(hardware_tab, "Batch Size:", batchsize_text, blas_size_var, 0, len(batchsize_values)-1, 16,width=200, set=6,tooltip="How many tokens to process at once per batch.\nLarger values use more memory.") blas_size_var.trace_add("write", changed_gpulayers_estimate) # force version @@ -5756,7 +5756,7 @@ def show_gui(): args.maingpu = -1 if maingpu_var.get()=="" else int(maingpu_var.get()) args.blasthreads = None if blas_threads_var.get()=="" else int(blas_threads_var.get()) - args.blasbatchsize = int(blasbatchsize_values[int(blas_size_var.get())]) + args.batchsize = int(batchsize_values[int(blas_size_var.get())]) args.forceversion = 0 if version_var.get()=="" else int(version_var.get()) args.contextsize = int(contextsize_text[context_var.get()]) if customrope_var.get()==1: @@ -6019,8 +6019,8 @@ def show_gui(): if "overridetensors" in dict and dict["overridetensors"]: override_tensors_var.set(dict["overridetensors"]) - if "blasbatchsize" in dict and dict["blasbatchsize"]: - blas_size_var.set(blasbatchsize_values.index(str(dict["blasbatchsize"]))) + if "batchsize" in dict and dict["batchsize"]: + blas_size_var.set(batchsize_values.index(str(dict["batchsize"]))) version_var.set(str(dict["forceversion"]) if ("forceversion" in dict and dict["forceversion"]) else "0") model_var.set(dict["model_param"] if ("model_param" in dict and dict["model_param"]) else "") @@ -6427,6 +6427,8 @@ def convert_invalid_args(args): dict["usecuda"] = dict["usecublas"] if "usecuda" in dict and dict["usecuda"] and "lowvram" in dict["usecuda"]: dict["lowvram"] = True + if "batchsize" not in dict and "blasbatchsize" in dict and dict["blasbatchsize"]: + dict["batchsize"] = dict["blasbatchsize"] if "sdconfig" in dict and dict["sdconfig"] and len(dict["sdconfig"])>0: dict["sdmodel"] = dict["sdconfig"][0] if dict["sdconfig"] and len(dict["sdconfig"]) > 1: @@ -7328,7 +7330,7 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False): if args.gpulayers==-1: if MaxMemory[0] > 0 and (not args.usecpu) and ((args.usecuda is not None) or (args.usevulkan is not None) or (args.useclblast is not None) or sys.platform=="darwin"): extract_modelfile_params(args.model_param,args.sdmodel,args.whispermodel,args.mmproj,args.draftmodel,args.ttsmodel if args.ttsgpu else "",args.embeddingsmodel if args.embeddingsgpu else "") - layeramt = autoset_gpu_layers(args.contextsize,args.sdquant,args.blasbatchsize,(args.quantkv if args.flashattention else 0)) + layeramt = autoset_gpu_layers(args.contextsize,args.sdquant,args.batchsize,(args.quantkv if args.flashattention else 0)) print(f"Auto Recommended GPU Layers: {layeramt}") args.gpulayers = layeramt else: @@ -7759,7 +7761,7 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False): s_pp = float(benchmaxctx-benchlen)/t_pp s_gen = float(benchlen)/t_gen datetimestamp = datetime.now(timezone.utc) - benchflagstr = f"NoAVX2={args.noavx2} Threads={args.threads} HighPriority={args.highpriority} Cuda_Args={args.usecuda} Tensor_Split={args.tensor_split} BlasThreads={args.blasthreads} BlasBatchSize={args.blasbatchsize} FlashAttention={args.flashattention} KvCache={args.quantkv}" + benchflagstr = f"NoAVX2={args.noavx2} Threads={args.threads} HighPriority={args.highpriority} Cuda_Args={args.usecuda} Tensor_Split={args.tensor_split} BlasThreads={args.blasthreads} BatchSize={args.batchsize} FlashAttention={args.flashattention} KvCache={args.quantkv}" print(f"\nBenchmark Completed - v{KcppVersion} Results:\n======") print(f"Flags: {benchflagstr}") print(f"Timestamp: {datetimestamp}") @@ -7839,7 +7841,7 @@ if __name__ == '__main__': advparser.add_argument("--version", help="Prints version and exits.", action='store_true') advparser.add_argument("--analyze", metavar=('[filename]'), help="Reads the metadata, weight types and tensor names in any GGUF file.", default="") advparser.add_argument("--maingpu","--main-gpu","-mg", help="Only used in a multi-gpu setup. Sets the index of the main GPU that will be used.",metavar=('[Device ID]'), type=int, default=-1) - advparser.add_argument("--blasbatchsize","--batchsize","--batch-size","-b", help="Sets the batch size used in batched processing (default 512). Setting it to -1 disables batched mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,16,32,64,128,256,512,1024,2048,4096], default=512) + advparser.add_argument("--batchsize","--blasbatchsize","--batch-size","-b", help="Sets the batch size used in batched processing (default 512). Setting it to -1 disables batched mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,16,32,64,128,256,512,1024,2048,4096], default=512) advparser.add_argument("--blasthreads","--batchthreads","--threadsbatch","--threads-batch", help="Use a different number of threads during batching if specified. Otherwise, has the same value as --threads",metavar=('[threads]'), type=int, default=0) advparser.add_argument("--lora", help="GGUF models only, applies a lora file on top of model.", metavar=('[lora_filename]'), nargs='+') advparser.add_argument("--loramult", metavar=('[amount]'), help="Multiplier for the Text LORA model to be applied.", type=float, default=1.0)