mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-22 03:10:03 +00:00
rename blasbatchsize to batchsize
This commit is contained in:
parent
3712c6e6cd
commit
57e1d9c822
3 changed files with 17 additions and 15 deletions
2
expose.h
2
expose.h
|
|
@ -56,7 +56,7 @@ struct load_model_inputs
|
|||
const int clblast_info = 0;
|
||||
const int kcpp_main_gpu = 0;
|
||||
const char * vulkan_info = nullptr;
|
||||
const int blasbatchsize = 512;
|
||||
const int batchsize = 512;
|
||||
const int forceversion = 0;
|
||||
const int gpulayers = 0;
|
||||
const float rope_freq_scale = 1.0f;
|
||||
|
|
|
|||
|
|
@ -1972,7 +1972,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|||
kcpp_data->n_threads = inputs.threads;
|
||||
kcpp_data->n_blasthreads = inputs.blasthreads;
|
||||
bool isGguf = (file_format == FileFormat::GGUF_GENERIC);
|
||||
kcpp_data->n_batch = GetBatchSize(inputs.blasbatchsize, in_file_format);
|
||||
kcpp_data->n_batch = GetBatchSize(inputs.batchsize, in_file_format);
|
||||
kcpp_data->n_ubatch = kcpp_data->n_batch;
|
||||
kcpp_data->flash_attn = inputs.flash_attention;
|
||||
kcpp_data->model_filename = inputs.model_filename;
|
||||
|
|
|
|||
28
koboldcpp.py
28
koboldcpp.py
|
|
@ -192,7 +192,7 @@ class load_model_inputs(ctypes.Structure):
|
|||
("clblast_info", ctypes.c_int),
|
||||
("kcpp_main_gpu", ctypes.c_int),
|
||||
("vulkan_info", ctypes.c_char_p),
|
||||
("blasbatchsize", ctypes.c_int),
|
||||
("batchsize", ctypes.c_int),
|
||||
("forceversion", ctypes.c_int),
|
||||
("gpulayers", ctypes.c_int),
|
||||
("rope_freq_scale", ctypes.c_float),
|
||||
|
|
@ -1177,7 +1177,7 @@ def autoset_gpu_layers(ctxsize, sdquanted, bbs, qkv_level): #shitty algo to dete
|
|||
headkvlen = (ggufmeta[2] if ggufmeta[2] > 0 else 128)
|
||||
ratio = (mem-usedmem)/(fsize*csmul*1.6*(1.0 if bbs <= 512 else 1.2))
|
||||
if headcount > 0:
|
||||
# rubbish random formula. apply blasbatchsize calculations if over 512
|
||||
# rubbish random formula. apply batchsize calculations if over 512
|
||||
fattn_discount = 1.0/(3.2 if qkv_level==2 else (1.6 if qkv_level==1 else 1.0))
|
||||
mem1 = layers*(4 if bbs <= 512 else (bbs/128))*headkvlen*cs*fattn_discount*4*1.45
|
||||
mem2 = layers*headcount*headkvlen*cs*fattn_discount*4*1.15
|
||||
|
|
@ -1423,7 +1423,7 @@ def load_model(model_filename):
|
|||
print("\nWarning: quantkv was used without flashattention! This is NOT RECOMMENDED!\nOnly K cache can be quantized, and performance can suffer.\nIn some cases, it might even use more VRAM when doing a full offload.\nYou are strongly encouraged to use flashattention if you want to use quantkv.")
|
||||
else:
|
||||
inputs.quant_k = inputs.quant_v = 0
|
||||
inputs.blasbatchsize = args.blasbatchsize
|
||||
inputs.batchsize = args.batchsize
|
||||
inputs.forceversion = args.forceversion
|
||||
inputs.gpulayers = args.gpulayers
|
||||
if args.overridenativecontext and args.overridenativecontext>0:
|
||||
|
|
@ -4702,8 +4702,8 @@ def show_gui():
|
|||
|
||||
tabcontent = {}
|
||||
# slider data
|
||||
blasbatchsize_values = ["-1","16","32","64","128","256","512","1024","2048","4096"]
|
||||
blasbatchsize_text = ["Don't Batch","16","32","64","128","256","512","1024","2048","4096"]
|
||||
batchsize_values = ["-1","16","32","64","128","256","512","1024","2048","4096"]
|
||||
batchsize_text = ["Don't Batch","16","32","64","128","256","512","1024","2048","4096"]
|
||||
contextsize_text = ["256", "512", "1024", "2048", "3072", "4096", "6144", "8192", "10240", "12288", "14336", "16384", "20480", "24576", "28672", "32768", "40960", "49152", "57344", "65536", "81920", "98304", "114688", "131072"]
|
||||
antirunopts = [opt.replace("Use ", "") for lib, opt in lib_option_pairs if opt not in runopts]
|
||||
quantkv_text = ["F16 (Off)","8-Bit","4-Bit"]
|
||||
|
|
@ -5153,7 +5153,7 @@ def show_gui():
|
|||
pass
|
||||
|
||||
def changed_gpulayers_estimate(*args):
|
||||
predicted_gpu_layers = autoset_gpu_layers(int(contextsize_text[context_var.get()]),sd_quant_option(sd_quant_var.get()),int(blasbatchsize_values[int(blas_size_var.get())]),(quantkv_var.get() if flashattention_var.get()==1 else 0))
|
||||
predicted_gpu_layers = autoset_gpu_layers(int(contextsize_text[context_var.get()]),sd_quant_option(sd_quant_var.get()),int(batchsize_values[int(blas_size_var.get())]),(quantkv_var.get() if flashattention_var.get()==1 else 0))
|
||||
max_gpu_layers = (f"/{modelfile_extracted_meta[1][0]+1}" if (modelfile_extracted_meta and modelfile_extracted_meta[1] and modelfile_extracted_meta[1][0]!=0) else "")
|
||||
index = runopts_var.get()
|
||||
gpu_be = (index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Old CPU)" or index == "Use CLBlast (Older CPU)" or index == "Use CUDA" or index == "Use hipBLAS (ROCm)")
|
||||
|
|
@ -5409,7 +5409,7 @@ def show_gui():
|
|||
# blas thread specifier
|
||||
makelabelentry(hardware_tab, "Batch Threads:" , blas_threads_var, 14, 50,tooltip="How many threads to use during batched processing.\nIf left blank, uses same value as regular thread count.")
|
||||
# blas batch size
|
||||
makeslider(hardware_tab, "Batch Size:", blasbatchsize_text, blas_size_var, 0, len(blasbatchsize_values)-1, 16,width=200, set=6,tooltip="How many tokens to process at once per batch.\nLarger values use more memory.")
|
||||
makeslider(hardware_tab, "Batch Size:", batchsize_text, blas_size_var, 0, len(batchsize_values)-1, 16,width=200, set=6,tooltip="How many tokens to process at once per batch.\nLarger values use more memory.")
|
||||
blas_size_var.trace_add("write", changed_gpulayers_estimate)
|
||||
|
||||
# force version
|
||||
|
|
@ -5756,7 +5756,7 @@ def show_gui():
|
|||
|
||||
args.maingpu = -1 if maingpu_var.get()=="" else int(maingpu_var.get())
|
||||
args.blasthreads = None if blas_threads_var.get()=="" else int(blas_threads_var.get())
|
||||
args.blasbatchsize = int(blasbatchsize_values[int(blas_size_var.get())])
|
||||
args.batchsize = int(batchsize_values[int(blas_size_var.get())])
|
||||
args.forceversion = 0 if version_var.get()=="" else int(version_var.get())
|
||||
args.contextsize = int(contextsize_text[context_var.get()])
|
||||
if customrope_var.get()==1:
|
||||
|
|
@ -6019,8 +6019,8 @@ def show_gui():
|
|||
if "overridetensors" in dict and dict["overridetensors"]:
|
||||
override_tensors_var.set(dict["overridetensors"])
|
||||
|
||||
if "blasbatchsize" in dict and dict["blasbatchsize"]:
|
||||
blas_size_var.set(blasbatchsize_values.index(str(dict["blasbatchsize"])))
|
||||
if "batchsize" in dict and dict["batchsize"]:
|
||||
blas_size_var.set(batchsize_values.index(str(dict["batchsize"])))
|
||||
|
||||
version_var.set(str(dict["forceversion"]) if ("forceversion" in dict and dict["forceversion"]) else "0")
|
||||
model_var.set(dict["model_param"] if ("model_param" in dict and dict["model_param"]) else "")
|
||||
|
|
@ -6427,6 +6427,8 @@ def convert_invalid_args(args):
|
|||
dict["usecuda"] = dict["usecublas"]
|
||||
if "usecuda" in dict and dict["usecuda"] and "lowvram" in dict["usecuda"]:
|
||||
dict["lowvram"] = True
|
||||
if "batchsize" not in dict and "blasbatchsize" in dict and dict["blasbatchsize"]:
|
||||
dict["batchsize"] = dict["blasbatchsize"]
|
||||
if "sdconfig" in dict and dict["sdconfig"] and len(dict["sdconfig"])>0:
|
||||
dict["sdmodel"] = dict["sdconfig"][0]
|
||||
if dict["sdconfig"] and len(dict["sdconfig"]) > 1:
|
||||
|
|
@ -7328,7 +7330,7 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
|
|||
if args.gpulayers==-1:
|
||||
if MaxMemory[0] > 0 and (not args.usecpu) and ((args.usecuda is not None) or (args.usevulkan is not None) or (args.useclblast is not None) or sys.platform=="darwin"):
|
||||
extract_modelfile_params(args.model_param,args.sdmodel,args.whispermodel,args.mmproj,args.draftmodel,args.ttsmodel if args.ttsgpu else "",args.embeddingsmodel if args.embeddingsgpu else "")
|
||||
layeramt = autoset_gpu_layers(args.contextsize,args.sdquant,args.blasbatchsize,(args.quantkv if args.flashattention else 0))
|
||||
layeramt = autoset_gpu_layers(args.contextsize,args.sdquant,args.batchsize,(args.quantkv if args.flashattention else 0))
|
||||
print(f"Auto Recommended GPU Layers: {layeramt}")
|
||||
args.gpulayers = layeramt
|
||||
else:
|
||||
|
|
@ -7759,7 +7761,7 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
|
|||
s_pp = float(benchmaxctx-benchlen)/t_pp
|
||||
s_gen = float(benchlen)/t_gen
|
||||
datetimestamp = datetime.now(timezone.utc)
|
||||
benchflagstr = f"NoAVX2={args.noavx2} Threads={args.threads} HighPriority={args.highpriority} Cuda_Args={args.usecuda} Tensor_Split={args.tensor_split} BlasThreads={args.blasthreads} BlasBatchSize={args.blasbatchsize} FlashAttention={args.flashattention} KvCache={args.quantkv}"
|
||||
benchflagstr = f"NoAVX2={args.noavx2} Threads={args.threads} HighPriority={args.highpriority} Cuda_Args={args.usecuda} Tensor_Split={args.tensor_split} BlasThreads={args.blasthreads} BatchSize={args.batchsize} FlashAttention={args.flashattention} KvCache={args.quantkv}"
|
||||
print(f"\nBenchmark Completed - v{KcppVersion} Results:\n======")
|
||||
print(f"Flags: {benchflagstr}")
|
||||
print(f"Timestamp: {datetimestamp}")
|
||||
|
|
@ -7839,7 +7841,7 @@ if __name__ == '__main__':
|
|||
advparser.add_argument("--version", help="Prints version and exits.", action='store_true')
|
||||
advparser.add_argument("--analyze", metavar=('[filename]'), help="Reads the metadata, weight types and tensor names in any GGUF file.", default="")
|
||||
advparser.add_argument("--maingpu","--main-gpu","-mg", help="Only used in a multi-gpu setup. Sets the index of the main GPU that will be used.",metavar=('[Device ID]'), type=int, default=-1)
|
||||
advparser.add_argument("--blasbatchsize","--batchsize","--batch-size","-b", help="Sets the batch size used in batched processing (default 512). Setting it to -1 disables batched mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,16,32,64,128,256,512,1024,2048,4096], default=512)
|
||||
advparser.add_argument("--batchsize","--blasbatchsize","--batch-size","-b", help="Sets the batch size used in batched processing (default 512). Setting it to -1 disables batched mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,16,32,64,128,256,512,1024,2048,4096], default=512)
|
||||
advparser.add_argument("--blasthreads","--batchthreads","--threadsbatch","--threads-batch", help="Use a different number of threads during batching if specified. Otherwise, has the same value as --threads",metavar=('[threads]'), type=int, default=0)
|
||||
advparser.add_argument("--lora", help="GGUF models only, applies a lora file on top of model.", metavar=('[lora_filename]'), nargs='+')
|
||||
advparser.add_argument("--loramult", metavar=('[amount]'), help="Multiplier for the Text LORA model to be applied.", type=float, default=1.0)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue