From a998588f3a15ce800db5f1dbfd72f971fde5fa57 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Fri, 19 Jul 2024 00:20:11 +0800
Subject: [PATCH] improved estimation

---
 koboldcpp.py | 89 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 54 insertions(+), 35 deletions(-)

diff --git a/koboldcpp.py b/koboldcpp.py
index 01a40b9a4..100a2ed25 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -556,34 +556,40 @@ def string_contains_sequence_substring(inputstr,sequences):
 
 import struct
 
-def read_gguf_layer_count(file_path):
+def read_gguf_metadata(file_path):
+    chunk_size = 8192 # read only first 8kb of file
     try:
-        fsize = os.path.getsize(file_path)
-        if fsize < 10000: #ignore files under 10kb
-            return 0
-        with open(file_path, 'rb') as f:
-            file_header = f.read(4)
-            if file_header != b'GGUF': #file is not GGUF
-                return 0
-            magic_key = b'.block_count'
-            magic_length = len(magic_key)
-            chunk_size = 4096 # read only first 4kb of file
-            data = f.read(chunk_size)
-            index = data.find(magic_key) # Search for the magic number, Read 2 chunks of 4 byte numbers
-            if index != -1 and index + magic_length + 8 <= chunk_size:
-                start_index = index + magic_length
+        def read_gguf_key(keyname,data,maxval):
+            keylen = len(keyname)
+            index = data.find(keyname) # Search for the magic number, Read 2 chunks of 4 byte numbers
+            if index != -1 and index + keylen + 8 <= chunk_size:
+                start_index = index + keylen
                 first_value_bytes = data[start_index:start_index + 4]
                 second_value_bytes = data[start_index + 4:start_index + 8]
                 # Unpack each 4 bytes as an unsigned int32 in little-endian format
-                value1 = struct.unpack('<I', first_value_bytes)[0]
+                value1 = struct.unpack('<I', first_value_bytes)[0] #4 means its a uint32
                 value2 = struct.unpack('<I', second_value_bytes)[0]
-                if value1 == 4 and value2 > 0 and value2 <= 300:
-                    return value2 #contains layer count
+                if value1 == 4 and value2 > 0 and value2 <= maxval:
+                    return value2 #contains the desired value
                 return 0
             else:
                 return 0 #not found
+
+        fsize = os.path.getsize(file_path)
+        if fsize < 10000: #ignore files under 10kb
+            return None
+        with open(file_path, 'rb') as f:
+            file_header = f.read(4)
+            if file_header != b'GGUF': #file is not GGUF
+                return None
+            data = f.read(chunk_size)
+            layercount = read_gguf_key(b'.block_count',data,512)
+            head_count_kv = read_gguf_key(b'.attention.head_count_kv',data,8192)
+            key_length = read_gguf_key(b'.attention.key_length',data,8192)
+            val_length = read_gguf_key(b'.attention.value_length',data,8192)
+            return [layercount,head_count_kv, max(key_length,val_length)]
     except Exception as ex:
-        return 0
+        return None
 
 def autoset_gpu_layers(filepath,ctxsize,gpumem): #shitty algo to determine how many layers to use
     try:
@@ -592,20 +598,28 @@ def autoset_gpu_layers(filepath,ctxsize,gpumem): #shitty algo to determine how m
         if fsize>10000000: #dont bother with models < 10mb
             cs = ctxsize
             mem = gpumem
-            if cs and cs > 4096:
-                fsize *= 1.2
+            csmul = 1.0
+            if cs and cs > 8192:
+                csmul = 1.4
+            elif cs and cs > 4096:
+                csmul = 1.2
             elif cs and cs > 2048:
-                fsize *= 1.1
-            if mem < fsize*1.6:
-                layers = read_gguf_layer_count(filepath)
-                if layers == 0: #fail to read
-                    sizeperlayer = fsize*0.052
+                csmul = 1.1
+            if mem < fsize*1.6*csmul:
+                ggufmeta = read_gguf_metadata(filepath)
+                if not ggufmeta or ggufmeta[0]==0: #fail to read or no layers
+                    sizeperlayer = fsize*csmul*0.052
                     layerlimit = int(min(200,mem/sizeperlayer))
                 else:
-                    ratio = mem/(fsize*1.5)
+                    layers = ggufmeta[0]
+                    headcount = ggufmeta[1]
+                    headkvlen = (ggufmeta[2] if ggufmeta[2] > 0 else 128)
+                    ratio = mem/(fsize*csmul*1.5)
+                    if headcount > 0:
+                        ratio = max(ratio,mem/(fsize*1.34 + (layers*headcount*headkvlen*cs*4.25)))
                     layerlimit = int(ratio*layers)
         else:
-            layerlimit = 200 #assume full offload
+            layerlimit = 200 # assume full offload
         return layerlimit
     except Exception as ex:
         return 0
@@ -696,15 +710,17 @@ def fetch_gpu_properties(testCL,testCU,testVK):
     return
 
 def auto_set_backend_cli():
-    print("\nA .kcppt template was selected from CLI - automatically selecting your backend...\n")
+    print("\nA .kcppt template was selected from CLI - automatically selecting your backend...")
    fetch_gpu_properties(False,True,True)
     if exitcounter < 100 and MaxMemory[0]>3500000000 and (("Use CuBLAS" in runopts and CUDevicesNames[0]!="") or "Use hipBLAS (ROCm)" in runopts) and any(CUDevicesNames):
         if "Use CuBLAS" in runopts or "Use hipBLAS (ROCm)" in runopts:
             args.usecublas = ["normal","mmq"]
+            print("Auto Selected CUDA Backend...\n")
     elif exitcounter < 100 and (1 in VKIsDGPU) and "Use Vulkan" in runopts:
         for i in range(0,len(VKIsDGPU)):
             if VKIsDGPU[i]==1:
                 args.usevulkan = []
+                print("Auto Selected Vulkan Backend...\n")
                 break
 
 def load_model(model_filename):
@@ -2314,7 +2330,7 @@ def show_gui():
     def auto_set_backend_gui(manual_select=False):
         global exitcounter, runmode_untouched
         if manual_select:
-            print("\nA .kcppt template was selected from GUI - automatically selecting your backend...\n")
+            print("\nA .kcppt template was selected from GUI - automatically selecting your backend...")
             runmode_untouched = True
             fetch_gpu_properties(False,True,True)
         else:
@@ -2325,14 +2341,17 @@ def show_gui():
             if "Use CuBLAS" in runopts:
                 runopts_var.set("Use CuBLAS")
                 gpu_choice_var.set("1")
+                print("Auto Selected CUDA Backend...\n")
             elif "Use hipBLAS (ROCm)" in runopts:
                 runopts_var.set("Use hipBLAS (ROCm)")
                 gpu_choice_var.set("1")
+                print("Auto Selected HIP Backend...\n")
         elif exitcounter < 100 and (1 in VKIsDGPU) and runmode_untouched and "Use Vulkan" in runopts:
             for i in range(0,len(VKIsDGPU)):
                 if VKIsDGPU[i]==1:
                     runopts_var.set("Use Vulkan")
                     gpu_choice_var.set(str(i+1))
+                    print("Auto Selected Vulkan Backend...\n")
                     break
         changed_gpu_choice_var()
 
@@ -2654,7 +2673,7 @@ def show_gui():
     def togglehorde(a,b,c):
         horde_items = zip([horde_name_entry, horde_gen_entry, horde_context_entry, horde_apikey_entry, horde_workername_entry],
                           [horde_name_label, horde_gen_label, horde_context_label, horde_apikey_label, horde_workername_label])
-        
+
         for item, label in horde_items:
             if usehorde_var.get() == 1:
                 item.grid()
@@ -2668,7 +2687,7 @@ def show_gui():
 
     makecheckbox(horde_tab, "Configure for Horde", usehorde_var, 19, command=togglehorde,tooltiptxt="Enable the embedded AI Horde worker.")
     togglehorde(1,1,1)
-    
+
     # Image Gen Tab
     images_tab = tabcontent["Image Gen"]
 
@@ -3540,9 +3559,9 @@ def download_model_from_url(url): #returns path to downloaded model when done
         dl_url = url
         if "https://huggingface.co/" in dl_url and "/blob/main/" in dl_url:
             dl_url = dl_url.replace("/blob/main/", "/resolve/main/")
-        print(f"Downloading file from external URL at {dl_url}")
+        print(f"Downloading file from external URL at {dl_url} now...")
         subprocess.run(f"curl -fL {dl_url} -o {mdlfilename}", shell=True, capture_output=True, text=True, check=True, encoding='utf-8')
-        print(f"Download {mdlfilename} completed...", flush=True)
+        print(f"Download {mdlfilename} completed.", flush=True)
         return mdlfilename
     return None
 
@@ -3758,7 +3777,7 @@ def main(launch_args,start_server=True):
     elif args.gpulayers==-1 and not shouldavoidgpu and os.path.exists(args.model_param):
         print("Trying to automatically determine GPU layers...")
         if MaxMemory[0] == 0: #try to get gpu vram for cuda if not picked yet
-            fetch_gpu_properties(False,True,False)
+            fetch_gpu_properties(False,True,True)
             pass
         if MaxMemory[0] > 0:
             layeramt = autoset_gpu_layers(args.model_param, args.contextsize, MaxMemory[0])
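
Note on the key scan: GGUF stores each metadata entry as the key string followed immediately by a uint32 type tag (4 = uint32) and then the value, which is why read_gguf_key() can simply search for the key-name bytes and unpack the eight bytes that follow. A minimal sketch of the byte pattern being matched, using a fabricated buffer rather than a real model file:

    import struct

    # Fabricated metadata fragment: key name, then "<type tag><value>" as
    # two little-endian uint32s; type tag 4 marks a uint32 value.
    buf = b'llama.block_count' + struct.pack('<II', 4, 32)

    key = b'.block_count'
    i = buf.find(key)  # substring scan, as in the patch
    vtype, value = struct.unpack('<II', buf[i + len(key):i + len(key) + 8])
    assert (vtype, value) == (4, 32)  # uint32-typed value: 32 layers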
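
The estimation change itself is easiest to see with numbers. Below is a worked sketch of the new arithmetic, assuming a hypothetical 32-layer model (~4.1 GB file, 8 KV heads, head size 128) with 6 GB of usable VRAM at 8192 context; the model figures are illustrative, only the constants and formulas come from the patch (6 GB is below the fsize*1.6*csmul threshold, so this branch is actually taken):

    # Hypothetical inputs; formulas mirror the patched autoset_gpu_layers().
    fsize = 4_100_000_000  # model file size in bytes (illustrative)
    mem = 6_000_000_000    # usable GPU VRAM in bytes (illustrative)
    cs = 8192              # requested context size

    # Context multiplier: 1.4 above 8192 ctx, 1.2 above 4096, 1.1 above 2048.
    csmul = 1.4 if cs > 8192 else 1.2 if cs > 4096 else 1.1 if cs > 2048 else 1.0

    layers, headcount, headkvlen = 32, 8, 128  # as read_gguf_metadata would report

    # Old-style estimate: VRAM measured against the padded file size alone.
    ratio = mem / (fsize * csmul * 1.5)
    old_limit = int(ratio * layers)

    # New estimate: pad the weights less (1.34x) but charge the KV cache
    # explicitly: layers * kv_heads * head_size * context * 4.25 bytes.
    ratio = max(ratio, mem / (fsize * 1.34 + layers * headcount * headkvlen * cs * 4.25))
    print(old_limit, int(ratio * layers))  # 26 vs 28: two more layers offloaded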
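
For completeness, this is roughly how the pieces fit together at startup after this patch, mirroring the flow in main(); the model path is hypothetical and the functions are assumed to be in scope from koboldcpp.py:

    # Illustrative startup flow: probe VRAM, then derive a layer count.
    model_path = "/models/example-7b-q4_k_m.gguf"  # hypothetical path

    fetch_gpu_properties(False, True, True)  # now also probes Vulkan devices
    if MaxMemory[0] > 0:
        meta = read_gguf_metadata(model_path)  # [layers, kv_heads, head_size] or None
        layeramt = autoset_gpu_layers(model_path, 8192, MaxMemory[0])
        print(f"metadata={meta}, offloading {layeramt} layers")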