From a998588f3a15ce800db5f1dbfd72f971fde5fa57 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Fri, 19 Jul 2024 00:20:11 +0800
Subject: [PATCH] improved estimation

---
 koboldcpp.py | 89 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 54 insertions(+), 35 deletions(-)

diff --git a/koboldcpp.py b/koboldcpp.py
index 01a40b9a4..100a2ed25 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -556,34 +556,40 @@ def string_contains_sequence_substring(inputstr,sequences):
 
 import struct
 
-def read_gguf_layer_count(file_path):
+def read_gguf_metadata(file_path):
+    chunk_size = 8192 # read only first 8kb of file
     try:
-        fsize = os.path.getsize(file_path)
-        if fsize < 10000: #ignore files under 10kb
-            return 0
-        with open(file_path, 'rb') as f:
-            file_header = f.read(4)
-            if file_header != b'GGUF': #file is not GGUF
-                return 0
-            magic_key = b'.block_count'
-            magic_length = len(magic_key)
-            chunk_size = 4096 # read only first 4kb of file
-            data = f.read(chunk_size)
-            index = data.find(magic_key) # Search for the magic number, Read 2 chunks of 4 byte numbers
-            if index != -1 and index + magic_length + 8 <= chunk_size:
-                start_index = index + magic_length
+        def read_gguf_key(keyname,data,maxval):
+            keylen = len(keyname)
+            index = data.find(keyname) # Search for the magic number, Read 2 chunks of 4 byte numbers
+            if index != -1 and index + keylen + 8 <= chunk_size:
+                start_index = index + keylen
                 first_value_bytes = data[start_index:start_index + 4]
                 second_value_bytes = data[start_index + 4:start_index + 8]
                 # Unpack each 4 bytes as an unsigned int32 in little-endian format
-                value1 = struct.unpack('<I', first_value_bytes)[0]
+                value1 = struct.unpack('<I', first_value_bytes)[0] #4 means its a uint32
                 value2 = struct.unpack('<I', second_value_bytes)[0]
-                if value1 == 4 and value2 > 0 and value2 <= 300:
-                    return value2 #contains layer count
+                if value1 == 4 and value2 > 0 and value2 <= maxval:
+                    return value2 #contains the desired value
                 return 0
             else:
                 return 0 #not found
+
+        fsize = os.path.getsize(file_path)
+        if fsize < 10000: #ignore files under 10kb
+            return None
+        with open(file_path, 'rb') as f:
+            file_header = f.read(4)
+            if file_header != b'GGUF': #file is not GGUF
+                return None
+            data = f.read(chunk_size)
+            layercount = read_gguf_key(b'.block_count',data,512)
+            head_count_kv = read_gguf_key(b'.attention.head_count_kv',data,8192)
+            key_length = read_gguf_key(b'.attention.key_length',data,8192)
+            val_length = read_gguf_key(b'.attention.value_length',data,8192)
+            return [layercount,head_count_kv, max(key_length,val_length)]
     except Exception as ex:
-        return 0
+        return None
 
 def autoset_gpu_layers(filepath,ctxsize,gpumem): #shitty algo to determine how many layers to use
     try:
@@ -592,20 +598,28 @@ def autoset_gpu_layers(filepath,ctxsize,gpumem): #shitty algo to determine how m
         if fsize>10000000: #dont bother with models < 10mb
             cs = ctxsize
             mem = gpumem
-            if cs and cs > 4096:
-                fsize *= 1.2
+            csmul = 1.0
+            if cs and cs > 8192:
+                csmul = 1.4
+            elif cs and cs > 4096:
+                csmul = 1.2
             elif cs and cs > 2048:
-                fsize *= 1.1
-            if mem < fsize*1.6:
-                layers = read_gguf_layer_count(filepath)
-                if layers == 0: #fail to read
-                    sizeperlayer = fsize*0.052
+                csmul = 1.1
+            if mem < fsize*1.6*csmul:
+                ggufmeta = read_gguf_metadata(filepath)
+                if not ggufmeta or ggufmeta[0]==0: #fail to read or no layers
+                    sizeperlayer = fsize*csmul*0.052
                     layerlimit = int(min(200,mem/sizeperlayer))
                 else:
-                    ratio = mem/(fsize*1.5)
+                    layers = ggufmeta[0]
+                    headcount = ggufmeta[1]
+                    headkvlen = (ggufmeta[2] if ggufmeta[2] > 0 else 128)
+                    ratio = mem/(fsize*csmul*1.5)
+                    if headcount > 0:
+                        ratio = max(ratio,mem/(fsize*1.34 + (layers*headcount*headkvlen*cs*4.25)))
                     layerlimit = int(ratio*layers)
         else:
-            layerlimit = 200 #assume full offload
+            layerlimit = 200 # assume full offload
         return layerlimit
     except Exception as ex:
         return 0
@@ -696,15 +710,17 @@ def fetch_gpu_properties(testCL,testCU,testVK):
     return
 
 def auto_set_backend_cli():
-    print("\nA .kcppt template was selected from CLI - automatically selecting your backend...\n")
+    print("\nA .kcppt template was selected from CLI - automatically selecting your backend...")
    fetch_gpu_properties(False,True,True)
     if exitcounter < 100 and MaxMemory[0]>3500000000 and (("Use CuBLAS" in runopts and CUDevicesNames[0]!="") or "Use hipBLAS (ROCm)" in runopts) and any(CUDevicesNames):
         if "Use CuBLAS" in runopts or "Use hipBLAS (ROCm)" in runopts:
             args.usecublas = ["normal","mmq"]
+            print("Auto Selected CUDA Backend...\n")
     elif exitcounter < 100 and (1 in VKIsDGPU) and "Use Vulkan" in runopts:
         for i in range(0,len(VKIsDGPU)):
             if VKIsDGPU[i]==1:
                 args.usevulkan = []
+                print("Auto Selected Vulkan Backend...\n")
                 break
 
 def load_model(model_filename):
@@ -2314,7 +2330,7 @@ def show_gui():
     def auto_set_backend_gui(manual_select=False):
         global exitcounter, runmode_untouched
         if manual_select:
-            print("\nA .kcppt template was selected from GUI - automatically selecting your backend...\n")
+            print("\nA .kcppt template was selected from GUI - automatically selecting your backend...")
             runmode_untouched = True
             fetch_gpu_properties(False,True,True)
         else:
@@ -2325,14 +2341,17 @@ def show_gui():
             if "Use CuBLAS" in runopts:
                 runopts_var.set("Use CuBLAS")
                 gpu_choice_var.set("1")
+                print("Auto Selected CUDA Backend...\n")
             elif "Use hipBLAS (ROCm)" in runopts:
                 runopts_var.set("Use hipBLAS (ROCm)")
                 gpu_choice_var.set("1")
+                print("Auto Selected HIP Backend...\n")
         elif exitcounter < 100 and (1 in VKIsDGPU) and runmode_untouched and "Use Vulkan" in runopts:
             for i in range(0,len(VKIsDGPU)):
                 if VKIsDGPU[i]==1:
                     runopts_var.set("Use Vulkan")
                     gpu_choice_var.set(str(i+1))
+                    print("Auto Selected Vulkan Backend...\n")
                     break
         changed_gpu_choice_var()
 
@@ -2654,7 +2673,7 @@ def show_gui():
     def togglehorde(a,b,c):
         horde_items = zip([horde_name_entry, horde_gen_entry, horde_context_entry, horde_apikey_entry, horde_workername_entry],
                           [horde_name_label, horde_gen_label, horde_context_label, horde_apikey_label, horde_workername_label])
-        
+
         for item, label in horde_items:
             if usehorde_var.get() == 1:
                 item.grid()
@@ -2668,7 +2687,7 @@ def show_gui():
 
     makecheckbox(horde_tab, "Configure for Horde", usehorde_var, 19, command=togglehorde,tooltiptxt="Enable the embedded AI Horde worker.")
     togglehorde(1,1,1)
-    
+
     # Image Gen Tab
     images_tab = tabcontent["Image Gen"]
 
@@ -3540,9 +3559,9 @@ def download_model_from_url(url): #returns path to downloaded model when done
         dl_url = url
         if "https://huggingface.co/" in dl_url and "/blob/main/" in dl_url:
             dl_url = dl_url.replace("/blob/main/", "/resolve/main/")
-        print(f"Downloading file from external URL at {dl_url}")
+        print(f"Downloading file from external URL at {dl_url} now...")
         subprocess.run(f"curl -fL {dl_url} -o {mdlfilename}", shell=True, capture_output=True, text=True, check=True, encoding='utf-8')
-        print(f"Download {mdlfilename} completed...", flush=True)
+        print(f"Download {mdlfilename} completed.", flush=True)
         return mdlfilename
     return None
 
@@ -3758,7 +3777,7 @@ def main(launch_args,start_server=True):
     elif args.gpulayers==-1 and not shouldavoidgpu and os.path.exists(args.model_param):
         print("Trying to automatically determine GPU layers...")
         if MaxMemory[0] == 0: #try to get gpu vram for cuda if not picked yet
-            fetch_gpu_properties(False,True,False)
+            fetch_gpu_properties(False,True,True)
             pass
         if MaxMemory[0] > 0:
             layeramt = autoset_gpu_layers(args.model_param, args.contextsize, MaxMemory[0])
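
Note on the key scan: GGUF stores each metadata entry as the key string followed immediately by a uint32 type tag (4 = uint32) and then the value, which is why read_gguf_key() can simply search for the key-name bytes and unpack the eight bytes that follow. A minimal sketch of the byte pattern being matched, using a fabricated buffer rather than a real model file:

    import struct

    # Fabricated metadata fragment: key name, then "<type tag><value>" as
    # two little-endian uint32s; type tag 4 marks a uint32 value.
    buf = b'llama.block_count' + struct.pack('<II', 4, 32)

    key = b'.block_count'
    i = buf.find(key)  # substring scan, as in the patch
    vtype, value = struct.unpack('<II', buf[i + len(key):i + len(key) + 8])
    assert (vtype, value) == (4, 32)  # uint32-typed value: 32 layers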
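
The estimation change itself is easiest to see with numbers. Below is a worked sketch of the new arithmetic, assuming a hypothetical 32-layer model (~4.1 GB file, 8 KV heads, head size 128) with 6 GB of usable VRAM at 8192 context; the model figures are illustrative, only the constants and formulas come from the patch (6 GB is below the fsize*1.6*csmul threshold, so this branch is actually taken):

    # Hypothetical inputs; formulas mirror the patched autoset_gpu_layers().
    fsize = 4_100_000_000  # model file size in bytes (illustrative)
    mem = 6_000_000_000    # usable GPU VRAM in bytes (illustrative)
    cs = 8192              # requested context size

    # Context multiplier: 1.4 above 8192 ctx, 1.2 above 4096, 1.1 above 2048.
    csmul = 1.4 if cs > 8192 else 1.2 if cs > 4096 else 1.1 if cs > 2048 else 1.0

    layers, headcount, headkvlen = 32, 8, 128  # as read_gguf_metadata would report

    # Old-style estimate: VRAM measured against the padded file size alone.
    ratio = mem / (fsize * csmul * 1.5)
    old_limit = int(ratio * layers)

    # New estimate: pad the weights less (1.34x) but charge the KV cache
    # explicitly: layers * kv_heads * head_size * context * 4.25 bytes.
    ratio = max(ratio, mem / (fsize * 1.34 + layers * headcount * headkvlen * cs * 4.25))
    print(old_limit, int(ratio * layers))  # 26 vs 28: two more layers offloaded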
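
For completeness, this is roughly how the pieces fit together at startup after this patch, mirroring the flow in main(); the model path is hypothetical and the functions are assumed to be in scope from koboldcpp.py:

    # Illustrative startup flow: probe VRAM, then derive a layer count.
    model_path = "/models/example-7b-q4_k_m.gguf"  # hypothetical path

    fetch_gpu_properties(False, True, True)  # now also probes Vulkan devices
    if MaxMemory[0] > 0:
        meta = read_gguf_metadata(model_path)  # [layers, kv_heads, head_size] or None
        layeramt = autoset_gpu_layers(model_path, 8192, MaxMemory[0])
        print(f"metadata={meta}, offloading {layeramt} layers")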