mirror of https://github.com/LostRuins/koboldcpp.git

commit a998588f3a (parent caab9cb8ae)

    improved estimation

1 changed file with 54 additions and 35 deletions

 koboldcpp.py | 89 +++++++++++++++++++++++++++++++++++++++---------------------------------
@@ -556,34 +556,40 @@ def string_contains_sequence_substring(inputstr,sequences):
 
 import struct
 
-def read_gguf_layer_count(file_path):
+def read_gguf_metadata(file_path):
+    chunk_size = 8192 # read only first 8kb of file
     try:
-        fsize = os.path.getsize(file_path)
-        if fsize < 10000: #ignore files under 10kb
-            return 0
-        with open(file_path, 'rb') as f:
-            file_header = f.read(4)
-            if file_header != b'GGUF': #file is not GGUF
-                return 0
-            magic_key = b'.block_count'
-            magic_length = len(magic_key)
-            chunk_size = 4096 # read only first 4kb of file
-            data = f.read(chunk_size)
-            index = data.find(magic_key) # Search for the magic number, Read 2 chunks of 4 byte numbers
-            if index != -1 and index + magic_length + 8 <= chunk_size:
-                start_index = index + magic_length
+        def read_gguf_key(keyname,data,maxval):
+            keylen = len(keyname)
+            index = data.find(keyname) # Search for the magic number, Read 2 chunks of 4 byte numbers
+            if index != -1 and index + keylen + 8 <= chunk_size:
+                start_index = index + keylen
                 first_value_bytes = data[start_index:start_index + 4]
                 second_value_bytes = data[start_index + 4:start_index + 8]
                 # Unpack each 4 bytes as an unsigned int32 in little-endian format
-                value1 = struct.unpack('<I', first_value_bytes)[0]
+                value1 = struct.unpack('<I', first_value_bytes)[0] #4 means its a uint32
                 value2 = struct.unpack('<I', second_value_bytes)[0]
-                if value1 == 4 and value2 > 0 and value2 <= 300:
-                    return value2 #contains layer count
+                if value1 == 4 and value2 > 0 and value2 <= maxval:
+                    return value2 #contains the desired value
                 return 0
             else:
                 return 0 #not found
+
+        fsize = os.path.getsize(file_path)
+        if fsize < 10000: #ignore files under 10kb
+            return None
+        with open(file_path, 'rb') as f:
+            file_header = f.read(4)
+            if file_header != b'GGUF': #file is not GGUF
+                return None
+            data = f.read(chunk_size)
+            layercount = read_gguf_key(b'.block_count',data,512)
+            head_count_kv = read_gguf_key(b'.attention.head_count_kv',data,8192)
+            key_length = read_gguf_key(b'.attention.key_length',data,8192)
+            val_length = read_gguf_key(b'.attention.value_length',data,8192)
+            return [layercount,head_count_kv, max(key_length,val_length)]
     except Exception as ex:
-        return 0
+        return None
 
 def autoset_gpu_layers(filepath,ctxsize,gpumem): #shitty algo to determine how many layers to use
     try:
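For reference, the new read_gguf_metadata does not walk the full GGUF key/value table: it scans the first 8kb of the file for the raw key bytes, then reads two little-endian uint32s right after them, a type tag (4 is GGUF's uint32 type) and the value itself, rejecting anything outside a sanity bound. Below is a minimal standalone sketch of the same find-and-unpack pattern against a synthetic buffer; the key name and value are made up for illustration and this is not code from the commit:

    import struct

    # Minimal sketch of the commit's find-and-unpack pattern (illustration only).
    # GGUF stores each metadata value as a uint32 type tag followed by the
    # payload; tag 4 denotes uint32, little-endian.
    def scan_uint32_key(data: bytes, keyname: bytes, maxval: int) -> int:
        index = data.find(keyname)
        if index == -1 or index + len(keyname) + 8 > len(data):
            return 0  # key absent, or too close to the end of the chunk
        vtype, value = struct.unpack_from('<II', data, index + len(keyname))
        # Mirror the commit's sanity bound so random byte matches are rejected.
        return value if vtype == 4 and 0 < value <= maxval else 0

    # Synthetic blob with a hypothetical key name and the value 32:
    blob = b'llama.block_count' + struct.pack('<II', 4, 32)
    print(scan_uint32_key(blob, b'.block_count', 512))  # prints 32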
@@ -592,20 +598,28 @@ def autoset_gpu_layers(filepath,ctxsize,gpumem): #shitty algo to determine how m
         if fsize>10000000: #dont bother with models < 10mb
             cs = ctxsize
             mem = gpumem
-            if cs and cs > 4096:
-                fsize *= 1.2
+            csmul = 1.0
+            if cs and cs > 8192:
+                csmul = 1.4
+            elif cs and cs > 4096:
+                csmul = 1.2
             elif cs and cs > 2048:
-                fsize *= 1.1
-            if mem < fsize*1.6:
-                layers = read_gguf_layer_count(filepath)
-                if layers == 0: #fail to read
-                    sizeperlayer = fsize*0.052
+                csmul = 1.1
+            if mem < fsize*1.6*csmul:
+                ggufmeta = read_gguf_metadata(filepath)
+                if not ggufmeta or ggufmeta[0]==0: #fail to read or no layers
+                    sizeperlayer = fsize*csmul*0.052
                     layerlimit = int(min(200,mem/sizeperlayer))
                 else:
-                    ratio = mem/(fsize*1.5)
+                    layers = ggufmeta[0]
+                    headcount = ggufmeta[1]
+                    headkvlen = (ggufmeta[2] if ggufmeta[2] > 0 else 128)
+                    ratio = mem/(fsize*csmul*1.5)
+                    if headcount > 0:
+                        ratio = max(ratio,mem/(fsize*1.34 + (layers*headcount*headkvlen*cs*4.25)))
                     layerlimit = int(ratio*layers)
             else:
-                layerlimit = 200 #assume full offload
+                layerlimit = 200 # assume full offload
             return layerlimit
     except Exception as ex:
         return 0
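To see what the reworked estimate does, here is a worked example with hypothetical numbers (a ~4.4 GB quantized file, 8 GB of VRAM, typical 7B-class GQA metadata); the 4.25 factor appears to approximate fp16 K+V cache bytes per layer per context token plus some overhead. None of these figures come from the commit itself:

    # Worked example with hypothetical numbers (illustrative, not from the commit).
    fsize = 4_400_000_000       # ~4.4 GB quantized GGUF file
    mem   = 8_000_000_000       # 8 GB of usable VRAM
    layers, headcount, headkvlen = 32, 8, 128   # typical 7B-class GQA metadata
    cs = 8192                   # requested context size
    csmul = 1.2                 # context multiplier: 4096 < cs <= 8192

    # File-size-only estimate: memory over scaled file size.
    ratio = mem / (fsize * csmul * 1.5)                    # ~1.01

    # KV-aware estimate: ~1.34x file size for weights, plus the KV cache;
    # layers*headcount*headkvlen*cs*4.25 ~ fp16 K+V bytes with overhead.
    kv_bytes = layers * headcount * headkvlen * cs * 4.25  # ~1.14 GB
    ratio = max(ratio, mem / (fsize * 1.34 + kv_bytes))    # ~1.14

    layerlimit = int(ratio * layers)                       # 36 > 32 layers
    print(layerlimit)                                      # i.e. full offload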
@@ -696,15 +710,17 @@ def fetch_gpu_properties(testCL,testCU,testVK):
         return
 
 def auto_set_backend_cli():
-    print("\nA .kcppt template was selected from CLI - automatically selecting your backend...\n")
+    print("\nA .kcppt template was selected from CLI - automatically selecting your backend...")
     fetch_gpu_properties(False,True,True)
     if exitcounter < 100 and MaxMemory[0]>3500000000 and (("Use CuBLAS" in runopts and CUDevicesNames[0]!="") or "Use hipBLAS (ROCm)" in runopts) and any(CUDevicesNames):
         if "Use CuBLAS" in runopts or "Use hipBLAS (ROCm)" in runopts:
             args.usecublas = ["normal","mmq"]
+            print("Auto Selected CUDA Backend...\n")
     elif exitcounter < 100 and (1 in VKIsDGPU) and "Use Vulkan" in runopts:
         for i in range(0,len(VKIsDGPU)):
             if VKIsDGPU[i]==1:
                 args.usevulkan = []
+                print("Auto Selected Vulkan Backend...\n")
                 break
 
 def load_model(model_filename):
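The added prints only report which path was taken; the precedence itself is unchanged: CUDA/HIP is chosen when a named CUDA device with more than ~3.5GB of VRAM is detected, otherwise the first discrete Vulkan GPU wins. A simplified sketch of that ordering, with illustrative parameter names standing in for the real module globals such as MaxMemory, CUDevicesNames and VKIsDGPU:

    # Simplified sketch of the auto-selection precedence (illustrative names,
    # not the real module globals).
    def pick_backend(max_memory: int, cuda_device_names: list, vk_is_dgpu: list) -> str:
        # CUDA/HIP is preferred when a named device with >3.5 GB VRAM was found.
        if max_memory > 3_500_000_000 and any(cuda_device_names):
            return "cuda/hip"
        # Otherwise fall back to the first discrete Vulkan GPU, if any.
        if 1 in vk_is_dgpu:
            return "vulkan"
        return "cpu"

    print(pick_backend(8_000_000_000, ["NVIDIA GeForce RTX 3060"], [0, 1]))  # cuda/hip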
@@ -2314,7 +2330,7 @@ def show_gui():
     def auto_set_backend_gui(manual_select=False):
         global exitcounter, runmode_untouched
         if manual_select:
-            print("\nA .kcppt template was selected from GUI - automatically selecting your backend...\n")
+            print("\nA .kcppt template was selected from GUI - automatically selecting your backend...")
             runmode_untouched = True
             fetch_gpu_properties(False,True,True)
         else:
@@ -2325,14 +2341,17 @@ def show_gui():
             if "Use CuBLAS" in runopts:
                 runopts_var.set("Use CuBLAS")
                 gpu_choice_var.set("1")
+                print("Auto Selected CUDA Backend...\n")
             elif "Use hipBLAS (ROCm)" in runopts:
                 runopts_var.set("Use hipBLAS (ROCm)")
                 gpu_choice_var.set("1")
+                print("Auto Selected HIP Backend...\n")
         elif exitcounter < 100 and (1 in VKIsDGPU) and runmode_untouched and "Use Vulkan" in runopts:
             for i in range(0,len(VKIsDGPU)):
                 if VKIsDGPU[i]==1:
                     runopts_var.set("Use Vulkan")
                     gpu_choice_var.set(str(i+1))
+                    print("Auto Selected Vulkan Backend...\n")
                     break
         changed_gpu_choice_var()
@@ -2654,7 +2673,7 @@ def show_gui():
     def togglehorde(a,b,c):
         horde_items = zip([horde_name_entry, horde_gen_entry, horde_context_entry, horde_apikey_entry, horde_workername_entry],
                           [horde_name_label, horde_gen_label, horde_context_label, horde_apikey_label, horde_workername_label])
 
         for item, label in horde_items:
             if usehorde_var.get() == 1:
                 item.grid()
@@ -2668,7 +2687,7 @@ def show_gui():
 
         makecheckbox(horde_tab, "Configure for Horde", usehorde_var, 19, command=togglehorde,tooltiptxt="Enable the embedded AI Horde worker.")
         togglehorde(1,1,1)
 
         # Image Gen Tab
 
         images_tab = tabcontent["Image Gen"]
@@ -3540,9 +3559,9 @@ def download_model_from_url(url): #returns path to downloaded model when done
         dl_url = url
         if "https://huggingface.co/" in dl_url and "/blob/main/" in dl_url:
             dl_url = dl_url.replace("/blob/main/", "/resolve/main/")
-        print(f"Downloading file from external URL at {dl_url}")
+        print(f"Downloading file from external URL at {dl_url} now...")
         subprocess.run(f"curl -fL {dl_url} -o {mdlfilename}", shell=True, capture_output=True, text=True, check=True, encoding='utf-8')
-        print(f"Download {mdlfilename} completed...", flush=True)
+        print(f"Download {mdlfilename} completed.", flush=True)
         return mdlfilename
     return None
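The rewrite above exists because Hugging Face /blob/main/ URLs serve an HTML page while /resolve/main/ serves the raw file. A quick illustration with a made-up repository path:

    # Illustration with a made-up repository path (not from the commit).
    url = "https://huggingface.co/example-org/example-model/blob/main/model.Q4_K_M.gguf"
    if "https://huggingface.co/" in url and "/blob/main/" in url:
        url = url.replace("/blob/main/", "/resolve/main/")
    print(url)  # .../example-model/resolve/main/model.Q4_K_M.gguf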
@@ -3758,7 +3777,7 @@ def main(launch_args,start_server=True):
     elif args.gpulayers==-1 and not shouldavoidgpu and os.path.exists(args.model_param):
         print("Trying to automatically determine GPU layers...")
         if MaxMemory[0] == 0: #try to get gpu vram for cuda if not picked yet
-            fetch_gpu_properties(False,True,False)
+            fetch_gpu_properties(False,True,True)
             pass
         if MaxMemory[0] > 0:
             layeramt = autoset_gpu_layers(args.model_param, args.contextsize, MaxMemory[0])