Commit bd4e55eb74 (parent 23caa63f94) in koboldcpp, mirror of https://github.com/LostRuins/koboldcpp.git

add used memory checks, add gpulayers for metal

1 changed file with 27 additions and 9 deletions: koboldcpp.py
@@ -46,6 +46,7 @@ showdebug = True
 guimode = False
 showsamplerwarning = True
 showmaxctxwarning = True
+showusedmemwarning = True
 session_kudos_earned = 0
 session_jobs = 0
 session_starttime = None
@@ -80,6 +81,7 @@ CUDevicesNames = ["","","","",""]
 VKDevicesNames = ["","","",""]
 VKIsDGPU = [0,0,0,0]
 MaxMemory = [0]
+MaxFreeMemory = [0]
 
 class logit_bias(ctypes.Structure):
     _fields_ = [("token_id", ctypes.c_int32),
@@ -622,8 +624,16 @@ def extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath)
     except Exception as ex:
         modelfile_extracted_meta = None
 
-def autoset_gpu_layers(ctxsize,gpumem,sdquanted,bbs): #shitty algo to determine how many layers to use
-    global modelfile_extracted_meta # reference cached values instead
+def autoset_gpu_layers(ctxsize,sdquanted,bbs): #shitty algo to determine how many layers to use
+    global showusedmemwarning, modelfile_extracted_meta # reference cached values instead
+    gpumem = MaxMemory[0]
+    usedmem = 0
+    if MaxFreeMemory[0]>0:
+        usedmem = MaxMemory[0]-MaxFreeMemory[0]
+        if showusedmemwarning and usedmem > (2.5*1024*1024*1024):
+            showusedmemwarning = False
+            print(f"Note: KoboldCpp has detected that a significant amount of GPU VRAM ({usedmem/1024/1024} MB) is currently used by another application.\nFor best results, you may wish to close that application and then restart KoboldCpp.\n***")
+    reservedmem = max(1.5*1024*1024*1024,(0.5*1024*1024*1024 + usedmem)) # determine vram overhead
     try:
         if not modelfile_extracted_meta:
             return 0
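The effect of the new overhead logic is easiest to see with concrete numbers. Below is a minimal standalone sketch of the same arithmetic, using hypothetical MaxMemory/MaxFreeMemory values (a 24 GB card with 4 GB already taken by other applications); it is not the commit's code path itself.

```python
# Standalone sketch of the new used-VRAM / reserved-VRAM arithmetic.
GiB = 1024*1024*1024
MaxMemory = [24*GiB]      # hypothetical: total VRAM reported by nvidia-smi
MaxFreeMemory = [20*GiB]  # hypothetical: free VRAM reported by nvidia-smi

usedmem = MaxMemory[0] - MaxFreeMemory[0]      # 4 GiB held by other apps
warn = usedmem > (2.5*GiB)                     # warn once past 2.5 GiB; True here
# reserve at least 1.5 GiB, or 0.5 GiB on top of what is already used:
reservedmem = max(1.5*GiB, 0.5*GiB + usedmem)  # 4.5 GiB
print(usedmem//GiB, reservedmem/GiB, warn)     # 4 4.5 True
```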
@@ -647,15 +657,14 @@ def autoset_gpu_layers(ctxsize,gpumem,sdquanted,bbs): #shitty algo to determine
         ggufmeta = modelfile_extracted_meta[0]
         if not ggufmeta or ggufmeta[0]==0: #fail to read or no layers
             sizeperlayer = fsize*csmul*0.052
-            layerlimit = int(min(200,mem/sizeperlayer))
+            layerlimit = int(min(200,(mem-usedmem)/sizeperlayer))
         else:
             layers = ggufmeta[0]
             headcount = ggufmeta[1]
             headkvlen = (ggufmeta[2] if ggufmeta[2] > 0 else 128)
-            ratio = mem/(fsize*csmul*1.5)
-            computemem = layers*(4 if bbs <= 512 else (bbs/128))*headkvlen*cs*4*1.5 # For now the first 4 is the hardcoded result for a blasbatchsize of 512. Ideally we automatically calculate blasbatchsize / 4 but I couldn't easily grab the value yet - Henk
+            ratio = (mem-usedmem)/(fsize*csmul*1.55)
+            computemem = layers*(4 if bbs <= 512 else (bbs/128))*headkvlen*cs*4*1.5 # apply blasbatchsize calculations if over 512
             contextmem = layers*headcount*headkvlen*cs*4*1.1
-            reservedmem = 1.5*1024*1024*1024 # Users often don't have their GPU's VRAM worth of memory, we assume 500MB to avoid driver swapping + 500MB for the OS + 500MB for background apps / browser - Henk
             if headcount > 0:
                 ratio = max(ratio, (mem - reservedmem - computemem) / (fsize + contextmem))
             layerlimit = min(int(ratio*layers), (layers + 3))
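Put together, the estimate works out as in the sketch below. All inputs are hypothetical (an idle 8 GB card, a 4.1 GB GGUF, 4096 context, made-up metadata of 32 layers, 32 heads, 128 head KV length), and `csmul` stands in for the function's context-scaling multiplier; this is a worked example, not the function itself.

```python
# Rough replica of the layer estimate under hypothetical inputs.
GiB = 1024*1024*1024
mem, usedmem, reservedmem = 8*GiB, 0, 1.5*GiB  # idle 8 GB card
fsize = 4.1*GiB                                # hypothetical GGUF file size
cs, csmul, bbs = 4096, 1.0, 512                # context size, scale multiplier, blas batch size
layers, headcount, headkvlen = 32, 32, 128     # hypothetical GGUF metadata

ratio = (mem-usedmem)/(fsize*csmul*1.55)       # ~1.26
computemem = layers*(4 if bbs <= 512 else (bbs/128))*headkvlen*cs*4*1.5
contextmem = layers*headcount*headkvlen*cs*4*1.1
ratio = max(ratio, (mem - reservedmem - computemem) / (fsize + contextmem))
layerlimit = min(int(ratio*layers), (layers + 3))
print(layerlimit)  # 35, i.e. everything offloaded (layers + 3)
```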
@@ -698,11 +707,13 @@ def fetch_gpu_properties(testCL,testCU,testVK):
     if testCU:
         FetchedCUdevices = []
         FetchedCUdeviceMem = []
+        FetchedCUfreeMem = []
         AMDgpu = None
         try: # Get NVIDIA GPU names
-            output = subprocess.run(['nvidia-smi','--query-gpu=name,memory.total','--format=csv,noheader'], capture_output=True, text=True, check=True, encoding='utf-8').stdout
+            output = subprocess.run(['nvidia-smi','--query-gpu=name,memory.total,memory.free','--format=csv,noheader'], capture_output=True, text=True, check=True, encoding='utf-8').stdout
             FetchedCUdevices = [line.split(",")[0].strip() for line in output.splitlines()]
             FetchedCUdeviceMem = [line.split(",")[1].strip().split(" ")[0].strip() for line in output.splitlines()]
+            FetchedCUfreeMem = [line.split(",")[2].strip().split(" ")[0].strip() for line in output.splitlines()]
         except Exception as e:
             pass
         if len(FetchedCUdevices)==0:
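The widened query returns one CSV row per GPU, and the new free-memory column is split off the same way as the total. A sketch of that parsing against an illustrative, hypothetical nvidia-smi reply:

```python
# Parse a hypothetical single-GPU reply to the widened query.
output = "NVIDIA GeForce RTX 3090, 24576 MiB, 21000 MiB\n"
names  = [line.split(",")[0].strip() for line in output.splitlines()]
totals = [line.split(",")[1].strip().split(" ")[0].strip() for line in output.splitlines()]
frees  = [line.split(",")[2].strip().split(" ")[0].strip() for line in output.splitlines()]
print(names, totals, frees)  # ['NVIDIA GeForce RTX 3090'] ['24576'] ['21000']
```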
@@ -730,6 +741,8 @@ def fetch_gpu_properties(testCL,testCU,testVK):
                 MaxMemory[0] = max(int(FetchedCUdeviceMem[idx]),MaxMemory[0])
             else:
                 MaxMemory[0] = max(int(FetchedCUdeviceMem[idx])*1024*1024,MaxMemory[0])
+            if len(FetchedCUfreeMem)>idx:
+                MaxFreeMemory[0] = max(int(FetchedCUfreeMem[idx])*1024*1024,MaxFreeMemory[0])
 
     if testVK:
         try: # Get Vulkan names
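On the nvidia-smi path the figures are MiB strings, so totals and the new free values are scaled to bytes before landing in the globals, and the length guard covers a free-memory query that never parsed. A minimal sketch reusing the hypothetical figures from above:

```python
# Fold per-device MiB figures into the byte-valued globals.
MaxMemory, MaxFreeMemory = [0], [0]
FetchedCUdeviceMem, FetchedCUfreeMem = ["24576"], ["21000"]  # hypothetical MiB strings
for idx in range(len(FetchedCUdeviceMem)):
    MaxMemory[0] = max(int(FetchedCUdeviceMem[idx])*1024*1024, MaxMemory[0])
    if len(FetchedCUfreeMem) > idx:  # guard: free-memory column may be absent
        MaxFreeMemory[0] = max(int(FetchedCUfreeMem[idx])*1024*1024, MaxFreeMemory[0])
print(MaxMemory[0] - MaxFreeMemory[0])  # bytes currently used by other apps
```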
@@ -2458,7 +2471,7 @@ def show_gui():
             pass
 
     def changed_gpulayers_estimate(*args):
-        predicted_gpu_layers = autoset_gpu_layers(int(contextsize_text[context_var.get()]),MaxMemory[0],(sd_quant_var.get()==1),int(blasbatchsize_values[int(blas_size_var.get())]))
+        predicted_gpu_layers = autoset_gpu_layers(int(contextsize_text[context_var.get()]),(sd_quant_var.get()==1),int(blasbatchsize_values[int(blas_size_var.get())]))
         max_gpu_layers = (f"/{modelfile_extracted_meta[0][0]+3}" if (modelfile_extracted_meta and modelfile_extracted_meta[0] and modelfile_extracted_meta[0][0]!=0) else "")
         index = runopts_var.get()
         gpu_be = (index == "Use Vulkan" or index == "Vulkan NoAVX2 (Old CPU)" or index == "Use CLBlast" or index == "CLBlast NoAVX2 (Old CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)")
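The GUI label shows the estimate against a maximum of layers + 3, the same cap applied to layerlimit above. A sketch of the suffix for a hypothetical cached metadata tuple:

```python
# Hypothetical cached metadata: (layers, headcount, headkvlen).
modelfile_extracted_meta = [(32, 32, 128)]
max_gpu_layers = (f"/{modelfile_extracted_meta[0][0]+3}"
                  if (modelfile_extracted_meta and modelfile_extracted_meta[0]
                      and modelfile_extracted_meta[0][0] != 0) else "")
print(max_gpu_layers)  # /35
```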
@@ -2590,6 +2603,11 @@ def show_gui():
             gpu_layers_entry.grid(row=6, column=1, padx=8, pady=1, stick="nw")
             quick_gpu_layers_label.grid(row=6, column=0, padx = 8, pady=1, stick="nw")
             quick_gpu_layers_entry.grid(row=6, column=1, padx=8, pady=1, stick="nw")
+        elif sys.platform=="darwin":
+            gpu_layers_label.grid(row=6, column=0, padx = 8, pady=1, stick="nw")
+            gpu_layers_entry.grid(row=6, column=1, padx=8, pady=1, stick="nw")
+            quick_gpu_layers_label.grid(row=6, column=0, padx = 8, pady=1, stick="nw")
+            quick_gpu_layers_entry.grid(row=6, column=1, padx=8, pady=1, stick="nw")
         else:
             gpu_layers_label.grid_remove()
             gpu_layers_entry.grid_remove()
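This is the "gpulayers for metal" half of the commit title: on macOS the GPU-layers widgets are now shown even though no CUDA/Vulkan backend was detected, keyed off the platform string. A trivial sketch of the gate:

```python
import sys

# macOS identifies itself as "darwin"; that branch now shows the
# gpu_layers widgets so Metal users can set GPU layers from the GUI.
if sys.platform == "darwin":
    print("show GPU layers fields (Metal)")
else:
    print("hide GPU layers fields")
```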
@@ -3925,7 +3943,7 @@ def main(launch_args,start_server=True):
             pass
     if MaxMemory[0] > 0:
         extract_modelfile_params(args.model_param,args.sdmodel,args.whispermodel,args.mmproj)
-        layeramt = autoset_gpu_layers(args.contextsize, MaxMemory[0],args.sdquant,args.blasbatchsize)
+        layeramt = autoset_gpu_layers(args.contextsize,args.sdquant,args.blasbatchsize)
         print(f"Auto Recommended Layers: {layeramt}")
         args.gpulayers = layeramt
     else:
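Callers migrate by dropping the VRAM argument; the function now reads MaxMemory/MaxFreeMemory itself. A sketch with a stub standing in for the real function and hypothetical argument values:

```python
def autoset_gpu_layers(ctxsize, sdquanted, bbs):  # stub with the new 3-arg signature
    return 0  # real logic lives in koboldcpp.py

# before: autoset_gpu_layers(args.contextsize, MaxMemory[0], args.sdquant, args.blasbatchsize)
# after:
layeramt = autoset_gpu_layers(4096, False, 512)  # ctxsize, sdquanted, bbs
```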