removed openblas backend, merged into CPU (with llamafile for BLAS). GPU backend is now automatically selected when running from CLI unless noblas is specified.

Concedo 2024-09-15 19:21:52 +08:00
parent 01c7d82185
commit 53bf0fb32d
14 changed files with 115 additions and 159 deletions
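In effect, the CLI now resolves a backend in this order when none is specified: CUDA (or HIP) if a CUDA-capable device with more than ~3.5 GB of VRAM is detected, then Vulkan if a discrete GPU is found, otherwise the plain CPU build. A minimal sketch of that precedence, paraphrasing the new auto_set_backend_cli() added in this diff (pick_backend_sketch is an illustrative name, not part of the commit; the threshold and option strings are taken from the added code):

# Sketch only: condenses the backend precedence implemented by the new
# auto_set_backend_cli(). In koboldcpp.py, MaxMemory, CUDevicesNames,
# VKIsDGPU and runopts are module globals filled in by fetch_gpu_properties().
def pick_backend_sketch(MaxMemory, CUDevicesNames, VKIsDGPU, runopts):
    if MaxMemory[0] > 3500000000 and any(CUDevicesNames) and \
       ("Use CuBLAS" in runopts or "Use hipBLAS (ROCm)" in runopts):
        return "cuda"    # real code sets args.usecublas = ["normal","mmq"]
    if 1 in VKIsDGPU and "Use Vulkan" in runopts:
        return "vulkan"  # real code sets args.usevulkan = []
    return "cpu"         # prints "No GPU Backend found..."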

koboldcpp.py

@@ -292,7 +292,6 @@ def pick_existant_file(ntoption,nonntoption):
lib_default = pick_existant_file("koboldcpp_default.dll","koboldcpp_default.so")
lib_failsafe = pick_existant_file("koboldcpp_failsafe.dll","koboldcpp_failsafe.so")
lib_openblas = pick_existant_file("koboldcpp_openblas.dll","koboldcpp_openblas.so")
lib_noavx2 = pick_existant_file("koboldcpp_noavx2.dll","koboldcpp_noavx2.so")
lib_clblast = pick_existant_file("koboldcpp_clblast.dll","koboldcpp_clblast.so")
lib_clblast_noavx2 = pick_existant_file("koboldcpp_clblast_noavx2.dll","koboldcpp_clblast_noavx2.so")
@@ -302,25 +301,23 @@ lib_vulkan = pick_existant_file("koboldcpp_vulkan.dll","koboldcpp_vulkan.so")
lib_vulkan_noavx2 = pick_existant_file("koboldcpp_vulkan_noavx2.dll","koboldcpp_vulkan_noavx2.so")
libname = ""
lib_option_pairs = [
(lib_openblas, "Use OpenBLAS"),
(lib_default, "Use No BLAS"),
(lib_default, "Use CPU"),
(lib_clblast, "Use CLBlast"),
(lib_cublas, "Use CuBLAS"),
(lib_hipblas, "Use hipBLAS (ROCm)"),
(lib_vulkan, "Use Vulkan"),
(lib_noavx2, "NoAVX2 Mode (Old CPU)"),
(lib_clblast_noavx2, "CLBlast NoAVX2 (Old CPU)"),
(lib_vulkan_noavx2, "Vulkan NoAVX2 (Old CPU)"),
(lib_noavx2, "Use CPU (Old CPU)"),
(lib_clblast_noavx2, "Use CLBlast (Old CPU)"),
(lib_vulkan_noavx2, "Use Vulkan (Old CPU)"),
(lib_failsafe, "Failsafe Mode (Old CPU)")]
openblas_option, default_option, clblast_option, cublas_option, hipblas_option, vulkan_option, noavx2_option, clblast_noavx2_option, vulkan_noavx2_option, failsafe_option = (opt if file_exists(lib) or (os.name == 'nt' and file_exists(opt + ".dll")) else None for lib, opt in lib_option_pairs)
default_option, clblast_option, cublas_option, hipblas_option, vulkan_option, noavx2_option, clblast_noavx2_option, vulkan_noavx2_option, failsafe_option = (opt if file_exists(lib) or (os.name == 'nt' and file_exists(opt + ".dll")) else None for lib, opt in lib_option_pairs)
runopts = [opt for lib, opt in lib_option_pairs if file_exists(lib)]
def init_library():
global handle, args, libname
global lib_default,lib_failsafe,lib_openblas,lib_noavx2,lib_clblast,lib_clblast_noavx2,lib_cublas,lib_hipblas,lib_vulkan,lib_vulkan_noavx2
global lib_default,lib_failsafe,lib_noavx2,lib_clblast,lib_clblast_noavx2,lib_cublas,lib_hipblas,lib_vulkan,lib_vulkan_noavx2
libname = ""
use_openblas = False # if true, uses OpenBLAS for acceleration. libopenblas.dll must exist in the same dir.
use_clblast = False #uses CLBlast instead
use_cublas = False #uses cublas instead
use_hipblas = False #uses hipblas instead
@@ -373,15 +370,7 @@ def init_library():
print("Attempting to use CLBlast library for faster prompt ingestion. A compatible clblast will be required.")
use_clblast = True
else:
if not file_exists(lib_openblas) or (os.name=='nt' and not file_exists("libopenblas.dll")):
print("Warning: OpenBLAS library file not found. Non-BLAS library will be used.")
elif args.noblas:
print("Attempting to library without OpenBLAS.")
else:
use_openblas = True
print("Attempting to use OpenBLAS library for faster prompt ingestion. A compatible libopenblas will be required.")
if sys.platform=="darwin":
print("Mac OSX note: Some people have found Accelerate actually faster than OpenBLAS. To compare, run Koboldcpp with --noblas instead.")
print("Attempting to use Non-BLAS library.")
if use_noavx2:
if use_failsafe:
@@ -399,8 +388,6 @@ def init_library():
libname = lib_cublas
elif use_hipblas:
libname = lib_hipblas
elif use_openblas:
libname = lib_openblas
elif use_vulkan:
libname = lib_vulkan
else:
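With the OpenBLAS branch deleted, the library fallthrough in init_library() collapses to the chain below; this is a sketch with the selection rewritten as a function (pick_library_sketch is illustrative only, and the separate NoAVX2/failsafe handling is omitted), not the literal code:

# Sketch of the remaining library fallthrough in init_library() once the
# OpenBLAS branch is gone; parameter names mirror the real module globals.
def pick_library_sketch(use_clblast, use_cublas, use_hipblas, use_vulkan,
                        lib_clblast, lib_cublas, lib_hipblas, lib_vulkan, lib_default):
    if use_clblast:
        return lib_clblast
    if use_cublas:
        return lib_cublas
    if use_hipblas:
        return lib_hipblas
    if use_vulkan:
        return lib_vulkan
    return lib_default  # merged CPU build; llamafile supplies the BLAS kernels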
@@ -712,35 +699,6 @@ def autoset_gpu_layers(ctxsize,sdquanted,bbs): #shitty algo to determine how man
def fetch_gpu_properties(testCL,testCU,testVK):
import subprocess
time.sleep(0.1)
if testCL:
try: # Get OpenCL GPU names on windows using a special binary. overwrite at known index if found.
basepath = os.path.abspath(os.path.dirname(__file__))
output = ""
data = None
try:
output = subprocess.run(["clinfo","--json"], capture_output=True, text=True, check=True, encoding='utf-8').stdout
data = json.loads(output)
except Exception as e1:
output = subprocess.run([((os.path.join(basepath, "winclinfo.exe")) if os.name == 'nt' else "clinfo"),"--json"], capture_output=True, text=True, check=True, creationflags=subprocess.CREATE_NO_WINDOW | subprocess.DETACHED_PROCESS, encoding='utf-8').stdout
data = json.loads(output)
plat = 0
dev = 0
lowestclmem = 0
for platform in data["devices"]:
dev = 0
for device in platform["online"]:
dname = device["CL_DEVICE_NAME"]
dmem = int(device["CL_DEVICE_GLOBAL_MEM_SIZE"])
idx = plat+dev*2
if idx<len(CLDevices):
CLDevicesNames[idx] = dname
lowestclmem = dmem if lowestclmem==0 else (dmem if dmem<lowestclmem else lowestclmem)
dev += 1
plat += 1
MaxMemory[0] = lowestclmem
except Exception as e:
pass
if testCU:
FetchedCUdevices = []
@@ -804,20 +762,54 @@ def fetch_gpu_properties(testCL,testCU,testVK):
idx += 1
except Exception as e:
pass
if testCL:
try: # Get OpenCL GPU names on windows using a special binary. overwrite at known index if found.
basepath = os.path.abspath(os.path.dirname(__file__))
output = ""
data = None
try:
output = subprocess.run(["clinfo","--json"], capture_output=True, text=True, check=True, encoding='utf-8').stdout
data = json.loads(output)
except Exception as e1:
output = subprocess.run([((os.path.join(basepath, "winclinfo.exe")) if os.name == 'nt' else "clinfo"),"--json"], capture_output=True, text=True, check=True, creationflags=subprocess.CREATE_NO_WINDOW | subprocess.DETACHED_PROCESS, encoding='utf-8').stdout
data = json.loads(output)
plat = 0
dev = 0
lowestclmem = 0
for platform in data["devices"]:
dev = 0
for device in platform["online"]:
dname = device["CL_DEVICE_NAME"]
dmem = int(device["CL_DEVICE_GLOBAL_MEM_SIZE"])
idx = plat+dev*2
if idx<len(CLDevices):
CLDevicesNames[idx] = dname
lowestclmem = dmem if lowestclmem==0 else (dmem if dmem<lowestclmem else lowestclmem)
dev += 1
plat += 1
MaxMemory[0] = max(lowestclmem,MaxMemory[0])
except Exception as e:
pass
return
def auto_set_backend_cli():
fetch_gpu_properties(False,True,True)
found_new_backend = False
if exitcounter < 100 and MaxMemory[0]>3500000000 and (("Use CuBLAS" in runopts and CUDevicesNames[0]!="") or "Use hipBLAS (ROCm)" in runopts) and any(CUDevicesNames):
if "Use CuBLAS" in runopts or "Use hipBLAS (ROCm)" in runopts:
args.usecublas = ["normal","mmq"]
print("Auto Selected CUDA Backend...\n")
found_new_backend = True
elif exitcounter < 100 and (1 in VKIsDGPU) and "Use Vulkan" in runopts:
for i in range(0,len(VKIsDGPU)):
if VKIsDGPU[i]==1:
args.usevulkan = []
print("Auto Selected Vulkan Backend...\n")
found_new_backend = True
break
if not found_new_backend:
print("No GPU Backend found...\n")
def load_model(model_filename):
global args
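Note that the OpenCL probe did not just move below the CUDA and Vulkan probes: its final assignment changed from MaxMemory[0] = lowestclmem to MaxMemory[0] = max(lowestclmem,MaxMemory[0]), so a smaller OpenCL reading can no longer clobber a larger VRAM figure already recorded. A small worked example (the byte values are hypothetical):

# Hypothetical byte values showing why the probe order and max() matter.
MaxMemory = [8000000000]   # e.g. 8 GB already recorded by the CUDA probe
lowestclmem = 4000000000   # 4 GB reported for the same card via clinfo
MaxMemory[0] = max(lowestclmem, MaxMemory[0])
print(MaxMemory[0])        # 8000000000; the old assignment would have set 4000000000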
@@ -2177,7 +2169,7 @@ def RunServerMultiThreaded(addr, port):
finally:
exitcounter = 999
self.httpd.server_close()
sys.exit(0)
os._exit(0)
def stop(self):
global exitcounter
exitcounter = 999
@@ -2337,7 +2329,7 @@ def show_gui():
if not any(runopts):
exitcounter = 999
exit_with_error(2,"KoboldCPP couldn't locate any backends to use (i.e Default, OpenBLAS, CLBlast, CuBLAS).\n\nTo use the program, please run the 'make' command from the directory.","No Backends Available!")
exit_with_error(2,"KoboldCPP couldn't locate any backends to use (i.e Default, Vulkan, CLBlast, CuBLAS).\n\nTo use the program, please run the 'make' command from the directory.","No Backends Available!")
# Vars - should be in scope to be used by multiple widgets
gpulayers_var = ctk.StringVar(value="-1")
@@ -2530,8 +2522,8 @@ def show_gui():
def setup_backend_tooltip(parent):
# backend count label with the tooltip function
nl = '\n'
tooltxt = f"Number of backends you have built and available." + (f"\n\nMissing Backends: \n\n{nl.join(antirunopts)}" if len(runopts) != 6 else "")
num_backends_built = makelabel(parent, str(len(runopts)) + f"/9", 5, 2,tooltxt)
tooltxt = f"Number of backends you have built and available." + (f"\n\nMissing Backends: \n\n{nl.join(antirunopts)}" if len(runopts) < 8 else "")
num_backends_built = makelabel(parent, str(len(runopts)) + f"/8", 5, 2,tooltxt)
num_backends_built.grid(row=1, column=1, padx=195, pady=0)
num_backends_built.configure(text_color="#00ff00")
@@ -2550,7 +2542,7 @@ def show_gui():
predicted_gpu_layers = autoset_gpu_layers(int(contextsize_text[context_var.get()]),(sd_quant_var.get()==1),int(blasbatchsize_values[int(blas_size_var.get())]))
max_gpu_layers = (f"/{modelfile_extracted_meta[0][0]+3}" if (modelfile_extracted_meta and modelfile_extracted_meta[0] and modelfile_extracted_meta[0][0]!=0) else "")
index = runopts_var.get()
gpu_be = (index == "Use Vulkan" or index == "Vulkan NoAVX2 (Old CPU)" or index == "Use CLBlast" or index == "CLBlast NoAVX2 (Old CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)")
gpu_be = (index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Old CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)")
layercounter_label.grid(row=6, column=1, padx=75, sticky="W")
quick_layercounter_label.grid(row=6, column=1, padx=75, sticky="W")
if sys.platform=="darwin" and gpulayers_var.get()=="-1":
@@ -2578,10 +2570,10 @@ def show_gui():
try:
s = int(gpu_choice_var.get())-1
v = runopts_var.get()
if v == "Use Vulkan" or v == "Vulkan NoAVX2 (Old CPU)":
if v == "Use Vulkan" or v == "Use Vulkan (Old CPU)":
quick_gpuname_label.configure(text=VKDevicesNames[s])
gpuname_label.configure(text=VKDevicesNames[s])
elif v == "Use CLBlast" or v == "CLBlast NoAVX2 (Old CPU)":
elif v == "Use CLBlast" or v == "Use CLBlast (Old CPU)":
quick_gpuname_label.configure(text=CLDevicesNames[s])
gpuname_label.configure(text=CLDevicesNames[s])
else:
@@ -2631,19 +2623,19 @@ def show_gui():
global runmode_untouched
runmode_untouched = False
index = runopts_var.get()
if index == "Use Vulkan" or index == "Vulkan NoAVX2 (Old CPU)" or index == "Use CLBlast" or index == "CLBlast NoAVX2 (Old CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
if index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Old CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
quick_gpuname_label.grid(row=3, column=1, padx=75, sticky="W")
gpuname_label.grid(row=3, column=1, padx=75, sticky="W")
gpu_selector_label.grid(row=3, column=0, padx = 8, pady=1, stick="nw")
quick_gpu_selector_label.grid(row=3, column=0, padx = 8, pady=1, stick="nw")
if index == "Use CLBlast" or index == "CLBlast NoAVX2 (Old CPU)":
if index == "Use CLBlast" or index == "Use CLBlast (Old CPU)":
gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw")
quick_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw")
CUDA_gpu_selector_box.grid_remove()
CUDA_quick_gpu_selector_box.grid_remove()
if gpu_choice_var.get()=="All":
gpu_choice_var.set("1")
elif index == "Use Vulkan" or index == "Vulkan NoAVX2 (Old CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
elif index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
gpu_selector_box.grid_remove()
quick_gpu_selector_box.grid_remove()
CUDA_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw")
@@ -2677,7 +2669,7 @@ def show_gui():
tensor_split_label.grid(row=8, column=0, padx = 8, pady=1, stick="nw")
tensor_split_entry.grid(row=8, column=1, padx=8, pady=1, stick="nw")
if index == "Use Vulkan" or index == "Vulkan NoAVX2 (Old CPU)" or index == "Use CLBlast" or index == "CLBlast NoAVX2 (Old CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
if index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Old CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
gpu_layers_label.grid(row=6, column=0, padx = 8, pady=1, stick="nw")
gpu_layers_entry.grid(row=6, column=1, padx=8, pady=1, stick="nw")
quick_gpu_layers_label.grid(row=6, column=0, padx = 8, pady=1, stick="nw")
@@ -2697,7 +2689,7 @@ def show_gui():
# presets selector
makelabel(quick_tab, "Presets:", 1,0,"Select a backend to use.\nOpenBLAS and NoBLAS runs purely on CPU only.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nCLBlast works on all GPUs but is somewhat slower.\nNoAVX2 and Failsafe modes support older PCs.")
makelabel(quick_tab, "Presets:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.")
runoptbox = ctk.CTkComboBox(quick_tab, values=runopts, width=180,variable=runopts_var, state="readonly")
runoptbox.grid(row=1, column=1,padx=8, stick="nw")
@@ -2743,7 +2735,7 @@ def show_gui():
hardware_tab = tabcontent["Hardware"]
# presets selector
makelabel(hardware_tab, "Presets:", 1,0,"Select a backend to use.\nOpenBLAS and NoBLAS runs purely on CPU only.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nCLBlast works on all GPUs but is somewhat slower.\nNoAVX2 and Failsafe modes support older PCs.")
makelabel(hardware_tab, "Presets:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.")
runoptbox = ctk.CTkComboBox(hardware_tab, values=runopts, width=180,variable=runopts_var, state="readonly")
runoptbox.grid(row=1, column=1,padx=8, stick="nw")
runoptbox.set(runopts[0]) # Set to first available option
@@ -3011,9 +3003,9 @@ def show_gui():
gpuchoiceidx = 0
if gpu_choice_var.get()!="All":
gpuchoiceidx = int(gpu_choice_var.get())-1
if runopts_var.get() == "Use CLBlast" or runopts_var.get() == "CLBlast NoAVX2 (Old CPU)":
if runopts_var.get() == "Use CLBlast" or runopts_var.get() == "Use CLBlast (Old CPU)":
args.useclblast = [[0,0], [1,0], [0,1], [1,1]][gpuchoiceidx]
if runopts_var.get() == "CLBlast NoAVX2 (Old CPU)":
if runopts_var.get() == "CUse CLBlast (Old CPU)":
args.noavx2 = True
if runopts_var.get() == "Use CuBLAS" or runopts_var.get() == "Use hipBLAS (ROCm)":
if gpu_choice_var.get()=="All":
@@ -3024,18 +3016,18 @@ def show_gui():
args.usecublas.append("mmq")
if rowsplit_var.get()==1:
args.usecublas.append("rowsplit")
if runopts_var.get() == "Use Vulkan" or runopts_var.get() == "Vulkan NoAVX2 (Old CPU)":
if runopts_var.get() == "Use Vulkan" or runopts_var.get() == "Use Vulkan (Old CPU)":
if gpu_choice_var.get()=="All":
args.usevulkan = []
else:
args.usevulkan = [int(gpuchoiceidx)]
if runopts_var.get() == "Vulkan NoAVX2 (Old CPU)":
if runopts_var.get() == "Use Vulkan (Old CPU)":
args.noavx2 = True
if gpulayers_var.get():
args.gpulayers = int(gpulayers_var.get())
if runopts_var.get()=="Use No BLAS":
if runopts_var.get()=="Use CPU":
args.noblas = True
if runopts_var.get()=="NoAVX2 Mode (Old CPU)":
if runopts_var.get()=="Use CPU (Old CPU)":
args.noavx2 = True
if runopts_var.get()=="Failsafe Mode (Old CPU)":
args.noavx2 = True
@@ -3193,8 +3185,6 @@ def show_gui():
elif "noblas" in dict and dict["noblas"]:
if default_option is not None:
runopts_var.set(default_option)
elif openblas_option is not None:
runopts_var.set(openblas_option)
if "gpulayers" in dict and dict["gpulayers"]:
gpulayers_var.set(dict["gpulayers"])
else:
@@ -4019,34 +4009,31 @@ def main(launch_args,start_server=True):
nocertify = True
if args.gpulayers:
global libname, lib_default, lib_openblas, lib_failsafe, lib_noavx2
nogood = [lib_default,lib_openblas,lib_failsafe,lib_noavx2]
shouldavoidgpu = False
if libname in nogood and sys.platform!="darwin":
if args.noblas and sys.platform!="darwin":
shouldavoidgpu = True
if args.gpulayers>0:
if shouldavoidgpu:
print("WARNING: GPU layers is set, but a GPU backend was not selected!")
pass
if args.gpulayers and args.gpulayers>0:
print("WARNING: GPU layers is set, but a GPU backend was not selected! GPU will not be used!")
args.gpulayers = 0
elif args.gpulayers==-1 and sys.platform=="darwin" and args.model_param and os.path.exists(args.model_param):
print(f"MacOS detected: Auto GPU layers set to maximum")
args.gpulayers = 200
elif args.gpulayers==-1 and not shouldavoidgpu and args.model_param and os.path.exists(args.model_param):
if not args.usecublas and not args.usevulkan and not args.useclblast:
print("NOTE: Auto GPU layers was set without picking a GPU backend! Trying to assign one for you automatically...")
elif not shouldavoidgpu and args.model_param and os.path.exists(args.model_param):
if not args.usecublas and (args.usevulkan is None) and not args.useclblast:
print("No GPU or CPU backend was selected. Trying to assign one for you automatically...")
auto_set_backend_cli()
print("Trying to automatically determine GPU layers...")
if MaxMemory[0] == 0: #try to get gpu vram for cuda if not picked yet
fetch_gpu_properties(False,True,True)
pass
if MaxMemory[0] > 0:
extract_modelfile_params(args.model_param,args.sdmodel,args.whispermodel,args.mmproj)
layeramt = autoset_gpu_layers(args.contextsize,args.sdquant,args.blasbatchsize)
print(f"Auto Recommended Layers: {layeramt}")
args.gpulayers = layeramt
else:
print(f"Could not automatically determine layers. Please set it manually.")
args.gpulayers = 0
if args.gpulayers==-1:
if MaxMemory[0] > 0 and (not args.noblas) and (args.usecublas or (args.usevulkan is not None) or args.useclblast or sys.platform=="darwin"):
extract_modelfile_params(args.model_param,args.sdmodel,args.whispermodel,args.mmproj)
layeramt = autoset_gpu_layers(args.contextsize,args.sdquant,args.blasbatchsize)
print(f"Auto Recommended GPU Layers: {layeramt}")
args.gpulayers = layeramt
else:
print(f"No GPU backend found, or could not automatically determine GPU layers. Please set it manually.")
args.gpulayers = 0
if args.threads == -1:
args.threads = get_default_threads()
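Together with the new --gpulayers default of -1 (see the argparse hunk below), the reworked flow in main() reads roughly as follows. This is a condensed sketch (resolve_gpulayers_sketch is an illustrative name) that assumes the helpers and globals defined earlier in koboldcpp.py and omits the model-file existence checks:

# Condensed sketch of the new auto-offload flow; assumes sys is imported and
# that auto_set_backend_cli, fetch_gpu_properties, autoset_gpu_layers and
# MaxMemory come from koboldcpp.py itself.
def resolve_gpulayers_sketch(args):
    if args.noblas and sys.platform != "darwin":
        if args.gpulayers and args.gpulayers > 0:
            print("WARNING: GPU layers is set, but a GPU backend was not selected! GPU will not be used!")
        return 0
    if args.gpulayers != -1:
        return args.gpulayers                 # explicit value: keep it
    if sys.platform == "darwin":
        return 200                            # macOS: auto layers set to maximum
    if not args.usecublas and args.usevulkan is None and not args.useclblast:
        auto_set_backend_cli()                # no backend picked: choose one now
    if MaxMemory[0] == 0:
        fetch_gpu_properties(False, True, True)   # probe VRAM if not done yet
    if MaxMemory[0] > 0 and (args.usecublas or args.usevulkan is not None or args.useclblast):
        return autoset_gpu_layers(args.contextsize, args.sdquant, args.blasbatchsize)
    return 0                                  # no GPU backend found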
@@ -4398,9 +4385,9 @@ if __name__ == '__main__':
compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs. For hipBLAS binaries, please check YellowRoseCx rocm fork.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq] [rowsplit]'), choices=['normal', 'lowvram', '0', '1', '2', '3', 'mmq', 'rowsplit'])
compatgroup.add_argument("--usevulkan", help="Use Vulkan for GPU Acceleration. Can optionally specify GPU Device ID (e.g. --usevulkan 0).", metavar=('[Device ID]'), nargs='*', type=int, default=None)
compatgroup.add_argument("--useclblast", help="Use CLBlast for GPU Acceleration. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
compatgroup.add_argument("--noblas", help="Do not use any accelerated prompt ingestion", action='store_true')
compatgroup.add_argument("--noblas", help="Do not use any GPU acceleration (CPU Only)", action='store_true')
parser.add_argument("--contextsize", help="Controls the memory allocated for maximum context size, only change if you need more RAM for big contexts. (default 4096). Supported values are [256,512,1024,2048,3072,4096,6144,8192,12288,16384,24576,32768,49152,65536,98304,131072]. IF YOU USE ANYTHING ELSE YOU ARE ON YOUR OWN.",metavar=('[256,512,1024,2048,3072,4096,6144,8192,12288,16384,24576,32768,49152,65536,98304,131072]'), type=check_range(int,256,262144), default=4096)
parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU. Set to -1 to try autodetect (experimental)",metavar=('[GPU layers]'), nargs='?', const=1, type=int, default=0)
parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU. Set to -1 to try autodetect, set to 0 to disable GPU offload.",metavar=('[GPU layers]'), nargs='?', const=1, type=int, default=-1)
parser.add_argument("--tensor_split", help="For CUDA and Vulkan only, ratio to split tensors across multiple GPUs, space-separated list of proportions, e.g. 7 3", metavar=('[Ratios]'), type=float, nargs='+')
#more advanced params