removed openblas backend, merged into CPU (with llamafile for BLAS). GPU backend is now automatically selected when running from CLI unless noblas is specified.

Concedo 2024-09-15 19:21:52 +08:00
parent 01c7d82185
commit 53bf0fb32d
14 changed files with 115 additions and 159 deletions
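In effect, the CLI now resolves a backend in this order when none is specified: CUDA (or HIP) if a CUDA-capable device with more than ~3.5 GB of VRAM is detected, then Vulkan if a discrete GPU is found, otherwise the plain CPU build. A minimal sketch of that precedence, paraphrasing the new auto_set_backend_cli() added in this diff (pick_backend_sketch is an illustrative name, not part of the commit; the threshold and option strings are taken from the added code):

# Sketch only: condenses the backend precedence implemented by the new
# auto_set_backend_cli(). In koboldcpp.py, MaxMemory, CUDevicesNames,
# VKIsDGPU and runopts are module globals filled in by fetch_gpu_properties().
def pick_backend_sketch(MaxMemory, CUDevicesNames, VKIsDGPU, runopts):
    if MaxMemory[0] > 3500000000 and any(CUDevicesNames) and \
       ("Use CuBLAS" in runopts or "Use hipBLAS (ROCm)" in runopts):
        return "cuda"    # real code sets args.usecublas = ["normal","mmq"]
    if 1 in VKIsDGPU and "Use Vulkan" in runopts:
        return "vulkan"  # real code sets args.usevulkan = []
    return "cpu"         # prints "No GPU Backend found..."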

koboldcpp.py

@@ -292,7 +292,6 @@ def pick_existant_file(ntoption,nonntoption):
lib_default = pick_existant_file("koboldcpp_default.dll","koboldcpp_default.so")
lib_failsafe = pick_existant_file("koboldcpp_failsafe.dll","koboldcpp_failsafe.so")
lib_openblas = pick_existant_file("koboldcpp_openblas.dll","koboldcpp_openblas.so")
lib_noavx2 = pick_existant_file("koboldcpp_noavx2.dll","koboldcpp_noavx2.so")
lib_clblast = pick_existant_file("koboldcpp_clblast.dll","koboldcpp_clblast.so")
lib_clblast_noavx2 = pick_existant_file("koboldcpp_clblast_noavx2.dll","koboldcpp_clblast_noavx2.so")
@@ -302,25 +301,23 @@ lib_vulkan = pick_existant_file("koboldcpp_vulkan.dll","koboldcpp_vulkan.so")
lib_vulkan_noavx2 = pick_existant_file("koboldcpp_vulkan_noavx2.dll","koboldcpp_vulkan_noavx2.so")
libname = ""
lib_option_pairs = [
(lib_openblas, "Use OpenBLAS"),
(lib_default, "Use No BLAS"),
(lib_default, "Use CPU"),
(lib_clblast, "Use CLBlast"),
(lib_cublas, "Use CuBLAS"),
(lib_hipblas, "Use hipBLAS (ROCm)"),
(lib_vulkan, "Use Vulkan"),
(lib_noavx2, "NoAVX2 Mode (Old CPU)"),
(lib_clblast_noavx2, "CLBlast NoAVX2 (Old CPU)"),
(lib_vulkan_noavx2, "Vulkan NoAVX2 (Old CPU)"),
(lib_noavx2, "Use CPU (Old CPU)"),
(lib_clblast_noavx2, "Use CLBlast (Old CPU)"),
(lib_vulkan_noavx2, "Use Vulkan (Old CPU)"),
(lib_failsafe, "Failsafe Mode (Old CPU)")]
openblas_option, default_option, clblast_option, cublas_option, hipblas_option, vulkan_option, noavx2_option, clblast_noavx2_option, vulkan_noavx2_option, failsafe_option = (opt if file_exists(lib) or (os.name == 'nt' and file_exists(opt + ".dll")) else None for lib, opt in lib_option_pairs)
default_option, clblast_option, cublas_option, hipblas_option, vulkan_option, noavx2_option, clblast_noavx2_option, vulkan_noavx2_option, failsafe_option = (opt if file_exists(lib) or (os.name == 'nt' and file_exists(opt + ".dll")) else None for lib, opt in lib_option_pairs)
runopts = [opt for lib, opt in lib_option_pairs if file_exists(lib)]
def init_library():
global handle, args, libname
global lib_default,lib_failsafe,lib_openblas,lib_noavx2,lib_clblast,lib_clblast_noavx2,lib_cublas,lib_hipblas,lib_vulkan,lib_vulkan_noavx2
global lib_default,lib_failsafe,lib_noavx2,lib_clblast,lib_clblast_noavx2,lib_cublas,lib_hipblas,lib_vulkan,lib_vulkan_noavx2
libname = ""
use_openblas = False # if true, uses OpenBLAS for acceleration. libopenblas.dll must exist in the same dir.
use_clblast = False #uses CLBlast instead
use_cublas = False #uses cublas instead
use_hipblas = False #uses hipblas instead
@@ -373,15 +370,7 @@ def init_library():
print("Attempting to use CLBlast library for faster prompt ingestion. A compatible clblast will be required.")
use_clblast = True
else:
if not file_exists(lib_openblas) or (os.name=='nt' and not file_exists("libopenblas.dll")):
print("Warning: OpenBLAS library file not found. Non-BLAS library will be used.")
elif args.noblas:
print("Attempting to library without OpenBLAS.")
else:
use_openblas = True
print("Attempting to use OpenBLAS library for faster prompt ingestion. A compatible libopenblas will be required.")
if sys.platform=="darwin":
print("Mac OSX note: Some people have found Accelerate actually faster than OpenBLAS. To compare, run Koboldcpp with --noblas instead.")
print("Attempting to use Non-BLAS library.")
if use_noavx2:
if use_failsafe:
@@ -399,8 +388,6 @@ def init_library():
libname = lib_cublas
elif use_hipblas:
libname = lib_hipblas
elif use_openblas:
libname = lib_openblas
elif use_vulkan:
libname = lib_vulkan
else:
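With the OpenBLAS branch deleted, the library fallthrough in init_library() collapses to the chain below; this is a sketch with the selection rewritten as a function (pick_library_sketch is illustrative only, and the separate NoAVX2/failsafe handling is omitted), not the literal code:

# Sketch of the remaining library fallthrough in init_library() once the
# OpenBLAS branch is gone; parameter names mirror the real module globals.
def pick_library_sketch(use_clblast, use_cublas, use_hipblas, use_vulkan,
                        lib_clblast, lib_cublas, lib_hipblas, lib_vulkan, lib_default):
    if use_clblast:
        return lib_clblast
    if use_cublas:
        return lib_cublas
    if use_hipblas:
        return lib_hipblas
    if use_vulkan:
        return lib_vulkan
    return lib_default  # merged CPU build; llamafile supplies the BLAS kernels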
@@ -712,35 +699,6 @@ def autoset_gpu_layers(ctxsize,sdquanted,bbs): #shitty algo to determine how man
def fetch_gpu_properties(testCL,testCU,testVK):
import subprocess
time.sleep(0.1)
if testCL:
try: # Get OpenCL GPU names on windows using a special binary. overwrite at known index if found.
basepath = os.path.abspath(os.path.dirname(__file__))
output = ""
data = None
try:
output = subprocess.run(["clinfo","--json"], capture_output=True, text=True, check=True, encoding='utf-8').stdout
data = json.loads(output)
except Exception as e1:
output = subprocess.run([((os.path.join(basepath, "winclinfo.exe")) if os.name == 'nt' else "clinfo"),"--json"], capture_output=True, text=True, check=True, creationflags=subprocess.CREATE_NO_WINDOW | subprocess.DETACHED_PROCESS, encoding='utf-8').stdout
data = json.loads(output)
plat = 0
dev = 0
lowestclmem = 0
for platform in data["devices"]:
dev = 0
for device in platform["online"]:
dname = device["CL_DEVICE_NAME"]
dmem = int(device["CL_DEVICE_GLOBAL_MEM_SIZE"])
idx = plat+dev*2
if idx<len(CLDevices):
CLDevicesNames[idx] = dname
lowestclmem = dmem if lowestclmem==0 else (dmem if dmem<lowestclmem else lowestclmem)
dev += 1
plat += 1
MaxMemory[0] = lowestclmem
except Exception as e:
pass
if testCU:
FetchedCUdevices = []
@@ -804,20 +762,54 @@ def fetch_gpu_properties(testCL,testCU,testVK):
idx += 1
except Exception as e:
pass
if testCL:
try: # Get OpenCL GPU names on windows using a special binary. overwrite at known index if found.
basepath = os.path.abspath(os.path.dirname(__file__))
output = ""
data = None
try:
output = subprocess.run(["clinfo","--json"], capture_output=True, text=True, check=True, encoding='utf-8').stdout
data = json.loads(output)
except Exception as e1:
output = subprocess.run([((os.path.join(basepath, "winclinfo.exe")) if os.name == 'nt' else "clinfo"),"--json"], capture_output=True, text=True, check=True, creationflags=subprocess.CREATE_NO_WINDOW | subprocess.DETACHED_PROCESS, encoding='utf-8').stdout
data = json.loads(output)
plat = 0
dev = 0
lowestclmem = 0
for platform in data["devices"]:
dev = 0
for device in platform["online"]:
dname = device["CL_DEVICE_NAME"]
dmem = int(device["CL_DEVICE_GLOBAL_MEM_SIZE"])
idx = plat+dev*2
if idx<len(CLDevices):
CLDevicesNames[idx] = dname
lowestclmem = dmem if lowestclmem==0 else (dmem if dmem<lowestclmem else lowestclmem)
dev += 1
plat += 1
MaxMemory[0] = max(lowestclmem,MaxMemory[0])
except Exception as e:
pass
return
def auto_set_backend_cli():
fetch_gpu_properties(False,True,True)
found_new_backend = False
if exitcounter < 100 and MaxMemory[0]>3500000000 and (("Use CuBLAS" in runopts and CUDevicesNames[0]!="") or "Use hipBLAS (ROCm)" in runopts) and any(CUDevicesNames):
if "Use CuBLAS" in runopts or "Use hipBLAS (ROCm)" in runopts:
args.usecublas = ["normal","mmq"]
print("Auto Selected CUDA Backend...\n")
found_new_backend = True
elif exitcounter < 100 and (1 in VKIsDGPU) and "Use Vulkan" in runopts:
for i in range(0,len(VKIsDGPU)):
if VKIsDGPU[i]==1:
args.usevulkan = []
print("Auto Selected Vulkan Backend...\n")
found_new_backend = True
break
if not found_new_backend:
print("No GPU Backend found...\n")
def load_model(model_filename):
global args
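Note that the OpenCL probe did not just move below the CUDA and Vulkan probes: its final assignment changed from MaxMemory[0] = lowestclmem to MaxMemory[0] = max(lowestclmem,MaxMemory[0]), so a smaller OpenCL reading can no longer clobber a larger VRAM figure already recorded. A small worked example (the byte values are hypothetical):

# Hypothetical byte values showing why the probe order and max() matter.
MaxMemory = [8000000000]   # e.g. 8 GB already recorded by the CUDA probe
lowestclmem = 4000000000   # 4 GB reported for the same card via clinfo
MaxMemory[0] = max(lowestclmem, MaxMemory[0])
print(MaxMemory[0])        # 8000000000; the old assignment would have set 4000000000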
@@ -2177,7 +2169,7 @@ def RunServerMultiThreaded(addr, port):
finally:
exitcounter = 999
self.httpd.server_close()
sys.exit(0)
os._exit(0)
def stop(self):
global exitcounter
exitcounter = 999
@@ -2337,7 +2329,7 @@ def show_gui():
if not any(runopts):
exitcounter = 999
exit_with_error(2,"KoboldCPP couldn't locate any backends to use (i.e Default, OpenBLAS, CLBlast, CuBLAS).\n\nTo use the program, please run the 'make' command from the directory.","No Backends Available!")
exit_with_error(2,"KoboldCPP couldn't locate any backends to use (i.e Default, Vulkan, CLBlast, CuBLAS).\n\nTo use the program, please run the 'make' command from the directory.","No Backends Available!")
# Vars - should be in scope to be used by multiple widgets
gpulayers_var = ctk.StringVar(value="-1")
@@ -2530,8 +2522,8 @@ def show_gui():
def setup_backend_tooltip(parent):
# backend count label with the tooltip function
nl = '\n'
tooltxt = f"Number of backends you have built and available." + (f"\n\nMissing Backends: \n\n{nl.join(antirunopts)}" if len(runopts) != 6 else "")
num_backends_built = makelabel(parent, str(len(runopts)) + f"/9", 5, 2,tooltxt)
tooltxt = f"Number of backends you have built and available." + (f"\n\nMissing Backends: \n\n{nl.join(antirunopts)}" if len(runopts) < 8 else "")
num_backends_built = makelabel(parent, str(len(runopts)) + f"/8", 5, 2,tooltxt)
num_backends_built.grid(row=1, column=1, padx=195, pady=0)
num_backends_built.configure(text_color="#00ff00")
@@ -2550,7 +2542,7 @@ def show_gui():
predicted_gpu_layers = autoset_gpu_layers(int(contextsize_text[context_var.get()]),(sd_quant_var.get()==1),int(blasbatchsize_values[int(blas_size_var.get())]))
max_gpu_layers = (f"/{modelfile_extracted_meta[0][0]+3}" if (modelfile_extracted_meta and modelfile_extracted_meta[0] and modelfile_extracted_meta[0][0]!=0) else "")
index = runopts_var.get()
gpu_be = (index == "Use Vulkan" or index == "Vulkan NoAVX2 (Old CPU)" or index == "Use CLBlast" or index == "CLBlast NoAVX2 (Old CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)")
gpu_be = (index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Old CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)")
layercounter_label.grid(row=6, column=1, padx=75, sticky="W")
quick_layercounter_label.grid(row=6, column=1, padx=75, sticky="W")
if sys.platform=="darwin" and gpulayers_var.get()=="-1":
@@ -2578,10 +2570,10 @@ def show_gui():
try:
s = int(gpu_choice_var.get())-1
v = runopts_var.get()
if v == "Use Vulkan" or v == "Vulkan NoAVX2 (Old CPU)":
if v == "Use Vulkan" or v == "Use Vulkan (Old CPU)":
quick_gpuname_label.configure(text=VKDevicesNames[s])
gpuname_label.configure(text=VKDevicesNames[s])
elif v == "Use CLBlast" or v == "CLBlast NoAVX2 (Old CPU)":
elif v == "Use CLBlast" or v == "Use CLBlast (Old CPU)":
quick_gpuname_label.configure(text=CLDevicesNames[s])
gpuname_label.configure(text=CLDevicesNames[s])
else:
@@ -2631,19 +2623,19 @@ def show_gui():
global runmode_untouched
runmode_untouched = False
index = runopts_var.get()
if index == "Use Vulkan" or index == "Vulkan NoAVX2 (Old CPU)" or index == "Use CLBlast" or index == "CLBlast NoAVX2 (Old CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
if index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Old CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
quick_gpuname_label.grid(row=3, column=1, padx=75, sticky="W")
gpuname_label.grid(row=3, column=1, padx=75, sticky="W")
gpu_selector_label.grid(row=3, column=0, padx = 8, pady=1, stick="nw")
quick_gpu_selector_label.grid(row=3, column=0, padx = 8, pady=1, stick="nw")
if index == "Use CLBlast" or index == "CLBlast NoAVX2 (Old CPU)":
if index == "Use CLBlast" or index == "Use CLBlast (Old CPU)":
gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw")
quick_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw")
CUDA_gpu_selector_box.grid_remove()
CUDA_quick_gpu_selector_box.grid_remove()
if gpu_choice_var.get()=="All":
gpu_choice_var.set("1")
elif index == "Use Vulkan" or index == "Vulkan NoAVX2 (Old CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
elif index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
gpu_selector_box.grid_remove()
quick_gpu_selector_box.grid_remove()
CUDA_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw")
@@ -2677,7 +2669,7 @@ def show_gui():
tensor_split_label.grid(row=8, column=0, padx = 8, pady=1, stick="nw")
tensor_split_entry.grid(row=8, column=1, padx=8, pady=1, stick="nw")
if index == "Use Vulkan" or index == "Vulkan NoAVX2 (Old CPU)" or index == "Use CLBlast" or index == "CLBlast NoAVX2 (Old CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
if index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Old CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
gpu_layers_label.grid(row=6, column=0, padx = 8, pady=1, stick="nw")
gpu_layers_entry.grid(row=6, column=1, padx=8, pady=1, stick="nw")
quick_gpu_layers_label.grid(row=6, column=0, padx = 8, pady=1, stick="nw")
@@ -2697,7 +2689,7 @@ def show_gui():
# presets selector
makelabel(quick_tab, "Presets:", 1,0,"Select a backend to use.\nOpenBLAS and NoBLAS runs purely on CPU only.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nCLBlast works on all GPUs but is somewhat slower.\nNoAVX2 and Failsafe modes support older PCs.")
makelabel(quick_tab, "Presets:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.")
runoptbox = ctk.CTkComboBox(quick_tab, values=runopts, width=180,variable=runopts_var, state="readonly")
runoptbox.grid(row=1, column=1,padx=8, stick="nw")
@@ -2743,7 +2735,7 @@ def show_gui():
hardware_tab = tabcontent["Hardware"]
# presets selector
makelabel(hardware_tab, "Presets:", 1,0,"Select a backend to use.\nOpenBLAS and NoBLAS runs purely on CPU only.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nCLBlast works on all GPUs but is somewhat slower.\nNoAVX2 and Failsafe modes support older PCs.")
makelabel(hardware_tab, "Presets:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.")
runoptbox = ctk.CTkComboBox(hardware_tab, values=runopts, width=180,variable=runopts_var, state="readonly")
runoptbox.grid(row=1, column=1,padx=8, stick="nw")
runoptbox.set(runopts[0]) # Set to first available option
@@ -3011,9 +3003,9 @@ def show_gui():
gpuchoiceidx = 0
if gpu_choice_var.get()!="All":
gpuchoiceidx = int(gpu_choice_var.get())-1
if runopts_var.get() == "Use CLBlast" or runopts_var.get() == "CLBlast NoAVX2 (Old CPU)":
if runopts_var.get() == "Use CLBlast" or runopts_var.get() == "Use CLBlast (Old CPU)":
args.useclblast = [[0,0], [1,0], [0,1], [1,1]][gpuchoiceidx]
if runopts_var.get() == "CLBlast NoAVX2 (Old CPU)":
if runopts_var.get() == "CUse CLBlast (Old CPU)":
args.noavx2 = True
if runopts_var.get() == "Use CuBLAS" or runopts_var.get() == "Use hipBLAS (ROCm)":
if gpu_choice_var.get()=="All":
@@ -3024,18 +3016,18 @@ def show_gui():
args.usecublas.append("mmq")
if rowsplit_var.get()==1:
args.usecublas.append("rowsplit")
if runopts_var.get() == "Use Vulkan" or runopts_var.get() == "Vulkan NoAVX2 (Old CPU)":
if runopts_var.get() == "Use Vulkan" or runopts_var.get() == "Use Vulkan (Old CPU)":
if gpu_choice_var.get()=="All":
args.usevulkan = []
else:
args.usevulkan = [int(gpuchoiceidx)]
if runopts_var.get() == "Vulkan NoAVX2 (Old CPU)":
if runopts_var.get() == "Use Vulkan (Old CPU)":
args.noavx2 = True
if gpulayers_var.get():
args.gpulayers = int(gpulayers_var.get())
if runopts_var.get()=="Use No BLAS":
if runopts_var.get()=="Use CPU":
args.noblas = True
if runopts_var.get()=="NoAVX2 Mode (Old CPU)":
if runopts_var.get()=="Use CPU (Old CPU)":
args.noavx2 = True
if runopts_var.get()=="Failsafe Mode (Old CPU)":
args.noavx2 = True
@@ -3193,8 +3185,6 @@ def show_gui():
elif "noblas" in dict and dict["noblas"]:
if default_option is not None:
runopts_var.set(default_option)
elif openblas_option is not None:
runopts_var.set(openblas_option)
if "gpulayers" in dict and dict["gpulayers"]:
gpulayers_var.set(dict["gpulayers"])
else:
@@ -4019,34 +4009,31 @@ def main(launch_args,start_server=True):
nocertify = True
if args.gpulayers:
global libname, lib_default, lib_openblas, lib_failsafe, lib_noavx2
nogood = [lib_default,lib_openblas,lib_failsafe,lib_noavx2]
shouldavoidgpu = False
if libname in nogood and sys.platform!="darwin":
if args.noblas and sys.platform!="darwin":
shouldavoidgpu = True
if args.gpulayers>0:
if shouldavoidgpu:
print("WARNING: GPU layers is set, but a GPU backend was not selected!")
pass
if args.gpulayers and args.gpulayers>0:
print("WARNING: GPU layers is set, but a GPU backend was not selected! GPU will not be used!")
args.gpulayers = 0
elif args.gpulayers==-1 and sys.platform=="darwin" and args.model_param and os.path.exists(args.model_param):
print(f"MacOS detected: Auto GPU layers set to maximum")
args.gpulayers = 200
elif args.gpulayers==-1 and not shouldavoidgpu and args.model_param and os.path.exists(args.model_param):
if not args.usecublas and not args.usevulkan and not args.useclblast:
print("NOTE: Auto GPU layers was set without picking a GPU backend! Trying to assign one for you automatically...")
elif not shouldavoidgpu and args.model_param and os.path.exists(args.model_param):
if not args.usecublas and (args.usevulkan is None) and not args.useclblast:
print("No GPU or CPU backend was selected. Trying to assign one for you automatically...")
auto_set_backend_cli()
print("Trying to automatically determine GPU layers...")
if MaxMemory[0] == 0: #try to get gpu vram for cuda if not picked yet
fetch_gpu_properties(False,True,True)
pass
if MaxMemory[0] > 0:
extract_modelfile_params(args.model_param,args.sdmodel,args.whispermodel,args.mmproj)
layeramt = autoset_gpu_layers(args.contextsize,args.sdquant,args.blasbatchsize)
print(f"Auto Recommended Layers: {layeramt}")
args.gpulayers = layeramt
else:
print(f"Could not automatically determine layers. Please set it manually.")
args.gpulayers = 0
if args.gpulayers==-1:
if MaxMemory[0] > 0 and (not args.noblas) and (args.usecublas or (args.usevulkan is not None) or args.useclblast or sys.platform=="darwin"):
extract_modelfile_params(args.model_param,args.sdmodel,args.whispermodel,args.mmproj)
layeramt = autoset_gpu_layers(args.contextsize,args.sdquant,args.blasbatchsize)
print(f"Auto Recommended GPU Layers: {layeramt}")
args.gpulayers = layeramt
else:
print(f"No GPU backend found, or could not automatically determine GPU layers. Please set it manually.")
args.gpulayers = 0
if args.threads == -1:
args.threads = get_default_threads()
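Together with the new --gpulayers default of -1 (see the argparse hunk below), the reworked flow in main() reads roughly as follows. This is a condensed sketch (resolve_gpulayers_sketch is an illustrative name) that assumes the helpers and globals defined earlier in koboldcpp.py and omits the model-file existence checks:

# Condensed sketch of the new auto-offload flow; assumes sys is imported and
# that auto_set_backend_cli, fetch_gpu_properties, autoset_gpu_layers and
# MaxMemory come from koboldcpp.py itself.
def resolve_gpulayers_sketch(args):
    if args.noblas and sys.platform != "darwin":
        if args.gpulayers and args.gpulayers > 0:
            print("WARNING: GPU layers is set, but a GPU backend was not selected! GPU will not be used!")
        return 0
    if args.gpulayers != -1:
        return args.gpulayers                 # explicit value: keep it
    if sys.platform == "darwin":
        return 200                            # macOS: auto layers set to maximum
    if not args.usecublas and args.usevulkan is None and not args.useclblast:
        auto_set_backend_cli()                # no backend picked: choose one now
    if MaxMemory[0] == 0:
        fetch_gpu_properties(False, True, True)   # probe VRAM if not done yet
    if MaxMemory[0] > 0 and (args.usecublas or args.usevulkan is not None or args.useclblast):
        return autoset_gpu_layers(args.contextsize, args.sdquant, args.blasbatchsize)
    return 0                                  # no GPU backend found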
@@ -4398,9 +4385,9 @@ if __name__ == '__main__':
compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs. For hipBLAS binaries, please check YellowRoseCx rocm fork.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq] [rowsplit]'), choices=['normal', 'lowvram', '0', '1', '2', '3', 'mmq', 'rowsplit'])
compatgroup.add_argument("--usevulkan", help="Use Vulkan for GPU Acceleration. Can optionally specify GPU Device ID (e.g. --usevulkan 0).", metavar=('[Device ID]'), nargs='*', type=int, default=None)
compatgroup.add_argument("--useclblast", help="Use CLBlast for GPU Acceleration. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
compatgroup.add_argument("--noblas", help="Do not use any accelerated prompt ingestion", action='store_true')
compatgroup.add_argument("--noblas", help="Do not use any GPU acceleration (CPU Only)", action='store_true')
parser.add_argument("--contextsize", help="Controls the memory allocated for maximum context size, only change if you need more RAM for big contexts. (default 4096). Supported values are [256,512,1024,2048,3072,4096,6144,8192,12288,16384,24576,32768,49152,65536,98304,131072]. IF YOU USE ANYTHING ELSE YOU ARE ON YOUR OWN.",metavar=('[256,512,1024,2048,3072,4096,6144,8192,12288,16384,24576,32768,49152,65536,98304,131072]'), type=check_range(int,256,262144), default=4096)
parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU. Set to -1 to try autodetect (experimental)",metavar=('[GPU layers]'), nargs='?', const=1, type=int, default=0)
parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU. Set to -1 to try autodetect, set to 0 to disable GPU offload.",metavar=('[GPU layers]'), nargs='?', const=1, type=int, default=-1)
parser.add_argument("--tensor_split", help="For CUDA and Vulkan only, ratio to split tensors across multiple GPUs, space-separated list of proportions, e.g. 7 3", metavar=('[Ratios]'), type=float, nargs='+')
#more advanced params