Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2026-05-07 09:02:04 +00:00)
allow lowvram (nkvo) to be used with vulkan.
commit a09d8333b5
parent b5bec86231

1 changed file with 13 additions and 8 deletions
koboldcpp.py | 21 +++++++++++++--------
@@ -1381,7 +1381,7 @@ def load_model(model_filename):
     inputs.model_filename = model_filename.encode("UTF-8")
     inputs.max_context_length = maxctx #initial value to use for ctx, can be overwritten
     inputs.threads = args.threads
-    inputs.low_vram = (True if (args.usecuda and "lowvram" in args.usecuda) else False)
+    inputs.low_vram = True if args.lowvram else False
     inputs.use_mmq = (True if (args.usecuda and "nommq" not in args.usecuda) else False)
     inputs.use_rowsplit = (True if (args.usecuda and "rowsplit" in args.usecuda) else False)
     inputs.vulkan_info = "0".encode("UTF-8")
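The substantive change in this hunk: inputs.low_vram is now derived from the dedicated args.lowvram flag rather than parsed out of the CUDA-specific usecuda token list, which is what lets Vulkan (or any other backend) honor it. A minimal before/after sketch, using a hypothetical args namespace for illustration:

    # sketch only: SimpleNamespace stands in for the real parsed args
    from types import SimpleNamespace

    args = SimpleNamespace(usecuda=None, lowvram=True)  # e.g. launched with --usevulkan 0 --lowvram

    # old derivation: only honored when the CUDA token list carried "lowvram"
    old_low_vram = True if (args.usecuda and "lowvram" in args.usecuda) else False  # -> False

    # new derivation: backend-independent boolean
    new_low_vram = True if args.lowvram else False  # -> True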
|
@@ -5121,6 +5121,7 @@ def show_gui():
             maingpu_entry.grid_remove()
             if gpu_choice_var.get()=="All":
                 gpu_choice_var.set("1")
+            lowvram_box.grid_remove()
         elif index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CUDA" or index == "Use hipBLAS (ROCm)":
             gpu_selector_box.grid_remove()
             quick_gpu_selector_box.grid_remove()
|
@@ -5128,6 +5129,7 @@ def show_gui():
             CUDA_quick_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw")
             maingpu_label.grid(row=10, column=0, padx = 8, pady=1, stick="nw")
             maingpu_entry.grid(row=10, column=1, padx = 8, pady=1, stick="nw")
+            lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw")
         else:
             quick_gpuname_label.grid_remove()
             gpuname_label.grid_remove()
|
@@ -5139,16 +5141,15 @@ def show_gui():
             CUDA_quick_gpu_selector_box.grid_remove()
             maingpu_label.grid_remove()
             maingpu_entry.grid_remove()
+            lowvram_box.grid_remove()

         if index == "Use CUDA" or index == "Use hipBLAS (ROCm)":
-            lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw")
             mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
             quick_mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
             splitmode_box.grid(row=5, column=1, padx=8, pady=1, stick="nw")
             tensor_split_label.grid(row=8, column=0, padx = 8, pady=1, stick="nw")
             tensor_split_entry.grid(row=8, column=1, padx=8, pady=1, stick="nw")
         else:
-            lowvram_box.grid_remove()
             mmq_box.grid_remove()
             quick_mmq_box.grid_remove()
             tensor_split_label.grid_remove()
|
@@ -5246,7 +5247,7 @@ def show_gui():
     layercounter_label.grid(row=6, column=1, padx=75, sticky="W")
     layercounter_label.configure(text_color="#ffff00")
     tensor_split_entry,tensor_split_label = makelabelentry(hardware_tab, "Tensor Split:", tensor_split_str_vars, 8, 80, tooltip='When using multiple GPUs this option controls how large tensors should be split across all GPUs.\nUses a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order.\nFor example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1.')
-    lowvram_box = makecheckbox(hardware_tab, "Low VRAM (No KV offload)", lowvram_var, 4,0, tooltiptxt='Avoid offloading KV Cache or scratch buffers to VRAM.\nAllows more layers to fit, but may result in a speed loss.')
+    lowvram_box = makecheckbox(hardware_tab, "Low VRAM (No KV offload)", lowvram_var, 4,0, tooltiptxt='Avoid offloading KV Cache or scratch buffers to VRAM.\nAllows more layers to fit, but may result in a large speed loss.')
     mmq_box = makecheckbox(hardware_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1, tooltiptxt="Enable MMQ mode to use finetuned kernels instead of default CuBLAS/HipBLAS for prompt processing.\nRead the wiki. Speed may vary.")
     splitmode_box = makecheckbox(hardware_tab, "Row-Split", rowsplit_var, 5,0, tooltiptxt="Split rows across GPUs instead of splitting layers and KV across GPUs.\nUses the main GPU for small tensors and intermediate results. Speed may vary.")

|
@@ -5557,6 +5558,7 @@ def show_gui():
         args.nocertify = nocertifymode.get()==1
         args.nomodel = nomodel.get()==1
         args.quantkv = quantkv_var.get()
+        args.lowvram = lowvram_var.get()==1

         gpuchoiceidx = 0
         args.usecpu = False
|
@@ -5575,9 +5577,9 @@ def show_gui():
             args.failsafe = True
         if runopts_var.get() == "Use CUDA" or runopts_var.get() == "Use hipBLAS (ROCm)":
             if gpu_choice_var.get()=="All":
-                args.usecuda = ["lowvram"] if lowvram_var.get() == 1 else ["normal"]
+                args.usecuda = ["normal"]
             else:
-                args.usecuda = ["lowvram",str(gpuchoiceidx)] if lowvram_var.get() == 1 else ["normal",str(gpuchoiceidx)]
+                args.usecuda = ["normal",str(gpuchoiceidx)]
             if mmq_var.get()==1:
                 args.usecuda.append("mmq")
             else:
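Net effect of this hunk: the GUI no longer encodes the checkbox state into the usecuda list ("lowvram" vs "normal"); usecuda always starts as ["normal", ...] and the checkbox state travels separately via args.lowvram, set in the previous hunk. A small sketch of the resulting mapping, with hypothetical widget values for illustration:

    # hypothetical GUI state, for illustration only
    lowvram_checked = 1   # "Low VRAM (No KV offload)" checkbox
    gpuchoiceidx = 0      # selected GPU index

    usecuda = ["normal", str(gpuchoiceidx)]  # no longer carries "lowvram"
    lowvram = (lowvram_checked == 1)         # exported as its own flag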
|
@@ -5770,6 +5772,7 @@ def show_gui():
         quietmode.set(1 if "quiet" in dict and dict["quiet"] else 0)
         nocertifymode.set(1 if "nocertify" in dict and dict["nocertify"] else 0)
         nomodel.set(1 if "nomodel" in dict and dict["nomodel"] else 0)
+        lowvram_var.set(1 if "lowvram" in dict and dict["lowvram"] else 0)
         if "quantkv" in dict:
             quantkv_var.set(dict["quantkv"])
         if "useclblast" in dict and dict["useclblast"]:
|
@@ -5787,7 +5790,6 @@ def show_gui():
             runopts_var.set(cublas_option)
         elif hipblas_option:
             runopts_var.set(hipblas_option)
-        lowvram_var.set(1 if "lowvram" in dict["usecuda"] else 0)
         mmq_var.set(1 if "mmq" in dict["usecuda"] else 0)
         rowsplit_var.set(1 if "rowsplit" in dict["usecuda"] else 0)
         gpu_choice_var.set("All")
|
@@ -6282,6 +6284,8 @@ def convert_invalid_args(args):
     dict = vars(args)
     if "usecuda" not in dict and "usecublas" in dict and dict["usecublas"]:
         dict["usecuda"] = dict["usecublas"]
+    if "usecuda" in dict and dict["usecuda"] and "lowvram" in dict["usecuda"]:
+        dict["lowvram"] = True
     if "sdconfig" in dict and dict["sdconfig"] and len(dict["sdconfig"])>0:
         dict["sdmodel"] = dict["sdconfig"][0]
         if dict["sdconfig"] and len(dict["sdconfig"]) > 1:
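This keeps old launch commands and saved .kcpps configs working: a legacy "lowvram" token inside usecuda (or the older usecublas spelling) is lifted into the new boolean. A standalone sketch of that migration logic, not the actual function:

    # standalone sketch mirroring the convert_invalid_args hunk above
    def migrate_lowvram(d):
        # accept the legacy --usecublas spelling
        if "usecuda" not in d and d.get("usecublas"):
            d["usecuda"] = d["usecublas"]
        # lift the legacy "lowvram" token into the dedicated flag
        if d.get("usecuda") and "lowvram" in d["usecuda"]:
            d["lowvram"] = True
        return d

    print(migrate_lowvram({"usecublas": ["lowvram", "0", "mmq"]})["lowvram"])  # True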
|
@@ -7677,7 +7681,7 @@ if __name__ == '__main__':
     parser.add_argument("--config", metavar=('[filename]'), help="Load settings from a .kcpps file. Other arguments will be ignored", type=str, nargs=1)
     parser.add_argument("--threads","-t", metavar=('[threads]'), help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=get_default_threads())
     compatgroup = parser.add_mutually_exclusive_group()
-    compatgroup.add_argument("--usecuda", "--usecublas", "--usehipblas", help="Use CUDA for GPU Acceleration. Requires CUDA. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq|nommq] [rowsplit]'), choices=['normal', 'lowvram', '0', '1', '2', '3', 'all', 'mmq', 'nommq', 'rowsplit'])
+    compatgroup.add_argument("--usecuda", "--usecublas", "--usehipblas", help="Use CUDA for GPU Acceleration. Requires CUDA. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs.", nargs='*',metavar=('[main GPU ID] [mmq|nommq] [rowsplit]'), choices=['normal', 'lowvram', '0', '1', '2', '3', 'all', 'mmq', 'nommq', 'rowsplit'])
     compatgroup.add_argument("--usevulkan", help="Use Vulkan for GPU Acceleration. Can optionally specify one or more GPU Device ID (e.g. --usevulkan 0), leave blank to autodetect.", metavar=('[Device IDs]'), nargs='*', type=int, default=None)
     compatgroup.add_argument("--useclblast", help="Use CLBlast for GPU Acceleration. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
     compatgroup.add_argument("--usecpu", help="Do not use any GPU acceleration (CPU Only)", action='store_true')
|
@@ -7733,6 +7737,7 @@ if __name__ == '__main__':
     advparser.add_argument("--ignoremissing", help="Ignores all missing non-essential files, just skipping them instead.", action='store_true')
     advparser.add_argument("--chatcompletionsadapter", metavar=('[filename]'), help="Select an optional ChatCompletions Adapter JSON file to force custom instruct tags.", default="AutoGuess")
     advparser.add_argument("--flashattention","--flash-attn","-fa", help="Enables flash attention.", action='store_true')
+    advparser.add_argument("--lowvram","-nkvo","--no-kv-offload", help="If supported by the backend, do not offload KV to GPU (lowvram mode). Not recommended, will be slow.", action='store_true')
     advparser.add_argument("--quantkv", help="Sets the KV cache data type quantization, 0=f16, 1=q8, 2=q4. Requires Flash Attention for full effect, otherwise only K cache is quantized.",metavar=('[quantization level 0/1/2]'), type=int, choices=[0,1,2], default=0)
     advparser.add_argument("--forceversion", help="If the model file format detection fails (e.g. rogue modified model) you can set this to override the detected format (enter desired version, e.g. 401 for GPTNeoX-Type2).",metavar=('[version]'), type=int, default=0)
     advparser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently. Outdated. Not recommended.", action='store_true')
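With the flag promoted to a backend-agnostic advanced argument, no-KV-offload can now be requested on any backend, for example (illustrative invocations):

    python koboldcpp.py --usevulkan 0 --lowvram
    python koboldcpp.py --usecuda normal 0 mmq -nkvo

Note that "lowvram" is intentionally still listed in the --usecuda choices so old command lines keep parsing; convert_invalid_args then maps the legacy token onto args.lowvram.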