Mirror of https://github.com/LostRuins/koboldcpp.git
allow setting maingpu
commit 1cbe716e45 (parent 7a688e07cd)
3 changed files with 49 additions and 28 deletions
koboldcpp.py: 51 lines changed
```diff
@@ -178,7 +178,7 @@ class load_model_inputs(ctypes.Structure):
                ("use_contextshift", ctypes.c_bool),
                ("use_fastforward", ctypes.c_bool),
                ("clblast_info", ctypes.c_int),
                ("cublas_info", ctypes.c_int),
                ("kcpp_main_gpu", ctypes.c_int),
                ("vulkan_info", ctypes.c_char_p),
                ("blasbatchsize", ctypes.c_int),
                ("forceversion", ctypes.c_int),
@@ -262,7 +262,7 @@ class sd_load_model_inputs(ctypes.Structure):
    _fields_ = [("model_filename", ctypes.c_char_p),
                ("executable_path", ctypes.c_char_p),
                ("clblast_info", ctypes.c_int),
                ("cublas_info", ctypes.c_int),
                ("kcpp_main_gpu", ctypes.c_int),
                ("vulkan_info", ctypes.c_char_p),
                ("threads", ctypes.c_int),
                ("quant", ctypes.c_int),
@@ -300,7 +300,7 @@ class whisper_load_model_inputs(ctypes.Structure):
    _fields_ = [("model_filename", ctypes.c_char_p),
                ("executable_path", ctypes.c_char_p),
                ("clblast_info", ctypes.c_int),
                ("cublas_info", ctypes.c_int),
                ("kcpp_main_gpu", ctypes.c_int),
                ("vulkan_info", ctypes.c_char_p),
                ("quiet", ctypes.c_bool),
                ("debugmode", ctypes.c_int)]
@@ -321,7 +321,7 @@ class tts_load_model_inputs(ctypes.Structure):
                ("cts_model_filename", ctypes.c_char_p),
                ("executable_path", ctypes.c_char_p),
                ("clblast_info", ctypes.c_int),
                ("cublas_info", ctypes.c_int),
                ("kcpp_main_gpu", ctypes.c_int),
                ("vulkan_info", ctypes.c_char_p),
                ("gpulayers", ctypes.c_int),
                ("flash_attention", ctypes.c_bool),
@@ -345,7 +345,7 @@ class embeddings_load_model_inputs(ctypes.Structure):
                ("model_filename", ctypes.c_char_p),
                ("executable_path", ctypes.c_char_p),
                ("clblast_info", ctypes.c_int),
                ("cublas_info", ctypes.c_int),
                ("kcpp_main_gpu", ctypes.c_int),
                ("vulkan_info", ctypes.c_char_p),
                ("gpulayers", ctypes.c_int),
                ("flash_attention", ctypes.c_bool),
```
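A note on why the same field appears in every input structure above: `ctypes` lays out `_fields_` strictly in declaration order, so placing `kcpp_main_gpu` between `cublas_info` and `vulkan_info` shifts the offset of every later field, and the C side of the bridge must declare its structs in the same order. A minimal standalone sketch of that layout rule (the `demo_inputs` class is hypothetical, not part of the commit):

```python
import ctypes

class demo_inputs(ctypes.Structure):
    # Mirrors the field ordering in the hunks above: kcpp_main_gpu sits
    # between cublas_info and vulkan_info, shifting all later offsets.
    _fields_ = [("clblast_info", ctypes.c_int),
                ("cublas_info", ctypes.c_int),
                ("kcpp_main_gpu", ctypes.c_int),
                ("vulkan_info", ctypes.c_char_p)]

print(demo_inputs.kcpp_main_gpu.offset)  # 8: two c_int fields precede it
```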
```diff
@@ -566,7 +566,9 @@ def set_backend_props(inputs):
    # we must force an explicit tensor split
    # otherwise the default will divide equally and multigpu crap will slow it down badly
    inputs.cublas_info = 0
    inputs.kcpp_main_gpu = 0
    if(args.maingpu is not None and args.maingpu>=0):
        inputs.kcpp_main_gpu = args.maingpu

    if args.usecublas:
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
@@ -574,24 +576,29 @@ def set_backend_props(inputs):
            if (args.usecublas and "0" in args.usecublas):
                os.environ["CUDA_VISIBLE_DEVICES"] = "0"
                os.environ["HIP_VISIBLE_DEVICES"] = "0"
                inputs.kcpp_main_gpu = 0
            elif (args.usecublas and "1" in args.usecublas):
                os.environ["CUDA_VISIBLE_DEVICES"] = "1"
                os.environ["HIP_VISIBLE_DEVICES"] = "1"
                inputs.kcpp_main_gpu = 0
            elif (args.usecublas and "2" in args.usecublas):
                os.environ["CUDA_VISIBLE_DEVICES"] = "2"
                os.environ["HIP_VISIBLE_DEVICES"] = "2"
                inputs.kcpp_main_gpu = 0
            elif (args.usecublas and "3" in args.usecublas):
                os.environ["CUDA_VISIBLE_DEVICES"] = "3"
                os.environ["HIP_VISIBLE_DEVICES"] = "3"
                inputs.kcpp_main_gpu = 0
        else:
            if (args.usecublas and "0" in args.usecublas):
                inputs.cublas_info = 0
            elif (args.usecublas and "1" in args.usecublas):
                inputs.cublas_info = 1
            elif (args.usecublas and "2" in args.usecublas):
                inputs.cublas_info = 2
            elif (args.usecublas and "3" in args.usecublas):
                inputs.cublas_info = 3
            if(args.maingpu is None or args.maingpu<0):
                if (args.usecublas and "0" in args.usecublas):
                    inputs.kcpp_main_gpu = 0
                elif (args.usecublas and "1" in args.usecublas):
                    inputs.kcpp_main_gpu = 1
                elif (args.usecublas and "2" in args.usecublas):
                    inputs.kcpp_main_gpu = 2
                elif (args.usecublas and "3" in args.usecublas):
                    inputs.kcpp_main_gpu = 3

    if args.usevulkan: #is an empty array if using vulkan without defined gpu
        s = ""
```
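Distilling the two `set_backend_props` hunks: an explicit `--maingpu` always wins; otherwise the main GPU falls back to the device index implied by `--usecublas` (the `else:` branch). When `CUDA_VISIBLE_DEVICES` is pinned to a single card instead, that card is renumbered as device 0, which is why those branches set `kcpp_main_gpu = 0`. A hedged sketch of the precedence (`resolve_main_gpu` is a hypothetical helper, not code from the repo):

```python
def resolve_main_gpu(maingpu, usecublas):
    """Hypothetical condensation of the fallback logic shown above:
    an explicit --maingpu >= 0 always wins; otherwise reuse the device
    index chosen via --usecublas ("0".."3"); failing that, default to 0."""
    if maingpu is not None and maingpu >= 0:
        return maingpu
    for idx in ("0", "1", "2", "3"):
        if usecublas and idx in usecublas:
            return int(idx)
    return 0

assert resolve_main_gpu(2, ["0"]) == 2    # explicit flag wins
assert resolve_main_gpu(-1, ["3"]) == 3   # falls back to selected device
assert resolve_main_gpu(None, None) == 0  # nothing specified: device 0
```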
```diff
@@ -4201,6 +4208,7 @@ def show_gui():
    version_var = ctk.StringVar(value="0")
    tensor_split_str_vars = ctk.StringVar(value="")
    rowsplit_var = ctk.IntVar()
    maingpu_var = ctk.StringVar(value="")

    contextshift_var = ctk.IntVar(value=1)
    fastforward_var = ctk.IntVar(value=1)
@@ -4676,6 +4684,8 @@ def show_gui():
            quick_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw")
            CUDA_gpu_selector_box.grid_remove()
            CUDA_quick_gpu_selector_box.grid_remove()
            maingpu_label.grid_remove()
            maingpu_entry.grid_remove()
            if gpu_choice_var.get()=="All":
                gpu_choice_var.set("1")
        elif index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
@@ -4683,6 +4693,8 @@ def show_gui():
            quick_gpu_selector_box.grid_remove()
            CUDA_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw")
            CUDA_quick_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw")
            maingpu_label.grid(row=10, column=0, padx = 8, pady=1, stick="nw")
            maingpu_entry.grid(row=10, column=1, padx = 8, pady=1, stick="nw")
        else:
            quick_gpuname_label.grid_remove()
            gpuname_label.grid_remove()
@@ -4692,6 +4704,8 @@ def show_gui():
            quick_gpu_selector_label.grid_remove()
            quick_gpu_selector_box.grid_remove()
            CUDA_quick_gpu_selector_box.grid_remove()
            maingpu_label.grid_remove()
            maingpu_entry.grid_remove()

        if index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
            lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw")
@@ -4803,6 +4817,8 @@ def show_gui():
    mmq_box = makecheckbox(hardware_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1, tooltiptxt="Enable MMQ mode to use finetuned kernels instead of default CuBLAS/HipBLAS for prompt processing.\nRead the wiki. Speed may vary.")
    splitmode_box = makecheckbox(hardware_tab, "Row-Split", rowsplit_var, 5,0, tooltiptxt="Split rows across GPUs instead of splitting layers and KV across GPUs.\nUses the main GPU for small tensors and intermediate results. Speed may vary.")

    maingpu_entry,maingpu_label = makelabelentry(hardware_tab, "Main GPU:" , maingpu_var, 10, 50,tooltip="Only for multi-gpu, which GPU to set as main?\nIf left blank, uses default value.")

    # threads
    makelabelentry(hardware_tab, "Threads:" , threads_var, 11, 50,tooltip="How many threads to use.\nRecommended value is your CPU core count, defaults are usually OK.")
```
```diff
@@ -5140,7 +5156,7 @@ def show_gui():
        else:
            args.draftgpusplit = [float(x) for x in tssv.split(" ")]

        args.maingpu = -1 if maingpu_var.get()=="" else int(maingpu_var.get())
        args.blasthreads = None if blas_threads_var.get()=="" else int(blas_threads_var.get())
        args.blasbatchsize = int(blasbatchsize_values[int(blas_size_var.get())])
        args.forceversion = 0 if version_var.get()=="" else int(version_var.get())
@@ -5330,6 +5346,10 @@ def show_gui():
            gpulayers_var.set(dict["gpulayers"])
        else:
            gpulayers_var.set("0")
        if "maingpu" in dict:
            maingpu_var.set(dict["maingpu"])
        else:
            maingpu_var.set("")
        if "tensor_split" in dict and dict["tensor_split"]:
            tssep = ','.join(map(str, dict["tensor_split"]))
            tensor_split_str_vars.set(tssep)
```
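The save and load hunks above share a small "unset" convention: a blank GUI entry becomes `-1` on the command line, and a settings dict with no `maingpu` key maps back to a blank entry. A standalone sketch of that round-trip (both helper names are hypothetical, not from the commit):

```python
def maingpu_to_arg(text):
    # Blank GUI entry means "no preference", stored as -1
    # (matches the args.maingpu expression in the save hunk above).
    return -1 if text == "" else int(text)

def maingpu_to_gui(settings):
    # A missing key in a saved settings dict maps back to a blank entry
    # (matches the load hunk above).
    return str(settings["maingpu"]) if "maingpu" in settings else ""

assert maingpu_to_arg("") == -1
assert maingpu_to_arg("1") == 1
assert maingpu_to_gui({}) == ""
assert maingpu_to_gui({"maingpu": 2}) == "2"
```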
```diff
@@ -7076,6 +7096,7 @@ if __name__ == '__main__':
    advparser = parser.add_argument_group('Advanced Commands')
    advparser.add_argument("--version", help="Prints version and exits.", action='store_true')
    advparser.add_argument("--analyze", metavar=('[filename]'), help="Reads the metadata, weight types and tensor names in any GGUF file.", default="")
    advparser.add_argument("--maingpu", help="Only used in a multi-gpu setup. Sets the index of the main GPU that will be used.",metavar=('[Device ID]'), type=int, default=-1)
    advparser.add_argument("--ropeconfig", help="If set, uses customized RoPE scaling from configured frequency scale and frequency base (e.g. --ropeconfig 0.25 10000). Otherwise, uses NTK-Aware scaling set automatically based on context size. For linear rope, simply set the freq-scale and ignore the freq-base",metavar=('[rope-freq-scale]', '[rope-freq-base]'), default=[0.0, 10000.0], type=float, nargs='+')
    advparser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,16,32,64,128,256,512,1024,2048], default=512)
    advparser.add_argument("--blasthreads", help="Use a different number of threads during BLAS if specified. Otherwise, has the same value as --threads",metavar=('[threads]'), type=int, default=0)
```
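With the new flag registered, a multi-GPU launch could look like the following (illustrative invocation; the model path and split ratios are placeholders):

```
python koboldcpp.py --usecublas --tensor_split 3 1 --maingpu 0 --model /path/to/model.gguf
```

Leaving `--maingpu` at its default of `-1` preserves the previous behavior, since `set_backend_props` only honors the flag when it is zero or greater.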