draft model sets gpu split instead of id, made mmq default for cli
This commit is contained in:
parent 102281cfbb
commit 60cd68a39d

3 changed files with 50 additions and 22 deletions
koboldcpp.py | 46
@@ -144,7 +144,7 @@ class load_model_inputs(ctypes.Structure):
                 ("draftmodel_filename", ctypes.c_char_p),
                 ("draft_amount", ctypes.c_int),
                 ("draft_gpulayers", ctypes.c_int),
-                ("draft_gpuid", ctypes.c_int),
+                ("draft_gpusplit", ctypes.c_float * tensor_split_max),
                 ("mmproj_filename", ctypes.c_char_p),
                 ("use_mmap", ctypes.c_bool),
                 ("use_mlock", ctypes.c_bool),
@@ -840,7 +840,7 @@ def load_model(model_filename):
     inputs.max_context_length = maxctx #initial value to use for ctx, can be overwritten
     inputs.threads = args.threads
     inputs.low_vram = (True if (args.usecublas and "lowvram" in args.usecublas) else False)
-    inputs.use_mmq = (True if (args.usecublas and "mmq" in args.usecublas) else False)
+    inputs.use_mmq = (True if (args.usecublas and "nommq" not in args.usecublas) else False)
     inputs.use_rowsplit = (True if (args.usecublas and "rowsplit" in args.usecublas) else False)
     inputs.vulkan_info = "0".encode("UTF-8")
     inputs.blasthreads = args.blasthreads
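Note: this hunk inverts the default. Previously use_mmq was only set when an explicit "mmq" token was passed to --usecublas; now any CuBLAS launch enables it unless "nommq" is passed. A minimal sketch of the new predicate, isolated from koboldcpp (the sample token lists are illustrative, not from the commit):

    # Sketch of the new use_mmq decision; `usecublas` stands in for args.usecublas.
    def use_mmq(usecublas):
        # mmq is now on by default whenever CuBLAS is enabled;
        # only an explicit "nommq" token turns it off.
        return True if (usecublas and "nommq" not in usecublas) else False

    assert use_mmq(["normal", "0"]) is True            # no "mmq" token needed anymore
    assert use_mmq(["normal", "0", "mmq"]) is True     # explicit "mmq" still accepted
    assert use_mmq(["normal", "0", "nommq"]) is False  # new opt-out token
    assert use_mmq(None) is False                      # CuBLAS not enabled at all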
@@ -857,7 +857,11 @@ def load_model(model_filename):
     inputs.draftmodel_filename = args.draftmodel.encode("UTF-8") if args.draftmodel else "".encode("UTF-8")
     inputs.draft_amount = args.draftamount
     inputs.draft_gpulayers = args.draftgpulayers
-    inputs.draft_gpuid = args.draftgpuid
+    for n in range(tensor_split_max):
+        if args.draftgpusplit and n < len(args.draftgpusplit):
+            inputs.draft_gpusplit[n] = float(args.draftgpusplit[n])
+        else:
+            inputs.draft_gpusplit[n] = 0
     inputs.mmproj_filename = args.mmproj.encode("UTF-8") if args.mmproj else "".encode("UTF-8")
     inputs.use_smartcontext = args.smartcontext
     inputs.use_contextshift = (0 if args.noshift else 1)
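Note: ctypes.c_float * tensor_split_max declares a fixed-size inline float array, and the new loop zero-pads whatever ratios were supplied. A self-contained sketch of the same pattern (the struct here is an illustrative stand-in for load_model_inputs, and tensor_split_max is assumed to be 16 as for the main model's tensor split):

    import ctypes

    tensor_split_max = 16  # assumed to match koboldcpp's constant

    class demo_inputs(ctypes.Structure):  # illustrative stand-in
        _fields_ = [("draft_gpusplit", ctypes.c_float * tensor_split_max)]

    inputs = demo_inputs()
    draftgpusplit = [6.0, 4.0]  # e.g. a 60/40 split across two GPUs

    # Same shape as the commit's loop: copy the given ratios, zero the rest.
    for n in range(tensor_split_max):
        if draftgpusplit and n < len(draftgpusplit):
            inputs.draft_gpusplit[n] = float(draftgpusplit[n])
        else:
            inputs.draft_gpusplit[n] = 0

    print(list(inputs.draft_gpusplit))  # [6.0, 4.0, 0.0, 0.0, ...]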
@@ -2757,7 +2761,7 @@ def show_gui():
     draftmodel_var = ctk.StringVar()
     draftamount_var = ctk.StringVar(value=str(default_draft_amount))
     draftgpulayers_var = ctk.StringVar(value=str(999))
-    draftgpuid_var = ctk.StringVar(value=str(-1))
+    draftgpusplit_str_vars = ctk.StringVar(value="")
     nomodel = ctk.IntVar(value=0)

     port_var = ctk.StringVar(value=defaultport)
@@ -3245,7 +3249,7 @@ def show_gui():
     makefileentry(model_tab, "Vision mmproj:", "Select Vision mmproj File", mmproj_var, 7,width=280,singlerow=True,tooltiptxt="Select a mmproj file to use for vision models like LLaVA.\nLeave blank to skip.")
     makefileentry(model_tab, "Draft Model:", "Select Speculative Text Model File", draftmodel_var, 9,width=280,singlerow=True,tooltiptxt="Select a draft text model file to use for speculative decoding.\nLeave blank to skip.")
     makelabelentry(model_tab, "Draft Amount: ", draftamount_var, 11, 50,padx=100,singleline=True,tooltip="How many tokens to draft per chunk before verifying results")
-    makelabelentry(model_tab, "GPU ID: ", draftgpuid_var, 11, 50,padx=210,singleline=True,tooltip="Which GPU to use for draft model. Only works if multi-gpu (All) selected in main model.", labelpadx=160)
+    makelabelentry(model_tab, "Splits: ", draftgpusplit_str_vars, 11, 50,padx=210,singleline=True,tooltip="Distribution of draft model layers. Leave blank to follow main model's gpu split. Only works if multi-gpu (All) selected in main model.", labelpadx=160)
     makelabelentry(model_tab, "Layers: ", draftgpulayers_var, 11, 50,padx=320,singleline=True,tooltip="How many layers to GPU offload for the draft model", labelpadx=270)
     makefileentry(model_tab, "Preload Story:", "Select Preloaded Story File", preloadstory_var, 15,width=280,singlerow=True,tooltiptxt="Select an optional KoboldAI JSON savefile \nto be served on launch to any client.")
     makefileentry(model_tab, "ChatCompletions Adapter:", "Select ChatCompletions Adapter File", chatcompletionsadapter_var, 24, width=250, filetypes=[("JSON Adapter", "*.json")], tooltiptxt="Select an optional ChatCompletions Adapter JSON file to force custom instruct tags.")
@@ -3378,6 +3382,7 @@ def show_gui():
         savdict["usecublas"] = None
         savdict["usevulkan"] = None
         savdict["tensor_split"] = None
+        savdict["draftgpusplit"] = None
         savdict["config"] = None
         filename = asksaveasfile(filetypes=file_type, defaultextension=file_type)
         if filename is None:
@@ -3447,6 +3452,8 @@ def show_gui():
             args.usecublas = ["lowvram",str(gpuchoiceidx)] if lowvram_var.get() == 1 else ["normal",str(gpuchoiceidx)]
         if mmq_var.get()==1:
             args.usecublas.append("mmq")
+        else:
+            args.usecublas.append("nommq")
         if rowsplit_var.get()==1:
             args.usecublas.append("rowsplit")
     if runopts_var.get() == "Use Vulkan" or runopts_var.get() == "Use Vulkan (Old CPU)":
@@ -3473,6 +3480,13 @@ def show_gui():
             args.tensor_split = [float(x) for x in tssv.split(",")]
         else:
             args.tensor_split = [float(x) for x in tssv.split(" ")]
+    if draftgpusplit_str_vars.get()!="":
+        tssv = draftgpusplit_str_vars.get()
+        if "," in tssv:
+            args.draftgpusplit = [float(x) for x in tssv.split(",")]
+        else:
+            args.draftgpusplit = [float(x) for x in tssv.split(" ")]
+

     args.blasthreads = None if blas_threads_var.get()=="" else int(blas_threads_var.get())
     args.blasbatchsize = int(blasbatchsize_values[int(blas_size_var.get())])
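Note: the GUI accepts the draft split either comma- or space-separated, mirroring the tensor_split handling just above it. A quick sketch of the parsing rule (the helper name is illustrative, not from the commit):

    def parse_gpu_split(text):
        # "" means "follow the main model's gpu split" (args.draftgpusplit stays None).
        if text == "":
            return None
        if "," in text:
            return [float(x) for x in text.split(",")]
        return [float(x) for x in text.split(" ")]

    assert parse_gpu_split("6,4") == [6.0, 4.0]
    assert parse_gpu_split("6 4") == [6.0, 4.0]
    assert parse_gpu_split("") is None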
@@ -3504,7 +3518,6 @@ def show_gui():
     args.draftmodel = None if draftmodel_var.get() == "" else draftmodel_var.get()
     args.draftamount = int(draftamount_var.get()) if draftamount_var.get()!="" else default_draft_amount
     args.draftgpulayers = int(draftgpulayers_var.get()) if draftgpulayers_var.get()!="" else 999
-    args.draftgpuid = int(draftgpuid_var.get()) if draftgpuid_var.get()!="" else -1

     args.ssl = None if (ssl_cert_var.get() == "" or ssl_key_var.get() == "") else ([ssl_cert_var.get(), ssl_key_var.get()])
     args.password = None if (password_var.get() == "") else (password_var.get())
@@ -3635,6 +3648,9 @@ def show_gui():
         if "tensor_split" in dict and dict["tensor_split"]:
             tssep = ','.join(map(str, dict["tensor_split"]))
             tensor_split_str_vars.set(tssep)
+        if "draftgpusplit" in dict and dict["draftgpusplit"]:
+            tssep = ','.join(map(str, dict["draftgpusplit"]))
+            draftgpusplit_str_vars.set(tssep)
         if "blasthreads" in dict and dict["blasthreads"]:
             blas_threads_var.set(str(dict["blasthreads"]))
         else:
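Note: this is the inverse of the export path above: the saved list is rendered back into the comma form the entry box expects, so a config round-trips cleanly. A tiny sketch (the dict literal is illustrative):

    # A saved [6.0, 4.0] renders as "6.0,4.0" in the GUI field; the comma
    # branch of the export parser turns it back into [6.0, 4.0].
    saved = {"draftgpusplit": [6.0, 4.0]}
    if "draftgpusplit" in saved and saved["draftgpusplit"]:
        tssep = ','.join(map(str, saved["draftgpusplit"]))
        assert tssep == "6.0,4.0"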
@@ -3672,8 +3688,6 @@ def show_gui():
             draftamount_var.set(dict["draftamount"])
         if "draftgpulayers" in dict:
             draftgpulayers_var.set(dict["draftgpulayers"])
-        if "draftgpuid" in dict:
-            draftgpuid_var.set(dict["draftgpuid"])

         ssl_cert_var.set("")
         ssl_key_var.set("")
@@ -4927,7 +4941,7 @@ if __name__ == '__main__':

     parser.add_argument("--threads", metavar=('[threads]'), help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=get_default_threads())
     compatgroup = parser.add_mutually_exclusive_group()
-    compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs. For hipBLAS binaries, please check YellowRoseCx rocm fork.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq] [rowsplit]'), choices=['normal', 'lowvram', '0', '1', '2', '3', 'all', 'mmq', 'rowsplit'])
+    compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs. For hipBLAS binaries, please check YellowRoseCx rocm fork.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq|nommq] [rowsplit]'), choices=['normal', 'lowvram', '0', '1', '2', '3', 'all', 'mmq', 'nommq', 'rowsplit'])
     compatgroup.add_argument("--usevulkan", help="Use Vulkan for GPU Acceleration. Can optionally specify one or more GPU Device ID (e.g. --usevulkan 0), leave blank to autodetect.", metavar=('[Device IDs]'), nargs='*', type=int, default=None)
     compatgroup.add_argument("--useclblast", help="Use CLBlast for GPU Acceleration. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
     compatgroup.add_argument("--usecpu", help="Do not use any GPU acceleration (CPU Only)", action='store_true')
@@ -4957,18 +4971,18 @@ if __name__ == '__main__':
     advparser.add_argument("--remotetunnel", help="Uses Cloudflare to create a remote tunnel, allowing you to access koboldcpp remotely over the internet even behind a firewall.", action='store_true')
     advparser.add_argument("--highpriority", help="Experimental flag. If set, increases the process CPU priority, potentially speeding up generation. Use caution.", action='store_true')
     advparser.add_argument("--foreground", help="Windows only. Sends the terminal to the foreground every time a new prompt is generated. This helps avoid some idle slowdown issues.", action='store_true')
-    advparser.add_argument("--preloadstory", help="Configures a prepared story json save file to be hosted on the server, which frontends (such as KoboldAI Lite) can access over the API.", default="")
+    advparser.add_argument("--preloadstory", metavar=('[savefile]'), help="Configures a prepared story json save file to be hosted on the server, which frontends (such as KoboldAI Lite) can access over the API.", default="")
     advparser.add_argument("--quiet", help="Enable quiet mode, which hides generation inputs and outputs in the terminal. Quiet mode is automatically enabled when running a horde worker.", action='store_true')
     advparser.add_argument("--ssl", help="Allows all content to be served over SSL instead. A valid UNENCRYPTED SSL cert and key .pem files must be provided", metavar=('[cert_pem]', '[key_pem]'), nargs='+')
     advparser.add_argument("--nocertify", help="Allows insecure SSL connections. Use this if you have cert errors and need to bypass certificate restrictions.", action='store_true')
-    advparser.add_argument("--mmproj", help="Select a multimodal projector file for vision models like LLaVA.", default="")
-    advparser.add_argument("--draftmodel", help="Load a small draft model for speculative decoding. It will be fully offloaded. Vocab must match the main model.", default="")
+    advparser.add_argument("--mmproj", metavar=('[filename]'), help="Select a multimodal projector file for vision models like LLaVA.", default="")
+    advparser.add_argument("--draftmodel", metavar=('[filename]'), help="Load a small draft model for speculative decoding. It will be fully offloaded. Vocab must match the main model.", default="")
     advparser.add_argument("--draftamount", metavar=('[tokens]'), help="How many tokens to draft per chunk before verifying results", type=int, default=default_draft_amount)
     advparser.add_argument("--draftgpulayers", metavar=('[layers]'), help="How many layers to offload to GPU for the draft model (default=full offload)", type=int, default=999)
-    advparser.add_argument("--draftgpuid", metavar=('[gpu id]'), help="Which GPU to use for draft model (default=same as main). Only works if multi-GPUs selected for MAIN model!", type=int, default=-1)
-    advparser.add_argument("--password", help="Enter a password required to use this instance. This key will be required for all text endpoints. Image endpoints are not secured.", default=None)
+    advparser.add_argument("--draftgpusplit", help="GPU layer distribution ratio for draft model (default=same as main). Only works if multi-GPUs selected for MAIN model and tensor_split is set!", metavar=('[Ratios]'), type=float, nargs='+')
+    advparser.add_argument("--password", metavar=('[API key]'), help="Enter a password required to use this instance. This key will be required for all text endpoints. Image endpoints are not secured.", default=None)
     advparser.add_argument("--ignoremissing", help="Ignores all missing non-essential files, just skipping them instead.", action='store_true')
-    advparser.add_argument("--chatcompletionsadapter", help="Select an optional ChatCompletions Adapter JSON file to force custom instruct tags.", default="")
+    advparser.add_argument("--chatcompletionsadapter", metavar=('[filename]'), help="Select an optional ChatCompletions Adapter JSON file to force custom instruct tags.", default="")
     advparser.add_argument("--flashattention", help="Enables flash attention.", action='store_true')
     advparser.add_argument("--quantkv", help="Sets the KV cache data type quantization, 0=f16, 1=q8, 2=q4. Requires Flash Attention, and disables context shifting.",metavar=('[quantization level 0/1/2]'), type=int, choices=[0,1,2], default=0)
     advparser.add_argument("--forceversion", help="If the model file format detection fails (e.g. rogue modified model) you can set this to override the detected format (enter desired version, e.g. 401 for GPTNeoX-Type2).",metavar=('[version]'), type=int, default=0)
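Note: on the CLI the ratios are separate tokens, thanks to nargs='+' with type=float. A minimal argparse sketch of just the new flag (parser trimmed to one argument; the example command line is illustrative):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--draftgpusplit", metavar=('[Ratios]'), type=float, nargs='+')

    # e.g. python koboldcpp.py --usecublas all mmq --tensor_split 6 4 --draftgpusplit 6 4
    args = parser.parse_args(["--draftgpusplit", "6", "4"])
    assert args.draftgpusplit == [6.0, 4.0]

    # Omitting the flag leaves it None, i.e. follow the main model's split.
    assert parser.parse_args([]).draftgpusplit is None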
@@ -4990,7 +5004,7 @@ if __name__ == '__main__':
     sdparsergroup = parser.add_argument_group('Image Generation Commands')
     sdparsergroup.add_argument("--sdmodel", metavar=('[filename]'), help="Specify a stable diffusion safetensors or gguf model to enable image generation.", default="")
     sdparsergroup.add_argument("--sdthreads", metavar=('[threads]'), help="Use a different number of threads for image generation if specified. Otherwise, has the same value as --threads.", type=int, default=0)
-    sdparsergroup.add_argument("--sdclamped", help="If specified, limit generation steps and resolution settings for shared use. Accepts an extra optional parameter that indicates maximum resolution (eg. 768 clamps to 768x768, min 512px, disabled if 0).", nargs='?', const=512, type=int, default=0)
+    sdparsergroup.add_argument("--sdclamped", metavar=('[maxres]'), help="If specified, limit generation steps and resolution settings for shared use. Accepts an extra optional parameter that indicates maximum resolution (eg. 768 clamps to 768x768, min 512px, disabled if 0).", nargs='?', const=512, type=int, default=0)
     sdparsergroup.add_argument("--sdt5xxl", metavar=('[filename]'), help="Specify a T5-XXL safetensors model for use in SD3 or Flux. Leave blank if prebaked or unused.", default="")
     sdparsergroup.add_argument("--sdclipl", metavar=('[filename]'), help="Specify a Clip-L safetensors model for use in SD3 or Flux. Leave blank if prebaked or unused.", default="")
     sdparsergroup.add_argument("--sdclipg", metavar=('[filename]'), help="Specify a Clip-G safetensors model for use in SD3. Leave blank if prebaked or unused.", default="")