Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-10 17:14:36 +00:00)
speculative decoding initial impl completed (+6 squashed commits)
Squashed commit:
[0a6306ca0] draft wip dont use (will be squashed)
[a758a1c9c] wip dont use (will be squashed)
[e1994d3ce] wip dont use
[f59690d68] wip
[77228147d] wip on spec decoding. dont use yet
[2445bca54] wip adding speculative decoding (+1 squashed commits)

Squashed commits:
[50e341bb7] wip adding speculative decoding
Parent: b9e99c69e8
Commit: f75bbb945f
9 changed files with 539 additions and 280 deletions
koboldcpp.py (53 changes)
@@ -131,6 +131,7 @@ class load_model_inputs(ctypes.Structure):
                 ("model_filename", ctypes.c_char_p),
                 ("lora_filename", ctypes.c_char_p),
                 ("lora_base", ctypes.c_char_p),
+                ("draftmodel_filename", ctypes.c_char_p),
                 ("mmproj_filename", ctypes.c_char_p),
                 ("use_mmap", ctypes.c_bool),
                 ("use_mlock", ctypes.c_bool),
@@ -672,24 +673,27 @@ def read_gguf_metadata(file_path):
     except Exception as ex:
         return None
 
-def extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath):
+def extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,draftmodelpath):
     global modelfile_extracted_meta
     modelfile_extracted_meta = None
     sdfsize = 0
     whisperfsize = 0
     mmprojsize = 0
+    draftmodelsize = 0
     if sdfilepath and os.path.exists(sdfilepath):
         sdfsize = os.path.getsize(sdfilepath)
     if whisperfilepath and os.path.exists(whisperfilepath):
         whisperfsize = os.path.getsize(whisperfilepath)
     if mmprojfilepath and os.path.exists(mmprojfilepath):
         mmprojsize = os.path.getsize(mmprojfilepath)
+    if draftmodelpath and os.path.exists(draftmodelpath):
+        draftmodelsize = os.path.getsize(draftmodelpath)
     if filepath and os.path.exists(filepath):
         try:
             fsize = os.path.getsize(filepath)
             if fsize>10000000: #dont bother with models < 10mb as they are probably bad
                 ggufmeta = read_gguf_metadata(filepath)
-                modelfile_extracted_meta = [ggufmeta,fsize,sdfsize,whisperfsize,mmprojsize] #extract done. note that meta may be null
+                modelfile_extracted_meta = [ggufmeta,fsize,sdfsize,whisperfsize,mmprojsize,draftmodelsize] #extract done. note that meta may be null
         except Exception as ex:
             modelfile_extracted_meta = None
 
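
For orientation, a self-contained sketch of what the extended helper now collects. The _sketch suffix, file names, and stubbed GGUF parsing below are illustrative only, not part of the commit:

import os

def extract_modelfile_params_sketch(filepath, sdfilepath, whisperfilepath, mmprojfilepath, draftmodelpath):
    # Mirrors the shape of the updated helper: collect file sizes for every
    # optional model, with the draft model size appended as the sixth element.
    def size_of(path):
        return os.path.getsize(path) if path and os.path.exists(path) else 0
    ggufmeta = None  # the real code calls read_gguf_metadata(filepath); may be None
    return [ggufmeta, size_of(filepath), size_of(sdfilepath),
            size_of(whisperfilepath), size_of(mmprojfilepath), size_of(draftmodelpath)]

meta = extract_modelfile_params_sketch("main-model.gguf", "", "", "", "draft-model.gguf")
print(meta[5])  # draft model size in bytes (0 here, since the example paths do not exist)
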
@@ -702,7 +706,7 @@ def autoset_gpu_layers(ctxsize,sdquanted,bbs): #shitty algo to determine how man
     if showusedmemwarning and usedmem > (2.5*1024*1024*1024):
         showusedmemwarning = False
         print(f"Note: KoboldCpp has detected that a significant amount of GPU VRAM ({usedmem/1024/1024} MB) is currently used by another application.\nFor best results, you may wish to close that application and then restart KoboldCpp.\n***")
-    reservedmem = max(1.5*1024*1024*1024,(0.5*1024*1024*1024 + usedmem)) # determine vram overhead
+    reservedmem = max(1.3*1024*1024*1024,(0.5*1024*1024*1024 + usedmem)) # determine vram overhead
     try:
         if not modelfile_extracted_meta:
             return 0
@@ -719,6 +723,9 @@ def autoset_gpu_layers(ctxsize,sdquanted,bbs): #shitty algo to determine how man
             mem -= 350*1024*1024
         if modelfile_extracted_meta[4] > 1024*1024*10: #mmproj tax
             mem -= 350*1024*1024
+        if modelfile_extracted_meta[5] > 1024*1024*10: #draft model tax
+            mem -= (modelfile_extracted_meta[5] * 1.5)
         mem = 0 if mem < 0 else mem
 
         csmul = 1.0
         if cs:
@@ -732,8 +739,8 @@ def autoset_gpu_layers(ctxsize,sdquanted,bbs): #shitty algo to determine how man
             headcount = ggufmeta[1]
             headkvlen = (ggufmeta[2] if ggufmeta[2] > 0 else 128)
             ratio = (mem-usedmem)/(fsize*csmul*1.6*(1.0 if bbs <= 512 else 1.2))
-            computemem = layers*(4 if bbs <= 512 else (bbs/128))*headkvlen*cs*4*1.5 # apply blasbatchsize calculations if over 512
-            contextmem = layers*headcount*headkvlen*cs*4*1.1
+            computemem = layers*(4 if bbs <= 512 else (bbs/128))*headkvlen*cs*4*1.55 # apply blasbatchsize calculations if over 512
+            contextmem = layers*headcount*headkvlen*cs*4*1.15
             if headcount > 0:
                 ratio = max(ratio, (mem - reservedmem - computemem) / (fsize + contextmem))
             layerlimit = min(int(ratio*layers), (layers + 3))
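
As a rough illustration of the arithmetic these two hunks retune (the 1.3 GB reserve and the 1.55/1.15 fudge factors), a standalone numeric sketch follows. Every input value is invented, and the real function first subtracts the sd/whisper/mmproj/draft "taxes" shown above before reaching this point:

# Illustrative inputs only; in the real code these come from GGUF metadata and GPU probing.
layers, headcount, headkvlen = 32, 32, 128
fsize = 4.5 * 1024**3                         # main model file size in bytes
cs, csmul, bbs = 4096, 1.0, 512               # context size, context multiplier, blas batch size
mem, usedmem = 11.5 * 1024**3, 0.5 * 1024**3  # available VRAM and VRAM used by other apps
reservedmem = max(1.3 * 1024**3, 0.5 * 1024**3 + usedmem)  # the new 1.3 GB floor

ratio = (mem - usedmem) / (fsize * csmul * 1.6 * (1.0 if bbs <= 512 else 1.2))
computemem = layers * (4 if bbs <= 512 else (bbs / 128)) * headkvlen * cs * 4 * 1.55
contextmem = layers * headcount * headkvlen * cs * 4 * 1.15
if headcount > 0:
    ratio = max(ratio, (mem - reservedmem - computemem) / (fsize + contextmem))
layerlimit = min(int(ratio * layers), layers + 3)
print(layerlimit)  # 35 with these numbers, i.e. offload everything
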
@@ -877,6 +884,7 @@ def load_model(model_filename):
         if len(args.lora) > 1:
             inputs.lora_base = args.lora[1].encode("UTF-8")
 
+    inputs.draftmodel_filename = args.draftmodel.encode("UTF-8") if args.draftmodel else "".encode("UTF-8")
     inputs.mmproj_filename = args.mmproj.encode("UTF-8") if args.mmproj else "".encode("UTF-8")
     inputs.use_smartcontext = args.smartcontext
     inputs.use_contextshift = (0 if args.noshift else 1)
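
The new field follows the same optional-string convention as its neighbours: an unset argument is passed as an empty UTF-8 string rather than a NULL pointer. A trimmed-down sketch of the pattern (the two-field structure and variable names here are illustrative):

import ctypes

class load_model_inputs_sketch(ctypes.Structure):
    # Only two of the many fields, just to show the c_char_p plumbing.
    _fields_ = [("model_filename", ctypes.c_char_p),
                ("draftmodel_filename", ctypes.c_char_p)]

draftmodel = None  # stands in for args.draftmodel when --draftmodel was not given
inputs = load_model_inputs_sketch()
inputs.model_filename = "main-model.gguf".encode("UTF-8")
inputs.draftmodel_filename = draftmodel.encode("UTF-8") if draftmodel else "".encode("UTF-8")
print(inputs.draftmodel_filename)  # b'' -> the C side sees an empty string, not NULL
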
@@ -1510,15 +1518,18 @@ ws ::= | " " | "\n" [ \t]{0,20}
     elif api_format==6:
         detokstr = ""
         tokids = genparams.get('context', [])
+        adapter_obj = {} if chatcompl_adapter is None else chatcompl_adapter
+        user_message_start = adapter_obj.get("user_start", "\n\n### Instruction:\n")
+        assistant_message_start = adapter_obj.get("assistant_start", "\n\n### Response:\n")
         try:
             detokstr = detokenize_ids(tokids)
         except Exception as e:
             utfprint("Ollama Context Error: " + str(e))
         ollamasysprompt = genparams.get('system', "")
-        ollamabodyprompt = detokstr + "\n\n### Instruction:\n" + genparams.get('prompt', "") + "\n\n### Response:\n"
+        ollamabodyprompt = f"{detokstr}{user_message_start}{genparams.get('prompt', '')}{assistant_message_start}"
         genparams["stop_sequence"] = genparams.get('stop', [])
-        genparams["stop_sequence"].append("\n### Instruction:")
-        genparams["stop_sequence"].append("\n### Response:")
+        genparams["stop_sequence"].append(user_message_start.strip())
+        genparams["stop_sequence"].append(assistant_message_start.strip())
         genparams["trim_stop"] = True
         genparams["ollamasysprompt"] = ollamasysprompt
         genparams["ollamabodyprompt"] = ollamabodyprompt
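
To make the adapter indirection concrete, here is a small runnable sketch of the prompt assembly in this hunk. The adapter dict, detokenized context, and prompt text are invented; the fallback strings match the Alpaca-style defaults above:

chatcompl_adapter = {"user_start": "<|user|>\n", "assistant_start": "<|assistant|>\n"}  # example adapter

adapter_obj = {} if chatcompl_adapter is None else chatcompl_adapter
user_message_start = adapter_obj.get("user_start", "\n\n### Instruction:\n")
assistant_message_start = adapter_obj.get("assistant_start", "\n\n### Response:\n")

detokstr = "Previously detokenized Ollama context."  # stand-in for detokenize_ids(tokids)
prompt = "Write a haiku about rivers."
ollamabodyprompt = f"{detokstr}{user_message_start}{prompt}{assistant_message_start}"
stop_sequence = [user_message_start.strip(), assistant_message_start.strip()]
print(ollamabodyprompt)
print(stop_sequence)  # ['<|user|>', '<|assistant|>']
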
@@ -2374,9 +2385,8 @@ Enter Prompt:<br>
             return
 
         is_quiet = args.quiet
-        utfprint(f"\n{datetime.now().strftime('[%H:%M:%S] Input Received')}")
         if (args.debugmode != -1 and not is_quiet) or args.debugmode >= 1:
-            utfprint(f"Input: " + json.dumps(genparams))
+            utfprint(f"\nInput: " + json.dumps(genparams))
 
         if args.foreground:
             bring_terminal_to_foreground()
@@ -2751,6 +2761,7 @@ def show_gui():
     lora_base_var = ctk.StringVar()
     preloadstory_var = ctk.StringVar()
     mmproj_var = ctk.StringVar()
+    draftmodel_var = ctk.StringVar()
     nomodel = ctk.IntVar(value=0)
 
     port_var = ctk.StringVar(value=defaultport)
@@ -2929,7 +2940,8 @@ def show_gui():
         sdfilepath = sd_model_var.get()
         whisperfilepath = whisper_model_var.get()
         mmprojfilepath = mmproj_var.get()
-        extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath)
+        draftmodelpath = draftmodel_var.get()
+        extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,draftmodelpath)
         changed_gpulayers_estimate()
         pass
 
@@ -3234,18 +3246,20 @@ def show_gui():
     makefileentry(model_tab, "Text Lora:", "Select Lora File",lora_var, 3,width=280,tooltiptxt="Select an optional GGML LoRA adapter to use.\nLeave blank to skip.")
     makefileentry(model_tab, "Text Lora Base:", "Select Lora Base File", lora_base_var, 5,width=280,tooltiptxt="Select an optional F16 GGML LoRA base file to use.\nLeave blank to skip.")
     makefileentry(model_tab, "Vision mmproj:", "Select Vision mmproj File", mmproj_var, 7,width=280,tooltiptxt="Select a mmproj file to use for vision models like LLaVA.\nLeave blank to skip.")
-    makefileentry(model_tab, "Preloaded Story:", "Select Preloaded Story File", preloadstory_var, 9,width=280,tooltiptxt="Select an optional KoboldAI JSON savefile \nto be served on launch to any client.")
-    makefileentry(model_tab, "ChatCompletions Adapter:", "Select ChatCompletions Adapter File", chatcompletionsadapter_var, 12, width=250, filetypes=[("JSON Adapter", "*.json")], tooltiptxt="Select an optional ChatCompletions Adapter JSON file to force custom instruct tags.")
+    makefileentry(model_tab, "Speculative Model:", "Select Draft Text Model File", draftmodel_var, 9,width=280,tooltiptxt="Select a draft text model file to use for speculative decoding.\nLeave blank to skip.")
+    makefileentry(model_tab, "Preloaded Story:", "Select Preloaded Story File", preloadstory_var, 11,width=280,tooltiptxt="Select an optional KoboldAI JSON savefile \nto be served on launch to any client.")
+    makefileentry(model_tab, "ChatCompletions Adapter:", "Select ChatCompletions Adapter File", chatcompletionsadapter_var, 14, width=250, filetypes=[("JSON Adapter", "*.json")], tooltiptxt="Select an optional ChatCompletions Adapter JSON file to force custom instruct tags.")
     def pickpremadetemplate():
         initialDir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'kcpp_adapters')
         initialDir = initialDir if os.path.isdir(initialDir) else None
         fnam = askopenfilename(title="Pick Premade ChatCompletions Adapter",filetypes=[("JSON Adapter", "*.json")], initialdir=initialDir)
         if fnam:
             chatcompletionsadapter_var.set(fnam)
-    ctk.CTkButton(model_tab, 64, text="Pick Premade", command=pickpremadetemplate).grid(row=13, column=0, padx=322, stick="nw")
+    ctk.CTkButton(model_tab, 64, text="Pick Premade", command=pickpremadetemplate).grid(row=15, column=0, padx=322, stick="nw")
 
     mmproj_var.trace("w", gui_changed_modelfile)
-    makecheckbox(model_tab, "Allow Launch Without Models", nomodel, 15, tooltiptxt="Allows running the WebUI with no model loaded.")
+    draftmodel_var.trace("w", gui_changed_modelfile)
+    makecheckbox(model_tab, "Allow Launch Without Models", nomodel, 17, tooltiptxt="Allows running the WebUI with no model loaded.")
 
     # Network Tab
     network_tab = tabcontent["Network"]
@@ -3489,6 +3503,7 @@ def show_gui():
         except Exception as ex2:
             pass
     args.mmproj = None if mmproj_var.get() == "" else mmproj_var.get()
+    args.draftmodel = None if draftmodel_var.get() == "" else draftmodel_var.get()
 
     args.ssl = None if (ssl_cert_var.get() == "" or ssl_key_var.get() == "") else ([ssl_cert_var.get(), ssl_key_var.get()])
     args.password = None if (password_var.get() == "") else (password_var.get())
@@ -3649,6 +3664,7 @@ def show_gui():
         lora_var.set(dict["lora"][0])
 
     mmproj_var.set(dict["mmproj"] if ("mmproj" in dict and dict["mmproj"]) else "")
+    draftmodel_var.set(dict["draftmodel"] if ("draftmodel" in dict and dict["draftmodel"]) else "")
 
     ssl_cert_var.set("")
     ssl_key_var.set("")
@@ -4442,6 +4458,10 @@ def main(launch_args,start_server=True):
         dlfile = download_model_from_url(args.whispermodel,[".gguf",".bin"])
         if dlfile:
             args.whispermodel = dlfile
+    if args.draftmodel and args.draftmodel!="":
+        dlfile = download_model_from_url(args.draftmodel,[".gguf"])
+        if dlfile:
+            args.draftmodel = dlfile
 
     # sanitize and replace the default vanity name. remember me....
     if args.model_param and args.model_param!="":
@@ -4517,7 +4537,7 @@ def main(launch_args,start_server=True):
         pass
     if args.gpulayers==-1:
         if MaxMemory[0] > 0 and (not args.usecpu) and ((args.usecublas is not None) or (args.usevulkan is not None) or (args.useclblast is not None) or sys.platform=="darwin"):
-            extract_modelfile_params(args.model_param,args.sdmodel,args.whispermodel,args.mmproj)
+            extract_modelfile_params(args.model_param,args.sdmodel,args.whispermodel,args.mmproj,args.draftmodel)
             layeramt = autoset_gpu_layers(args.contextsize,args.sdquant,args.blasbatchsize)
             print(f"Auto Recommended GPU Layers: {layeramt}")
             args.gpulayers = layeramt
@@ -4923,6 +4943,7 @@ if __name__ == '__main__':
     advparser.add_argument("--ssl", help="Allows all content to be served over SSL instead. A valid UNENCRYPTED SSL cert and key .pem files must be provided", metavar=('[cert_pem]', '[key_pem]'), nargs='+')
     advparser.add_argument("--nocertify", help="Allows insecure SSL connections. Use this if you have cert errors and need to bypass certificate restrictions.", action='store_true')
     advparser.add_argument("--mmproj", help="Select a multimodal projector file for vision models like LLaVA.", default="")
+    advparser.add_argument("--draftmodel", help="Load a small draft model for speculative decoding. It will be fully offloaded. Vocab must match the main model.", default="")
     advparser.add_argument("--password", help="Enter a password required to use this instance. This key will be required for all text endpoints. Image endpoints are not secured.", default=None)
     advparser.add_argument("--ignoremissing", help="Ignores all missing non-essential files, just skipping them instead.", action='store_true')
     advparser.add_argument("--chatcompletionsadapter", help="Select an optional ChatCompletions Adapter JSON file to force custom instruct tags.", default="")
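
A reduced sketch of how the new flag parses next to --mmproj; the parser below is a stand-in for the real advparser, and the file name is only an example (a launch might look like "python koboldcpp.py --model main.gguf --draftmodel draft.gguf", with the draft model's vocab matching the main model):

import argparse

# Stand-in parser, not the real one; defaults mirror the two lines above.
parser = argparse.ArgumentParser()
parser.add_argument("--mmproj", help="multimodal projector file for vision models", default="")
parser.add_argument("--draftmodel", help="small draft model for speculative decoding", default="")

args = parser.parse_args(["--draftmodel", "draft-model-q4.gguf"])  # example invocation
print(args.draftmodel)  # later encoded to UTF-8 and handed to the C side as draftmodel_filename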