mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-10 17:14:36 +00:00
customizable speculative size
This commit is contained in:
parent
f75bbb945f
commit
b21d0fe3ac
3 changed files with 18 additions and 8 deletions
1
expose.h
1
expose.h
|
@ -41,6 +41,7 @@ struct load_model_inputs
|
||||||
const char * lora_filename = nullptr;
|
const char * lora_filename = nullptr;
|
||||||
const char * lora_base = nullptr;
|
const char * lora_base = nullptr;
|
||||||
const char * draftmodel_filename = nullptr;
|
const char * draftmodel_filename = nullptr;
|
||||||
|
const int draft_amount = 16;
|
||||||
const char * mmproj_filename = nullptr;
|
const char * mmproj_filename = nullptr;
|
||||||
const bool use_mmap = false;
|
const bool use_mmap = false;
|
||||||
const bool use_mlock = false;
|
const bool use_mlock = false;
|
||||||
|
|
|
@ -43,7 +43,6 @@
|
||||||
#include "common/common.h"
|
#include "common/common.h"
|
||||||
|
|
||||||
//const
|
//const
|
||||||
const int speculative_chunk_amt = 16; //do it in chunks of this many tokens
|
|
||||||
const int extra_context_handle_fragmentation = 120;
|
const int extra_context_handle_fragmentation = 120;
|
||||||
const int LLAVA_TOKEN_IDENTIFIER_A = -998; //alternate between both, changing when image changes
|
const int LLAVA_TOKEN_IDENTIFIER_A = -998; //alternate between both, changing when image changes
|
||||||
const int LLAVA_TOKEN_IDENTIFIER_B = -999;
|
const int LLAVA_TOKEN_IDENTIFIER_B = -999;
|
||||||
|
@ -54,6 +53,7 @@ std::string lora_filename = "";
|
||||||
std::string lora_base = "";
|
std::string lora_base = "";
|
||||||
std::string mmproj_filename = "";
|
std::string mmproj_filename = "";
|
||||||
std::string draftmodel_filename = "";
|
std::string draftmodel_filename = "";
|
||||||
|
int speculative_chunk_amt = 16; //do it in chunks of this many tokens
|
||||||
bool generation_finished;
|
bool generation_finished;
|
||||||
float last_process_time = 0;
|
float last_process_time = 0;
|
||||||
float last_eval_time = 0;
|
float last_eval_time = 0;
|
||||||
|
@ -2267,6 +2267,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
printf("\nAttempting to load draft model for speculative decoding. It will be fully offloaded if possible. Vocab must match the main model.\n");
|
printf("\nAttempting to load draft model for speculative decoding. It will be fully offloaded if possible. Vocab must match the main model.\n");
|
||||||
|
speculative_chunk_amt = inputs.draft_amount;
|
||||||
speculative_decoding_setup(draftmodel_filename, model_params, llama_ctx_params, n_vocab);
|
speculative_decoding_setup(draftmodel_filename, model_params, llama_ctx_params, n_vocab);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
22
koboldcpp.py
22
koboldcpp.py
|
@ -132,6 +132,7 @@ class load_model_inputs(ctypes.Structure):
|
||||||
("lora_filename", ctypes.c_char_p),
|
("lora_filename", ctypes.c_char_p),
|
||||||
("lora_base", ctypes.c_char_p),
|
("lora_base", ctypes.c_char_p),
|
||||||
("draftmodel_filename", ctypes.c_char_p),
|
("draftmodel_filename", ctypes.c_char_p),
|
||||||
|
("draft_amount", ctypes.c_int),
|
||||||
("mmproj_filename", ctypes.c_char_p),
|
("mmproj_filename", ctypes.c_char_p),
|
||||||
("use_mmap", ctypes.c_bool),
|
("use_mmap", ctypes.c_bool),
|
||||||
("use_mlock", ctypes.c_bool),
|
("use_mlock", ctypes.c_bool),
|
||||||
|
@ -885,6 +886,7 @@ def load_model(model_filename):
|
||||||
inputs.lora_base = args.lora[1].encode("UTF-8")
|
inputs.lora_base = args.lora[1].encode("UTF-8")
|
||||||
|
|
||||||
inputs.draftmodel_filename = args.draftmodel.encode("UTF-8") if args.draftmodel else "".encode("UTF-8")
|
inputs.draftmodel_filename = args.draftmodel.encode("UTF-8") if args.draftmodel else "".encode("UTF-8")
|
||||||
|
inputs.draft_amount = args.draftamount
|
||||||
inputs.mmproj_filename = args.mmproj.encode("UTF-8") if args.mmproj else "".encode("UTF-8")
|
inputs.mmproj_filename = args.mmproj.encode("UTF-8") if args.mmproj else "".encode("UTF-8")
|
||||||
inputs.use_smartcontext = args.smartcontext
|
inputs.use_smartcontext = args.smartcontext
|
||||||
inputs.use_contextshift = (0 if args.noshift else 1)
|
inputs.use_contextshift = (0 if args.noshift else 1)
|
||||||
|
@ -2762,6 +2764,7 @@ def show_gui():
|
||||||
preloadstory_var = ctk.StringVar()
|
preloadstory_var = ctk.StringVar()
|
||||||
mmproj_var = ctk.StringVar()
|
mmproj_var = ctk.StringVar()
|
||||||
draftmodel_var = ctk.StringVar()
|
draftmodel_var = ctk.StringVar()
|
||||||
|
draftamount_var = ctk.StringVar(value="16")
|
||||||
nomodel = ctk.IntVar(value=0)
|
nomodel = ctk.IntVar(value=0)
|
||||||
|
|
||||||
port_var = ctk.StringVar(value=defaultport)
|
port_var = ctk.StringVar(value=defaultport)
|
||||||
|
@ -3242,24 +3245,25 @@ def show_gui():
|
||||||
# Model Tab
|
# Model Tab
|
||||||
model_tab = tabcontent["Model Files"]
|
model_tab = tabcontent["Model Files"]
|
||||||
|
|
||||||
makefileentry(model_tab, "Text Model:", "Select GGUF or GGML Model File", model_var, 1,width=280, onchoosefile=on_picked_model_file,tooltiptxt="Select a GGUF or GGML model file on disk to be loaded.")
|
makefileentry(model_tab, "Text Model:", "Select GGUF or GGML Model File", model_var, 1,width=280,singlerow=True, onchoosefile=on_picked_model_file,tooltiptxt="Select a GGUF or GGML model file on disk to be loaded.")
|
||||||
makefileentry(model_tab, "Text Lora:", "Select Lora File",lora_var, 3,width=280,tooltiptxt="Select an optional GGML LoRA adapter to use.\nLeave blank to skip.")
|
makefileentry(model_tab, "Text Lora:", "Select Lora File",lora_var, 3,width=280,singlerow=True,tooltiptxt="Select an optional GGML Text LoRA adapter to use.\nLeave blank to skip.")
|
||||||
makefileentry(model_tab, "Text Lora Base:", "Select Lora Base File", lora_base_var, 5,width=280,tooltiptxt="Select an optional F16 GGML LoRA base file to use.\nLeave blank to skip.")
|
makefileentry(model_tab, "Lora Base:", "Select Lora Base File", lora_base_var, 5,width=280,singlerow=True,tooltiptxt="Select an optional F16 GGML Text LoRA base file to use.\nLeave blank to skip.")
|
||||||
makefileentry(model_tab, "Vision mmproj:", "Select Vision mmproj File", mmproj_var, 7,width=280,tooltiptxt="Select a mmproj file to use for vision models like LLaVA.\nLeave blank to skip.")
|
makefileentry(model_tab, "Vision mmproj:", "Select Vision mmproj File", mmproj_var, 7,width=280,tooltiptxt="Select a mmproj file to use for vision models like LLaVA.\nLeave blank to skip.")
|
||||||
makefileentry(model_tab, "Speculative Model:", "Select Draft Text Model File", draftmodel_var, 9,width=280,tooltiptxt="Select a draft text model file to use for speculative decoding.\nLeave blank to skip.")
|
makefileentry(model_tab, "Speculative Model:", "Select Draft Text Model File", draftmodel_var, 9,width=280,tooltiptxt="Select a draft text model file to use for speculative decoding.\nLeave blank to skip.")
|
||||||
makefileentry(model_tab, "Preloaded Story:", "Select Preloaded Story File", preloadstory_var, 11,width=280,tooltiptxt="Select an optional KoboldAI JSON savefile \nto be served on launch to any client.")
|
makelabelentry(model_tab, "Draft Amount: ", draftamount_var, 11, 50,padx=100,singleline=True,tooltip="How many tokens to draft per chunk before verifying results")
|
||||||
makefileentry(model_tab, "ChatCompletions Adapter:", "Select ChatCompletions Adapter File", chatcompletionsadapter_var, 14, width=250, filetypes=[("JSON Adapter", "*.json")], tooltiptxt="Select an optional ChatCompletions Adapter JSON file to force custom instruct tags.")
|
makefileentry(model_tab, "Preloaded Story:", "Select Preloaded Story File", preloadstory_var, 15,width=280,tooltiptxt="Select an optional KoboldAI JSON savefile \nto be served on launch to any client.")
|
||||||
|
makefileentry(model_tab, "ChatCompletions Adapter:", "Select ChatCompletions Adapter File", chatcompletionsadapter_var, 24, width=250, filetypes=[("JSON Adapter", "*.json")], tooltiptxt="Select an optional ChatCompletions Adapter JSON file to force custom instruct tags.")
|
||||||
def pickpremadetemplate():
|
def pickpremadetemplate():
|
||||||
initialDir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'kcpp_adapters')
|
initialDir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'kcpp_adapters')
|
||||||
initialDir = initialDir if os.path.isdir(initialDir) else None
|
initialDir = initialDir if os.path.isdir(initialDir) else None
|
||||||
fnam = askopenfilename(title="Pick Premade ChatCompletions Adapter",filetypes=[("JSON Adapter", "*.json")], initialdir=initialDir)
|
fnam = askopenfilename(title="Pick Premade ChatCompletions Adapter",filetypes=[("JSON Adapter", "*.json")], initialdir=initialDir)
|
||||||
if fnam:
|
if fnam:
|
||||||
chatcompletionsadapter_var.set(fnam)
|
chatcompletionsadapter_var.set(fnam)
|
||||||
ctk.CTkButton(model_tab, 64, text="Pick Premade", command=pickpremadetemplate).grid(row=15, column=0, padx=322, stick="nw")
|
ctk.CTkButton(model_tab, 64, text="Pick Premade", command=pickpremadetemplate).grid(row=25, column=0, padx=322, stick="nw")
|
||||||
|
|
||||||
mmproj_var.trace("w", gui_changed_modelfile)
|
mmproj_var.trace("w", gui_changed_modelfile)
|
||||||
draftmodel_var.trace("w", gui_changed_modelfile)
|
draftmodel_var.trace("w", gui_changed_modelfile)
|
||||||
makecheckbox(model_tab, "Allow Launch Without Models", nomodel, 17, tooltiptxt="Allows running the WebUI with no model loaded.")
|
makecheckbox(model_tab, "Allow Launch Without Models", nomodel, 27, tooltiptxt="Allows running the WebUI with no model loaded.")
|
||||||
|
|
||||||
# Network Tab
|
# Network Tab
|
||||||
network_tab = tabcontent["Network"]
|
network_tab = tabcontent["Network"]
|
||||||
|
@ -3504,6 +3508,7 @@ def show_gui():
|
||||||
pass
|
pass
|
||||||
args.mmproj = None if mmproj_var.get() == "" else mmproj_var.get()
|
args.mmproj = None if mmproj_var.get() == "" else mmproj_var.get()
|
||||||
args.draftmodel = None if draftmodel_var.get() == "" else draftmodel_var.get()
|
args.draftmodel = None if draftmodel_var.get() == "" else draftmodel_var.get()
|
||||||
|
args.draftamount = int(draftamount_var.get()) if draftamount_var.get()!="" else 16
|
||||||
|
|
||||||
args.ssl = None if (ssl_cert_var.get() == "" or ssl_key_var.get() == "") else ([ssl_cert_var.get(), ssl_key_var.get()])
|
args.ssl = None if (ssl_cert_var.get() == "" or ssl_key_var.get() == "") else ([ssl_cert_var.get(), ssl_key_var.get()])
|
||||||
args.password = None if (password_var.get() == "") else (password_var.get())
|
args.password = None if (password_var.get() == "") else (password_var.get())
|
||||||
|
@ -3665,6 +3670,8 @@ def show_gui():
|
||||||
|
|
||||||
mmproj_var.set(dict["mmproj"] if ("mmproj" in dict and dict["mmproj"]) else "")
|
mmproj_var.set(dict["mmproj"] if ("mmproj" in dict and dict["mmproj"]) else "")
|
||||||
draftmodel_var.set(dict["draftmodel"] if ("draftmodel" in dict and dict["draftmodel"]) else "")
|
draftmodel_var.set(dict["draftmodel"] if ("draftmodel" in dict and dict["draftmodel"]) else "")
|
||||||
|
if "draftamount" in dict:
|
||||||
|
draftamount_var.set(dict["draftamount"])
|
||||||
|
|
||||||
ssl_cert_var.set("")
|
ssl_cert_var.set("")
|
||||||
ssl_key_var.set("")
|
ssl_key_var.set("")
|
||||||
|
@ -4944,6 +4951,7 @@ if __name__ == '__main__':
|
||||||
advparser.add_argument("--nocertify", help="Allows insecure SSL connections. Use this if you have cert errors and need to bypass certificate restrictions.", action='store_true')
|
advparser.add_argument("--nocertify", help="Allows insecure SSL connections. Use this if you have cert errors and need to bypass certificate restrictions.", action='store_true')
|
||||||
advparser.add_argument("--mmproj", help="Select a multimodal projector file for vision models like LLaVA.", default="")
|
advparser.add_argument("--mmproj", help="Select a multimodal projector file for vision models like LLaVA.", default="")
|
||||||
advparser.add_argument("--draftmodel", help="Load a small draft model for speculative decoding. It will be fully offloaded. Vocab must match the main model.", default="")
|
advparser.add_argument("--draftmodel", help="Load a small draft model for speculative decoding. It will be fully offloaded. Vocab must match the main model.", default="")
|
||||||
|
advparser.add_argument("--draftamount", metavar=('[tokens]'), help="How many tokens to draft per chunk before verifying results", type=int, default=16)
|
||||||
advparser.add_argument("--password", help="Enter a password required to use this instance. This key will be required for all text endpoints. Image endpoints are not secured.", default=None)
|
advparser.add_argument("--password", help="Enter a password required to use this instance. This key will be required for all text endpoints. Image endpoints are not secured.", default=None)
|
||||||
advparser.add_argument("--ignoremissing", help="Ignores all missing non-essential files, just skipping them instead.", action='store_true')
|
advparser.add_argument("--ignoremissing", help="Ignores all missing non-essential files, just skipping them instead.", action='store_true')
|
||||||
advparser.add_argument("--chatcompletionsadapter", help="Select an optional ChatCompletions Adapter JSON file to force custom instruct tags.", default="")
|
advparser.add_argument("--chatcompletionsadapter", help="Select an optional ChatCompletions Adapter JSON file to force custom instruct tags.", default="")
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue