diff --git a/expose.h b/expose.h
index 855779a23..623fb2134 100644
--- a/expose.h
+++ b/expose.h
@@ -41,7 +41,7 @@ struct load_model_inputs
     const char * lora_filename = nullptr;
     const char * lora_base = nullptr;
     const char * draftmodel_filename = nullptr;
-    const int draft_amount = 16;
+    const int draft_amount = 12;
     const char * mmproj_filename = nullptr;
     const bool use_mmap = false;
     const bool use_mlock = false;
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 9000fed56..9fab4253f 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -53,7 +53,7 @@ std::string lora_filename = "";
 std::string lora_base = "";
 std::string mmproj_filename = "";
 std::string draftmodel_filename = "";
-int speculative_chunk_amt = 16; //do it in chunks of this many tokens
+int speculative_chunk_amt = 12; //do it in chunks of this many tokens
 bool generation_finished;
 float last_process_time = 0;
 float last_eval_time = 0;
@@ -680,7 +680,10 @@ static speculative_draft_result speculative_decoding_eval_chunk(llama_context *
         ++draft_npast;
     }
     //now that we have our drafted tokens, we form a batch and PP it
-    kcpp_embd_batch batch2 = kcpp_embd_batch(drafted_ids, actual_npast, true);
+
+    std::vector<int32_t> real_embd = drafted_ids;
+    real_embd.pop_back();
+    kcpp_embd_batch batch2 = kcpp_embd_batch(real_embd, actual_npast, true);
     auto draftok = (llama_decode(main_ctx, batch2.batch)==0); //actual eval for big model
     if(!draftok)
     {
diff --git a/koboldcpp.py b/koboldcpp.py
index 839a6c1d0..ef48f8c87 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -2764,7 +2764,7 @@ def show_gui():
     preloadstory_var = ctk.StringVar()
     mmproj_var = ctk.StringVar()
     draftmodel_var = ctk.StringVar()
-    draftamount_var = ctk.StringVar(value="16")
+    draftamount_var = ctk.StringVar(value="12")
     nomodel = ctk.IntVar(value=0)
 
     port_var = ctk.StringVar(value=defaultport)
@@ -3508,7 +3508,7 @@ def show_gui():
             pass
         args.mmproj = None if mmproj_var.get() == "" else mmproj_var.get()
         args.draftmodel = None if draftmodel_var.get() == "" else draftmodel_var.get()
-        args.draftamount = int(draftamount_var.get()) if draftamount_var.get()!="" else 16
+        args.draftamount = int(draftamount_var.get()) if draftamount_var.get()!="" else 12
         args.ssl = None if (ssl_cert_var.get() == "" or ssl_key_var.get() == "") else ([ssl_cert_var.get(), ssl_key_var.get()])
         args.password = None if (password_var.get() == "") else (password_var.get())
 
@@ -4951,7 +4951,7 @@ if __name__ == '__main__':
     advparser.add_argument("--nocertify", help="Allows insecure SSL connections. Use this if you have cert errors and need to bypass certificate restrictions.", action='store_true')
    advparser.add_argument("--mmproj", help="Select a multimodal projector file for vision models like LLaVA.", default="")
    advparser.add_argument("--draftmodel", help="Load a small draft model for speculative decoding. It will be fully offloaded. Vocab must match the main model.", default="")
-    advparser.add_argument("--draftamount", metavar=('[tokens]'), help="How many tokens to draft per chunk before verifying results", type=int, default=16)
+    advparser.add_argument("--draftamount", metavar=('[tokens]'), help="How many tokens to draft per chunk before verifying results", type=int, default=12)
     advparser.add_argument("--password", help="Enter a password required to use this instance. This key will be required for all text endpoints. Image endpoints are not secured.", default=None)
     advparser.add_argument("--ignoremissing", help="Ignores all missing non-essential files, just skipping them instead.", action='store_true')
     advparser.add_argument("--chatcompletionsadapter", help="Select an optional ChatCompletions Adapter JSON file to force custom instruct tags.", default="")
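
For context on the `pop_back()` change in `speculative_decoding_eval_chunk`: by the usual speculative-decoding argument, drafted token i is verified against the main model's logits produced after decoding token i-1, and the logits for the first drafted token are already available from the previous decode. Decoding the final drafted token would therefore only yield logits for a position beyond the current chunk, so the verification batch can evaluate one token fewer than was drafted. A minimal, self-contained sketch of that accounting follows; all names and token values in it are illustrative, not koboldcpp's actual API.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    // Pretend the draft model proposed this chunk (speculative_chunk_amt = 12 in the patch).
    std::vector<int32_t> drafted_ids = {5, 9, 2, 7, 7, 3, 1, 8, 4, 6, 0, 2};

    // Mirrors the patched code: the main model's verification batch drops
    // the last drafted token, so it decodes N-1 tokens to verify N.
    std::vector<int32_t> real_embd = drafted_ids;
    real_embd.pop_back();
    std::printf("drafted %zu tokens, main model decodes %zu\n",
                drafted_ids.size(), real_embd.size());

    // Toy stand-in for what the main model would sample at each position;
    // in reality this comes from its logits after llama_decode.
    std::vector<int32_t> true_next_token = {5, 9, 2, 7, 1, 0, 0, 0, 0, 0, 0, 0};

    // Accept drafted tokens until the first disagreement; the remainder
    // of the chunk is discarded and generation resumes from there.
    size_t accepted = 0;
    while (accepted < drafted_ids.size() &&
           drafted_ids[accepted] == true_next_token[accepted])
    {
        ++accepted;
    }
    std::printf("accepted %zu of %zu drafted tokens\n",
                accepted, drafted_ids.size());
    return 0;
}
```

Under these assumptions, each chunk saves one main-model decode position regardless of how many drafted tokens end up accepted, which is why the batch built from `real_embd` rather than `drafted_ids` is safe.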