Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-10 17:14:36 +00:00)
speculative decoding initial impl completed (+6 squashed commits)
Squashed commit:
[0a6306ca0] draft wip dont use (will be squashed)
[a758a1c9c] wip dont use (will be squashed)
[e1994d3ce] wip dont use
[f59690d68] wip
[77228147d] wip on spec decoding. dont use yet
[2445bca54] wip adding speculative decoding (+1 squashed commits)

Squashed commits:
[50e341bb7] wip adding speculative decoding
Parent: b9e99c69e8
Commit: f75bbb945f
9 changed files with 539 additions and 280 deletions
koboldcpp.py (53 changes)
@@ -131,6 +131,7 @@ class load_model_inputs(ctypes.Structure):
                 ("model_filename", ctypes.c_char_p),
                 ("lora_filename", ctypes.c_char_p),
                 ("lora_base", ctypes.c_char_p),
+                ("draftmodel_filename", ctypes.c_char_p),
                 ("mmproj_filename", ctypes.c_char_p),
                 ("use_mmap", ctypes.c_bool),
                 ("use_mlock", ctypes.c_bool),
@@ -672,24 +673,27 @@ def read_gguf_metadata(file_path):
     except Exception as ex:
         return None
 
-def extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath):
+def extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,draftmodelpath):
     global modelfile_extracted_meta
     modelfile_extracted_meta = None
     sdfsize = 0
     whisperfsize = 0
     mmprojsize = 0
+    draftmodelsize = 0
     if sdfilepath and os.path.exists(sdfilepath):
         sdfsize = os.path.getsize(sdfilepath)
     if whisperfilepath and os.path.exists(whisperfilepath):
         whisperfsize = os.path.getsize(whisperfilepath)
     if mmprojfilepath and os.path.exists(mmprojfilepath):
         mmprojsize = os.path.getsize(mmprojfilepath)
+    if draftmodelpath and os.path.exists(draftmodelpath):
+        draftmodelsize = os.path.getsize(draftmodelpath)
     if filepath and os.path.exists(filepath):
         try:
             fsize = os.path.getsize(filepath)
             if fsize>10000000: #dont bother with models < 10mb as they are probably bad
                 ggufmeta = read_gguf_metadata(filepath)
-                modelfile_extracted_meta = [ggufmeta,fsize,sdfsize,whisperfsize,mmprojsize] #extract done. note that meta may be null
+                modelfile_extracted_meta = [ggufmeta,fsize,sdfsize,whisperfsize,mmprojsize,draftmodelsize] #extract done. note that meta may be null
         except Exception as ex:
             modelfile_extracted_meta = None
 
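
For orientation, a self-contained sketch of what the extended helper now collects. The _sketch suffix, file names, and stubbed GGUF parsing below are illustrative only, not part of the commit:

import os

def extract_modelfile_params_sketch(filepath, sdfilepath, whisperfilepath, mmprojfilepath, draftmodelpath):
    # Mirrors the shape of the updated helper: collect file sizes for every
    # optional model, with the draft model size appended as the sixth element.
    def size_of(path):
        return os.path.getsize(path) if path and os.path.exists(path) else 0
    ggufmeta = None  # the real code calls read_gguf_metadata(filepath); may be None
    return [ggufmeta, size_of(filepath), size_of(sdfilepath),
            size_of(whisperfilepath), size_of(mmprojfilepath), size_of(draftmodelpath)]

meta = extract_modelfile_params_sketch("main-model.gguf", "", "", "", "draft-model.gguf")
print(meta[5])  # draft model size in bytes (0 here, since the example paths do not exist)
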
@@ -702,7 +706,7 @@ def autoset_gpu_layers(ctxsize,sdquanted,bbs): #shitty algo to determine how man
     if showusedmemwarning and usedmem > (2.5*1024*1024*1024):
         showusedmemwarning = False
         print(f"Note: KoboldCpp has detected that a significant amount of GPU VRAM ({usedmem/1024/1024} MB) is currently used by another application.\nFor best results, you may wish to close that application and then restart KoboldCpp.\n***")
-    reservedmem = max(1.5*1024*1024*1024,(0.5*1024*1024*1024 + usedmem)) # determine vram overhead
+    reservedmem = max(1.3*1024*1024*1024,(0.5*1024*1024*1024 + usedmem)) # determine vram overhead
     try:
         if not modelfile_extracted_meta:
             return 0
@@ -719,6 +723,9 @@ def autoset_gpu_layers(ctxsize,sdquanted,bbs): #shitty algo to determine how man
             mem -= 350*1024*1024
         if modelfile_extracted_meta[4] > 1024*1024*10: #mmproj tax
             mem -= 350*1024*1024
+        if modelfile_extracted_meta[5] > 1024*1024*10: #draft model tax
+            mem -= (modelfile_extracted_meta[5] * 1.5)
         mem = 0 if mem < 0 else mem
 
         csmul = 1.0
         if cs:
@@ -732,8 +739,8 @@ def autoset_gpu_layers(ctxsize,sdquanted,bbs): #shitty algo to determine how man
             headcount = ggufmeta[1]
             headkvlen = (ggufmeta[2] if ggufmeta[2] > 0 else 128)
             ratio = (mem-usedmem)/(fsize*csmul*1.6*(1.0 if bbs <= 512 else 1.2))
-            computemem = layers*(4 if bbs <= 512 else (bbs/128))*headkvlen*cs*4*1.5 # apply blasbatchsize calculations if over 512
-            contextmem = layers*headcount*headkvlen*cs*4*1.1
+            computemem = layers*(4 if bbs <= 512 else (bbs/128))*headkvlen*cs*4*1.55 # apply blasbatchsize calculations if over 512
+            contextmem = layers*headcount*headkvlen*cs*4*1.15
             if headcount > 0:
                 ratio = max(ratio, (mem - reservedmem - computemem) / (fsize + contextmem))
             layerlimit = min(int(ratio*layers), (layers + 3))
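
As a rough illustration of the arithmetic these two hunks retune (the 1.3 GB reserve and the 1.55/1.15 fudge factors), a standalone numeric sketch follows. Every input value is invented, and the real function first subtracts the sd/whisper/mmproj/draft "taxes" shown above before reaching this point:

# Illustrative inputs only; in the real code these come from GGUF metadata and GPU probing.
layers, headcount, headkvlen = 32, 32, 128
fsize = 4.5 * 1024**3                         # main model file size in bytes
cs, csmul, bbs = 4096, 1.0, 512               # context size, context multiplier, blas batch size
mem, usedmem = 11.5 * 1024**3, 0.5 * 1024**3  # available VRAM and VRAM used by other apps
reservedmem = max(1.3 * 1024**3, 0.5 * 1024**3 + usedmem)  # the new 1.3 GB floor

ratio = (mem - usedmem) / (fsize * csmul * 1.6 * (1.0 if bbs <= 512 else 1.2))
computemem = layers * (4 if bbs <= 512 else (bbs / 128)) * headkvlen * cs * 4 * 1.55
contextmem = layers * headcount * headkvlen * cs * 4 * 1.15
if headcount > 0:
    ratio = max(ratio, (mem - reservedmem - computemem) / (fsize + contextmem))
layerlimit = min(int(ratio * layers), layers + 3)
print(layerlimit)  # 35 with these numbers, i.e. offload everything
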
@@ -877,6 +884,7 @@ def load_model(model_filename):
         if len(args.lora) > 1:
             inputs.lora_base = args.lora[1].encode("UTF-8")
 
+    inputs.draftmodel_filename = args.draftmodel.encode("UTF-8") if args.draftmodel else "".encode("UTF-8")
     inputs.mmproj_filename = args.mmproj.encode("UTF-8") if args.mmproj else "".encode("UTF-8")
     inputs.use_smartcontext = args.smartcontext
     inputs.use_contextshift = (0 if args.noshift else 1)
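
The new field follows the same optional-string convention as its neighbours: an unset argument is passed as an empty UTF-8 string rather than a NULL pointer. A trimmed-down sketch of the pattern (the two-field structure and variable names here are illustrative):

import ctypes

class load_model_inputs_sketch(ctypes.Structure):
    # Only two of the many fields, just to show the c_char_p plumbing.
    _fields_ = [("model_filename", ctypes.c_char_p),
                ("draftmodel_filename", ctypes.c_char_p)]

draftmodel = None  # stands in for args.draftmodel when --draftmodel was not given
inputs = load_model_inputs_sketch()
inputs.model_filename = "main-model.gguf".encode("UTF-8")
inputs.draftmodel_filename = draftmodel.encode("UTF-8") if draftmodel else "".encode("UTF-8")
print(inputs.draftmodel_filename)  # b'' -> the C side sees an empty string, not NULL
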
@@ -1510,15 +1518,18 @@ ws ::= | " " | "\n" [ \t]{0,20}
     elif api_format==6:
         detokstr = ""
         tokids = genparams.get('context', [])
+        adapter_obj = {} if chatcompl_adapter is None else chatcompl_adapter
+        user_message_start = adapter_obj.get("user_start", "\n\n### Instruction:\n")
+        assistant_message_start = adapter_obj.get("assistant_start", "\n\n### Response:\n")
         try:
             detokstr = detokenize_ids(tokids)
         except Exception as e:
             utfprint("Ollama Context Error: " + str(e))
         ollamasysprompt = genparams.get('system', "")
-        ollamabodyprompt = detokstr + "\n\n### Instruction:\n" + genparams.get('prompt', "") + "\n\n### Response:\n"
+        ollamabodyprompt = f"{detokstr}{user_message_start}{genparams.get('prompt', '')}{assistant_message_start}"
         genparams["stop_sequence"] = genparams.get('stop', [])
-        genparams["stop_sequence"].append("\n### Instruction:")
-        genparams["stop_sequence"].append("\n### Response:")
+        genparams["stop_sequence"].append(user_message_start.strip())
+        genparams["stop_sequence"].append(assistant_message_start.strip())
         genparams["trim_stop"] = True
         genparams["ollamasysprompt"] = ollamasysprompt
         genparams["ollamabodyprompt"] = ollamabodyprompt
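
To make the adapter indirection concrete, here is a small runnable sketch of the prompt assembly in this hunk. The adapter dict, detokenized context, and prompt text are invented; the fallback strings match the Alpaca-style defaults above:

chatcompl_adapter = {"user_start": "<|user|>\n", "assistant_start": "<|assistant|>\n"}  # example adapter

adapter_obj = {} if chatcompl_adapter is None else chatcompl_adapter
user_message_start = adapter_obj.get("user_start", "\n\n### Instruction:\n")
assistant_message_start = adapter_obj.get("assistant_start", "\n\n### Response:\n")

detokstr = "Previously detokenized Ollama context."  # stand-in for detokenize_ids(tokids)
prompt = "Write a haiku about rivers."
ollamabodyprompt = f"{detokstr}{user_message_start}{prompt}{assistant_message_start}"
stop_sequence = [user_message_start.strip(), assistant_message_start.strip()]
print(ollamabodyprompt)
print(stop_sequence)  # ['<|user|>', '<|assistant|>']
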
@@ -2374,9 +2385,8 @@ Enter Prompt:<br>
             return
 
         is_quiet = args.quiet
-        utfprint(f"\n{datetime.now().strftime('[%H:%M:%S] Input Received')}")
         if (args.debugmode != -1 and not is_quiet) or args.debugmode >= 1:
-            utfprint(f"Input: " + json.dumps(genparams))
+            utfprint(f"\nInput: " + json.dumps(genparams))
 
         if args.foreground:
             bring_terminal_to_foreground()
@@ -2751,6 +2761,7 @@ def show_gui():
     lora_base_var = ctk.StringVar()
     preloadstory_var = ctk.StringVar()
     mmproj_var = ctk.StringVar()
+    draftmodel_var = ctk.StringVar()
     nomodel = ctk.IntVar(value=0)
 
     port_var = ctk.StringVar(value=defaultport)
@@ -2929,7 +2940,8 @@ def show_gui():
         sdfilepath = sd_model_var.get()
         whisperfilepath = whisper_model_var.get()
         mmprojfilepath = mmproj_var.get()
-        extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath)
+        draftmodelpath = draftmodel_var.get()
+        extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,draftmodelpath)
         changed_gpulayers_estimate()
         pass
 
@@ -3234,18 +3246,20 @@ def show_gui():
     makefileentry(model_tab, "Text Lora:", "Select Lora File",lora_var, 3,width=280,tooltiptxt="Select an optional GGML LoRA adapter to use.\nLeave blank to skip.")
     makefileentry(model_tab, "Text Lora Base:", "Select Lora Base File", lora_base_var, 5,width=280,tooltiptxt="Select an optional F16 GGML LoRA base file to use.\nLeave blank to skip.")
     makefileentry(model_tab, "Vision mmproj:", "Select Vision mmproj File", mmproj_var, 7,width=280,tooltiptxt="Select a mmproj file to use for vision models like LLaVA.\nLeave blank to skip.")
-    makefileentry(model_tab, "Preloaded Story:", "Select Preloaded Story File", preloadstory_var, 9,width=280,tooltiptxt="Select an optional KoboldAI JSON savefile \nto be served on launch to any client.")
-    makefileentry(model_tab, "ChatCompletions Adapter:", "Select ChatCompletions Adapter File", chatcompletionsadapter_var, 12, width=250, filetypes=[("JSON Adapter", "*.json")], tooltiptxt="Select an optional ChatCompletions Adapter JSON file to force custom instruct tags.")
+    makefileentry(model_tab, "Speculative Model:", "Select Draft Text Model File", draftmodel_var, 9,width=280,tooltiptxt="Select a draft text model file to use for speculative decoding.\nLeave blank to skip.")
+    makefileentry(model_tab, "Preloaded Story:", "Select Preloaded Story File", preloadstory_var, 11,width=280,tooltiptxt="Select an optional KoboldAI JSON savefile \nto be served on launch to any client.")
+    makefileentry(model_tab, "ChatCompletions Adapter:", "Select ChatCompletions Adapter File", chatcompletionsadapter_var, 14, width=250, filetypes=[("JSON Adapter", "*.json")], tooltiptxt="Select an optional ChatCompletions Adapter JSON file to force custom instruct tags.")
     def pickpremadetemplate():
         initialDir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'kcpp_adapters')
         initialDir = initialDir if os.path.isdir(initialDir) else None
         fnam = askopenfilename(title="Pick Premade ChatCompletions Adapter",filetypes=[("JSON Adapter", "*.json")], initialdir=initialDir)
         if fnam:
             chatcompletionsadapter_var.set(fnam)
-    ctk.CTkButton(model_tab, 64, text="Pick Premade", command=pickpremadetemplate).grid(row=13, column=0, padx=322, stick="nw")
+    ctk.CTkButton(model_tab, 64, text="Pick Premade", command=pickpremadetemplate).grid(row=15, column=0, padx=322, stick="nw")
 
     mmproj_var.trace("w", gui_changed_modelfile)
-    makecheckbox(model_tab, "Allow Launch Without Models", nomodel, 15, tooltiptxt="Allows running the WebUI with no model loaded.")
+    draftmodel_var.trace("w", gui_changed_modelfile)
+    makecheckbox(model_tab, "Allow Launch Without Models", nomodel, 17, tooltiptxt="Allows running the WebUI with no model loaded.")
 
     # Network Tab
     network_tab = tabcontent["Network"]
@@ -3489,6 +3503,7 @@ def show_gui():
         except Exception as ex2:
             pass
     args.mmproj = None if mmproj_var.get() == "" else mmproj_var.get()
+    args.draftmodel = None if draftmodel_var.get() == "" else draftmodel_var.get()
 
     args.ssl = None if (ssl_cert_var.get() == "" or ssl_key_var.get() == "") else ([ssl_cert_var.get(), ssl_key_var.get()])
     args.password = None if (password_var.get() == "") else (password_var.get())
@@ -3649,6 +3664,7 @@ def show_gui():
         lora_var.set(dict["lora"][0])
 
     mmproj_var.set(dict["mmproj"] if ("mmproj" in dict and dict["mmproj"]) else "")
+    draftmodel_var.set(dict["draftmodel"] if ("draftmodel" in dict and dict["draftmodel"]) else "")
 
     ssl_cert_var.set("")
     ssl_key_var.set("")
@@ -4442,6 +4458,10 @@ def main(launch_args,start_server=True):
         dlfile = download_model_from_url(args.whispermodel,[".gguf",".bin"])
         if dlfile:
             args.whispermodel = dlfile
+    if args.draftmodel and args.draftmodel!="":
+        dlfile = download_model_from_url(args.draftmodel,[".gguf"])
+        if dlfile:
+            args.draftmodel = dlfile
 
     # sanitize and replace the default vanity name. remember me....
     if args.model_param and args.model_param!="":
@@ -4517,7 +4537,7 @@ def main(launch_args,start_server=True):
         pass
     if args.gpulayers==-1:
         if MaxMemory[0] > 0 and (not args.usecpu) and ((args.usecublas is not None) or (args.usevulkan is not None) or (args.useclblast is not None) or sys.platform=="darwin"):
-            extract_modelfile_params(args.model_param,args.sdmodel,args.whispermodel,args.mmproj)
+            extract_modelfile_params(args.model_param,args.sdmodel,args.whispermodel,args.mmproj,args.draftmodel)
             layeramt = autoset_gpu_layers(args.contextsize,args.sdquant,args.blasbatchsize)
             print(f"Auto Recommended GPU Layers: {layeramt}")
             args.gpulayers = layeramt
@@ -4923,6 +4943,7 @@ if __name__ == '__main__':
     advparser.add_argument("--ssl", help="Allows all content to be served over SSL instead. A valid UNENCRYPTED SSL cert and key .pem files must be provided", metavar=('[cert_pem]', '[key_pem]'), nargs='+')
     advparser.add_argument("--nocertify", help="Allows insecure SSL connections. Use this if you have cert errors and need to bypass certificate restrictions.", action='store_true')
     advparser.add_argument("--mmproj", help="Select a multimodal projector file for vision models like LLaVA.", default="")
+    advparser.add_argument("--draftmodel", help="Load a small draft model for speculative decoding. It will be fully offloaded. Vocab must match the main model.", default="")
     advparser.add_argument("--password", help="Enter a password required to use this instance. This key will be required for all text endpoints. Image endpoints are not secured.", default=None)
     advparser.add_argument("--ignoremissing", help="Ignores all missing non-essential files, just skipping them instead.", action='store_true')
     advparser.add_argument("--chatcompletionsadapter", help="Select an optional ChatCompletions Adapter JSON file to force custom instruct tags.", default="")
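
A reduced sketch of how the new flag parses next to --mmproj; the parser below is a stand-in for the real advparser, and the file name is only an example (a launch might look like "python koboldcpp.py --model main.gguf --draftmodel draft.gguf", with the draft model's vocab matching the main model):

import argparse

# Stand-in parser, not the real one; defaults mirror the two lines above.
parser = argparse.ArgumentParser()
parser.add_argument("--mmproj", help="multimodal projector file for vision models", default="")
parser.add_argument("--draftmodel", help="small draft model for speculative decoding", default="")

args = parser.parse_args(["--draftmodel", "draft-model-q4.gguf"])  # example invocation
print(args.draftmodel)  # later encoded to UTF-8 and handed to the C side as draftmodel_filename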