Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-10 17:14:36 +00:00)
Commit 7d8aa31f1f (parent 8780b33c64)
fixed embeddings, added new parameter to limit max embeddings context
6 changed files with 360 additions and 9 deletions
koboldcpp.py (13 changes)

@@ -56,7 +56,7 @@ logit_bias_max = 512
 dry_seq_break_max = 128
 
 # global vars
-KcppVersion = "1.93.1"
+KcppVersion = "1.93.2"
 showdebug = True
 kcpp_instance = None #global running instance
 global_memory = {"tunnel_url": "", "restart_target":"", "input_to_exit":False, "load_complete":False}
@@ -350,6 +350,7 @@ class embeddings_load_model_inputs(ctypes.Structure):
                 ("gpulayers", ctypes.c_int),
                 ("flash_attention", ctypes.c_bool),
                 ("use_mmap", ctypes.c_bool),
+                ("embeddingsmaxctx", ctypes.c_int),
                 ("quiet", ctypes.c_bool),
                 ("debugmode", ctypes.c_int)]
 
@@ -1742,6 +1743,7 @@ def embeddings_load_model(model_filename):
     inputs.flash_attention = False
     inputs.threads = args.threads
     inputs.use_mmap = args.usemmap
+    inputs.embeddingsmaxctx = args.embeddingsmaxctx
     inputs = set_backend_props(inputs)
     ret = handle.embeddings_load_model(inputs)
     return ret
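
Taken together, the two hunks above carry the new setting across the Python/native boundary: a ctypes.c_int field is added to the input struct (position matters, since _fields_ must mirror the native struct's member order), and the loader copies args.embeddingsmaxctx into it before the native call. A minimal sketch of that flow, with unrelated fields and plumbing elided (load_embeddings and its arguments are illustrative; the native side lives in the other changed files, which this page does not show):

import ctypes

class embeddings_load_model_inputs(ctypes.Structure):
    # abbreviated: the new field plus its immediate neighbors from the hunk above
    _fields_ = [("use_mmap", ctypes.c_bool),
                ("embeddingsmaxctx", ctypes.c_int),  # 0 = keep the model's trained context
                ("quiet", ctypes.c_bool)]

def load_embeddings(handle, args):
    # condensed view of the embeddings_load_model() path after this commit
    inputs = embeddings_load_model_inputs()
    inputs.use_mmap = args.usemmap
    inputs.embeddingsmaxctx = args.embeddingsmaxctx  # new: optional context cap
    return handle.embeddings_load_model(inputs)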
@@ -4245,6 +4247,7 @@ def show_gui():
     ttsmaxlen_var = ctk.StringVar(value=str(default_ttsmaxlen))
 
     embeddings_model_var = ctk.StringVar()
+    embeddings_ctx_var = ctk.StringVar(value=str(""))
 
     admin_var = ctk.IntVar(value=0)
     admin_dir_var = ctk.StringVar()
@@ -4852,7 +4855,8 @@ def show_gui():
     makelabelentry(model_tab, "Draft Amount: ", draftamount_var, 13, 50,padx=100,singleline=True,tooltip="How many tokens to draft per chunk before verifying results")
     makelabelentry(model_tab, "Splits: ", draftgpusplit_str_vars, 13, 50,padx=210,singleline=True,tooltip="Distribution of draft model layers. Leave blank to follow main model's gpu split. Only works if multi-gpu (All) selected in main model.", labelpadx=160)
     makelabelentry(model_tab, "Layers: ", draftgpulayers_var, 13, 50,padx=320,singleline=True,tooltip="How many layers to GPU offload for the draft model", labelpadx=270)
-    makefileentry(model_tab, "Embeds Model:", "Select Embeddings Model File", embeddings_model_var, 15, width=280,singlerow=True, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select an embeddings GGUF model that can be used to generate embedding vectors.")
+    makefileentry(model_tab, "Embeds Model:", "Select Embeddings Model File", embeddings_model_var, 15, width=160,singlerow=True, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select an embeddings GGUF model that can be used to generate embedding vectors.")
+    makelabelentry(model_tab, "EmbdCtx: ", embeddings_ctx_var, 15, 50,padx=390,singleline=True,tooltip="If set above 0, limits max context for embedding model to save memory.", labelpadx=330)
     makefileentry(model_tab, "Preload Story:", "Select Preloaded Story File", preloadstory_var, 17,width=280,singlerow=True,tooltiptxt="Select an optional KoboldAI JSON savefile \nto be served on launch to any client.")
     makefileentry(model_tab, "SaveData File:", "Select or Create New SaveData Database File", savedatafile_var, 19,width=280,filetypes=[("KoboldCpp SaveDB", "*.jsondb")],singlerow=True,dialog_type=1,tooltiptxt="Selecting a file will allow data to be loaded and saved persistently to this KoboldCpp server remotely. File is created if it does not exist.")
     makefileentry(model_tab, "ChatCompletions Adapter:", "Select ChatCompletions Adapter File", chatcompletionsadapter_var, 24, width=250, filetypes=[("JSON Adapter", "*.json")], tooltiptxt="Select an optional ChatCompletions Adapter JSON file to force custom instruct tags.")
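
(Note that the existing Embeds Model picker on row 15 narrows from width=280 to width=160 so that the new EmbdCtx entry can share the same row beside it.)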
@@ -5207,6 +5211,9 @@ def show_gui():
         if embeddings_model_var.get() != "":
             args.embeddingsmodel = embeddings_model_var.get()
+
+        if embeddings_ctx_var.get() != "":
+            args.embeddingsmaxctx = (0 if embeddings_ctx_var.get()=="" else int(embeddings_ctx_var.get()))
 
         if tts_model_var.get() != "" and wavtokenizer_var.get() != "":
             args.ttsthreads = (0 if tts_threads_var.get()=="" else int(tts_threads_var.get()))
             args.ttsmodel = tts_model_var.get()
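
One detail worth noting in the hunk above: the assignment is already guarded by embeddings_ctx_var.get() != "", so the 0-arm of the copied ternary can never be taken. A hypothetical equivalent without the dead branch:

s = embeddings_ctx_var.get()
if s != "":
    args.embeddingsmaxctx = int(s)  # the empty-string arm of the original ternary is unreachable inside this guard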
@@ -5401,6 +5408,7 @@ def show_gui():
         ttsmaxlen_var.set(str(dict["ttsmaxlen"]) if ("ttsmaxlen" in dict and dict["ttsmaxlen"]) else str(default_ttsmaxlen))
 
         embeddings_model_var.set(dict["embeddingsmodel"] if ("embeddingsmodel" in dict and dict["embeddingsmodel"]) else "")
+        embeddings_ctx_var.set(str(dict["embeddingsmaxctx"]) if ("embeddingsmaxctx" in dict and dict["embeddingsmaxctx"]) else "")
 
         admin_var.set(dict["admin"] if ("admin" in dict) else 0)
         admin_dir_var.set(dict["admindir"] if ("admindir" in dict and dict["admindir"]) else "")
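
The import path above mirrors the export path: a saved config that contains an embeddingsmaxctx key repopulates the EmbdCtx box on load. As a rough illustration (KoboldCpp saved settings are a JSON dict keyed by argument name; the values here are made up):

{"embeddingsmodel": "embedder.gguf", "embeddingsmaxctx": 1024}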
@@ -7139,6 +7147,7 @@ if __name__ == '__main__':
 
     embeddingsparsergroup = parser.add_argument_group('Embeddings Model Commands')
     embeddingsparsergroup.add_argument("--embeddingsmodel", metavar=('[filename]'), help="Specify an embeddings model to be loaded for generating embedding vectors.", default="")
+    embeddingsparsergroup.add_argument("--embeddingsmaxctx", metavar=('[amount]'), help="Overrides the default maximum supported context of an embeddings model (defaults to trained context).", type=int, default=0)
 
     admingroup = parser.add_argument_group('Administration Commands')
     admingroup.add_argument("--admin", help="Enables admin mode, allowing you to unload and reload different configurations or models.", action='store_true')
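
With the argument group extended, the new limit can also be exercised directly from the command line; for example (the model filename is a placeholder):

python koboldcpp.py --embeddingsmodel embedder.gguf --embeddingsmaxctx 512

Leaving --embeddingsmaxctx at its default of 0 keeps the embedding model's full trained context, per the help text above.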