Pass img_min_params and img_max_params to ctx_clip_params (#2133)

* Pass img_min_params and img_max_params to ctx_clip_params. These values determine the minimum and maximum size (in tokens) of vision embeddings. The default value of -1 uses a model-dependent default size; for example, for Gemma 4 the default is a 280-token embedding. For higher-quality results (at the cost of using more memory and slower speed), you can increase the size of the embedding to 1120 tokens. * Change dict to mydict to match change to method
2026-05-19 16:31:59 +00:00 · 2026-04-15 21:27:06 -07:00 · 2026-04-15 21:27:06 -07:00 · c592bd01da
commit c592bd01da
parent a9f9e9a38b
4 changed files with 22 additions and 2 deletions
--- a/expose.h
+++ b/expose.h
@ -47,6 +47,8 @@ struct load_model_inputs
    const char * mmproj_filename = nullptr;
    const bool mmproj_cpu = false;
    const int visionmaxres = 2048;
+    const int image_min_tokens = -1;
+    const int image_max_tokens = -1;
    const bool use_mmap = false;
    const bool use_mlock = false;
    const bool use_smartcontext = false;
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@ -2159,6 +2159,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
    kcpp_pipeline_parallelism = inputs.pipelineparallel;
    kcpp_data->n_batch = GetBatchSize(inputs.batchsize, in_file_format);
    kcpp_data->n_ubatch = kcpp_data->n_batch;
+    kcpp_data->image_min_tokens = inputs.image_min_tokens;
+    kcpp_data->image_max_tokens = inputs.image_max_tokens;
    if(isGguf && kcpp_pipeline_parallelism)
    {
        //double the logical batch, while keeping the physical batch the same, pipeline parallel set GGML_SCHED_MAX_COPIES to 2
@ -2748,8 +2750,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
            clip_context_params ctx_clip_params {
                /* use_gpu           */ true,
                /* flash_attn_type   */ clip_fa,
-                /* image_min_tokens  */ -1,
-                /* image_max_tokens  */ -1,
+                /* image_min_tokens  */ kcpp_data->image_min_tokens,
+                /* image_max_tokens  */ kcpp_data->image_max_tokens,
            };
            clip_init_result cres = clip_init(mmproj_filename.c_str(), ctx_clip_params);
            clp_ctx_v = cres.ctx_v;
--- a/koboldcpp.py
+++ b/koboldcpp.py
@ -231,6 +231,8 @@ class load_model_inputs(ctypes.Structure):
                ("mmproj_filename", ctypes.c_char_p),
                ("mmproj_cpu", ctypes.c_bool),
                ("visionmaxres", ctypes.c_int),
+                ("image_min_tokens", ctypes.c_int),
+                ("image_max_tokens", ctypes.c_int),
                ("use_mmap", ctypes.c_bool),
                ("use_mlock", ctypes.c_bool),
                ("use_smartcontext", ctypes.c_bool),
@ -1885,6 +1887,8 @@ def load_model(model_filename):
    inputs.mmproj_filename = args.mmproj.encode("UTF-8") if args.mmproj else "".encode("UTF-8")
    inputs.mmproj_cpu = (True if args.mmprojcpu else False)
    inputs.visionmaxres = (512 if args.visionmaxres < 512 else (2048 if args.visionmaxres > 2048 else args.visionmaxres))
+    inputs.image_min_tokens = args.image_min_tokens
+    inputs.image_max_tokens = args.image_max_tokens
    inputs.use_smartcontext = args.smartcontext
    inputs.use_contextshift = (0 if args.noshift else 1)
    inputs.use_fastforward = (0 if args.nofastforward else 1)
@ -7274,6 +7278,8 @@ def show_gui():
    mmproj_var = ctk.StringVar()
    mmprojcpu_var = ctk.IntVar(value=0)
    visionmaxres_var = ctk.StringVar(value=str(default_visionmaxres))
+    image_min_tokens_var = ctk.StringVar(value="-1")
+    image_max_tokens_var = ctk.StringVar(value="-1")
    draftmodel_var = ctk.StringVar()
    draftamount_var = ctk.StringVar(value=str(default_draft_amount))
    draftgpulayers_var = ctk.StringVar(value=str(999))
@ -8419,6 +8425,8 @@ def show_gui():
        args.mmproj = None if mmproj_var.get() == "" else mmproj_var.get()
        args.mmprojcpu = (mmprojcpu_var.get()==1)
        args.visionmaxres = int(visionmaxres_var.get()) if visionmaxres_var.get()!="" else default_visionmaxres
+        args.image_min_tokens = int(image_min_tokens_var.get()) if image_min_tokens.get()!="" else -1
+        args.image_max_tokens = int(image_max_tokens_var.get()) if image_max_tokens.get()!="" else -1
        args.draftmodel = None if draftmodel_var.get() == "" else draftmodel_var.get()
        args.draftamount = int(draftamount_var.get()) if draftamount_var.get()!="" else default_draft_amount
        args.draftgpulayers = int(draftgpulayers_var.get()) if draftgpulayers_var.get()!="" else 999
@ -8688,6 +8696,10 @@ def show_gui():
        mmprojcpu_var.set(1 if ("mmprojcpu" in mydict and mydict["mmprojcpu"]) else 0)
        if "visionmaxres" in mydict and mydict["visionmaxres"]:
            visionmaxres_var.set(mydict["visionmaxres"])
+        if "image_min_tokens" in mydict and mydict["image_min_tokens"]:
+            image_min_tokens_var.set(mydict["image_min_tokens"])
+        if "image_max_tokens" in mydict and mydict["image_max_tokens"]:
+            image_max_tokens_var.set(mydict["image_max_tokens"])
        draftmodel_var.set(mydict["draftmodel"] if ("draftmodel" in mydict and mydict["draftmodel"]) else "")
        if "draftamount" in mydict:
            draftamount_var.set(mydict["draftamount"])
@ -11146,6 +11158,8 @@ if __name__ == '__main__':
    advparser.add_argument("--mmproj", metavar=('[filename]'), help="Select a multimodal projector file for vision models like LLaVA.", default="")
    advparser.add_argument("--mmprojcpu","--no-mmproj-offload", help="Force CLIP for Vision mmproj always on CPU.", action='store_true')
    advparser.add_argument("--visionmaxres", metavar=('[max px]'), help="Clamp MMProj vision maximum allowed resolution. Allowed values are between 512 to 2048 px (default 1024).", type=int, default=default_visionmaxres)
+    advparser.add_argument("--image-min-tokens", metavar=('[tokens]'), help="Override the minimum tokens for the MMProj embedding (default -1).", type=int, default=-1)
+    advparser.add_argument("--image-max-tokens", metavar=('[tokens]'), help="Override the maximum tokens for the MMProj embedding (default -1).", type=int, default=-1)
    advparser.add_argument("--draftmodel","--model-draft","-md", metavar=('[filename]'), help="Load a small draft model for speculative decoding. It will be fully offloaded. Vocab must match the main model.", default="")
    advparser.add_argument("--draftamount","--draft-max","--draft-n", metavar=('[tokens]'), help="How many tokens to draft per chunk before verifying results", type=int, default=default_draft_amount)
    advparser.add_argument("--draftgpulayers","--gpu-layers-draft","--n-gpu-layers-draft","-ngld", metavar=('[layers]'), help="How many layers to offload to GPU for the draft model (default=full offload)", type=int, default=999)
--- a/otherarch/otherarch.h
+++ b/otherarch/otherarch.h
@ -21,6 +21,8 @@ struct kcpp_params {
    int32_t n_ctx                 =     0; // context size
    int32_t n_batch               =  2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_ubatch              =   512; // physical batch size for prompt processing (must be >=32 to use BLAS)
+    int image_min_tokens          =    -1; // Minimum image embedding tokens
+    int image_max_tokens          =    -1; // Maximum image embedding tokens
    int      n_threads                   = -1;
    int      n_blasthreads               = -1;