Increase max images and audio clips to 16; when too many are supplied, keep the ones from the end instead of the beginning

2026-05-19 16:31:59 +00:00 · 2026-03-26 23:03:52 +08:00 · 2026-03-26 23:03:52 +08:00 · c91f350ed5
commit c91f350ed5
parent 4a5c903718
4 changed files with 27 additions and 22 deletions
--- a/embd_res/klite.embd
+++ b/embd_res/klite.embd
@ -13979,9 +13979,9 @@ Current version indicated by LITEVER below.
 										},()=>{
 										});
 									}
-									else if(localflag && koboldcpp_has_musicgen && no_txt_model)
+									else if(localflag && (koboldcpp_has_musicgen||koboldcpp_has_tts) && no_txt_model)
 									{
-										msgboxYesNo("This KoboldCpp instance seems to be running an Music Generation model without any Text Generation model loaded.\n\nWould you like to launch MusicUI (Dedicated Music Generation WebUI bundled with KoboldCpp)?\n\nIf unsure, select 'Yes'.","Launch MusicUI?", ()=>{
+										msgboxYesNo("This KoboldCpp instance seems to be running an Music or Speech Generation model without any Text Generation model loaded.\n\nWould you like to launch MusicUI (Dedicated Audio Generation WebUI bundled with KoboldCpp)?\n\nIf unsure, select 'Yes'.","Launch MusicUI?", ()=>{
 											go_to_musicui();
 										},()=>{
 										});
--- a/expose.h
+++ b/expose.h
@ -2,8 +2,6 @@
 #include <cstdint>

 const int tensor_split_max = 16;
-const int images_max = 8;
-const int audio_max = 4;
 const int logprobs_max = 10;
 const int overridekv_max = 16;

@ -91,8 +89,10 @@ struct generation_inputs
    const char * memory = nullptr;
    const char * negative_prompt = nullptr;
    const float guidance_scale = 1;
-    const char * images[images_max] = {};
-    const char * audio[audio_max] = {};
+    const int images_len = 0;
+    const char ** images = nullptr;
+    const int audio_len = 0;
+    const char ** audio = nullptr;
    const int max_context_length = 0;
    const int max_length = 0;
    const float temperature = 0.0f;
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@ -3648,7 +3648,8 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
    }
    media_objects.clear();
    std::string new_media_composite = "";
-    for(int x=0;x<images_max;++x)
+
+    for(int x=0;x<inputs.images_len;++x)
    {
        std::string item = inputs.images[x];
        if(item!="")
@ -3678,7 +3679,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
            new_media_composite += item;
        }
    }
-    for(int x=0;x<audio_max;++x)
+    for(int x=0;x<inputs.audio_len;++x)
    {
        std::string item = inputs.audio[x];
        if(item!="")
--- a/koboldcpp.py
+++ b/koboldcpp.py
@ -45,8 +45,8 @@ import queue
 # constants
 sampler_order_max = 7
 tensor_split_max = 16
-images_max = 8
-audio_max = 4
+images_max = 16
+audio_max = 16
 bias_min_value = -100.0
 bias_max_value = 100.0
 logprobs_max = 10
@ -251,8 +251,10 @@ class generation_inputs(ctypes.Structure):
                ("memory", ctypes.c_char_p),
                ("negative_prompt", ctypes.c_char_p),
                ("guidance_scale", ctypes.c_float),
-                ("images", ctypes.c_char_p * images_max),
-                ("audio", ctypes.c_char_p * audio_max),
+                ("images_len", ctypes.c_int),
+                ("images", ctypes.POINTER(ctypes.c_char_p)),
+                ("audio_len", ctypes.c_int),
+                ("audio", ctypes.POINTER(ctypes.c_char_p)),
                ("max_context_length", ctypes.c_int),
                ("max_length", ctypes.c_int),
                ("temperature", ctypes.c_float),
@ -1898,16 +1900,18 @@ def generate(genparams, stream_flag=False):
    inputs.memory = memory.encode("UTF-8")
    inputs.negative_prompt = negative_prompt.encode("UTF-8")
    inputs.guidance_scale = guidance_scale
-    for n in range(images_max):
-        if not images or n >= len(images):
-            inputs.images[n] = "".encode("UTF-8")
-        else:
-            inputs.images[n] = images[n].encode("UTF-8")
-    for n in range(audio_max):
-        if not audio or n >= len(audio):
-            inputs.audio[n] = "".encode("UTF-8")
-        else:
-            inputs.audio[n] = audio[n].encode("UTF-8")
+
+    images = images[-images_max:]
+    inputs.images_len = len(images)
+    inputs.images = (ctypes.c_char_p * inputs.images_len)()
+    for n, item in enumerate(images):
+        inputs.images[n] = item.encode("UTF-8")
+    audio = audio[-audio_max:]
+    inputs.audio_len = len(audio)
+    inputs.audio = (ctypes.c_char_p * inputs.audio_len)()
+    for n, item in enumerate(audio):
+        inputs.audio[n] = item.encode("UTF-8")
+
    global showmaxctxwarning
    if max_context_length > maxctx:
        if showmaxctxwarning: