From c91f350ed51acc980651a23ed0df0cfece979e19 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Thu, 26 Mar 2026 23:03:52 +0800
Subject: [PATCH] increase max images, take images from the end instead of
 beginning if too many images

---
 embd_res/klite.embd |  4 ++--
 expose.h            |  8 ++++----
 gpttype_adapter.cpp |  5 +++--
 koboldcpp.py        | 32 ++++++++++++++++++--------------
 4 files changed, 27 insertions(+), 22 deletions(-)
diff --git a/embd_res/klite.embd b/embd_res/klite.embd
index 4c7d60983..f8faad642 100644
--- a/embd_res/klite.embd
+++ b/embd_res/klite.embd
@@ -13979,9 +13979,9 @@ Current version indicated by LITEVER below.
 										},()=>{
 										});
 									}
-									else if(localflag && koboldcpp_has_musicgen && no_txt_model)
+									else if(localflag && (koboldcpp_has_musicgen||koboldcpp_has_tts) && no_txt_model)
 									{
-										msgboxYesNo("This KoboldCpp instance seems to be running an Music Generation model without any Text Generation model loaded.\n\nWould you like to launch MusicUI (Dedicated Music Generation WebUI bundled with KoboldCpp)?\n\nIf unsure, select 'Yes'.","Launch MusicUI?", ()=>{
+										msgboxYesNo("This KoboldCpp instance seems to be running an Music or Speech Generation model without any Text Generation model loaded.\n\nWould you like to launch MusicUI (Dedicated Audio Generation WebUI bundled with KoboldCpp)?\n\nIf unsure, select 'Yes'.","Launch MusicUI?", ()=>{
 											go_to_musicui();
 										},()=>{
 										});
diff --git a/expose.h b/expose.h
index 5ac536e60..a70012d06 100644
--- a/expose.h
+++ b/expose.h
@@ -2,8 +2,6 @@
 #include <cstdint>
 
 const int tensor_split_max = 16;
-const int images_max = 8;
-const int audio_max = 4;
 const int logprobs_max = 10;
 const int overridekv_max = 16;
 
@@ -91,8 +89,10 @@ struct generation_inputs
     const char * memory = nullptr;
     const char * negative_prompt = nullptr;
     const float guidance_scale = 1;
-    const char * images[images_max] = {};
-    const char * audio[audio_max] = {};
+    const int images_len = 0;
+    const char ** images = nullptr;
+    const int audio_len = 0;
+    const char ** audio = nullptr;
     const int max_context_length = 0;
     const int max_length = 0;
     const float temperature = 0.0f;
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 3c1f65855..dcecdc9b6 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -3648,7 +3648,8 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     }
     media_objects.clear();
     std::string new_media_composite = "";
-    for(int x=0;x<images_max;++x)
+
+    for(int x=0;x<inputs.images_len;++x)
     {
         std::string item = inputs.images[x];
         if(item!="")
@@ -3678,7 +3679,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             new_media_composite += item;
         }
     }
-    for(int x=0;x<audio_max;++x)
+    for(int x=0;x<inputs.audio_len;++x)
     {
         std::string item = inputs.audio[x];
         if(item!="")
diff --git a/koboldcpp.py b/koboldcpp.py
index c3737f083..12bd8e68e 100755
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -45,8 +45,8 @@ import queue
 # constants
 sampler_order_max = 7
 tensor_split_max = 16
-images_max = 8
-audio_max = 4
+images_max = 16
+audio_max = 16
 bias_min_value = -100.0
 bias_max_value = 100.0
 logprobs_max = 10
@@ -251,8 +251,10 @@ class generation_inputs(ctypes.Structure):
                 ("memory", ctypes.c_char_p),
                 ("negative_prompt", ctypes.c_char_p),
                 ("guidance_scale", ctypes.c_float),
-                ("images", ctypes.c_char_p * images_max),
-                ("audio", ctypes.c_char_p * audio_max),
+                ("images_len", ctypes.c_int),
+                ("images", ctypes.POINTER(ctypes.c_char_p)),
+                ("audio_len", ctypes.c_int),
+                ("audio", ctypes.POINTER(ctypes.c_char_p)),
                 ("max_context_length", ctypes.c_int),
                 ("max_length", ctypes.c_int),
                 ("temperature", ctypes.c_float),
@@ -1898,16 +1900,18 @@ def generate(genparams, stream_flag=False):
     inputs.memory = memory.encode("UTF-8")
     inputs.negative_prompt = negative_prompt.encode("UTF-8")
     inputs.guidance_scale = guidance_scale
-    for n in range(images_max):
-        if not images or n >= len(images):
-            inputs.images[n] = "".encode("UTF-8")
-        else:
-            inputs.images[n] = images[n].encode("UTF-8")
-    for n in range(audio_max):
-        if not audio or n >= len(audio):
-            inputs.audio[n] = "".encode("UTF-8")
-        else:
-            inputs.audio[n] = audio[n].encode("UTF-8")
+
+    images = (images or [])[-images_max:]  # tolerate None/empty (as the old loop did); when over the cap keep the LAST images_max entries
+    inputs.images_len = len(images)
+    inputs.images = (ctypes.c_char_p * inputs.images_len)()
+    for n, item in enumerate(images):
+        inputs.images[n] = item.encode("UTF-8")
+    audio = (audio or [])[-audio_max:]  # same None-safe, keep-the-tail policy for audio
+    inputs.audio_len = len(audio)
+    inputs.audio = (ctypes.c_char_p * inputs.audio_len)()
+    for n, item in enumerate(audio):
+        inputs.audio[n] = item.encode("UTF-8")
+
     global showmaxctxwarning
     if max_context_length > maxctx:
         if showmaxctxwarning: