From c91f350ed51acc980651a23ed0df0cfece979e19 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Thu, 26 Mar 2026 23:03:52 +0800 Subject: [PATCH] increase max images, take images from the end instead of beginning if too many images --- embd_res/klite.embd | 4 ++-- expose.h | 8 ++++---- gpttype_adapter.cpp | 5 +++-- koboldcpp.py | 32 ++++++++++++++++++-------------- 4 files changed, 27 insertions(+), 22 deletions(-) diff --git a/embd_res/klite.embd b/embd_res/klite.embd index 4c7d60983..f8faad642 100644 --- a/embd_res/klite.embd +++ b/embd_res/klite.embd @@ -13979,9 +13979,9 @@ Current version indicated by LITEVER below. },()=>{ }); } - else if(localflag && koboldcpp_has_musicgen && no_txt_model) + else if(localflag && (koboldcpp_has_musicgen||koboldcpp_has_tts) && no_txt_model) { - msgboxYesNo("This KoboldCpp instance seems to be running an Music Generation model without any Text Generation model loaded.\n\nWould you like to launch MusicUI (Dedicated Music Generation WebUI bundled with KoboldCpp)?\n\nIf unsure, select 'Yes'.","Launch MusicUI?", ()=>{ + msgboxYesNo("This KoboldCpp instance seems to be running an Music or Speech Generation model without any Text Generation model loaded.\n\nWould you like to launch MusicUI (Dedicated Audio Generation WebUI bundled with KoboldCpp)?\n\nIf unsure, select 'Yes'.","Launch MusicUI?", ()=>{ go_to_musicui(); },()=>{ }); diff --git a/expose.h b/expose.h index 5ac536e60..a70012d06 100644 --- a/expose.h +++ b/expose.h @@ -2,8 +2,6 @@ #include const int tensor_split_max = 16; -const int images_max = 8; -const int audio_max = 4; const int logprobs_max = 10; const int overridekv_max = 16; @@ -91,8 +89,10 @@ struct generation_inputs const char * memory = nullptr; const char * negative_prompt = nullptr; const float guidance_scale = 1; - const char * images[images_max] = {}; - const char * audio[audio_max] = {}; + const int images_len = 0; + const char ** images = nullptr; + const int audio_len = 0; + const char ** audio = nullptr; const int max_context_length = 0; const int max_length = 0; const float temperature = 0.0f; diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 3c1f65855..dcecdc9b6 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -3648,7 +3648,8 @@ generation_outputs gpttype_generate(const generation_inputs inputs) } media_objects.clear(); std::string new_media_composite = ""; - for(int x=0;x= len(images): - inputs.images[n] = "".encode("UTF-8") - else: - inputs.images[n] = images[n].encode("UTF-8") - for n in range(audio_max): - if not audio or n >= len(audio): - inputs.audio[n] = "".encode("UTF-8") - else: - inputs.audio[n] = audio[n].encode("UTF-8") + + images = images[-images_max:] + inputs.images_len = len(images) + inputs.images = (ctypes.c_char_p * inputs.images_len)() + for n, item in enumerate(images): + inputs.images[n] = item.encode("UTF-8") + audio = audio[-audio_max:] + inputs.audio_len = len(audio) + inputs.audio = (ctypes.c_char_p * inputs.audio_len)() + for n, item in enumerate(audio): + inputs.audio[n] = item.encode("UTF-8") + global showmaxctxwarning if max_context_length > maxctx: if showmaxctxwarning: