split audio and vision detection separately

Concedo 2025-07-13 17:47:15 +08:00
parent 0938af7c83
commit 811463a704
5 changed files with 82 additions and 21 deletions

View file

@@ -268,7 +268,10 @@ extern "C"
bool has_finished() {
return generation_finished;
}
bool has_audio_support()
{
return audio_multimodal_supported;
}
float get_last_eval_time() {
return last_eval_time;
}
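
The new accessor is exported through the C ABI so the launcher can poll it after a model load. Below is a minimal calling-side sketch in Python, mirroring the ctypes binding this commit adds further down; the library filename is illustrative, not from this diff.

import ctypes

# Minimal sketch: bind and call the new accessor the way this commit's
# Python changes do (handle.has_audio_support.restype = ctypes.c_bool).
# "koboldcpp_default.so" is an illustrative library name.
handle = ctypes.CDLL("./koboldcpp_default.so")
handle.has_audio_support.restype = ctypes.c_bool

def model_has_audio() -> bool:
    # Only meaningful after load: gpttype_load_model clears the flag on
    # entry and sets it when an audio-capable projector is detected.
    return bool(handle.has_audio_support())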

View file

@@ -287,6 +287,7 @@ extern std::string mmproj_filename;
extern std::string draftmodel_filename;
extern std::vector<std::string> generated_tokens;
extern bool generation_finished;
extern bool audio_multimodal_supported;
extern float last_eval_time;
extern float last_process_time;
extern int last_token_count;

View file

@@ -57,6 +57,7 @@ std::string mmproj_filename = "";
std::string draftmodel_filename = "";
int speculative_chunk_amt = 8; //do it in chunks of this many tokens
bool generation_finished;
bool audio_multimodal_supported = false;
float last_process_time = 0;
float last_eval_time = 0;
int last_token_count = 0;
@@ -1980,6 +1981,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
debugmode = inputs.debugmode;
draft_ctx = nullptr;
guidance_ctx = nullptr;
audio_multimodal_supported = false;
auto clamped_max_context_length = inputs.max_context_length;
@@ -2459,6 +2461,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
// TODO @ngxson : check if model n_mel is 128 or 80
w_filters = whisper_precalc_filters::get_128_bins();
}
audio_multimodal_supported = true;
}
clp_img_data = clip_image_u8_init();
}
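
The detection itself is a two-step flip: the flag is cleared at the start of every load, then set only inside the branch that initializes whisper-style mel filters for the projector. A condensed restatement of that flow in Python, using illustrative names rather than the real C++ symbols:

# Illustrative sketch of the control flow; not the real C++ symbols.
audio_multimodal_supported = False      # cleared on every (re)load

def on_projector_loaded(has_audio_encoder: bool) -> None:
    global audio_multimodal_supported
    if has_audio_encoder:
        # in the diff, this is the branch that picks the whisper
        # mel filter bank (80 vs 128 bins is still a TODO upstream)
        audio_multimodal_supported = True
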
@@ -3057,7 +3060,7 @@ static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_sep
}
else
{
printf("\nWarning: LLAVA Image excluded - Context size too low or not enough clip tokens! (needed %d)\n",cliptokensneeded);
printf("\nWarning: Vision Image excluded - Context size too low or not enough clip tokens! (needed %d)\nImage will be IGNORED! You probably want to relaunch with a larger context size!\n",cliptokensneeded);
}
media_objects[i].mediachunks.push_back(chunk);
}
@@ -3110,7 +3113,7 @@ static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_sep
}
else
{
printf("\nWarning: Audio Embd excluded - Context size too low or not enough clip tokens! (needed %d)\n",cliptokensneeded);
printf("\nWarning: Audio Embd excluded - Context size too low or not enough clip tokens! (needed %d)\nAudio will be IGNORED! You probably want to relaunch with a larger context size!\n",cliptokensneeded);
}
}

View file

@@ -2135,6 +2135,11 @@ Current version indicated by LITEVER below.
margin-bottom:6px;
margin-left:12px;
}
.corpoeditbtn
{
padding: 5px 7px 5px 7px;
margin: 2px;
}
/* Colors */
.hlchunk
@@ -3215,6 +3220,7 @@ Current version indicated by LITEVER below.
var koboldcpp_version = ""; //detect if we are using koboldcpp
var koboldcpp_version_obj = {};
var koboldcpp_has_vision = false;
var koboldcpp_has_audio = false;
var koboldcpp_has_multiplayer = false;
var koboldcpp_has_websearch = false;
var koboldcpp_has_savedatafile = false;
@@ -6650,6 +6656,7 @@ Current version indicated by LITEVER below.
image_db[imgid].aspect = (req_payload.params.width>=req_payload.params.height*2?5:(req_payload.params.height>=req_payload.params.width*2?4:(req_payload.params.width>req_payload.params.height?2:(req_payload.params.width<req_payload.params.height?1:0))));
image_db[imgid].imsource = 0; //0=generated,1=uploaded
image_db[imgid].imrefid = "";
image_db[imgid].type = 0; //0=image, 1=audio
uploadBase64ImgToComfy(req_payload["source_image"],comfyimg2imgname).then(() => {
fetch(gen_endpoint, {
@@ -6850,6 +6857,7 @@ Current version indicated by LITEVER below.
image_db[imgid].aspect = (req_payload.params.width>=req_payload.params.height*2?5:(req_payload.params.height>=req_payload.params.width*2?4:(req_payload.params.width>req_payload.params.height?2:(req_payload.params.width<req_payload.params.height?1:0))));
image_db[imgid].imsource = 0; //0=generated,1=uploaded
image_db[imgid].imrefid = "";
image_db[imgid].type = 0; //0=image, 1=audio
fetch(gen_endpoint, {
method: 'GET',
@@ -7063,10 +7071,14 @@ Current version indicated by LITEVER below.
{
return (custom_kobold_endpoint!="" && koboldcpp_version && koboldcpp_version!="" && compare_version_str(koboldcpp_version, "1.49") >= 0);
}
function is_using_kcpp_with_llava()
function is_using_kcpp_with_vision()
{
return (custom_kobold_endpoint!="" && koboldcpp_version && koboldcpp_version!="" && compare_version_str(koboldcpp_version, "1.61") >= 0 && koboldcpp_has_vision);
}
function is_using_kcpp_with_audio()
{
return (custom_kobold_endpoint!="" && koboldcpp_version && koboldcpp_version!="" && compare_version_str(koboldcpp_version, "1.61") >= 0 && koboldcpp_has_audio);
}
function is_using_kcpp_with_whisper()
{
return (custom_kobold_endpoint!="" && koboldcpp_version && koboldcpp_version!="" && compare_version_str(koboldcpp_version, "1.66") >= 0 && koboldcpp_has_whisper);
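
An external client can reproduce the gate the frontend now applies: the server must report version 1.61 or newer and must advertise the matching capability flag. A hedged Python sketch follows; the /api/extra/version path is an assumption, since this diff only shows the JSON fields being read.

import json
import urllib.request

def kcpp_supports_audio(base_url: str) -> bool:
    # Sketch of Lite's is_using_kcpp_with_audio() gate from the outside.
    # Endpoint path is an assumption; "version" and "audio" are the
    # fields this commit reads from the capabilities payload.
    with urllib.request.urlopen(base_url + "/api/extra/version") as resp:
        caps = json.load(resp)
    def as_tuple(v):  # crude stand-in for Lite's compare_version_str()
        return tuple(int(p) for p in str(v).split(".") if p.isdigit())
    return as_tuple(caps.get("version", "0")) >= (1, 61) and bool(caps.get("audio"))
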
@@ -10999,6 +11011,7 @@ Current version indicated by LITEVER below.
console.log("KoboldCpp Detected: " + koboldcpp_version);
document.getElementById("connectstatus").innerHTML = (`<span style='cursor: pointer;' onclick='fetch_koboldcpp_perf()'>KoboldCpp ${koboldcpp_version}</a>`);
koboldcpp_has_vision = (data.vision?true:false);
koboldcpp_has_audio = (data.audio?true:false);
koboldcpp_has_whisper = (data.transcribe?true:false);
koboldcpp_has_multiplayer = (data.multiplayer?true:false);
koboldcpp_has_websearch = (data.websearch?true:false);
@@ -13861,6 +13874,22 @@ Current version indicated by LITEVER below.
}
}
function memory_add_instruction()
{
inputBox("Add another instruction for the AI to remember.","Add Instruction to Memory","","Enter a Prompt",()=>{
let userinput = getInputBoxValue();
if(userinput.trim()!="")
{
let str = get_instructendplaceholder() + userinput.trim();
if (localsettings.separate_end_tags) {
str += get_instructendplaceholder_end();
}
document.getElementById("memorytext").value += str;
}
},false);
}
let temp_automem_store = "";
function autogenerate_summary_memory()
{
@@ -14674,6 +14703,14 @@ Current version indicated by LITEVER below.
function self_upload_file_dispatch(data,filename)
{
const maxSize = 20 * 1024 * 1024; // approx 20MB limit
const dlen = (data.length*0.75);
const mbs = Math.ceil(dlen/1024/1024);
if (dlen > maxSize) {
msgbox(`Selected file exceeds 20MB size limit!\nSelected file was ${mbs}MB. Please try a smaller file.`, "File Too Large");
return;
}
if(data.startsWith("data:audio"))
{
self_upload_audio(data,filename);
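
The new size guard works on the raw data URI: every 4 base64 characters encode 3 bytes, so the decoded payload is roughly length * 0.75, which is compared against the ~20MB cap before dispatching on the MIME prefix. The same arithmetic in Python:

import math

MAX_SIZE = 20 * 1024 * 1024  # approx 20MB limit, matching the diff

def exceeds_upload_limit(data_uri: str) -> bool:
    # 4 base64 chars encode 3 bytes, so decoded size is about length * 0.75.
    # Counting the "data:...;base64," header and '=' padding slightly
    # overestimates, which errs on the safe side.
    return len(data_uri) * 0.75 > MAX_SIZE

def approx_size_mb(data_uri: str) -> int:
    return math.ceil(len(data_uri) * 0.75 / 1024 / 1024)
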
@@ -14705,6 +14742,7 @@ Current version indicated by LITEVER below.
image_db[imgid].aspect = 0;
image_db[imgid].imsource = 1; //0=generated,1=uploaded
image_db[imgid].imrefid = "";
image_db[imgid].type = 0; //0=image, 1=audio
let imgres = localsettings.img_allowhd?VHD_RES_PX:NO_HD_RES_PX;
compressImage(origImg, (newDataUri, outAspect) => {
image_db[imgid].done = true;
@@ -14748,6 +14786,7 @@ Current version indicated by LITEVER below.
image_db[imgid].imsource = 1; //0=generated,1=uploaded
image_db[imgid].imrefid = filename;
image_db[imgid].len = 0;
image_db[imgid].type = 1; //0=image, 1=audio
if(localsettings.no_compress_audio)
{
image_db[imgid].done = true;
@@ -16435,11 +16474,11 @@ Current version indicated by LITEVER below.
submit_payload.params.memory = truncated_memory;
submit_payload.params.trim_stop = true;
}
if(is_using_kcpp_with_llava() && insertAIVisionImages.length>0)
if(is_using_kcpp_with_vision() && insertAIVisionImages.length>0)
{
submit_payload.params.images = insertAIVisionImages.map(str => str.includes("base64,")?str.split("base64,")[1]:str);
}
if(is_using_kcpp_with_llava() && insertAIAudioSounds.length>0)
if(is_using_kcpp_with_audio() && insertAIAudioSounds.length>0)
{
submit_payload.params.audio = insertAIAudioSounds.map(str => str.includes("base64,")?str.split("base64,")[1]:str);
}
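
With the gates split, the submit path attaches pictures and sounds under separate payload keys, stripping any data-URI header so only raw base64 is sent. A hedged Python equivalent; the list names are hypothetical stand-ins for Lite's insertAIVisionImages and insertAIAudioSounds, and the surrounding payload is abbreviated.

def strip_data_uri_header(s: str) -> str:
    # Mirrors the diff's str.split("base64,")[1] when a header is present.
    return s.split("base64,", 1)[1] if "base64," in s else s

params = {}            # other generation params omitted
pending_images = []    # hypothetical; Lite uses insertAIVisionImages
pending_sounds = []    # hypothetical; Lite uses insertAIAudioSounds
if pending_images:
    params["images"] = [strip_data_uri_header(s) for s in pending_images]
if pending_sounds:
    params["audio"] = [strip_data_uri_header(s) for s in pending_sounds]
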
@@ -17619,6 +17658,7 @@ Current version indicated by LITEVER below.
image_db[imgid].aspect = (iwidth>=iheight*2?5:(iheight>=iwidth*2?4:(iwidth>iheight?2:(iwidth<iheight?1:0))));
image_db[imgid].imsource = 0; //0=generated,1=uploaded
image_db[imgid].imrefid = ""; //this will store the real horde ID to poll
image_db[imgid].type = 0; //0=image, 1=audio
fetch(stablehorde_submit_endpoint, {
method: 'POST', // or 'PUT'
headers: {
@@ -17670,6 +17710,7 @@ Current version indicated by LITEVER below.
image_db[imgid].aspect = (iwidth>=iheight*2?5:(iheight>=iwidth*2?4:(iwidth>iheight?2:(iwidth<iheight?1:0))));
image_db[imgid].imsource = 0; //0=generated,1=uploaded
image_db[imgid].imrefid = "";
image_db[imgid].type = 0; //0=image, 1=audio
generate_a1111_image(genimg_payload,(outputimg)=>{
if(outputimg)
{
@@ -17714,6 +17755,7 @@ Current version indicated by LITEVER below.
image_db[imgid].aspect = 0;
image_db[imgid].imsource = 0; //0=generated,1=uploaded
image_db[imgid].imrefid = "";
image_db[imgid].type = 0; //0=image, 1=audio
generate_dalle_image(genimg_payload,(outputimg,outputerr)=>{
if(outputimg)
{
@@ -17899,19 +17941,19 @@ Current version indicated by LITEVER below.
let savedmeta = completed_imgs_meta[imghash];
if(!savedmeta && imghash!="")
{
savedmeta = completed_imgs_meta[imghash] = {prompt:"", desc:"", visionmode:0, aspect:0, ref:"", len:0};
savedmeta = completed_imgs_meta[imghash] = {prompt:"", desc:"", visionmode:0, aspect:0, ref:"", len:0, type:0};
}
if(!savedmeta.visionmode)
{
savedmeta.visionmode = 0;
}
let hasllava = is_using_kcpp_with_llava();
let canvisionaudio = ((is_using_kcpp_with_vision() && savedmeta.type==0) || (is_using_kcpp_with_audio() && savedmeta.type==1));
let visionstatus = "";
if(savedmeta.visionmode==3)
{
if(custom_kobold_endpoint!="") //on a kobo endpoint
{
visionstatus = ((!savedmeta.visionmode || savedmeta.visionmode==0)?`<span class="color_red">Inactive</span>`:(hasllava?`<span class="color_green">Active</span>`:`<span class="color_yellow">Unsupported</span>`));
visionstatus = ((!savedmeta.visionmode || savedmeta.visionmode==0)?`<span class="color_red">Inactive</span>`:(canvisionaudio?`<span class="color_green">Active</span>`:`<span class="color_yellow">Unsupported</span>`));
}
else
{
@@ -18712,7 +18754,8 @@ Current version indicated by LITEVER below.
gametext_arr[i] = gametext_arr[i].replace(matchstr, newstr);
let metaid = cyrb_hash(img.result);
//default to llava if supported, and image is self uploaded
completed_imgs_meta[metaid] = {prompt:image_db[key].prompt, desc:"", visionmode:((image_db[key].imsource==1 && is_using_kcpp_with_llava())?3:0), aspect:image_db[key].aspect, ref:image_db[key].imrefid, len:image_db[key].len};
let desiredvismode = ((image_db[key].imsource==1 && ((is_using_kcpp_with_vision() && image_db[key].type==0) || (is_using_kcpp_with_audio() && image_db[key].type==1)))?3:0);
completed_imgs_meta[metaid] = {prompt:image_db[key].prompt, desc:"", visionmode:desiredvismode, aspect:image_db[key].aspect, ref:image_db[key].imrefid, len:image_db[key].len, type:image_db[key].type};
delete image_db[key];
}
}
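
The decision the new desiredvismode expression encodes: self-uploaded media (imsource 1) auto-enables AI vision/audio (mode 3) only when the server supports that media type, and everything else stays inactive (mode 0). As a small Python predicate:

def desired_vision_mode(imsource: int, mtype: int,
                        has_vision: bool, has_audio: bool) -> int:
    # mtype: 0 = image, 1 = audio, as in the diff's comments.
    # Returns 3 (active AI vision/audio) or 0 (inactive).
    supported = (has_vision and mtype == 0) or (has_audio and mtype == 1)
    return 3 if (imsource == 1 and supported) else 0
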
@@ -21183,16 +21226,16 @@ Current version indicated by LITEVER below.
let curr = chatunits[i];
curr = repack_postprocess_turn(curr, countmap);
let resendbtn = ((curr.myturn && i<chatunits.length-1)?`<button type="button" class="btn btn-primary" style="margin:2px;float:right;" onclick="corpo_edit_chunk_resend(${i})">Resend</button>`:``);
let resendbtn = ((curr.myturn && i<chatunits.length-1)?`<button type="button" class="btn btn-primary corpoeditbtn" style="float:right;" onclick="corpo_edit_chunk_resend(${i})">Resend</button>`:``);
let bodypart = (corpo_editing_turn == i ?
`<div class="corpo_edit_outer">
<div class="corpo_edit_inner" id="corpo_edit_inp_lengthtester" style="white-space: nowrap; visibility: hidden; height: 0px; position:absolute; width: auto;"></div>
<textarea class="corpo_edit_inner" id="corpo_edit_inp" type="text" name="crpeditinp" role="presentation" autocomplete="noppynop" spellcheck="true" rows="1" wrap="on" placeholder="Edit Message" value="" oninput="corpoedit_resize_input();"/>${stash_image_placeholders(curr.msg, false)}</textarea>
</div>
<button type="button" class="btn btn-primary" style="margin:2px;float:right;" onclick="corpo_edit_chunk_start(-1)">Cancel</button>
<button type="button" class="btn btn-primary corpoeditbtn" style="float:right;" onclick="corpo_edit_chunk_start(-1)">Cancel</button>
${resendbtn}
<button type="button" class="btn btn-primary" style="margin:2px;float:right;" onclick="corpo_edit_chunk_save()">Save</button>
<button type="button" class="btn btn-primary bg_red" style="margin:2px;float:left;" onclick="corpo_edit_chunk_delete()">Delete</button>`:
<button type="button" class="btn btn-primary corpoeditbtn" style="float:right;" onclick="corpo_edit_chunk_save()">Save</button>
<button type="button" class="btn btn-primary corpoeditbtn bg_red" style="float:left;" onclick="corpo_edit_chunk_delete()">Delete</button>`:
`<div class="corpostyleitemcontent">${curr.msg}</div>`);
let historical_btns = "";
if(!curr.myturn && i==chatunits.length-1 && !incomplete_resp)
@@ -23951,7 +23994,9 @@ Current version indicated by LITEVER below.
<div class="context_tab_container" id="memory_tab_container">
<div class="settinglabel">
<span class="justifyleft">Memory<span class="helpicon">?<span
class="helptext">Put the information you want the AI to always remember. It will be inserted into the top of every request sent to the AI.</span></span></span>
class="helptext">Put the information you want the AI to always remember. It will be inserted into the top of every request sent to the AI. Placeholder tags can be used.</span></span>
<button type="button" class="btn btn-primary" style="font-size:10px;padding:2px 5px;margin-left:4px;margin:2px;" onclick="memory_add_instruction()">Add Instruction</button>
</span>
<span class="justifyright flex-push-right" >
<div class="settinglabel" style="padding-top: 4px;">
<div class="justifyleft settingsmall" title="Add newline after injecting memory text">Newline After Memory </div>

View file

@@ -105,6 +105,7 @@ runmode_untouched = True
modelfile_extracted_meta = None
importvars_in_progress = False
has_multiplayer = False
has_audio_support = False
savedata_obj = None
multiplayer_story_data_compressed = None #stores the full compressed story of the current multiplayer session
multiplayer_turn_major = 1 # to keep track of when a client needs to sync their stories
@@ -537,6 +538,7 @@ def init_library():
handle.new_token.argtypes = [ctypes.c_int]
handle.get_stream_count.restype = ctypes.c_int
handle.has_finished.restype = ctypes.c_bool
handle.has_audio_support.restype = ctypes.c_bool
handle.get_last_eval_time.restype = ctypes.c_float
handle.get_last_process_time.restype = ctypes.c_float
handle.get_last_token_count.restype = ctypes.c_int
@@ -889,7 +891,7 @@ def convert_json_to_gbnf(json_obj):
return ""
def get_capabilities():
global savedata_obj, has_multiplayer, KcppVersion, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath
global savedata_obj, has_multiplayer, KcppVersion, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, has_audio_support
has_llm = not (friendlymodelname=="inactive")
has_txt2img = not (friendlysdmodelname=="inactive" or fullsdmodelpath=="")
has_vision = (mmprojpath!="")
@@ -900,7 +902,7 @@
has_embeddings = (embeddingsmodelpath!="")
has_guidance = True if args.enableguidance else False
admin_type = (2 if args.admin and args.admindir and args.adminpassword else (1 if args.admin and args.admindir else 0))
return {"result":"KoboldCpp", "version":KcppVersion, "protected":has_password, "llm":has_llm, "txt2img":has_txt2img,"vision":has_vision,"transcribe":has_whisper,"multiplayer":has_multiplayer,"websearch":has_search,"tts":has_tts, "embeddings":has_embeddings, "savedata":(savedata_obj is not None), "admin": admin_type, "guidance": has_guidance}
return {"result":"KoboldCpp", "version":KcppVersion, "protected":has_password, "llm":has_llm, "txt2img":has_txt2img,"vision":has_vision,"audio":has_audio_support,"transcribe":has_whisper,"multiplayer":has_multiplayer,"websearch":has_search,"tts":has_tts, "embeddings":has_embeddings, "savedata":(savedata_obj is not None), "admin": admin_type, "guidance": has_guidance}
def dump_gguf_metadata(file_path): #if you're gonna copy this into your own project at least credit concedo
chunk_size = 1024*1024*12 # read first 12mb of file
@@ -2287,6 +2289,7 @@ def transform_genparams(genparams, api_format):
tools_message_start = adapter_obj.get("tools_start", "")
tools_message_end = adapter_obj.get("tools_end", "")
images_added = []
audio_added = []
jsongrammar = r"""
root ::= arr
value ::= object | array | string | number | ("true" | "false" | "null") ws
@@ -2362,6 +2365,10 @@ ws ::= | " " | "\n" [ \t]{0,20}
if 'image_url' in item and item['image_url'] and item['image_url']['url'] and item['image_url']['url'].startswith("data:image"):
images_added.append(item['image_url']['url'].split(",", 1)[1])
messages_string += "\n(Attached Image)\n"
elif item['type']=="input_audio":
if 'input_audio' in item and item['input_audio'] and item['input_audio']['data']:
audio_added.append(item['input_audio']['data'])
messages_string += "\n(Attached Audio)\n"
# If last message, add any tools calls after message content and before message end token if any
if message['role'] == "user" and message_index == len(messages_array):
# tools handling: Check if user is passing a openai tools array, if so add to end of prompt before assistant prompt unless tool_choice has been set to None
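
The parser now unpacks OpenAI-style input_audio content parts alongside image_url ones. An example of a chat message the new branch would handle; the shape is inferred from the keys the code reads, and the "format" field is shown for completeness but is not read here.

message = {
    "role": "user",
    "content": [
        {"type": "text", "text": "What is said in this clip?"},
        {
            "type": "input_audio",
            # only ["input_audio"]["data"] is read by the new branch
            "input_audio": {"data": "<base64 audio bytes>", "format": "wav"},
        },
    ],
}
# The handler appends the base64 data to audio_added and inserts the
# placeholder "(Attached Audio)" into the flattened prompt string.
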
@@ -2464,6 +2471,8 @@ ws ::= | " " | "\n" [ \t]{0,20}
genparams["prompt"] = messages_string
if len(images_added)>0:
genparams["images"] = images_added
if len(audio_added)>0:
genparams["audio"] = audio_added
if len(genparams.get('stop_sequence', []))==0: #only set stop seq if it wont overwrite existing
genparams["stop_sequence"] = [user_message_start.strip(),assistant_message_start.strip()]
else:
@@ -4947,7 +4956,7 @@ def show_gui():
changed_gpu_choice_var()
# presets selector
makelabel(quick_tab, "Presets:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.")
makelabel(quick_tab, "Backend:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.")
runoptbox = ctk.CTkComboBox(quick_tab, values=runopts, width=190,variable=runopts_var, state="readonly")
runoptbox.grid(row=1, column=1,padx=8, stick="nw")
@@ -4995,7 +5004,7 @@ def show_gui():
hardware_tab = tabcontent["Hardware"]
# presets selector
makelabel(hardware_tab, "Presets:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.")
makelabel(hardware_tab, "Backend:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.")
runoptbox = ctk.CTkComboBox(hardware_tab, values=runopts, width=180,variable=runopts_var, state="readonly")
runoptbox.grid(row=1, column=1,padx=8, stick="nw")
runoptbox.set(runopts[0]) # Set to first available option
@@ -6577,7 +6586,7 @@ def main(launch_args, default_args):
def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui, start_time, exitcounter, global_memory, using_gui_launcher
global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, friendlyembeddingsmodelname
global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, friendlyembeddingsmodelname, has_audio_support
start_server = True
@@ -6934,7 +6943,7 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
print("WARNING: Selected Text Model does not seem to be a GGUF file! Are you sure you picked the right file?")
loadok = load_model(modelname)
print("Load Text Model OK: " + str(loadok))
has_audio_support = handle.has_audio_support() # multimodal audio support is only known at runtime
if not loadok:
exitcounter = 999
exit_with_error(3,"Could not load text model: " + modelname)