split audio and vision detection separately

Concedo 2025-07-13 17:47:15 +08:00
parent 0938af7c83
commit 811463a704
5 changed files with 82 additions and 21 deletions

View file

@@ -268,7 +268,10 @@ extern "C"
     bool has_finished() {
         return generation_finished;
     }
+    bool has_audio_support()
+    {
+        return audio_multimodal_supported;
+    }
     float get_last_eval_time() {
         return last_eval_time;
     }
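For reference, the new C export can be probed directly over the shared-library ABI, which is how the Python launcher later in this diff consumes it. A minimal ctypes sketch, assuming a Linux build named koboldcpp_default.so (the actual filename varies by backend and platform); the flag is only meaningful after a model has been loaded in the same process:

import ctypes

lib = ctypes.CDLL("./koboldcpp_default.so")   # hypothetical library name
lib.has_audio_support.restype = ctypes.c_bool

# Returns False until gpttype_load_model() has loaded an mmproj with audio (whisper) support.
print(lib.has_audio_support())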

View file

@@ -287,6 +287,7 @@ extern std::string mmproj_filename;
 extern std::string draftmodel_filename;
 extern std::vector<std::string> generated_tokens;
 extern bool generation_finished;
+extern bool audio_multimodal_supported;
 extern float last_eval_time;
 extern float last_process_time;
 extern int last_token_count;

View file

@@ -57,6 +57,7 @@ std::string mmproj_filename = "";
 std::string draftmodel_filename = "";
 int speculative_chunk_amt = 8; //do it in chunks of this many tokens
 bool generation_finished;
+bool audio_multimodal_supported = false;
 float last_process_time = 0;
 float last_eval_time = 0;
 int last_token_count = 0;
@@ -1980,6 +1981,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     debugmode = inputs.debugmode;
     draft_ctx = nullptr;
     guidance_ctx = nullptr;
+    audio_multimodal_supported = false;
     auto clamped_max_context_length = inputs.max_context_length;
@@ -2459,6 +2461,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
             // TODO @ngxson : check if model n_mel is 128 or 80
             w_filters = whisper_precalc_filters::get_128_bins();
         }
+        audio_multimodal_supported = true;
     }
     clp_img_data = clip_image_u8_init();
 }
@@ -3057,7 +3060,7 @@ static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_sep
         }
         else
         {
-            printf("\nWarning: LLAVA Image excluded - Context size too low or not enough clip tokens! (needed %d)\n",cliptokensneeded);
+            printf("\nWarning: Vision Image excluded - Context size too low or not enough clip tokens! (needed %d)\nImage will be IGNORED! You probably want to relaunch with a larger context size!\n",cliptokensneeded);
         }
         media_objects[i].mediachunks.push_back(chunk);
     }
@@ -3110,7 +3113,7 @@ static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_sep
         }
         else
         {
-            printf("\nWarning: Audio Embd excluded - Context size too low or not enough clip tokens! (needed %d)\n",cliptokensneeded);
+            printf("\nWarning: Audio Embd excluded - Context size too low or not enough clip tokens! (needed %d)\nAudio will be IGNORED! You probably want to relaunch with a larger context size!\n",cliptokensneeded);
         }
     }

View file

@@ -2135,6 +2135,11 @@ Current version indicated by LITEVER below.
     margin-bottom:6px;
     margin-left:12px;
 }
+.corpoeditbtn
+{
+    padding: 5px 7px 5px 7px;
+    margin: 2px;
+}
 /* Colors */
 .hlchunk
@@ -3215,6 +3220,7 @@ Current version indicated by LITEVER below.
 var koboldcpp_version = ""; //detect if we are using koboldcpp
 var koboldcpp_version_obj = {};
 var koboldcpp_has_vision = false;
+var koboldcpp_has_audio = false;
 var koboldcpp_has_multiplayer = false;
 var koboldcpp_has_websearch = false;
 var koboldcpp_has_savedatafile = false;
@@ -6650,6 +6656,7 @@ Current version indicated by LITEVER below.
 image_db[imgid].aspect = (req_payload.params.width>=req_payload.params.height*2?5:(req_payload.params.height>=req_payload.params.width*2?4:(req_payload.params.width>req_payload.params.height?2:(req_payload.params.width<req_payload.params.height?1:0))));
 image_db[imgid].imsource = 0; //0=generated,1=uploaded
 image_db[imgid].imrefid = "";
+image_db[imgid].type = 0; //0=image, 1=audio
 uploadBase64ImgToComfy(req_payload["source_image"],comfyimg2imgname).then(() => {
 fetch(gen_endpoint, {
@@ -6850,6 +6857,7 @@ Current version indicated by LITEVER below.
 image_db[imgid].aspect = (req_payload.params.width>=req_payload.params.height*2?5:(req_payload.params.height>=req_payload.params.width*2?4:(req_payload.params.width>req_payload.params.height?2:(req_payload.params.width<req_payload.params.height?1:0))));
 image_db[imgid].imsource = 0; //0=generated,1=uploaded
 image_db[imgid].imrefid = "";
+image_db[imgid].type = 0; //0=image, 1=audio
 fetch(gen_endpoint, {
     method: 'GET',
@@ -7063,10 +7071,14 @@ Current version indicated by LITEVER below.
 {
     return (custom_kobold_endpoint!="" && koboldcpp_version && koboldcpp_version!="" && compare_version_str(koboldcpp_version, "1.49") >= 0);
 }
-function is_using_kcpp_with_llava()
+function is_using_kcpp_with_vision()
 {
     return (custom_kobold_endpoint!="" && koboldcpp_version && koboldcpp_version!="" && compare_version_str(koboldcpp_version, "1.61") >= 0 && koboldcpp_has_vision);
 }
+function is_using_kcpp_with_audio()
+{
+    return (custom_kobold_endpoint!="" && koboldcpp_version && koboldcpp_version!="" && compare_version_str(koboldcpp_version, "1.61") >= 0 && koboldcpp_has_audio);
+}
 function is_using_kcpp_with_whisper()
 {
     return (custom_kobold_endpoint!="" && koboldcpp_version && koboldcpp_version!="" && compare_version_str(koboldcpp_version, "1.66") >= 0 && koboldcpp_has_whisper);
@@ -10999,6 +11011,7 @@ Current version indicated by LITEVER below.
 console.log("KoboldCpp Detected: " + koboldcpp_version);
 document.getElementById("connectstatus").innerHTML = (`<span style='cursor: pointer;' onclick='fetch_koboldcpp_perf()'>KoboldCpp ${koboldcpp_version}</a>`);
 koboldcpp_has_vision = (data.vision?true:false);
+koboldcpp_has_audio = (data.audio?true:false);
 koboldcpp_has_whisper = (data.transcribe?true:false);
 koboldcpp_has_multiplayer = (data.multiplayer?true:false);
 koboldcpp_has_websearch = (data.websearch?true:false);
@@ -13861,6 +13874,22 @@ Current version indicated by LITEVER below.
     }
 }
+function memory_add_instruction()
+{
+    inputBox("Add another instruction for the AI to remember.","Add Instruction to Memory","","Enter a Prompt",()=>{
+        let userinput = getInputBoxValue();
+        if(userinput.trim()!="")
+        {
+            let str = get_instructendplaceholder() + userinput.trim();
+            if (localsettings.separate_end_tags) {
+                str += get_instructendplaceholder_end();
+            }
+            document.getElementById("memorytext").value += str;
+        }
+    },false);
+}
 let temp_automem_store = "";
 function autogenerate_summary_memory()
 {
@@ -14674,6 +14703,14 @@ Current version indicated by LITEVER below.
 function self_upload_file_dispatch(data,filename)
 {
+    const maxSize = 20 * 1024 * 1024; // approx 20MB limit
+    const dlen = (data.length*0.75);
+    const mbs = Math.ceil(dlen/1024/1024);
+    if (dlen > maxSize) {
+        msgbox(`Selected file exceeds 20MB size limit!\nSelected file was ${mbs}MB. Please try a smaller file.`, "File Too Large");
+        return;
+    }
     if(data.startsWith("data:audio"))
     {
         self_upload_audio(data,filename);
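The dlen estimate above converts the base64 data URI length to approximate decoded bytes: every 4 base64 characters encode 3 bytes, hence the 0.75 factor (the data: prefix and padding are ignored as negligible). A quick worked check of the cutoff, with a hypothetical upload size:

# Estimate used by the size guard: decoded_bytes ~= len(base64_string) * 0.75
data_uri_len = 28_000_000                    # hypothetical length of an uploaded data URI
decoded_bytes = data_uri_len * 0.75          # ~21.0 million bytes
decoded_mb = decoded_bytes / 1024 / 1024     # ~20.03 MB
print(decoded_mb > 20)                       # True -> the upload would be rejected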
@@ -14705,6 +14742,7 @@ Current version indicated by LITEVER below.
 image_db[imgid].aspect = 0;
 image_db[imgid].imsource = 1; //0=generated,1=uploaded
 image_db[imgid].imrefid = "";
+image_db[imgid].type = 0; //0=image, 1=audio
 let imgres = localsettings.img_allowhd?VHD_RES_PX:NO_HD_RES_PX;
 compressImage(origImg, (newDataUri, outAspect) => {
     image_db[imgid].done = true;
@@ -14748,6 +14786,7 @@ Current version indicated by LITEVER below.
 image_db[imgid].imsource = 1; //0=generated,1=uploaded
 image_db[imgid].imrefid = filename;
 image_db[imgid].len = 0;
+image_db[imgid].type = 1; //0=image, 1=audio
 if(localsettings.no_compress_audio)
 {
     image_db[imgid].done = true;
@@ -16435,11 +16474,11 @@ Current version indicated by LITEVER below.
     submit_payload.params.memory = truncated_memory;
     submit_payload.params.trim_stop = true;
 }
-if(is_using_kcpp_with_llava() && insertAIVisionImages.length>0)
+if(is_using_kcpp_with_vision() && insertAIVisionImages.length>0)
 {
     submit_payload.params.images = insertAIVisionImages.map(str => str.includes("base64,")?str.split("base64,")[1]:str);
 }
-if(is_using_kcpp_with_llava() && insertAIAudioSounds.length>0)
+if(is_using_kcpp_with_audio() && insertAIAudioSounds.length>0)
 {
     submit_payload.params.audio = insertAIAudioSounds.map(str => str.includes("base64,")?str.split("base64,")[1]:str);
 }
@@ -17619,6 +17658,7 @@ Current version indicated by LITEVER below.
 image_db[imgid].aspect = (iwidth>=iheight*2?5:(iheight>=iwidth*2?4:(iwidth>iheight?2:(iwidth<iheight?1:0))));
 image_db[imgid].imsource = 0; //0=generated,1=uploaded
 image_db[imgid].imrefid = ""; //this will store the real horde ID to poll
+image_db[imgid].type = 0; //0=image, 1=audio
 fetch(stablehorde_submit_endpoint, {
     method: 'POST', // or 'PUT'
     headers: {
@@ -17670,6 +17710,7 @@ Current version indicated by LITEVER below.
 image_db[imgid].aspect = (iwidth>=iheight*2?5:(iheight>=iwidth*2?4:(iwidth>iheight?2:(iwidth<iheight?1:0))));
 image_db[imgid].imsource = 0; //0=generated,1=uploaded
 image_db[imgid].imrefid = "";
+image_db[imgid].type = 0; //0=image, 1=audio
 generate_a1111_image(genimg_payload,(outputimg)=>{
     if(outputimg)
     {
@@ -17714,6 +17755,7 @@ Current version indicated by LITEVER below.
 image_db[imgid].aspect = 0;
 image_db[imgid].imsource = 0; //0=generated,1=uploaded
 image_db[imgid].imrefid = "";
+image_db[imgid].type = 0; //0=image, 1=audio
 generate_dalle_image(genimg_payload,(outputimg,outputerr)=>{
     if(outputimg)
     {
@@ -17899,19 +17941,19 @@ Current version indicated by LITEVER below.
 let savedmeta = completed_imgs_meta[imghash];
 if(!savedmeta && imghash!="")
 {
-    savedmeta = completed_imgs_meta[imghash] = {prompt:"", desc:"", visionmode:0, aspect:0, ref:"", len:0};
+    savedmeta = completed_imgs_meta[imghash] = {prompt:"", desc:"", visionmode:0, aspect:0, ref:"", len:0, type:0};
 }
 if(!savedmeta.visionmode)
 {
     savedmeta.visionmode = 0;
 }
-let hasllava = is_using_kcpp_with_llava();
+let canvisionaudio = ((is_using_kcpp_with_vision() && savedmeta.type==0) || (is_using_kcpp_with_audio() && savedmeta.type==1));
 let visionstatus = "";
 if(savedmeta.visionmode==3)
 {
     if(custom_kobold_endpoint!="") //on a kobo endpoint
     {
-        visionstatus = ((!savedmeta.visionmode || savedmeta.visionmode==0)?`<span class="color_red">Inactive</span>`:(hasllava?`<span class="color_green">Active</span>`:`<span class="color_yellow">Unsupported</span>`));
+        visionstatus = ((!savedmeta.visionmode || savedmeta.visionmode==0)?`<span class="color_red">Inactive</span>`:(canvisionaudio?`<span class="color_green">Active</span>`:`<span class="color_yellow">Unsupported</span>`));
     }
     else
     {
@@ -18712,7 +18754,8 @@ Current version indicated by LITEVER below.
 gametext_arr[i] = gametext_arr[i].replace(matchstr, newstr);
 let metaid = cyrb_hash(img.result);
 //default to llava if supported, and image is self uploaded
-completed_imgs_meta[metaid] = {prompt:image_db[key].prompt, desc:"", visionmode:((image_db[key].imsource==1 && is_using_kcpp_with_llava())?3:0), aspect:image_db[key].aspect, ref:image_db[key].imrefid, len:image_db[key].len};
+let desiredvismode = ((image_db[key].imsource==1 && ((is_using_kcpp_with_vision() && image_db[key].type==0) || (is_using_kcpp_with_audio() && image_db[key].type==1)))?3:0);
+completed_imgs_meta[metaid] = {prompt:image_db[key].prompt, desc:"", visionmode:desiredvismode, aspect:image_db[key].aspect, ref:image_db[key].imrefid, len:image_db[key].len, type:image_db[key].type};
 delete image_db[key];
 }
 }
@@ -21183,16 +21226,16 @@ Current version indicated by LITEVER below.
 let curr = chatunits[i];
 curr = repack_postprocess_turn(curr, countmap);
-let resendbtn = ((curr.myturn && i<chatunits.length-1)?`<button type="button" class="btn btn-primary" style="margin:2px;float:right;" onclick="corpo_edit_chunk_resend(${i})">Resend</button>`:``);
+let resendbtn = ((curr.myturn && i<chatunits.length-1)?`<button type="button" class="btn btn-primary corpoeditbtn" style="float:right;" onclick="corpo_edit_chunk_resend(${i})">Resend</button>`:``);
 let bodypart = (corpo_editing_turn == i ?
 `<div class="corpo_edit_outer">
 <div class="corpo_edit_inner" id="corpo_edit_inp_lengthtester" style="white-space: nowrap; visibility: hidden; height: 0px; position:absolute; width: auto;"></div>
 <textarea class="corpo_edit_inner" id="corpo_edit_inp" type="text" name="crpeditinp" role="presentation" autocomplete="noppynop" spellcheck="true" rows="1" wrap="on" placeholder="Edit Message" value="" oninput="corpoedit_resize_input();"/>${stash_image_placeholders(curr.msg, false)}</textarea>
 </div>
-<button type="button" class="btn btn-primary" style="margin:2px;float:right;" onclick="corpo_edit_chunk_start(-1)">Cancel</button>
+<button type="button" class="btn btn-primary corpoeditbtn" style="float:right;" onclick="corpo_edit_chunk_start(-1)">Cancel</button>
 ${resendbtn}
-<button type="button" class="btn btn-primary" style="margin:2px;float:right;" onclick="corpo_edit_chunk_save()">Save</button>
-<button type="button" class="btn btn-primary bg_red" style="margin:2px;float:left;" onclick="corpo_edit_chunk_delete()">Delete</button>`:
+<button type="button" class="btn btn-primary corpoeditbtn" style="float:right;" onclick="corpo_edit_chunk_save()">Save</button>
+<button type="button" class="btn btn-primary corpoeditbtn bg_red" style="float:left;" onclick="corpo_edit_chunk_delete()">Delete</button>`:
 `<div class="corpostyleitemcontent">${curr.msg}</div>`);
 let historical_btns = "";
 if(!curr.myturn && i==chatunits.length-1 && !incomplete_resp)
@@ -23951,7 +23994,9 @@ Current version indicated by LITEVER below.
 <div class="context_tab_container" id="memory_tab_container">
 <div class="settinglabel">
 <span class="justifyleft">Memory<span class="helpicon">?<span
-class="helptext">Put the information you want the AI to always remember. It will be inserted into the top of every request sent to the AI.</span></span></span>
+class="helptext">Put the information you want the AI to always remember. It will be inserted into the top of every request sent to the AI. Placeholder tags can be used.</span></span>
+<button type="button" class="btn btn-primary" style="font-size:10px;padding:2px 5px;margin-left:4px;margin:2px;" onclick="memory_add_instruction()">Add Instruction</button>
+</span>
 <span class="justifyright flex-push-right" >
 <div class="settinglabel" style="padding-top: 4px;">
 <div class="justifyleft settingsmall" title="Add newline after injecting memory text">Newline After Memory </div>

View file

@@ -105,6 +105,7 @@ runmode_untouched = True
 modelfile_extracted_meta = None
 importvars_in_progress = False
 has_multiplayer = False
+has_audio_support = False
 savedata_obj = None
 multiplayer_story_data_compressed = None #stores the full compressed story of the current multiplayer session
 multiplayer_turn_major = 1 # to keep track of when a client needs to sync their stories
@@ -537,6 +538,7 @@ def init_library():
    handle.new_token.argtypes = [ctypes.c_int]
    handle.get_stream_count.restype = ctypes.c_int
    handle.has_finished.restype = ctypes.c_bool
+    handle.has_audio_support.restype = ctypes.c_bool
    handle.get_last_eval_time.restype = ctypes.c_float
    handle.get_last_process_time.restype = ctypes.c_float
    handle.get_last_token_count.restype = ctypes.c_int
@@ -889,7 +891,7 @@ def convert_json_to_gbnf(json_obj):
     return ""

 def get_capabilities():
-    global savedata_obj, has_multiplayer, KcppVersion, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath
+    global savedata_obj, has_multiplayer, KcppVersion, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, has_audio_support
     has_llm = not (friendlymodelname=="inactive")
     has_txt2img = not (friendlysdmodelname=="inactive" or fullsdmodelpath=="")
     has_vision = (mmprojpath!="")
@@ -900,7 +902,7 @@ def get_capabilities():
     has_embeddings = (embeddingsmodelpath!="")
     has_guidance = True if args.enableguidance else False
     admin_type = (2 if args.admin and args.admindir and args.adminpassword else (1 if args.admin and args.admindir else 0))
-    return {"result":"KoboldCpp", "version":KcppVersion, "protected":has_password, "llm":has_llm, "txt2img":has_txt2img,"vision":has_vision,"transcribe":has_whisper,"multiplayer":has_multiplayer,"websearch":has_search,"tts":has_tts, "embeddings":has_embeddings, "savedata":(savedata_obj is not None), "admin": admin_type, "guidance": has_guidance}
+    return {"result":"KoboldCpp", "version":KcppVersion, "protected":has_password, "llm":has_llm, "txt2img":has_txt2img,"vision":has_vision,"audio":has_audio_support,"transcribe":has_whisper,"multiplayer":has_multiplayer,"websearch":has_search,"tts":has_tts, "embeddings":has_embeddings, "savedata":(savedata_obj is not None), "admin": admin_type, "guidance": has_guidance}

 def dump_gguf_metadata(file_path): #if you're gonna copy this into your own project at least credit concedo
     chunk_size = 1024*1024*12 # read first 12mb of file
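With this change a client can distinguish vision and audio multimodality before deciding what to attach. A minimal probe sketch, assuming the capabilities dict above is what the /api/extra/version endpoint serves (the keys match what Kobold Lite reads earlier in this commit) and a default local instance:

import requests

# Hypothetical local instance; adjust host and port as needed.
caps = requests.get("http://localhost:5001/api/extra/version").json()

if caps.get("vision"):
    print("mmproj loaded: image attachments accepted")
if caps.get("audio"):
    print("mmproj has audio (whisper) filters: audio attachments accepted")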
@@ -2287,6 +2289,7 @@ def transform_genparams(genparams, api_format):
     tools_message_start = adapter_obj.get("tools_start", "")
     tools_message_end = adapter_obj.get("tools_end", "")
     images_added = []
+    audio_added = []
     jsongrammar = r"""
 root ::= arr
 value ::= object | array | string | number | ("true" | "false" | "null") ws
@@ -2362,6 +2365,10 @@ ws ::= | " " | "\n" [ \t]{0,20}
     if 'image_url' in item and item['image_url'] and item['image_url']['url'] and item['image_url']['url'].startswith("data:image"):
         images_added.append(item['image_url']['url'].split(",", 1)[1])
         messages_string += "\n(Attached Image)\n"
+elif item['type']=="input_audio":
+    if 'input_audio' in item and item['input_audio'] and item['input_audio']['data']:
+        audio_added.append(item['input_audio']['data'])
+        messages_string += "\n(Attached Audio)\n"
 # If last message, add any tools calls after message content and before message end token if any
 if message['role'] == "user" and message_index == len(messages_array):
     # tools handling: Check if user is passing a openai tools array, if so add to end of prompt before assistant prompt unless tool_choice has been set to None
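The new branch mirrors the image_url handling for OpenAI-style audio parts; per the diff only the base64 payload in input_audio.data is read. A hedged request sketch against KoboldCpp's OpenAI-compatible chat endpoint (host, port, and file name are assumptions):

import base64, requests

with open("question.wav", "rb") as f:          # hypothetical local audio file
    audio_b64 = base64.b64encode(f.read()).decode()

payload = {
    "model": "koboldcpp",
    "messages": [{
        "role": "user",
        "content": [
            {"type": "text", "text": "What is being said in this clip?"},
            # Only the 'data' field is consumed by the branch above; 'format' follows the OpenAI shape.
            {"type": "input_audio", "input_audio": {"data": audio_b64, "format": "wav"}},
        ],
    }],
}
r = requests.post("http://localhost:5001/v1/chat/completions", json=payload)
print(r.json()["choices"][0]["message"]["content"])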
@@ -2464,6 +2471,8 @@ ws ::= | " " | "\n" [ \t]{0,20}
     genparams["prompt"] = messages_string
     if len(images_added)>0:
         genparams["images"] = images_added
+    if len(audio_added)>0:
+        genparams["audio"] = audio_added
     if len(genparams.get('stop_sequence', []))==0: #only set stop seq if it wont overwrite existing
         genparams["stop_sequence"] = [user_message_start.strip(),assistant_message_start.strip()]
     else:
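After transformation the genparams carry attached clips directly, matching the submit_payload.params.audio that Lite now sends alongside images. A sketch of that payload shape, assuming audio sits next to images as a list of bare base64 strings on the native generate API:

import base64, json

payload = {
    "prompt": "Describe the attached recording.\n",
    "max_length": 200,
    # Both lists hold bare base64 (no data: URI prefix), as the Lite client strips it before sending.
    "images": [],
    "audio": [base64.b64encode(open("clip.mp3", "rb").read()).decode()],  # hypothetical file
}
# Assumed target: the native /api/v1/generate endpoint.
print(json.dumps(payload)[:120], "...")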
@@ -4947,7 +4956,7 @@ def show_gui():
     changed_gpu_choice_var()
     # presets selector
-    makelabel(quick_tab, "Presets:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.")
+    makelabel(quick_tab, "Backend:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.")
     runoptbox = ctk.CTkComboBox(quick_tab, values=runopts, width=190,variable=runopts_var, state="readonly")
     runoptbox.grid(row=1, column=1,padx=8, stick="nw")
@@ -4995,7 +5004,7 @@ def show_gui():
     hardware_tab = tabcontent["Hardware"]
     # presets selector
-    makelabel(hardware_tab, "Presets:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.")
+    makelabel(hardware_tab, "Backend:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.")
     runoptbox = ctk.CTkComboBox(hardware_tab, values=runopts, width=180,variable=runopts_var, state="readonly")
     runoptbox.grid(row=1, column=1,padx=8, stick="nw")
     runoptbox.set(runopts[0]) # Set to first available option
@@ -6577,7 +6586,7 @@ def main(launch_args, default_args):
 def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
     global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui, start_time, exitcounter, global_memory, using_gui_launcher
-    global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, friendlyembeddingsmodelname
+    global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, friendlyembeddingsmodelname, has_audio_support
     start_server = True
@@ -6934,7 +6943,7 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
         print("WARNING: Selected Text Model does not seem to be a GGUF file! Are you sure you picked the right file?")
     loadok = load_model(modelname)
     print("Load Text Model OK: " + str(loadok))
+    has_audio_support = handle.has_audio_support() # multimodal audio support is only known at runtime
     if not loadok:
         exitcounter = 999
         exit_with_error(3,"Could not load text model: " + modelname)