diff --git a/expose.cpp b/expose.cpp index 34320756b..2a9686556 100644 --- a/expose.cpp +++ b/expose.cpp @@ -268,7 +268,10 @@ extern "C" bool has_finished() { return generation_finished; } - + bool has_audio_support() + { + return audio_multimodal_supported; + } float get_last_eval_time() { return last_eval_time; } diff --git a/expose.h b/expose.h index e968d0a95..e7d9eaab5 100644 --- a/expose.h +++ b/expose.h @@ -287,6 +287,7 @@ extern std::string mmproj_filename; extern std::string draftmodel_filename; extern std::vector generated_tokens; extern bool generation_finished; +extern bool audio_multimodal_supported; extern float last_eval_time; extern float last_process_time; extern int last_token_count; diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 7bf9fb358..68f676d43 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -57,6 +57,7 @@ std::string mmproj_filename = ""; std::string draftmodel_filename = ""; int speculative_chunk_amt = 8; //do it in chunks of this many tokens bool generation_finished; +bool audio_multimodal_supported = false; float last_process_time = 0; float last_eval_time = 0; int last_token_count = 0; @@ -1980,6 +1981,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in debugmode = inputs.debugmode; draft_ctx = nullptr; guidance_ctx = nullptr; + audio_multimodal_supported = false; auto clamped_max_context_length = inputs.max_context_length; @@ -2459,6 +2461,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in // TODO @ngxson : check if model n_mel is 128 or 80 w_filters = whisper_precalc_filters::get_128_bins(); } + audio_multimodal_supported = true; } clp_img_data = clip_image_u8_init(); } @@ -3057,7 +3060,7 @@ static void PrepareMediaEmbds(const int nctx, const std::vector & media_sep } else { - printf("\nWarning: LLAVA Image excluded - Context size too low or not enough clip tokens! (needed %d)\n",cliptokensneeded); + printf("\nWarning: Vision Image excluded - Context size too low or not enough clip tokens! (needed %d)\nImage will be IGNORED! You probably want to relaunch with a larger context size!\n",cliptokensneeded); } media_objects[i].mediachunks.push_back(chunk); } @@ -3110,7 +3113,7 @@ static void PrepareMediaEmbds(const int nctx, const std::vector & media_sep } else { - printf("\nWarning: Audio Embd excluded - Context size too low or not enough clip tokens! (needed %d)\n",cliptokensneeded); + printf("\nWarning: Audio Embd excluded - Context size too low or not enough clip tokens! (needed %d)\nAudio will be IGNORED! You probably want to relaunch with a larger context size!\n",cliptokensneeded); } } diff --git a/klite.embd b/klite.embd index 47caf4f42..9d5ee10d4 100644 --- a/klite.embd +++ b/klite.embd @@ -2135,6 +2135,11 @@ Current version indicated by LITEVER below. margin-bottom:6px; margin-left:12px; } + .corpoeditbtn + { + padding: 5px 7px 5px 7px; + margin: 2px; + } /* Colors */ .hlchunk @@ -3215,6 +3220,7 @@ Current version indicated by LITEVER below. var koboldcpp_version = ""; //detect if we are using koboldcpp var koboldcpp_version_obj = {}; var koboldcpp_has_vision = false; + var koboldcpp_has_audio = false; var koboldcpp_has_multiplayer = false; var koboldcpp_has_websearch = false; var koboldcpp_has_savedatafile = false; @@ -6650,6 +6656,7 @@ Current version indicated by LITEVER below. image_db[imgid].aspect = (req_payload.params.width>=req_payload.params.height*2?5:(req_payload.params.height>=req_payload.params.width*2?4:(req_payload.params.width>req_payload.params.height?2:(req_payload.params.width { fetch(gen_endpoint, { @@ -6850,6 +6857,7 @@ Current version indicated by LITEVER below. image_db[imgid].aspect = (req_payload.params.width>=req_payload.params.height*2?5:(req_payload.params.height>=req_payload.params.width*2?4:(req_payload.params.width>req_payload.params.height?2:(req_payload.params.width= 0); } - function is_using_kcpp_with_llava() + function is_using_kcpp_with_vision() { return (custom_kobold_endpoint!="" && koboldcpp_version && koboldcpp_version!="" && compare_version_str(koboldcpp_version, "1.61") >= 0 && koboldcpp_has_vision); } + function is_using_kcpp_with_audio() + { + return (custom_kobold_endpoint!="" && koboldcpp_version && koboldcpp_version!="" && compare_version_str(koboldcpp_version, "1.61") >= 0 && koboldcpp_has_audio); + } function is_using_kcpp_with_whisper() { return (custom_kobold_endpoint!="" && koboldcpp_version && koboldcpp_version!="" && compare_version_str(koboldcpp_version, "1.66") >= 0 && koboldcpp_has_whisper); @@ -10999,6 +11011,7 @@ Current version indicated by LITEVER below. console.log("KoboldCpp Detected: " + koboldcpp_version); document.getElementById("connectstatus").innerHTML = (`KoboldCpp ${koboldcpp_version}`); koboldcpp_has_vision = (data.vision?true:false); + koboldcpp_has_audio = (data.audio?true:false); koboldcpp_has_whisper = (data.transcribe?true:false); koboldcpp_has_multiplayer = (data.multiplayer?true:false); koboldcpp_has_websearch = (data.websearch?true:false); @@ -13861,6 +13874,22 @@ Current version indicated by LITEVER below. } } + function memory_add_instruction() + { + inputBox("Add another instruction for the AI to remember.","Add Instruction to Memory","","Enter a Prompt",()=>{ + let userinput = getInputBoxValue(); + if(userinput.trim()!="") + { + let str = get_instructendplaceholder() + userinput.trim(); + if (localsettings.separate_end_tags) { + str += get_instructendplaceholder_end(); + } + document.getElementById("memorytext").value += str; + } + },false); + + } + let temp_automem_store = ""; function autogenerate_summary_memory() { @@ -14674,6 +14703,14 @@ Current version indicated by LITEVER below. function self_upload_file_dispatch(data,filename) { + const maxSize = 20 * 1024 * 1024; // approx 20MB limit + const dlen = (data.length*0.75); + const mbs = Math.ceil(dlen/1024/1024); + if (dlen > maxSize) { + msgbox(`Selected file exceeds 20MB size limit!\nSelected file was ${mbs}MB. Please try a smaller file.`, "File Too Large"); + return; + } + if(data.startsWith("data:audio")) { self_upload_audio(data,filename); @@ -14705,6 +14742,7 @@ Current version indicated by LITEVER below. image_db[imgid].aspect = 0; image_db[imgid].imsource = 1; //0=generated,1=uploaded image_db[imgid].imrefid = ""; + image_db[imgid].type = 0; //0=image, 1=audio let imgres = localsettings.img_allowhd?VHD_RES_PX:NO_HD_RES_PX; compressImage(origImg, (newDataUri, outAspect) => { image_db[imgid].done = true; @@ -14748,6 +14786,7 @@ Current version indicated by LITEVER below. image_db[imgid].imsource = 1; //0=generated,1=uploaded image_db[imgid].imrefid = filename; image_db[imgid].len = 0; + image_db[imgid].type = 1; //0=image, 1=audio if(localsettings.no_compress_audio) { image_db[imgid].done = true; @@ -16435,11 +16474,11 @@ Current version indicated by LITEVER below. submit_payload.params.memory = truncated_memory; submit_payload.params.trim_stop = true; } - if(is_using_kcpp_with_llava() && insertAIVisionImages.length>0) + if(is_using_kcpp_with_vision() && insertAIVisionImages.length>0) { submit_payload.params.images = insertAIVisionImages.map(str => str.includes("base64,")?str.split("base64,")[1]:str); } - if(is_using_kcpp_with_llava() && insertAIAudioSounds.length>0) + if(is_using_kcpp_with_audio() && insertAIAudioSounds.length>0) { submit_payload.params.audio = insertAIAudioSounds.map(str => str.includes("base64,")?str.split("base64,")[1]:str); } @@ -17619,6 +17658,7 @@ Current version indicated by LITEVER below. image_db[imgid].aspect = (iwidth>=iheight*2?5:(iheight>=iwidth*2?4:(iwidth>iheight?2:(iwidth=iheight*2?5:(iheight>=iwidth*2?4:(iwidth>iheight?2:(iwidth{ if(outputimg) { @@ -17714,6 +17755,7 @@ Current version indicated by LITEVER below. image_db[imgid].aspect = 0; image_db[imgid].imsource = 0; //0=generated,1=uploaded image_db[imgid].imrefid = ""; + image_db[imgid].type = 0; //0=image, 1=audio generate_dalle_image(genimg_payload,(outputimg,outputerr)=>{ if(outputimg) { @@ -17899,19 +17941,19 @@ Current version indicated by LITEVER below. let savedmeta = completed_imgs_meta[imghash]; if(!savedmeta && imghash!="") { - savedmeta = completed_imgs_meta[imghash] = {prompt:"", desc:"", visionmode:0, aspect:0, ref:"", len:0}; + savedmeta = completed_imgs_meta[imghash] = {prompt:"", desc:"", visionmode:0, aspect:0, ref:"", len:0, type:0}; } if(!savedmeta.visionmode) { savedmeta.visionmode = 0; } - let hasllava = is_using_kcpp_with_llava(); + let canvisionaudio = ((is_using_kcpp_with_vision() && savedmeta.type==0) || (is_using_kcpp_with_audio() && savedmeta.type==1)); let visionstatus = ""; if(savedmeta.visionmode==3) { if(custom_kobold_endpoint!="") //on a kobo endpoint { - visionstatus = ((!savedmeta.visionmode || savedmeta.visionmode==0)?`Inactive`:(hasllava?`Active`:`Unsupported`)); + visionstatus = ((!savedmeta.visionmode || savedmeta.visionmode==0)?`Inactive`:(canvisionaudio?`Active`:`Unsupported`)); } else { @@ -18712,7 +18754,8 @@ Current version indicated by LITEVER below. gametext_arr[i] = gametext_arr[i].replace(matchstr, newstr); let metaid = cyrb_hash(img.result); //default to llava if supported, and image is self uploaded - completed_imgs_meta[metaid] = {prompt:image_db[key].prompt, desc:"", visionmode:((image_db[key].imsource==1 && is_using_kcpp_with_llava())?3:0), aspect:image_db[key].aspect, ref:image_db[key].imrefid, len:image_db[key].len}; + let desiredvismode = ((image_db[key].imsource==1 && ((is_using_kcpp_with_vision() && image_db[key].type==0) || (is_using_kcpp_with_audio() && image_db[key].type==1)))?3:0); + completed_imgs_meta[metaid] = {prompt:image_db[key].prompt, desc:"", visionmode:desiredvismode, aspect:image_db[key].aspect, ref:image_db[key].imrefid, len:image_db[key].len, type:image_db[key].type}; delete image_db[key]; } } @@ -21183,16 +21226,16 @@ Current version indicated by LITEVER below. let curr = chatunits[i]; curr = repack_postprocess_turn(curr, countmap); - let resendbtn = ((curr.myturn && iResend`:``); + let resendbtn = ((curr.myturn && iResend`:``); let bodypart = (corpo_editing_turn == i ? `
- + ${resendbtn} - - `: + + `: `
${curr.msg}
`); let historical_btns = ""; if(!curr.myturn && i==chatunits.length-1 && !incomplete_resp) @@ -23951,7 +23994,9 @@ Current version indicated by LITEVER below.
Memory?Put the information you want the AI to always remember. It will be inserted into the top of every request sent to the AI. + class="helptext">Put the information you want the AI to always remember. It will be inserted into the top of every request sent to the AI. Placeholder tags can be used. + +
Newline After Memory
diff --git a/koboldcpp.py b/koboldcpp.py index 94f86a37b..765b6cf83 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -105,6 +105,7 @@ runmode_untouched = True modelfile_extracted_meta = None importvars_in_progress = False has_multiplayer = False +has_audio_support = False savedata_obj = None multiplayer_story_data_compressed = None #stores the full compressed story of the current multiplayer session multiplayer_turn_major = 1 # to keep track of when a client needs to sync their stories @@ -537,6 +538,7 @@ def init_library(): handle.new_token.argtypes = [ctypes.c_int] handle.get_stream_count.restype = ctypes.c_int handle.has_finished.restype = ctypes.c_bool + handle.has_audio_support.restype = ctypes.c_bool handle.get_last_eval_time.restype = ctypes.c_float handle.get_last_process_time.restype = ctypes.c_float handle.get_last_token_count.restype = ctypes.c_int @@ -889,7 +891,7 @@ def convert_json_to_gbnf(json_obj): return "" def get_capabilities(): - global savedata_obj, has_multiplayer, KcppVersion, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath + global savedata_obj, has_multiplayer, KcppVersion, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, has_audio_support has_llm = not (friendlymodelname=="inactive") has_txt2img = not (friendlysdmodelname=="inactive" or fullsdmodelpath=="") has_vision = (mmprojpath!="") @@ -900,7 +902,7 @@ def get_capabilities(): has_embeddings = (embeddingsmodelpath!="") has_guidance = True if args.enableguidance else False admin_type = (2 if args.admin and args.admindir and args.adminpassword else (1 if args.admin and args.admindir else 0)) - return {"result":"KoboldCpp", "version":KcppVersion, "protected":has_password, "llm":has_llm, "txt2img":has_txt2img,"vision":has_vision,"transcribe":has_whisper,"multiplayer":has_multiplayer,"websearch":has_search,"tts":has_tts, "embeddings":has_embeddings, "savedata":(savedata_obj is not None), "admin": admin_type, "guidance": has_guidance} + return {"result":"KoboldCpp", "version":KcppVersion, "protected":has_password, "llm":has_llm, "txt2img":has_txt2img,"vision":has_vision,"audio":has_audio_support,"transcribe":has_whisper,"multiplayer":has_multiplayer,"websearch":has_search,"tts":has_tts, "embeddings":has_embeddings, "savedata":(savedata_obj is not None), "admin": admin_type, "guidance": has_guidance} def dump_gguf_metadata(file_path): #if you're gonna copy this into your own project at least credit concedo chunk_size = 1024*1024*12 # read first 12mb of file @@ -2287,6 +2289,7 @@ def transform_genparams(genparams, api_format): tools_message_start = adapter_obj.get("tools_start", "") tools_message_end = adapter_obj.get("tools_end", "") images_added = [] + audio_added = [] jsongrammar = r""" root ::= arr value ::= object | array | string | number | ("true" | "false" | "null") ws @@ -2362,6 +2365,10 @@ ws ::= | " " | "\n" [ \t]{0,20} if 'image_url' in item and item['image_url'] and item['image_url']['url'] and item['image_url']['url'].startswith("data:image"): images_added.append(item['image_url']['url'].split(",", 1)[1]) messages_string += "\n(Attached Image)\n" + elif item['type']=="input_audio": + if 'input_audio' in item and item['input_audio'] and item['input_audio']['data']: + audio_added.append(item['input_audio']['data']) + messages_string += "\n(Attached Audio)\n" # If last message, add any tools calls after message content and before message end token if any if message['role'] == "user" and message_index == len(messages_array): # tools handling: Check if user is passing a openai tools array, if so add to end of prompt before assistant prompt unless tool_choice has been set to None @@ -2464,6 +2471,8 @@ ws ::= | " " | "\n" [ \t]{0,20} genparams["prompt"] = messages_string if len(images_added)>0: genparams["images"] = images_added + if len(audio_added)>0: + genparams["audio"] = audio_added if len(genparams.get('stop_sequence', []))==0: #only set stop seq if it wont overwrite existing genparams["stop_sequence"] = [user_message_start.strip(),assistant_message_start.strip()] else: @@ -4947,7 +4956,7 @@ def show_gui(): changed_gpu_choice_var() # presets selector - makelabel(quick_tab, "Presets:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.") + makelabel(quick_tab, "Backend:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.") runoptbox = ctk.CTkComboBox(quick_tab, values=runopts, width=190,variable=runopts_var, state="readonly") runoptbox.grid(row=1, column=1,padx=8, stick="nw") @@ -4995,7 +5004,7 @@ def show_gui(): hardware_tab = tabcontent["Hardware"] # presets selector - makelabel(hardware_tab, "Presets:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.") + makelabel(hardware_tab, "Backend:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.") runoptbox = ctk.CTkComboBox(hardware_tab, values=runopts, width=180,variable=runopts_var, state="readonly") runoptbox.grid(row=1, column=1,padx=8, stick="nw") runoptbox.set(runopts[0]) # Set to first available option @@ -6577,7 +6586,7 @@ def main(launch_args, default_args): def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False): global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui, start_time, exitcounter, global_memory, using_gui_launcher - global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, friendlyembeddingsmodelname + global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, friendlyembeddingsmodelname, has_audio_support start_server = True @@ -6934,7 +6943,7 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False): print("WARNING: Selected Text Model does not seem to be a GGUF file! Are you sure you picked the right file?") loadok = load_model(modelname) print("Load Text Model OK: " + str(loadok)) - + has_audio_support = handle.has_audio_support() # multimodal audio support is only known at runtime if not loadok: exitcounter = 999 exit_with_error(3,"Could not load text model: " + modelname)