split audio and vision detection separately

This commit is contained in:
Concedo 2025-07-13 17:47:15 +08:00
parent 0938af7c83
commit 811463a704
5 changed files with 82 additions and 21 deletions

View file

@ -105,6 +105,7 @@ runmode_untouched = True
modelfile_extracted_meta = None
importvars_in_progress = False
has_multiplayer = False
has_audio_support = False
savedata_obj = None
multiplayer_story_data_compressed = None #stores the full compressed story of the current multiplayer session
multiplayer_turn_major = 1 # to keep track of when a client needs to sync their stories
@ -537,6 +538,7 @@ def init_library():
handle.new_token.argtypes = [ctypes.c_int]
handle.get_stream_count.restype = ctypes.c_int
handle.has_finished.restype = ctypes.c_bool
handle.has_audio_support.restype = ctypes.c_bool
handle.get_last_eval_time.restype = ctypes.c_float
handle.get_last_process_time.restype = ctypes.c_float
handle.get_last_token_count.restype = ctypes.c_int
@ -889,7 +891,7 @@ def convert_json_to_gbnf(json_obj):
return ""
def get_capabilities():
global savedata_obj, has_multiplayer, KcppVersion, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath
global savedata_obj, has_multiplayer, KcppVersion, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, has_audio_support
has_llm = not (friendlymodelname=="inactive")
has_txt2img = not (friendlysdmodelname=="inactive" or fullsdmodelpath=="")
has_vision = (mmprojpath!="")
@ -900,7 +902,7 @@ def get_capabilities():
has_embeddings = (embeddingsmodelpath!="")
has_guidance = True if args.enableguidance else False
admin_type = (2 if args.admin and args.admindir and args.adminpassword else (1 if args.admin and args.admindir else 0))
return {"result":"KoboldCpp", "version":KcppVersion, "protected":has_password, "llm":has_llm, "txt2img":has_txt2img,"vision":has_vision,"transcribe":has_whisper,"multiplayer":has_multiplayer,"websearch":has_search,"tts":has_tts, "embeddings":has_embeddings, "savedata":(savedata_obj is not None), "admin": admin_type, "guidance": has_guidance}
return {"result":"KoboldCpp", "version":KcppVersion, "protected":has_password, "llm":has_llm, "txt2img":has_txt2img,"vision":has_vision,"audio":has_audio_support,"transcribe":has_whisper,"multiplayer":has_multiplayer,"websearch":has_search,"tts":has_tts, "embeddings":has_embeddings, "savedata":(savedata_obj is not None), "admin": admin_type, "guidance": has_guidance}
def dump_gguf_metadata(file_path): #if you're gonna copy this into your own project at least credit concedo
chunk_size = 1024*1024*12 # read first 12mb of file
@ -2287,6 +2289,7 @@ def transform_genparams(genparams, api_format):
tools_message_start = adapter_obj.get("tools_start", "")
tools_message_end = adapter_obj.get("tools_end", "")
images_added = []
audio_added = []
jsongrammar = r"""
root ::= arr
value ::= object | array | string | number | ("true" | "false" | "null") ws
@ -2362,6 +2365,10 @@ ws ::= | " " | "\n" [ \t]{0,20}
if 'image_url' in item and item['image_url'] and item['image_url']['url'] and item['image_url']['url'].startswith("data:image"):
images_added.append(item['image_url']['url'].split(",", 1)[1])
messages_string += "\n(Attached Image)\n"
elif item['type']=="input_audio":
if 'input_audio' in item and item['input_audio'] and item['input_audio']['data']:
audio_added.append(item['input_audio']['data'])
messages_string += "\n(Attached Audio)\n"
# If last message, add any tools calls after message content and before message end token if any
if message['role'] == "user" and message_index == len(messages_array):
# tools handling: Check if user is passing a openai tools array, if so add to end of prompt before assistant prompt unless tool_choice has been set to None
@ -2464,6 +2471,8 @@ ws ::= | " " | "\n" [ \t]{0,20}
genparams["prompt"] = messages_string
if len(images_added)>0:
genparams["images"] = images_added
if len(audio_added)>0:
genparams["audio"] = audio_added
if len(genparams.get('stop_sequence', []))==0: #only set stop seq if it wont overwrite existing
genparams["stop_sequence"] = [user_message_start.strip(),assistant_message_start.strip()]
else:
@ -4947,7 +4956,7 @@ def show_gui():
changed_gpu_choice_var()
# presets selector
makelabel(quick_tab, "Presets:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.")
makelabel(quick_tab, "Backend:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.")
runoptbox = ctk.CTkComboBox(quick_tab, values=runopts, width=190,variable=runopts_var, state="readonly")
runoptbox.grid(row=1, column=1,padx=8, stick="nw")
@ -4995,7 +5004,7 @@ def show_gui():
hardware_tab = tabcontent["Hardware"]
# presets selector
makelabel(hardware_tab, "Presets:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.")
makelabel(hardware_tab, "Backend:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.")
runoptbox = ctk.CTkComboBox(hardware_tab, values=runopts, width=180,variable=runopts_var, state="readonly")
runoptbox.grid(row=1, column=1,padx=8, stick="nw")
runoptbox.set(runopts[0]) # Set to first available option
@ -6577,7 +6586,7 @@ def main(launch_args, default_args):
def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui, start_time, exitcounter, global_memory, using_gui_launcher
global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, friendlyembeddingsmodelname
global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, friendlyembeddingsmodelname, has_audio_support
start_server = True
@ -6934,7 +6943,7 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
print("WARNING: Selected Text Model does not seem to be a GGUF file! Are you sure you picked the right file?")
loadok = load_model(modelname)
print("Load Text Model OK: " + str(loadok))
has_audio_support = handle.has_audio_support() # multimodal audio support is only known at runtime
if not loadok:
exitcounter = 999
exit_with_error(3,"Could not load text model: " + modelname)