split audio and vision detection separately

2025-09-11 01:24:36 +00:00 · 2025-07-13 17:47:15 +08:00 · 2025-07-13 17:47:15 +08:00 · 811463a704
commit 811463a704
parent 0938af7c83
5 changed files with 82 additions and 21 deletions
--- a/koboldcpp.py
+++ b/koboldcpp.py
@ -105,6 +105,7 @@ runmode_untouched = True
 modelfile_extracted_meta = None
 importvars_in_progress = False
 has_multiplayer = False
+has_audio_support = False
 savedata_obj = None
 multiplayer_story_data_compressed = None #stores the full compressed story of the current multiplayer session
 multiplayer_turn_major = 1 # to keep track of when a client needs to sync their stories
@ -537,6 +538,7 @@ def init_library():
    handle.new_token.argtypes = [ctypes.c_int]
    handle.get_stream_count.restype = ctypes.c_int
    handle.has_finished.restype = ctypes.c_bool
+    handle.has_audio_support.restype = ctypes.c_bool
    handle.get_last_eval_time.restype = ctypes.c_float
    handle.get_last_process_time.restype = ctypes.c_float
    handle.get_last_token_count.restype = ctypes.c_int
@ -889,7 +891,7 @@ def convert_json_to_gbnf(json_obj):
        return ""

 def get_capabilities():
-    global savedata_obj, has_multiplayer, KcppVersion, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath
+    global savedata_obj, has_multiplayer, KcppVersion, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, has_audio_support
    has_llm = not (friendlymodelname=="inactive")
    has_txt2img = not (friendlysdmodelname=="inactive" or fullsdmodelpath=="")
    has_vision = (mmprojpath!="")
@ -900,7 +902,7 @@ def get_capabilities():
    has_embeddings = (embeddingsmodelpath!="")
    has_guidance = True if args.enableguidance else False
    admin_type = (2 if args.admin and args.admindir and args.adminpassword else (1 if args.admin and args.admindir else 0))
-    return {"result":"KoboldCpp", "version":KcppVersion, "protected":has_password, "llm":has_llm, "txt2img":has_txt2img,"vision":has_vision,"transcribe":has_whisper,"multiplayer":has_multiplayer,"websearch":has_search,"tts":has_tts, "embeddings":has_embeddings, "savedata":(savedata_obj is not None), "admin": admin_type, "guidance": has_guidance}
+    return {"result":"KoboldCpp", "version":KcppVersion, "protected":has_password, "llm":has_llm, "txt2img":has_txt2img,"vision":has_vision,"audio":has_audio_support,"transcribe":has_whisper,"multiplayer":has_multiplayer,"websearch":has_search,"tts":has_tts, "embeddings":has_embeddings, "savedata":(savedata_obj is not None), "admin": admin_type, "guidance": has_guidance}

 def dump_gguf_metadata(file_path): #if you're gonna copy this into your own project at least credit concedo
    chunk_size = 1024*1024*12  # read first 12mb of file
@ -2287,6 +2289,7 @@ def transform_genparams(genparams, api_format):
            tools_message_start = adapter_obj.get("tools_start", "")
            tools_message_end = adapter_obj.get("tools_end", "")
            images_added = []
+            audio_added = []
            jsongrammar = r"""
 root   ::= arr
 value  ::= object | array | string | number | ("true" | "false" | "null") ws
@ -2362,6 +2365,10 @@ ws ::= | " " | "\n" [ \t]{0,20}
                            if 'image_url' in item and item['image_url'] and item['image_url']['url'] and item['image_url']['url'].startswith("data:image"):
                                images_added.append(item['image_url']['url'].split(",", 1)[1])
                                messages_string += "\n(Attached Image)\n"
+                        elif item['type']=="input_audio":
+                            if 'input_audio' in item and item['input_audio'] and item['input_audio']['data']:
+                                audio_added.append(item['input_audio']['data'])
+                                messages_string += "\n(Attached Audio)\n"
                # If last message, add any tools calls after message content and before message end token if any
                if message['role'] == "user" and message_index == len(messages_array):
                    # tools handling: Check if user is passing a openai tools array, if so add to end of prompt before assistant prompt unless tool_choice has been set to None
@ -2464,6 +2471,8 @@ ws ::= | " " | "\n" [ \t]{0,20}
            genparams["prompt"] = messages_string
            if len(images_added)>0:
                genparams["images"] = images_added
+            if len(audio_added)>0:
+                genparams["audio"] = audio_added
            if len(genparams.get('stop_sequence', []))==0: #only set stop seq if it wont overwrite existing
                genparams["stop_sequence"] = [user_message_start.strip(),assistant_message_start.strip()]
            else:
@ -4947,7 +4956,7 @@ def show_gui():
        changed_gpu_choice_var()

    # presets selector
-    makelabel(quick_tab, "Presets:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.")
+    makelabel(quick_tab, "Backend:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.")

    runoptbox = ctk.CTkComboBox(quick_tab, values=runopts, width=190,variable=runopts_var, state="readonly")
    runoptbox.grid(row=1, column=1,padx=8, stick="nw")
@ -4995,7 +5004,7 @@ def show_gui():
    hardware_tab = tabcontent["Hardware"]

    # presets selector
-    makelabel(hardware_tab, "Presets:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.")
+    makelabel(hardware_tab, "Backend:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.")
    runoptbox = ctk.CTkComboBox(hardware_tab, values=runopts,  width=180,variable=runopts_var, state="readonly")
    runoptbox.grid(row=1, column=1,padx=8, stick="nw")
    runoptbox.set(runopts[0]) # Set to first available option
@ -6577,7 +6586,7 @@ def main(launch_args, default_args):

 def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
    global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui, start_time, exitcounter, global_memory, using_gui_launcher
-    global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, friendlyembeddingsmodelname
+    global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, friendlyembeddingsmodelname, has_audio_support

    start_server = True

@ -6934,7 +6943,7 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
            print("WARNING: Selected Text Model does not seem to be a GGUF file! Are you sure you picked the right file?")
        loadok = load_model(modelname)
        print("Load Text Model OK: " + str(loadok))
-
+        has_audio_support = handle.has_audio_support() # multimodal audio support is only known at runtime
        if not loadok:
            exitcounter = 999
            exit_with_error(3,"Could not load text model: " + modelname)