From abf527a2078e70c5d45df618c263d0f5b3fe733b Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Mon, 28 Jul 2025 22:54:49 +0800
Subject: [PATCH] clearer multimodal capability display

---
 expose.cpp          |  4 ++++
 expose.h            |  1 +
 gpttype_adapter.cpp |  8 +++++++-
 koboldcpp.py        | 28 ++++++++++++++++------------
 4 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/expose.cpp b/expose.cpp
index 2a9686556..fe0ee8a43 100644
--- a/expose.cpp
+++ b/expose.cpp
@@ -272,6 +272,10 @@ extern "C" {
         return audio_multimodal_supported;
     }
+    bool has_vision_support()
+    {
+        return vision_multimodal_supported;
+    }
     float get_last_eval_time()
     {
         return last_eval_time;
     }
diff --git a/expose.h b/expose.h
index e7d9eaab5..2399db4d8 100644
--- a/expose.h
+++ b/expose.h
@@ -288,6 +288,7 @@ extern std::string draftmodel_filename;
 extern std::vector generated_tokens;
 extern bool generation_finished;
 extern bool audio_multimodal_supported;
+extern bool vision_multimodal_supported;
 extern float last_eval_time;
 extern float last_process_time;
 extern int last_token_count;
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 7d1066ee5..ff614b052 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -58,6 +58,7 @@ std::string draftmodel_filename = "";
 int speculative_chunk_amt = 8; //do it in chunks of this many tokens
 bool generation_finished;
 bool audio_multimodal_supported = false;
+bool vision_multimodal_supported = false;
 float last_process_time = 0;
 float last_eval_time = 0;
 int last_token_count = 0;
@@ -1985,6 +1986,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     draft_ctx = nullptr;
     guidance_ctx = nullptr;
     audio_multimodal_supported = false;
+    vision_multimodal_supported = false;

     auto clamped_max_context_length = inputs.max_context_length;

@@ -2464,6 +2466,11 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
             return ModelLoadResult::FAIL;
         }
+        if(clp_ctx_v)
+        {
+            vision_multimodal_supported = true;
+        }
+        clp_img_data = clip_image_u8_init();

         if(clp_ctx_a) //init audio
         {
             if (clip_has_whisper_encoder(clp_ctx_a)) {
@@ -2472,7 +2479,6 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
             }
             audio_multimodal_supported = true;
         }
-        clp_img_data = clip_image_u8_init();
     }

     const llama_vocab * tmpvocab = llama_model_get_vocab(llamamodel);
diff --git a/koboldcpp.py b/koboldcpp.py
index 7ef02ac6c..9f7ba282b 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -76,7 +76,6 @@ friendlyembeddingsmodelname = "inactive"
 lastgeneratedcomfyimg = b''
 lastuploadedcomfyimg = b''
 fullsdmodelpath = "" #if empty, it's not initialized
-mmprojpath = "" #if empty, it's not initialized
 password = "" #if empty, no auth key required
 fullwhispermodelpath = "" #if empty, it's not initialized
 ttsmodelpath = "" #if empty, not initialized
@@ -106,6 +105,7 @@ modelfile_extracted_meta = None
 importvars_in_progress = False
 has_multiplayer = False
 has_audio_support = False
+has_vision_support = False
 savedata_obj = None
 multiplayer_story_data_compressed = None #stores the full compressed story of the current multiplayer session
 multiplayer_turn_major = 1 # to keep track of when a client needs to sync their stories
@@ -539,6 +539,7 @@ def init_library():
     handle.get_stream_count.restype = ctypes.c_int
     handle.has_finished.restype = ctypes.c_bool
     handle.has_audio_support.restype = ctypes.c_bool
+    handle.has_vision_support.restype = ctypes.c_bool
     handle.get_last_eval_time.restype = ctypes.c_float
     handle.get_last_process_time.restype = ctypes.c_float
     handle.get_last_token_count.restype = ctypes.c_int
@@ -891,10 +892,9 @@ def convert_json_to_gbnf(json_obj):
     return ""

 def get_capabilities():
-    global savedata_obj, has_multiplayer, KcppVersion, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, has_audio_support
+    global savedata_obj, has_multiplayer, KcppVersion, friendlymodelname, friendlysdmodelname, fullsdmodelpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, has_audio_support, has_vision_support
     has_llm = not (friendlymodelname=="inactive")
     has_txt2img = not (friendlysdmodelname=="inactive" or fullsdmodelpath=="")
-    has_vision = (mmprojpath!="")
     has_password = (password!="")
     has_whisper = (fullwhispermodelpath!="")
     has_search = True if args.websearch else False
@@ -902,7 +902,7 @@ def get_capabilities():
     has_embeddings = (embeddingsmodelpath!="")
     has_guidance = True if args.enableguidance else False
     admin_type = (2 if args.admin and args.admindir and args.adminpassword else (1 if args.admin and args.admindir else 0))
-    return {"result":"KoboldCpp", "version":KcppVersion, "protected":has_password, "llm":has_llm, "txt2img":has_txt2img,"vision":has_vision,"audio":has_audio_support,"transcribe":has_whisper,"multiplayer":has_multiplayer,"websearch":has_search,"tts":has_tts, "embeddings":has_embeddings, "savedata":(savedata_obj is not None), "admin": admin_type, "guidance": has_guidance}
+    return {"result":"KoboldCpp", "version":KcppVersion, "protected":has_password, "llm":has_llm, "txt2img":has_txt2img,"vision":has_vision_support,"audio":has_audio_support,"transcribe":has_whisper,"multiplayer":has_multiplayer,"websearch":has_search,"tts":has_tts, "embeddings":has_embeddings, "savedata":(savedata_obj is not None), "admin": admin_type, "guidance": has_guidance}

 def dump_gguf_metadata(file_path): #if you're gonna copy this into your own project at least credit concedo
     chunk_size = 1024*1024*12 # read first 12mb of file
@@ -3120,7 +3120,7 @@ Change Mode
     def do_GET(self):
         global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui
         global last_req_time, start_time
-        global savedata_obj, has_multiplayer, multiplayer_turn_major, multiplayer_turn_minor, multiplayer_story_data_compressed, multiplayer_dataformat, multiplayer_lastactive, maxctx, maxhordelen, friendlymodelname, lastuploadedcomfyimg, lastgeneratedcomfyimg, KcppVersion, totalgens, preloaded_story, exitcounter, currentusergenkey, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, friendlyembeddingsmodelname
+        global savedata_obj, has_multiplayer, multiplayer_turn_major, multiplayer_turn_minor, multiplayer_story_data_compressed, multiplayer_dataformat, multiplayer_lastactive, maxctx, maxhordelen, friendlymodelname, lastuploadedcomfyimg, lastgeneratedcomfyimg, KcppVersion, totalgens, preloaded_story, exitcounter, currentusergenkey, friendlysdmodelname, fullsdmodelpath, password, friendlyembeddingsmodelname
         self.path = self.path.rstrip('/')
         response_body = None
         content_type = 'application/json'
@@ -3370,7 +3370,7 @@ Change Mode
             return

     def do_POST(self):
-        global modelbusy, requestsinqueue, currentusergenkey, totalgens, pendingabortkey, lastuploadedcomfyimg, lastgeneratedcomfyimg, multiplayer_turn_major, multiplayer_turn_minor, multiplayer_story_data_compressed, multiplayer_dataformat, multiplayer_lastactive, net_save_slots
+        global modelbusy, requestsinqueue, currentusergenkey, totalgens, pendingabortkey, lastuploadedcomfyimg, lastgeneratedcomfyimg, multiplayer_turn_major, multiplayer_turn_minor, multiplayer_story_data_compressed, multiplayer_dataformat, multiplayer_lastactive, net_save_slots, has_vision_support
         contlenstr = self.headers['content-length']
         content_length = 0
         body = None
@@ -3846,8 +3846,7 @@ Change Mode
         elif self.path.endswith('/v1/chat/completions'):
             api_format = 4
         elif self.path.endswith('/sdapi/v1/interrogate'):
-            has_vision = (mmprojpath!="")
-            if not has_vision:
+            if not has_vision_support:
                 self.send_response(503)
                 self.end_headers(content_type='application/json')
                 self.wfile.write(json.dumps({"detail": {
@@ -6641,7 +6640,7 @@ def main(launch_args, default_args):

 def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
     global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui, start_time, exitcounter, global_memory, using_gui_launcher
-    global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, friendlyembeddingsmodelname, has_audio_support
+    global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, friendlyembeddingsmodelname, has_audio_support, has_vision_support

     start_server = True

@@ -6982,9 +6981,7 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
                 exitcounter = 999
                 exit_with_error(2,f"Cannot find mmproj file: {args.mmproj}")
             else:
-                global mmprojpath
                 args.mmproj = os.path.abspath(args.mmproj)
-                mmprojpath = args.mmproj

     if not args.blasthreads or args.blasthreads <= 0:
         args.blasthreads = args.threads
@@ -6998,7 +6995,13 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
             print("WARNING: Selected Text Model does not seem to be a GGUF file! Are you sure you picked the right file?")
         loadok = load_model(modelname)
         print("Load Text Model OK: " + str(loadok))
-        has_audio_support = handle.has_audio_support() # multimodal audio support is only known at runtime
+        if args.mmproj and args.mmproj!="": # multimodal vision and audio support is only known at runtime
+            has_audio_support = handle.has_audio_support()
+            has_vision_support = handle.has_vision_support()
+        else:
+            has_audio_support = False
+            has_vision_support = False
+
         if not loadok:
             exitcounter = 999
             exit_with_error(3,"Could not load text model: " + modelname)
@@ -7193,6 +7196,7 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
         enabledmlist.append("ImageGeneration") if "txt2img" in caps and caps["txt2img"] else disabledmlist.append("ImageGeneration")
         enabledmlist.append("VoiceRecognition") if "transcribe" in caps and caps["transcribe"] else disabledmlist.append("VoiceRecognition")
         enabledmlist.append("MultimodalVision") if "vision" in caps and caps["vision"] else disabledmlist.append("MultimodalVision")
+        enabledmlist.append("MultimodalAudio") if "audio" in caps and caps["audio"] else disabledmlist.append("MultimodalAudio")
         enabledmlist.append("NetworkMultiplayer") if "multiplayer" in caps and caps["multiplayer"] else disabledmlist.append("NetworkMultiplayer")
         enabledmlist.append("ApiKeyPassword") if "protected" in caps and caps["protected"] else disabledmlist.append("ApiKeyPassword")
         enabledmlist.append("WebSearchProxy") if "websearch" in caps and caps["websearch"] else disabledmlist.append("WebSearchProxy")
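
Note on usage: the "vision" and "audio" fields built by get_capabilities() above are what API clients see, so after this patch they reflect what the loaded mmproj actually supports rather than the mere presence of an mmproj path. Below is a minimal client-side sketch, not part of this patch; it assumes the capabilities JSON is served at /api/extra/version on a default local instance (the endpoint path and port are assumptions, only the "vision" and "audio" field names come from this patch).

    # hypothetical client-side check (assumed endpoint and port, see note above)
    import json
    import urllib.request

    def check_multimodal(base_url="http://localhost:5001"):
        # Fetch the JSON built by get_capabilities() in koboldcpp.py.
        with urllib.request.urlopen(f"{base_url}/api/extra/version") as resp:
            caps = json.load(resp)
        # "vision" and "audio" are the capability flags added/adjusted by this patch.
        print("MultimodalVision:", "enabled" if caps.get("vision") else "disabled")
        print("MultimodalAudio: ", "enabled" if caps.get("audio") else "disabled")

    if __name__ == "__main__":
        check_multimodal()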