def sweep_media_from_messages(messages_array):
    """Collect embedded image and audio payloads from a list of chat messages.

    Two message layouts are scanned:

    * ``content`` given as a list of parts, where a part of type
      ``"image_url"`` carries ``{"image_url": {"url": "data:image...,<payload>"}}``
      and a part of type ``"input_audio"`` carries ``{"input_audio": {"data": ...}}``.
    * a top-level ``"images"`` list on the message itself, whose entries are
      taken as-is.

    Only ``data:image`` URLs are collected; the text after the first comma is
    kept as the payload. Other URLs (for example ``http(s)://``) are ignored.
    Malformed entries (non-dict messages or parts, missing or ``None`` fields,
    data URLs without a comma) are skipped instead of raising, because the
    messages come straight from a client request.

    Args:
        messages_array: Iterable of message dicts. ``None`` is treated as empty.

    Returns:
        A tuple ``(images, audio)`` of two lists, in order of appearance.
    """
    images = []
    audio = []
    for message in messages_array or []:
        if not isinstance(message, dict):
            continue
        content = message.get("content")
        if isinstance(content, list):
            for part in content:
                if not isinstance(part, dict):
                    # Plain strings or other non-dict parts carry no media.
                    continue
                part_type = part.get("type")
                if part_type == "image_url":
                    image_url = part.get("image_url")
                    # The key may be present but None, so .get(key, {}) is not enough.
                    url = image_url.get("url") if isinstance(image_url, dict) else None
                    if isinstance(url, str) and url.startswith("data:image"):
                        # partition never raises; a missing comma leaves sep empty.
                        _, sep, payload = url.partition(",")
                        if sep:
                            images.append(payload)
                elif part_type == "input_audio":
                    input_audio = part.get("input_audio")
                    data = input_audio.get("data") if isinstance(input_audio, dict) else None
                    if data:
                        audio.append(data)
        extra_images = message.get("images")
        # Require a real sequence so a stray string is not split into characters.
        if extra_images and isinstance(extra_images, (list, tuple)):
            images.extend(extra_images)
    return images, audio