llava support is now fully functioning

2025-09-10 17:14:36 +00:00 · 2024-03-11 15:55:32 +08:00 · 2024-03-11 15:55:32 +08:00 · 484d90c330
commit 484d90c330
parent d943c739a8
2 changed files with 193 additions and 47 deletions
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@ -33,6 +33,11 @@
 #include "examples/llava/clip.h"
 #include "examples/llava/llava.h"
 //const
 const int extra_context_handle_fragmentation = 80;
 const int LLAVA_TOKEN_IDENTIFIER_A = -998; //alternate between both, changing when image changes
 const int LLAVA_TOKEN_IDENTIFIER_B = -999;
 //shared
 std::string executable_path = "";
 std::string lora_filename = "";
@ -80,6 +85,8 @@ static llama_context * llama_ctx_v4;
 static clip_ctx * clp_ctx = nullptr; //for llava
 static clip_image_u8 * clp_img_data = nullptr; //most recent image
 static std::vector<llava_image> llava_images;
 static std::string llava_composite_image_signature = ""; //for identifying when the llava images change, we need to invalidate the cache
 static int current_llava_identifier = LLAVA_TOKEN_IDENTIFIER_A;
 static gpt_params * kcpp_params = nullptr;
 static int max_context_limit_at_load = 0;
@ -105,8 +112,6 @@ static std::string concat_output_reader_copy_poll = ""; //for streaming
 static std::string concat_output_reader_copy_res = ""; //for gen response
 static std::vector<logit_bias> logit_biases;
 const int extra_context_handle_fragmentation = 80;
 inline bool IsNanCheck(float f)
 {
    const unsigned int u = *(unsigned int*)&f;
@ -1080,7 +1085,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
            }
        }
-        if(mmproj_filename != "")
+        if(mmproj_filename != "" && file_format==FileFormat::GGUF_GENERIC)
        {
            printf("\nAttempting to apply Multimodal Projector: %s\n", mmproj_filename.c_str());
            clp_ctx = clip_model_load(mmproj_filename.c_str(), /*verbosity=*/ 1);
@ -1593,6 +1598,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
        }
    }
    llava_images.clear();
    std::string new_llava_composite = "";
    for(int x=0;x<images_max;++x)
    {
        std::string item = inputs.images[x];
@ -1601,6 +1607,17 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
            llava_image lv;
            lv.b64data = item;
            llava_images.push_back(lv);
            new_llava_composite += item;
        }
    }
    if(llava_composite_image_signature!=new_llava_composite)
    {
        //images have changed. swap identifiers to force reprocessing
        current_llava_identifier = (current_llava_identifier==LLAVA_TOKEN_IDENTIFIER_A?LLAVA_TOKEN_IDENTIFIER_B:LLAVA_TOKEN_IDENTIFIER_A);
        llava_composite_image_signature = new_llava_composite;
        if(debugmode==1)
        {
            printf("\nLLAVA images changed, existing cache invalidated");
        }
    }
@ -1667,6 +1684,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
    // tokenize the prompt
    std::vector<int> embd_inp;
    std::vector<int> embd_inp_mem; //for storing added memory
    std::vector<int> llava_mem; //for storing dummy tokens that will be consumed by llava
    int32_t nctx = kcpp_params->n_ctx;
    TokenizeString(kcpp_params->prompt, embd_inp, file_format);
    if(clp_ctx!=nullptr && clp_img_data!=nullptr)
@ -1686,7 +1707,20 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                if (!llava_image_embed_make_with_clip_img(clp_ctx, kcpp_params->n_threads, clp_img_data, &llava_images[i].clp_img_embd, &llava_images[i].clp_image_tokens)) {
                    printf("\nError: Clip image %d failed to create embd!",i);
                }
-                printf("\nLLAVA Clip Embed %i used Tokens: %d",i,llava_images[i].clp_image_tokens);
+                if(debugmode==1)
                {
                    printf("\nLLAVA Clip Embed %i used Tokens: %d",i,llava_images[i].clp_image_tokens);
                }
                if(llava_images[i].clp_image_tokens>0 && llava_images[i].clp_image_tokens < nctx)
                {
                    for(int n=0;n<llava_images[i].clp_image_tokens;++n)
                    {
                        llava_mem.push_back(current_llava_identifier);
                    }
                }else
                {
                    printf("\nWarning: LLAVA Image excluded - Context size too low or not enough clip tokens!\n");
                }
            }
        }
    }
@ -1697,8 +1731,6 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
    }
    //truncate to front of the prompt if its too long
    int32_t nctx = kcpp_params->n_ctx;
    if (embd_inp.size() + kcpp_params->n_predict > nctx)
    {
        //get bos token
@ -1713,8 +1745,43 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
        }
    }
    if(llava_mem.size()>0) //stick the llava mem before the added mem
    {
        if(llava_mem.size() + kcpp_params->n_predict + 4 > nctx)
        {
            printf("\nWarning: Too many LLaVA tokens, max context exceeded! They will be ignored!\n");
        }
        else
        {
            std::vector<int> bos;
            TokenizeString("", bos, file_format);
            if(embd_inp_mem.size()>0) //remove existing bos if exists
            {
                if (bos.size()>0 && !embd_inp_mem.empty() && bos[0]==embd_inp_mem[0]) {
                    embd_inp_mem.erase(embd_inp_mem.begin());
                }
            }
            //prepend llava dummy tokens (inserted at the front of the memory tokens)
            embd_inp_mem.insert(embd_inp_mem.begin(), llava_mem.begin(), llava_mem.end());
            if (bos.size() > 0 && embd_inp_mem.size() > 0)
            {
                embd_inp_mem.insert(embd_inp_mem.begin(), bos[0]);  //insert bos at front
            }
             //shorten memory if needed
            if (embd_inp_mem.size() + kcpp_params->n_predict + 4 > nctx)
            {
                int limit = nctx - (kcpp_params->n_predict + 4);
                if (embd_inp_mem.size() > limit) {
                    embd_inp_mem.resize(limit);
                }
            }
        }
    }
    //added special memory, overwrite if needed
-    if(addedmemory!="")
+    if(embd_inp_mem.size()>0)
    {
        //remove bos token from prompt, it'll be taken from memory
        std::vector<int> bos;
@ -1750,7 +1817,6 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
        //stick memory to front of prompt
        embd_inp.insert(embd_inp.begin(), embd_inp_mem.begin(), embd_inp_mem.end());
    }
    //determine how much npast we have to rewind from the current state
@ -2148,15 +2214,69 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
            // some user input remains from prompt or interaction, forward it to processing
            while ((int)embd_inp.size() > input_consumed)
            {
-                embd.push_back(embd_inp[input_consumed]);
+                int currtoken = embd_inp[input_consumed];
-                last_n_tokens.erase(last_n_tokens.begin());
+                if(currtoken==LLAVA_TOKEN_IDENTIFIER_A || currtoken==LLAVA_TOKEN_IDENTIFIER_B) //special llava token hit
                last_n_tokens.push_back(embd_inp[input_consumed]);
                current_context_tokens.push_back(embd_inp[input_consumed]);
                ++input_consumed;
                if ((int)embd.size() >= kcpp_params->n_batch)
                {
-                    break;
+                    //if partial batch, dispatch existing first
                    if(embd.size()>0)
                    {
                        break;
                    }
                    else
                    {
                        //batch is empty, do image processing
                        int llavatokenscounted = 0;
                        int llavatokensevaled = 0;
                        while(input_consumed < embd_inp.size() && (embd_inp[input_consumed]==LLAVA_TOKEN_IDENTIFIER_A || embd_inp[input_consumed]==LLAVA_TOKEN_IDENTIFIER_B))
                        {
                            last_n_tokens.erase(last_n_tokens.begin());
                            last_n_tokens.push_back(currtoken);
                            current_context_tokens.push_back(currtoken);
                            ++input_consumed;
                            ++llavatokenscounted;
                        }
                        for(int i=0;i<llava_images.size();++i)
                        {
                            if(allow_regular_prints)
                            {
                                printf("\rProcessing LLaVa Embedding %d (%d tokens)",(i+1), llava_images[i].clp_image_tokens);
                            }
                            bool err = kcpp_eval_image(llama_ctx_v4,llava_images[i].clp_img_embd,llava_images[i].clp_image_tokens,kcpp_params->n_batch,&n_past);
                            llavatokensevaled += llava_images[i].clp_image_tokens;
                            if(!err)
                            {
                                llava_composite_image_signature = ""; //force invalidate
                                fprintf(stderr, "\nFailed to eval llava image at %d!\n",n_past);
                                output.text = nullptr;
                                output.status = 0;
                                generation_finished = true;
                                return output;
                            }
                        }
                        if(llavatokenscounted!=llavatokensevaled)
                        {
                            llava_composite_image_signature = ""; //force invalidate
                            fprintf(stderr, "\nLLAVA image tokens mismatch at %d! (%d vs %d tokens)\n",n_past,llavatokenscounted,llavatokensevaled);
                            output.text = nullptr;
                            output.status = 0;
                            generation_finished = true;
                            return output;
                        }
                    }
                }
                else
                {
                    embd.push_back(currtoken);
                    last_n_tokens.erase(last_n_tokens.begin());
                    last_n_tokens.push_back(currtoken);
                    current_context_tokens.push_back(currtoken);
                    ++input_consumed;
                    if ((int)embd.size() >= kcpp_params->n_batch)
                    {
                        break;
                    }
                }
            }
        }
    }
--- a/klite.embd
+++ b/klite.embd
@ -3519,6 +3519,8 @@ Current version: 122
 		saved_kai_addr: "", //do not ever share this in save files!
 		saved_oai_jailbreak: "", //customized oai system prompt
 		saved_oai_jailbreak2: "", //oai assistant postfix
 		saved_claude_jailbreak: "", //claude system prompt
 		saved_claude_jailbreak2: "", //claude assistant postfix
 		saved_oai_custommodel: "", //customized oai custom model
 		saved_oai_role: 0, //0=user,1=assistant,2=system
 		saved_a1111_url: default_a1111_base,
@ -4464,6 +4466,10 @@ Current version: 122
 	{
 		return (custom_kobold_endpoint!="" && koboldcpp_version && koboldcpp_version!="" && compare_version_str(koboldcpp_version, "1.49") >= 0);
 	}
 	//truthy only when a custom KoboldCpp endpoint is set and its reported version is at least the minimum checked below
 	function is_using_kcpp_with_llava()
 	{
 		const min_llava_version = "1.61";
 		const has_custom_endpoint = (custom_kobold_endpoint!="");
 		//same short-circuit order as before: endpoint, version present, version non-empty, version comparison
 		return (has_custom_endpoint
 			&& koboldcpp_version
 			&& koboldcpp_version!=""
 			&& compare_version_str(koboldcpp_version, min_llava_version) >= 0);
 	}
 	//0 is none, 1 is pseudostreaming, 2 is true poll-streaming, 3 is sse-streaming
 	function determine_streaming_type()
@ -6861,7 +6867,8 @@ Current version: 122
 			document.getElementById("claudecustom").classList.remove("hidden");
 			document.getElementById("custom_claude_key").value = localsettings.saved_claude_key;
 			document.getElementById("custom_claude_endpoint").value = (localsettings.saved_claude_addr?localsettings.saved_claude_addr:default_claude_base);
-
+			document.getElementById("claudesystemprompt").value = localsettings.saved_claude_jailbreak;
 			document.getElementById("claudejailbreakprompt").value = localsettings.saved_claude_jailbreak2;
 		}
 		else if(epchoice==4)
 		{
@ -7272,6 +7279,8 @@ Current version: 122
 				custom_claude_key = desired_claude_key;
 				localsettings.saved_claude_key = custom_claude_key;
 				localsettings.saved_claude_addr = custom_claude_endpoint;
 				localsettings.saved_claude_jailbreak = document.getElementById("claudesystemprompt").value;
 				localsettings.saved_claude_jailbreak2 = document.getElementById("claudejailbreakprompt").value;
 				custom_claude_model = document.getElementById("custom_claude_model").value.trim();
 				selected_models = [{ "performance": 100.0, "queued": 0.0, "eta": 0, "name": custom_claude_model, "count": 1 }];
@ -9723,6 +9732,10 @@ Current version: 122
 			{
 				submit_payload.params.memory = truncated_memory;
 			}
 			if(is_using_kcpp_with_llava() && insertAIVisionImages.length>0)
 			{
 				submit_payload.params.images = insertAIVisionImages;
 			}
 			if(localsettings.sampler_seed>=1)
 			{
@ -10535,7 +10548,7 @@ Current version: 122
 					compressImage(origImg, (newDataUri) => {
 						image_db[imgid].done = true;
 						image_db[imgid].result = newDataUri;
-					}, true, true, imgres,0.35,false);
+					}, true, false, imgres,0.35,false);
 				}else{
 					image_db[imgid].queue = "Failed";
 					msgbox("Image Generation Failed!\n\nPlease make sure A1111 is running and properly configured!\nIn your local install of Automatic1111 WebUi, modify webui-user.bat and add these flags to enable API access:\n\nset COMMANDLINE_ARGS= --api --listen --cors-allow-origins=*\n");
@ -10574,14 +10587,14 @@ Current version: 122
 		}
 	}
-	function interrogate_new_image(base64img, imghash)
+	function interrogate_new_image(base64img, imghash, use_horde=true)
 	{
 		let parts = base64img.split(',');
 		if (parts.length === 2 && parts[0].startsWith('data:image')) {
 			base64img = parts[1];
 		}
-		if(localsettings.generate_images_mode==2) //a1111
+		if(!use_horde) //a1111
 		{
 			let payload = {
 			"image": base64img,
@ -10657,15 +10670,15 @@ Current version: 122
 		let savedmeta = completed_imgs_meta[imghash];
 		if(savedmeta)
 		{
-			savedmeta.enabled = !savedmeta.enabled;
+			savedmeta.visionmode = document.getElementById("aivisionmode").value;
-			if(!savedmeta.desc && savedmeta.enabled)
+			if(!savedmeta.desc && (savedmeta.visionmode==1 || savedmeta.visionmode==2))
 			{
 				//request a new interrogation
 				var alreadysent = Object.values(interrogation_db).some(item => item.imghash === imghash);
 				if(!alreadysent)
 				{
 					let b64 = document.getElementById("zoomedimg").src;
-					interrogate_new_image(b64,imghash);
+					interrogate_new_image(b64,imghash,(savedmeta.visionmode==1));
 				}
 			}
 			update_clicked_image(imghash);
@ -10681,7 +10694,7 @@ Current version: 122
 		let savedmeta = completed_imgs_meta[imghash];
 		if(!savedmeta && imghash!="")
 		{
-			savedmeta = completed_imgs_meta[imghash] = {prompt:"", desc:"", enabled:false, aspect:0};
+			savedmeta = completed_imgs_meta[imghash] = {prompt:"", desc:"", visionmode:0, aspect:0};
 		}
 		if(savedmeta)
@ -10697,15 +10710,26 @@ Current version: 122
 				document.getElementById("zoomedimg").classList.add("landscape");
 			}
 			if(!savedmeta.visionmode)
 			{
 				savedmeta.visionmode = 0;
 			}
 			let origprompt = (savedmeta.prompt?replaceAll(savedmeta.prompt,"\n"," ") : "No Saved Description");
 			latest_orig_prompt = origprompt;
-			let visionstatus = (savedmeta.enabled?(savedmeta.desc?`<span class="color_green">Active</span>`:`<span class="color_yellow">Analyzing...</span>`):`<span class="color_red">Inactive</span>`);
+			let visionstatus = ((!savedmeta.visionmode || savedmeta.visionmode==0)?`<span class="color_red">Inactive</span>`:((savedmeta.desc||savedmeta.visionmode==3)?`<span class="color_green">Active</span>`:`<span class="color_yellow">Analyzing</span>`));
-			let togglebtn = (savedmeta.enabled?`<button type="button" class="bg_red btn btn-primary" style="width: 84px; padding: 2px; margin: 3px; font-size:12px;" onclick="toggle_ai_vision(\'`+imghash+`\')">Disable</button>`:`<button type="button" class="bg_green btn btn-primary" style="width: 84px; padding: 2px; margin: 3px; font-size:12px;" onclick="toggle_ai_vision(\'`+imghash+`\')">👁️ Enable 👁️</button>`);
+			let togglebtn = `<select class="form-control" id="aivisionmode" style="display:inline;height:24px;width: 134px; padding: 2px; margin: 3px; font-size:12px;" onchange="toggle_ai_vision(\'`+imghash+`\')">
 								<option value="0">Disabled</option>
 								<option value="1">Interrogate (Horde)</option>
 								<option value="2">Interrogate (A1111)</option>
 								<option value="3">Multimodal (LLaVA)</option>
 							</select>`;
 			document.getElementById("zoomedimgdesc").innerHTML = `
 			AI Vision: `+visionstatus+` <span class="helpicon">?<span class="helptext">This allows the AI to visually recognize this image, to see and react to this image. Uses Horde or Local A1111 for image interrogation if enabled.</span></span>
 			`+togglebtn+`
 			<br><button type="button" class="btn btn-primary" style="width: 140px; padding: 2px; margin: 3px; font-size:12px;" onclick="show_orig_prompt()">View Original Prompt</button>
 			`;
 			document.getElementById("aivisionmode").value = savedmeta.visionmode;
 		}
 		else
 		{
@ -11096,7 +11120,7 @@ Current version: 122
 										let imgres = localsettings.img_allowhd?HD_RES_PX:NO_HD_RES_PX;
 										compressImage(origImg, (newDataUri) => {
 											img.result = newDataUri;
-										}, true, true, imgres,0.35,false);
+										}, true, false, imgres,0.35,false);
 									}
 								})
 								.catch((error) => {
@ -11135,7 +11159,7 @@ Current version: 122
 							console.log("Replacing with Image: " + matchstr);
 							gametext_arr[i] = gametext_arr[i].replace(matchstr, newstr);
 							let metaid = cyrb_hash(img.result);
-							completed_imgs_meta[metaid] = {prompt:image_db[key].prompt, desc:"", enabled:false, aspect:image_db[key].aspect};
+							completed_imgs_meta[metaid] = {prompt:image_db[key].prompt, desc:"", visionmode:0, aspect:image_db[key].aspect};
 							delete image_db[key];
 						}
 					}
@ -11167,23 +11191,15 @@ Current version: 122
 			if(!fixedSize)
 			{
 				//otherwise, we preserve the original ratio but scale them down to fit
-				let newwidth = maxSize;
+				let maxImgDim = Math.max(origW,origH);
-				let newheight = maxSize;
+				wantedWidth = origW;
-				let scalef = 1;
+				wantedHeight = origH;
-				if(origW>=origH)
+				if(maxImgDim > maxSize)
 				{
-					newwidth = origW>maxSize?maxSize:origW;
+					let scalef = maxImgDim/maxSize;
-					scalef = newwidth/origW;
+					wantedWidth = origW/scalef;
-					newheight = origH*scalef;
+					wantedHeight = origH/scalef;
 				}
 				else
 				{
 					newheight = origH>maxSize?maxSize:origH;
 					scalef = newheight/origH;
 					newwidth = origW*scalef;
 				}
 				wantedWidth = newwidth;
 				wantedHeight = newheight;
 			}
 			canvas.width = wantedWidth;
@ -11612,6 +11628,7 @@ Current version: 122
 		}
 	}
 	var insertAIVisionImages = []; //concat gametext will populate this
 	function concat_gametext(stripimg = false, stripimg_replace_str = "", append_before_segment="",append_after_segment="",escapeTxt=false,insertAIVision=false) {
 		let fulltxt = "";
 		for (let i = 0; i < gametext_arr.length; ++i) {
@ -11646,9 +11663,6 @@ Current version: 122
 				let a = escapeHtml(localsettings.chatname);
 				fulltxt = replaceAll(fulltxt,a,localsettings.chatname);
 				// let b = escapeHtml(localsettings.chatopponent);
 				// fulltxt = replaceAll(fulltxt,b,localsettings.chatopponent);
 				//unescape other chat opponents too (match anything that is NOT us)
 				var regex = new RegExp("\n(?!" + localsettings.chatname + ").+?\: ", "gi");
 				fulltxt = fulltxt.replace(regex, function (m) {
@ -11667,13 +11681,25 @@ Current version: 122
 		{
 			if(insertAIVision)
 			{
 				insertAIVisionImages = []; //a bit hacky
 				fulltxt = fulltxt.replace(/\[<\|d\|.+?\|d\|>\]/g, function (m) {
 					// m here means the whole matched string
 					let inner = m.substring(5, m.length - 5);
 					let imghash = cyrb_hash(inner);
 					let foundmeta = completed_imgs_meta[imghash];
-					if (foundmeta != null && foundmeta.enabled && foundmeta.desc) {
+					if (foundmeta != null) {
-						return "\n(Attached Image: " + foundmeta.desc + ")\n";
+						if(foundmeta.desc && (foundmeta.visionmode==1||foundmeta.visionmode==2))
 						{
 							return "\n(Attached Image: " + foundmeta.desc + ")\n";
 						}
 						else if(foundmeta.visionmode==3)
 						{
 							let parts = inner.split(',');
 							if (parts.length === 2 && parts[0].startsWith('data:image')) {
 								insertAIVisionImages.push(parts[1]);
 							}
 							return "\n(Attached Image)\n";
 						}
 					}
 					return "";
 				});