diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index a97712c3d..0faf8a73c 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -2796,11 +2796,12 @@ int GetThreadsToUse(bool blasmode)
 }
 
 //this function prepares the clip embds for llava. it's only needed when images change
-static void PrepareLlavaEmbds(const int nctx, const std::vector<int> & llava_sep)
+static void PrepareLlavaEmbds(const int nctx, const std::vector<int> & llava_sep, const std::vector<int> & llava_intro)
 {
     if(clp_ctx!=nullptr && clp_img_data!=nullptr)
     {
         int sepsize = llava_sep.size();
+        int introsize = llava_intro.size();
         last_llava_mem.clear();
         for(int i=0;i<llava_images.size();i++)
         {
@@ ... @@ static void PrepareLlavaEmbds(const int nctx, const std::vector<int> & llava_sep
             if(llava_images[i].clp_image_tokens>0 && llava_images[i].clp_image_tokens < nctx)
             {
                 int tokcnt = (i==0?(llava_images[i].clp_image_tokens):(llava_images[i].clp_image_tokens+sepsize));
+                if(i==0)
+                {
+                    tokcnt += introsize;
+                }
                 for(int n=0;n<tokcnt;n++)
                 {
@@ ... @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     std::vector<int> embd_inp;
     std::vector<int> embd_inp_mem; //for storing added memory
     std::vector<int> llava_sep; //to separate between different llava images
+    std::vector<int> llava_intro; //to separate between different llava images
 
     bool llava_embds_built = false;
     int32_t nctx = kcpp_data->n_ctx;
@@ -3151,6 +3157,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     TokenizeString(kcpp_data->prompt, embd_inp, file_format, add_bos_token);
     bool use_mrope = (file_format == FileFormat::GGUF_GENERIC && file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL);
     TokenizeString("\n\n", llava_sep, file_format, false);
+    TokenizeString("\nImages:\n", llava_intro, file_format, false);
 
     if(llava_composite_image_signature=="")
     {
@@ -3158,7 +3165,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     }
     if(llava_images_changed)
     {
-        PrepareLlavaEmbds(nctx, llava_sep);
+        PrepareLlavaEmbds(nctx, llava_sep, llava_intro);
         llava_embds_built = true;
     }
 
@@ -3872,7 +3879,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     {
         if(!llava_embds_built) //this should never happen! however, handle it anyway
         {
-            PrepareLlavaEmbds(nctx, llava_sep);
+            PrepareLlavaEmbds(nctx, llava_sep, llava_intro);
             llava_embds_built = true;
             printf("\nSomehow vision embd was not prepared (maybe no fast forward), rebuilding it...\n");
         }
@@ -3888,6 +3895,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             int llavatokenscounted = 0;
             int llavatokensevaled = 0;
             int sepsize = llava_sep.size();
+            int introsize = llava_intro.size();
             while(input_consumed < embd_inp.size() && (embd_inp[input_consumed]==LLAVA_TOKEN_IDENTIFIER_A || embd_inp[input_consumed]==LLAVA_TOKEN_IDENTIFIER_B))
             {
                 if (!last_n_tokens.empty())
                 {
@@ -3902,7 +3910,23 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                 for(int i=0;i<llava_images.size();i++)
                 {
-                    if(i>0 && sepsize>0)
+                    if(introsize>0 && i==0)
+                    {
+                        //added at the start of everything
+                        kcpp_embd_batch batch = kcpp_embd_batch(llava_intro, n_past, use_mrope, false);
+                        auto evr = llama_decode(llama_ctx_v4, batch.batch);
+                        if(evr!=0)
+                        {
+                            printf("\nError when appending llava intro: %d\n",evr);
+                        }
+                        else
+                        {
+                            printf("\rProcessing LLaVa Intro (%d tokens)",introsize);
+                        }
+                        n_past += introsize;
+                        llavatokensevaled += introsize;
+                    }
+                    if(sepsize>0 && i>0)
                     {
                         //add a separator between each image
                         kcpp_embd_batch batch = kcpp_embd_batch(llava_sep, n_past, use_mrope, false);
diff --git a/kcpp_adapters/AutoGuess.json b/kcpp_adapters/AutoGuess.json
index 3d5b27615..48e588989 100644
--- a/kcpp_adapters/AutoGuess.json
+++ b/kcpp_adapters/AutoGuess.json
@@ -38,8 +38,6 @@
     "search": ["System role not supported", "<start_of_turn>"],
     "name": "Google Gemma 2.",
     "adapter": {
-        "system_start": "<start_of_turn>user\n",
-        "system_end": "<end_of_turn>\n",
         "user_start": "<start_of_turn>user\n",
         "user_end": "<end_of_turn>\n",
         "assistant_start": "<start_of_turn>model\n",
@@ -49,8 +47,6 @@
     "search": ["<start_of_turn>", "<end_of_turn>", "<start_of_image>"],
     "name": "Google Gemma 3.",
     "adapter": {
-        "system_start": "<start_of_turn>user\n",
-        "system_end": "<end_of_turn>\n",
         "user_start": "<start_of_turn>user\n",
         "user_end": "<end_of_turn>\n",
         "assistant_start": "<start_of_turn>model\n",
diff --git a/koboldcpp.py b/koboldcpp.py
index 3651ceb15..5e6364ec6 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -49,7 +49,7 @@ logit_bias_max = 512
 dry_seq_break_max = 128
 
 # global vars
-KcppVersion = "1.87.3"
+KcppVersion = "1.87.4"
 showdebug = True
 kcpp_instance = None #global running instance
 global_memory = {"tunnel_url": "", "restart_target":"", "input_to_exit":False, "load_complete":False}
@@ -720,6 +720,22 @@ def string_contains_or_overlaps_sequence_substring(inputstr, sequences):
             return True
     return False
 
+def truncate_long_json(data, max_length):
+    if isinstance(data, dict):
+        new_data = {}
+        for key, value in data.items():
+            if isinstance(value, str):
+                new_data[key] = value[:max_length] + "..." if len(value) > max_length else value
+            else:
+                new_data[key] = truncate_long_json(value, max_length)
+        return new_data
+    elif isinstance(data, list):
+        return [truncate_long_json(item, max_length) for item in data]
+    elif isinstance(data, str):
+        return data[:max_length] + "..." if len(data) > max_length else data
+    else:
+        return data
+
 def get_capabilities():
     global savedata_obj, has_multiplayer, KcppVersion, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath
     has_llm = not (friendlymodelname=="inactive")
@@ -2745,11 +2761,11 @@ Enter Prompt:<br>
         body = None
         if contlenstr:
             content_length = int(contlenstr)
-            if content_length > (1024*1024*32): #32mb payload limit
+            if content_length > (1024*1024*48): #48mb payload limit
                 self.send_response(500)
                 self.end_headers(content_type='application/json')
                 self.wfile.write(json.dumps({"detail": {
-                    "msg": "Payload is too big. Max payload size is 32MB.",
+                    "msg": "Payload is too big. Max payload size is 48MB.",
                     "type": "bad_input",
                 }}).encode())
                 return
@@ -2765,11 +2781,11 @@ Enter Prompt:<br>
                 if line:
                     chunk_length = max(0,int(line, 16))
                     content_length += chunk_length
-                if not line or chunklimit > 512 or content_length > (1024*1024*32): #32mb payload limit
+                if not line or chunklimit > 512 or content_length > (1024*1024*48): #48mb payload limit
                     self.send_response(500)
                     self.end_headers(content_type='application/json')
                     self.wfile.write(json.dumps({"detail": {
-                        "msg": "Payload is too big. Max payload size is 32MB.",
+                        "msg": "Payload is too big. Max payload size is 48MB.",
                         "type": "bad_input",
                     }}).encode())
                     return
@@ -3178,17 +3194,11 @@ Enter Prompt:<br>
                 }}).encode())
                 return
 
-
-            tmpimgs = genparams.get("images", []) # reduce amount of text printed to terminal when dumping large images
-            if tmpimgs and isinstance(tmpimgs, (list, tuple)) and len(tmpimgs)>0:
-                printablegenparams = copy.deepcopy(genparams)
-                outarr = []
-                for img in tmpimgs:
-                    outarr.append(str(img[:512])+"...")
-                printablegenparams["images"] = outarr
-                utfprint("\nInput: " + json.dumps(printablegenparams),1)
-            else:
-                utfprint("\nInput: " + json.dumps(genparams),1)
+            trunc_len = 8000
+            if args.debugmode >= 1:
+                trunc_len = 16000
+            printablegenparams = truncate_long_json(genparams,trunc_len)
+            utfprint("\nInput: " + json.dumps(printablegenparams),1)
 
             if args.foreground:
                 bring_terminal_to_foreground()
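
Note (not part of the patch): a rough Python sketch of the token ordering the gpttype_adapter.cpp change produces when several images are evaluated; the "\nImages:\n" intro tokens are decoded once before the first image, and the "\n\n" separator tokens before each image after the first. The helper and its names below are illustrative only.

    # Illustrative only: mirrors the new intro/separator logic, not actual koboldcpp code.
    def llava_eval_order(image_token_counts, introsize, sepsize):
        order = []
        for i, img_tokens in enumerate(image_token_counts):
            if introsize > 0 and i == 0:
                order.append(("intro", introsize))  # tokens of "\nImages:\n", once at the start
            if sepsize > 0 and i > 0:
                order.append(("sep", sepsize))      # tokens of "\n\n" between images
            order.append((f"image_{i}", img_tokens))
        return order

    print(llava_eval_order([576, 576], introsize=4, sepsize=2))
    # [('intro', 4), ('image_0', 576), ('sep', 2), ('image_1', 576)]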
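
Note (not part of the patch): a minimal, self-contained sketch of how the new truncate_long_json helper behaves when logging request parameters; the sample genparams dict here is hypothetical.

    def truncate_long_json(data, max_length):  # same helper as added in koboldcpp.py above
        if isinstance(data, dict):
            new_data = {}
            for key, value in data.items():
                if isinstance(value, str):
                    new_data[key] = value[:max_length] + "..." if len(value) > max_length else value
                else:
                    new_data[key] = truncate_long_json(value, max_length)
            return new_data
        elif isinstance(data, list):
            return [truncate_long_json(item, max_length) for item in data]
        elif isinstance(data, str):
            return data[:max_length] + "..." if len(data) > max_length else data
        else:
            return data

    genparams = {"prompt": "Describe the picture.", "images": ["iVBORw0KGgo" * 4000], "temperature": 0.7}
    out = truncate_long_json(genparams, 64)
    print(len(out["images"][0]))              # 67: the first 64 chars of the long base64 string plus "..."
    print(out["prompt"], out["temperature"])  # short strings and non-string values pass through unchanged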