mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-10 09:04:36 +00:00)
added prefix for llava, reverted system role in template as it degraded gemma3. truncated debug logs
parent b3143384b4 · commit 93a226d9e4
3 changed files with 54 additions and 24 deletions
gpttype_adapter.cpp:

@@ -2796,11 +2796,12 @@ int GetThreadsToUse(bool blasmode)
 }

 //this function prepares the clip embds for llava. it's only needed when images change
-static void PrepareLlavaEmbds(const int nctx, const std::vector<int> & llava_sep)
+static void PrepareLlavaEmbds(const int nctx, const std::vector<int> & llava_sep, const std::vector<int> & llava_intro)
 {
     if(clp_ctx!=nullptr && clp_img_data!=nullptr)
     {
         int sepsize = llava_sep.size();
+        int introsize = llava_intro.size();
         last_llava_mem.clear();

         for(int i=0;i<llava_images.size();++i)

@@ -2829,6 +2830,10 @@ static void PrepareLlavaEmbds(const int nctx, const std::vector<int> & llava_sep
             if(llava_images[i].clp_image_tokens>0 && llava_images[i].clp_image_tokens < nctx)
             {
                 int tokcnt = (i==0?(llava_images[i].clp_image_tokens):(llava_images[i].clp_image_tokens+sepsize));
+                if(i==0)
+                {
+                    tokcnt += introsize;
+                }
                 for(int n=0;n<tokcnt;++n)
                 {
                     last_llava_mem.push_back(current_llava_identifier);
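The net effect of these two hunks on the reserved placeholder count can be sketched in a few lines of Python (a hedged illustration; the token counts and variable names below are invented, not taken from the C++ source):

    # Placeholder accounting per image, mirroring PrepareLlavaEmbds after this commit.
    introsize = 4               # assumed token count of the intro string
    sepsize = 2                 # assumed token count of the "\n\n" separator
    image_tokens = [576, 576]   # hypothetical clip token counts per image

    reserved = 0
    for i, img_toks in enumerate(image_tokens):
        tokcnt = img_toks if i == 0 else img_toks + sepsize
        if i == 0:
            tokcnt += introsize  # new in this commit: the intro is billed to the first image
        reserved += tokcnt
    print(reserved)  # 580 + 578 = 1158 placeholder tokens in last_llava_mem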
@@ -3144,6 +3149,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     std::vector<int> embd_inp;
     std::vector<int> embd_inp_mem; //for storing added memory
     std::vector<int> llava_sep; //to separate between different llava images
+    std::vector<int> llava_intro; //prepended before the first llava image
     bool llava_embds_built = false;

     int32_t nctx = kcpp_data->n_ctx;

@@ -3151,6 +3157,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     TokenizeString(kcpp_data->prompt, embd_inp, file_format, add_bos_token);
     bool use_mrope = (file_format == FileFormat::GGUF_GENERIC && file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL);
     TokenizeString("\n\n", llava_sep, file_format, false);
+    TokenizeString("\nImages:\n", llava_intro, file_format, false);

     if(llava_composite_image_signature=="")
     {

@@ -3158,7 +3165,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     }
     if(llava_images_changed)
     {
-        PrepareLlavaEmbds(nctx, llava_sep);
+        PrepareLlavaEmbds(nctx, llava_sep, llava_intro);
         llava_embds_built = true;
     }

@@ -3872,7 +3879,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         {
             if(!llava_embds_built) //this should never happen! however, handle it anyway
             {
-                PrepareLlavaEmbds(nctx, llava_sep);
+                PrepareLlavaEmbds(nctx, llava_sep, llava_intro);
                 llava_embds_built = true;
                 printf("\nSomehow vision embd was not prepared (maybe no fast forward), rebuilding it...\n");
             }

@@ -3888,6 +3895,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             int llavatokenscounted = 0;
             int llavatokensevaled = 0;
             int sepsize = llava_sep.size();
+            int introsize = llava_intro.size();
             while(input_consumed < embd_inp.size() && (embd_inp[input_consumed]==LLAVA_TOKEN_IDENTIFIER_A || embd_inp[input_consumed]==LLAVA_TOKEN_IDENTIFIER_B))
             {
                 if (!last_n_tokens.empty())

@@ -3902,7 +3910,23 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             for(int i=0;i<llava_images.size();++i)
             {
                 //note: no handling for draft_ctx as we don't support vision for it
-                if(i>0 && sepsize>0)
+                if(introsize>0 && i==0)
+                {
+                    //added at the start of everything
+                    kcpp_embd_batch batch = kcpp_embd_batch(llava_intro, n_past, use_mrope, false);
+                    auto evr = llama_decode(llama_ctx_v4, batch.batch);
+                    if(evr!=0)
+                    {
+                        printf("\nError when appending llava intro: %d\n",evr);
+                    }
+                    else
+                    {
+                        printf("\rProcessing LLaVa Intro (%d tokens)",introsize);
+                    }
+                    n_past += introsize;
+                    llavatokensevaled += introsize;
+                }
+                if(sepsize>0 && i>0)
                 {
                     //add a separator between each image
                     kcpp_embd_batch batch = kcpp_embd_batch(llava_sep, n_past, use_mrope, false);
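Taken together, the vision portion of the context is now decoded as: intro once, then each image, with a separator between consecutive images. A small Python sketch of the resulting layout (illustrative only, not the C++ implementation):

    def vision_sequence(images, intro="\nImages:\n", sep="\n\n"):
        seq = []
        for i, img in enumerate(images):
            if i == 0 and intro:
                seq.append(("intro", intro))  # decoded once, before the first image
            if i > 0 and sep:
                seq.append(("sep", sep))      # between consecutive images
            seq.append(("image", img))
        return seq

    print(vision_sequence(["img0", "img1"]))
    # [('intro', '\nImages:\n'), ('image', 'img0'), ('sep', '\n\n'), ('image', 'img1')]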
kcpp_adapters/AutoGuess.json:

@@ -38,8 +38,6 @@
     "search": ["System role not supported", "<start_of_turn>"],
     "name": "Google Gemma 2.",
     "adapter": {
-        "system_start": "<start_of_turn>user\n",
-        "system_end": "<end_of_turn>\n",
         "user_start": "<start_of_turn>user\n",
         "user_end": "<end_of_turn>\n",
         "assistant_start": "<start_of_turn>model\n",

@@ -49,8 +47,6 @@
     "search": ["<start_of_image>", "<start_of_turn>", "<end_of_turn>"],
     "name": "Google Gemma 3.",
     "adapter": {
-        "system_start": "<start_of_turn>user\n",
-        "system_end": "<end_of_turn>\n",
         "user_start": "<start_of_turn>user\n",
         "user_end": "<end_of_turn>\n",
         "assistant_start": "<start_of_turn>model\n",
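With system_start and system_end removed, these adapters no longer wrap system prompts in a dedicated <start_of_turn>user block for Gemma 2 and 3; per the commit message, that wrapping degraded Gemma 3 output, so system content presumably falls back to whatever default handling applies when an adapter omits the system fields.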
koboldcpp.py (42 changes):

@@ -49,7 +49,7 @@ logit_bias_max = 512
 dry_seq_break_max = 128

 # global vars
-KcppVersion = "1.87.3"
+KcppVersion = "1.87.4"
 showdebug = True
 kcpp_instance = None #global running instance
 global_memory = {"tunnel_url": "", "restart_target":"", "input_to_exit":False, "load_complete":False}
@@ -720,6 +720,22 @@ def string_contains_or_overlaps_sequence_substring(inputstr, sequences):
         return True
     return False

+def truncate_long_json(data, max_length):
+    if isinstance(data, dict):
+        new_data = {}
+        for key, value in data.items():
+            if isinstance(value, str):
+                new_data[key] = value[:max_length] + "..." if len(value) > max_length else value
+            else:
+                new_data[key] = truncate_long_json(value, max_length)
+        return new_data
+    elif isinstance(data, list):
+        return [truncate_long_json(item, max_length) for item in data]
+    elif isinstance(data, str):
+        return data[:max_length] + "..." if len(data) > max_length else data
+    else:
+        return data
+
 def get_capabilities():
     global savedata_obj, has_multiplayer, KcppVersion, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath
     has_llm = not (friendlymodelname=="inactive")
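A quick usage sketch for the new helper (the payload values are made up): strings anywhere in a nested structure are clipped to max_length and suffixed with "...", while non-string leaves pass through unchanged.

    payload = {
        "prompt": "Hello",
        "images": ["iVBORw0KGgo" * 1000],  # a long base64-like string
        "temperature": 0.7,
    }
    out = truncate_long_json(payload, 16)
    print(out["prompt"])       # 'Hello' (short strings are untouched)
    print(out["images"][0])    # first 16 chars followed by '...'
    print(out["temperature"])  # 0.7 (non-strings pass through)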
@@ -2745,11 +2761,11 @@ Enter Prompt:<br>
         body = None
         if contlenstr:
             content_length = int(contlenstr)
-            if content_length > (1024*1024*32): #32mb payload limit
+            if content_length > (1024*1024*48): #48mb payload limit
                 self.send_response(500)
                 self.end_headers(content_type='application/json')
                 self.wfile.write(json.dumps({"detail": {
-                    "msg": "Payload is too big. Max payload size is 32MB.",
+                    "msg": "Payload is too big. Max payload size is 48MB.",
                     "type": "bad_input",
                 }}).encode())
                 return

@@ -2765,11 +2781,11 @@ Enter Prompt:<br>
                 if line:
                     chunk_length = max(0,int(line, 16))
                     content_length += chunk_length
-                if not line or chunklimit > 512 or content_length > (1024*1024*32): #32mb payload limit
+                if not line or chunklimit > 512 or content_length > (1024*1024*48): #48mb payload limit
                     self.send_response(500)
                     self.end_headers(content_type='application/json')
                     self.wfile.write(json.dumps({"detail": {
-                        "msg": "Payload is too big. Max payload size is 32MB.",
+                        "msg": "Payload is too big. Max payload size is 48MB.",
                         "type": "bad_input",
                     }}).encode())
                     return
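The 32MB to 48MB bump is plausibly sized for base64 image payloads; a back-of-the-envelope check (the 4/3 inflation factor is standard base64 overhead, the rest is arithmetic):

    old_limit = 1024 * 1024 * 32   # 33,554,432 bytes
    new_limit = 1024 * 1024 * 48   # 50,331,648 bytes
    # base64 encodes 3 raw bytes as 4 characters, so the raw data that fits is:
    print(new_limit * 3 // 4)      # 37,748,736 bytes (~36 MiB of raw image data)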
@@ -3178,17 +3194,11 @@ Enter Prompt:<br>
             }}).encode())
             return

-        tmpimgs = genparams.get("images", []) # reduce amount of text printed to terminal when dumping large images
-        if tmpimgs and isinstance(tmpimgs, (list, tuple)) and len(tmpimgs)>0:
-            printablegenparams = copy.deepcopy(genparams)
-            outarr = []
-            for img in tmpimgs:
-                outarr.append(str(img[:512])+"...")
-            printablegenparams["images"] = outarr
-            utfprint("\nInput: " + json.dumps(printablegenparams),1)
-        else:
-            utfprint("\nInput: " + json.dumps(genparams),1)
+        trunc_len = 8000
+        if args.debugmode >= 1:
+            trunc_len = 16000
+        printablegenparams = truncate_long_json(genparams,trunc_len)
+        utfprint("\nInput: " + json.dumps(printablegenparams),1)

         if args.foreground:
             bring_terminal_to_foreground()