diff --git a/expose.h b/expose.h
index 4b7bfdf42..fb25fc8ca 100644
--- a/expose.h
+++ b/expose.h
@@ -251,6 +251,7 @@ struct embeddings_load_model_inputs
 struct embeddings_generation_inputs
 {
     const char * prompt = nullptr;
+    const bool truncate = true;
 };
 struct embeddings_generation_outputs
 {
diff --git a/koboldcpp.py b/koboldcpp.py
index d3e8447e9..b6f9fe713 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -49,7 +49,7 @@ logit_bias_max = 512
 dry_seq_break_max = 128
 
 # global vars
-KcppVersion = "1.87"
+KcppVersion = "1.87.1"
 showdebug = True
 kcpp_instance = None #global running instance
 global_memory = {"tunnel_url": "", "restart_target":"", "input_to_exit":False, "load_complete":False}
@@ -332,7 +332,8 @@ class embeddings_load_model_inputs(ctypes.Structure):
                 ("debugmode", ctypes.c_int)]
 
 class embeddings_generation_inputs(ctypes.Structure):
-    _fields_ = [("prompt", ctypes.c_char_p)]
+    _fields_ = [("prompt", ctypes.c_char_p),
+                ("truncate", ctypes.c_bool)]
 
 class embeddings_generation_outputs(ctypes.Structure):
     _fields_ = [("status", ctypes.c_int),
@@ -1619,6 +1620,7 @@ def embeddings_generate(genparams):
     try:
         inputs = embeddings_generation_inputs()
         inputs.prompt = prompt.encode("UTF-8")
+        inputs.truncate = genparams.get('truncate', True)
         ret = handle.embeddings_generate(inputs)
         if ret.status==1:
             outstr = ret.data.decode("UTF-8","ignore")
@@ -5172,6 +5174,7 @@ def convert_args_to_template(savdict):
     savdict["useclblast"] = None
     savdict["usecublas"] = None
     savdict["usevulkan"] = None
+    savdict["usecpu"] = None
     savdict["tensor_split"] = None
     savdict["draftgpusplit"] = None
     savdict["config"] = None
diff --git a/otherarch/embeddings_adapter.cpp b/otherarch/embeddings_adapter.cpp
index 753ced241..428bbc633 100644
--- a/otherarch/embeddings_adapter.cpp
+++ b/otherarch/embeddings_adapter.cpp
@@ -201,12 +201,30 @@ embeddings_generation_outputs embeddingstype_generate(const embeddings_generatio
     std::vector<std::vector<int32_t>> prompt_inputs;
     auto inp = common_tokenize(embeddings_ctx, prompt, true, true);
     if (inp.size() > n_batch) {
-        printf("\n%s: number of tokens in an input (%lld) exceeds embedding size limit for this model (%lld), lower token amount!\n",
+        if (inputs.truncate) {
+            int oldsize = inp.size();
+            //get bos token
+            std::vector<int32_t> bos;
+            bos = common_tokenize(embeddings_ctx, "", true, true);
+            int offset = inp.size() - n_batch + 1;
+            inp = std::vector<int32_t>(inp.begin() + offset, inp.end());
+            //replace bos into front if exists
+            if (bos.size() > 0 && inp.size() > 0)
+            {
+                inp[0] = bos[0];
+            }
+            if (embeddings_debug)
+            {
+                printf("\n%s: Input too long, truncated from %d to last %d tokens.\n", __func__, oldsize, (int)inp.size());
+            }
+        } else {
+            printf("\n%s: number of tokens in an input (%lld) exceeds embedding size limit for this model (%lld), lower token amount!\n",
             __func__, (long long int) inp.size(), (long long int) n_batch);
-        output.data = "";
-        output.status = 0;
-        output.count = 0;
-        return output;
+            output.data = "";
+            output.status = 0;
+            output.count = 0;
+            return output;
+        }
     }
     prompt_inputs.push_back(inp);
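
Note (not part of the patch): a minimal Python sketch of the truncation strategy the embeddings_adapter.cpp hunk applies when inputs.truncate is set. The function name and token ids below are hypothetical; the logic mirrors the hunk: drop tokens from the front so that n_batch - 1 remain, then overwrite the first surviving token with BOS so the sequence still begins correctly.

def truncate_for_embedding(inp, n_batch, bos):
    # Mirrors the C++ hunk above; names and token ids are illustrative only.
    if len(inp) <= n_batch:
        return inp                  # fits within the limit, nothing to do
    offset = len(inp) - n_batch + 1
    inp = inp[offset:]              # keep the last n_batch - 1 tokens
    if bos and inp:
        inp[0] = bos[0]             # re-seat BOS at the front
    return inp

# Example with a limit of 5 and BOS id 1 (both hypothetical):
print(truncate_for_embedding([1, 10, 11, 12, 13, 14, 15], 5, [1]))
# -> [1, 13, 14, 15]  (BOS plus the last three prompt tokens)

The kept window is n_batch - 1 tokens rather than n_batch because offset is computed as inp.size() - n_batch + 1. A client that needs the head of a document rather than its tail can send "truncate": false in the request (read via genparams.get('truncate', True) above) and shorten the input itself.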