clear cpu flag manually for templates, added truncation for embeddings

Repository: https://github.com/LostRuins/koboldcpp.git
Commit: e37f27632f (parent: 8a4a9b8c19)
3 changed files with 29 additions and 7 deletions
expose.h (1 change)

@@ -251,6 +251,7 @@ struct embeddings_load_model_inputs
 struct embeddings_generation_inputs
 {
     const char * prompt = nullptr;
+    const bool truncate = true;
 };
 struct embeddings_generation_outputs
 {
@@ -49,7 +49,7 @@ logit_bias_max = 512
 dry_seq_break_max = 128

 # global vars
-KcppVersion = "1.87"
+KcppVersion = "1.87.1"
 showdebug = True
 kcpp_instance = None #global running instance
 global_memory = {"tunnel_url": "", "restart_target":"", "input_to_exit":False, "load_complete":False}
@@ -332,7 +332,8 @@ class embeddings_load_model_inputs(ctypes.Structure):
                 ("debugmode", ctypes.c_int)]

 class embeddings_generation_inputs(ctypes.Structure):
-    _fields_ = [("prompt", ctypes.c_char_p)]
+    _fields_ = [("prompt", ctypes.c_char_p),
+                ("truncate", ctypes.c_bool)]

 class embeddings_generation_outputs(ctypes.Structure):
     _fields_ = [("status", ctypes.c_int),
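A note for anyone mirroring this pattern: ctypes matches fields to the C struct purely by declaration order and type, so the new ("truncate", ctypes.c_bool) entry has to occupy the same slot as const bool truncate does in expose.h, or every field after it reads garbage. A minimal standalone sketch of the binding side (struct name taken from the diff; the size comment assumes a typical 64-bit ABI):

import ctypes

class embeddings_generation_inputs(ctypes.Structure):
    # order and types must match the C struct in expose.h exactly
    _fields_ = [("prompt", ctypes.c_char_p),
                ("truncate", ctypes.c_bool)]

inputs = embeddings_generation_inputs()
inputs.prompt = "hello".encode("UTF-8")
inputs.truncate = True
# 8-byte pointer + 1-byte bool, padded to 16 on most 64-bit platforms
print(ctypes.sizeof(embeddings_generation_inputs))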
@@ -1619,6 +1620,7 @@ def embeddings_generate(genparams):
     try:
         inputs = embeddings_generation_inputs()
         inputs.prompt = prompt.encode("UTF-8")
+        inputs.truncate = genparams.get('truncate', True)
         ret = handle.embeddings_generate(inputs)
         if ret.status==1:
             outstr = ret.data.decode("UTF-8","ignore")
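On the wire, genparams.get('truncate', True) means the flag can ride along in the request JSON and defaults to on, so oversized inputs are truncated unless the caller opts out. A hedged usage sketch — the endpoint path and port below are assumptions in the style of KoboldCpp's OpenAI-compatible API, not something this diff shows:

import requests

# endpoint path and default port 5001 are assumptions, not shown in this diff
resp = requests.post(
    "http://localhost:5001/v1/embeddings",
    json={"input": "a very long document ...", "truncate": True},
)
print(resp.json())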
@@ -5172,6 +5174,7 @@ def convert_args_to_template(savdict):
     savdict["useclblast"] = None
     savdict["usecublas"] = None
     savdict["usevulkan"] = None
+    savdict["usecpu"] = None
     savdict["tensor_split"] = None
     savdict["draftgpusplit"] = None
     savdict["config"] = None
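This usecpu line is the "clear cpu flag manually for templates" half of the commit message: when launch arguments are converted into a saved template, backend-selection flags are blanked alongside the other backend flags, presumably so the template does not pin whatever backend the authoring machine happened to use. A toy illustration of the pattern (hypothetical keys, not the project's actual save routine):

import json

# hypothetical savdict built from parsed launch arguments
savdict = {"model": "mymodel.gguf", "usecpu": True, "usevulkan": None}
for flag in ("useclblast", "usecublas", "usevulkan", "usecpu"):
    savdict[flag] = None  # backend is re-selected when the template is loaded
print(json.dumps(savdict))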
@@ -201,12 +201,30 @@ embeddings_generation_outputs embeddingstype_generate(const embeddings_generation_inputs inputs)
     std::vector<std::vector<int32_t>> prompt_inputs;
     auto inp = common_tokenize(embeddings_ctx, prompt, true, true);
     if (inp.size() > n_batch) {
-        printf("\n%s: number of tokens in an input (%lld) exceeds embedding size limit for this model (%lld), lower token amount!\n",
+        if (inputs.truncate) {
+            int oldsize = inp.size();
+            //get bos token
+            std::vector<int> bos;
+            bos = common_tokenize(embeddings_ctx, "", true, true);
+            int offset = inp.size() - n_batch + 1;
+            inp = std::vector<int>(inp.begin() + offset, inp.end());
+            //replace bos into front if exists
+            if (bos.size() > 0 && inp.size() > 0)
+            {
+                inp[0] = bos[0];
+            }
+            if (embeddings_debug)
+            {
+                printf("\n%s: Input too long, truncated from %d to last %d tokens.\n", __func__, oldsize, (int)inp.size());
+            }
+        } else {
+            printf("\n%s: number of tokens in an input (%lld) exceeds embedding size limit for this model (%lld), lower token amount!\n",
         __func__, (long long int) inp.size(), (long long int) n_batch);
         output.data = "";
         output.status = 0;
         output.count = 0;
         return output;
+        }
     }
     prompt_inputs.push_back(inp);
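The new branch keeps the tail of the prompt rather than the head: it drops the oldest tokens, keeps the last n_batch - 1, and writes the BOS token back over the first surviving token when the model produces one for an empty prompt. A small Python rendering of the same arithmetic (a hypothetical helper, not code from the repository):

def truncate_keep_tail(tokens, n_batch, bos_id=None):
    # mirror of the C++ logic above: keep the last n_batch - 1 tokens,
    # then restore BOS at the front if the model uses one
    if len(tokens) <= n_batch:
        return tokens
    offset = len(tokens) - n_batch + 1
    tail = tokens[offset:]
    if bos_id is not None and tail:
        tail[0] = bos_id
    return tail

print(truncate_keep_tail(list(range(10)), 5, bos_id=1))  # [1, 7, 8, 9]

Note that BOS overwrites the first kept token instead of being prepended, which keeps the result within the batch limit at the cost of one token of content.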