added token delay feature

This commit is contained in:
Concedo 2024-10-07 19:45:51 +08:00
parent 1803382415
commit 740c5e01cb

View file

@@ -120,6 +120,10 @@ static std::string concat_output_reader_copy_poll = ""; //for streaming
 static std::string concat_output_reader_copy_res = ""; //for gen response
 static std::vector<logit_bias> logit_biases;
+static int delayed_generated_tokens_limit = 0;
+std::deque<std::string> delayed_generated_tokens; //for use with antislop sampling
 inline bool IsNanCheck(float f)
 {
     const unsigned int u = *(unsigned int*)&f;
@@ -2451,6 +2455,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     generation_finished = false; // Set current generation status
     generated_tokens.clear(); // New Generation, new tokens
+    delayed_generated_tokens.clear();
     concat_output_mtx.lock();
     concat_output = "";
@@ -3195,13 +3200,16 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                 {
                     tokenizedstr = ""; //prevent render
                 }
-                if(stream_sse)
+                delayed_generated_tokens.push_back(tokenizedstr);
+                while(delayed_generated_tokens.size() > delayed_generated_tokens_limit && delayed_generated_tokens.size() > 0)
                 {
-                    generated_tokens.push_back(tokenizedstr);
+                    if(stream_sse)
+                    {
+                        generated_tokens.push_back(delayed_generated_tokens[0]);
+                    }
+                    concat_output_mtx.lock();
+                    concat_output += delayed_generated_tokens[0];
+                    concat_output_mtx.unlock();
+                    delayed_generated_tokens.pop_front();
                 }
-                concat_output_mtx.lock();
-                concat_output += tokenizedstr;
-                concat_output_mtx.unlock();
             }

             if (startedsampling && allow_regular_prints)
@@ -3372,6 +3380,16 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         }
     }

+    //flush any remaining delayed tokens
+    while(delayed_generated_tokens.size() > 0)
+    {
+        generated_tokens.push_back(delayed_generated_tokens[0]);
+        concat_output_mtx.lock();
+        concat_output += delayed_generated_tokens[0];
+        concat_output_mtx.unlock();
+        delayed_generated_tokens.pop_front();
+    }
+
    if(debugmode==1 && file_format == FileFormat::GGUF_GENERIC)
    {
        printf("\n");
@@ -3389,7 +3407,6 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     fflush(stdout);
     output.status = 1;
     output.stopreason = last_stop_reason;
-    generation_finished = true;
     last_eval_time = pt2;
     last_process_time = pt1;
     last_token_count = realnpredict;
@@ -3399,5 +3416,6 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     concat_output_reader_copy_res = concat_output;
     concat_output_mtx.unlock();
     output.text = concat_output_reader_copy_res.c_str();
+    generation_finished = true;
     return output;
 }