mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-04-28 03:30:20 +00:00
Better approach when the SWA window is exceeded: simply refill the window. This is not 100% correct, but good enough for fast-forward users. Disable FF or increase the window if this is not good enough.
This commit is contained in:
parent
fa3f86ee70
commit
64ce5fca15
3 changed files with 26 additions and 14 deletions
|
|
@ -4245,7 +4245,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
|
|||
{
|
||||
if(kcpp_data->use_fastforward)
|
||||
{
|
||||
ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, false, true, 0);
|
||||
ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, false, true, 0, 0);
|
||||
}
|
||||
}
|
||||
if(is_recurrent)
|
||||
|
|
@ -4297,6 +4297,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
|
|||
bool triggerff = kcpp_data->use_fastforward;
|
||||
if(!blank_prompt) //special case for blank prompts, no fast forward or shifts
|
||||
{
|
||||
int ff_swa_retain_amount = 0; //a hack for SWA to improve coherency for illegal rewinds
|
||||
if(triggerff && !kcpp_data->swa_full && (file_format == FileFormat::GGUF_GENERIC))
|
||||
{
|
||||
const int swa_pos_min = llama_memory_seq_pos_min(llama_get_memory(llama_ctx_v4), 0); //this is the furthest back we can rewind to.
|
||||
|
|
@ -4304,10 +4305,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
|
|||
goal_npast -= 4;
|
||||
goal_npast = goal_npast < 0 ? 0 : goal_npast;
|
||||
if (swa_pos_min < 0 || goal_npast <= swa_pos_min) {
|
||||
triggerff = false;
|
||||
ff_swa_retain_amount = kcpp_active_swa_size;
|
||||
if (debugmode==1 && !is_quiet)
|
||||
{
|
||||
printf("\nNote: Context cannot be reused (Desired n_past=%d, SWA lowest n_past=%d), doing a full reprocess... to avoid this, disable SWA or increase SWA padding)\n", goal_npast, swa_pos_min);
|
||||
printf("\nNote: SWA context cannot be reused (Desired n_past=%d, SWA lowest n_past=%d), to avoid this, disable SWA or increase SWA padding), output may degrade.\n", goal_npast, swa_pos_min);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -4318,7 +4319,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
|
|||
}
|
||||
if(triggerff)
|
||||
{
|
||||
ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, triggersc, false, 4);
|
||||
ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, triggersc, false, 4, ff_swa_retain_amount);
|
||||
}
|
||||
}
|
||||
if(file_format == FileFormat::GGUF_GENERIC)
|
||||
|
|
|
|||
|
|
@ -468,15 +468,15 @@ std::string gguf_get_model_arch(const std::string & gguf_filename)
|
|||
return longest;
|
||||
}
|
||||
|
||||
void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp,
|
||||
int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext,
|
||||
bool useSmartContext, const bool requireFullSubset, const int minimum_to_proceed)
|
||||
{
|
||||
const int SCCtxLenThreshold = nctx * 0.8; //how much context length must be reach to trigger smartcontext
|
||||
const int SCInpLenThreshold = nctx * 0.6; //how big must the input array be to trigger smartcontext
|
||||
const int SCPastLenThreshold = nctx * 0.5; //how wide of a gap between the fast forwarded past and the present to trigger smart context
|
||||
const float SCTruncationRatio = 0.5; //ratio for how many tokens to fast forward
|
||||
const int SCTokThreshold = 32 + (nctx*0.05); //how many tokens of similarity triggers smartcontext
|
||||
void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp,
|
||||
int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext,
|
||||
bool useSmartContext, const bool requireFullSubset, const int minimum_to_proceed, const int minimum_input_to_keep)
|
||||
{
|
||||
const int SCCtxLenThreshold = nctx * 0.8; //how much context length must be reach to trigger smartcontext
|
||||
const int SCInpLenThreshold = nctx * 0.6; //how big must the input array be to trigger smartcontext
|
||||
const int SCPastLenThreshold = nctx * 0.5; //how wide of a gap between the fast forwarded past and the present to trigger smart context
|
||||
const float SCTruncationRatio = 0.5; //ratio for how many tokens to fast forward
|
||||
const int SCTokThreshold = 32 + (nctx*0.05); //how many tokens of similarity triggers smartcontext
|
||||
|
||||
|
||||
//fast forward the past based on identical tokens, stop once a divergence is noted
|
||||
|
|
@ -532,6 +532,17 @@ std::string gguf_get_model_arch(const std::string & gguf_filename)
|
|||
fastforwardok = false;
|
||||
}
|
||||
|
||||
//we must ensure that embd_input is at least minimum_input_to_keep if possible, or as large as it can be
|
||||
if (minimum_input_to_keep > 0 && n_past > embd_inp_len - minimum_input_to_keep)
|
||||
{
|
||||
int max_allowed_past = std::max(0, embd_inp_len - minimum_input_to_keep);
|
||||
n_past = max_allowed_past;
|
||||
if(n_past<=0)
|
||||
{
|
||||
fastforwardok = false;
|
||||
}
|
||||
}
|
||||
|
||||
if(fastforwardok)
|
||||
{
|
||||
last_n_tokens.erase(last_n_tokens.begin(), last_n_tokens.begin() + n_past);
|
||||
|
|
|
|||
|
|
@ -117,7 +117,7 @@ bool ArrStartWith(const std::vector<int> targetArray, const std::vector<int> sea
|
|||
int ArrFindIndexOf(const std::vector<int> targetArray, const std::vector<int> searchSeq);
|
||||
|
||||
FileFormat check_file_format(const std::string & fname, FileFormatExtraMeta * fileformatmeta);
|
||||
void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp, int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext, const bool useSmartContext, const bool requireFullSubset, const int minimum_to_proceed);
|
||||
void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp, int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext, const bool useSmartContext, const bool requireFullSubset, const int minimum_to_proceed, const int minimum_input_to_keep);
|
||||
bool gguf_tensor_exists(const std::string & filename, std::string tensor_name, bool exactmatch);
|
||||
std::string gguf_get_model_arch(const std::string & filename);
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue