Better approach when the SWA window is exceeded: simply refill the window. This is not 100% correct, but it is good enough for fast-forward (FF) users. Disable FF or increase the SWA window if the results are not good enough.

This commit is contained in:
Concedo 2026-04-17 11:44:13 +08:00
parent fa3f86ee70
commit 64ce5fca15
3 changed files with 26 additions and 14 deletions

View file

@ -4245,7 +4245,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
{
if(kcpp_data->use_fastforward)
{
ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, false, true, 0);
ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, false, true, 0, 0);
}
}
if(is_recurrent)
@ -4297,6 +4297,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
bool triggerff = kcpp_data->use_fastforward;
if(!blank_prompt) //special case for blank prompts, no fast forward or shifts
{
int ff_swa_retain_amount = 0; //a hack for SWA to improve coherency for illegal rewinds
if(triggerff && !kcpp_data->swa_full && (file_format == FileFormat::GGUF_GENERIC))
{
const int swa_pos_min = llama_memory_seq_pos_min(llama_get_memory(llama_ctx_v4), 0); //this is the furthest back we can rewind to.
@ -4304,10 +4305,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
goal_npast -= 4;
goal_npast = goal_npast < 0 ? 0 : goal_npast;
if (swa_pos_min < 0 || goal_npast <= swa_pos_min) {
triggerff = false;
ff_swa_retain_amount = kcpp_active_swa_size;
if (debugmode==1 && !is_quiet)
{
printf("\nNote: Context cannot be reused (Desired n_past=%d, SWA lowest n_past=%d), doing a full reprocess... to avoid this, disable SWA or increase SWA padding)\n", goal_npast, swa_pos_min);
printf("\nNote: SWA context cannot be reused (Desired n_past=%d, SWA lowest n_past=%d), to avoid this, disable SWA or increase SWA padding), output may degrade.\n", goal_npast, swa_pos_min);
}
}
}
@ -4318,7 +4319,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
}
if(triggerff)
{
ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, triggersc, false, 4);
ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, triggersc, false, 4, ff_swa_retain_amount);
}
}
if(file_format == FileFormat::GGUF_GENERIC)

View file

@ -468,15 +468,15 @@ std::string gguf_get_model_arch(const std::string & gguf_filename)
return longest;
}
void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp,
int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext,
bool useSmartContext, const bool requireFullSubset, const int minimum_to_proceed)
{
const int SCCtxLenThreshold = nctx * 0.8; //how much context length must be reach to trigger smartcontext
const int SCInpLenThreshold = nctx * 0.6; //how big must the input array be to trigger smartcontext
const int SCPastLenThreshold = nctx * 0.5; //how wide of a gap between the fast forwarded past and the present to trigger smart context
const float SCTruncationRatio = 0.5; //ratio for how many tokens to fast forward
const int SCTokThreshold = 32 + (nctx*0.05); //how many tokens of similarity triggers smartcontext
void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp,
int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext,
bool useSmartContext, const bool requireFullSubset, const int minimum_to_proceed, const int minimum_input_to_keep)
{
const int SCCtxLenThreshold = nctx * 0.8; //how much context length must be reach to trigger smartcontext
const int SCInpLenThreshold = nctx * 0.6; //how big must the input array be to trigger smartcontext
const int SCPastLenThreshold = nctx * 0.5; //how wide of a gap between the fast forwarded past and the present to trigger smart context
const float SCTruncationRatio = 0.5; //ratio for how many tokens to fast forward
const int SCTokThreshold = 32 + (nctx*0.05); //how many tokens of similarity triggers smartcontext
//fast forward the past based on identical tokens, stop once a divergence is noted
@ -532,6 +532,17 @@ std::string gguf_get_model_arch(const std::string & gguf_filename)
fastforwardok = false;
}
//cap n_past so that at least minimum_input_to_keep tokens of embd_inp are left after the fast-forwarded prefix; if embd_inp is shorter than that, keep all of it (n_past becomes 0)
if (minimum_input_to_keep > 0 && n_past > embd_inp_len - minimum_input_to_keep)
{
int max_allowed_past = std::max(0, embd_inp_len - minimum_input_to_keep);
n_past = max_allowed_past;
if(n_past<=0)
{
fastforwardok = false;
}
}
if(fastforwardok)
{
last_n_tokens.erase(last_n_tokens.begin(), last_n_tokens.begin() + n_past);

View file

@ -117,7 +117,7 @@ bool ArrStartWith(const std::vector<int> targetArray, const std::vector<int> sea
int ArrFindIndexOf(const std::vector<int> targetArray, const std::vector<int> searchSeq);
FileFormat check_file_format(const std::string & fname, FileFormatExtraMeta * fileformatmeta);
void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp, int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext, const bool useSmartContext, const bool requireFullSubset, const int minimum_to_proceed);
void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp, int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext, const bool useSmartContext, const bool requireFullSubset, const int minimum_to_proceed, const int minimum_input_to_keep);
bool gguf_tensor_exists(const std::string & filename, std::string tensor_name, bool exactmatch);
std::string gguf_get_model_arch(const std::string & filename);