Better approach when the SWA window is exceeded: simply refill the window. This is not 100% correct, but it is good enough for fastforward users. Disable FF or increase the window if it is not good enough.

2026-05-18 23:49:46 +00:00 · 2026-04-17 11:44:13 +08:00 · 2026-04-17 11:44:13 +08:00 · 64ce5fca15
commit 64ce5fca15
parent fa3f86ee70
3 changed files with 26 additions and 14 deletions
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@ -4245,7 +4245,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
        {
            if(kcpp_data->use_fastforward)
            {
-                ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, false, true, 0);
+                ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, false, true, 0, 0);
            }
        }
        if(is_recurrent)
@ -4297,6 +4297,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
        bool triggerff = kcpp_data->use_fastforward;
        if(!blank_prompt) //special case for blank prompts, no fast forward or shifts
        {
+            int ff_swa_retain_amount = 0; //a hack for SWA to improve coherency for illegal rewinds
            if(triggerff && !kcpp_data->swa_full && (file_format == FileFormat::GGUF_GENERIC))
            {
                const int swa_pos_min = llama_memory_seq_pos_min(llama_get_memory(llama_ctx_v4), 0); //this is the furthest back we can rewind to.
@ -4304,10 +4305,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                goal_npast -= 4;
                goal_npast = goal_npast < 0 ? 0 : goal_npast;
                if (swa_pos_min < 0 || goal_npast <= swa_pos_min) {
-                    triggerff = false;
+                    ff_swa_retain_amount = kcpp_active_swa_size;
                    if (debugmode==1 && !is_quiet)
                    {
-                         printf("\nNote: Context cannot be reused (Desired n_past=%d, SWA lowest n_past=%d), doing a full reprocess... to avoid this, disable SWA or increase SWA padding)\n", goal_npast, swa_pos_min);
+                         printf("\nNote: SWA context cannot be reused (Desired n_past=%d, SWA lowest n_past=%d), to avoid this, disable SWA or increase SWA padding), output may degrade.\n", goal_npast, swa_pos_min);
                    }
                }
            }
@ -4318,7 +4319,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
            }
            if(triggerff)
            {
-                ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, triggersc, false, 4);
+                ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, triggersc, false, 4, ff_swa_retain_amount);
            }
        }
        if(file_format == FileFormat::GGUF_GENERIC)
--- a/model_adapter.cpp
+++ b/model_adapter.cpp
@ -468,15 +468,15 @@ std::string gguf_get_model_arch(const std::string & gguf_filename)
     return longest;
 }

- void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp,
- int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext,
- bool useSmartContext, const bool requireFullSubset, const int minimum_to_proceed)
- {
-     const int SCCtxLenThreshold = nctx * 0.8; //how much context length must be reach to trigger smartcontext
-     const int SCInpLenThreshold = nctx * 0.6; //how big must the input array be to trigger smartcontext
-     const int SCPastLenThreshold = nctx * 0.5; //how wide of a gap between the fast forwarded past and the present to trigger smart context
-     const float SCTruncationRatio = 0.5; //ratio for how many tokens to fast forward
-     const int SCTokThreshold = 32 + (nctx*0.05); //how many tokens of similarity triggers smartcontext
+void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp,
+int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext,
+bool useSmartContext, const bool requireFullSubset, const int minimum_to_proceed, const int minimum_input_to_keep)
+{
+    const int SCCtxLenThreshold = nctx * 0.8; //how much context length must be reach to trigger smartcontext
+    const int SCInpLenThreshold = nctx * 0.6; //how big must the input array be to trigger smartcontext
+    const int SCPastLenThreshold = nctx * 0.5; //how wide of a gap between the fast forwarded past and the present to trigger smart context
+    const float SCTruncationRatio = 0.5; //ratio for how many tokens to fast forward
+    const int SCTokThreshold = 32 + (nctx*0.05); //how many tokens of similarity triggers smartcontext


    //fast forward the past based on identical tokens, stop once a divergence is noted
@ -532,6 +532,17 @@ std::string gguf_get_model_arch(const std::string & gguf_filename)
        fastforwardok = false;
    }

+    //we must ensure that embd_input is at least minimum_input_to_keep if possible, or as large as it can be
+    if (minimum_input_to_keep > 0 && n_past > embd_inp_len - minimum_input_to_keep)
+    {
+        int max_allowed_past = std::max(0, embd_inp_len - minimum_input_to_keep);
+        n_past = max_allowed_past;
+        if(n_past<=0)
+        {
+            fastforwardok = false;
+        }
+    }
+
    if(fastforwardok)
    {
        last_n_tokens.erase(last_n_tokens.begin(), last_n_tokens.begin() + n_past);
--- a/model_adapter.h
+++ b/model_adapter.h
@ -117,7 +117,7 @@ bool ArrStartWith(const std::vector<int> targetArray, const std::vector<int> sea
 int ArrFindIndexOf(const std::vector<int> targetArray, const std::vector<int> searchSeq);

 FileFormat check_file_format(const std::string & fname, FileFormatExtraMeta * fileformatmeta);
-void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp, int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext, const bool useSmartContext, const bool requireFullSubset, const int minimum_to_proceed);
+void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp, int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext, const bool useSmartContext, const bool requireFullSubset, const int minimum_to_proceed, const int minimum_input_to_keep);
 bool gguf_tensor_exists(const std::string & filename, std::string tensor_name, bool exactmatch);
 std::string gguf_get_model_arch(const std::string & filename);