Added mechanics for a full clear if fast forward is not used; this should help recover from bad states.

This commit is contained in:
Concedo 2025-12-05 16:43:37 +08:00
parent 3550265249
commit b867b67e7e
3 changed files with 19 additions and 5 deletions

View file

@@ -3782,7 +3782,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
{
if(kcpp_data->use_fastforward)
{
ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, false, true);
ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, false, true, 0);
}
}
if(is_recurrent)
@@ -3830,12 +3830,19 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
}
if(kcpp_data->use_fastforward)
{
ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, triggersc, false);
ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, triggersc, false, 4);
}
}
if(file_format == FileFormat::GGUF_GENERIC)
{
llama_memory_seq_rm(llama_get_memory(llama_ctx_v4), 0, n_past, -1);
if(n_past==0) //force full clear
{
llama_memory_clear(llama_get_memory(llama_ctx_v4),true);
}
else
{
llama_memory_seq_rm(llama_get_memory(llama_ctx_v4), 0, n_past, -1);
}
if(draft_ctx)
{
llama_memory_seq_rm(llama_get_memory(draft_ctx), 0, n_past, -1);

View file

@@ -513,7 +513,7 @@ std::string gguf_get_model_arch(const std::string & gguf_filename)
void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp,
int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext,
bool useSmartContext, const bool requireFullSubset)
bool useSmartContext, const bool requireFullSubset, const int minimum_to_proceed)
{
const int SCCtxLenThreshold = nctx * 0.8; //how much context length must be reach to trigger smartcontext
const int SCInpLenThreshold = nctx * 0.6; //how big must the input array be to trigger smartcontext
@@ -568,6 +568,13 @@ std::string gguf_get_model_arch(const std::string & gguf_filename)
}
}
if(n_past < minimum_to_proceed) //too few tokens to fast forward, so lets start fresh
{
last_n_tokens.erase(last_n_tokens.end() - n_past, last_n_tokens.end());
n_past = 0;
fastforwardok = false;
}
if(fastforwardok)
{
last_n_tokens.erase(last_n_tokens.begin(), last_n_tokens.begin() + n_past);

View file

@@ -129,7 +129,7 @@ int ArrFindIndexOf(const std::vector<int> targetArray, const std::vector<int> searchArray)
FileFormat check_file_format(const std::string & fname, FileFormatExtraMeta * fileformatmeta);
void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp,
int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext,
const bool useSmartContext, const bool requireFullSubset);
const bool useSmartContext, const bool requireFullSubset, const int minimum_to_proceed);
bool gguf_tensor_exists(const std::string & filename, std::string tensor_name, bool exactmatch);
std::string gguf_get_model_arch(const std::string & filename);