diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 66f120187..826dac50a 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -5197,10 +5197,12 @@ size_t gpttype_save_state_kv(int slot)
             savestates[slot].savestate_context_tokens = current_context_tokens;
             savestates[slot].media_signature = media_composite_image_signature;
             int maxedpos = llama_memory_seq_pos_max(llama_get_memory(llama_ctx_v4),0);
-            if(maxedpos > 0 && savestates[slot].savestate_context_tokens.size() > maxedpos && savestates[slot].savestate_context_tokens.size()-maxedpos<=2)
+            //kcpp: maxedpos appears to always equal the context token count minus 2; if savestate_context_tokens.size() exceeds maxedpos + 2, trim the excess
+            if(maxedpos > 0 && savestates[slot].savestate_context_tokens.size() > maxedpos + 2)
             {
-                //dirty hack for the memory actually being off by 1 or 2, correct the state
-                while(savestates[slot].savestate_context_tokens.size() > maxedpos)
+                //dirty hack for the memory actually being off, correct the state
+                printf("\nSaveState inconsistency fix, trimming from %d to %d\n",savestates[slot].savestate_context_tokens.size(),maxedpos+2);
+                while(savestates[slot].savestate_context_tokens.size() > maxedpos+2)
                 {
                     savestates[slot].savestate_context_tokens.pop_back();
                 }
@@ -5244,6 +5246,7 @@ bool gpttype_load_state_kv(int slot)
         if (savestates[slot].current_savestate_buffer.empty()) {
             return false;
         }
+        llama_memory_clear(llama_get_memory(llama_ctx_v4),true);
         auto res = llama_state_set_data(llama_ctx_v4, savestates[slot].current_savestate_buffer.data(), savestates[slot].current_savestate_size);
         if(res > 0)
         {