Save KV State and Load KV State to memory added. GUI not yet updated

2025-09-10 09:04:36 +00:00 · 2025-06-03 17:46:29 +08:00 · 2025-06-03 17:46:29 +08:00 · 4b57108508
commit 4b57108508
parent b42b618897
5 changed files with 271 additions and 1 deletions
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@ -142,6 +142,10 @@ static int delayed_generated_tokens_limit = 0;
 std::deque<std::string> delayed_generated_tokens; //for use with antislop sampling
 static std::map<int,std::vector<int>> antislop_banned_token_ids; //first is the npast position, second is the array of banned ids at that index

+static size_t current_savestate_size = 0; // byte size recorded for the saved state (llama_state_get_size at save time); reset to 0 when the snapshot is cleared
+uint8_t * current_savestate_ptr = nullptr; // malloc'd buffer filled by llama_state_get_data; nullptr when no snapshot is held. NOTE(review): not static, presumably referenced from another translation unit - confirm
+static std::vector<gpt_vocab::id> savestate_context_tokens; // copy of current_context_tokens captured at save time and written back on load (for context clones)
+
 inline int kcpp_cpu_has_blas(void) {
 #if defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
    return 1;
@ -4310,3 +4314,87 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
    generation_finished = true;
    return output;
 }
+
+// Reports how many bytes a snapshot of the current llama state would need.
+// Returns llama_state_get_size(llama_ctx_v4) when kcpp_data is set and the
+// file format is GGUF_GENERIC; returns 0 in every other case.
+size_t gpttype_calc_new_state_kv()
+{
+    const bool snapshot_supported = (kcpp_data != nullptr) && (file_format == FileFormat::GGUF_GENERIC);
+    return snapshot_supported ? llama_state_get_size(llama_ctx_v4) : 0;
+}
+// Reports the byte size recorded in current_savestate_size for the snapshot
+// currently held in memory. The value is read as-is; nothing is recomputed.
+size_t gpttype_calc_old_state_kv()
+{
+    const size_t stored_bytes = current_savestate_size;
+    return stored_bytes;
+}
+// Serializes the current llama state into a freshly malloc'd buffer and records
+// a copy of current_context_tokens alongside it. Any snapshot held before the
+// call is released first via gpttype_clear_state_kv().
+// Returns true when llama_state_get_data reports a non-zero byte count.
+// Returns false when kcpp_data is unset, the file format is not GGUF_GENERIC,
+// the allocation fails, or serialization fails; in the failure cases reached
+// after the clear, no snapshot buffer is left allocated.
+bool gpttype_save_state_kv()
+{
+    if(kcpp_data==nullptr)
+    {
+        return false;
+    }
+    if(file_format == FileFormat::GGUF_GENERIC)
+    {
+        gpttype_clear_state_kv(); //JIT free
+        size_t newsize = llama_state_get_size(llama_ctx_v4);
+        current_savestate_ptr = static_cast<uint8_t *>(malloc(newsize + 512));  //add some padding
+        if(!current_savestate_ptr)
+        {
+            return false;
+        }
+        auto res = llama_state_get_data(llama_ctx_v4, current_savestate_ptr, newsize);
+        if (res > 0) {
+            current_savestate_size   = newsize;
+            savestate_context_tokens = current_context_tokens;
+            printf("\nKV Save State: Created SaveState of %zu tokens, costing %zu MB.\n",current_context_tokens.size(),current_savestate_size/(1024*1024));
+        }
+        else
+        {
+            //serialization failed: release the buffer now, otherwise it stays allocated
+            //(with a recorded size of 0, so it can never be loaded) until the next clear
+            free(current_savestate_ptr);
+            current_savestate_ptr = nullptr;
+        }
+        return (res > 0);
+    }
+    return false;
+}
+// Restores the llama state from the in-memory snapshot and, on success, puts
+// the token list saved with it back into current_context_tokens.
+// Returns false without touching anything when kcpp_data is unset, the file
+// format is not GGUF_GENERIC, or no snapshot is held (null buffer or zero
+// size). Otherwise returns whether llama_state_set_data reported a non-zero
+// byte count.
+bool gpttype_load_state_kv()
+{
+    if(kcpp_data==nullptr || file_format != FileFormat::GGUF_GENERIC)
+    {
+        return false;
+    }
+
+    const bool have_snapshot = (current_savestate_ptr != nullptr) && (current_savestate_size != 0);
+    if(!have_snapshot)
+    {
+        return false;
+    }
+
+    const auto bytes_consumed = llama_state_set_data(llama_ctx_v4, current_savestate_ptr, current_savestate_size);
+    const bool restored = (bytes_consumed > 0);
+    if(restored)
+    {
+        //only swap the token list once the state itself was accepted
+        current_context_tokens = savestate_context_tokens;
+        printf("\nKV Load SaveState: Restored KV with %zu tokens.\n",current_context_tokens.size());
+    }
+    return restored;
+}
+// Releases the in-memory snapshot, if one is held: frees the buffer, empties
+// the saved token list and resets the recorded size to 0.
+// Returns true only when a buffer was actually freed. Returns false when
+// kcpp_data is unset, the file format is not GGUF_GENERIC, or there is no
+// buffer to free.
+bool gpttype_clear_state_kv()
+{
+    const bool nothing_to_free = (kcpp_data == nullptr)
+        || (file_format != FileFormat::GGUF_GENERIC)
+        || (current_savestate_ptr == nullptr);
+    if(nothing_to_free)
+    {
+        return false;
+    }
+
+    //JIT free; the size is logged before it is reset below
+    printf("\nKV Clear SaveState: Freed %zu MB.\n",current_savestate_size/(1024*1024));
+    free(current_savestate_ptr);
+    current_savestate_ptr = nullptr;
+    savestate_context_tokens.clear();
+    current_savestate_size = 0;
+    return true;
+}