diff --git a/expose.cpp b/expose.cpp index 17b8d82d1..8b3a1bc7f 100644 --- a/expose.cpp +++ b/expose.cpp @@ -376,11 +376,19 @@ extern "C" { return gpttype_calc_new_state_kv(); } + size_t calc_new_state_tokencount() + { + return gpttype_calc_new_state_tokencount(); + } size_t calc_old_state_kv() //returns how much memory current savestate is using { return gpttype_calc_old_state_kv(); } - bool save_state_kv() //triggers the save kv state of current ctx to memory + size_t calc_old_state_tokencount() + { + return gpttype_calc_old_state_tokencount(); + } + size_t save_state_kv() //triggers the save kv state of current ctx to memory { return gpttype_save_state_kv(); } @@ -390,6 +398,6 @@ extern "C" } bool clear_state_kv() { - return gpttype_clear_state_kv(); + return gpttype_clear_state_kv(true); } } diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index f3167f252..020104940 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -143,7 +143,7 @@ std::deque<std::string> delayed_generated_tokens; //for use with antislop sampling static std::map<int,std::vector<int>> antislop_banned_token_ids; //first is the npast position, second is the array of banned ids at that index static size_t current_savestate_size = 0; -uint8_t * current_savestate_ptr = nullptr; +static std::vector<uint8_t> current_savestate_buffer; static std::vector<int> savestate_context_tokens; //for context clones inline int kcpp_cpu_has_blas(void) { @@ -4331,30 +4331,44 @@ size_t gpttype_calc_old_state_kv() { return current_savestate_size; } -bool gpttype_save_state_kv() +size_t gpttype_calc_old_state_tokencount() +{ + return savestate_context_tokens.size(); +} +size_t gpttype_calc_new_state_tokencount() +{ + return current_context_tokens.size(); +} +size_t gpttype_save_state_kv() { if(kcpp_data==nullptr) { - return false; + return 0; } if(file_format == FileFormat::GGUF_GENERIC) { - gpttype_clear_state_kv(); //JIT free + gpttype_clear_state_kv(false); //JIT free size_t newsize = llama_state_get_size(llama_ctx_v4); - current_savestate_ptr = 
(uint8_t *) malloc(newsize + 512); //add some padding - if(!current_savestate_ptr) - { - return false; + try { + if (current_savestate_buffer.capacity() < newsize + 512) { + current_savestate_buffer = std::vector<uint8_t>(newsize + 512); + } else { + current_savestate_buffer.resize(newsize + 512); + } + // buffer now holds newsize + 512 bytes (includes padding); the allocation above may throw std::bad_alloc + } catch (const std::bad_alloc&) { + fprintf(stderr, "KV Save State: Failed to allocate %zu bytes.\n", newsize + 512); + return 0; } - auto res = llama_state_get_data(llama_ctx_v4, current_savestate_ptr, newsize); + auto res = llama_state_get_data(llama_ctx_v4, current_savestate_buffer.data(), newsize); if (res > 0) { current_savestate_size = newsize; savestate_context_tokens = current_context_tokens; printf("\nKV Save State: Created SaveState of %zu tokens, costing %zu MB.\n",current_context_tokens.size(),current_savestate_size/(1024*1024)); } - return (res > 0); + return res; } - return false; + return 0; } bool gpttype_load_state_kv() { @@ -4364,10 +4378,10 @@ bool gpttype_load_state_kv() } if(file_format == FileFormat::GGUF_GENERIC) { - if (current_savestate_ptr == nullptr || current_savestate_size == 0) { + if (current_savestate_buffer.empty()) { return false; } - auto res = llama_state_set_data(llama_ctx_v4, current_savestate_ptr, current_savestate_size); + auto res = llama_state_set_data(llama_ctx_v4, current_savestate_buffer.data(), current_savestate_size); if(res > 0) { current_context_tokens = savestate_context_tokens; @@ -4377,7 +4391,7 @@ bool gpttype_load_state_kv() } return false; } -bool gpttype_clear_state_kv() +bool gpttype_clear_state_kv(bool shrink) { if(kcpp_data==nullptr) { @@ -4385,11 +4399,13 @@ bool gpttype_clear_state_kv() } if(file_format == FileFormat::GGUF_GENERIC) { - if (current_savestate_ptr != nullptr) { - //JIT free - printf("\nKV Clear SaveState: Freed %zu MB.\n",current_savestate_size/(1024*1024)); - free(current_savestate_ptr); - 
current_savestate_ptr = nullptr; + if (!current_savestate_buffer.empty()) { + printf("\nKV Clear SaveState: Freed %zu MB.\n", current_savestate_size / (1024 * 1024)); + current_savestate_buffer.clear(); + if(shrink) + { + current_savestate_buffer.shrink_to_fit(); + } savestate_context_tokens.clear(); current_savestate_size = 0; return true; diff --git a/kcpp_docs.embd b/kcpp_docs.embd index 0e3e8464c..ba224b471 100644 --- a/kcpp_docs.embd +++ b/kcpp_docs.embd @@ -440,7 +440,7 @@ "info": { "title": "KoboldCpp API", "description": "For swagger.json, click here or use online version.", - "version": "2025.01.08" + "version": "2025.06.03" }, "openapi": "3.0.3", "paths": { @@ -639,7 +639,7 @@ "application/json": { "example": { "result": "KoboldCpp", - "version": "2025.01.08", + "version": "2025.06.03", "protected": false, "txt2img": false, "vision": false, @@ -1909,8 +1909,10 @@ "application/json": { "example": { "success": true, - "old_state": 0, - "new_state": 0 + "old_state_size": 0, + "old_tokens": 0, + "new_state_size": 0, + "new_tokens": 0 }, "schema": { "properties": { @@ -1918,13 +1920,21 @@ "type": "boolean", "description": "Whether the operation was successful." }, - "old_state": { + "old_state_size": { "type": "number", "description": "Bytes currently in used for existing save state." }, - "new_state": { + "old_tokens": { + "type": "number", + "description": "How many tokens in currently existing save state." + }, + "new_state_size": { "type": "number", "description": "Bytes a new save state is estimated to consume." + }, + "new_tokens": { + "type": "number", + "description": "How many tokens will be stored if a new save state is made." } } } @@ -1947,13 +1957,23 @@ "content": { "application/json": { "example": { - "success": true + "success": true, + "new_state_size": 12345678, + "new_tokens": 100 }, "schema": { "properties": { "success": { "type": "boolean", "description": "Whether the operation was successful." 
+ }, + "new_state_size": { + "type": "number", + "description": "Bytes a new save state is estimated to consume." + }, + "new_tokens": { + "type": "number", + "description": "How many context tokens were saved in state." } } } @@ -1976,13 +1996,18 @@ "content": { "application/json": { "example": { - "success": true + "success": true, + "new_tokens": 100 }, "schema": { "properties": { "success": { "type": "boolean", "description": "Whether the operation was successful." + }, + "new_tokens": { + "type": "number", + "description": "How many context tokens were loaded from state." } } } @@ -2423,7 +2448,7 @@ "/v1/completions": { "post": { "summary": "Generates text continuations given a prompt. Please refer to OpenAI documentation", - "description": "Generates text continuations given a prompt.\n\nThis is an OpenAI compatibility endpoint.\n\n Please refer to OpenAI documentation at [https://platform.openai.com/docs/api-reference/completions](https://platform.openai.com/docs/api-reference/completions)", + "description": "Generates text continuations given a prompt.\n\nThis is an OpenAI compatibility endpoint.\n\n Please refer to OpenAI documentation at [https://platform.openai.com/docs/api-reference/completions](https://platform.openai.com/docs/api-reference/completions). All KoboldCpp samplers are supported, please refer to /api/v1/generate for more details.", "requestBody": { "content": { "application/json": { @@ -2445,7 +2470,7 @@ "/v1/chat/completions": { "post": { "summary": "Generates a response from a list of messages. 
Please refer to OpenAI documentation", - "description": "Given a list of messages comprising a conversation, the model will return a response.\n\n This is an OpenAI compatibility endpoint.\n\n Please refer to OpenAI documentation at [https://platform.openai.com/docs/api-reference/chat](https://platform.openai.com/docs/api-reference/chat)", + "description": "Given a list of messages comprising a conversation, the model will return a response.\n\n This is an OpenAI compatibility endpoint.\n\n Please refer to OpenAI documentation at [https://platform.openai.com/docs/api-reference/chat](https://platform.openai.com/docs/api-reference/chat). All KoboldCpp samplers are supported, please refer to /api/v1/generate for more details.", "requestBody": { "content": { "application/json": { diff --git a/klite.embd b/klite.embd index 5284125d8..184e655d5 100644 --- a/klite.embd +++ b/klite.embd @@ -12,7 +12,7 @@ Current version indicated by LITEVER below. -->