Use a static buffer for KV state reloads instead of malloc/free on every save; also added save/load state controls to the Lite UI.

2025-09-10 17:14:36 +00:00 · 2025-06-03 22:32:46 +08:00 · 2025-06-03 22:32:46 +08:00 · 53f1511396
commit 53f1511396
parent 4b57108508
6 changed files with 239 additions and 99 deletions
--- a/expose.cpp
+++ b/expose.cpp
@ -376,11 +376,19 @@ extern "C"
    {
        return gpttype_calc_new_state_kv();
    }
    // Exported C entry point: forwards to gpttype_calc_new_state_tokencount()
    // and hands its result back unchanged.
    size_t calc_new_state_tokencount()
    {
        const size_t token_total = gpttype_calc_new_state_tokencount();
        return token_total;
    }
    // Exported C entry point: reports how much memory the current savestate is
    // using, as returned by gpttype_calc_old_state_kv().
    size_t calc_old_state_kv()
    {
        const size_t savestate_bytes = gpttype_calc_old_state_kv();
        return savestate_bytes;
    }
-    bool save_state_kv() //triggers the save kv state of current ctx to memory
+    size_t calc_old_state_tokencount()
    {
        return gpttype_calc_old_state_tokencount();
    }
    // Exported C entry point: triggers saving the KV state of the current ctx
    // to memory and passes through the size_t result of gpttype_save_state_kv().
    size_t save_state_kv()
    {
        const size_t save_result = gpttype_save_state_kv();
        return save_result;
    }
@ -390,6 +398,6 @@ extern "C"
    }
    bool clear_state_kv()
    {
-        return gpttype_clear_state_kv();
+        return gpttype_clear_state_kv(true);
    }
 }
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@ -143,7 +143,7 @@ std::deque<std::string> delayed_generated_tokens; //for use with antislop sampli
 static std::map<int,std::vector<int>> antislop_banned_token_ids; //first is the npast position, second is the array of banned ids at that index
 static size_t current_savestate_size = 0;
-uint8_t * current_savestate_ptr = nullptr;
+static std::vector<uint8_t> current_savestate_buffer;
 static std::vector<gpt_vocab::id> savestate_context_tokens; //for context clones
 inline int kcpp_cpu_has_blas(void) {
@ -4331,30 +4331,44 @@ size_t gpttype_calc_old_state_kv()
 {
    return current_savestate_size;
 }
-bool gpttype_save_state_kv()
+size_t gpttype_calc_old_state_tokencount()
 {
    return savestate_context_tokens.size();
 }
 // Number of tokens currently held in current_context_tokens.
 size_t gpttype_calc_new_state_tokencount()
 {
    const size_t token_total = current_context_tokens.size();
    return token_total;
 }
 size_t gpttype_save_state_kv()
 {
    if(kcpp_data==nullptr)
    {
-        return false;
+        return 0;
    }
    if(file_format == FileFormat::GGUF_GENERIC)
    {
-        gpttype_clear_state_kv(); //JIT free
+        gpttype_clear_state_kv(false); //JIT free
        size_t newsize = llama_state_get_size(llama_ctx_v4);
-        current_savestate_ptr = (uint8_t *) malloc(newsize + 512);  //add some padding
+        try {
-        if(!current_savestate_ptr)
+            if (current_savestate_buffer.capacity() < newsize + 512) {
-        {
+                current_savestate_buffer = std::vector<uint8_t>(newsize + 512);
-            return false;
+            } else {
                current_savestate_buffer.resize(newsize + 512);
            }
-        auto res = llama_state_get_data(llama_ctx_v4, current_savestate_ptr, newsize);
+            current_savestate_buffer.resize(newsize + 512);  // add some padding. May throw std::bad_alloc
        } catch (const std::bad_alloc&) {
            fprintf(stderr, "KV Save State: Failed to allocate %zu bytes.\n", newsize + 512);
            return 0;
        }
        auto res = llama_state_get_data(llama_ctx_v4, current_savestate_buffer.data(), newsize);
        if (res > 0) {
            current_savestate_size   = newsize;
            savestate_context_tokens = current_context_tokens;
            printf("\nKV Save State: Created SaveState of %zu tokens, costing %zu MB.\n",current_context_tokens.size(),current_savestate_size/(1024*1024));
        }
-        return (res > 0);
+        return res;
    }
-    return false;
+    return 0;
 }
 bool gpttype_load_state_kv()
 {
@ -4364,10 +4378,10 @@ bool gpttype_load_state_kv()
    }
    if(file_format == FileFormat::GGUF_GENERIC)
    {
-        if (current_savestate_ptr == nullptr || current_savestate_size == 0) {
+        if (current_savestate_buffer.empty()) {
            return false;
        }
-        auto res = llama_state_set_data(llama_ctx_v4, current_savestate_ptr, current_savestate_size);
+        auto res = llama_state_set_data(llama_ctx_v4, current_savestate_buffer.data(), current_savestate_size);
        if(res > 0)
        {
            current_context_tokens = savestate_context_tokens;
@ -4377,7 +4391,7 @@ bool gpttype_load_state_kv()
    }
    return false;
 }
-bool gpttype_clear_state_kv()
+bool gpttype_clear_state_kv(bool shrink)
 {
    if(kcpp_data==nullptr)
    {
@ -4385,11 +4399,13 @@ bool gpttype_clear_state_kv()
    }
    if(file_format == FileFormat::GGUF_GENERIC)
    {
-        if (current_savestate_ptr != nullptr) {
+        if (!current_savestate_buffer.empty()) {
-            //JIT free
+            printf("\nKV Clear SaveState: Freed %zu MB.\n", current_savestate_size / (1024 * 1024));
-            printf("\nKV Clear SaveState: Freed %zu MB.\n",current_savestate_size/(1024*1024));
+            current_savestate_buffer.clear();
-            free(current_savestate_ptr);
+            if(shrink)
-            current_savestate_ptr = nullptr;
+            {
                current_savestate_buffer.shrink_to_fit();
            }
            savestate_context_tokens.clear();
            current_savestate_size = 0;
            return true;
--- a/kcpp_docs.embd
+++ b/kcpp_docs.embd
@ -440,7 +440,7 @@
             "info": {
                "title": "KoboldCpp API",
                "description": "For swagger.json, <a href=\"?json=1\">click here</a> or use <a href=\"https://lite.koboldai.net/koboldcpp_api.json\">online version</a>.",
-                "version": "2025.01.08"
+                "version": "2025.06.03"
             },
             "openapi": "3.0.3",
             "paths": {
@ -639,7 +639,7 @@
                               "application/json": {
                                  "example": {
                                     "result": "KoboldCpp",
-                                     "version": "2025.01.08",
+                                     "version": "2025.06.03",
                                     "protected": false,
                                     "txt2img": false,
                                     "vision": false,
@ -1909,8 +1909,10 @@
                                 "application/json": {
                                  "example": {
                                     "success": true,
-                                     "old_state": 0,
+                                     "old_state_size": 0,
-                                     "new_state": 0
+                                     "old_tokens": 0,
                                     "new_state_size": 0,
                                     "new_tokens": 0,
                                  },
                                  "schema": {
                                     "properties": {
@ -1918,13 +1920,21 @@
                                           "type": "boolean",
                                           "description": "Whether the operation was successful."
                                        },
-                                        "old_state": {
+                                        "old_state_size": {
                                           "type": "number",
                                           "description": "Bytes currently in use by the existing save state."
                                        },
-                                        "new_state": {
+                                        "old_tokens": {
                                           "type": "number",
                                           "description": "How many tokens are in the currently existing save state."
                                        },
                                        "new_state_size": {
                                           "type": "number",
                                           "description": "Bytes a new save state is estimated to consume."
                                        },
                                        "new_tokens": {
                                           "type": "number",
                                           "description": "How many tokens will be stored if a new save state is made."
                                        }
                                     }
                                  }
@ -1947,13 +1957,23 @@
                            "content": {
                                 "application/json": {
                                  "example": {
-                                     "success": true
+                                     "success": true,
                                     "new_state_size": 12345678,
                                     "new_tokens": 100,
                                  },
                                  "schema": {
                                     "properties": {
                                        "success": {
                                           "type": "boolean",
                                           "description": "Whether the operation was successful."
                                        },
                                        "new_state_size": {
                                           "type": "number",
                                           "description": "Bytes a new save state is estimated to consume."
                                        },
                                        "new_tokens": {
                                           "type": "number",
                                           "description": "How many context tokens were saved in state."
                                        }
                                     }
                                  }
@ -1976,13 +1996,18 @@
                            "content": {
                                 "application/json": {
                                  "example": {
-                                     "success": true
+                                     "success": true,
                                     "new_tokens": 100
                                  },
                                  "schema": {
                                     "properties": {
                                        "success": {
                                           "type": "boolean",
                                           "description": "Whether the operation was successful."
                                        },
                                        "new_tokens": {
                                           "type": "number",
                                           "description": "How many context tokens were loaded from state."
                                        }
                                     }
                                  }
@ -2423,7 +2448,7 @@
                "/v1/completions": {
                   "post": {
                      "summary": "Generates text continuations given a prompt. Please refer to OpenAI documentation",
-                      "description": "Generates text continuations given a prompt.\n\nThis is an OpenAI compatibility endpoint.\n\n Please refer to OpenAI documentation at [https://platform.openai.com/docs/api-reference/completions](https://platform.openai.com/docs/api-reference/completions)",
+                      "description": "Generates text continuations given a prompt.\n\nThis is an OpenAI compatibility endpoint.\n\n Please refer to OpenAI documentation at [https://platform.openai.com/docs/api-reference/completions](https://platform.openai.com/docs/api-reference/completions). All KoboldCpp samplers are supported, please refer to /api/v1/generate for more details.",
                      "requestBody": {
                         "content": {
                            "application/json": {
@ -2445,7 +2470,7 @@
                "/v1/chat/completions": {
                   "post": {
                      "summary": "Generates a response from a list of messages. Please refer to OpenAI documentation",
-                      "description": "Given a list of messages comprising a conversation, the model will return a response.\n\n This is an OpenAI compatibility endpoint.\n\n Please refer to OpenAI documentation at [https://platform.openai.com/docs/api-reference/chat](https://platform.openai.com/docs/api-reference/chat)",
+                      "description": "Given a list of messages comprising a conversation, the model will return a response.\n\n This is an OpenAI compatibility endpoint.\n\n Please refer to OpenAI documentation at [https://platform.openai.com/docs/api-reference/chat](https://platform.openai.com/docs/api-reference/chat). All KoboldCpp samplers are supported, please refer to /api/v1/generate for more details.",
                      "requestBody": {
                         "content": {
                            "application/json": {
--- a/klite.embd
+++ b/klite.embd
@ -12,7 +12,7 @@ Current version indicated by LITEVER below.
 -->
 <script>
-	const LITEVER = 248;
+	const LITEVER = 250;
 	const urlParams = new URLSearchParams(window.location.search);
 	var localflag = urlParams.get('local'); //this will be replaced automatically in embedded kcpp
 	const STORAGE_PREFIX = (localflag?"e_":"")+"kaihordewebui_";
@ -2163,6 +2163,9 @@ Current version indicated by LITEVER below.
 	.color_offwhite {
 		color: #bedae9;
 	}
 	.color_white {
 		color: #ffffff;
 	}
 	.color_darkgreen {
 		color: #63975c;
 	}
@ -2999,6 +3002,8 @@ Current version indicated by LITEVER below.
 	const koboldcpp_tts_endpoint = "/api/extra/tts";
 	const koboldcpp_admin_list_endpoint = "/api/admin/list_options";
 	const koboldcpp_admin_reload_endpoint = "/api/admin/reload_config";
 	const koboldcpp_admin_savestate_endpoint = "/api/admin/save_state";
 	const koboldcpp_admin_loadstate_endpoint = "/api/admin/load_state";
 	const koboldcpp_savedata_list_endpoint = "/api/extra/data/list";
 	const koboldcpp_savedata_save_endpoint = "/api/extra/data/save";
 	const koboldcpp_savedata_load_endpoint = "/api/extra/data/load";
@ -7362,7 +7367,7 @@ Current version indicated by LITEVER below.
 					}
 				} else {
 					//check for tavernai fields
-					let has_tav_wi_check = (new_loaded_storyobj && new_loaded_storyobj.entries && new_loaded_storyobj.entries["0"] && new_loaded_storyobj.entries["0"].hasOwnProperty("uid"));
+					let has_tav_wi_check = has_tavern_wi_check(new_loaded_storyobj);
 					if (!new_loaded_storyobj.scenarioVersion && (new_loaded_storyobj.name != null || new_loaded_storyobj.description != null ||
 						new_loaded_storyobj.personality != null || new_loaded_storyobj.spec=="chara_card_v2" || has_tav_wi_check)) {
 						load_tavern_obj(new_loaded_storyobj);
@ -7837,6 +7842,18 @@ Current version indicated by LITEVER below.
 			}
 		});
 	}
 	//Returns true when obj has an "entries" collection whose first entry (by key order)
 	//owns a "uid" property, false otherwise. Always returns a strict boolean.
 	function has_tavern_wi_check(obj)
 	{
 		if (!obj || !obj.entries) {
 			return false;
 		}
 		const keys = Object.keys(obj.entries);
 		if (keys.length === 0) {
 			return false;
 		}
 		const firstEntry = obj.entries[keys[0]];
 		if (firstEntry === null || firstEntry === undefined) {
 			return false;
 		}
 		//go through Object.prototype so an entry carrying its own "hasOwnProperty" key cannot break the check
 		return Object.prototype.hasOwnProperty.call(firstEntry, "uid");
 	}
 	function load_agnai_wi(obj,chatopponent,myname)
 	{
@ -7930,7 +7947,7 @@ Current version indicated by LITEVER below.
 	function importLorebookAsTextDB(lorebook)
 	{
-		let has_tav_wi_check = (lorebook && lorebook.entries && lorebook.entries["0"] && lorebook.entries["0"].hasOwnProperty("uid"));
+		let has_tav_wi_check = has_tavern_wi_check(lorebook);
 		if (lorebook && has_tav_wi_check)
 		{
 			let lbname = lorebook.name?lorebook.name:"UntitledLorebook";
@ -8051,7 +8068,7 @@ Current version indicated by LITEVER below.
 			}
 			let combinedmem = sysprompt + memory + scenario + examplemsg;
 			let agnaidatafieldsempty = scenario + examplemsg + (obj.personality?obj.personality:"") + greeting;
-			let has_tav_wi_check = (obj && obj.entries && obj.entries["0"] && obj.entries["0"].hasOwnProperty("uid"));
+			let has_tav_wi_check = has_tavern_wi_check(obj);
 			//check if it's a world info only card, if so, do not restart game
 			if(combinedmem.trim()=="" && greeting=="" && has_tav_wi_check)
 			{
@ -8102,6 +8119,10 @@ Current version indicated by LITEVER below.
 				{
 					current_wi = load_agnai_wi(obj,chatopponent,myname);
 				}
 				else if (has_tav_wi_check)
 				{
 					current_wi = load_tavern_wi(obj,chatopponent,myname);
 				}
 			}
 			update_for_sidepanel();
 			render_gametext(true);
@ -11074,6 +11095,7 @@ Current version indicated by LITEVER below.
 	function display_admin_container()
 	{
 		mainmenu_untab(false);
 		document.getElementById("loadstatetxt").innerText = "";
 		let fetch_kcpps_configs = function(adminkey)
 		{
 			let header = {'Content-Type': 'application/json'};
@ -11189,6 +11211,62 @@ Current version indicated by LITEVER below.
 		});
 	}
 	//Sends a POST to the admin save_state endpoint and reports the outcome in the
 	//"loadstatetxt" status element. The stored admin key, when set, is sent as a Bearer token.
 	function trigger_admin_savestate()
 	{
 		document.getElementById("loadstatetxt").innerText = "Saving State...";
 		let header = {'Content-Type': 'application/json'};
 		if(last_admin_key!="")
 		{
 			header['Authorization'] = 'Bearer ' + last_admin_key;
 		}
 		fetch(custom_kobold_endpoint + koboldcpp_admin_savestate_endpoint, {
 			method: 'POST',
 			headers: header
 		})
 		.then(x => x.json())
 		.then(values => {
 			console.log(values);
 			if(values.success)
 			{
 				//Math.floor rather than parseInt: parseInt stringifies its argument first, so a tiny
 				//quotient printed in exponent form (e.g. "9.5e-7") would be misread as 9
 				let size_mb = Math.floor(values.new_state_size/(1024*1024));
 				document.getElementById("loadstatetxt").innerText = `State Saved (${values.new_tokens} tokens in ${size_mb} MB)`;
 			}else{
 				document.getElementById("loadstatetxt").innerText = `Save State Failed!`;
 			}
 		}).catch((error) => {
 			//network failures, unparsable responses and errors thrown above all land here
 			console.log("Error: " + error);
 			document.getElementById("loadstatetxt").innerText = `Save State Failed!`;
 			msgbox(error,"Error");
 		});
 	}
 	//Sends a POST to the admin load_state endpoint and reports the outcome in the
 	//"loadstatetxt" status element. The stored admin key, when set, is sent as a Bearer token.
 	function trigger_admin_loadstate()
 	{
 		const show_status = (text) => {
 			document.getElementById("loadstatetxt").innerText = text;
 		};
 		show_status("Loading State...");

 		const reqheaders = {'Content-Type': 'application/json'};
 		if(last_admin_key!="")
 		{
 			reqheaders['Authorization'] = 'Bearer ' + last_admin_key;
 		}

 		const target = custom_kobold_endpoint + koboldcpp_admin_loadstate_endpoint;
 		fetch(target, {method: 'POST', headers: reqheaders})
 		.then(resp => resp.json())
 		.then(reply => {
 			console.log(reply);
 			show_status(reply.success ? `State Loaded (${reply.new_tokens} tokens)` : `Load State Failed!`);
 		})
 		.catch((err) => {
 			//network failures, unparsable responses and errors thrown above all land here
 			console.log("Error: " + err);
 			show_status(`Load State Failed!`);
 			msgbox(err,"Error");
 		});
 	}
 	var cachedsaveslotlabels = [];
 	var netsaveslotlabels = [];
 	function saveloadchangeslot(updatelist=false)
@ -22754,7 +22832,7 @@ Current version indicated by LITEVER below.
 				<div style="float:right;">
 					<div class="settinglabel">
-						<button type="button" class="btn purplebtn widelbtn" style="padding:4px;margin:2px;margin-top:4px;margin-bottom:4px;font-size:8px" id="wiexport" onclick="wi_group_export()">[Export / Import Group]</button>
+						<button type="button" class="btn purplebtn widelbtn" style="padding:4px;margin:2px;margin-top:4px;margin-bottom:4px;font-size:8px" id="wiexport" onclick="wi_group_export()">[Edit Group]</button>
 						<button type="button" class="btn purplebtn widelbtn" style="padding:4px;margin:2px;margin-top:4px;margin-bottom:4px;font-size:8px" id="wiexport" onclick="export_wi_to_file()">[Export all WI to file]</button>
 						<button type="button" class="btn purplebtn widelbtn" style="padding:4px;margin:2px;margin-top:4px;margin-bottom:4px;font-size:8px" id="wiexport" onclick="import_wi_from_file()">[Import all WI from file]</button>
 					</div>
@ -24698,21 +24776,31 @@ Current version indicated by LITEVER below.
 	<div class="popupcontainer flex hidden" id="admincontainer">
 		<div class="popupbg flex"></div>
-		<div class="nspopup flexsizevsmall">
+		<div class="nspopup flexsizesmall">
 			<div class="popuptitlebar">
-				<div class="popuptitletext">Change Loaded KoboldCpp Config</div>
+				<div class="popuptitletext">KoboldCpp Admin Config</div>
 			</div>
-			<div class="menutext">
+			<br>
 				<b></b>Warning: This will terminate the current KoboldCpp instance and relaunch it with a new config.</b><br><br>
 				If an invalid configuration is selected, the new server may fail to relaunch!<br><br>
 			<div>
-					<select title="Select New Config" style="padding:4px;" class="form-control" id="adminconfigdropdown">
+				<b class="color_white" style="padding: 5px;">Save / Load Context State:</b><br>
-					</select>
+				<div style="display:flex;padding: 5px;">
 				<button type="button" style="margin:2px;width:50%" class="btn btn-primary" onclick="trigger_admin_savestate()">Save State</button>
 				<button type="button" style="margin:2px;width:50%" class="btn btn-primary" onclick="trigger_admin_loadstate()">Load State</button>
 				</div>
 				<div class="menutext" id="loadstatetxt"></div>
 			</div>
 			<br>
 			<div>
 				<b class="color_white" style="padding: 5px;">Change Loaded Model / Config:</b><br>
 				<div style="display:flex;padding: 5px;">
 					<select title="Select New Config" style="padding:4px; width:calc(100% - 150px)" class="form-control" id="adminconfigdropdown">
 					</select>
 					<button type="button" style="margin-left:2px;width:146px" class="btn btn-primary" onclick="trigger_admin_reload()">Reload KoboldCpp</button>
 				</div>
 				<div class="menutext">Warning: This will terminate the current KoboldCpp instance and relaunch it with a new config. If an invalid configuration is selected, the new server may fail to relaunch!</div>
 				<br>
 			</div>
 			<div class="popupfooter">
 				<button type="button" style="width:200px" class="btn btn-primary" onclick="trigger_admin_reload()">Reload KoboldCpp</button>
 				<button type="button" class="btn btn-primary" onclick="hide_popups()">Cancel</button>
 			</div>
 		</div>
--- a/koboldcpp.py
+++ b/koboldcpp.py
@ -523,7 +523,9 @@ def init_library():
    handle.get_chat_template.restype = ctypes.c_char_p
    handle.calc_new_state_kv.restype = ctypes.c_size_t
    handle.calc_old_state_kv.restype = ctypes.c_size_t
-    handle.save_state_kv.restype = ctypes.c_bool
+    handle.calc_new_state_tokencount.restype = ctypes.c_size_t
    handle.calc_old_state_tokencount.restype = ctypes.c_size_t
    handle.save_state_kv.restype = ctypes.c_size_t
    handle.load_state_kv.restype = ctypes.c_bool
    handle.clear_state_kv.restype = ctypes.c_bool
    handle.sd_load_model.argtypes = [sd_load_model_inputs]
@ -3090,7 +3092,7 @@ Change Mode<br>
        elif self.path=="/v1":
            content_type = 'text/html'
-            response_body = ("KoboldCpp OpenAI compatible endpoint is running!\n\nFor usage reference, see https://platform.openai.com/docs/api-reference").encode()
+            response_body = ("KoboldCpp OpenAI compatible endpoint is running!<br>For usage reference, see <a href='https://platform.openai.com/docs/api-reference'>https://platform.openai.com/docs/api-reference</a><br>For other endpoints, see <a href='/api'>KoboldCpp API Documentation</a>").encode()
        elif self.path=="/api/extra/preloadstory":
            if preloaded_story is None:
@ -3457,32 +3459,6 @@ Change Mode<br>
                            resp = {"success": True}
            response_body = (json.dumps(resp).encode())
        elif self.path.endswith('/api/admin/check_state'):
            if global_memory and args.admin and args.admindir and os.path.exists(args.admindir) and self.check_header_password(args.adminpassword):
                newstate = handle.calc_new_state_kv()
                oldstate = handle.calc_old_state_kv()
                response_body = (json.dumps({"success": True, "old_state":oldstate, "new_state":newstate}).encode())
            else:
                response_body = (json.dumps({"success": False}).encode())
        elif self.path.endswith('/api/admin/load_state'):
            if global_memory and args.admin and args.admindir and os.path.exists(args.admindir) and self.check_header_password(args.adminpassword):
                result = handle.load_state_kv()
                response_body = (json.dumps({"success": result}).encode())
            else:
                response_body = (json.dumps({"success": False}).encode())
        elif self.path.endswith('/api/admin/save_state'):
            if global_memory and args.admin and args.admindir and os.path.exists(args.admindir) and self.check_header_password(args.adminpassword):
                result = handle.save_state_kv()
                response_body = (json.dumps({"success": result}).encode())
            else:
                response_body = (json.dumps({"success": False}).encode())
        elif self.path.endswith('/api/admin/clear_state'):
            if global_memory and args.admin and args.admindir and os.path.exists(args.admindir) and self.check_header_password(args.adminpassword):
                result = handle.clear_state_kv()
                response_body = (json.dumps({"success": result}).encode())
            else:
                response_body = (json.dumps({"success": False}).encode())
        elif self.path.endswith('/set_tts_settings'): #return dummy response
            response_body = (json.dumps({"message": "Settings successfully applied"}).encode())
@ -3532,33 +3508,58 @@ Change Mode<br>
        if reqblocking:
            requestsinqueue = (requestsinqueue - 1) if requestsinqueue > 0 else 0
        # handle endpoints that require mutex locking and handle actual gens
        try:
            sse_stream_flag = False
            api_format = 0 #1=basic,2=kai,3=oai,4=oai-chat,5=interrogate,6=ollama,7=ollamachat
            is_imggen = False
            is_comfyui_imggen = False
            is_transcribe = False
            is_tts = False
            is_embeddings = False
            response_body = None
-            if self.path.endswith('/request'):
+            if self.path.endswith('/api/admin/check_state'):
                if global_memory and args.admin and args.admindir and os.path.exists(args.admindir) and self.check_header_password(args.adminpassword):
                    newstate = handle.calc_new_state_kv()
                    oldstate = handle.calc_old_state_kv()
                    newtokencnt = handle.calc_new_state_tokencount()
                    oldtokencnt = handle.calc_old_state_tokencount()
                    response_body = (json.dumps({"success": True, "old_state_size":oldstate, "old_tokens":oldtokencnt, "new_state_size":newstate, "new_tokens":newtokencnt}).encode())
                else:
                    response_body = (json.dumps({"success": False, "old_state_size":0, "old_tokens":0, "new_state_size":0, "new_tokens":0}).encode())
            elif self.path.endswith('/api/admin/load_state'):
                if global_memory and args.admin and args.admindir and os.path.exists(args.admindir) and self.check_header_password(args.adminpassword):
                    result = handle.load_state_kv()
                    tokencnt = handle.calc_new_state_tokencount()
                    response_body = (json.dumps({"success": result, "new_tokens":tokencnt}).encode())
                else:
                    response_body = (json.dumps({"success": False, "new_tokens":0}).encode())
            elif self.path.endswith('/api/admin/save_state'):
                if global_memory and args.admin and args.admindir and os.path.exists(args.admindir) and self.check_header_password(args.adminpassword):
                    result = handle.save_state_kv()
                    tokencnt = handle.calc_new_state_tokencount()
                    response_body = (json.dumps({"success": (result>0), "new_state_size":result, "new_tokens":tokencnt}).encode())
                else:
                    response_body = (json.dumps({"success": False, "new_state_size":0, "new_tokens":0}).encode())
            elif self.path.endswith('/api/admin/clear_state'):
                if global_memory and args.admin and args.admindir and os.path.exists(args.admindir) and self.check_header_password(args.adminpassword):
                    result = handle.clear_state_kv()
                    response_body = (json.dumps({"success": result}).encode())
                else:
                    response_body = (json.dumps({"success": False}).encode())
            elif self.path.endswith('/request'):
                api_format = 1
-
+            elif self.path.endswith(('/api/v1/generate', '/api/latest/generate')):
            if self.path.endswith(('/api/v1/generate', '/api/latest/generate')):
                api_format = 2
-
+            elif self.path.endswith('/api/extra/generate/stream'):
            if self.path.endswith('/api/extra/generate/stream'):
                api_format = 2
                sse_stream_flag = True
-
+            elif self.path.endswith('/v1/completions') or self.path.endswith('/v1/completion'):
            if self.path.endswith('/v1/completions') or self.path.endswith('/v1/completion'):
                api_format = 3
-
+            elif self.path.endswith('/v1/chat/completions'):
            if self.path.endswith('/v1/chat/completions'):
                api_format = 4
-
+            elif self.path.endswith('/sdapi/v1/interrogate'):
            if self.path.endswith('/sdapi/v1/interrogate'):
                has_vision = (mmprojpath!="")
                if not has_vision:
                    self.send_response(503)
@ -3569,27 +3570,27 @@ Change Mode<br>
                        }}).encode())
                    return
                api_format = 5
-
+            elif self.path.endswith('/api/generate'):
            if self.path.endswith('/api/generate'):
                api_format = 6
-            if self.path.endswith('/api/chat'):
+            elif self.path.endswith('/api/chat'):
                api_format = 7
-
+            elif self.path=="/prompt" or self.path.endswith('/sdapi/v1/txt2img') or self.path.endswith('/sdapi/v1/img2img'):
            if self.path=="/prompt" or self.path.endswith('/sdapi/v1/txt2img') or self.path.endswith('/sdapi/v1/img2img'):
                is_imggen = True
                if self.path=="/prompt":
                    is_comfyui_imggen = True
-
+            elif self.path.endswith('/api/extra/transcribe') or self.path.endswith('/v1/audio/transcriptions'):
            if self.path.endswith('/api/extra/transcribe') or self.path.endswith('/v1/audio/transcriptions'):
                is_transcribe = True
-
+            elif self.path.endswith('/api/extra/tts') or self.path.endswith('/v1/audio/speech') or self.path.endswith('/tts_to_audio'):
            if self.path.endswith('/api/extra/tts') or self.path.endswith('/v1/audio/speech') or self.path.endswith('/tts_to_audio'):
                is_tts = True
-
+            elif self.path.endswith('/api/extra/embeddings') or self.path.endswith('/v1/embeddings'):
            if self.path.endswith('/api/extra/embeddings') or self.path.endswith('/v1/embeddings'):
                is_embeddings = True
-            if is_imggen or is_transcribe or is_tts or is_embeddings or api_format > 0:
+            if response_body is not None:
                self.send_response(response_code)
                self.send_header('content-length', str(len(response_body)))
                self.end_headers(content_type='application/json')
                self.wfile.write(response_body)
            elif is_imggen or is_transcribe or is_tts or is_embeddings or api_format > 0:
                global last_req_time
                last_req_time = time.time()
--- a/model_adapter.h
+++ b/model_adapter.h
@ -130,7 +130,9 @@ void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<in
 const bool useSmartContext, const bool requireFullSubset);
 size_t gpttype_calc_new_state_kv();
 size_t gpttype_calc_new_state_tokencount();
 size_t gpttype_calc_old_state_kv();
-bool gpttype_save_state_kv();
+size_t gpttype_calc_old_state_tokencount();
 size_t gpttype_save_state_kv();
 bool gpttype_load_state_kv();
-bool gpttype_clear_state_kv();
+bool gpttype_clear_state_kv(bool shrink);