almost working

2026-05-19 08:00:25 +00:00 · 2026-04-12 01:44:41 +08:00 · 2026-04-12 01:44:41 +08:00 · c4abba8868
commit c4abba8868
parent d216aabfdc
2 changed files with 133 additions and 55 deletions
--- a/embd_res/klite.embd
+++ b/embd_res/klite.embd
@ -12,7 +12,7 @@ Current version indicated by LITEVER below.
 -->
 <head>
 	<script id="init-config">
-	const LITEVER = 327;
+	const LITEVER = 328;
 	const urlParams = new URLSearchParams(window.location.search);
 	var localflag = urlParams.get('local'); //this will be replaced automatically in embedded kcpp
 	const STORAGE_PREFIX = (localflag?"e_":"")+"kaihordewebui_";
@ -7464,6 +7464,11 @@ Current version indicated by LITEVER below.
 					}
 				},
 				close() { //end of stream
+					if(streaming_was_thinking && synchro_pending_stream && (last_stop_reason=="stop" || last_stop_reason=="tool_calls")) //wrap up if we stopped properly
+					{
+						synchro_pending_stream = `${localsettings.start_thinking_tag}${synchro_pending_stream}${localsettings.stop_thinking_tag}`;
+					}
+					streaming_was_thinking = false;
 					synchro_polled_response = synchro_pending_stream;
 					let need_clean_output = (synchro_polled_response!="" && localsettings.opmode==1 && gametext_arr.length>0 && document.getElementById("useoaichatcompl").checked);
 					if(need_clean_output)
@ -7497,7 +7502,6 @@ Current version indicated by LITEVER below.
 						}
 					}
 					synchro_pending_stream = "";
-					streaming_was_thinking = false;
 					if (logprobs_content_arr.length>0 && last_response_obj==null) {
 						//fake a last response obj
 						let fakedresponse = {
@ -18022,7 +18026,7 @@ Current version indicated by LITEVER below.

 	function end_trim_to_sentence(input, include_newline = false) {
 		let last = -1;
-		let enders = ['.', '!', '?', '*', '"', ')', '}', '`', ']', ';', '…', '~'];
+		let enders = ['.', '!', '?', '*', '"', ')', '}', '`', ']', ';', '…', '~','>'];
 		for (let i = 0; i < enders.length; ++i) {
 			last = Math.max(last, input.lastIndexOf(enders[i]));
 		}
@ -22537,7 +22541,7 @@ Current version indicated by LITEVER below.

 		//allow trim incomplete sentences
 		//do not trim if instruct mode AND stop token reached
-		let donottrim = (last_stop_reason=="stop");
+		let donottrim = (last_stop_reason=="stop"||last_stop_reason=="tool_calls");
 		if (!donottrim && localsettings.trimsentences == true) {
 			//also, to prevent a trim from bisecting a chat name, if a response contains a chatname, do not trim
 			donottrim = false;
--- a/koboldcpp.py
+++ b/koboldcpp.py
@ -169,6 +169,14 @@ zenity_permitted = True
 thinkformats = [{"start":"<|channel|>analysis<|message|>","end":"<|start|>assistant<|channel|>final<|message|>"},
                {"start":"<think>","end":"</think>"},
                {"start":"<|channel>thought","end":"<channel|>"}]
+tool_call_pairs = [
+    ("<tool_call>", "</tool_call>"),
+    ("<seed:tool_call>", "</seed:tool_call>"),
+    ("<|tool_call_begin|>", "<|tool_call_end|>"),
+    ("<｜tool▁call▁begin｜>", "<｜tool▁call▁end｜>"),
+    ("<minimax:tool_call>", "</minimax:tool_call>"),
+    ("<|tool_call>call:", "<tool_call|>"),
+]

 saved_stdout = None
 saved_stderr = None
@ -3267,7 +3275,7 @@ def toolcall_to_normalized_json(text,start_tag,end_tag): #convert weird formats
    return text #fallback

 def repack_toolcall_tags(text: str, original_tools:list):
-    global thinkformats
+    global thinkformats, tool_call_pairs
    tool_calls = []
    if not text:
        return tool_calls
@ -3275,16 +3283,8 @@ def repack_toolcall_tags(text: str, original_tools:list):
        pattern = f"{re.escape(fmt['start'])}.*?{re.escape(fmt['end'])}"
        text = re.sub(pattern, '', text, flags=re.DOTALL)
    text = text.strip()
-    tcpairs = [
-        ("<tool_call>", "</tool_call>"),
-        ("<seed:tool_call>", "</seed:tool_call>"),
-        ("<|tool_call_begin|>", "<|tool_call_end|>"),
-        ("<｜tool▁call▁begin｜>", "<｜tool▁call▁end｜>"),
-        ("<minimax:tool_call>", "</minimax:tool_call>"),
-        ("<|tool_call>call:", "<tool_call|>"),
-    ]
    found = False
-    for start, end in tcpairs:
+    for start, end in tool_call_pairs:
        pattern = re.escape(start) + r"(.*?)" + re.escape(end)
        matches = re.findall(pattern, text, flags=re.DOTALL)
        if matches:
@ -4721,7 +4721,7 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):
        self.wfile.flush()

    async def handle_sse_stream(self, genparams, api_format):
-        global friendlymodelname, currfinishreason, thinkformats
+        global friendlymodelname, currfinishreason, thinkformats, tool_call_pairs, cached_chat_template
        global autoswapmode, textName, sttName, ttsName, embedName, musicName, imageName, mmprojName

        modelNameToReturn = friendlymodelname
@ -4737,8 +4737,19 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):
        self.send_header("cache-control", "no-cache")
        self.send_header("connection", "keep-alive")
        self.end_headers(content_type='text/event-stream')
-        if api_format == 4 and using_openai_tools: # if tools, do not send anything else - OAI tool calls will be handled with fakestreaming!
-            return
+
+        # if tools, do not send anything else - OAI tool calls will be handled with fakestreaming!
+        # only exception is if we know the exact toolcall tag to segment!
+        tool_segment_tag = ""
+        for start, end in tool_call_pairs:
+            if cached_chat_template and start in cached_chat_template:
+                tool_segment_tag = start
+                break
+        jinjatools = (args.jinja and args.jinja_tools)
+        if api_format == 4 and using_openai_tools:
+            if not jinjatools or not tool_segment_tag:
+                genparams['sync_toolcall_stream_ineligible'] = True
+                return

        think_tag_buf = ""
        encap_in_thinking = False
@ -4814,7 +4825,19 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):
                                    if sindex != -1 and trim_str!="":
                                        tokenStr = tokenStr[:sindex]

+                        sync_potential_toolcall_splitmatch = ""
                        if tokenStr!="" or streamDone:
+                            # Tool boundary detection for tool-capable chat completions.
+                            # if triggered, stop real streaming, and let the buffered fakestreaming take over
+                            if api_format == 4 and using_openai_tools:
+                                tokenStr = tokenReserve + tokenStr
+                                tokenReserve = ""
+                                splitter = tool_segment_tag
+                                if splitter in tokenStr:
+                                    if not genparams.get("sync_toolcall_potential_triggered",False):
+                                        sync_potential_toolcall_splitmatch = splitter
+                                        genparams['sync_toolcall_potential_triggered'] = True #if tool calls is triggered, rest will be sync fake streaming. we'll buffer it for later
+
                            need_split_final_msg = True if (currfinishreason is not None and streamDone and tokenStr!="") else False

                            # Hack for lcppui reasoning_content for thinking models
@ -4874,6 +4897,28 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):
                            else:
                                delta['content'] = tokenStr

+                            if genparams.get("sync_toolcall_potential_triggered",False) and delta: # if sync_toolcall_potential_triggered, buffer up the impending content chunk for tools in fakestreaming, in case toolcalls fail
+                                ec = genparams.get("sync_toolcall_extra_content","")
+                                erc = genparams.get("sync_toolcall_extra_reasoning_content","")
+                                ec += delta.get("content","")
+                                erc += delta.get("reasoning_content","")
+                                if erc and sync_potential_toolcall_splitmatch and sync_potential_toolcall_splitmatch in erc:
+                                    parts = erc.split(sync_potential_toolcall_splitmatch,1)
+                                    erc = sync_potential_toolcall_splitmatch + parts[1]
+                                    delta["reasoning_content"] = parts[0]
+                                elif ec and sync_potential_toolcall_splitmatch and sync_potential_toolcall_splitmatch in ec:
+                                    parts = ec.split(sync_potential_toolcall_splitmatch,1)
+                                    ec = sync_potential_toolcall_splitmatch + parts[1]
+                                    delta["content"] = parts[0]
+                                genparams['sync_toolcall_extra_content'] = ec
+                                genparams['sync_toolcall_extra_reasoning_content'] = erc
+                                if not sync_potential_toolcall_splitmatch:
+                                    if not streamDone:
+                                        await asyncio.sleep(async_sleep_short)
+                                        continue
+                                    await asyncio.sleep(async_sleep_short)
+                                    return
+
                            if need_split_final_msg: #we need to send one message without the finish reason, then send a finish reason with no msg to follow standards
                                if api_format == 4:  # if oai chat, set format to expected openai streaming response
                                    event_str = json.dumps({"id":chatcmpl_id,"object":"chat.completion.chunk","created":int(time.time()),"model":modelNameToReturn,"choices":[{"index":0,"finish_reason":None,"delta":delta}]})
@ -4894,6 +4939,7 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):
                                    addonstr = json.dumps({"id":chatcmpl_id,"object":"chat.completion.chunk","created":int(time.time()),"model":modelNameToReturn,"choices":[{"index":0,"finish_reason":None,"delta":{'role':'assistant','content':''},"logprobs":logprobsdict}]})
                                    await self.send_oai_sse_event(addonstr)
                                event_str = json.dumps({"id":chatcmpl_id,"object":"chat.completion.chunk","created":int(time.time()),"model":modelNameToReturn,"choices":[{"index":0,"finish_reason":currfinishreason,"delta":delta}]})
+                                genparams['sync_toolcall_first_role_sent'] = True
                                await self.send_oai_sse_event(event_str)
                            elif api_format == 3:  # non chat completions
                                if streamDone and ("logprobs" in genparams and genparams["logprobs"]): # this is a hack that sends an extra message containing ALL the logprobs
@ -6324,7 +6370,9 @@ Change Mode<br>
                            self.end_headers(content_type='application/json')
                            self.wfile.write(genresp)
                        elif api_format == 4 and genparams.get('using_openai_tools', False): #special case, fake streaming for openai tool calls
-                            content_text = None
+                            # we only send content_text and reasoning_text if tools aren't used. they contain the balance of the output after sync_toolcall_potential_triggered was triggered
+                            content_text = genparams.get('sync_toolcall_extra_content', "") #populated by the sse call, we don't use gendat['choices'][0]['message'].get('content', None)
+                            reasoning_text = genparams.get('sync_toolcall_extra_reasoning_content', "")
                            toolsdata_res = []
                            try:
                                toolsdata_res = gendat['choices'][0]['message']['tool_calls']
@ -6332,53 +6380,57 @@ Change Mode<br>
                                    toolsdata_res[0]["index"] = 0 # need to add an index for OWUI
                            except Exception:
                                toolsdata_res = []
-                            try:
-                                content_text = gendat['choices'][0]['message'].get('content', None)
-                            except Exception:
-                                content_text = None

-                           # Send role chunk first
-                            chunk_role = json.dumps({
-                                "id": "koboldcpp",
-                                "object": "chat.completion.chunk",
-                                "created": int(time.time()),
-                                "model": modelNameToReturn,
-                                "choices": [{"index": 0, "finish_reason": None, "delta": {"role": "assistant"}}]
-                            })
-                            self.wfile.write(f"data: {chunk_role}\n\n".encode())
-                            self.wfile.flush()
+                           # Send role chunk first, unless the SSE stream already sent one
+                            if not genparams.get('sync_toolcall_first_role_sent', False):
+                                genparams['sync_toolcall_first_role_sent'] = True
+                                chunk_role = json.dumps({
+                                    "id": "koboldcpp",
+                                    "object": "chat.completion.chunk",
+                                    "created": int(time.time()),
+                                    "model": modelNameToReturn,
+                                    "choices": [{"index": 0, "finish_reason": None, "delta": {"role": "assistant"}}]
+                                })
+                                self.wfile.write(f"data: {chunk_role}\n\n".encode())
+                                self.wfile.flush()

-                            # Send content if present
-                            if content_text:
-                                reasoning_txt = ""
-                                thinkstrips = [item["start"] for item in thinkformats] #start thinking tags
-                                thinksplitters = [item["end"] for item in thinkformats] #end thinking tags
-                                for tsp in thinksplitters:
-                                    if tsp in content_text:
-                                        parts = content_text.split(tsp, 1)
-                                        reasoning_txt = parts[0]
-                                        content_text = parts[1]
-                                        for ts in thinkstrips:
-                                            reasoning_txt = reasoning_txt.replace(ts, "")
-                                if reasoning_txt:
+                            # if no valid tool splitter, we have to do 100% synchronous
+                            if not content_text and not reasoning_text and genparams.get('sync_toolcall_stream_ineligible', False):
+                                temp_content = ""
+                                try:
+                                    temp_content = gendat['choices'][0]['message'].get('content', None)
+                                except Exception:
+                                    temp_content = None
+                                if temp_content:
+                                    temp_reasoning = ""
+                                    thinkstrips = [item["start"] for item in thinkformats] #start thinking tags
+                                    thinksplitters = [item["end"] for item in thinkformats] #end thinking tags
+                                    for tsp in thinksplitters:
+                                        if tsp in temp_content:
+                                            parts = temp_content.split(tsp, 1)
+                                            temp_reasoning = parts[0]
+                                            temp_content = parts[1]
+                                            for ts in thinkstrips:
+                                                temp_reasoning = temp_reasoning.replace(ts, "")
+                                    if temp_reasoning:
+                                        chunk_content = json.dumps({
+                                            "id": "koboldcpp",
+                                            "object": "chat.completion.chunk",
+                                            "created": int(time.time()),
+                                            "model": modelNameToReturn,
+                                            "choices": [{"index": 0, "finish_reason": None, "delta": {"reasoning_content": temp_reasoning}}]
+                                        })
+                                        self.wfile.write(f"data: {chunk_content}\n\n".encode())
+                                        self.wfile.flush()
                                    chunk_content = json.dumps({
                                        "id": "koboldcpp",
                                        "object": "chat.completion.chunk",
                                        "created": int(time.time()),
                                        "model": modelNameToReturn,
-                                        "choices": [{"index": 0, "finish_reason": None, "delta": {"reasoning_content": reasoning_txt}}]
+                                        "choices": [{"index": 0, "finish_reason": None, "delta": {"content": temp_content}}]
                                    })
                                    self.wfile.write(f"data: {chunk_content}\n\n".encode())
                                    self.wfile.flush()
-                                chunk_content = json.dumps({
-                                    "id": "koboldcpp",
-                                    "object": "chat.completion.chunk",
-                                    "created": int(time.time()),
-                                    "model": modelNameToReturn,
-                                    "choices": [{"index": 0, "finish_reason": None, "delta": {"content": content_text}}]
-                                })
-                                self.wfile.write(f"data: {chunk_content}\n\n".encode())
-                                self.wfile.flush()

                            # Send tool calls incrementally in OpenAI format
                            if toolsdata_res and len(toolsdata_res) > 0:
@ -6418,6 +6470,28 @@ Change Mode<br>
                                    })
                                    self.wfile.write(f"data: {chunk_args}\n\n".encode())
                                    self.wfile.flush()
+                            else:
+                                # Send remaining buffered content if no tool calls were made
+                                if reasoning_text:
+                                    chunk_content = json.dumps({
+                                        "id": "koboldcpp",
+                                        "object": "chat.completion.chunk",
+                                        "created": int(time.time()),
+                                        "model": modelNameToReturn,
+                                        "choices": [{"index": 0, "finish_reason": None, "delta": {"reasoning_content": reasoning_text}}]
+                                    })
+                                    self.wfile.write(f"data: {chunk_content}\n\n".encode())
+                                    self.wfile.flush()
+                                if content_text:
+                                    chunk_content = json.dumps({
+                                        "id": "koboldcpp",
+                                        "object": "chat.completion.chunk",
+                                        "created": int(time.time()),
+                                        "model": modelNameToReturn,
+                                        "choices": [{"index": 0, "finish_reason": None, "delta": {"content": content_text}}]
+                                    })
+                                    self.wfile.write(f"data: {chunk_content}\n\n".encode())
+                                    self.wfile.flush()

                            # Final chunk
                            chunk_final = json.dumps({