add streaming support for oai tools (+2 squashed commits)

Squashed commit:

[4d080b37] qwen2.5vl surgery script

[4bebe7e5] add streaming support for oai tools
Concedo 2025-03-31 16:10:11 +08:00
parent 091eb367fc
commit 1ebadc515e
2 changed files with 129 additions and 66 deletions
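
For context (not part of the commit itself): what this change enables is an OpenAI-compatible client requesting tool calls with stream=True and still receiving a well-formed chunked reply. A minimal client-side sketch using the openai-python v1 library, assuming KoboldCpp's default local endpoint http://localhost:5001/v1 and a hypothetical get_weather tool:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:5001/v1", api_key="x")  # assumed default KoboldCpp port

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",  # hypothetical tool, for illustration only
        "description": "Get the current weather for a city",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

stream = client.chat.completions.create(
    model="koboldcpp",
    messages=[{"role": "user", "content": "What is the weather in Paris?"}],
    tools=tools,
    stream=True,  # previously incompatible with tools; now fake-streamed by the server
)

for chunk in stream:
    delta = chunk.choices[0].delta
    if delta.tool_calls:  # arrives in a single chunk, see the handler changes below
        for call in delta.tool_calls:
            print(call.function.name, call.function.arguments)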

koboldcpp.py

@@ -2018,8 +2018,8 @@ def transform_genparams(genparams, api_format):
             #if auto mode, determine whether a tool is needed
             tools_string = json.dumps(tools_array, indent=0)
             should_use_tools = True
-            user_start = adapter_obj.get("user_start", "### Instruction:\n\n")
-            user_end = adapter_obj.get("user_end", "\n\n### Response:\n\n")
+            user_start = user_message_start
+            user_end = assistant_message_start
             if chosen_tool=="auto":
                 temp_poll = {
                     "prompt": f"{user_start}User query:\n\n{messages_string}\n\nTool Code:\n{tools_string}Determine from the provided tool code if the user query would be best answered by a listed tool (One word: yes / no):{user_end}",
@@ -2030,7 +2030,7 @@ def transform_genparams(genparams, api_format):
                     "ban_eos_token":False
                 }
                 temp_poll_result = generate(genparams=temp_poll)
-                if temp_poll_result and not "yes" in temp_poll_result['text'].lower():
+                if temp_poll_result and "yes" not in temp_poll_result['text'].lower():
                     should_use_tools = False
                 if not args.quiet:
                     print(f"\nRelevant tool is listed: {temp_poll_result['text']} ({should_use_tools})")
@@ -2301,6 +2301,10 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):
     async def handle_sse_stream(self, genparams, api_format):
         global friendlymodelname, currfinishreason
+        # if tools, do not send anything - OAI tool calls will be handled with fakestreaming!
+        using_openai_tools = genparams.get('using_openai_tools', False)
+        if api_format == 4 and using_openai_tools:
+            return
         self.send_response(200)
         self.send_header("X-Accel-Buffering", "no")
         self.send_header("cache-control", "no-cache")
@@ -2311,6 +2315,7 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):
         incomplete_token_buffer = bytearray()
         async_sleep_short = 0.02
+        await asyncio.sleep(0.35) #anti race condition, prevent check from overtaking generate
         try:
             tokenReserve = "" #keeps fully formed tokens that we cannot send out yet
             while True:
@@ -3188,6 +3193,24 @@ Enter Prompt:<br>
                    self.send_header('content-length', str(len(genresp)))
                    self.end_headers(content_type='application/json')
                    self.wfile.write(genresp)
+                elif api_format == 4 and genparams.get('using_openai_tools', False): #special case, fake streaming for openai tool calls
+                    self.send_response(200)
+                    self.send_header("X-Accel-Buffering", "no")
+                    self.send_header("cache-control", "no-cache")
+                    self.send_header("connection", "keep-alive")
+                    self.end_headers(content_type='text/event-stream')
+                    toolsdata_res = []
+                    try:
+                        toolsdata_res = gen['choices'][0]['message']['tool_calls']
+                    except Exception:
+                        toolsdata_res = []
+                    toolsdata_p1 = json.dumps({"id":"koboldcpp","object":"chat.completion.chunk","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":None,"delta":{'role':'assistant','content':None, "tool_calls":toolsdata_res}}]})
+                    toolsdata_p2 = json.dumps({"id":"koboldcpp","object":"chat.completion.chunk","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":"tool_calls","delta":{}}]})
+                    self.wfile.write(f'data: {toolsdata_p1}\n\n'.encode())
+                    self.wfile.write(f'data: {toolsdata_p2}\n\n'.encode())
+                    self.wfile.write('data: [DONE]'.encode())
+                    self.wfile.flush()
+                    self.close_connection = True
            except Exception as ex:
                utfprint(ex,1)
                print("Generate: The response could not be sent, maybe connection was terminated?")