From ccd4745e0ccced3d227cdb50ca7bc13b5a2bb542 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sun, 15 Mar 2026 18:25:37 +0800 Subject: [PATCH] ollama streaming emulation --- koboldcpp.py | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/koboldcpp.py b/koboldcpp.py index 295fd5446..635ee77b9 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -5254,7 +5254,35 @@ Change Mode
try: # Headers are already sent when streaming - if not sse_stream_flag: + if (api_format == 6 or api_format == 7) and genparams.get('stream', True): + #ollama fake streaming + self.send_response(200) + self.send_header("X-Accel-Buffering", "no") + self.send_header("cache-control", "no-cache") + self.send_header("connection", "keep-alive") + self.end_headers(content_type='text/event-stream') + if api_format == 6: + bodytxt = gendat.get("response","") # extract and erase the AI response from the sync payload. + gendat["response"] = "" + pl = {"model":friendlymodelname,"created_at":str(datetime.now(timezone.utc).isoformat()),"response":bodytxt,"done":False} + self.wfile.write(f'{json.dumps(pl)}\n'.encode()) + self.wfile.flush() + time.sleep(0.05) #short delay + self.wfile.write(f'{json.dumps(gendat)}\n'.encode()) # note: gendat already contains done=true and empty response + self.wfile.flush() + time.sleep(0.05) #short delay + else: + bodytxt = gendat.get("message",{}).get("content","") # extract and erase the AI response from the sync payload. + gendat["message"] = {"role":"assistant","content":""} + pl = {"model":friendlymodelname,"created_at":str(datetime.now(timezone.utc).isoformat()),"message":{"role":"assistant","content":bodytxt},"done":False} + self.wfile.write(f'{json.dumps(pl)}\n'.encode()) + self.wfile.flush() + time.sleep(0.05) #short delay + self.wfile.write(f'{json.dumps(gendat)}\n'.encode()) # note: gendat already contains done=true and empty response + self.wfile.flush() + time.sleep(0.05) #short delay + self.close_connection = True + elif not sse_stream_flag: self.send_response(200) genresp = (json.dumps(gendat).encode()) self.send_header('content-length', str(len(genresp)))