From cbecc34667661db53113cda5452813e2af9545a2 Mon Sep 17 00:00:00 2001
From: Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
Date: Fri, 6 Mar 2026 09:57:22 -0300
Subject: [PATCH] Fix OAI-compatible token usage and unique request IDs (#2015)

* fix: token usage fix for mistral-vibe

* fix: generate unique request IDs for OAI-compatible responses

* fix: prompt_tokens reporting KV cache size instead of actual count during streaming

* fixes for PR #2015

For (1), this is not a good idea. If it returned 0 (e.g. during an error), this value may not have been updated and would return the value of a previous or different request. It's better to return 0 in those cases.

For (2), this is a good idea, but we don't need that level of randomness. I'll probably swap it for a 6-digit random number instead.
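In sketch form, the simpler scheme looks like this (variable names mirror what the diff below ends up doing; where exactly the suffix is generated is an implementation detail):

    import random

    # One 6-digit suffix per request: IDs only need to differ between
    # recent requests, not be globally or cryptographically unique.
    req_id_suffix = random.randint(100000, 999999)
    chatcmpl_id = f"chatcmpl-A{req_id_suffix}"
    cmpl_id = f"cmpl-A{req_id_suffix}"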
For (3), the official OpenAI spec gates it behind stream_options.include_usage = true, so I'll do that too.
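Roughly, the gate looks like this (matching the streaming handler in the diff below):

    # Only send the final usage chunk when the client opted in via
    # stream_options.include_usage, per the OpenAI streaming spec.
    strop = genparams.get("stream_options", None)
    if strop and strop.get("include_usage", False):
        usage_obj = {"prompt_tokens": prompttokens,
                     "completion_tokens": current_token,
                     "total_tokens": prompttokens + current_token}
        # ...emit one extra chunk carrying "usage" with an empty
        # "choices" list, then send [DONE] as usual.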
json.dumps({"id":chatcmpl_id,"object":"chat.completion.chunk","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":currfinishreason,"delta":delta}]}) await self.send_oai_sse_event(event_str) elif api_format == 3: # non chat completions if streamDone and ("logprobs" in genparams and genparams["logprobs"]): # this is a hack that sends an extra message containing ALL the logprobs lastlogprobs = handle.last_logprobs() logprobsdict = parse_last_logprobs(lastlogprobs) - addonstr = json.dumps({"id":"koboldcpp","object":"text_completion","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":None,"text":"","logprobs":logprobsdict}]}) + addonstr = json.dumps({"id":cmpl_id,"object":"text_completion","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":None,"text":"","logprobs":logprobsdict}]}) await self.send_oai_sse_event(addonstr) - event_str = json.dumps({"id":"koboldcpp","object":"text_completion","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":currfinishreason,"text":tokenStr}]}) + event_str = json.dumps({"id":cmpl_id,"object":"text_completion","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":currfinishreason,"text":tokenStr}]}) await self.send_oai_sse_event(event_str) else: event_str = json.dumps({"token": tokenStr, "finish_reason":currfinishreason}) @@ -3707,6 +3715,14 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler): if streamDone: if api_format == 4 or api_format == 3: # if oai chat, send last [DONE] message consistent with openai format + strop = genparams.get("stream_options",None) + if (strop and strop.get("include_usage",False)): # Send a final chunk with usage info, only if requested + usage_obj = {"prompt_tokens": prompttokens, "completion_tokens": current_token, "total_tokens": (prompttokens + current_token)} + if api_format == 4: + usage_str = json.dumps({"id":chatcmpl_id,"object":"chat.completion.chunk","created":int(time.time()),"model":friendlymodelname,"choices":[],"usage":usage_obj}) + else: + usage_str = json.dumps({"id":cmpl_id,"object":"text_completion","created":int(time.time()),"model":friendlymodelname,"choices":[],"usage":usage_obj}) + await self.send_oai_sse_event(usage_str) await self.send_oai_sse_event('[DONE]') await asyncio.sleep(async_sleep_short) break @@ -3725,7 +3741,7 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler): async def handle_request(self, genparams, api_format, stream_flag): tasks = [] - + genparams["oai_uniqueid"] = random.randint(100000, 999999) try: if stream_flag: tasks.append(self.handle_sse_stream(genparams, api_format))