Mirror of https://github.com/LostRuins/koboldcpp.git, synced 2026-05-07 00:41:50 +00:00
Fix OAI-compatible token usage and unique request IDs (#2015)
* fix: token usage fix for mistral-vibe
* fix: generate unique request IDs for OAI-compatible responses
* fix: prompt_tokens reporting KV cache size instead of actual count during streaming
* fixes for PR #2015

  For (1), this is not a good idea. If it returned 0 (e.g. during an error), this value may not be updated and will return the value of a previous or different request. It's better to return 0 in those cases.

  For (2), this is a good idea, but we don't need that level of randomness. I'll probably swap it with a 6-digit random number instead.

  For (3), the official OpenAI spec gates it behind stream_options.include_usage = true, so I'll do that too.

* missed 1 item

---------

Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
This commit is contained in:
parent 2fbc3b2ae5
commit cbecc34667

1 changed file with 27 additions and 11 deletions
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -3483,6 +3483,9 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):
     async def generate_text(self, genparams, api_format, stream_flag):
         global friendlymodelname, chatcompl_adapter, currfinishreason
         currfinishreason = None
+        req_id_suffix = genparams.get('oai_uniqueid',1)
+        chatcmpl_id = f"chatcmpl-A{req_id_suffix}"
+        cmpl_id = f"cmpl-A{req_id_suffix}"
 
         def run_blocking(): # api format 1=basic,2=kai,3=oai,4=oai-chat
             # flag instance as non-idle for a while
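Note: the `oai_uniqueid` suffix is set per request in `handle_request` (final hunk below); when it is absent, the `.get(..., 1)` default reproduces the old fixed IDs. A standalone sketch of the derivation (suffix values are illustrative, not captured output):

```python
# Standalone sketch of the ID derivation above; not the server code itself.
genparams = {}  # hypothetical request params without a unique ID set
req_id_suffix = genparams.get('oai_uniqueid', 1)
print(f"chatcmpl-A{req_id_suffix}", f"cmpl-A{req_id_suffix}")  # chatcmpl-A1 cmpl-A1 (old fixed IDs)

genparams = {'oai_uniqueid': 482913}  # example 6-digit suffix from handle_request
req_id_suffix = genparams.get('oai_uniqueid', 1)
print(f"chatcmpl-A{req_id_suffix}")  # chatcmpl-A482913
```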
@@ -3502,8 +3505,8 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):
         genout = run_blocking()
 
         recvtxt = genout['text']
-        prompttokens = genout['prompt_tokens']
-        comptokens = genout['completion_tokens']
+        prompttokens = genout['prompt_tokens'] if genout['prompt_tokens'] > 0 else 0
+        comptokens = genout['completion_tokens'] if genout['completion_tokens'] > 0 else 0
         currfinishreason = "error" if (genout['stopreason'] == -2) else ("length" if (genout['stopreason'] != 1) else "stop")
 
         # grab logprobs if not streaming
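The conditional clamp is equivalent to `max(count, 0)`: if the backend reports zero or a negative sentinel (e.g. after an error), the usage fields report 0 rather than leaking a stale value from a previous request, per the review note in the commit message. A minimal standalone illustration:

```python
# Minimal illustration of the clamping behavior; not the server code itself.
def clamp_tokens(reported: int) -> int:
    return reported if reported > 0 else 0  # same effect as max(reported, 0)

assert clamp_tokens(128) == 128  # normal case passes the real count through
assert clamp_tokens(0) == 0      # zero stays zero
assert clamp_tokens(-1) == 0     # an error sentinel never reaches the usage object
```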
@@ -3539,11 +3542,11 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):
         if api_format == 1:
             res = {"data": {"seqs": [recvtxt]}}
         elif api_format == 3:
-            res = {"id": "cmpl-A1", "object": "text_completion", "created": int(time.time()), "model": friendlymodelname,
+            res = {"id": cmpl_id, "object": "text_completion", "created": int(time.time()), "model": friendlymodelname,
             "usage": {"prompt_tokens": prompttokens, "completion_tokens": comptokens, "total_tokens": (prompttokens+comptokens)},
             "choices": [{"text": recvtxt, "index": 0, "finish_reason": currfinishreason, "logprobs":logprobsdict}]}
         elif api_format == 4:
-            res = {"id": "chatcmpl-A1", "object": "chat.completion", "created": int(time.time()), "model": friendlymodelname,
+            res = {"id": chatcmpl_id, "object": "chat.completion", "created": int(time.time()), "model": friendlymodelname,
             "usage": {"prompt_tokens": prompttokens, "completion_tokens": comptokens, "total_tokens": (prompttokens+comptokens)},
             "choices": [{"index": 0, "message": {"role": "assistant", "content": recvtxt, "tool_calls": tool_calls}, "finish_reason": currfinishreason, "logprobs":logprobsdict}]}
         elif api_format == 5:
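For reference, the shape of a non-streaming chat.completion response after this change, rebuilt as a standalone sketch (every field value here is illustrative, not captured server output):

```python
import json, time

# Illustrative non-streaming chat.completion payload: per-request ID plus
# clamped usage counts. All values are examples only.
res = {"id": "chatcmpl-A482913", "object": "chat.completion",
       "created": int(time.time()), "model": "example-model",
       "usage": {"prompt_tokens": 42, "completion_tokens": 17, "total_tokens": 59},
       "choices": [{"index": 0,
                    "message": {"role": "assistant", "content": "Hello!", "tool_calls": []},
                    "finish_reason": "stop", "logprobs": None}]}
print(json.dumps(res, indent=2))
```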
@@ -3577,6 +3580,9 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):
     async def handle_sse_stream(self, genparams, api_format):
         global friendlymodelname, currfinishreason
         using_openai_tools = genparams.get('using_openai_tools', False)
+        req_id_suffix = genparams.get('oai_uniqueid',1)
+        chatcmpl_id = f"chatcmpl-A{req_id_suffix}"
+        cmpl_id = f"cmpl-A{req_id_suffix}"
         self.send_response(200)
         self.send_header("X-Accel-Buffering", "no")
         self.send_header("cache-control", "no-cache")
@@ -3590,6 +3596,7 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):
         thinkpairs = [{"start":"<|channel|>analysis<|message|>","end":"<|start|>assistant<|channel|>final<|message|>"},
                       {"start":"<think>","end":"</think>"}]
         current_token = 0
+        prompttokens = 0
         incomplete_token_buffer = bytearray()
         async_sleep_short = 0.02
         await asyncio.sleep(0.35) #anti race condition, prevent check from overtaking generate
@@ -3601,6 +3608,7 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):
             if streamDone:
                 sr = handle.get_last_stop_reason()
                 currfinishreason = "error" if sr==-2 else ("length" if (sr!=1) else "stop")
+                prompttokens = handle.get_last_input_count()
             tokenStr = ""
             streamcount = handle.get_stream_count()
             while current_token < streamcount:
@@ -3669,10 +3677,10 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):
 
                 if need_split_final_msg: #we need to send one message without the finish reason, then send a finish reason with no msg to follow standards
                     if api_format == 4: # if oai chat, set format to expected openai streaming response
-                        event_str = json.dumps({"id":"koboldcpp","object":"chat.completion.chunk","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":None,"delta":delta}]})
+                        event_str = json.dumps({"id":chatcmpl_id,"object":"chat.completion.chunk","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":None,"delta":delta}]})
                         await self.send_oai_sse_event(event_str)
                     elif api_format == 3: # non chat completions
-                        event_str = json.dumps({"id":"koboldcpp","object":"text_completion","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":None,"text":tokenStr}]})
+                        event_str = json.dumps({"id":cmpl_id,"object":"text_completion","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":None,"text":tokenStr}]})
                         await self.send_oai_sse_event(event_str)
                     else:
                         event_str = json.dumps({"token": tokenStr, "finish_reason":None})
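On the wire, each of these chunks is framed as a server-sent event by `send_oai_sse_event`; a hedged sketch of one streamed chunk (the ID and delta are example values, and the framing is assumed to follow the standard `data: <json>` SSE convention):

```python
import json, time

# Hedged reconstruction of one streamed chunk as framed for SSE delivery.
chatcmpl_id = "chatcmpl-A482913"                 # example per-request ID
delta = {"role": "assistant", "content": "Hel"}  # example partial output
event_str = json.dumps({"id": chatcmpl_id, "object": "chat.completion.chunk",
                        "created": int(time.time()), "model": "example-model",
                        "choices": [{"index": 0, "finish_reason": None, "delta": delta}]})
print(f"data: {event_str}\n")  # standard SSE framing: "data: <json>" plus a blank line
```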
@@ -3684,17 +3692,17 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):
                     if streamDone and ("logprobs" in genparams and genparams["logprobs"]): # this is a hack that sends an extra message containing ALL the logprobs
                         lastlogprobs = handle.last_logprobs()
                         logprobsdict = parse_last_logprobs(lastlogprobs)
-                        addonstr = json.dumps({"id":"koboldcpp","object":"chat.completion.chunk","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":None,"delta":{'role':'assistant','content':''},"logprobs":logprobsdict}]})
+                        addonstr = json.dumps({"id":chatcmpl_id,"object":"chat.completion.chunk","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":None,"delta":{'role':'assistant','content':''},"logprobs":logprobsdict}]})
                         await self.send_oai_sse_event(addonstr)
-                    event_str = json.dumps({"id":"koboldcpp","object":"chat.completion.chunk","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":currfinishreason,"delta":delta}]})
+                    event_str = json.dumps({"id":chatcmpl_id,"object":"chat.completion.chunk","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":currfinishreason,"delta":delta}]})
                     await self.send_oai_sse_event(event_str)
                 elif api_format == 3: # non chat completions
                     if streamDone and ("logprobs" in genparams and genparams["logprobs"]): # this is a hack that sends an extra message containing ALL the logprobs
                         lastlogprobs = handle.last_logprobs()
                         logprobsdict = parse_last_logprobs(lastlogprobs)
-                        addonstr = json.dumps({"id":"koboldcpp","object":"text_completion","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":None,"text":"","logprobs":logprobsdict}]})
+                        addonstr = json.dumps({"id":cmpl_id,"object":"text_completion","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":None,"text":"","logprobs":logprobsdict}]})
                         await self.send_oai_sse_event(addonstr)
-                    event_str = json.dumps({"id":"koboldcpp","object":"text_completion","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":currfinishreason,"text":tokenStr}]})
+                    event_str = json.dumps({"id":cmpl_id,"object":"text_completion","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":currfinishreason,"text":tokenStr}]})
                     await self.send_oai_sse_event(event_str)
                 else:
                     event_str = json.dumps({"token": tokenStr, "finish_reason":currfinishreason})
@@ -3707,6 +3715,14 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):
 
                 if streamDone:
                     if api_format == 4 or api_format == 3: # if oai chat, send last [DONE] message consistent with openai format
+                        strop = genparams.get("stream_options",None)
+                        if (strop and strop.get("include_usage",False)): # Send a final chunk with usage info, only if requested
+                            usage_obj = {"prompt_tokens": prompttokens, "completion_tokens": current_token, "total_tokens": (prompttokens + current_token)}
+                            if api_format == 4:
+                                usage_str = json.dumps({"id":chatcmpl_id,"object":"chat.completion.chunk","created":int(time.time()),"model":friendlymodelname,"choices":[],"usage":usage_obj})
+                            else:
+                                usage_str = json.dumps({"id":cmpl_id,"object":"text_completion","created":int(time.time()),"model":friendlymodelname,"choices":[],"usage":usage_obj})
+                            await self.send_oai_sse_event(usage_str)
                         await self.send_oai_sse_event('[DONE]')
                     await asyncio.sleep(async_sleep_short)
                     break
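This matches the official OpenAI streaming behavior: usage is emitted only when the client opts in, as a final chunk with an empty `choices` array, just before `[DONE]`. A hedged client-side sketch against a local koboldcpp instance (the port and placeholder model name are assumptions; adjust for your setup):

```python
import json
import requests  # third-party: pip install requests

# Hedged client sketch: stream a chat completion and opt in to the final
# usage chunk via stream_options.include_usage.
resp = requests.post(
    "http://localhost:5001/v1/chat/completions",  # assumed default local port
    json={
        "model": "kcpp",  # placeholder; assumed to be ignored in favor of the loaded model
        "messages": [{"role": "user", "content": "Say hi"}],
        "stream": True,
        "stream_options": {"include_usage": True},
    },
    stream=True,
)
for line in resp.iter_lines():
    if not line or not line.startswith(b"data: "):
        continue
    payload = line[len(b"data: "):]
    if payload == b"[DONE]":
        break
    chunk = json.loads(payload)
    if chunk.get("usage"):  # the final chunk: empty choices, populated usage
        print(chunk["usage"])
```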
@@ -3725,7 +3741,7 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):
 
     async def handle_request(self, genparams, api_format, stream_flag):
         tasks = []
-
+        genparams["oai_uniqueid"] = random.randint(100000, 999999)
         try:
             if stream_flag:
                 tasks.append(self.handle_sse_stream(genparams, api_format))
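As discussed in the commit message, the 6-digit suffix trades the full randomness of a UUID for readability: IDs are drawn uniformly from 900,000 values, enough to tell concurrent requests apart, though not guaranteed globally unique. A trivial standalone check of the range and the collision odds:

```python
import random

# The new per-request suffix: uniform over 900000 six-digit values.
suffix = random.randint(100000, 999999)
assert 100000 <= suffix <= 999999

# Birthday bound: chance that any two of n concurrent requests share a suffix.
n = 10
p_all_distinct = 1.0
for i in range(n):
    p_all_distinct *= (900000 - i) / 900000
print(f"chatcmpl-A{suffix}", f"collision odds for {n} requests ~ {1 - p_all_distinct:.6f}")
```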