mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-19 08:00:25 +00:00
almost working
This commit is contained in:
parent
d216aabfdc
commit
c4abba8868
2 changed files with 133 additions and 55 deletions
|
|
@ -12,7 +12,7 @@ Current version indicated by LITEVER below.
|
|||
-->
|
||||
<head>
|
||||
<script id="init-config">
|
||||
const LITEVER = 327;
|
||||
const LITEVER = 328;
|
||||
const urlParams = new URLSearchParams(window.location.search);
|
||||
var localflag = urlParams.get('local'); //this will be replaced automatically in embedded kcpp
|
||||
const STORAGE_PREFIX = (localflag?"e_":"")+"kaihordewebui_";
|
||||
|
|
@ -7464,6 +7464,11 @@ Current version indicated by LITEVER below.
|
|||
}
|
||||
},
|
||||
close() { //end of stream
|
||||
if(streaming_was_thinking && synchro_pending_stream && (last_stop_reason=="stop" || last_stop_reason=="tool_calls")) //wrap up if we stopped properly
|
||||
{
|
||||
synchro_pending_stream = `${localsettings.start_thinking_tag}${synchro_pending_stream}${localsettings.stop_thinking_tag}`;
|
||||
}
|
||||
streaming_was_thinking = false;
|
||||
synchro_polled_response = synchro_pending_stream;
|
||||
let need_clean_output = (synchro_polled_response!="" && localsettings.opmode==1 && gametext_arr.length>0 && document.getElementById("useoaichatcompl").checked);
|
||||
if(need_clean_output)
|
||||
|
|
@ -7497,7 +7502,6 @@ Current version indicated by LITEVER below.
|
|||
}
|
||||
}
|
||||
synchro_pending_stream = "";
|
||||
streaming_was_thinking = false;
|
||||
if (logprobs_content_arr.length>0 && last_response_obj==null) {
|
||||
//fake a last response obj
|
||||
let fakedresponse = {
|
||||
|
|
@ -18022,7 +18026,7 @@ Current version indicated by LITEVER below.
|
|||
|
||||
function end_trim_to_sentence(input, include_newline = false) {
|
||||
let last = -1;
|
||||
let enders = ['.', '!', '?', '*', '"', ')', '}', '`', ']', ';', '…', '~'];
|
||||
let enders = ['.', '!', '?', '*', '"', ')', '}', '`', ']', ';', '…', '~','>'];
|
||||
for (let i = 0; i < enders.length; ++i) {
|
||||
last = Math.max(last, input.lastIndexOf(enders[i]));
|
||||
}
|
||||
|
|
@ -22537,7 +22541,7 @@ Current version indicated by LITEVER below.
|
|||
|
||||
//allow trim incomplete sentences
|
||||
//do not trim if instruct mode AND stop token reached
|
||||
let donottrim = (last_stop_reason=="stop");
|
||||
let donottrim = (last_stop_reason=="stop"||last_stop_reason=="tool_calls");
|
||||
if (!donottrim && localsettings.trimsentences == true) {
|
||||
//also, to prevent a trim from bisecting a chat name, if a response contains a chatname, do not trim
|
||||
donottrim = false;
|
||||
|
|
|
|||
176
koboldcpp.py
176
koboldcpp.py
|
|
@ -169,6 +169,14 @@ zenity_permitted = True
|
|||
thinkformats = [{"start":"<|channel|>analysis<|message|>","end":"<|start|>assistant<|channel|>final<|message|>"},
|
||||
{"start":"<think>","end":"</think>"},
|
||||
{"start":"<|channel>thought","end":"<channel|>"}]
|
||||
tool_call_pairs = [
|
||||
("<tool_call>", "</tool_call>"),
|
||||
("<seed:tool_call>", "</seed:tool_call>"),
|
||||
("<|tool_call_begin|>", "<|tool_call_end|>"),
|
||||
("<|tool▁call▁begin|>", "<|tool▁call▁end|>"),
|
||||
("<minimax:tool_call>", "</minimax:tool_call>"),
|
||||
("<|tool_call>call:", "<tool_call|>"),
|
||||
]
|
||||
|
||||
saved_stdout = None
|
||||
saved_stderr = None
|
||||
|
|
@ -3267,7 +3275,7 @@ def toolcall_to_normalized_json(text,start_tag,end_tag): #convert weird formats
|
|||
return text #fallback
|
||||
|
||||
def repack_toolcall_tags(text: str, original_tools:list):
|
||||
global thinkformats
|
||||
global thinkformats, tool_call_pairs
|
||||
tool_calls = []
|
||||
if not text:
|
||||
return tool_calls
|
||||
|
|
@ -3275,16 +3283,8 @@ def repack_toolcall_tags(text: str, original_tools:list):
|
|||
pattern = f"{re.escape(fmt['start'])}.*?{re.escape(fmt['end'])}"
|
||||
text = re.sub(pattern, '', text, flags=re.DOTALL)
|
||||
text = text.strip()
|
||||
tcpairs = [
|
||||
("<tool_call>", "</tool_call>"),
|
||||
("<seed:tool_call>", "</seed:tool_call>"),
|
||||
("<|tool_call_begin|>", "<|tool_call_end|>"),
|
||||
("<|tool▁call▁begin|>", "<|tool▁call▁end|>"),
|
||||
("<minimax:tool_call>", "</minimax:tool_call>"),
|
||||
("<|tool_call>call:", "<tool_call|>"),
|
||||
]
|
||||
found = False
|
||||
for start, end in tcpairs:
|
||||
for start, end in tool_call_pairs:
|
||||
pattern = re.escape(start) + r"(.*?)" + re.escape(end)
|
||||
matches = re.findall(pattern, text, flags=re.DOTALL)
|
||||
if matches:
|
||||
|
|
@ -4721,7 +4721,7 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
|||
self.wfile.flush()
|
||||
|
||||
async def handle_sse_stream(self, genparams, api_format):
|
||||
global friendlymodelname, currfinishreason, thinkformats
|
||||
global friendlymodelname, currfinishreason, thinkformats, tool_call_pairs, cached_chat_template
|
||||
global autoswapmode, textName, sttName, ttsName, embedName, musicName, imageName, mmprojName
|
||||
|
||||
modelNameToReturn = friendlymodelname
|
||||
|
|
@ -4737,8 +4737,19 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
|||
self.send_header("cache-control", "no-cache")
|
||||
self.send_header("connection", "keep-alive")
|
||||
self.end_headers(content_type='text/event-stream')
|
||||
if api_format == 4 and using_openai_tools: # if tools, do not send anything else - OAI tool calls will be handled with fakestreaming!
|
||||
return
|
||||
|
||||
# if tools, do not send anything else - OAI tool calls will be handled with fakestreaming!
|
||||
# only exception is if we know the exact toolcall tag to segment!
|
||||
tool_segment_tag = ""
|
||||
for start, end in tool_call_pairs:
|
||||
if cached_chat_template and start in cached_chat_template:
|
||||
tool_segment_tag = start
|
||||
break
|
||||
jinjatools = (args.jinja and args.jinja_tools)
|
||||
if api_format == 4 and using_openai_tools:
|
||||
if not jinjatools or not tool_segment_tag:
|
||||
genparams['sync_toolcall_stream_ineligible'] = True
|
||||
return
|
||||
|
||||
think_tag_buf = ""
|
||||
encap_in_thinking = False
|
||||
|
|
@ -4814,7 +4825,19 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
|||
if sindex != -1 and trim_str!="":
|
||||
tokenStr = tokenStr[:sindex]
|
||||
|
||||
sync_potential_toolcall_splitmatch = ""
|
||||
if tokenStr!="" or streamDone:
|
||||
# Tool boundary detection for tool-capable chat completions.
|
||||
# if triggered, stop real streaming, and let the buffered fakestreaming take over
|
||||
if api_format == 4 and using_openai_tools:
|
||||
tokenStr = tokenReserve + tokenStr
|
||||
tokenReserve = ""
|
||||
splitter = tool_segment_tag
|
||||
if splitter in tokenStr:
|
||||
if not genparams.get("sync_toolcall_potential_triggered",False):
|
||||
sync_potential_toolcall_splitmatch = splitter
|
||||
genparams['sync_toolcall_potential_triggered'] = True #if tool calls is triggered, rest will be sync fake streaming. we'll buffer it for later
|
||||
|
||||
need_split_final_msg = True if (currfinishreason is not None and streamDone and tokenStr!="") else False
|
||||
|
||||
# Hack for lcppui reasoning_content for thinking models
|
||||
|
|
@ -4874,6 +4897,28 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
|||
else:
|
||||
delta['content'] = tokenStr
|
||||
|
||||
if genparams.get("sync_toolcall_potential_triggered",False) and delta: # if sync_toolcall_potential_triggered, buffer up the impending content chunk for tools in fakestreaming, in case toolcalls fail
|
||||
ec = genparams.get("sync_toolcall_extra_content","")
|
||||
erc = genparams.get("sync_toolcall_extra_reasoning_content","")
|
||||
ec += delta.get("content","")
|
||||
erc += delta.get("reasoning_content","")
|
||||
if erc and sync_potential_toolcall_splitmatch and sync_potential_toolcall_splitmatch in erc:
|
||||
parts = erc.split(sync_potential_toolcall_splitmatch,1)
|
||||
erc = sync_potential_toolcall_splitmatch + parts[1]
|
||||
delta["reasoning_content"] = parts[0]
|
||||
elif ec and sync_potential_toolcall_splitmatch and sync_potential_toolcall_splitmatch in ec:
|
||||
parts = ec.split(sync_potential_toolcall_splitmatch,1)
|
||||
ec = sync_potential_toolcall_splitmatch + parts[1]
|
||||
delta["content"] = parts[0]
|
||||
genparams['sync_toolcall_extra_content'] = ec
|
||||
genparams['sync_toolcall_extra_reasoning_content'] = erc
|
||||
if not sync_potential_toolcall_splitmatch:
|
||||
if not streamDone:
|
||||
await asyncio.sleep(async_sleep_short)
|
||||
continue
|
||||
await asyncio.sleep(async_sleep_short)
|
||||
return
|
||||
|
||||
if need_split_final_msg: #we need to send one message without the finish reason, then send a finish reason with no msg to follow standards
|
||||
if api_format == 4: # if oai chat, set format to expected openai streaming response
|
||||
event_str = json.dumps({"id":chatcmpl_id,"object":"chat.completion.chunk","created":int(time.time()),"model":modelNameToReturn,"choices":[{"index":0,"finish_reason":None,"delta":delta}]})
|
||||
|
|
@ -4894,6 +4939,7 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
|||
addonstr = json.dumps({"id":chatcmpl_id,"object":"chat.completion.chunk","created":int(time.time()),"model":modelNameToReturn,"choices":[{"index":0,"finish_reason":None,"delta":{'role':'assistant','content':''},"logprobs":logprobsdict}]})
|
||||
await self.send_oai_sse_event(addonstr)
|
||||
event_str = json.dumps({"id":chatcmpl_id,"object":"chat.completion.chunk","created":int(time.time()),"model":modelNameToReturn,"choices":[{"index":0,"finish_reason":currfinishreason,"delta":delta}]})
|
||||
genparams['sync_toolcall_first_role_sent'] = True
|
||||
await self.send_oai_sse_event(event_str)
|
||||
elif api_format == 3: # non chat completions
|
||||
if streamDone and ("logprobs" in genparams and genparams["logprobs"]): # this is a hack that sends an extra message containing ALL the logprobs
|
||||
|
|
@ -6324,7 +6370,9 @@ Change Mode<br>
|
|||
self.end_headers(content_type='application/json')
|
||||
self.wfile.write(genresp)
|
||||
elif api_format == 4 and genparams.get('using_openai_tools', False): #special case, fake streaming for openai tool calls
|
||||
content_text = None
|
||||
# we only send content_text and reasoning_text if tools aren't used. they contain the balance of the output after sync_toolcall_potential_triggered was triggered
|
||||
content_text = genparams.get('sync_toolcall_extra_content', "") #populated by the sse call, we don't use gendat['choices'][0]['message'].get('content', None)
|
||||
reasoning_text = genparams.get('sync_toolcall_extra_reasoning_content', "")
|
||||
toolsdata_res = []
|
||||
try:
|
||||
toolsdata_res = gendat['choices'][0]['message']['tool_calls']
|
||||
|
|
@ -6332,53 +6380,57 @@ Change Mode<br>
|
|||
toolsdata_res[0]["index"] = 0 # need to add an index for OWUI
|
||||
except Exception:
|
||||
toolsdata_res = []
|
||||
try:
|
||||
content_text = gendat['choices'][0]['message'].get('content', None)
|
||||
except Exception:
|
||||
content_text = None
|
||||
|
||||
# Send role chunk first
|
||||
chunk_role = json.dumps({
|
||||
"id": "koboldcpp",
|
||||
"object": "chat.completion.chunk",
|
||||
"created": int(time.time()),
|
||||
"model": modelNameToReturn,
|
||||
"choices": [{"index": 0, "finish_reason": None, "delta": {"role": "assistant"}}]
|
||||
})
|
||||
self.wfile.write(f"data: {chunk_role}\n\n".encode())
|
||||
self.wfile.flush()
|
||||
# Send role chunk first, if needed
|
||||
if genparams.get('sync_toolcall_first_role_sent', False):
|
||||
genparams['sync_toolcall_first_role_sent'] = True
|
||||
chunk_role = json.dumps({
|
||||
"id": "koboldcpp",
|
||||
"object": "chat.completion.chunk",
|
||||
"created": int(time.time()),
|
||||
"model": modelNameToReturn,
|
||||
"choices": [{"index": 0, "finish_reason": None, "delta": {"role": "assistant"}}]
|
||||
})
|
||||
self.wfile.write(f"data: {chunk_role}\n\n".encode())
|
||||
self.wfile.flush()
|
||||
|
||||
# Send content if present
|
||||
if content_text:
|
||||
reasoning_txt = ""
|
||||
thinkstrips = [item["start"] for item in thinkformats] #start thinking tags
|
||||
thinksplitters = [item["end"] for item in thinkformats] #end thinking tags
|
||||
for tsp in thinksplitters:
|
||||
if tsp in content_text:
|
||||
parts = content_text.split(tsp, 1)
|
||||
reasoning_txt = parts[0]
|
||||
content_text = parts[1]
|
||||
for ts in thinkstrips:
|
||||
reasoning_txt = reasoning_txt.replace(ts, "")
|
||||
if reasoning_txt:
|
||||
# if no valid tool splitter, we have to do 100% synchronous
|
||||
if not content_text and not reasoning_text and genparams.get('sync_toolcall_stream_ineligible', False):
|
||||
temp_content = ""
|
||||
try:
|
||||
temp_content = gendat['choices'][0]['message'].get('content', None)
|
||||
except Exception:
|
||||
temp_content = None
|
||||
if temp_content:
|
||||
temp_reasoning = ""
|
||||
thinkstrips = [item["start"] for item in thinkformats] #start thinking tags
|
||||
thinksplitters = [item["end"] for item in thinkformats] #end thinking tags
|
||||
for tsp in thinksplitters:
|
||||
if tsp in temp_content:
|
||||
parts = temp_content.split(tsp, 1)
|
||||
temp_reasoning = parts[0]
|
||||
temp_content = parts[1]
|
||||
for ts in thinkstrips:
|
||||
temp_reasoning = temp_reasoning.replace(ts, "")
|
||||
if temp_reasoning:
|
||||
chunk_content = json.dumps({
|
||||
"id": "koboldcpp",
|
||||
"object": "chat.completion.chunk",
|
||||
"created": int(time.time()),
|
||||
"model": modelNameToReturn,
|
||||
"choices": [{"index": 0, "finish_reason": None, "delta": {"reasoning_content": temp_reasoning}}]
|
||||
})
|
||||
self.wfile.write(f"data: {chunk_content}\n\n".encode())
|
||||
self.wfile.flush()
|
||||
chunk_content = json.dumps({
|
||||
"id": "koboldcpp",
|
||||
"object": "chat.completion.chunk",
|
||||
"created": int(time.time()),
|
||||
"model": modelNameToReturn,
|
||||
"choices": [{"index": 0, "finish_reason": None, "delta": {"reasoning_content": reasoning_txt}}]
|
||||
"choices": [{"index": 0, "finish_reason": None, "delta": {"content": temp_content}}]
|
||||
})
|
||||
self.wfile.write(f"data: {chunk_content}\n\n".encode())
|
||||
self.wfile.flush()
|
||||
chunk_content = json.dumps({
|
||||
"id": "koboldcpp",
|
||||
"object": "chat.completion.chunk",
|
||||
"created": int(time.time()),
|
||||
"model": modelNameToReturn,
|
||||
"choices": [{"index": 0, "finish_reason": None, "delta": {"content": content_text}}]
|
||||
})
|
||||
self.wfile.write(f"data: {chunk_content}\n\n".encode())
|
||||
self.wfile.flush()
|
||||
|
||||
# Send tool calls incrementally in OpenAI format
|
||||
if toolsdata_res and len(toolsdata_res) > 0:
|
||||
|
|
@ -6418,6 +6470,28 @@ Change Mode<br>
|
|||
})
|
||||
self.wfile.write(f"data: {chunk_args}\n\n".encode())
|
||||
self.wfile.flush()
|
||||
else:
|
||||
# Send remaining buffered content if no tool calls were made
|
||||
if reasoning_text:
|
||||
chunk_content = json.dumps({
|
||||
"id": "koboldcpp",
|
||||
"object": "chat.completion.chunk",
|
||||
"created": int(time.time()),
|
||||
"model": modelNameToReturn,
|
||||
"choices": [{"index": 0, "finish_reason": None, "delta": {"reasoning_content": reasoning_text}}]
|
||||
})
|
||||
self.wfile.write(f"data: {chunk_content}\n\n".encode())
|
||||
self.wfile.flush()
|
||||
if content_text:
|
||||
chunk_content = json.dumps({
|
||||
"id": "koboldcpp",
|
||||
"object": "chat.completion.chunk",
|
||||
"created": int(time.time()),
|
||||
"model": modelNameToReturn,
|
||||
"choices": [{"index": 0, "finish_reason": None, "delta": {"content": content_text}}]
|
||||
})
|
||||
self.wfile.write(f"data: {chunk_content}\n\n".encode())
|
||||
self.wfile.flush()
|
||||
|
||||
# Final chunk
|
||||
chunk_final = json.dumps({
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue