From 49941b62689f27c6c30e9256632409c9efbe5718 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sun, 5 Apr 2026 13:48:07 +0800 Subject: [PATCH] handle think streaming for gemma4 --- koboldcpp.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/koboldcpp.py b/koboldcpp.py index aad330af9..e2c6e07c7 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -3051,6 +3051,7 @@ def repack_toolcall_tags(text: str): text = re.sub(r'.*?', '', text, flags=re.DOTALL) text = re.sub(r'.*?', '', text, flags=re.DOTALL) text = re.sub(r'.*?', '', text, flags=re.DOTALL) + text = re.sub(r'<\|channel>thought.*?', '', text, flags=re.DOTALL) text = text.strip() tcpairs = [ ("", ""), @@ -3628,7 +3629,7 @@ ws ::= | " " | "\n" [ \t]{0,20} jinja_output = format_jinja(messages_array,jinjatools,jinjakwargs) if jinja_output: messages_string = jinja_output - if jinja_output.rstrip().endswith(""): #the prompt template already forced a start think. + if jinja_output.rstrip().endswith("") or jinja_output.rstrip().endswith("<|channel>thought") : #the prompt template already forced a start think. genparams["already_started_thinking"] = True if jinjatools and len(jinjatools)>0: genparams["using_openai_tools"] = True @@ -3746,7 +3747,7 @@ ws ::= | " " | "\n" [ \t]{0,20} if (latest_turn_was_assistant and continue_assistant_turn): #allow continue a prefill, chop off end messages_string = messages_string[:-(len(assistant_message_gen)+len(assistant_message_end))] genparams["prompt"] = messages_string - if messages_string.rstrip().endswith(""): #the prompt template already forced a start think. + if messages_string.rstrip().endswith("") or messages_string.rstrip().endswith("<|channel>thought") : #the prompt template already forced a start think. genparams["already_started_thinking"] = True if len(images_added)>0: genparams["images"] = images_added @@ -4486,12 +4487,14 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler): if api_format == 4 and using_openai_tools: # if tools, do not send anything else - OAI tool calls will be handled with fakestreaming! return + think_tag_buf = "" encap_in_thinking = False if genparams.get('already_started_thinking', False): encap_in_thinking = True encap_first_loop = True thinkpairs = [{"start":"<|channel|>analysis<|message|>","end":"<|start|>assistant<|channel|>final<|message|>"}, - {"start":"","end":""}] + {"start":"","end":""}, + {"start":"<|channel>thought","end":""}] responses_first_loop = True anthropic_first_loop = True rseq_num = 0 @@ -4528,6 +4531,21 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler): tokenStr += tokenSeg if tokenStr!="" or streamDone: + # split think tag handling + tokenStr = think_tag_buf + tokenStr + think_tag_buf = "" + if not streamDone and genparams.get('encapsulate_thinking', True): + tail = "" + for pair in thinkpairs: + for tag in (pair["start"], pair["end"]): + for n in range(1, len(tag)): + if tokenStr.endswith(tag[:n]) and len(tag[:n]) > len(tail): + tail = tag[:n] + if tail: + think_tag_buf = tail + tokenStr = tokenStr[:-len(tail)] + # end split think tag handling + sseq = genparams.get('stop_sequence', []) trimstop = genparams.get('trim_stop', True) if trimstop and not streamDone and string_contains_or_overlaps_sequence_substring(tokenStr,sseq): @@ -6079,8 +6097,8 @@ Change Mode
# Send content if present if content_text: reasoning_txt = "" - thinkstrips = [""] - thinksplitters = [""] + thinkstrips = ["","<|channel>thought"] + thinksplitters = ["",""] for tsp in thinksplitters: if tsp in content_text: parts = content_text.split(tsp, 1)