From 0028e7199358727afecb2d5016e7d8b39df37633 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Sat, 30 Nov 2024 16:54:01 +0800
Subject: [PATCH] special handling to resolve incomplete utf8 token sequences
 in qwen

---
 koboldcpp.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/koboldcpp.py b/koboldcpp.py
index b9fb82042..8d38e0a8c 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -545,6 +545,15 @@ def tryparseint(value):
     except ValueError:
         return value
 
+def is_incomplete_utf8_sequence(byte_seq): #returns True only when byte_seq is cut off mid-character (truncated multi-byte UTF-8); valid input and corrupted-but-not-truncated input both return False.
+    try:
+        byte_seq.decode('utf-8') #default (strict) error handling, so malformed input raises UnicodeDecodeError
+        return False  # Valid UTF-8, nothing is missing
+    except UnicodeDecodeError as e:
+        if e.reason == 'unexpected end of data': #reason reported when the input stops before a multi-byte character is finished
+            return True #incomplete sequence (more bytes are expected)
+        return False #invalid sequence, but not incomplete
+
 def unpack_to_dir(destpath = ""):
     import shutil
     srcpath = os.path.abspath(os.path.dirname(__file__))
@@ -1697,7 +1706,8 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
                     newbyte = ctypes.string_at(token)
                     incomplete_token_buffer += bytearray(newbyte)
                     tokenSeg = incomplete_token_buffer.decode("UTF-8","ignore")
-                    badFragment = (tokenSeg==" " and len(incomplete_token_buffer)>1) #partial incomplete unicode
+                    incseq = is_incomplete_utf8_sequence(incomplete_token_buffer)
+                    badFragment = (tokenSeg==" " and len(incomplete_token_buffer)>1) or incseq #partial incomplete unicode
                     if tokenSeg!="" and not badFragment:
                         incomplete_token_buffer.clear()
                         tokenStr += tokenSeg