special handling to resolve incomplete utf8 token sequences in qwen

2025-09-10 17:14:36 +00:00 · 2024-11-30 16:54:01 +08:00 · 2024-11-30 16:54:01 +08:00 · 0028e71993
commit 0028e71993
parent 32ac3153e4
1 changed files with 11 additions and 1 deletions
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -545,6 +545,15 @@ def tryparseint(value):
    except ValueError:
        return value

+def is_incomplete_utf8_sequence(byte_seq): #returns True only for INCOMPLETE (truncated) UTF-8 input; valid input and corrupted (invalid) sequences both return False.
+    try:
+        byte_seq.decode('utf-8')
+        return False  # decoded without error, so the bytes are already valid UTF-8
+    except UnicodeDecodeError as e:
+        if e.reason == 'unexpected end of data':
+            return True #decoder reported running out of data: incomplete sequence, more bytes may complete it
+        return False #any other decode error: invalid sequence, but not incomplete
+
 def unpack_to_dir(destpath = ""):
    import shutil
    srcpath = os.path.abspath(os.path.dirname(__file__))
@@ -1697,7 +1706,8 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
                    newbyte = ctypes.string_at(token)
                    incomplete_token_buffer += bytearray(newbyte)
                    tokenSeg = incomplete_token_buffer.decode("UTF-8","ignore")
-                    badFragment = (tokenSeg==" " and len(incomplete_token_buffer)>1) #partial incomplete unicode
+                    incseq = is_incomplete_utf8_sequence(incomplete_token_buffer)
+                    badFragment = (tokenSeg==" " and len(incomplete_token_buffer)>1) or incseq #partial incomplete unicode
                    if tokenSeg!="" and not badFragment:
                        incomplete_token_buffer.clear()
                        tokenStr += tokenSeg