From 0028e7199358727afecb2d5016e7d8b39df37633 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sat, 30 Nov 2024 16:54:01 +0800 Subject: [PATCH] special handling to resolve incomplete utf8 token sequences in qwen --- koboldcpp.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/koboldcpp.py b/koboldcpp.py index b9fb82042..8d38e0a8c 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -545,6 +545,15 @@ def tryparseint(value): except ValueError: return value +def is_incomplete_utf8_sequence(byte_seq): #note, this will only flag INCOMPLETE sequences, corrupted ones will be ignored. + try: + byte_seq.decode('utf-8') + return False # Valid UTF-8 + except UnicodeDecodeError as e: + if e.reason == 'unexpected end of data': + return True #incomplete sequence + return False #invalid sequence, but not incomplete + def unpack_to_dir(destpath = ""): import shutil srcpath = os.path.abspath(os.path.dirname(__file__)) @@ -1697,7 +1706,8 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler): newbyte = ctypes.string_at(token) incomplete_token_buffer += bytearray(newbyte) tokenSeg = incomplete_token_buffer.decode("UTF-8","ignore") - badFragment = (tokenSeg==" " and len(incomplete_token_buffer)>1) #partial incomplete unicode + incseq = is_incomplete_utf8_sequence(incomplete_token_buffer) + badFragment = (tokenSeg==" " and len(incomplete_token_buffer)>1) or incseq #partial incomplete unicode if tokenSeg!="" and not badFragment: incomplete_token_buffer.clear() tokenStr += tokenSeg