mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-10 17:14:36 +00:00
special handling to resolve incomplete utf8 token sequences in qwen
This commit is contained in:
parent
32ac3153e4
commit
0028e71993
1 changed file with 11 additions and 1 deletion
12
koboldcpp.py
12
koboldcpp.py
|
@ -545,6 +545,15 @@ def tryparseint(value):
|
|||
except ValueError:
|
||||
return value
|
||||
|
||||
def is_incomplete_utf8_sequence(byte_seq):
    """Report whether byte_seq ends partway through a UTF-8 encoded character.

    Only INCOMPLETE (truncated) sequences are flagged. Data that fails to
    decode for any other reason is treated as corrupted and ignored, i.e.
    the function returns False for it, just as it does for valid UTF-8.

    Args:
        byte_seq: an object exposing ``decode`` (e.g. ``bytes`` or
            ``bytearray``) holding the bytes to inspect.

    Returns:
        True if strict UTF-8 decoding fails with the reason
        'unexpected end of data'; False if decoding succeeds or fails
        for any other reason.
    """
    try:
        byte_seq.decode('utf-8')
    except UnicodeDecodeError as err:
        # A truncated trailing character is the only failure that counts as
        # "incomplete"; every other reason means invalid, not incomplete.
        return err.reason == 'unexpected end of data'
    return False  # decoded cleanly: valid UTF-8
|
||||
|
||||
def unpack_to_dir(destpath = ""):
|
||||
import shutil
|
||||
srcpath = os.path.abspath(os.path.dirname(__file__))
|
||||
|
@ -1697,7 +1706,8 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
|||
newbyte = ctypes.string_at(token)
|
||||
incomplete_token_buffer += bytearray(newbyte)
|
||||
tokenSeg = incomplete_token_buffer.decode("UTF-8","ignore")
|
||||
badFragment = (tokenSeg==" " and len(incomplete_token_buffer)>1) #partial incomplete unicode
|
||||
incseq = is_incomplete_utf8_sequence(incomplete_token_buffer)
|
||||
badFragment = (tokenSeg==" " and len(incomplete_token_buffer)>1) or incseq #partial incomplete unicode
|
||||
if tokenSeg!="" and not badFragment:
|
||||
incomplete_token_buffer.clear()
|
||||
tokenStr += tokenSeg
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue