mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-31 21:39:42 +00:00
unicode,test: add Qwen3.5 non-backtracking tokenizer handler and regr… (#22110)
* unicode,test: add Qwen3.5 non-backtracking tokenizer handler and regression tests
- Add unicode_regex_split_custom_qwen35() to [src/unicode.cpp](src/unicode.cpp), a non-backtracking handler for Qwen3.5's [\p{L}\p{M}]+ regex (letters + combining marks).
- Register the handler in the custom tokenizer dispatch table to prevent stack overflows on long inputs (fixes #21919).
- Add [models/ggml-vocab-qwen35.gguf](models/ggml-vocab-qwen35.gguf) (test vocab), [models/ggml-vocab-qwen35.gguf.inp](models/ggml-vocab-qwen35.gguf.inp) (test cases), and [models/ggml-vocab-qwen35.gguf.out](models/ggml-vocab-qwen35.gguf.out) (expected output) for regression testing.
- Update [tests/CMakeLists.txt](tests/CMakeLists.txt) to include the new test entry.
This mirrors the Qwen2 fix (commit 0d049d6), but adapts for Qwen3.5's regex. Ensures robust Unicode tokenization and prevents std::regex stack overflows.
Closes #21919.
* fix: enhance regex handling for Qwen3.5 tokenizer to include accent marks
* cont : remove trailing whitespace
---------
Co-authored-by: Kabir <kabir@example.com>
Co-authored-by: Alde Rojas <hello@alde.dev>
This commit is contained in:
parent
dbe7901ca6
commit
42532afff4
5 changed files with 304 additions and 0 deletions
133
src/unicode.cpp
133
src/unicode.cpp
|
|
@ -605,6 +605,136 @@ static std::vector<size_t> unicode_regex_split_custom_qwen2(const std::string &
|
|||
return bpe_offsets;
|
||||
}
|
||||
|
||||
// Qwen3.5 system regex: "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
|
||||
// Compared to Qwen2, letter-runs also consume Unicode combining marks (\p{M}): [\p{L}\p{M}]+ instead of \p{L}+
|
||||
static std::vector<size_t> unicode_regex_split_custom_qwen35(const std::string & text, const std::vector<size_t> & offsets) {
|
||||
std::vector<size_t> bpe_offsets; // store the offset of each word
|
||||
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
|
||||
|
||||
const auto cpts = unicode_cpts_from_utf8(text);
|
||||
|
||||
size_t start = 0;
|
||||
for (auto offset : offsets) {
|
||||
const size_t offset_ini = start;
|
||||
const size_t offset_end = start + offset;
|
||||
assert(offset_end <= cpts.size());
|
||||
start = offset_end;
|
||||
|
||||
static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
|
||||
auto _get_cpt = [&] (const size_t pos) -> uint32_t {
|
||||
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
|
||||
};
|
||||
|
||||
auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
|
||||
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
|
||||
};
|
||||
|
||||
size_t _prev_end = offset_ini;
|
||||
auto _add_token = [&] (const size_t end) -> size_t {
|
||||
assert(_prev_end <= end && end <= offset_end);
|
||||
size_t len = end - _prev_end;
|
||||
if (len > 0) {
|
||||
bpe_offsets.push_back(len);
|
||||
}
|
||||
_prev_end = end;
|
||||
return len;
|
||||
};
|
||||
|
||||
for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
|
||||
const uint32_t cpt = _get_cpt(pos);
|
||||
const auto flags = _get_flags(pos);
|
||||
|
||||
// regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
|
||||
if (cpt == '\'' && pos+1 < offset_end) {
|
||||
uint32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
|
||||
if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
|
||||
pos += _add_token(pos+2);
|
||||
continue;
|
||||
}
|
||||
if (pos+2 < offset_end) {
|
||||
uint32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
|
||||
if ((cpt_next == 'r' && cpt_next_next == 'e') ||
|
||||
(cpt_next == 'v' && cpt_next_next == 'e') ||
|
||||
(cpt_next == 'l' && cpt_next_next == 'l')) {
|
||||
pos += _add_token(pos+3);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// regex: [^\r\n\p{L}\p{N}]?[\p{L}\p{M}]+
|
||||
if (!(cpt == '\r' || cpt == '\n' || flags.is_number)) {
|
||||
if (flags.is_letter || flags.is_accent_mark || _get_flags(pos + 1).is_accent_mark || _get_flags(pos+1).is_letter) {
|
||||
pos++;
|
||||
while (_get_flags(pos).is_letter || _get_flags(pos).is_accent_mark) {
|
||||
pos++;
|
||||
}
|
||||
_add_token(pos);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// regex: \p{N}
|
||||
if (flags.is_number) {
|
||||
pos++;
|
||||
_add_token(pos);
|
||||
continue;
|
||||
}
|
||||
|
||||
// regex: <space>?[^\s\p{L}\p{M}\p{N}]+[\r\n]*
|
||||
auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
|
||||
if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_accent_mark | flags2.is_number) && flags.as_uint()) {
|
||||
pos += (cpt == ' ');
|
||||
while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_accent_mark | flags2.is_number) && flags2.as_uint()) {
|
||||
flags2 = _get_flags(++pos);
|
||||
}
|
||||
uint32_t cpt2 = _get_cpt(pos);
|
||||
while (cpt2 == '\r' || cpt2 == '\n') {
|
||||
cpt2 = _get_cpt(++pos);
|
||||
}
|
||||
_add_token(pos);
|
||||
continue;
|
||||
}
|
||||
|
||||
size_t num_whitespaces = 0;
|
||||
size_t last_end_r_or_n = 0;
|
||||
while (_get_flags(pos+num_whitespaces).is_whitespace) {
|
||||
uint32_t cpt2 = _get_cpt(pos+num_whitespaces);
|
||||
if (cpt2 == '\r' || cpt2 == '\n') {
|
||||
last_end_r_or_n = pos + num_whitespaces + 1;
|
||||
}
|
||||
num_whitespaces++;
|
||||
}
|
||||
|
||||
// regex: \s*[\r\n]+
|
||||
if (last_end_r_or_n > 0) {
|
||||
pos = last_end_r_or_n;
|
||||
_add_token(pos);
|
||||
continue;
|
||||
}
|
||||
|
||||
// regex: \s+(?!\S)
|
||||
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) {
|
||||
pos += num_whitespaces - 1;
|
||||
_add_token(pos);
|
||||
continue;
|
||||
}
|
||||
|
||||
// regex: \s+
|
||||
if (num_whitespaces > 0) {
|
||||
pos += num_whitespaces;
|
||||
_add_token(pos);
|
||||
continue;
|
||||
}
|
||||
|
||||
// no matches
|
||||
_add_token(++pos);
|
||||
}
|
||||
}
|
||||
|
||||
return bpe_offsets;
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
static std::vector<size_t> unicode_regex_split_stl(const std::basic_string<CharT> & text, const std::basic_string<CharT> & regex, const std::vector<size_t> & offsets) {
|
||||
using BidirIt = typename std::basic_string<CharT>::const_iterator;
|
||||
|
|
@ -929,6 +1059,9 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
|
|||
} else if (
|
||||
regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
|
||||
bpe_offsets = unicode_regex_split_custom_qwen2(text, offsets);
|
||||
} else if (
|
||||
regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
|
||||
bpe_offsets = unicode_regex_split_custom_qwen35(text, offsets);
|
||||
} else if (regex_expr == "\\p{Han}+") {
|
||||
// K2's first pattern - handle all K2 patterns together
|
||||
bpe_offsets = unicode_regex_split_custom_kimi_k2(text, offsets);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue