mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-22 03:10:03 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts: # .flake8 # .github/workflows/bench.yml # .github/workflows/python-lint.yml # .pre-commit-config.yaml # Makefile # README.md # models/ggml-vocab-bert-bge.gguf.inp # models/ggml-vocab-bert-bge.gguf.out # models/ggml-vocab-deepseek-coder.gguf.inp # models/ggml-vocab-deepseek-coder.gguf.out # models/ggml-vocab-deepseek-llm.gguf.inp # models/ggml-vocab-deepseek-llm.gguf.out # models/ggml-vocab-falcon.gguf.inp # models/ggml-vocab-falcon.gguf.out # models/ggml-vocab-gpt-2.gguf.inp # models/ggml-vocab-gpt-2.gguf.out # models/ggml-vocab-llama-bpe.gguf.inp # models/ggml-vocab-llama-bpe.gguf.out # models/ggml-vocab-llama-spm.gguf.inp # models/ggml-vocab-llama-spm.gguf.out # models/ggml-vocab-mpt.gguf.inp # models/ggml-vocab-mpt.gguf.out # models/ggml-vocab-phi-3.gguf # models/ggml-vocab-phi-3.gguf.inp # models/ggml-vocab-phi-3.gguf.out # models/ggml-vocab-refact.gguf # models/ggml-vocab-starcoder.gguf.inp # models/ggml-vocab-starcoder.gguf.out # requirements/requirements-convert.txt # scripts/compare-llama-bench.py # scripts/run-with-preset.py # scripts/verify-checksum-models.py # tests/CMakeLists.txt # tests/test-tokenizer-0.cpp
This commit is contained in:
commit
6c000cbe7a
40 changed files with 1593 additions and 936 deletions
66
scripts/gen-unicode-data.py
Normal file
66
scripts/gen-unicode-data.py
Normal file
|
|
@ -0,0 +1,66 @@
|
|||
import regex
|
||||
|
||||
|
||||
def cpt_to_utf8_str(cpt):
    """Pack the integer ``cpt`` into four bytes, least-significant byte first.

    Despite the name, the result is not UTF-8: it is always exactly four
    bytes holding the value in little-endian order.

    Raises ``ValueError`` (from ``bytes``) when ``cpt`` is negative or does
    not fit in 32 bits.
    """
    byte0 = cpt & 0xFF
    byte1 = (cpt >> 8) & 0xFF
    byte2 = (cpt >> 16) & 0xFF
    # The top byte is intentionally left unmasked so that values outside
    # 0..0xFFFFFFFF are rejected by bytes() instead of being truncated.
    byte3 = cpt >> 24
    return bytes([byte0, byte1, byte2, byte3])
|
||||
|
||||
|
||||
def is_match(codepoint, regex_expr):
    """Return True if the single character ``codepoint`` matches ``regex_expr``.

    The code point is packed into four little-endian bytes by
    ``cpt_to_utf8_str``, decoded into a one-character ``str`` and passed to
    ``regex.match``.

    Any exception raised along the way (for example the decode error Python
    raises for surrogate code points) is reported as "no match", so this
    function always returns a bool.
    """
    try:
        # Decode with an explicit byte order. The generic 'utf-32' codec
        # interprets a leading FF FE 00 00 as a byte-order mark and strips it,
        # which turned U+FEFF into an empty string that could never match; it
        # also falls back to the platform byte order when no BOM is present,
        # whereas the bytes produced above are always little-endian.
        char = cpt_to_utf8_str(codepoint).decode('utf-32-le')
        return regex.match(regex_expr, char) is not None
    except Exception:
        # Deliberately broad: the caller probes every code point in turn and
        # relies on getting False rather than an exception.
        return False
|
||||
|
||||
|
||||
def get_matches(regex_expr):
    """Collect the contiguous code point ranges that match ``regex_expr``.

    Every code point from 0 up to (but not including) 0x110000 is tested
    with ``is_match``. Consecutive matching code points are merged into one
    inclusive ``(first, last)`` tuple; the tuples are returned in ascending
    order as a list.
    """
    ranges = []
    run_first = None  # start of the run currently being extended, if any
    run_last = None

    for cp in range(0x110000):
        if not is_match(cp, regex_expr):
            # A non-matching code point closes any open run.
            if run_first is not None:
                ranges.append((run_first, run_last))
                run_first = None
            continue

        if run_first is None:
            run_first = cp
        run_last = cp

    # Flush a run that extends to the very last code point.
    if run_first is not None:
        ranges.append((run_first, run_last))

    return ranges
|
||||
|
||||
|
||||
def print_cat(cat, ranges):
    """Print ``ranges`` to stdout as a C++ vector of ``{start, end}`` pairs.

    The emitted declaration is named ``unicode_ranges_<cat>``. Entries are
    written as zero-padded 8-digit hex pairs, four per line separated by a
    single space, followed by the closing ``};`` and one blank line.
    """
    print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat))  # noqa: NP100

    emitted = 0
    for emitted, (lo, hi) in enumerate(ranges, start=1):
        column = (emitted - 1) % 4
        lead = " " if column != 0 else ""
        # The fourth entry on a row ends the line; others stay on the same row.
        tail = "\n" if column == 3 else ""
        print(lead + "{{0x{:08X}, 0x{:08X}}},".format(lo, hi), end=tail)  # noqa: NP100

    # Terminate a partially filled final row.
    if emitted % 4 != 0:
        print("")  # noqa: NP100

    print("};")  # noqa: NP100
    print("")  # noqa: NP100
|
||||
|
||||
|
||||
# Emit one C++ range table per Unicode property class, in this fixed order.
for table_name, property_pattern in (
    ("number", r'\p{N}'),
    ("letter", r'\p{L}'),
    ("whitespace", r'\p{Z}'),
    ("accent_mark", r'\p{M}'),
    ("punctuation", r'\p{P}'),
    ("symbol", r'\p{S}'),
    ("control", r'\p{C}'),
):
    print_cat(table_name, get_matches(property_pattern))
|
||||
Loading…
Add table
Add a link
Reference in a new issue