Merge branch 'upstream' into concedo_experimental

# Conflicts: # .flake8 # .github/workflows/bench.yml # .github/workflows/python-lint.yml # .pre-commit-config.yaml # Makefile # README.md # models/ggml-vocab-bert-bge.gguf.inp # models/ggml-vocab-bert-bge.gguf.out # models/ggml-vocab-deepseek-coder.gguf.inp # models/ggml-vocab-deepseek-coder.gguf.out # models/ggml-vocab-deepseek-llm.gguf.inp # models/ggml-vocab-deepseek-llm.gguf.out # models/ggml-vocab-falcon.gguf.inp # models/ggml-vocab-falcon.gguf.out # models/ggml-vocab-gpt-2.gguf.inp # models/ggml-vocab-gpt-2.gguf.out # models/ggml-vocab-llama-bpe.gguf.inp # models/ggml-vocab-llama-bpe.gguf.out # models/ggml-vocab-llama-spm.gguf.inp # models/ggml-vocab-llama-spm.gguf.out # models/ggml-vocab-mpt.gguf.inp # models/ggml-vocab-mpt.gguf.out # models/ggml-vocab-phi-3.gguf # models/ggml-vocab-phi-3.gguf.inp # models/ggml-vocab-phi-3.gguf.out # models/ggml-vocab-refact.gguf # models/ggml-vocab-starcoder.gguf.inp # models/ggml-vocab-starcoder.gguf.out # requirements/requirements-convert.txt # scripts/compare-llama-bench.py # scripts/run-with-preset.py # scripts/verify-checksum-models.py # tests/CMakeLists.txt # tests/test-tokenizer-0.cpp
2026-05-22 03:10:03 +00:00 · 2024-05-06 18:09:45 +08:00 · 2024-05-06 18:09:45 +08:00 · 6c000cbe7a
commit 6c000cbe7a
parent 173c7272d5 bcdee0daa7
40 changed files with 1593 additions and 936 deletions
--- a/scripts/gen-unicode-data.py
+++ b/scripts/gen-unicode-data.py
@ -0,0 +1,66 @@
+import regex
+
+
+def cpt_to_utf8_str(cpt):
+    if cpt <= 0xFF:
+        return bytes([cpt, 0, 0, 0])
+    elif cpt <= 0xFFFF:
+        return bytes([cpt & 0xFF, cpt >> 8, 0, 0])
+    elif cpt <= 0xFFFFFF:
+        return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, 0])
+    else:
+        return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, cpt >> 24])
+
+
+def is_match(codepoint, regex_expr):
+    try:
+        res = regex.match(regex_expr, cpt_to_utf8_str(codepoint).decode('utf-32'))
+        return res is not None
+    except Exception:
+        return False
+
+
+def get_matches(regex_expr):
+    unicode_ranges = []
+    current_range = None
+
+    for codepoint in range(0x110000):
+        if is_match(codepoint, regex_expr):
+            if current_range is None:
+                current_range = [codepoint, codepoint]
+            else:
+                current_range[1] = codepoint
+        elif current_range is not None:
+            unicode_ranges.append(tuple(current_range))
+            current_range = None
+
+    if current_range is not None:
+        unicode_ranges.append(tuple(current_range))
+
+    return unicode_ranges
+
+
+def print_cat(cat, ranges):
+    print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat)) # noqa: NP100
+    cnt = 0
+    for start, end in ranges:
+        if cnt % 4 != 0:
+            print(" ", end="") # noqa: NP100
+        print("{{0x{:08X}, 0x{:08X}}},".format(start, end), end="") # noqa: NP100
+        if cnt % 4 == 3:
+            print("") # noqa: NP100
+        cnt += 1
+
+    if cnt % 4 != 0:
+        print("") # noqa: NP100
+    print("};") # noqa: NP100
+    print("") # noqa: NP100
+
+
+print_cat("number",      get_matches(r'\p{N}'))
+print_cat("letter",      get_matches(r'\p{L}'))
+print_cat("whitespace",  get_matches(r'\p{Z}'))
+print_cat("accent_mark", get_matches(r'\p{M}'))
+print_cat("punctuation", get_matches(r'\p{P}'))
+print_cat("symbol",      get_matches(r'\p{S}'))
+print_cat("control",     get_matches(r'\p{C}'))