Merge branch 'upstream' into concedo_experimental

# Conflicts: # .gitignore # README.md # docs/backend/BLIS.md # docs/backend/SYCL.md # docs/development/llama-star/idea-arch.key # docs/development/llama-star/idea-arch.pdf # docs/development/token_generation_performance_tips.md # src/llama.cpp # tests/test-tokenizer-0.cpp # tests/test-tokenizer-1-bpe.cpp # tests/test-tokenizer-1-spm.cpp # tests/test-tokenizer-random.py
2026-05-09 19:46:11 +00:00 · 2024-07-06 19:39:24 +08:00 · 2024-07-06 19:39:24 +08:00 · 8e5fd6f509
commit 8e5fd6f509
parent 5e458f42da 87e25a1d1b
28 changed files with 352 additions and 2091 deletions
--- a/scripts/debug-test.sh
+++ b/scripts/debug-test.sh
@ -1,203 +0,0 @@
-#!/bin/bash
-
-PROG=${0##*/}
-build_dir="build-ci-debug"
-
-# Print Color Commands
-red=$(tput setaf 1)
-green=$(tput setaf 2)
-yellow=$(tput setaf 3)
-blue=$(tput setaf 4)
-magenta=$(tput setaf 5)
-cyan=$(tput setaf 6)
-normal=$(tput sgr0)
-
-
-# Print Help Message
-####################
-
-# Print the complete usage text (options, arguments and examples) to stdout.
-# $PROG is expanded inside the heredoc, so the text shows the name the script was invoked as.
-print_full_help() {
-  cat << EOF
-Usage: $PROG [OPTION]... <test_regex> (test_number)
-Debug specific ctest program.
-
-Options:
-  -h, --help            display this help and exit
-  -g                    run in gdb mode
-
-Arguments:
-  <test_regex>     (Mandatory) Supply one regex to the script to filter tests
-  (test_number)    (Optional) Test number to run a specific test
-
-Example:
-  $PROG test-tokenizer
-  $PROG test-tokenizer 3
-EOF
-}
-
-# Report a fatal error and terminate the script.
-#   $1 : message describing what went wrong
-# Writes "Error: <message>" followed by a short usage reminder to stderr,
-# then exits with status 1.
-abort() {
-  {
-    printf 'Error: %s\n' "$1"
-    cat << USAGE
-Usage: $PROG [OPTION]... <test_regex> (test_number)
-Debug specific ctest program.
-Refer to --help for full instructions.
-USAGE
-  } >&2
-  exit 1
-}
-
-
-# Dependency Sanity Check
-#########################
-
-check_dependency() {
-  command -v "$1" >/dev/null 2>&1 || {
-    abort "$1 is required but not found. Please install it and try again."
-  }
-}
-
-check_dependency ctest
-check_dependency cmake
-
-
-# Step 0: Check the args
-########################
-
-# Help is handled before getopts so that the long form --help is recognised as well.
-if [ x"$1" = x"-h" ] || [ x"$1" = x"--help" ]; then
-  print_full_help >&2
-  exit 0
-fi
-
-# Parse command-line options
-gdb_mode=false
-while getopts "g" opt; do
-    case $opt in
-        g)
-            gdb_mode=true
-            echo "gdb_mode Mode Enabled"
-            ;;
-        *)
-            # getopts has already printed which flag it did not recognise;
-            # stop here instead of silently carrying on without it.
-            abort "Unrecognized option"
-            ;;
-    esac
-done
-
-# Shift the option parameters
-shift $((OPTIND - 1))
-
-# Positional Argument Processing : <test_regex>
-if [ -z "${1}" ]; then
-    abort "Test regex is required"
-fi
-test_suite=${1}
-
-# Positional Argument Processing : (test_number)
-# Left empty when not supplied; the user is prompted for it later.
-test_number=${2:-}
-
-
-# Step 1: Reset and Setup folder context
-########################################
-
-## Sanity check that we are actually in a git repo
-# Outside a repository the command substitution yields an empty string,
-# which fails the directory test below.
-repo_root=$(git rev-parse --show-toplevel)
-if [ ! -d "$repo_root" ]; then
-    abort "Not in a Git repository."
-fi
-
-## Reset folder to root context of git repo and recreate the build directory
-# Abort if the directory change fails, so that the rm -rf below can never
-# run relative to an unexpected working directory.
-pushd "$repo_root" || abort "Failed to enter $repo_root"
-rm -rf "$build_dir" && mkdir "$build_dir" || abort "Failed to make $build_dir"
-
-
-# Step 2: Setup Build Environment and Compile Test Binaries
-###########################################################
-
-# Note: test-eval-callback requires -DLLAMA_CURL
-cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DGGML_CUDA=1 -DLLAMA_CURL=1 || abort "Failed to build enviroment"
-pushd "$build_dir"
-make -j || abort "Failed to compile"
-popd > /dev/null || exit 1
-
-
-# Step 3: Find all tests available that matches REGEX
-####################################################
-
-# Ctest Gather Tests
-# `-R test-tokenizer` : looks for all the test files named `test-tokenizer*` (R=Regex)
-# `-N` : "show-only" disables test execution & shows test commands that you can feed to GDB.
-# `-V` : Verbose Mode
-# The regex is passed as a printf argument (not as part of the format) and is
-# quoted for ctest, so characters such as `%`, `*` or spaces reach ctest intact.
-printf '\n\nGathering tests that fit REGEX: %s ...\n' "${test_suite}"
-pushd "$build_dir" || exit 1
-# Keep only the "Test #<n>: <name>" lines, take the part after the colon and trim it.
-tests=($(ctest -R "${test_suite}" -V -N | grep -E " +Test +#[0-9]+" | cut -d':' -f2 | awk '{$1=$1};1'))
-if [ ${#tests[@]} -eq 0 ]; then
-    abort "No tests avaliable... check your compliation process..."
-fi
-popd > /dev/null || exit 1
-
-
-# Step 4: Identify Test Command for Debugging
-#############################################
-
-# Select test number
-if [ -z "$test_number" ]; then
-    # List out available tests
-    printf "Which test would you like to debug?\n"
-    id=0
-    for s in "${tests[@]}"
-    do
-        echo "Test# ${id}"
-        echo "  $s"
-        id=$((id + 1))
-    done
-
-    # Prompt user which test they wanted to run
-    printf "\nRun test#? "
-    read -r test_number
-
-else
-    printf '\nUser Already Requested #%s\n' "${test_number}"
-
-fi
-
-# Only accept a valid index into the list of gathered tests. Without this
-# check an empty or non-numeric answer silently selects test 0, and an
-# out-of-range number runs an empty command that is then reported as a pass.
-case "$test_number" in
-    ''|*[!0-9]*) abort "Test number must be a non-negative integer" ;;
-esac
-# Force base 10 so that input with leading zeros is not read as octal.
-test_number=$((10#$test_number))
-if [ "$test_number" -ge "${#tests[@]}" ]; then
-    abort "Test number ${test_number} is out of range"
-fi
-
-# Grab all tests commands
-pushd "$build_dir"
-sIFS=$IFS # Save Initial IFS (Internal Field Separator)
-IFS=$'\n' # Change IFS (Internal Field Separator) (So we split ctest output by newline rather than by spaces)
-test_args=($(ctest -R ${test_suite} -V -N | grep "Test command" | cut -d':' -f3 | awk '{$1=$1};1' )) # Get test args
-IFS=$sIFS # Reset IFS (Internal Field Separator)
-popd > /dev/null || exit 1
-
-# Grab specific test command
-single_test_name="${tests[test_number]}"
-single_test_command="${test_args[test_number]}"
-
-
-# Step 5: Execute or GDB Debug
-##############################
-
-# All variable text is passed as printf arguments instead of being embedded in
-# the format string, so a `%` or backslash in a test name or command is printed
-# literally.
-printf '%sRunning Test #%s: %s%s\n' "${magenta}" "${test_number}" "${single_test_name}" "${normal}"
-printf '%ssingle_test_command: %s%s\n' "${cyan}" "${single_test_command}" "${normal}"
-
-if [ "$gdb_mode" = "true" ]; then
-    # Execute debugger
-    pushd "$repo_root" || exit 1
-    # eval re-parses the command string, presumably so that quoted arguments
-    # in the ctest output are split the way a shell would split them.
-    eval "gdb --args ${single_test_command}"
-    popd > /dev/null || exit 1
-
-else
-    # Execute Test
-    pushd "$repo_root" || exit 1
-    eval "${single_test_command}"
-    exit_code=$?
-    popd > /dev/null || exit 1
-
-    # Print Result
-    printf '%sRan Test #%s: %s%s\n' "${blue}" "${test_number}" "${single_test_name}" "${normal}"
-    printf '%sCommand: %s%s\n' "${yellow}" "${single_test_command}" "${normal}"
-    if [ $exit_code -eq 0 ]; then
-        printf '%sTEST PASS%s\n' "${green}" "${normal}"
-    else
-        printf '%sTEST FAIL%s\n' "${red}" "${normal}"
-    fi
-
-fi
-
-# Return to the directory from which the user ran the command.
-popd > /dev/null || exit 1
--- a/scripts/gen-unicode-data.py
+++ b/scripts/gen-unicode-data.py
@ -1,194 +0,0 @@
-import array
-import unicodedata
-import requests
-
-
-MAX_CODEPOINTS = 0x110000
-
-UNICODE_DATA_URL = "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"
-
-
-# see https://www.unicode.org/L2/L1999/UnicodeData.html
-def unicode_data_iter():
-    res = requests.get(UNICODE_DATA_URL)
-    res.raise_for_status()
-    data = res.content.decode()
-
-    prev = []
-
-    for line in data.splitlines():
-        # ej: 0000;<control>;Cc;0;BN;;;;;N;NULL;;;;
-        line = line.split(";")
-
-        cpt = int(line[0], base=16)
-        assert cpt < MAX_CODEPOINTS
-
-        cpt_lower = int(line[-2] or "0", base=16)
-        assert cpt_lower < MAX_CODEPOINTS
-
-        cpt_upper = int(line[-3] or "0", base=16)
-        assert cpt_upper < MAX_CODEPOINTS
-
-        categ = line[2].strip()
-        assert len(categ) == 2
-
-        bidir = line[4].strip()
-        assert len(categ) == 2
-
-        name = line[1]
-        if name.endswith(", First>"):
-            prev = (cpt, cpt_lower, cpt_upper, categ, bidir)
-            continue
-        if name.endswith(", Last>"):
-            assert prev[1:] == (0, 0, categ, bidir)
-            for c in range(prev[0], cpt):
-                yield (c, cpt_lower, cpt_upper, categ, bidir)
-
-        yield (cpt, cpt_lower, cpt_upper, categ, bidir)
-
-
-# Bit flags stored per codepoint, one bit per top-level category.
-# see definition in unicode.h
-CODEPOINT_FLAG_UNDEFINED   = 0x0001  # no category assigned
-CODEPOINT_FLAG_NUMBER      = 0x0002  # \p{N}
-CODEPOINT_FLAG_LETTER      = 0x0004  # \p{L}
-CODEPOINT_FLAG_SEPARATOR   = 0x0008  # \p{Z}
-CODEPOINT_FLAG_MARK        = 0x0010  # \p{M}
-CODEPOINT_FLAG_PUNCTUATION = 0x0020  # \p{P}
-CODEPOINT_FLAG_SYMBOL      = 0x0040  # \p{S}
-CODEPOINT_FLAG_CONTROL     = 0x0080  # \p{C}
-
-# Maps each two-character general-category code to the flag stored for it.
-UNICODE_CATEGORY_TO_FLAG = {
-    "Cn": CODEPOINT_FLAG_UNDEFINED,    # Undefined
-    "Cc": CODEPOINT_FLAG_CONTROL,      # Control
-    "Cf": CODEPOINT_FLAG_CONTROL,      # Format
-    "Co": CODEPOINT_FLAG_CONTROL,      # Private Use
-    "Cs": CODEPOINT_FLAG_CONTROL,      # Surrogate
-    "Ll": CODEPOINT_FLAG_LETTER,       # Lowercase Letter
-    "Lm": CODEPOINT_FLAG_LETTER,       # Modifier Letter
-    "Lo": CODEPOINT_FLAG_LETTER,       # Other Letter
-    "Lt": CODEPOINT_FLAG_LETTER,       # Titlecase Letter
-    "Lu": CODEPOINT_FLAG_LETTER,       # Uppercase Letter
-    "L&": CODEPOINT_FLAG_LETTER,       # Cased Letter
-    "Mc": CODEPOINT_FLAG_MARK,         # Spacing Mark
-    "Me": CODEPOINT_FLAG_MARK,         # Enclosing Mark
-    "Mn": CODEPOINT_FLAG_MARK,         # Nonspacing Mark
-    "Nd": CODEPOINT_FLAG_NUMBER,       # Decimal Number
-    "Nl": CODEPOINT_FLAG_NUMBER,       # Letter Number
-    "No": CODEPOINT_FLAG_NUMBER,       # Other Number
-    "Pc": CODEPOINT_FLAG_PUNCTUATION,  # Connector Punctuation
-    "Pd": CODEPOINT_FLAG_PUNCTUATION,  # Dash Punctuation
-    "Pe": CODEPOINT_FLAG_PUNCTUATION,  # Close Punctuation
-    "Pf": CODEPOINT_FLAG_PUNCTUATION,  # Final Punctuation
-    "Pi": CODEPOINT_FLAG_PUNCTUATION,  # Initial Punctuation
-    "Po": CODEPOINT_FLAG_PUNCTUATION,  # Other Punctuation
-    "Ps": CODEPOINT_FLAG_PUNCTUATION,  # Open Punctuation
-    "Sc": CODEPOINT_FLAG_SYMBOL,       # Currency Symbol
-    "Sk": CODEPOINT_FLAG_SYMBOL,       # Modifier Symbol
-    "Sm": CODEPOINT_FLAG_SYMBOL,       # Math Symbol
-    "So": CODEPOINT_FLAG_SYMBOL,       # Other Symbol
-    "Zl": CODEPOINT_FLAG_SEPARATOR,    # Line Separator
-    "Zp": CODEPOINT_FLAG_SEPARATOR,    # Paragraph Separator
-    "Zs": CODEPOINT_FLAG_SEPARATOR,    # Space Separator
-}
-
-
-codepoint_flags = array.array('H', [CODEPOINT_FLAG_UNDEFINED]) * MAX_CODEPOINTS
-table_whitespace = []
-table_lowercase = []
-table_uppercase = []
-table_nfd = []
-
-for (cpt, cpt_lower, cpt_upper, categ, bidir) in unicode_data_iter():
-    # convert codepoint to unicode character
-    char = chr(cpt)
-
-    # codepoint category flags
-    codepoint_flags[cpt] = UNICODE_CATEGORY_TO_FLAG[categ]
-
-    # lowercase conversion
-    if cpt_lower:
-        table_lowercase.append((cpt, cpt_lower))
-
-    # uppercase conversion
-    if cpt_upper:
-        table_uppercase.append((cpt, cpt_upper))
-
-    # NFD normalization
-    norm = ord(unicodedata.normalize('NFD', char)[0])
-    if cpt != norm:
-        table_nfd.append((cpt, norm))
-
-
-# whitespaces, see "<White_Space>" https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
-table_whitespace.extend(range(0x0009, 0x000D + 1))
-table_whitespace.extend(range(0x2000, 0x200A + 1))
-table_whitespace.extend([0x0020, 0x0085, 0x00A0, 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000])
-
-
-# sort by codepoint
-table_whitespace.sort()
-table_lowercase.sort()
-table_uppercase.sort()
-table_nfd.sort()
-
-
-# group ranges with same flags
-# Each entry marks the first codepoint of a run that shares one flag value;
-# a run ends where the next entry starts.
-ranges_flags = [(0, codepoint_flags[0])]  # start, flags
-for codepoint, flags in enumerate(codepoint_flags):
-    if flags != ranges_flags[-1][1]:
-        ranges_flags.append((codepoint, flags))
-# Closing sentinel so the last real run also has a "next start".
-ranges_flags.append((MAX_CODEPOINTS, 0x0000))
-
-
-# group ranges with same nfd
-# Consecutive codepoints that normalize to the same value are merged into one
-# (start, last, nfd) entry. The list starts with a (0, 0, 0) entry.
-ranges_nfd = [(0, 0, 0)]  # start, last, nfd
-for codepoint, norm in table_nfd:
-    start = ranges_nfd[-1][0]
-    # If the current codepoint does not directly extend the last range,
-    # open a new one: append a placeholder that is filled in just below.
-    if ranges_nfd[-1] != (start, codepoint - 1, norm):
-        ranges_nfd.append(None)
-        start = codepoint
-    # Either extends the last range by one codepoint or fills the placeholder.
-    ranges_nfd[-1] = (start, codepoint, norm)
-
-
-# Generate 'unicode-data.cpp':
-#   python ./scripts/gen-unicode-data.py > unicode-data.cpp
-
-def out(line=""):
-    print(line, end='\n')  # noqa
-
-
-out("""\
-// generated with scripts/gen-unicode-data.py
-
-#include "unicode-data.h"
-
-#include <cstdint>
-#include <vector>
-#include <unordered_map>
-#include <unordered_set>
-""")
-
-out("const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = {  // start, flags // last=next_start-1")
-for codepoint, flags in ranges_flags:
-    out("{0x%06X, 0x%04X}," % (codepoint, flags))
-out("};\n")
-
-out("const std::unordered_set<uint32_t> unicode_set_whitespace = {")
-for codepoint in table_whitespace:
-    out("0x%06X," % codepoint)
-out("};\n")
-
-out("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {")
-for tuple in table_lowercase:
-    out("{0x%06X, 0x%06X}," % tuple)
-out("};\n")
-
-out("const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {")
-for tuple in table_uppercase:
-    out("{0x%06X, 0x%06X}," % tuple)
-out("};\n")
-
-out("const std::vector<range_nfd> unicode_ranges_nfd = {  // start, last, nfd")
-for triple in ranges_nfd:
-    out("{0x%06X, 0x%06X, 0x%06X}," % triple)
-out("};\n")
--- a/scripts/xxd.cmake
+++ b/scripts/xxd.cmake
@ -1,16 +0,0 @@
-# CMake equivalent of `xxd -i ${INPUT} ${OUTPUT}`
-# Usage: cmake -DINPUT=examples/server/public/index.html -DOUTPUT=examples/server/index.html.hpp -P scripts/xxd.cmake
-
-set(INPUT "" CACHE STRING "Input File")
-set(OUTPUT "" CACHE STRING "Output File")
-
-# Fail early with a clear message when either required argument is missing.
-if("${INPUT}" STREQUAL "" OR "${OUTPUT}" STREQUAL "")
-  message(FATAL_ERROR "Usage: cmake -DINPUT=<input file> -DOUTPUT=<output file> -P scripts/xxd.cmake")
-endif()
-
-# The C identifier is the input's file name with every '.' and '-' turned into '_'.
-get_filename_component(filename "${INPUT}" NAME)
-string(REGEX REPLACE "\\.|-" "_" name "${filename}")
-
-# Read the file as a hex string (two digits per byte) and turn every byte
-# into a "0xNN," initializer.
-file(READ "${INPUT}" hex_data HEX)
-string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," hex_sequence "${hex_data}")
-
-# The expansion must be quoted: an empty input file gives an empty hex string,
-# and an unquoted empty expansion would drop the argument and make
-# string(LENGTH) fail.
-string(LENGTH "${hex_data}" hex_len)
-math(EXPR len "${hex_len} / 2")
-
-file(WRITE "${OUTPUT}" "unsigned char ${name}[] = {${hex_sequence}};\nunsigned int ${name}_len = ${len};\n")