koboldcpp/ci/run.sh
Jan Ekström a6d6183dbc
ggml-vulkan/CMakeLists: add a check for SPIRV-Headers (#22009)
* ci/run: set explicit SPIR-V Headers search path for macOS vulkan CI

For whatever reason, the files are under additional sub-path
`vulkan/` under the cmake directory, which does not match either
current LunarG macOS Vulkan SDK structure (`lib/cmake/SPIRV-Headers`),
nor what gets installed when you run the cmake build+install for
SPIRV-Headers itself on at least Linux (`share/cmake/SPIRV-Headers`).

This allows for SPIRV-Headers to be found, as currently the CI
runner's setup does not seem to include the relevant path in
list of search locations.

* ggml-vulkan/CMakeLists: add a check for SPIRV-Headers

This is installed by the project if it is built and installed.
Receiving an error during the configuration step is generally
preferred to receiving an error in the middle of a build.
2026-05-17 13:12:11 +02:00

749 lines
29 KiB
Bash
Executable file

#!/usr/bin/env bash
#
# sample usage:
#
# mkdir tmp
#
# # CPU-only build
# bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with CUDA support
# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with SYCL support
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with VULKAN support
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with WebGPU support
# GG_BUILD_WEBGPU=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with MUSA support
# GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with KLEIDIAI support
# GG_BUILD_KLEIDIAI=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with BLAS support
# GG_BUILD_BLAS=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# with BLAS support (custom vendor)
# GG_BUILD_BLAS=1 GG_BUILD_BLAS_VENDOR=Intel10_64lp bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# with OPENVINO support
# GG_BUILD_OPENVINO=1 GG_BUILD_LOW_PERF=1 GGML_OPENVINO_DEVICE=CPU bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
if [ -z "$2" ]; then
echo "usage: $0 <output-dir> <mnt-dir>"
exit 1
fi
mkdir -p "$1"
mkdir -p "$2"
OUT=$(realpath "$1")
MNT=$(realpath "$2")
rm -f $OUT/*.log
rm -f $OUT/*.exit
rm -f $OUT/*.md
sd=`dirname $0`
cd $sd/../
SRC=`pwd`
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_OPENSSL=OFF -DGGML_SCHED_NO_REALLOC=ON"
CTEST_EXTRA=""
# Default to use make unless specified for compatibility
CMAKE_GENERATOR="Unix Makefiles"
if [ ! -z "${GG_BUILD_NINJA}" ]; then
CMAKE_GENERATOR="Ninja"
fi
if [ ! -z ${GG_BUILD_METAL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
fi
if [ ! -z ${GG_BUILD_CUDA} ]; then
# TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DGGML_CUDA_CUB_3DOT2=ON"
if command -v nvidia-smi >/dev/null 2>&1; then
CUDA_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null | head -1 | tr -d '.')
if [[ -n "$CUDA_ARCH" && "$CUDA_ARCH" =~ ^[0-9]+$ ]]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH}"
else
echo "Warning: Using fallback CUDA architectures"
CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=61;70;75;80;86;89"
fi
else
echo "Error: nvidia-smi not found, cannot build with CUDA"
exit 1
fi
fi
if [ ! -z ${GG_BUILD_ROCM} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_HIP=ON"
if [ -z ${GG_BUILD_AMDGPU_TARGETS} ]; then
echo "Missing GG_BUILD_AMDGPU_TARGETS, please set it to your GPU architecture (e.g. gfx90a, gfx1100, etc.)"
exit 1
fi
CMAKE_EXTRA="${CMAKE_EXTRA} -DGPU_TARGETS=${GG_BUILD_AMDGPU_TARGETS}"
fi
if [ ! -z ${GG_BUILD_SYCL} ]; then
if [ -z ${ONEAPI_ROOT} ]; then
echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:"
echo "source /opt/intel/oneapi/setvars.sh"
exit 1
fi
# Use only main GPU
export ONEAPI_DEVICE_SELECTOR="level_zero:0"
# Enable sysman for correct memory reporting
export ZES_ENABLE_SYSMAN=1
# to circumvent precision issues on CPY operations
export SYCL_PROGRAM_COMPILE_OPTIONS="-cl-fp32-correctly-rounded-divide-sqrt"
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
fi
if [ ! -z ${GG_BUILD_VULKAN} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
# if on Mac, disable METAL
if [[ "$OSTYPE" == "darwin"* ]]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF -DGGML_BLAS=OFF"
MACOS_RUNNER_CUSTOM_VULKAN_CMAKE_LOCATION="/usr/local/lib/cmake/vulkan"
MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION="${MACOS_RUNNER_CUSTOM_VULKAN_CMAKE_LOCATION}/SPIRV-Headers/SPIRV-HeadersConfig.cmake"
if [[ -f "${MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION}" || -h "${MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION}" ]]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DSPIRV-Headers_DIR=${MACOS_RUNNER_CUSTOM_VULKAN_CMAKE_LOCATION}/SPIRV-Headers"
fi
fi
# Build shared libs on Windows
# to reduce binary size and avoid errors in library loading unit tests
if uname -s | grep -qi nt; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DBUILD_SHARED_LIBS=ON"
fi
fi
if [ ! -z ${GG_BUILD_WEBGPU} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1 -DGGML_METAL=OFF -DGGML_BLAS=OFF"
if [ ! -z "${GG_BUILD_WEBGPU_DAWN_PREFIX}" ]; then
if [ -z "${CMAKE_PREFIX_PATH}" ]; then
export CMAKE_PREFIX_PATH="${GG_BUILD_WEBGPU_DAWN_PREFIX}"
else
export CMAKE_PREFIX_PATH="${GG_BUILD_WEBGPU_DAWN_PREFIX}:${CMAKE_PREFIX_PATH}"
fi
fi
# For some systems, Dawn_DIR needs to be set explicitly, e.g., the lib64 path
if [ ! -z "${GG_BUILD_WEBGPU_DAWN_DIR}" ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DDawn_DIR=${GG_BUILD_WEBGPU_DAWN_DIR}"
fi
fi
if [ ! -z ${GG_BUILD_MUSA} ]; then
# Use qy1 by default (MTT S80)
MUSA_ARCH=${MUSA_ARCH:-21}
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
fi
if [ ! -z ${GG_BUILD_NO_SVE} ]; then
# arm 9 and newer enables sve by default, adjust these flags depending on the cpu used
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm"
fi
if [ -n "${GG_BUILD_KLEIDIAI}" ]; then
echo ">>===== Enabling KleidiAI support"
CMAKE_EXTRA="${CMAKE_EXTRA:+$CMAKE_EXTRA } -DGGML_CPU_KLEIDIAI=ON"
fi
if [ ! -z ${GG_BUILD_BLAS} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=${GG_BUILD_BLAS_VENDOR:-OpenBLAS}"
fi
if [ ! -z ${GG_BUILD_OPENVINO} ]; then
if [ -z ${OpenVINO_DIR} ]; then
echo "OpenVINO_DIR not found, please install OpenVINO via archives and enable it by:"
echo "source /opt/intel/openvino/setupvars.sh"
exit 1
fi
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_OPENVINO=ON"
# TODO: fix and re-enable the `test-llama-archs` test below
CTEST_EXTRA="-E test-llama-archs"
fi
## helpers
# download a file if it does not exist or if it is outdated
function gg_wget {
local out=$1
local url=$2
local cwd=`pwd`
mkdir -p $out
cd $out
# should not re-download if file is the same
wget -nv -c -N $url
cd $cwd
}
function gg_printf {
printf -- "$@" >> $OUT/README.md
}
function gg_run {
ci=$1
set -o pipefail
set -x
gg_run_$ci | tee $OUT/$ci.log
cur=$?
echo "$cur" > $OUT/$ci.exit
set +x
set +o pipefail
gg_sum_$ci
ret=$((ret | cur))
}
## ci
# ctest_debug
function gg_run_ctest_debug {
cd ${SRC}
rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
set -e
# Check required binaries are installed
gg_check_build_requirements
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time cmake --build . --config Debug -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
(time ctest -C Debug --output-on-failure -L main -E "test-opt|test-backend-ops" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
set +e
}
function gg_sum_ctest_debug {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Runs ctest in debug mode\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
gg_printf '```\n'
gg_printf '\n'
}
# ctest_release
function gg_run_ctest_release {
cd ${SRC}
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e
# Check required binaries are installed
gg_check_build_requirements
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
if [ -z ${GG_BUILD_LOW_PERF} ]; then
(time ctest -C Release --output-on-failure -L 'main|python' ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
else
(time ctest -C Release --output-on-failure -L main -E test-opt ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
fi
set +e
}
function gg_sum_ctest_release {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Runs ctest in release mode\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
gg_printf '```\n'
}
# test_scripts
function gg_run_test_scripts {
cd ${SRC}
set -e
(cd ./tools/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
(cd ./tools/quantize && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
set +e
}
function gg_sum_test_scripts {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Runs test scripts\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
gg_printf '```\n'
gg_printf '\n'
}
function gg_get_model {
#local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-f16.gguf"
local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-q4_0.gguf"
if [[ -s $gguf_0 ]]; then
echo -n "$gguf_0"
else
echo >&2 "No model found. Can't run gg_run_ctest_with_model."
exit 1
fi
}
function gg_run_ctest_with_model_debug {
cd ${SRC}
local model; model=$(gg_get_model)
cd build-ci-debug
set -e
(LLAMACPP_TEST_MODELFILE="$model" time ctest -C Debug --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
set +e
cd ..
}
function gg_run_ctest_with_model_release {
cd ${SRC}
local model; model=$(gg_get_model)
cd build-ci-release
set -e
(LLAMACPP_TEST_MODELFILE="$model" time ctest -C Release --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
# test memory leaks
#if [[ ! -z ${GG_BUILD_METAL} ]]; then
# # TODO: this hangs for some reason ...
# (time leaks -quiet -atExit -- ./bin/test-thread-safety -m $model --parallel 2 -t 2 -p "hello") 2>&1 | tee -a $OUT/${ci}-leaks.log
#fi
set +e
cd ..
}
function gg_sum_ctest_with_model_debug {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Runs ctest with model files in debug mode\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
gg_printf '```\n'
}
function gg_sum_ctest_with_model_release {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Runs ctest with model files in release mode\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
gg_printf '```\n'
}
# qwen3_0_6b
function gg_run_qwen3_0_6b {
cd ${SRC}
gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/raw/main/config.json
gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/raw/main/tokenizer.json
gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/raw/main/tokenizer_config.json
#gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/raw/main/special_tokens_map.json
gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/resolve/main/model.safetensors
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
path_models="../models-mnt/qwen3/0.6B"
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf --outtype f16
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-bf16.gguf --outtype bf16
model_f16="${path_models}/ggml-model-f16.gguf"
model_bf16="${path_models}/ggml-model-bf16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
wiki_test="${path_wiki}/wiki.test.raw"
./bin/llama-quantize ${model_bf16} ${model_q8_0} q8_0 $(nproc)
./bin/llama-quantize ${model_bf16} ${model_q4_0} q4_0 $(nproc)
./bin/llama-quantize ${model_bf16} ${model_q4_1} q4_1 $(nproc)
./bin/llama-quantize ${model_bf16} ${model_q5_0} q5_0 $(nproc)
./bin/llama-quantize ${model_bf16} ${model_q5_1} q5_1 $(nproc)
./bin/llama-quantize ${model_bf16} ${model_q2_k} q2_k $(nproc)
./bin/llama-quantize ${model_bf16} ${model_q3_k} q3_k $(nproc)
./bin/llama-quantize ${model_bf16} ${model_q4_k} q4_k $(nproc)
./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)
(time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
(time ./bin/llama-completion -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-completion -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
(time ./bin/llama-completion -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-completion -no-cnv --model ${model_q4_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-completion -no-cnv --model ${model_q4_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-completion -no-cnv --model ${model_q5_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-completion -no-cnv --model ${model_q5_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-completion -no-cnv --model ${model_q2_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-completion -no-cnv --model ${model_q3_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-completion -no-cnv --model ${model_q4_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-completion -no-cnv --model ${model_q5_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-completion -no-cnv --model ${model_q6_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
if [ -z ${GG_BUILD_NO_BF16} ]; then
(time ./bin/llama-perplexity --model ${model_bf16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
fi
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
return 20
fi
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
return 0
}
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
if [ -z ${GG_BUILD_NO_BF16} ]; then
check_ppl "bf16" "$(cat $OUT/${ci}-tg-bf16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
fi
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
#check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
set +e
}
function gg_sum_qwen3_0_6b {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Qwen3 0.6B:\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
gg_printf '- f16:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
if [ -z ${GG_BUILD_NO_BF16} ]; then
gg_printf '- bf16:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-bf16.log)"
fi
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
}
# bge-small
function gg_run_embd_bge_small {
cd ${SRC}
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer.json
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/sentence_bert_config.json
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/vocab.txt
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/modules.json
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
gg_wget models-mnt/bge-small/1_Pooling https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/1_Pooling/config.json
path_models="../models-mnt/bge-small"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
(time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
set +e
}
function gg_sum_embd_bge_small {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'BGE Small (BERT):\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
}
# rerank_tiny
function gg_run_rerank_tiny {
cd ${SRC}
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer.json
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer_config.json
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/special_tokens_map.json
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/resolve/main/pytorch_model.bin
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/vocab.json
path_models="../models-mnt/rerank-tiny"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
(time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
# for this model, the SEP token is "</s>"
(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --no-op-offload --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
# sample output
# rerank score 0: 0.029
# rerank score 1: 0.029
# rerank score 2: 0.135
# check that the score is in the range [$3, $4]
function check_score {
qnt="$1"
score=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$score < $3" | bc) -eq 1 ] || [ $(echo "$score > $4" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: score not in range [%s, %s])\n' "$qnt" "$score" "$3" "$4"
return 20
fi
printf ' - %s @ %s OK\n' "$qnt" "$score"
return 0
}
check_score "rerank score 0" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 0")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
check_score "rerank score 1" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 1")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
check_score "rerank score 2" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 2")" "0.10" "0.30" | tee -a $OUT/${ci}-rk-f16.log
set +e
}
function gg_sum_rerank_tiny {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Rerank Tiny (Jina):\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-rk-f16.log)"
}
function gg_check_build_requirements {
if ! command -v git &> /dev/null; then
gg_printf 'git not found, please install'
fi
if ! command -v git-lfs &> /dev/null; then
gg_printf 'git-lfs not found, please install'
fi
if ! command -v wget &> /dev/null; then
gg_printf 'wget not found, please install'
fi
if ! command -v python3 &> /dev/null; then
gg_printf 'python3 not found, please install'
fi
if ! command -v pip3 &> /dev/null; then
gg_printf 'pip3 not found, please install'
fi
if ! python3 -m ensurepip --help &> /dev/null; then
gg_printf 'ensurepip not found, please install python3-venv package'
fi
if ! command -v cmake &> /dev/null; then
gg_printf 'cmake not found, please install'
fi
if ! command -v ccache &> /dev/null; then
gg_printf 'ccache not found, please consider installing for faster builds'
fi
if ! command -v ctest &> /dev/null; then
gg_printf 'ctest not found, please install'
fi
}
function gg_run_test_backend_ops_cpu {
cd ${SRC}
cd build-ci-release
set -e
(time ./bin/test-backend-ops -b CPU ) 2>&1 | tee -a $OUT/${ci}-test-backend-ops-cpu.log
set +e
}
function gg_sum_test_backend_ops_cpu {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Runs test-backend-ops for CPU backend\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-test-backend-ops-cpu.log)"
gg_printf '```\n'
gg_printf '\n'
}
## main
export LLAMA_LOG_PREFIX=1
export LLAMA_LOG_TIMESTAMPS=1
if [ -z ${GG_BUILD_LOW_PERF} ]; then
# Create symlink: ./llama.cpp/models-mnt -> $MNT/models
rm -rf ${SRC}/models-mnt
mnt_models=${MNT}/models
mkdir -p ${mnt_models}
ln -sfn ${mnt_models} ${SRC}/models-mnt
# Create a fresh python3 venv and enter it
if ! python3 -m venv "$MNT/venv"; then
echo "Error: Failed to create Python virtual environment at $MNT/venv."
exit 1
fi
source "$MNT/venv/bin/activate"
pip install -r ${SRC}/requirements.txt --disable-pip-version-check
pip install --editable gguf-py --disable-pip-version-check
fi
ret=0
test $ret -eq 0 && gg_run ctest_debug
test $ret -eq 0 && gg_run ctest_release
if [ ! -z ${GG_BUILD_HIGH_PERF} ]; then
test $ret -eq 0 && gg_run test_backend_ops_cpu
fi
if [ -z ${GG_BUILD_LOW_PERF} ]; then
test $ret -eq 0 && gg_run embd_bge_small
test $ret -eq 0 && gg_run rerank_tiny
if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
test $ret -eq 0 && gg_run test_scripts
fi
test $ret -eq 0 && gg_run qwen3_0_6b
test $ret -eq 0 && gg_run ctest_with_model_debug
test $ret -eq 0 && gg_run ctest_with_model_release
fi
cat $OUT/README.md
exit $ret