Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	.github/workflows/docker.yml
#	README.md
#	build-xcframework.sh
#	common/CMakeLists.txt
#	examples/CMakeLists.txt
#	ggml/src/ggml-cpu/CMakeLists.txt
#	ggml/src/ggml-cuda/CMakeLists.txt
#	ggml/src/ggml-metal/ggml-metal.m
#	ggml/src/ggml-metal/ggml-metal.metal
#	ggml/src/ggml-sycl/CMakeLists.txt
#	ggml/src/ggml-sycl/backend.hpp
#	ggml/src/ggml-sycl/common.hpp
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	ggml/src/ggml-sycl/mmvq.cpp
#	ggml/src/ggml-sycl/vecdotq.hpp
#	scripts/compare-llama-bench.py
#	src/CMakeLists.txt
#	src/llama-model.cpp
#	src/llama.cpp
#	tests/test-backend-ops.cpp
#	tests/test-opt.cpp
#	tools/llama-bench/README.md
#	tools/llama-bench/llama-bench.cpp
#	tools/mtmd/CMakeLists.txt
#	tools/mtmd/README.md
#	tools/mtmd/clip.cpp
#	tools/rpc/rpc-server.cpp
#	tools/server/CMakeLists.txt
#	tools/server/README.md
This commit is contained in:
Concedo 2025-05-13 00:28:35 +08:00
commit 21e31e255b
90 changed files with 4390 additions and 1388 deletions

View file

@ -522,7 +522,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
nthread = std::thread::hardware_concurrency();
}
// mmap consistently increases speed Linux, and also increases speed on Windows with
// mmap consistently increases speed on Linux, and also increases speed on Windows with
// hot cache. It may cause a slowdown on macOS, possibly related to free memory.
#if defined(__linux__) || defined(_WIN32)
constexpr bool use_mmap = true;
@ -532,7 +532,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
llama_model_kv_override * kv_overrides = nullptr;
if (params->kv_overrides) {
auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
auto * v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
kv_overrides = v->data();
}