Mirror of https://github.com/LostRuins/koboldcpp.git, synced 2025-09-11 09:34:37 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	.devops/llama-cpp-cuda.srpm.spec
#	.devops/llama-cpp.srpm.spec
#	.devops/nix/package.nix
#	.devops/rocm.Dockerfile
#	.github/ISSUE_TEMPLATE/020-enhancement.yml
#	.github/ISSUE_TEMPLATE/030-research.yml
#	.github/ISSUE_TEMPLATE/040-refactor.yml
#	.github/ISSUE_TEMPLATE/config.yml
#	.github/pull_request_template.md
#	.github/workflows/bench.yml.disabled
#	.github/workflows/build.yml
#	.github/workflows/labeler.yml
#	CONTRIBUTING.md
#	Makefile
#	README.md
#	SECURITY.md
#	ci/README.md
#	common/CMakeLists.txt
#	docs/android.md
#	docs/backend/SYCL.md
#	docs/build.md
#	docs/cuda-fedora.md
#	docs/development/HOWTO-add-model.md
#	docs/docker.md
#	docs/install.md
#	docs/llguidance.md
#	examples/cvector-generator/README.md
#	examples/imatrix/README.md
#	examples/imatrix/imatrix.cpp
#	examples/llama.android/llama/src/main/cpp/CMakeLists.txt
#	examples/llama.swiftui/README.md
#	examples/llama.vim
#	examples/lookahead/README.md
#	examples/lookup/README.md
#	examples/main/README.md
#	examples/passkey/README.md
#	examples/pydantic_models_to_grammar_examples.py
#	examples/retrieval/README.md
#	examples/server/CMakeLists.txt
#	examples/server/README.md
#	examples/simple-cmake-pkg/README.md
#	examples/speculative/README.md
#	flake.nix
#	grammars/README.md
#	pyproject.toml
#	scripts/check-requirements.sh
Commit f144b1f345
44 changed files with 276250 additions and 93 deletions
@@ -1821,7 +1821,7 @@ inline static float ggml_silu_f32(float x) {
 
 #if __FINITE_MATH_ONLY__
 #error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
-#error "ref: https://github.com/ggerganov/llama.cpp/pull/7154#issuecomment-2143844461"
+#error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
 #endif
 
 #if defined(__ARM_NEON) && defined(__aarch64__)
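For context, the guard in this hunk exists because parts of ggml.c rely on IEEE non-finite values behaving normally, for example using -INFINITY as a mask value and relying on expf(-inf) evaluating to 0, assumptions a compiler may break under -ffinite-math-only (which also defines __FINITE_MATH_ONLY__ as 1). The sketch below is a minimal standalone illustration of that pattern, not code from ggml; the masked_softmax helper and its inputs are made up for the example.

/* Minimal sketch (not actual ggml code): a masked softmax that depends on
 * expf(-INFINITY) == 0.0f, which -ffinite-math-only allows the compiler to
 * break. The same compile-time guard as in ggml.c rejects that build mode. */
#include <math.h>
#include <stdio.h>

#if defined(__FINITE_MATH_ONLY__) && __FINITE_MATH_ONLY__
#error "this example requires non-finite math -- build without -ffinite-math-only"
#endif

static void masked_softmax(float *x, int n, const int *mask) {
    /* masked-out positions are sent to -inf so they contribute exactly 0 */
    float max = -INFINITY;
    for (int i = 0; i < n; i++) {
        if (!mask[i]) x[i] = -INFINITY;
        if (x[i] > max) max = x[i];
    }
    float sum = 0.0f;
    for (int i = 0; i < n; i++) {
        x[i] = expf(x[i] - max);   /* expf(-inf) must evaluate to 0 here */
        sum += x[i];
    }
    for (int i = 0; i < n; i++) x[i] /= sum;
}

int main(void) {
    float x[4]   = { 1.0f, 2.0f, 3.0f, 4.0f };
    int  mask[4] = { 1, 1, 0, 1 };
    masked_softmax(x, 4, mask);
    for (int i = 0; i < 4; i++) printf("%f\n", x[i]);
    return 0;
}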
@@ -7613,7 +7613,7 @@ UseGgmlGemm2:;
     int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
 
     // If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread.
-    // Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggerganov/llama.cpp/pull/6915
+    // Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggml-org/llama.cpp/pull/6915
     // In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that.
     if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa()) {
         // distribute the thread work across the inner or outer loop based on which one is larger
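For context, this hunk sits in ggml's matrix-multiplication work scheduler: the output is first cut into nchunk0 x nchunk1 fixed-size chunks, and if that yields fewer than roughly four chunks per thread, or the machine is NUMA, the plan is scrapped and the work is re-chunked as one chunk per thread along the larger of the two dimensions. The helper below is a hedged sketch of that decision only; plan_chunks, the is_numa flag, and the chunk_size value of 16 are illustrative stand-ins, not the actual ggml implementation.

/* Hedged sketch of the re-chunking decision shown in the hunk above.
 * The names nr0, nr1, nth, and chunk_size follow the ggml code; the
 * helper itself is illustrative only. */
#include <stdint.h>
#include <stdbool.h>

static void plan_chunks(int64_t nr0, int64_t nr1, int nth, bool is_numa,
                        int64_t *nchunk0, int64_t *nchunk1) {
    const int64_t chunk_size = 16;  /* rows per chunk before re-chunking (assumed value) */

    /* initial plan: fixed-size chunks along both output dimensions */
    *nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
    *nchunk1 = (nr1 + chunk_size - 1) / chunk_size;

    /* If that gives fewer than ~4 chunks per thread, or the system is NUMA,
     * re-chunk by thread: split only the larger dimension, one chunk per thread. */
    if (*nchunk0 * *nchunk1 < nth * 4 || is_numa) {
        if (nr0 > nr1) {
            *nchunk0 = nth;
            *nchunk1 = 1;
        } else {
            *nchunk0 = 1;
            *nchunk1 = nth;
        }
    }
}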