Merge commit '1c641e6aac' into concedo_experimental

# Conflicts:
#	.devops/cloud-v-pipeline
#	.devops/llama-cli-cuda.Dockerfile
#	.devops/llama-cli-rocm.Dockerfile
#	.devops/llama-cli-vulkan.Dockerfile
#	.devops/llama-cli.Dockerfile
#	.devops/llama-cpp-clblast.srpm.spec
#	.devops/llama-cpp-cuda.srpm.spec
#	.devops/llama-cpp.srpm.spec
#	.devops/llama-server-cuda.Dockerfile
#	.devops/llama-server-rocm.Dockerfile
#	.devops/llama-server-vulkan.Dockerfile
#	.devops/llama-server.Dockerfile
#	.devops/nix/apps.nix
#	.devops/nix/package.nix
#	.devops/tools.sh
#	.dockerignore
#	.github/ISSUE_TEMPLATE/01-bug-low.yml
#	.github/ISSUE_TEMPLATE/02-bug-medium.yml
#	.github/ISSUE_TEMPLATE/03-bug-high.yml
#	.github/ISSUE_TEMPLATE/04-bug-critical.yml
#	.github/workflows/bench.yml
#	.github/workflows/build.yml
#	.github/workflows/docker.yml
#	.github/workflows/server.yml
#	.gitignore
#	Makefile
#	README-sycl.md
#	README.md
#	ci/run.sh
#	docs/token_generation_performance_tips.md
#	flake.nix
#	grammars/README.md
#	pocs/vdot/CMakeLists.txt
#	scripts/get-hellaswag.sh
#	scripts/get-wikitext-103.sh
#	scripts/get-wikitext-2.sh
#	scripts/get-winogrande.sh
#	scripts/hf.sh
#	scripts/pod-llama.sh
#	scripts/qnt-all.sh
#	scripts/run-all-ppl.sh
#	scripts/run-with-preset.py
#	scripts/server-llm.sh
#	tests/test-backend-ops.cpp
This commit is contained in:
Concedo 2024-06-14 18:41:37 +08:00
commit b53e760557
94 changed files with 457 additions and 317 deletions

View file

@ -1,4 +1,4 @@
set(TARGET imatrix)
set(TARGET llama-imatrix)
add_executable(${TARGET} imatrix.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

View file

@ -6,7 +6,7 @@ More information is available here: https://github.com/ggerganov/llama.cpp/pull/
## Usage
```
./imatrix \
./llama-imatrix \
-m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \
[--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \
[--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]
@ -28,8 +28,8 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument
LLAMA_CUDA=1 make -j
# generate importance matrix (imatrix.dat)
./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
# use the imatrix to perform a Q4_K_M quantization
./quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
./llama-quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
```