Add CMake flag for pipeline parallelism for multi-GPU (#940)

The llama.cpp (LCPP) default is 4, which is a bit too high in my opinion.
Setting it to 2 saves some VRAM (0.5-1%?), some compute, and some electricity, at the expense of a potential performance loss (in prompt processing?) that I do not notice in practice. I therefore use 2 as my own setting.
This commit is contained in:
Nexesenex 2024-06-25 13:28:41 +02:00 committed by GitHub
parent f7a0d252e6
commit dd5cda06b7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@@ -26,6 +26,7 @@ set(LLAMA_GPROF OFF)
set(LLAMA_SANITIZE_THREAD OFF) set(LLAMA_SANITIZE_THREAD OFF)
set(LLAMA_SANITIZE_ADDRESS OFF) set(LLAMA_SANITIZE_ADDRESS OFF)
set(LLAMA_SANITIZE_UNDEFINED OFF) set(LLAMA_SANITIZE_UNDEFINED OFF)
set(LLAMA_SCHED_MAX_COPIES "2" CACHE STRING "llama: max input copies for pipeline parallelism")
# instruction set specific # instruction set specific
option(LLAMA_AVX "llama: enable AVX" ON) option(LLAMA_AVX "llama: enable AVX" ON)
@@ -66,6 +67,7 @@ set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED) find_package(Threads REQUIRED)
add_compile_definitions(LOG_DISABLE_LOGS) add_compile_definitions(LOG_DISABLE_LOGS)
add_compile_definitions(GGML_SCHED_MAX_COPIES=${LLAMA_SCHED_MAX_COPIES})
file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu") file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu") list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")