Mirror of https://github.com/LostRuins/koboldcpp.git

commit b951310ca5 (parent a5b6f372a3)
tryout smaller binaries

6 changed files with 35 additions and 13 deletions
@@ -11,6 +11,7 @@ on:
 env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
   KCPP_CUDA: 12.1.0
+  ARCHES_CU12: 1
 
 jobs:
   linux:
@@ -11,6 +11,7 @@ on:
 env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
   NOAVX2: 1
+  ARCHES_CU11: 1
 
 jobs:
   linux:
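Both hunks above add one env key to a CI workflow (the workflow file names were not captured on this page). These keys are consumed by the Linux build script changed at the end of this commit, which maps them onto make flags. A minimal sketch of the propagation, using only names that appear in this diff:

    # The workflow's env: block sets the key, e.g. ARCHES_CU12=1.
    # The build script translates it into a make flag:
    if [ -n "$ARCHES_CU12" ]; then
      ARCHES_FLAG="LLAMA_ARCHES_CU12=1"
    fi
    # ...which the Makefile then uses to pick the -gencode target list:
    make LLAMA_CUBLAS=1 LLAMA_PORTABLE=1 $ARCHES_FLAG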
@@ -131,14 +131,14 @@ if (LLAMA_CUBLAS)
     # 70 == (assumed) compute capability at which unrolling a loop in mul_mat_q kernels is faster
     # 75 == int8 tensor cores
     if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
-        set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75") # needed for f16 CUDA intrinsics
+        set(CMAKE_CUDA_ARCHITECTURES "60-virtual;61-virtual;70-virtual;75-virtual") # needed for f16 CUDA intrinsics
     else()
         message("CUDA Toolkit Version: ${CUDAToolkit_VERSION}")
         if(CUDAToolkit_VERSION VERSION_GREATER 12)
             add_compile_definitions(GGML_CUDA_USE_GRAPHS) #try enable cuda graphs on cu12 build
-            set(CMAKE_CUDA_ARCHITECTURES "50;61;70;75") # lowest CUDA 12 standard + lowest for integer intrinsics
+            set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual") # lowest CUDA 12 standard + lowest for integer intrinsics
         else()
-            set(CMAKE_CUDA_ARCHITECTURES "35;50;61;70;75") # lowest CUDA 12 standard + lowest for integer intrinsics
+            set(CMAKE_CUDA_ARCHITECTURES "35-virtual;50-virtual;61-virtual;70-virtual;75-virtual") # lowest CUDA 12 standard + lowest for integer intrinsics
         endif()
     endif()
 endif()
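The -virtual suffix is what shrinks the binaries: a plain entry like "75" tells CMake to embed both PTX and compiled SASS for that architecture, while "75-virtual" embeds PTX only, which the driver JIT-compiles for the actual GPU at first load (at the cost of a one-time JIT delay). Roughly, the two spellings expand to these nvcc flags (a sketch; CMake 3.18+ semantics):

    # CMAKE_CUDA_ARCHITECTURES "75": PTX + SASS, larger fatbin
    nvcc -gencode arch=compute_75,code=[compute_75,sm_75] -c kernel.cu
    # CMAKE_CUDA_ARCHITECTURES "75-virtual": PTX only, smaller fatbin
    nvcc -gencode arch=compute_75,code=compute_75 -c kernel.cu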
Makefile
@@ -189,19 +189,32 @@ ifdef LLAMA_ADD_CONDA_PATHS
 	CUBLASLD_FLAGS += -Lconda/envs/linux/lib -Lconda/envs/linux/lib/stubs
 endif
 
 ifdef CUDA_DOCKER_ARCH
 	NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
 else
 ifdef LLAMA_PORTABLE
 ifdef LLAMA_COLAB #colab does not need all targets, all-major doesnt work correctly with pascal
 	NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=all-major
+else ifdef LLAMA_ARCHES_CU11
+	NVCCFLAGS += -Wno-deprecated-gpu-targets \
+	-gencode arch=compute_35,code=compute_35 \
+	-gencode arch=compute_50,code=compute_50 \
+	-gencode arch=compute_61,code=compute_61 \
+	-gencode arch=compute_70,code=compute_70 \
+	-gencode arch=compute_75,code=compute_75
+else ifdef LLAMA_ARCHES_CU12
+	NVCCFLAGS += -Wno-deprecated-gpu-targets \
+	-gencode arch=compute_50,code=compute_50 \
+	-gencode arch=compute_61,code=compute_61 \
+	-gencode arch=compute_70,code=compute_70 \
+	-gencode arch=compute_75,code=compute_75 \
+	-gencode arch=compute_80,code=compute_80
 else
 	NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=all
 endif #LLAMA_COLAB
 else
 	NVCCFLAGS += -arch=native
 endif #LLAMA_PORTABLE
 endif # CUDA_DOCKER_ARCH
 
 ifdef LLAMA_CUDA_F16
 	NVCCFLAGS += -DGGML_CUDA_F16
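On the Makefile path the same size trick is spelled out manually: each -gencode arch=compute_XX,code=compute_XX pair embeds PTX only for that architecture, instead of the full per-architecture SASS that -arch=all produces. One way to check what actually got embedded in the built artifact (a sketch; assumes the CUDA toolkit's cuobjdump is on PATH and uses koboldcpp's usual Linux output name):

    # PTX-only builds list entries here...
    cuobjdump --list-ptx koboldcpp_cublas.so
    # ...and little or nothing here (no per-arch SASS embedded)
    cuobjdump --list-elf koboldcpp_cublas.so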
@@ -59,7 +59,7 @@ KoboldCpp can now also be run on Novita AI, a newer alternative GPU cloud provider
 
 ## Docker
 - The official docker can be found at https://hub.docker.com/r/koboldai/koboldcpp
-- If you're building your own docker, remember to set CUDA_DOCKER_ARCH or enable LLAMA_PORTABLE
+- If you're building your own docker, remember to enable LLAMA_PORTABLE
 
 ## Obtaining a GGUF model
 - KoboldCpp uses GGUF models. They are not included with KoboldCpp, but you can download GGUF files from other places such as [Bartowski's Huggingface](https://huggingface.co/bartowski). Search for "GGUF" on huggingface.co for plenty of compatible models in the `.gguf` format.
@@ -21,11 +21,18 @@ KCPP_CUDA=$(<conda/envs/linux/cudaver)
 KCPP_CUDAAPPEND=-cuda${KCPP_CUDA//.}$KCPP_APPEND
 
 LLAMA_NOAVX2_FLAG=""
+ARCHES_FLAG=""
 if [ -n "$NOAVX2" ]; then
   LLAMA_NOAVX2_FLAG="LLAMA_NOAVX2=1"
 fi
+if [ -n "$ARCHES_CU11" ]; then
+  ARCHES_FLAG="LLAMA_ARCHES_CU11=1"
+fi
+if [ -n "$ARCHES_CU12" ]; then
+  ARCHES_FLAG="LLAMA_ARCHES_CU12=1"
+fi
 
-bin/micromamba run -r conda -p conda/envs/linux make -j$(nproc) LLAMA_VULKAN=1 LLAMA_CLBLAST=1 LLAMA_CUBLAS=1 LLAMA_PORTABLE=1 LLAMA_ADD_CONDA_PATHS=1 $LLAMA_NOAVX2_FLAG
+bin/micromamba run -r conda -p conda/envs/linux make -j$(nproc) LLAMA_VULKAN=1 LLAMA_CLBLAST=1 LLAMA_CUBLAS=1 LLAMA_PORTABLE=1 LLAMA_ADD_CONDA_PATHS=1 $LLAMA_NOAVX2_FLAG $ARCHES_FLAG
 if [ $? -ne 0 ]; then
   echo "Error: make failed."
   exit 1
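Since both new if blocks assign the same ARCHES_FLAG variable, setting both env vars at once would let the CU12 branch win (it runs last); the workflows above avoid this by setting exactly one key per build. To reproduce a CI variant locally, the same keys can be exported before running this script (a sketch; the script's file name is not shown in this capture, so koboldcpp.sh is an assumption):

    # Build the portable CUDA 11 variant with AVX2 disabled, as the second workflow does.
    NOAVX2=1 ARCHES_CU11=1 ./koboldcpp.sh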