Mirror of https://github.com/LostRuins/koboldcpp.git

commit b951310ca5 (parent a5b6f372a3)
tryout smaller binaries

6 changed files with 35 additions and 13 deletions
@@ -11,6 +11,7 @@ on:
 env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
   KCPP_CUDA: 12.1.0
+  ARCHES_CU12: 1
 
 jobs:
   linux:
@@ -11,6 +11,7 @@ on:
 env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
   NOAVX2: 1
+  ARCHES_CU11: 1
 
 jobs:
   linux:
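Both hunks above add one env key to a CI workflow (the workflow file names were not captured on this page). These keys are consumed by the Linux build script changed at the end of this commit, which maps them onto make flags. A minimal sketch of the propagation, using only names that appear in this diff:

    # The workflow's env: block sets the key, e.g. ARCHES_CU12=1.
    # The build script translates it into a make flag:
    if [ -n "$ARCHES_CU12" ]; then
      ARCHES_FLAG="LLAMA_ARCHES_CU12=1"
    fi
    # ...which the Makefile then uses to pick the -gencode target list:
    make LLAMA_CUBLAS=1 LLAMA_PORTABLE=1 $ARCHES_FLAG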
@@ -131,14 +131,14 @@ if (LLAMA_CUBLAS)
     # 70 == (assumed) compute capability at which unrolling a loop in mul_mat_q kernels is faster
     # 75 == int8 tensor cores
     if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
-        set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75") # needed for f16 CUDA intrinsics
+        set(CMAKE_CUDA_ARCHITECTURES "60-virtual;61-virtual;70-virtual;75-virtual") # needed for f16 CUDA intrinsics
     else()
         message("CUDA Toolkit Version: ${CUDAToolkit_VERSION}")
         if(CUDAToolkit_VERSION VERSION_GREATER 12)
             add_compile_definitions(GGML_CUDA_USE_GRAPHS) #try enable cuda graphs on cu12 build
-            set(CMAKE_CUDA_ARCHITECTURES "50;61;70;75") # lowest CUDA 12 standard + lowest for integer intrinsics
+            set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual") # lowest CUDA 12 standard + lowest for integer intrinsics
         else()
-            set(CMAKE_CUDA_ARCHITECTURES "35;50;61;70;75") # lowest CUDA 12 standard + lowest for integer intrinsics
+            set(CMAKE_CUDA_ARCHITECTURES "35-virtual;50-virtual;61-virtual;70-virtual;75-virtual") # lowest CUDA 12 standard + lowest for integer intrinsics
         endif()
     endif()
 endif()
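The -virtual suffix is what shrinks the binaries: a plain entry like "75" tells CMake to embed both PTX and compiled SASS for that architecture, while "75-virtual" embeds PTX only, which the driver JIT-compiles for the actual GPU at first load (at the cost of a one-time JIT delay). Roughly, the two spellings expand to these nvcc flags (a sketch; CMake 3.18+ semantics):

    # CMAKE_CUDA_ARCHITECTURES "75": PTX + SASS, larger fatbin
    nvcc -gencode arch=compute_75,code=[compute_75,sm_75] -c kernel.cu
    # CMAKE_CUDA_ARCHITECTURES "75-virtual": PTX only, smaller fatbin
    nvcc -gencode arch=compute_75,code=compute_75 -c kernel.cu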
Makefile
@@ -189,19 +189,32 @@ ifdef LLAMA_ADD_CONDA_PATHS
 	CUBLASLD_FLAGS += -Lconda/envs/linux/lib -Lconda/envs/linux/lib/stubs
 endif
 
 ifdef CUDA_DOCKER_ARCH
 	NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
 else
 ifdef LLAMA_PORTABLE
 ifdef LLAMA_COLAB #colab does not need all targets, all-major doesnt work correctly with pascal
 	NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=all-major
+else ifdef LLAMA_ARCHES_CU11
+	NVCCFLAGS += -Wno-deprecated-gpu-targets \
+	-gencode arch=compute_35,code=compute_35 \
+	-gencode arch=compute_50,code=compute_50 \
+	-gencode arch=compute_61,code=compute_61 \
+	-gencode arch=compute_70,code=compute_70 \
+	-gencode arch=compute_75,code=compute_75
+else ifdef LLAMA_ARCHES_CU12
+	NVCCFLAGS += -Wno-deprecated-gpu-targets \
+	-gencode arch=compute_50,code=compute_50 \
+	-gencode arch=compute_61,code=compute_61 \
+	-gencode arch=compute_70,code=compute_70 \
+	-gencode arch=compute_75,code=compute_75 \
+	-gencode arch=compute_80,code=compute_80
 else
 	NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=all
 endif #LLAMA_COLAB
 else
 	NVCCFLAGS += -arch=native
 endif #LLAMA_PORTABLE
 endif # CUDA_DOCKER_ARCH
 
 ifdef LLAMA_CUDA_F16
 	NVCCFLAGS += -DGGML_CUDA_F16
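On the Makefile path the same size trick is spelled out manually: each -gencode arch=compute_XX,code=compute_XX pair embeds PTX only for that architecture, instead of the full per-architecture SASS that -arch=all produces. One way to check what actually got embedded in the built artifact (a sketch; assumes the CUDA toolkit's cuobjdump is on PATH and uses koboldcpp's usual Linux output name):

    # PTX-only builds list entries here...
    cuobjdump --list-ptx koboldcpp_cublas.so
    # ...and little or nothing here (no per-arch SASS embedded)
    cuobjdump --list-elf koboldcpp_cublas.so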
@@ -59,7 +59,7 @@ KoboldCpp can now also be run on Novita AI, a newer alternative GPU cloud provider
 
 ## Docker
 - The official docker can be found at https://hub.docker.com/r/koboldai/koboldcpp
-- If you're building your own docker, remember to set CUDA_DOCKER_ARCH or enable LLAMA_PORTABLE
+- If you're building your own docker, remember to enable LLAMA_PORTABLE
 
 ## Obtaining a GGUF model
 - KoboldCpp uses GGUF models. They are not included with KoboldCpp, but you can download GGUF files from other places such as [Bartowski's Huggingface](https://huggingface.co/bartowski). Search for "GGUF" on huggingface.co for plenty of compatible models in the `.gguf` format.
@@ -21,11 +21,18 @@ KCPP_CUDA=$(<conda/envs/linux/cudaver)
 KCPP_CUDAAPPEND=-cuda${KCPP_CUDA//.}$KCPP_APPEND
 
 LLAMA_NOAVX2_FLAG=""
+ARCHES_FLAG=""
 if [ -n "$NOAVX2" ]; then
   LLAMA_NOAVX2_FLAG="LLAMA_NOAVX2=1"
 fi
+if [ -n "$ARCHES_CU11" ]; then
+  ARCHES_FLAG="LLAMA_ARCHES_CU11=1"
+fi
+if [ -n "$ARCHES_CU12" ]; then
+  ARCHES_FLAG="LLAMA_ARCHES_CU12=1"
+fi
 
-bin/micromamba run -r conda -p conda/envs/linux make -j$(nproc) LLAMA_VULKAN=1 LLAMA_CLBLAST=1 LLAMA_CUBLAS=1 LLAMA_PORTABLE=1 LLAMA_ADD_CONDA_PATHS=1 $LLAMA_NOAVX2_FLAG
+bin/micromamba run -r conda -p conda/envs/linux make -j$(nproc) LLAMA_VULKAN=1 LLAMA_CLBLAST=1 LLAMA_CUBLAS=1 LLAMA_PORTABLE=1 LLAMA_ADD_CONDA_PATHS=1 $LLAMA_NOAVX2_FLAG $ARCHES_FLAG
 if [ $? -ne 0 ]; then
   echo "Error: make failed."
   exit 1
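Since both new if blocks assign the same ARCHES_FLAG variable, setting both env vars at once would let the CU12 branch win (it runs last); the workflows above avoid this by setting exactly one key per build. To reproduce a CI variant locally, the same keys can be exported before running this script (a sketch; the script's file name is not shown in this capture, so koboldcpp.sh is an assumption):

    # Build the portable CUDA 11 variant with AVX2 disabled, as the second workflow does.
    NOAVX2=1 ARCHES_CU11=1 ./koboldcpp.sh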