diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml new file mode 100644 index 000000000..dfe8487bc --- /dev/null +++ b/.github/workflows/c-cpp.yml @@ -0,0 +1,17 @@ +name: C/C++ CI + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: make + run: make diff --git a/.gitignore b/.gitignore index 53ebc74a4..c2cc1d8a3 100644 --- a/.gitignore +++ b/.gitignore @@ -12,20 +12,7 @@ .vs/ .vscode/ -build/ -build-em/ -build-debug/ -build-release/ -build-ci-debug/ -build-ci-release/ -build-static/ -build-cublas/ -build-opencl/ -build-metal/ -build-mpi/ -build-no-accel/ -build-sanitize-addr/ -build-sanitize-thread/ +build*/ out/ tmp/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 466ffd04f..426067a6d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,13 +43,17 @@ if (NOT MSVC) endif() # 3rd party libs -option(LLAMA_CUBLAS "llama: use CUDA" ON) +option(LLAMA_CUBLAS "llama: use CUDA" OFF) set(LLAMA_CUDA_MMQ_Y "64" CACHE STRING "llama: y tile size for mmq CUDA kernels") set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels") set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels") option(LLAMA_CUDA_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF) set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K") +option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF) +option(LLAMA_CLBLAST "llama: use CLBlast" OFF) +option(LLAMA_METAL "llama: use Metal" OFF) +option(LLAMA_MPI "llama: use MPI" OFF) option(LLAMA_K_QUANTS "llama: use k-quants" ON) @@ -121,6 +125,77 @@ if (LLAMA_CUBLAS) endif() endif() +if (LLAMA_HIPBLAS) + list(APPEND CMAKE_PREFIX_PATH /opt/rocm) + + if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang") + message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang") + endif() + if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") + message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++") + endif() + + find_package(hip) + find_package(hipblas) + find_package(rocblas) + + if (${hipblas_FOUND} AND ${hip_FOUND}) + message(STATUS "HIP and hipBLAS found") + add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS) + add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h) + target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) + target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) + target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) + target_compile_definitions(ggml-rocm PRIVATE CC_TURING=1000000000) + set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) + target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas) + + if (LLAMA_STATIC) + message(FATAL_ERROR "Static linking not supported for HIP/ROCm") + endif() + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-rocm) + else() + message(WARNING "hipBLAS or HIP not found. 
Try setting CMAKE_PREFIX_PATH=/opt/rocm") + endif() +endif() + +if (LLAMA_HIPBLAS) + list(APPEND CMAKE_PREFIX_PATH /opt/rocm) + + if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang") + message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang") + endif() + if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") + message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++") + endif() + + find_package(hip) + find_package(hipblas) + find_package(rocblas) + + if (${hipblas_FOUND} AND ${hip_FOUND}) + message(STATUS "HIP and hipBLAS found") + add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS) + add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h) + if (LLAMA_CUDA_FORCE_DMMV) + target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV) + endif() + target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) + target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) + target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) + target_compile_definitions(ggml-rocm PRIVATE CC_TURING=1000000000) + set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) + target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas) + + if (LLAMA_STATIC) + message(FATAL_ERROR "Static linking not supported for HIP/ROCm") + endif() + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-rocm) + else() + message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm") + endif() +endif() + if (LLAMA_ALL_WARNINGS) if (NOT MSVC) set(c_flags diff --git a/Makefile b/Makefile index f7cf21d5c..d5ea6184d 100644 --- a/Makefile +++ b/Makefile @@ -20,8 +20,6 @@ ifneq ($(shell grep -e "Arch Linux" -e "ID_LIKE=arch" /etc/os-release 2>/dev/nul ARCH_ADD = -lcblas endif -CCV := $(shell $(CC) --version | head -n 1) -CXXV := $(shell $(CXX) --version | head -n 1) # Mac OS + Arm can report x86_64 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789 @@ -195,6 +193,45 @@ ggml_v2-cuda-legacy.o: otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-l $(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@ endif # LLAMA_CUBLAS +ifdef LLAMA_HIPBLAS + ROCM_PATH ?= /opt/rocm + CC := $(ROCM_PATH)/llvm/bin/clang + CXX := $(ROCM_PATH)/llvm/bin/clang++ + GPU_TARGETS ?= gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 + LLAMA_CUDA_DMMV_X ?= 128 + LLAMA_CUDA_MMV_Y ?= 2 + LLAMA_CUDA_KQUANTS_ITER ?= 1 + HIPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) +ifdef LLAMA_CUDA_FORCE_DMMV + HIPFLAGS += -DGGML_CUDA_FORCE_DMMV +endif # LLAMA_CUDA_FORCE_DMMV + HIPLDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 -lrocblas + HIP_OBJS += ggml-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o +ggml-cuda.o: HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \ + -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) \ + -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) \ + -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) \ + -DCC_TURING=1000000000 +ggml_v2-cuda.o: HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \ + -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) \ + -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) \ + -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) \ + -DCC_TURING=1000000000 +ggml_v2-cuda-legacy.o: HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \ + -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) \ + 
-DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) \ + -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) \ + -DCC_TURING=1000000000 # DGGML_CUDA_DMMV_F16 does not currently work with AMD. +ggml-cuda.o: ggml-cuda.cu ggml-cuda.h + $(CXX) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< +ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h + $(CXX) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< +ggml_v2-cuda-legacy.o: otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h + $(CXX) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< +endif # LLAMA_HIPBLAS + + + ifdef LLAMA_METAL CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG CXXFLAGS += -DGGML_USE_METAL @@ -224,12 +261,16 @@ ifneq ($(filter armv8%,$(UNAME_M)),) CFLAGS += -mfp16-format=ieee -mno-unaligned-access endif +CCV := $(shell $(CC) --version | head -n 1) +CXXV := $(shell $(CXX) --version | head -n 1) + DEFAULT_BUILD = FAILSAFE_BUILD = OPENBLAS_BUILD = NOAVX2_BUILD = CLBLAST_BUILD = CUBLAS_BUILD = +HIPBLAS_BUILD = ifeq ($(OS),Windows_NT) DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS) @@ -238,10 +279,12 @@ ifeq ($(OS),Windows_NT) NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS) CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ lib/OpenCL.lib lib/clblast.lib -shared -o $@.dll $(LDFLAGS) -ifdef LLAMA_CUBLAS - CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o $@.dll $(CUBLASLD_FLAGS) $(LDFLAGS) -endif - + ifdef LLAMA_CUBLAS + CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o $@.dll $(CUBLASLD_FLAGS) $(LDFLAGS) + endif + ifdef LLAMA_HIPBLAS + HIPBLAS_BUILD = $(CXX) $(CXXFLAGS) $(HIPFLAGS) $^ -shared -o $@.dll $(HIPLDFLAGS) $(LDFLAGS) + endif else DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS) FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS) @@ -250,24 +293,29 @@ else NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) endif ifdef LLAMA_CLBLAST - ifeq ($(UNAME_S),Darwin) - CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) - else - CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) - endif + ifeq ($(UNAME_S),Darwin) + CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) + else + CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) + endif endif -ifdef LLAMA_CUBLAS - CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o $@.so $(CUBLASLD_FLAGS) $(LDFLAGS) -endif + ifdef LLAMA_CUBLAS + CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o $@.so $(CUBLASLD_FLAGS) $(LDFLAGS) + endif + ifdef LLAMA_HIPBLAS + HIPBLAS_BUILD = $(CXX) $(CXXFLAGS) $(HIPFLAGS) $^ -shared -o $@.so $(HIPLDFLAGS) $(LDFLAGS) + endif ifndef LLAMA_OPENBLAS ifndef LLAMA_CLBLAST ifndef LLAMA_CUBLAS + ifndef LLAMA_HIPBLAS OPENBLAS_BUILD = @echo 'Your OS $(OS) does not appear to be Windows. For faster speeds, install and link a BLAS library. Set LLAMA_OPENBLAS=1 to compile with OpenBLAS support or LLAMA_CLBLAST=1 to compile with ClBlast support. This is just a reminder, not an error.' 
endif endif endif + endif endif @@ -302,7 +350,7 @@ ggml_noavx2.o: ggml.c ggml.h ggml_clblast.o: ggml.c ggml.h $(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ ggml_cublas.o: ggml.c ggml.h - $(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) -c $< -o $@ + $(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@ #quants K k_quants.o: k_quants.c k_quants.h ggml.h ggml-cuda.h @@ -328,7 +376,7 @@ ggml_v2_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h ggml_v2_clblast.o: otherarch/ggml_v2.c otherarch/ggml_v2.h $(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ ggml_v2_cublas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h - $(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) -c $< -o $@ + $(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@ #extreme old version compat ggml_v1.o: otherarch/ggml_v1.c otherarch/ggml_v1.h @@ -365,7 +413,7 @@ gpttype_adapter.o: $(GPTTYPE_ADAPTER) gpttype_adapter_clblast.o: $(GPTTYPE_ADAPTER) $(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ gpttype_adapter_cublas.o: $(GPTTYPE_ADAPTER) - $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) -c $< -o $@ + $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@ clean: rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state gguf gguf.exe main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp_default.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so @@ -390,8 +438,8 @@ koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o com $(NOAVX2_BUILD) koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants.o ggml-alloc.o $(OBJS) $(CLBLAST_BUILD) -koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o $(CUBLAS_OBJS) $(OBJS) - $(CUBLAS_BUILD) +koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o $(CUBLAS_OBJS) $(HIP_OBJS) $(OBJS) + $(CUBLAS_BUILD) $(HIPBLAS_BUILD) quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o k_quants.o ggml-alloc.o $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) diff --git a/README.md b/README.md index 4a1889bf9..f60595cf0 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,32 @@ -# koboldcpp +# koboldcpp-ROCM +This is mostly for Linux, but with the release of ROCm frameworks on Windows, it might eventually be possible to run this on Windows. -KoboldCpp is an easy-to-use AI text-generation software for GGML models. It's a single self contained distributable from Concedo, that builds off llama.cpp, and adds a versatile Kobold API endpoint, additional format support, backward compatibility, as well as a fancy UI with persistent stories, editing tools, save formats, memory, world info, author's note, characters, scenarios and everything Kobold and Kobold Lite have to offer. 
+To install, either use the file "[easy_KCPP-ROCm_install.sh](https://github.com/YellowRoseCx/koboldcpp-rocm/blob/main/easy_KCPP-ROCm_install.sh)" or navigate to the folder you want to download to in a terminal, then run:
+```
+git clone https://github.com/YellowRoseCx/koboldcpp-rocm.git -b main --depth 1 && \
+cd koboldcpp-rocm && \
+make LLAMA_HIPBLAS=1 -j4 && \
+./koboldcpp.py
+```
+When the KoboldCPP GUI appears, make sure to select "Use CuBLAS/hipBLAS" and set the number of GPU layers.
+
+Original [llama.cpp rocm port](https://github.com/ggerganov/llama.cpp/pull/1087) by SlyEcho, modified and ported to koboldcpp by YellowRoseCx.
+
+Comparison with OpenCL using a 6800 XT:
+| Model | Offloading Method | Time Taken - Processing 593 tokens| Time Taken - Generating 200 tokens| Total Time | Perf. Diff.
+|-----------------|----------------------------|--------------------|--------------------|------------|---|
+| Robin 7b q6_K |CLBLAST 6-t, All Layers on GPU | 6.8s (11ms/T) | 12.0s (60ms/T) | 18.7s (10.7T/s) | 1x
+| Robin 7b q6_K |ROCM 1-t, All Layers on GPU | 1.4s (2ms/T) | 5.5s (28ms/T) | 6.9s (29.1T/s)| **2.71x**
+| Robin 13b q5_K_M |CLBLAST 6-t, All Layers on GPU | 10.9s (18ms/T) | 16.7s (83ms/T) | 27.6s (7.3T/s) | 1x
+| Robin 13b q5_K_M |ROCM 1-t, All Layers on GPU | 2.4s (4ms/T) | 7.8s (39ms/T) | 10.2s (19.6T/s)| **2.63x**
+| Robin 33b q4_K_S |CLBLAST 6-t, 46/63 Layers on GPU | 23.2s (39ms/T) | 48.6s (243ms/T) | 71.9s (2.8T/s) | 1x
+| Robin 33b q4_K_S |CLBLAST 6-t, 50/63 Layers on GPU | 25.5s (43ms/T) | 44.6s (223ms/T) | 70.0s (2.9T/s) | 1x
+| Robin 33b q4_K_S |ROCM 6-t, 46/63 Layers on GPU | 14.6s (25ms/T) | 44.1s (221ms/T) | 58.7s (3.4T/s)| **1.19x**
+
+--------
+A self-contained distributable from Concedo that exposes llama.cpp function bindings, allowing it to be used via a simulated Kobold API endpoint.
+
+What does it mean? You get llama.cpp with a fancy UI, persistent stories, editing tools, save formats, memory, world info, author's note, characters, scenarios and everything Kobold and Kobold Lite have to offer, all in a tiny package around 20 MB in size, excluding model weights.
 
 ![Preview](media/preview.png)
diff --git a/easy_KCPP-ROCm_install.sh b/easy_KCPP-ROCm_install.sh
new file mode 100755
index 000000000..f41f91f98
--- /dev/null
+++ b/easy_KCPP-ROCm_install.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+function countdown {
+    local num=$1
+    while [ $num -gt 0 ]; do
+        printf "\rAbout to build KoboldCPP-ROCm in %s seconds..." $num
+        sleep 1
+        num=$((num - 1))
+    done
+    printf "\nBuilding KoboldCPP... \n"
+}
+
+if [ "$(basename "$PWD")" = "koboldcpp-rocm" ]; then
+    echo "Already inside 'koboldcpp-rocm' directory."
+else
+    git clone https://github.com/YellowRoseCx/koboldcpp-rocm.git -b main --depth 1 && \
+    cd "koboldcpp-rocm" || exit 1
+fi
+
+echo "Build will start shortly."
+countdown 5 + +make clean && \ +make LLAMA_HIPBLAS=1 LLAMA_OPENBLAS=1 LLAMA_CLBLAST=1 -j4 && \ +./koboldcpp.py diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 5ac5786f1..8a10438ee 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -5,16 +5,115 @@ #include #include #include - +#if defined(GGML_USE_HIPBLAS) +#include +#include +#include +#ifdef __HIP_PLATFORM_AMD__ +// for rocblas_initialize() +#include "rocblas/rocblas.h" +#endif +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_TF32_TENSOR_OP_MATH 0 +#define CUDA_R_16F HIPBLAS_R_16F +#define CUDA_R_32F HIPBLAS_R_32F +#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) +#define cublasCreate hipblasCreate +#define cublasGemmEx hipblasGemmEx +#define cublasHandle_t hipblasHandle_t +#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS +#define cublasSetStream hipblasSetStream +#define cublasSgemm hipblasSgemm +#define cublasStatus_t hipblasStatus_t +#define cudaDeviceProp hipDeviceProp_t +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaError_t hipError_t +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventRecord hipEventRecord +#define cudaEvent_t hipEvent_t +#define cudaEventDestroy hipEventDestroy +#define cudaFree hipFree +#define cudaFreeHost hipHostFree +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#define cudaMalloc hipMalloc +#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) +#define cudaMemcpy hipMemcpy +#define cudaMemcpy2DAsync hipMemcpy2DAsync +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyKind hipMemcpyKind +#define cudaMemset hipMemset +#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize +#define cudaSetDevice hipSetDevice +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags +#define cudaStreamNonBlocking hipStreamNonBlocking +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0) +#define cudaStream_t hipStream_t +#define cudaSuccess hipSuccess +#else #include #include #include - +#endif #include "ggml-cuda.h" #include "ggml.h" #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products +#ifndef CC_TURING #define CC_TURING 700 +#endif + +#if defined(GGML_USE_HIPBLAS) +#define __CUDA_ARCH__ 1300 + +typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); +static __device__ __forceinline__ int __vsubss4(const int a, const int b) { + const int8x4_t va = reinterpret_cast(a); + const int8x4_t vb = reinterpret_cast(b); + const int8x4_t c = __builtin_elementwise_sub_sat(va, vb); + return reinterpret_cast(c); +} + +static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { +#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) + c = __builtin_amdgcn_sdot4(a, b, c, false); +#elif 
defined(__gfx1100__) + c = __builtin_amdgcn_sudot4( true, a, true, b, c, false); +#elif defined(__gfx1010__) || defined(__gfx900__) + int tmp1; + int tmp2; + asm("\n \ + v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \ + v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \ + v_add3_u32 %0, %1, %2, %0 \n \ + v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \ + v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \ + v_add3_u32 %0, %1, %2, %0 \n \ + " + : "+v"(c), "=&v"(tmp1), "=&v"(tmp2) + : "v"(a), "v"(b) + ); +#else + const int8x4_t va = reinterpret_cast(a); + const int8x4_t vb = reinterpret_cast(b); + c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3]; +#endif + return c; +} +#endif #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -424,8 +523,8 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){ const block_q4_1 * x = (const block_q4_1 *) vx; - const dfloat d = x[ib].dm.x; - const dfloat m = x[ib].dm.y; + const dfloat d = __low2half(x[ib].dm); + const dfloat m = __high2half(x[ib].dm); const int vui = x[ib].qs[iqs]; @@ -467,8 +566,8 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){ const block_q5_1 * x = (const block_q5_1 *) vx; - const dfloat d = x[ib].dm.x; - const dfloat m = x[ib].dm.y; + const dfloat d = __low2half(x[ib].dm); + const dfloat m = __high2half(x[ib].dm); uint32_t qh; memcpy(&qh, x[ib].qh, sizeof(qh)); @@ -520,8 +619,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float const uint8_t q = x[i].qs[32*n + l]; float * y = yy + i*QK_K + 128*n; - float dall = x[i].dm.x; - float dmin = x[i].dm.y; + float dall = __low2half(x[i].dm); + float dmin = __high2half(x[i].dm); y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4); y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4); @@ -531,8 +630,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float const int il = tid%16; // 0...15 const uint8_t q = x[i].qs[il] >> (2*is); float * y = yy + i*QK_K + 16*is + il; - float dall = x[i].dm.x; - float dmin = x[i].dm.y; + float dall = __low2half(x[i].dm); + float dmin = __high2half(x[i].dm); y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4); #endif @@ -618,8 +717,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float float * y = yy + i*QK_K + 64*il + n*ir; - const float dall = x[i].dm.x; - const float dmin = x[i].dm.y; + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); const uint8_t * q = x[i].qs + 32*il + n*ir; @@ -657,8 +756,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float float * y = yy + i*QK_K + 64*il + 2*ir; - const float dall = x[i].dm.x; - const float 
dmin = x[i].dm.y; + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); const uint8_t * ql = x[i].qs + 32*il + 2*ir; const uint8_t * qh = x[i].qh + 2*ir; @@ -770,8 +869,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * y = yy + i * QK_K + y_offset; const uint8_t * q = x[i].qs + q_offset; - const float dall = x[i].dm.x; - const float dmin = x[i].dm.y; + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset); aux[0] = a[0] & 0x0f0f0f0f; @@ -991,8 +1090,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * y1 = yy + i*QK_K + y_offset; const float * y2 = y1 + 128; - const float dall = x[i].dm.x; - const float dmin = x[i].dm.y; + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); const uint16_t * a = (const uint16_t *)x[i].scales; aux[0] = a[im+0] & kmask1; @@ -1124,8 +1223,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * y1 = yy + i*QK_K + y_offset; const float * y2 = y1 + 128; - const float dall = x[i].dm.x; - const float dmin = x[i].dm.y; + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); const uint16_t * a = (const uint16_t *)x[i].scales; aux[0] = a[im+0] & kmask1; @@ -1348,8 +1447,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest return; } - y[ib].ds.x = d; - y[ib].ds.y = sum; + reinterpret_cast(y[ib].ds.x) = d; + reinterpret_cast(y[ib].ds.y) = sum; } template @@ -2346,7 +2445,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1( u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); } - return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, bq8_1->ds.x); + return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, __low2half(bq8_1->ds)); } template static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { @@ -2432,7 +2531,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1( #pragma unroll for (int i = 0; i < QR2_K; ++ i) { u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); - d8[i] = bq8_1[bq8_offset + i].ds.x; + d8[i] = __low2half(bq8_1[bq8_offset + i].ds); } return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8); @@ -2551,7 +2650,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1( #pragma unroll for (int i = 0; i < QR3_K; ++i) { u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); - d8[i] = bq8_1[bq8_offset + i].ds.x; + d8[i] = __low2half(bq8_1[bq8_offset + i].ds); } return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8); @@ -2720,7 +2819,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( for (int i = 0; i < QR4_K; ++i) { const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; - d8[i] = bq8i->ds.x; + d8[i] = __low2half(bq8i->ds); const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); u[2*i+0] = q8[0]; @@ -2747,8 +2846,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( const float dall = bq4_K->d[0]; const float dmin = bq4_K->d[1]; - const float d8_1 = bq8_1[0].ds.x; - const float d8_2 = bq8_1[1].ds.x; + const float d8_1 = __low2float(bq8_1[0].ds); + const float d8_2 = __low2float(bq8_1[1].ds); const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); @@ -2901,7 +3000,7 @@ static __device__ __forceinline__ 
float vec_dot_q5_K_q8_1( #pragma unroll for (int i = 0; i < QR5_K; ++i) { const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; - d8[i] = bq8i->ds.x; + d8[i] = __low2float(bq8i->ds); const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); u[2*i+0] = q8[0]; @@ -2919,8 +3018,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( const float d = bq5_K->d; - const float d8_1 = bq8_1[0].ds.x; - const float d8_2 = bq8_1[1].ds.x; + const float d8_1 = __low2half(bq8_1[0].ds); + const float d8_2 = __low2half(bq8_1[1].ds); const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); @@ -3075,7 +3174,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1( #pragma unroll for (int i = 0; i < QR6_K; ++i) { u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1); - d8[i] = bq8_1[bq8_offset + 2*i].ds.x; + d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds); } return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8); @@ -3243,7 +3342,7 @@ static __device__ __forceinline__ void mul_mat_q( *dsi_dst = *dsi_src; } else { float * dfi_dst = (float *) dsi_dst; - *dfi_dst = (*dsi_src).x; + *dfi_dst = __low2half(*dsi_src); } } @@ -4933,10 +5032,18 @@ void ggml_init_cublas() { static bool initialized = false; if (!initialized) { + +#ifdef __HIP_PLATFORM_AMD__ + // Workaround for a rocBLAS bug when using multiple graphics cards: + // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346 + rocblas_initialize(); + CUDA_CHECK(cudaDeviceSynchronize()); +#endif + CUDA_CHECK(cudaGetDeviceCount(&g_device_count)); GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES); int64_t total_vram = 0; - fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count); + fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count); for (int id = 0; id < g_device_count; ++id) { cudaDeviceProp prop; CUDA_CHECK(cudaGetDeviceProperties(&prop, id)); diff --git a/ggml-cuda.h b/ggml-cuda.h index f66bb1678..a72e82069 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -2,6 +2,14 @@ #include "ggml.h" +#ifdef GGML_USE_HIPBLAS +#define GGML_CUDA_NAME "ROCm" +#define GGML_CUBLAS_NAME "hipBLAS" +#else +#define GGML_CUDA_NAME "CUDA" +#define GGML_CUBLAS_NAME "cuBLAS" +#endif + #ifdef __cplusplus extern "C" { #endif diff --git a/koboldcpp.py b/koboldcpp.py index 43a0590a2..04c5580d9 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -9,6 +9,7 @@ import ctypes import os import argparse import json, sys, http.server, time, asyncio, socket, threading +import re from concurrent.futures import ThreadPoolExecutor stop_token_max = 10 @@ -217,11 +218,11 @@ def load_model(model_filename): inputs.clblast_info = clblastids inputs.cublas_info = 0 if (args.usecublas and "0" in args.usecublas): - os.environ["CUDA_VISIBLE_DEVICES"] = "0" + inputs.cublas_info = 0 elif (args.usecublas and "1" in args.usecublas): - os.environ["CUDA_VISIBLE_DEVICES"] = "1" + inputs.cublas_info = 1 elif (args.usecublas and "2" in args.usecublas): - os.environ["CUDA_VISIBLE_DEVICES"] = "2" + inputs.cublas_info = 2 for n in range(tensor_split_max): if args.tensor_split and n < len(args.tensor_split): @@ -727,10 +728,11 @@ def show_new_gui(): tabcontentframe.grid_propagate(False) tabcontent = {} + lib_option_pairs = [ (lib_openblas, "Use OpenBLAS"), (lib_clblast, "Use CLBlast"), - (lib_cublas, "Use CuBLAS"), + (lib_cublas, "Use CuBLAS/hipBLAS"), (lib_default, "Use No BLAS"), (lib_noavx2, "NoAVX2 Mode (Old CPU)"), (lib_failsafe, "Failsafe Mode (Old CPU)")] @@ -807,6 
+809,52 @@ def show_new_gui(): button = ctk.CTkButton(parent, 50, text="Browse", command= lambda a=var,b=searchtext:getfilename(a,b)) button.grid(row=row+1, column=1, stick="nw") return + + from subprocess import run, CalledProcessError + def get_device_names(): + CUdevices = [] + CLdevices = [] + try: # Get OpenCL GPU names + output = run(['clinfo'], capture_output=True, text=True, check=True, encoding='utf-8').stdout + CLdevices = [line.split(":", 1)[1].strip() for line in output.splitlines() if line.strip().startswith("Board name:")] + except Exception as e: + pass + try: # Get AMD ROCm GPU names + output = run(['rocminfo'], capture_output=True, text=True, check=True, encoding='utf-8').stdout + device_name = None + for line in output.splitlines(): + line = line.strip() + if line.startswith("Marketing Name:"): device_name = line.split(":", 1)[1].strip() + elif line.startswith("Device Type:") and "GPU" in line and device_name is not None: CUdevices.append(device_name) + elif line.startswith("Device Type:") and "GPU" not in line: device_name = None + except Exception as e: + pass + # try: # Get NVIDIA GPU names , Couldn't test so probably not working yet. + # output = run(['nvidia-smi', '-L'], capture_output=True, text=True, check=True, encoding='utf-8').stdout + # CUdevices = [line.split(":", 1)[1].strip() for line in output.splitlines() if line.startswith("GPU:")] + # except FileNotFoundError: pass + CUdevices.append('All') if CUdevices else CUdevices.extend(['1', '2', '3', 'All']) + if not CLdevices: CLdevices.extend(['1', '2', '3']) + return CUdevices, CLdevices + + def show_tooltip(event, tooltip_text=None): + if hasattr(show_tooltip, "_tooltip"): + tooltip = show_tooltip._tooltip + else: + tooltip = ctk.CTkToplevel(root) + tooltip.configure(fg_color="#ffffe0") + tooltip.withdraw() + tooltip.overrideredirect(True) + tooltip_label = ctk.CTkLabel(tooltip, text=tooltip_text, text_color="#000000", fg_color="#ffffe0") + tooltip_label.pack(expand=True, padx=2, pady=1) + show_tooltip._tooltip = tooltip + x, y = root.winfo_pointerxy() + tooltip.wm_geometry(f"+{x + 10}+{y + 10}") + tooltip.deiconify() + def hide_tooltip(event): + if hasattr(show_tooltip, "_tooltip"): + tooltip = show_tooltip._tooltip + tooltip.withdraw() def show_tooltip(event, tooltip_text=None): if hasattr(show_tooltip, "_tooltip"): @@ -835,10 +883,11 @@ def show_new_gui(): num_backends_built.bind("", hide_tooltip) # Vars - should be in scope to be used by multiple widgets + CUdevices, CLdevices = get_device_names() gpulayers_var = ctk.StringVar(value="0") threads_var = ctk.StringVar(value=str(default_threads)) runopts_var = ctk.StringVar() - gpu_choice_var = ctk.StringVar(value="1") + gpu_choice_var = ctk.StringVar(value=CLdevices[0] if not None else CUdevices[0] if not None else "1") launchbrowser = ctk.IntVar(value=1) highpriority = ctk.IntVar() @@ -885,25 +934,23 @@ def show_new_gui(): quick_tab = tabcontent["Quick Launch"] # gpu options - quick_gpu_layers_entry,quick_gpu_layers_label = makelabelentry(quick_tab,"GPU Layers:", gpulayers_var, 5, 50) - quick_gpu_selector_label = makelabel(quick_tab, "GPU ID:", 3) - quick_gpu_selector_box = ctk.CTkComboBox(quick_tab, values=["1","2","3"], width=60, variable=gpu_choice_var, state="readonly") - CUDA_quick_gpu_selector_box = ctk.CTkComboBox(quick_tab, values=["1","2","3","All"], width=60, variable=gpu_choice_var, state="readonly") - quick_lowvram_box = makecheckbox(quick_tab, "Low VRAM", lowvram_var, 4,0) - quick_mmq_box = makecheckbox(quick_tab, "Use QuantMatMul (mmq)", 
mmq_var, 4,1) + quick_gpu_layers_entry, quick_gpu_layers_label = makelabelentry(quick_tab, "GPU Layers:", gpulayers_var, 5, 50) + quick_gpu_selector_label = makelabel(quick_tab, "GPU ID:", 3) + quick_gpu_selector_box = ctk.CTkComboBox(quick_tab, values=CLdevices, width=180, variable=gpu_choice_var, state="readonly") + CUDA_quick_gpu_selector_box = ctk.CTkComboBox(quick_tab, values=CUdevices, width=180, variable=gpu_choice_var, state="readonly") + quick_lowvram_box = makecheckbox(quick_tab, "Low VRAM", lowvram_var, 4,0) + quick_mmq_box = makecheckbox(quick_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1) def changerunmode(a,b,c): index = runopts_var.get() - if index == "Use CLBlast" or index == "Use CuBLAS": + if index == "Use CLBlast" or index == "Use CuBLAS/hipBLAS": gpu_selector_label.grid(row=3, column=0, padx = 8, pady=1, stick="nw") quick_gpu_selector_label.grid(row=3, column=0, padx = 8, pady=1, stick="nw") if index == "Use CLBlast": gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw") quick_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw") - if gpu_choice_var.get()=="All": - gpu_choice_var.set("1") - elif index == "Use CuBLAS": + elif index == "Use CuBLAS/hipBLAS": CUDA_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw") CUDA_quick_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw") else: @@ -914,7 +961,7 @@ def show_new_gui(): quick_gpu_selector_box.grid_forget() CUDA_quick_gpu_selector_box.grid_forget() - if index == "Use CuBLAS": + if index == "Use CuBLAS/hipBLAS": lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw") quick_lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw") mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw") @@ -925,7 +972,7 @@ def show_new_gui(): mmq_box.grid_forget() quick_mmq_box.grid_forget() - if index == "Use CLBlast" or index == "Use CuBLAS": + if index == "Use CLBlast" or index == "Use CuBLAS/hipBLAS": gpu_layers_label.grid(row=5, column=0, padx = 8, pady=1, stick="nw") gpu_layers_entry.grid(row=5, column=1, padx=8, pady=1, stick="nw") quick_gpu_layers_label.grid(row=5, column=0, padx = 8, pady=1, stick="nw") @@ -968,9 +1015,9 @@ def show_new_gui(): # gpu options gpu_layers_entry,gpu_layers_label = makelabelentry(hardware_tab,"GPU Layers:", gpulayers_var, 5, 50) gpu_selector_label = makelabel(hardware_tab, "GPU ID:", 3) - gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3"], width=60, variable=gpu_choice_var, state="readonly") - CUDA_gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3", "All"], width=60, variable=gpu_choice_var, state="readonly") - lowvram_box = makecheckbox(hardware_tab, "Low VRAM", lowvram_var, 4,0) + gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=CLdevices, width=180, variable=gpu_choice_var, state="readonly") + CUDA_gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=CUdevices, width=180, variable=gpu_choice_var, state="readonly") + lowvram_box = makecheckbox(hardware_tab, "Low VRAM", lowvram_var, 4,0) mmq_box = makecheckbox(hardware_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1) # presets selector @@ -1105,13 +1152,17 @@ def show_new_gui(): args.stream = stream.get()==1 args.smartcontext = smartcontext.get()==1 args.unbantokens = unbantokens.get()==1 - gpuchoiceidx = 0 if gpu_choice_var.get()!="All": - gpuchoiceidx = int(gpu_choice_var.get())-1 + if runopts_var.get() == "Use CLBlast": #if CLBlast selected + if (gpu_choice_var.get()) in CLdevices: + gpuchoiceidx = CLdevices.index((gpu_choice_var.get())) + elif 
runopts_var.get() == "Use CuBLAS/hipBLAS": + if (gpu_choice_var.get()) in CUdevices: + gpuchoiceidx = CUdevices.index((gpu_choice_var.get())) if runopts_var.get() == "Use CLBlast": args.useclblast = [[0,0], [1,0], [0,1]][gpuchoiceidx] - if runopts_var.get() == "Use CuBLAS": + if runopts_var.get() == "Use CuBLAS/hipBLAS": if gpu_choice_var.get()=="All": args.usecublas = ["lowvram"] if lowvram_var.get() == 1 else ["normal"] else: @@ -1170,13 +1221,15 @@ def show_new_gui(): elif "usecublas" in dict and dict["usecublas"]: if cublas_option is not None: runopts_var.set(cublas_option) - lowvram_var.set(1 if "lowvram" in dict["usecublas"] else 0) - mmq_var.set(1 if "mmq" in dict["usecublas"] else 0) - gpu_choice_var.set("All") - for g in range(3): - if str(g) in dict["usecublas"]: - gpu_choice_var.set(str(g+1)) - break + if len(dict["usecublas"])==1: + lowvram_var.set(1 if dict["usecublas"][0]=="lowvram" else 0) + else: + lowvram_var.set(1 if "lowvram" in dict["usecublas"] else 0) + gpu_choice_var.set("1") + for g in range(3): + if str(g) in dict["usecublas"]: + gpu_choice_var.set(str(g+1)) + break elif "noavx2" in dict and "noblas" in dict and dict["noblas"] and dict["noavx2"]: if failsafe_option is not None: runopts_var.set(failsafe_option) @@ -1240,6 +1293,7 @@ def show_new_gui(): horde_workername_var.set(dict["hordeconfig"][4]) usehorde_var.set("1") + def save_config(): file_type = [("KoboldCpp Settings", "*.kcpps")] filename = asksaveasfile(filetypes=file_type, defaultextension=file_type) @@ -1337,7 +1391,7 @@ def show_old_gui(): blaschoice = tk.StringVar() blaschoice.set("BLAS = 512") - runopts = ["Use OpenBLAS","Use CLBLast GPU #1","Use CLBLast GPU #2","Use CLBLast GPU #3","Use CuBLAS GPU","Use No BLAS","NoAVX2 Mode (Old CPU)","Failsafe Mode (Old CPU)"] + runopts = ["Use OpenBLAS","Use CLBLast GPU #1","Use CLBLast GPU #2","Use CLBLast GPU #3","Use CuBLAS/hipBLAS GPU","Use No BLAS","NoAVX2 Mode (Old CPU)","Failsafe Mode (Old CPU)"] runchoice = tk.StringVar() runchoice.set("Use OpenBLAS") @@ -1402,7 +1456,7 @@ def show_old_gui(): #load all the vars args.threads = int(threads_var.get()) args.gpulayers = int(gpu_layers_var.get()) - + args.stream = (stream.get()==1) args.smartcontext = (smartcontext.get()==1) args.launch = (launchbrowser.get()==1) @@ -1779,7 +1833,7 @@ if __name__ == '__main__': compatgroup = parser.add_mutually_exclusive_group() compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true') compatgroup.add_argument("--useclblast", help="Use CLBlast for GPU Acceleration. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2) - compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq]'), choices=['normal', 'lowvram', '0', '1', '2', 'mmq']) + compatgroup.add_argument("--usecublas", help="Use CuBLAS/hipBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq]'), choices=['normal', 'lowvram', '0', '1', '2', 'mmq']) parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. 
Requires GPU.",metavar=('[GPU layers]'), type=int, default=0) parser.add_argument("--tensor_split", help="For CUDA with ALL GPU set only, ratio to split tensors across multiple GPUs, space-separated list of proportions, e.g. 7 3", metavar=('[Ratios]'), type=float, nargs='+') diff --git a/llama.cpp b/llama.cpp index 9aa1af7a7..15b20d4a9 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2004,7 +2004,11 @@ static void llm_load_tensors( #ifdef GGML_USE_CUBLAS const int max_backend_supported_layers = hparams.n_layer + 3; +#if defined(GGML_USE_HIPBLAS) + const int max_offloadable_layers = low_vram ? hparams.n_layer + 3 : hparams.n_layer + 3; +#else const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3; +#endif if (n_gpu_layers > (int) hparams.n_layer + 1) { if (low_vram) { LLAMA_LOG_INFO("%s: cannot offload v cache to GPU due to low VRAM option\n", __func__); diff --git a/otherarch/ggml_v2-cuda-legacy.cu b/otherarch/ggml_v2-cuda-legacy.cu index d3220a786..e7053b764 100644 --- a/otherarch/ggml_v2-cuda-legacy.cu +++ b/otherarch/ggml_v2-cuda-legacy.cu @@ -4,9 +4,64 @@ #include #include +#if defined(GGML_USE_HIPBLAS) +#include +#include +#include +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_TF32_TENSOR_OP_MATH 0 +#define CUDA_R_16F HIPBLAS_R_16F +#define CUDA_R_32F HIPBLAS_R_32F +#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) +#define cublasCreate hipblasCreate +#define cublasGemmEx hipblasGemmEx +#define cublasHandle_t hipblasHandle_t +#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS +#define cublasSetStream hipblasSetStream +#define cublasSgemm hipblasSgemm +#define cublasStatus_t hipblasStatus_t +#define cudaDeviceProp hipDeviceProp_t +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaError_t hipError_t +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventRecord hipEventRecord +#define cudaEvent_t hipEvent_t +#define cudaFree hipFree +#define cudaFreeHost hipHostFree +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#define cudaMalloc hipMalloc +#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) +#define cudaMemcpy hipMemcpy +#define cudaMemcpy2DAsync hipMemcpy2DAsync +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyKind hipMemcpyKind +#define cudaMemset hipMemset +#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize +#define cudaSetDevice hipSetDevice +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags +#define cudaStreamNonBlocking hipStreamNonBlocking +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamWaitEvent hipStreamWaitEvent +#define cudaStream_t hipStream_t +#define cudaSuccess hipSuccess +#else #include #include #include +#endif #include "ggml_v2-cuda-legacy.h" #include "ggml_v2-cuda.h" diff --git a/otherarch/ggml_v2-cuda.cu b/otherarch/ggml_v2-cuda.cu index 
8314adb25..b4502df00 100644 --- a/otherarch/ggml_v2-cuda.cu +++ b/otherarch/ggml_v2-cuda.cu @@ -4,10 +4,66 @@ #include #include +#if defined(GGML_USE_HIPBLAS) +#include +#include +#include +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_TF32_TENSOR_OP_MATH 0 +#define CUDA_R_16F HIPBLAS_R_16F +#define CUDA_R_32F HIPBLAS_R_32F +#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) +#define cublasCreate hipblasCreate +#define cublasGemmEx hipblasGemmEx +#define cublasHandle_t hipblasHandle_t +#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS +#define cublasSetStream hipblasSetStream +#define cublasSgemm hipblasSgemm +#define cublasStatus_t hipblasStatus_t +#define cudaDeviceProp hipDeviceProp_t +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaError_t hipError_t +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventRecord hipEventRecord +#define cudaEvent_t hipEvent_t +#define cudaFree hipFree +#define cudaFreeHost hipHostFree +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#define cudaMalloc hipMalloc +#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) +#define cudaMemcpy hipMemcpy +#define cudaMemcpy2DAsync hipMemcpy2DAsync +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyKind hipMemcpyKind +#define cudaMemset hipMemset +#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize +#define cudaSetDevice hipSetDevice +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags +#define cudaStreamNonBlocking hipStreamNonBlocking +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamWaitEvent hipStreamWaitEvent +#define cudaStream_t hipStream_t +#define cudaSuccess hipSuccess +#else #include #include #include +#endif + #include "ggml_v2-cuda.h" #include "ggml_v2.h" @@ -807,4 +863,4 @@ void ggml_v2_cuda_transform_tensor(ggml_v2_tensor * tensor) { tensor->data = d_Q; tensor->backend = GGML_V2_BACKEND_CUDA; -} \ No newline at end of file +} diff --git a/otherarch/ggml_v2.c b/otherarch/ggml_v2.c index 6b18fe723..74a9c4e21 100644 --- a/otherarch/ggml_v2.c +++ b/otherarch/ggml_v2.c @@ -139,7 +139,8 @@ inline static void* ggml_v2_aligned_malloc(size_t size) { #include #elif defined(GGML_USE_OPENBLAS) #include -#elif defined(GGML_USE_CUBLAS) +#endif +#if defined(GGML_USE_CUBLAS) #include "ggml_v2-cuda.h" #include "ggml_v2-cuda-legacy.h" #endif @@ -148,6 +149,8 @@ inline static void* ggml_v2_aligned_malloc(size_t size) { #include "ggml_v2-opencl-legacy.h" #endif + + #undef MIN #undef MAX #define MIN(a, b) ((a) < (b) ? 
(a) : (b)) diff --git a/otherarch/gptj_v3.cpp b/otherarch/gptj_v3.cpp index 8f7cc47f1..42512e190 100644 --- a/otherarch/gptj_v3.cpp +++ b/otherarch/gptj_v3.cpp @@ -348,7 +348,11 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g const auto & hparams = model.hparams; size_t vram_total = 0; const int n_gpu = std::min(gpulayers, int(hparams.n_layer)); - fprintf(stderr, "%s: [GPU] offloading %d layers to GPU\n", __func__, n_gpu); + #if defined(GGML_USE_CLBLAST) + fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu); + #else + fprintf(stderr, "%s: [CUDA] offloading %d layers to GPU\n", __func__, n_gpu); + #endif for (int i = 0; i < n_gpu; ++i) { const auto & layer = model.layers[i]; layer.c_attn_q_proj_w->backend = GGML_BACKEND_GPU; @@ -373,7 +377,11 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g ggml_cuda_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w); #endif } - fprintf(stderr, "%s: [GPU] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); + #if defined(GGML_USE_CLBLAST) + fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); + #else + fprintf(stderr, "%s: [CUDA] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); + #endif } #endif @@ -644,4 +652,4 @@ bool gptj_eval( ggml_free(ctx0); return true; -} \ No newline at end of file +} diff --git a/otherarch/llama_v2.cpp b/otherarch/llama_v2.cpp index ab9d82f93..01b47697c 100644 --- a/otherarch/llama_v2.cpp +++ b/otherarch/llama_v2.cpp @@ -3101,4 +3101,4 @@ std::vector llama_v2_tokenize(struct llama_v2_context * ctx, const res.resize(n); return res; -} \ No newline at end of file +} diff --git a/otherarch/mpt_v3.cpp b/otherarch/mpt_v3.cpp index 2bf23055c..57ed90888 100644 --- a/otherarch/mpt_v3.cpp +++ b/otherarch/mpt_v3.cpp @@ -301,7 +301,11 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo const auto & hparams = model.hparams; size_t vram_total = 0; const int n_gpu = std::min(gpulayers, int(hparams.n_layers)); - fprintf(stderr, "%s: [GPU] offloading %d layers to GPU\n", __func__, n_gpu); + #if defined(GGML_USE_CLBLAST) + fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu); + #else + fprintf(stderr, "%s: [CUDA] offloading %d layers to GPU\n", __func__, n_gpu); + #endif for (int i = 0; i < n_gpu; ++i) { const auto & layer = model.layers[i]; layer.ffn_up_proj->backend = GGML_BACKEND_GPU; @@ -320,7 +324,11 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo ggml_cuda_transform_tensor(layer.c_attn_out_proj_weight->data,layer.c_attn_out_proj_weight); vram_total += ggml_nbytes(layer.c_attn_out_proj_weight); #endif } - fprintf(stderr, "%s: [GPU] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); + #if defined(GGML_USE_CLBLAST) + fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); + #else + fprintf(stderr, "%s: [CUDA] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); + #endif } #endif diff --git a/otherarch/neox_v3.cpp b/otherarch/neox_v3.cpp index 7802cab86..d9fb93b28 100644 --- a/otherarch/neox_v3.cpp +++ b/otherarch/neox_v3.cpp @@ -335,7 +335,11 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & const auto & hparams = model.hparams; size_t vram_total = 0; const int n_gpu = std::min(gpulayers, int(hparams.n_layer)); - fprintf(stderr, "%s: 
[GPU] offloading %d layers to GPU\n", __func__, n_gpu); + #if defined(GGML_USE_CLBLAST) + fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu); + #else + fprintf(stderr, "%s: [CUDA] offloading %d layers to GPU\n", __func__, n_gpu); + #endif for (int i = 0; i < n_gpu; ++i) { const auto & layer = model.layers[i]; layer.c_attn_attn_w->backend = GGML_BACKEND_GPU; @@ -354,7 +358,11 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & ggml_cuda_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w); #endif } - fprintf(stderr, "%s: [GPU] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); + #if defined(GGML_USE_CLBLAST) + fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); + #else + fprintf(stderr, "%s: [CUDA] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); + #endif } #endif @@ -663,4 +671,4 @@ bool gpt_neox_eval( ggml_free(ctx0); return true; -} \ No newline at end of file +}
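
Closing note on the technique used throughout this patch: CUDA runtime and cuBLAS identifiers are aliased to their HIP/hipBLAS equivalents at the preprocessor level, so the existing `.cu` sources compile unchanged with ROCm's clang (`-x hip`). Below is a minimal, self-contained sketch of that idea; it is not part of the patch, the `USE_HIP` guard and the demo kernel are illustrative only, and it assumes a working CUDA or ROCm toolchain (`nvcc demo.cu` vs. `hipcc -DUSE_HIP demo.cu`).

```cpp
// demo.cu -- minimal sketch of the CUDA-to-HIP aliasing approach (illustrative only).
#include <cstdio>
#include <vector>

#ifdef USE_HIP
#include <hip/hip_runtime.h>
// Alias just enough of the CUDA runtime API for the code below to compile as HIP.
#define cudaMalloc             hipMalloc
#define cudaMemcpy             hipMemcpy
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaDeviceSynchronize  hipDeviceSynchronize
#define cudaFree               hipFree
#else
#include <cuda_runtime.h>
#endif

// Trivial device kernel; identical source for both backends.
__global__ void scale(float * x, float s, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        x[i] *= s;
    }
}

int main() {
    const int n = 1024;
    std::vector<float> h(n, 1.0f);

    float * d = nullptr;                     // error checking omitted for brevity
    cudaMalloc((void **) &d, n * sizeof(float));
    cudaMemcpy(d, h.data(), n * sizeof(float), cudaMemcpyHostToDevice);

    // Triple-chevron launch syntax is accepted by both nvcc and HIP's clang.
    scale<<<(n + 255) / 256, 256>>>(d, 2.0f, n);
    cudaDeviceSynchronize();

    cudaMemcpy(h.data(), d, n * sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(d);

    printf("h[0] = %.1f (expected 2.0)\n", h[0]);
    return 0;
}
```

Because the renaming happens at the preprocessor level rather than at call sites, ggml-cuda.cu and the otherarch/ggml_v2-cuda*.cu files above need no per-call changes; only device intrinsics that HIP genuinely lacks (e.g. __dp4a, __vsubss4) are re-implemented for AMD GPUs.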