diff --git a/.github/labeler.yml b/.github/labeler.yml
deleted file mode 100644
index 97d739b58..000000000
--- a/.github/labeler.yml
+++ /dev/null
@@ -1,90 +0,0 @@
-# https://github.com/actions/labeler
-Kompute:
-  - changed-files:
-    - any-glob-to-any-file:
-      - ggml-kompute.h
-      - ggml-kompute.cpp
-      - README-kompute.md
-Apple Metal:
-  - changed-files:
-    - any-glob-to-any-file:
-      - ggml-metal.h
-      - ggml-metal.cpp
-      - README-metal.md
-SYCL:
-  - changed-files:
-    - any-glob-to-any-file:
-      - ggml-sycl.h
-      - ggml-sycl.cpp
-      - README-sycl.md
-Nvidia GPU:
-  - changed-files:
-    - any-glob-to-any-file:
-      - ggml-cuda.h
-      - ggml-cuda/**
-Vulkan:
-  - changed-files:
-    - any-glob-to-any-file:
-      - ggml_vk_generate_shaders.py
-      - ggml-vulkan*
-documentation:
-  - changed-files:
-    - any-glob-to-any-file:
-      - docs/**
-      - media/**
-testing:
-  - changed-files:
-    - any-glob-to-any-file:
-      - tests/**
-build:
-  - changed-files:
-    - any-glob-to-any-file:
-      - cmake/**
-      - CMakeLists.txt
-      - CMakePresets.json
-      - codecov.yml
-examples:
-  - changed-files:
-    - any-glob-to-any-file: examples/**
-devops:
-  - changed-files:
-    - any-glob-to-any-file:
-      - .devops/**
-      - .github/**
-      - ci/**
-python:
-  - changed-files:
-    - any-glob-to-any-file:
-      - "**/*.py"
-      - requirements/**
-      - gguf-py/**
-      - .flake8
-script:
-  - changed-files:
-    - any-glob-to-any-file:
-      - scripts/**
-android:
-  - changed-files:
-    - any-glob-to-any-file:
-      - examples/llama.android/**
-server:
-  - changed-files:
-    - any-glob-to-any-file:
-      - examples/server/**
-ggml:
-  - changed-files:
-    - any-glob-to-any-file:
-      - ggml.c
-      - ggml.h
-      - ggml-*.c
-      - ggml-*.h
-      - ggml-cuda/**
-nix:
-  - changed-files:
-    - any-glob-to-any-file:
-      - "**/*.nix"
-      - .github/workflows/nix-*.yml
-      - .devops/nix/nixpkgs-instances.nix
-embedding:
-  - changed-files:
-    - any-glob-to-any-file: examples/embedding/
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
deleted file mode 100644
index 368dbdbe5..000000000
--- a/.github/workflows/labeler.yml
+++ /dev/null
@@ -1,17 +0,0 @@
-name: "Pull Request Labeler"
-on:
-- pull_request_target
-
-jobs:
-  labeler:
-    permissions:
-      contents: read
-      pull-requests: write
-    runs-on: ubuntu-latest
-    steps:
-    - uses: actions/checkout@v4
-      with:
-        repository: "ggerganov/llama.cpp"
-    - uses: actions/labeler@v5
-      with:
-        configuration-path: '.github/labeler.yml'
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6152856c6..96d27716a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -67,6 +67,8 @@ add_compile_definitions(LOG_DISABLE_LOGS)
 
 file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
 list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
+file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
+list(APPEND GGML_SOURCES_CUDA ${SRCS})
 set(GGML_V3_CUDA_SOURCES otherarch/ggml_v3-cuda.cu otherarch/ggml_v3-cuda.h)
 set(GGML_V2_CUDA_SOURCES otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h)
 set(GGML_V2_LEGACY_CUDA_SOURCES otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h)
@@ -94,6 +96,14 @@ if (LLAMA_CUBLAS)
     add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
     add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
 
+    # only build minimal quants required for fattn quant kv
+    file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
+    list(APPEND GGML_SOURCES_CUDA ${SRCS})
+    file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
+    list(APPEND GGML_SOURCES_CUDA ${SRCS})
+    file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
+    list(APPEND GGML_SOURCES_CUDA ${SRCS})
+
     if (LLAMA_STATIC)
         if (WIN32)
             # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
@@ -153,18 +163,28 @@ if (LLAMA_HIPBLAS)
         message(STATUS "HIP and hipBLAS found")
         file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
         list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
+        file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
+        list(APPEND GGML_SOURCES_ROCM ${SRCS})
         add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA SD_USE_CUBLAS)
         add_library(ggml-rocm OBJECT ${GGML_SOURCES_CUDA})
         if (LLAMA_CUDA_FORCE_DMMV)
             target_compile_definitions(ggml-rocm PUBLIC GGML_CUDA_FORCE_DMMV)
         endif()
+
+        file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
+        list(APPEND GGML_SOURCES_ROCM ${SRCS})
+        file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
+        list(APPEND GGML_SOURCES_ROCM ${SRCS})
+        file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
+        list(APPEND GGML_SOURCES_ROCM ${SRCS})
+
+        # only build minimal quants required for fattn quant kv
         target_compile_definitions(ggml-rocm PUBLIC GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
         target_compile_definitions(ggml-rocm PUBLIC GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
         target_compile_definitions(ggml-rocm PUBLIC K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
         set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
         target_link_libraries(ggml-rocm PUBLIC hip::device hip::host roc::rocblas roc::hipblas)
-
         add_library(ggml-v2-rocm OBJECT ${GGML_V2_CUDA_SOURCES})
         if (LLAMA_CUDA_FORCE_DMMV)
             target_compile_definitions(ggml-v2-rocm PUBLIC GGML_CUDA_FORCE_DMMV)
@@ -195,9 +215,6 @@ if (LLAMA_HIPBLAS)
         set_source_files_properties(otherarch/ggml_v2-cuda-legacy.cu PROPERTIES LANGUAGE CXX)
         target_link_libraries(ggml-v2-legacy-rocm PUBLIC hip::device hip::host roc::rocblas roc::hipblas)
-
-
-
     if (LLAMA_STATIC)
         message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
     endif()
@@ -451,6 +468,13 @@ target_compile_features(sdtype_adapter PUBLIC cxx_std_11) # don't bump
 target_link_libraries(sdtype_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
 set_target_properties(sdtype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
+add_library(whisper_adapter
+            otherarch/whispercpp/whisper_adapter.cpp)
+target_include_directories(whisper_adapter PUBLIC . ./otherarch ./otherarch/tools ./otherarch/whispercpp ./examples ./common)
+target_compile_features(whisper_adapter PUBLIC cxx_std_11) # don't bump
+target_link_libraries(whisper_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
+set_target_properties(whisper_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
 add_library(gpttype_adapter
             gpttype_adapter.cpp)
 target_include_directories(gpttype_adapter PUBLIC . ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common)
@@ -466,7 +490,7 @@ if (LLAMA_CUBLAS)
     set_target_properties(${TARGET} PROPERTIES PREFIX "")
     set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas")
     set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-    target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS})
+    target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter whisper_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS})
     target_compile_features(${TARGET} PRIVATE cxx_std_11)
 endif()
 
@@ -478,7 +502,7 @@ if (LLAMA_HIPBLAS)
     set_target_properties(${TARGET} PROPERTIES PREFIX "")
     set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_hipblas")
     set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-    target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS})
+    target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter whisper_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS})
     target_compile_features(${TARGET} PRIVATE cxx_std_11)
 endif()
diff --git a/Makefile b/Makefile
index c1247f071..38c573d03 100644
--- a/Makefile
+++ b/Makefile
@@ -145,11 +145,17 @@ ifndef LLAMA_NO_ACCELERATE
 endif
 
 # it is recommended to use the CMAKE file to build for cublas if you can - will likely work better
+OBJS_CUDA_TEMP_INST = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu))
+OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
+OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
+OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
+
 ifdef LLAMA_CUBLAS
 	CUBLAS_FLAGS = -DGGML_USE_CUDA -DSD_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 	CUBLASLD_FLAGS = -lcuda -lcublas -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/local/cuda/targets/sbsa-linux/lib -L/usr/lib/wsl/lib
 	CUBLAS_OBJS = ggml-cuda.o ggml_v3-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
 	CUBLAS_OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+	CUBLAS_OBJS += $(OBJS_CUDA_TEMP_INST)
 	NVCC = nvcc
 	NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
@@ -206,7 +212,7 @@ ifdef LLAMA_CUDA_CCBIN
 	NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
 
-ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
 	$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
@@ -237,13 +243,14 @@ ifdef LLAMA_HIPBLAS
 	HIPLDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 -lrocblas
 	HIP_OBJS += ggml-cuda.o ggml_v3-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
 	HIP_OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+	HIP_OBJS += $(OBJS_CUDA_TEMP_INST)
 	HIPFLAGS2 += $(addprefix --offload-arch=,$(GPU_TARGETS))
 	HIPFLAGS2 += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 	HIPFLAGS2 += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
 	HIPFLAGS2 += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
 
-ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
 	$(HCXX) $(CXXFLAGS) $(HIPFLAGS) $(HIPFLAGS2) -x hip -c -o $@ $<
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(HCXX) $(CXXFLAGS) $(HIPFLAGS) $(HIPFLAGS2) -x hip -c -o $@ $<
@@ -536,6 +543,7 @@ gpttype_adapter_vulkan_noavx2.o: $(GPTTYPE_ADAPTER)
 clean:
 	rm -vf *.o main sdmain whispermain quantize_gguf quantize_clip quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state gguf imatrix imatrix.exe gguf.exe main.exe quantize_clip.exe quantize_gguf.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_clblast_noavx2.dll koboldcpp_cublas.dll koboldcpp_hipblas.dll koboldcpp_vulkan.dll koboldcpp_vulkan_noavx2.dll koboldcpp_default.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_clblast_noavx2.so koboldcpp_cublas.so koboldcpp_hipblas.so koboldcpp_vulkan.so koboldcpp_vulkan_noavx2.so
 	rm -vrf ggml-cuda/*.o
+	rm -vrf ggml-cuda/template-instances/*.o
 
 # useful tools
 main: examples/main/main.cpp build-info.h ggml.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
diff --git a/klite.embd b/klite.embd
index 0fa20ac52..28f060957 100644
--- a/klite.embd
+++ b/klite.embd
@@ -10214,11 +10214,11 @@ Current version: 143
 		{
 			if(aesthetic_ui)
 			{
-				submit_generation();
+				chat_submit_generation();
 			}
 			else
 			{
-				chat_submit_generation();
+				submit_generation();
 			}
 		}
 	}
diff --git a/koboldcpp.py b/koboldcpp.py
index 8dd785117..afb23fb63 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -1321,7 +1321,16 @@ Enter Prompt:
         body = None
         if contlenstr:
             content_length = int(contlenstr)
+            if content_length > (1024*1024*24): #24mb payload limit
+                self.send_response(500)
+                self.end_headers(content_type='application/json')
+                self.wfile.write(json.dumps({"detail": {
+                    "msg": "Payload is too big. Max payload size is 24MB.",
+                    "type": "bad_input",
+                }}).encode())
+                return
             body = self.rfile.read(content_length)
+        self.path = self.path.rstrip('/')
 
         response_body = None
         response_code = 200
diff --git a/otherarch/whispercpp/whisper.cpp b/otherarch/whispercpp/whisper.cpp
index d125f8e9c..bfbd30755 100644
--- a/otherarch/whispercpp/whisper.cpp
+++ b/otherarch/whispercpp/whisper.cpp
@@ -28,6 +28,7 @@
 #include 
 #include 
 #define _USE_MATH_DEFINES
+#include 
 #include 
 #include 
 #include 