diff --git a/.github/labeler.yml b/.github/labeler.yml
deleted file mode 100644
index 97d739b58..000000000
--- a/.github/labeler.yml
+++ /dev/null
@@ -1,90 +0,0 @@
-# https://github.com/actions/labeler
-Kompute:
- - changed-files:
- - any-glob-to-any-file:
- - ggml-kompute.h
- - ggml-kompute.cpp
- - README-kompute.md
-Apple Metal:
- - changed-files:
- - any-glob-to-any-file:
- - ggml-metal.h
- - ggml-metal.cpp
- - README-metal.md
-SYCL:
- - changed-files:
- - any-glob-to-any-file:
- - ggml-sycl.h
- - ggml-sycl.cpp
- - README-sycl.md
-Nvidia GPU:
- - changed-files:
- - any-glob-to-any-file:
- - ggml-cuda.h
- - ggml-cuda/**
-Vulkan:
- - changed-files:
- - any-glob-to-any-file:
- - ggml_vk_generate_shaders.py
- - ggml-vulkan*
-documentation:
- - changed-files:
- - any-glob-to-any-file:
- - docs/**
- - media/**
-testing:
- - changed-files:
- - any-glob-to-any-file:
- - tests/**
-build:
- - changed-files:
- - any-glob-to-any-file:
- - cmake/**
- - CMakeLists.txt
- - CMakePresets.json
- - codecov.yml
-examples:
- - changed-files:
- - any-glob-to-any-file: examples/**
-devops:
- - changed-files:
- - any-glob-to-any-file:
- - .devops/**
- - .github/**
- - ci/**
-python:
- - changed-files:
- - any-glob-to-any-file:
- - "**/*.py"
- - requirements/**
- - gguf-py/**
- - .flake8
-script:
- - changed-files:
- - any-glob-to-any-file:
- - scripts/**
-android:
- - changed-files:
- - any-glob-to-any-file:
- - examples/llama.android/**
-server:
- - changed-files:
- - any-glob-to-any-file:
- - examples/server/**
-ggml:
- - changed-files:
- - any-glob-to-any-file:
- - ggml.c
- - ggml.h
- - ggml-*.c
- - ggml-*.h
- - ggml-cuda/**
-nix:
- - changed-files:
- - any-glob-to-any-file:
- - "**/*.nix"
- - .github/workflows/nix-*.yml
- - .devops/nix/nixpkgs-instances.nix
-embedding:
- - changed-files:
- - any-glob-to-any-file: examples/embedding/
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
deleted file mode 100644
index 368dbdbe5..000000000
--- a/.github/workflows/labeler.yml
+++ /dev/null
@@ -1,17 +0,0 @@
-name: "Pull Request Labeler"
-on:
-- pull_request_target
-
-jobs:
- labeler:
- permissions:
- contents: read
- pull-requests: write
- runs-on: ubuntu-latest
- steps:
- - uses: actions/checkout@v4
- with:
- repository: "ggerganov/llama.cpp"
- - uses: actions/labeler@v5
- with:
- configuration-path: '.github/labeler.yml'
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6152856c6..96d27716a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -67,6 +67,8 @@ add_compile_definitions(LOG_DISABLE_LOGS)
file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
+file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
+list(APPEND GGML_SOURCES_CUDA ${SRCS})
set(GGML_V3_CUDA_SOURCES otherarch/ggml_v3-cuda.cu otherarch/ggml_v3-cuda.h)
set(GGML_V2_CUDA_SOURCES otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h)
set(GGML_V2_LEGACY_CUDA_SOURCES otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h)
@@ -94,6 +96,14 @@ if (LLAMA_CUBLAS)
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
+ # only build minimal quants required for fattn quant kv
+ file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
+ list(APPEND GGML_SOURCES_CUDA ${SRCS})
+ file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
+ list(APPEND GGML_SOURCES_CUDA ${SRCS})
+ file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
+ list(APPEND GGML_SOURCES_CUDA ${SRCS})
+
if (LLAMA_STATIC)
if (WIN32)
# As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
@@ -153,18 +163,28 @@ if (LLAMA_HIPBLAS)
message(STATUS "HIP and hipBLAS found")
file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
+ file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
+ list(APPEND GGML_SOURCES_ROCM ${SRCS})
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA SD_USE_CUBLAS)
add_library(ggml-rocm OBJECT ${GGML_SOURCES_CUDA})
if (LLAMA_CUDA_FORCE_DMMV)
target_compile_definitions(ggml-rocm PUBLIC GGML_CUDA_FORCE_DMMV)
endif()
+
+ file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
+ list(APPEND GGML_SOURCES_ROCM ${SRCS})
+ file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
+ list(APPEND GGML_SOURCES_ROCM ${SRCS})
+ file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
+ list(APPEND GGML_SOURCES_ROCM ${SRCS})
+
+ # only build minimal quants required for fattn quant kv
target_compile_definitions(ggml-rocm PUBLIC GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
target_compile_definitions(ggml-rocm PUBLIC GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
target_compile_definitions(ggml-rocm PUBLIC K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
target_link_libraries(ggml-rocm PUBLIC hip::device hip::host roc::rocblas roc::hipblas)
-
add_library(ggml-v2-rocm OBJECT ${GGML_V2_CUDA_SOURCES})
if (LLAMA_CUDA_FORCE_DMMV)
target_compile_definitions(ggml-v2-rocm PUBLIC GGML_CUDA_FORCE_DMMV)
@@ -195,9 +215,6 @@ if (LLAMA_HIPBLAS)
set_source_files_properties(otherarch/ggml_v2-cuda-legacy.cu PROPERTIES LANGUAGE CXX)
target_link_libraries(ggml-v2-legacy-rocm PUBLIC hip::device hip::host roc::rocblas roc::hipblas)
-
-
-
if (LLAMA_STATIC)
message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
endif()
@@ -451,6 +468,13 @@ target_compile_features(sdtype_adapter PUBLIC cxx_std_11) # don't bump
target_link_libraries(sdtype_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
set_target_properties(sdtype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
+add_library(whisper_adapter
+ otherarch/whispercpp/whisper_adapter.cpp)
+target_include_directories(whisper_adapter PUBLIC . ./otherarch ./otherarch/tools ./otherarch/whispercpp ./examples ./common)
+target_compile_features(whisper_adapter PUBLIC cxx_std_11) # don't bump
+target_link_libraries(whisper_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
+set_target_properties(whisper_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
add_library(gpttype_adapter
gpttype_adapter.cpp)
target_include_directories(gpttype_adapter PUBLIC . ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common)
@@ -466,7 +490,7 @@ if (LLAMA_CUBLAS)
set_target_properties(${TARGET} PROPERTIES PREFIX "")
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas")
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
- target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS})
+ target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter whisper_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
endif()
@@ -478,7 +502,7 @@ if (LLAMA_HIPBLAS)
set_target_properties(${TARGET} PROPERTIES PREFIX "")
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_hipblas")
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
- target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS})
+ target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter whisper_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
endif()
diff --git a/Makefile b/Makefile
index c1247f071..38c573d03 100644
--- a/Makefile
+++ b/Makefile
@@ -145,11 +145,17 @@ ifndef LLAMA_NO_ACCELERATE
endif
# it is recommended to use the CMAKE file to build for cublas if you can - will likely work better
+OBJS_CUDA_TEMP_INST = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu))
+OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
+OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
+OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
+
ifdef LLAMA_CUBLAS
CUBLAS_FLAGS = -DGGML_USE_CUDA -DSD_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
CUBLASLD_FLAGS = -lcuda -lcublas -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/local/cuda/targets/sbsa-linux/lib -L/usr/lib/wsl/lib
CUBLAS_OBJS = ggml-cuda.o ggml_v3-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
CUBLAS_OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+ CUBLAS_OBJS += $(OBJS_CUDA_TEMP_INST)
NVCC = nvcc
NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
@@ -206,7 +212,7 @@ ifdef LLAMA_CUDA_CCBIN
NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
endif
-ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
@@ -237,13 +243,14 @@ ifdef LLAMA_HIPBLAS
HIPLDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 -lrocblas
HIP_OBJS += ggml-cuda.o ggml_v3-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
HIP_OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+ HIP_OBJS += $(OBJS_CUDA_TEMP_INST)
HIPFLAGS2 += $(addprefix --offload-arch=,$(GPU_TARGETS))
HIPFLAGS2 += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
HIPFLAGS2 += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
HIPFLAGS2 += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
-ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
$(HCXX) $(CXXFLAGS) $(HIPFLAGS) $(HIPFLAGS2) -x hip -c -o $@ $<
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
$(HCXX) $(CXXFLAGS) $(HIPFLAGS) $(HIPFLAGS2) -x hip -c -o $@ $<
@@ -536,6 +543,7 @@ gpttype_adapter_vulkan_noavx2.o: $(GPTTYPE_ADAPTER)
clean:
rm -vf *.o main sdmain whispermain quantize_gguf quantize_clip quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state gguf imatrix imatrix.exe gguf.exe main.exe quantize_clip.exe quantize_gguf.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_clblast_noavx2.dll koboldcpp_cublas.dll koboldcpp_hipblas.dll koboldcpp_vulkan.dll koboldcpp_vulkan_noavx2.dll koboldcpp_default.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_clblast_noavx2.so koboldcpp_cublas.so koboldcpp_hipblas.so koboldcpp_vulkan.so koboldcpp_vulkan_noavx2.so
rm -vrf ggml-cuda/*.o
+ rm -vrf ggml-cuda/template-instances/*.o
# useful tools
main: examples/main/main.cpp build-info.h ggml.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
diff --git a/klite.embd b/klite.embd
index 0fa20ac52..28f060957 100644
--- a/klite.embd
+++ b/klite.embd
@@ -10214,11 +10214,11 @@ Current version: 143
{
if(aesthetic_ui)
{
- submit_generation();
+ chat_submit_generation();
}
else
{
- chat_submit_generation();
+ submit_generation();
}
}
}
diff --git a/koboldcpp.py b/koboldcpp.py
index 8dd785117..afb23fb63 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -1321,7 +1321,16 @@ Enter Prompt:
body = None
if contlenstr:
content_length = int(contlenstr)
+ if content_length > (1024*1024*24): #24mb payload limit
+ self.send_response(500)
+ self.end_headers(content_type='application/json')
+ self.wfile.write(json.dumps({"detail": {
+ "msg": "Payload is too big. Max payload size is 24MB.",
+ "type": "bad_input",
+ }}).encode())
+ return
body = self.rfile.read(content_length)
+
self.path = self.path.rstrip('/')
response_body = None
response_code = 200
diff --git a/otherarch/whispercpp/whisper.cpp b/otherarch/whispercpp/whisper.cpp
index d125f8e9c..bfbd30755 100644
--- a/otherarch/whispercpp/whisper.cpp
+++ b/otherarch/whispercpp/whisper.cpp
@@ -28,6 +28,7 @@
#include
#include
#define _USE_MATH_DEFINES
+#include
#include
#include
#include