diff --git a/.github/labeler.yml b/.github/labeler.yml
deleted file mode 100644
index 97d739b58..000000000
--- a/.github/labeler.yml
+++ /dev/null
@@ -1,90 +0,0 @@
-# https://github.com/actions/labeler
-Kompute:
-  - changed-files:
-    - any-glob-to-any-file:
-      - ggml-kompute.h
-      - ggml-kompute.cpp
-      - README-kompute.md
-Apple Metal:
-  - changed-files:
-    - any-glob-to-any-file:
-      - ggml-metal.h
-      - ggml-metal.cpp
-      - README-metal.md
-SYCL:
-  - changed-files:
-    - any-glob-to-any-file:
-      - ggml-sycl.h
-      - ggml-sycl.cpp
-      - README-sycl.md
-Nvidia GPU:
-  - changed-files:
-    - any-glob-to-any-file:
-      - ggml-cuda.h
-      - ggml-cuda/**
-Vulkan:
-  - changed-files:
-    - any-glob-to-any-file:
-      - ggml_vk_generate_shaders.py
-      - ggml-vulkan*
-documentation:
-  - changed-files:
-    - any-glob-to-any-file:
-      - docs/**
-      - media/**
-testing:
-  - changed-files:
-    - any-glob-to-any-file:
-      - tests/**
-build:
-  - changed-files:
-    - any-glob-to-any-file:
-      - cmake/**
-      - CMakeLists.txt
-      - CMakePresets.json
-      - codecov.yml
-examples:
-  - changed-files:
-    - any-glob-to-any-file: examples/**
-devops:
-  - changed-files:
-    - any-glob-to-any-file:
-      - .devops/**
-      - .github/**
-      - ci/**
-python:
-  - changed-files:
-    - any-glob-to-any-file:
-      - "**/*.py"
-      - requirements/**
-      - gguf-py/**
-      - .flake8
-script:
-  - changed-files:
-    - any-glob-to-any-file:
-      - scripts/**
-android:
-  - changed-files:
-    - any-glob-to-any-file:
-      - examples/llama.android/**
-server:
-  - changed-files:
-    - any-glob-to-any-file:
-      - examples/server/**
-ggml:
-  - changed-files:
-    - any-glob-to-any-file:
-      - ggml.c
-      - ggml.h
-      - ggml-*.c
-      - ggml-*.h
-      - ggml-cuda/**
-nix:
-  - changed-files:
-    - any-glob-to-any-file:
-      - "**/*.nix"
-      - .github/workflows/nix-*.yml
-      - .devops/nix/nixpkgs-instances.nix
-embedding:
-  - changed-files:
-    - any-glob-to-any-file: examples/embedding/
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
deleted file mode 100644
index 368dbdbe5..000000000
--- a/.github/workflows/labeler.yml
+++ /dev/null
@@ -1,17 +0,0 @@
-name: "Pull Request Labeler"
-on:
-- pull_request_target
-
-jobs:
-  labeler:
-    permissions:
-      contents: read
-      pull-requests: write
-    runs-on: ubuntu-latest
-    steps:
-    - uses: actions/checkout@v4
-      with:
-        repository: "ggerganov/llama.cpp"
-    - uses: actions/labeler@v5
-      with:
-        configuration-path: '.github/labeler.yml'
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6152856c6..96d27716a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -67,6 +67,8 @@ add_compile_definitions(LOG_DISABLE_LOGS)
 
 file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
 list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
+file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
+list(APPEND GGML_SOURCES_CUDA ${SRCS})
 set(GGML_V3_CUDA_SOURCES otherarch/ggml_v3-cuda.cu otherarch/ggml_v3-cuda.h)
 set(GGML_V2_CUDA_SOURCES otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h)
 set(GGML_V2_LEGACY_CUDA_SOURCES otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h)
@@ -94,6 +96,14 @@ if (LLAMA_CUBLAS)
     add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
     add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
 
+    # only build minimal quants required for fattn quant kv
+    file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
+    list(APPEND GGML_SOURCES_CUDA ${SRCS})
+    file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
+    list(APPEND GGML_SOURCES_CUDA ${SRCS})
+    file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
+    list(APPEND GGML_SOURCES_CUDA ${SRCS})
+
     if (LLAMA_STATIC)
         if (WIN32)
             # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
@@ -153,18 +163,28 @@ if (LLAMA_HIPBLAS)
         message(STATUS "HIP and hipBLAS found")
         file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
         list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
+        file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
+        list(APPEND GGML_SOURCES_ROCM ${SRCS})
         add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA SD_USE_CUBLAS)
         add_library(ggml-rocm OBJECT ${GGML_SOURCES_CUDA})
         if (LLAMA_CUDA_FORCE_DMMV)
             target_compile_definitions(ggml-rocm PUBLIC GGML_CUDA_FORCE_DMMV)
         endif()
+
+        file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
+        list(APPEND GGML_SOURCES_ROCM ${SRCS})
+        file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
+        list(APPEND GGML_SOURCES_ROCM ${SRCS})
+        file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
+        list(APPEND GGML_SOURCES_ROCM ${SRCS})
+
+        # only build minimal quants required for fattn quant kv
         target_compile_definitions(ggml-rocm PUBLIC GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
         target_compile_definitions(ggml-rocm PUBLIC GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
         target_compile_definitions(ggml-rocm PUBLIC K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
         set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
         target_link_libraries(ggml-rocm PUBLIC hip::device hip::host roc::rocblas roc::hipblas)
-
         add_library(ggml-v2-rocm OBJECT ${GGML_V2_CUDA_SOURCES})
         if (LLAMA_CUDA_FORCE_DMMV)
             target_compile_definitions(ggml-v2-rocm PUBLIC GGML_CUDA_FORCE_DMMV)
@@ -195,9 +215,6 @@ if (LLAMA_HIPBLAS)
         set_source_files_properties(otherarch/ggml_v2-cuda-legacy.cu PROPERTIES LANGUAGE CXX)
         target_link_libraries(ggml-v2-legacy-rocm PUBLIC hip::device hip::host roc::rocblas roc::hipblas)
-
-
-
     if (LLAMA_STATIC)
         message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
     endif()
@@ -451,6 +468,13 @@ target_compile_features(sdtype_adapter PUBLIC cxx_std_11) # don't bump
 target_link_libraries(sdtype_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
 set_target_properties(sdtype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
+add_library(whisper_adapter
+            otherarch/whispercpp/whisper_adapter.cpp)
+target_include_directories(whisper_adapter PUBLIC . ./otherarch ./otherarch/tools ./otherarch/whispercpp ./examples ./common)
+target_compile_features(whisper_adapter PUBLIC cxx_std_11) # don't bump
+target_link_libraries(whisper_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
+set_target_properties(whisper_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
 add_library(gpttype_adapter
             gpttype_adapter.cpp)
 target_include_directories(gpttype_adapter PUBLIC . ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common)
@@ -466,7 +490,7 @@ if (LLAMA_CUBLAS)
     set_target_properties(${TARGET} PROPERTIES PREFIX "")
     set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas")
     set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-    target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS})
+    target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter whisper_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS})
     target_compile_features(${TARGET} PRIVATE cxx_std_11)
 endif()
 
@@ -478,7 +502,7 @@ if (LLAMA_HIPBLAS)
     set_target_properties(${TARGET} PROPERTIES PREFIX "")
     set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_hipblas")
     set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-    target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS})
+    target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter whisper_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS})
     target_compile_features(${TARGET} PRIVATE cxx_std_11)
 endif()
diff --git a/Makefile b/Makefile
index c1247f071..38c573d03 100644
--- a/Makefile
+++ b/Makefile
@@ -145,11 +145,17 @@ ifndef LLAMA_NO_ACCELERATE
 endif
 
 # it is recommended to use the CMAKE file to build for cublas if you can - will likely work better
+OBJS_CUDA_TEMP_INST = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu))
+OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
+OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
+OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
+
 ifdef LLAMA_CUBLAS
 	CUBLAS_FLAGS = -DGGML_USE_CUDA -DSD_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 	CUBLASLD_FLAGS = -lcuda -lcublas -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/local/cuda/targets/sbsa-linux/lib -L/usr/lib/wsl/lib
 	CUBLAS_OBJS = ggml-cuda.o ggml_v3-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
 	CUBLAS_OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+	CUBLAS_OBJS += $(OBJS_CUDA_TEMP_INST)
 	NVCC = nvcc
 	NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
@@ -206,7 +212,7 @@ ifdef LLAMA_CUDA_CCBIN
 	NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
 
-ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
 	$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
@@ -237,13 +243,14 @@ ifdef LLAMA_HIPBLAS
 	HIPLDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 -lrocblas
 	HIP_OBJS += ggml-cuda.o ggml_v3-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
 	HIP_OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+	HIP_OBJS += $(OBJS_CUDA_TEMP_INST)
 	HIPFLAGS2 += $(addprefix --offload-arch=,$(GPU_TARGETS))
 	HIPFLAGS2 += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 	HIPFLAGS2 += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
 	HIPFLAGS2 += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
 
-ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
 	$(HCXX) $(CXXFLAGS) $(HIPFLAGS) $(HIPFLAGS2) -x hip -c -o $@ $<
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(HCXX) $(CXXFLAGS) $(HIPFLAGS) $(HIPFLAGS2) -x hip -c -o $@ $<
@@ -536,6 +543,7 @@ gpttype_adapter_vulkan_noavx2.o: $(GPTTYPE_ADAPTER)
 clean:
 	rm -vf *.o main sdmain whispermain quantize_gguf quantize_clip quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state gguf imatrix imatrix.exe gguf.exe main.exe quantize_clip.exe quantize_gguf.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_clblast_noavx2.dll koboldcpp_cublas.dll koboldcpp_hipblas.dll koboldcpp_vulkan.dll koboldcpp_vulkan_noavx2.dll koboldcpp_default.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_clblast_noavx2.so koboldcpp_cublas.so koboldcpp_hipblas.so koboldcpp_vulkan.so koboldcpp_vulkan_noavx2.so
 	rm -vrf ggml-cuda/*.o
+	rm -vrf ggml-cuda/template-instances/*.o
 
 # useful tools
 main: examples/main/main.cpp build-info.h ggml.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
diff --git a/klite.embd b/klite.embd
index 0fa20ac52..28f060957 100644
--- a/klite.embd
+++ b/klite.embd
@@ -10214,11 +10214,11 @@ Current version: 143
 		{
 			if(aesthetic_ui)
 			{
-				submit_generation();
+				chat_submit_generation();
 			}
 			else
 			{
-				chat_submit_generation();
+				submit_generation();
 			}
 		}
 	}
diff --git a/koboldcpp.py b/koboldcpp.py
index 8dd785117..afb23fb63 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -1321,7 +1321,16 @@ Enter Prompt:
         body = None
         if contlenstr:
             content_length = int(contlenstr)
+            if content_length > (1024*1024*24): #24mb payload limit
+                self.send_response(500)
+                self.end_headers(content_type='application/json')
+                self.wfile.write(json.dumps({"detail": {
+                    "msg": "Payload is too big. Max payload size is 24MB.",
+                    "type": "bad_input",
+                }}).encode())
+                return
             body = self.rfile.read(content_length)
+        self.path = self.path.rstrip('/')
 
         response_body = None
         response_code = 200
diff --git a/otherarch/whispercpp/whisper.cpp b/otherarch/whispercpp/whisper.cpp
index d125f8e9c..bfbd30755 100644
--- a/otherarch/whispercpp/whisper.cpp
+++ b/otherarch/whispercpp/whisper.cpp
@@ -28,6 +28,7 @@
 #include 
 #include 
 #define _USE_MATH_DEFINES
+#include 
 #include 
 #include 
 #include 