diff --git a/CMakeLists.txt b/CMakeLists.txt index b052650fb..51535046c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -495,7 +495,9 @@ add_library(common2 examples/llava/clip.h src/unicode.h src/unicode.cpp - src/unicode-data.cpp + src/unicode-data.cpp + otherarch/utils.cpp + otherarch/utils.h) target_include_directories(common2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common) target_compile_features(common2 PUBLIC cxx_std_17) # don't bump target_link_libraries(common2 PRIVATE ggml ${LLAMA_EXTRA_LIBS}) @@ -515,11 +517,18 @@ target_compile_features(whisper_adapter PUBLIC cxx_std_17) # don't bump target_link_libraries(whisper_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS}) set_target_properties(whisper_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON) +add_library(tts_adapter + otherarch/tts_adapter.cpp) +target_include_directories(tts_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./examples ./common) +target_compile_features(tts_adapter PUBLIC cxx_std_17) # don't bump +target_link_libraries(tts_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS}) +set_target_properties(tts_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON) + add_library(gpttype_adapter gpttype_adapter.cpp) target_include_directories(gpttype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common) target_compile_features(gpttype_adapter PUBLIC cxx_std_17) # don't bump -target_link_libraries(gpttype_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS}) +target_link_libraries(gpttype_adapter PRIVATE common2 ggml ggml_v1 ggml_v2 ggml_v3 ${LLAMA_EXTRA_LIBS}) set_target_properties(gpttype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON) if (LLAMA_CUBLAS) @@ -530,8 +539,16 @@ if (LLAMA_CUBLAS) set_target_properties(${TARGET} PROPERTIES PREFIX "") set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas") set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter whisper_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS}) + target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter whisper_adapter tts_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + + add_custom_command( + TARGET koboldcpp_cublas POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + $<TARGET_FILE:koboldcpp_cublas> # The generated DLL + ${CMAKE_SOURCE_DIR}/ # Destination directory + COMMENT "Copying DLL to parent directory" + ) endif() if (LLAMA_HIPBLAS) @@ -542,7 +559,15 @@ if (LLAMA_HIPBLAS) set_target_properties(${TARGET} PROPERTIES PREFIX "") set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_hipblas") set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter whisper_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS}) + target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter whisper_adapter tts_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + + add_custom_command( + TARGET koboldcpp_hipblas POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + $<TARGET_FILE:koboldcpp_hipblas> # The generated 
DLL + ${CMAKE_SOURCE_DIR}/ # Destination directory + COMMENT "Copying DLL to parent directory" + ) endif() diff --git a/Makefile b/Makefile index 5ee7e24ae..bb5e74109 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ .PHONY: finishedmsg default: koboldcpp_default koboldcpp_failsafe koboldcpp_noavx2 koboldcpp_clblast koboldcpp_clblast_noavx2 koboldcpp_cublas koboldcpp_hipblas koboldcpp_vulkan koboldcpp_vulkan_noavx2 finishedmsg -tools: quantize_gpt2 quantize_gptj quantize_gguf quantize_neox quantize_mpt quantize_clip whispermain sdmain gguf-split +tools: quantize_gpt2 quantize_gptj quantize_gguf quantize_neox quantize_mpt quantize_clip ttsmain whispermain sdmain gguf-split ifndef UNAME_S UNAME_S := $(shell uname -s) @@ -90,10 +90,10 @@ endif CUBLASLD_FLAGS = CUBLAS_OBJS = -OBJS_FULL += ggml-alloc.o ggml-cpu-traits.o ggml-quants.o ggml-cpu-quants.o ggml-cpu-aarch64.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm.o common.o sampling.o -OBJS_SIMPLE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx2.o ggml-cpu-quants_noavx2.o ggml-cpu-aarch64_noavx2.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx2.o common.o sampling.o -OBJS_SIMPLER += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx1.o ggml-cpu-quants_noavx1.o ggml-cpu-aarch64_noavx1.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx1.o common.o sampling.o -OBJS_FAILSAFE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_failsafe.o ggml-cpu-quants_failsafe.o ggml-cpu-aarch64_failsafe.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_failsafe.o common.o sampling.o +OBJS_FULL += ggml-alloc.o ggml-cpu-traits.o ggml-quants.o ggml-cpu-quants.o ggml-cpu-aarch64.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm.o common.o sampling.o kcpputils.o +OBJS_SIMPLE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx2.o ggml-cpu-quants_noavx2.o ggml-cpu-aarch64_noavx2.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx2.o common.o sampling.o kcpputils.o +OBJS_SIMPLER += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx1.o ggml-cpu-quants_noavx1.o ggml-cpu-aarch64_noavx1.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx1.o common.o sampling.o kcpputils.o +OBJS_FAILSAFE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_failsafe.o ggml-cpu-quants_failsafe.o ggml-cpu-aarch64_failsafe.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_failsafe.o common.o sampling.o kcpputils.o # OS specific ifeq ($(UNAME_S),Linux) @@ -539,6 +539,8 @@ ggml-cpu-cpp.o: ggml/src/ggml-cpu/ggml-cpu.cpp ggml/include/ggml.h ggml/src/ggml $(CXX) $(CXXFLAGS) -c $< -o $@ gguf.o: ggml/src/gguf.cpp ggml/include/gguf.h $(CXX) $(CXXFLAGS) -c $< -o $@ +kcpputils.o: otherarch/utils.cpp otherarch/utils.h + $(CXX) $(CXXFLAGS) -c $< -o $@ #these have special gpu defines ggml-backend_default.o: ggml/src/ggml-backend.cpp ggml/src/ggml-backend-impl.h ggml/include/ggml.h ggml/include/ggml-backend.h @@ -639,8 +641,12 @@ whispercpp_default.o: otherarch/whispercpp/whisper_adapter.cpp whispercpp_cublas.o: otherarch/whispercpp/whisper_adapter.cpp $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@ +#tts objects +tts_default.o: otherarch/tts_adapter.cpp + $(CXX) $(CXXFLAGS) -c $< -o $@ + # idiotic "for easier compilation" -GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp otherarch/llama_v3.cpp src/llama.cpp src/llama-impl.cpp src/llama-chat.cpp src/llama-mmap.cpp src/llama-context.cpp 
src/llama-adapter.cpp src/llama-arch.cpp src/llama-batch.cpp src/llama-vocab.cpp src/llama-grammar.cpp src/llama-sampling.cpp src/llama-kv-cache.cpp src/llama-model-loader.cpp src/llama-model.cpp src/llama-quant.cpp src/llama-hparams.cpp otherarch/utils.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml/include/ggml.h ggml/include/ggml-cpu.h ggml/include/ggml-cuda.h include/llama.h otherarch/llama-util.h +GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp otherarch/llama_v3.cpp src/llama.cpp src/llama-impl.cpp src/llama-chat.cpp src/llama-mmap.cpp src/llama-context.cpp src/llama-adapter.cpp src/llama-arch.cpp src/llama-batch.cpp src/llama-vocab.cpp src/llama-grammar.cpp src/llama-sampling.cpp src/llama-kv-cache.cpp src/llama-model-loader.cpp src/llama-model.cpp src/llama-quant.cpp src/llama-hparams.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml/include/ggml.h ggml/include/ggml-cpu.h ggml/include/ggml-cuda.h include/llama.h otherarch/llama-util.h gpttype_adapter_failsafe.o: $(GPTTYPE_ADAPTER) $(CXX) $(CXXFLAGS) $(FAILSAFE_FLAGS) -c $< -o $@ gpttype_adapter.o: $(GPTTYPE_ADAPTER) @@ -680,11 +686,11 @@ vulkan-shaders-gen: ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp $(shell) vulkan-shaders-gen --glslc glslc --input-dir ggml/src/ggml-vulkan/vulkan-shaders --target-hpp ggml/src/ggml-vulkan-shaders.hpp --target-cpp ggml/src/ggml-vulkan-shaders.cpp #generated libraries -koboldcpp_default: ggml.o ggml-cpu.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS) +koboldcpp_default: ggml.o ggml-cpu.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter.o sdcpp_default.o whispercpp_default.o tts_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS) $(DEFAULT_BUILD) ifdef FAILSAFE_BUILD -koboldcpp_failsafe: ggml_v4_failsafe.o ggml-cpu_v4_failsafe.o ggml_v3_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FAILSAFE) $(OBJS) +koboldcpp_failsafe: ggml_v4_failsafe.o ggml-cpu_v4_failsafe.o ggml_v3_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o tts_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FAILSAFE) $(OBJS) $(FAILSAFE_BUILD) else koboldcpp_failsafe: @@ -692,7 +698,7 @@ koboldcpp_failsafe: endif ifdef NOAVX2_BUILD -koboldcpp_noavx2: ggml_v4_noavx2.o ggml-cpu_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_SIMPLE) $(OBJS) +koboldcpp_noavx2: ggml_v4_noavx2.o ggml-cpu_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o tts_default.o llavaclip_default.o llava.o 
ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_SIMPLE) $(OBJS) $(NOAVX2_BUILD) else koboldcpp_noavx2: @@ -700,10 +706,10 @@ koboldcpp_noavx2: endif ifdef CLBLAST_BUILD -koboldcpp_clblast: ggml_v4_clblast.o ggml-cpu_v4_clblast.o ggml_v3_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS) +koboldcpp_clblast: ggml_v4_clblast.o ggml-cpu_v4_clblast.o ggml_v3_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o tts_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS) $(CLBLAST_BUILD) ifdef NOAVX2_BUILD -koboldcpp_clblast_noavx2: ggml_v4_clblast_noavx2.o ggml-cpu_v4_clblast_noavx2.o ggml_v3_clblast_noavx2.o ggml_v2_clblast_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_clblast_noavx2.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_SIMPLER) $(OBJS) +koboldcpp_clblast_noavx2: ggml_v4_clblast_noavx2.o ggml-cpu_v4_clblast_noavx2.o ggml_v3_clblast_noavx2.o ggml_v2_clblast_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_clblast_noavx2.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o tts_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_SIMPLER) $(OBJS) $(CLBLAST_BUILD) else koboldcpp_clblast_noavx2: @@ -717,7 +723,7 @@ koboldcpp_clblast_noavx2: endif ifdef CUBLAS_BUILD -koboldcpp_cublas: ggml_v4_cublas.o ggml-cpu.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o llavaclip_cublas.o llava.o ggml-backend_cublas.o ggml-backend-reg_cublas.o $(CUBLAS_OBJS) $(OBJS_FULL) $(OBJS) +koboldcpp_cublas: ggml_v4_cublas.o ggml-cpu.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o tts_default.o llavaclip_cublas.o llava.o ggml-backend_cublas.o ggml-backend-reg_cublas.o $(CUBLAS_OBJS) $(OBJS_FULL) $(OBJS) $(CUBLAS_BUILD) else koboldcpp_cublas: @@ -725,7 +731,7 @@ koboldcpp_cublas: endif ifdef HIPBLAS_BUILD -koboldcpp_hipblas: ggml_v4_cublas.o ggml-cpu.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o llavaclip_cublas.o llava.o ggml-backend_cublas.o ggml-backend-reg_cublas.o $(HIP_OBJS) $(OBJS_FULL) $(OBJS) +koboldcpp_hipblas: ggml_v4_cublas.o ggml-cpu.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o tts_default.o llavaclip_cublas.o llava.o ggml-backend_cublas.o ggml-backend-reg_cublas.o $(HIP_OBJS) $(OBJS_FULL) $(OBJS) $(HIPBLAS_BUILD) else koboldcpp_hipblas: @@ -733,10 +739,10 @@ koboldcpp_hipblas: endif ifdef VULKAN_BUILD -koboldcpp_vulkan: ggml_v4_vulkan.o ggml-cpu.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter_vulkan.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o $(OBJS_FULL) $(OBJS) +koboldcpp_vulkan: ggml_v4_vulkan.o ggml-cpu.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter_vulkan.o 
ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o tts_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o $(OBJS_FULL) $(OBJS) $(VULKAN_BUILD) ifdef NOAVX2_BUILD -koboldcpp_vulkan_noavx2: ggml_v4_vulkan_noavx2.o ggml-cpu_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o $(OBJS_SIMPLE) $(OBJS) +koboldcpp_vulkan_noavx2: ggml_v4_vulkan_noavx2.o ggml-cpu_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o tts_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o $(OBJS_SIMPLE) $(OBJS) $(VULKAN_BUILD) else koboldcpp_vulkan_noavx2: diff --git a/expose.cpp b/expose.cpp index 369b71866..b386e3c2c 100644 --- a/expose.cpp +++ b/expose.cpp @@ -238,6 +238,15 @@ extern "C" return whispertype_generate(inputs); } + bool tts_load_model(const tts_load_model_inputs inputs) + { + return ttstype_load_model(inputs); + } + tts_generation_outputs tts_generate(const tts_generation_inputs inputs) + { + return ttstype_generate(inputs); + } + const char * new_token(int idx) { if (generated_tokens.size() <= idx || idx < 0) return nullptr; diff --git a/expose.h b/expose.h index a96bdda3b..12ad17a99 100644 --- a/expose.h +++ b/expose.h @@ -139,6 +139,7 @@ struct last_logprobs_outputs { int count = 0; logprob_item * logprob_items = nullptr; }; + struct sd_load_model_inputs { const char * model_filename = nullptr; @@ -178,6 +179,7 @@ struct sd_generation_outputs int status = -1; const char * data = ""; }; + struct whisper_load_model_inputs { const char * model_filename = nullptr; @@ -201,6 +203,30 @@ struct whisper_generation_outputs const char * text = ""; }; +struct tts_load_model_inputs +{ + const char * ttc_model_filename = nullptr; + const char * cts_model_filename = nullptr; + const char * executable_path = nullptr; + const int clblast_info = 0; + const int cublas_info = 0; + const char * vulkan_info = nullptr; + const int gpulayers = 0; + const int debugmode = 0; +}; +struct tts_generation_inputs +{ + const char * prompt = nullptr; + const int speaker_seed = 0; + const int audio_seed = 0; + const bool quiet = false; +}; +struct tts_generation_outputs +{ + int status = -1; + const char * data = ""; +}; + extern std::string executable_path; extern std::string lora_filename; extern std::string lora_base; diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index b5dc11974..b0310cd46 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -383,7 +383,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par } if (ok && gr.read(n_kv_32)) { - n_kv_32 = n_kv_32; + n_kv = n_kv_32; } else { ok = false; } diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 901200ea1..d5bae07be 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -21,12 +21,13 @@ #include #include +#include "utils.h" + //for easier compilation //concat source files into one file for compilation purposes #include "llama_v2.cpp" #include "llama_v3.cpp" #include "src/llama.cpp" -#include "utils.cpp" #include "gptj_v1.cpp" #include "gptj_v2.cpp" #include "gptj_v3.cpp" @@ -535,99 +536,6 @@ const char * kcpp_print_system_info(void) { return s.c_str(); } -struct kcpp_embd_batch { //duplcated from llava_embd_batch - std::vector pos; - std::vector n_seq_id; - std::vector 
seq_id_0; - std::vector seq_ids; - std::vector logits; - llama_batch batch; - kcpp_embd_batch(float * embd, int32_t n_tokens, int32_t npast, bool use_mrope) { - int32_t seq_id = 0; - pos.resize(n_tokens * (use_mrope?4:1)); - std::fill(pos.begin(), pos.end(), 0); - n_seq_id.resize(n_tokens); - seq_ids.resize(n_tokens + 1); - logits.resize(n_tokens); - seq_id_0.resize(1); - seq_id_0[0] = seq_id; - seq_ids [n_tokens] = nullptr; - batch = { - /*n_tokens =*/ n_tokens, - /*tokens =*/ nullptr, - /*embd =*/ embd, - /*pos =*/ pos.data(), - /*n_seq_id =*/ n_seq_id.data(), - /*seq_id =*/ seq_ids.data(), - /*logits =*/ logits.data(), - }; - - if(!use_mrope) - { - for (int i = 0; i < n_tokens; i++) { - batch.pos [i] = npast + i; - batch.n_seq_id[i] = 1; - batch.seq_id [i] = seq_id_0.data(); - batch.logits [i] = false; - } - } - else - { - for (int i = 0; i < n_tokens; i++) { - batch.n_seq_id[i] = 1; - batch.seq_id [i] = seq_id_0.data(); - batch.logits [i] = false; - } - for (int j = 0; j < batch.n_tokens * 3; j++) { - batch.pos[j] = npast + (j % batch.n_tokens); - } - } - } - kcpp_embd_batch(std::vector & tokens, int32_t npast, bool use_mrope, bool return_all_logits) { - int32_t seq_id = 0; - int32_t n_tokens = tokens.size(); - pos.resize(n_tokens * (use_mrope?4:1)); - std::fill(pos.begin(), pos.end(), 0); - n_seq_id.resize(n_tokens); - seq_ids.resize(n_tokens + 1); - logits.resize(n_tokens); - seq_id_0.resize(1); - seq_id_0[0] = seq_id; - seq_ids[n_tokens] = nullptr; - batch = { - /*n_tokens =*/ n_tokens, - /*tokens =*/ tokens.data(), - /*embd =*/ nullptr, - /*pos =*/ pos.data(), - /*n_seq_id =*/ n_seq_id.data(), - /*seq_id =*/ seq_ids.data(), - /*logits =*/ logits.data(), - }; - - if(!use_mrope) - { - for (int i = 0; i < n_tokens; i++) { - batch.pos [i] = npast + i; - batch.n_seq_id[i] = 1; - batch.seq_id [i] = seq_id_0.data(); - batch.logits [i] = (return_all_logits?true:false); - } - } - else - { - for (int i = 0; i < n_tokens; i++) { - batch.n_seq_id[i] = 1; - batch.seq_id [i] = seq_id_0.data(); - batch.logits [i] = (return_all_logits?true:false); - } - for (int j = 0; j < batch.n_tokens * 3; j++) { - batch.pos[j] = npast + (j % batch.n_tokens); - } - } - batch.logits[n_tokens - 1] = true; - } -}; - //loads a model for speculative decoding. 
static void speculative_decoding_setup(std::string spec_model_filename, const llama_model_params & base_model_params, const llama_context_params & base_ctx_params, int base_n_vocab, const float * draft_gpusplit, int draft_gpulayers) { @@ -664,7 +572,7 @@ static void speculative_decoding_setup(std::string spec_model_filename, const ll draft_ctx_params.type_k = base_ctx_params.type_k; draft_ctx_params.type_v = base_ctx_params.type_v; - llama_model * draftmodel = llama_load_model_from_file(spec_model_filename.c_str(), draft_model_params); + llama_model * draftmodel = llama_model_load_from_file(spec_model_filename.c_str(), draft_model_params); draft_ctx = llama_new_context_with_model(draftmodel, draft_ctx_params); if(draft_ctx == NULL) { @@ -2252,7 +2160,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in kvos.push_back(kvo); model_params.kv_overrides = kvos.data(); } - llama_model * llamamodel = llama_load_model_from_file(kcpp_data->model_filename.c_str(), model_params); + llama_model * llamamodel = llama_model_load_from_file(kcpp_data->model_filename.c_str(), model_params); if(overwriteRope) { diff --git a/include/llama.h b/include/llama.h index 0295a51fb..e8dd55f66 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1,3 +1,5 @@ +#pragma once + #ifndef LLAMA_H #define LLAMA_H diff --git a/koboldcpp.py b/koboldcpp.py index 3279fde60..6f878191d 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -53,6 +53,7 @@ fullsdmodelpath = "" #if empty, it's not initialized mmprojpath = "" #if empty, it's not initialized password = "" #if empty, no auth key required fullwhispermodelpath = "" #if empty, it's not initialized +ttsmodelpath = "" #if empty, not initialized maxctx = 4096 maxhordectx = 4096 maxhordelen = 400 @@ -281,6 +282,26 @@ class whisper_generation_outputs(ctypes.Structure): _fields_ = [("status", ctypes.c_int), ("data", ctypes.c_char_p)] +class tts_load_model_inputs(ctypes.Structure): + _fields_ = [("ttc_model_filename", ctypes.c_char_p), + ("cts_model_filename", ctypes.c_char_p), + ("executable_path", ctypes.c_char_p), + ("clblast_info", ctypes.c_int), + ("cublas_info", ctypes.c_int), + ("vulkan_info", ctypes.c_char_p), + ("gpulayers", ctypes.c_int), + ("debugmode", ctypes.c_int)] + +class tts_generation_inputs(ctypes.Structure): + _fields_ = [("prompt", ctypes.c_char_p), + ("speaker_seed", ctypes.c_int), + ("audio_seed", ctypes.c_int), + ("quiet", ctypes.c_bool)] + +class tts_generation_outputs(ctypes.Structure): + _fields_ = [("status", ctypes.c_int), + ("data", ctypes.c_char_p)] + def getdirpath(): return os.path.dirname(os.path.realpath(__file__)) def getabspath(): @@ -440,6 +461,10 @@ def init_library(): handle.whisper_load_model.restype = ctypes.c_bool handle.whisper_generate.argtypes = [whisper_generation_inputs] handle.whisper_generate.restype = whisper_generation_outputs + handle.tts_load_model.argtypes = [tts_load_model_inputs] + handle.tts_load_model.restype = ctypes.c_bool + handle.tts_generate.argtypes = [tts_generation_inputs] + handle.tts_generate.restype = tts_generation_outputs handle.last_logprobs.restype = last_logprobs_outputs handle.detokenize.argtypes = [token_count_outputs] handle.detokenize.restype = ctypes.c_char_p @@ -577,9 +602,13 @@ def utfprint(str, importance = 2): #0 = only debugmode, 1 = except quiet, 2 = al maxlen = 32000 if args.debugmode >= 1: maxlen = 64000 - strlength = len(str) - if strlength > maxlen: #limit max output len - str = str[:maxlen] + f"... 
(+{strlength-maxlen} chars)" + try: + strlength = len(str) + if strlength > maxlen: #limit max output len + str = str[:maxlen] + f"... (+{strlength-maxlen} chars)" + except Exception: + pass + try: print(str) except UnicodeEncodeError: @@ -647,13 +676,14 @@ def read_gguf_metadata(file_path): except Exception: return None -def extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,draftmodelpath): +def extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,draftmodelpath,ttsmodelpath): global modelfile_extracted_meta modelfile_extracted_meta = None sdfsize = 0 whisperfsize = 0 mmprojsize = 0 draftmodelsize = 0 + ttsmodelsize = 0 if sdfilepath and os.path.exists(sdfilepath): sdfsize = os.path.getsize(sdfilepath) if whisperfilepath and os.path.exists(whisperfilepath): @@ -662,12 +692,14 @@ def extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath, mmprojsize = os.path.getsize(mmprojfilepath) if draftmodelpath and os.path.exists(draftmodelpath): draftmodelsize = os.path.getsize(draftmodelpath) + if ttsmodelpath and os.path.exists(ttsmodelpath): + ttsmodelsize = os.path.getsize(ttsmodelpath) if filepath and os.path.exists(filepath): try: fsize = os.path.getsize(filepath) if fsize>10000000: #dont bother with models < 10mb as they are probably bad ggufmeta = read_gguf_metadata(filepath) - modelfile_extracted_meta = [ggufmeta,fsize,sdfsize,whisperfsize,mmprojsize,draftmodelsize] #extract done. note that meta may be null + modelfile_extracted_meta = [ggufmeta,fsize,sdfsize,whisperfsize,mmprojsize,draftmodelsize,ttsmodelsize] #extract done. note that meta may be null except Exception: modelfile_extracted_meta = None @@ -699,6 +731,8 @@ def autoset_gpu_layers(ctxsize,sdquanted,bbs): #shitty algo to determine how man mem -= 350*1024*1024 if modelfile_extracted_meta[5] > 1024*1024*10: #draft model tax mem -= (modelfile_extracted_meta[5] * 1.5) + if modelfile_extracted_meta[6] > 1024*1024*10: #tts model tax + mem -= max(600*1024*1024, modelfile_extracted_meta[6] * 3) mem = 0 if mem < 0 else mem csmul = 1.0 @@ -730,6 +764,8 @@ def fetch_gpu_properties(testCL,testCU,testVK): FetchedCUdevices = [] FetchedCUdeviceMem = [] FetchedCUfreeMem = [] + faileddetectvram = False + AMDgpu = None try: # Get NVIDIA GPU names output = subprocess.run(['nvidia-smi','--query-gpu=name,memory.total,memory.free','--format=csv,noheader'], capture_output=True, text=True, check=True, encoding='utf-8').stdout @@ -737,6 +773,10 @@ def fetch_gpu_properties(testCL,testCU,testVK): FetchedCUdeviceMem = [line.split(",")[1].strip().split(" ")[0].strip() for line in output.splitlines()] FetchedCUfreeMem = [line.split(",")[2].strip().split(" ")[0].strip() for line in output.splitlines()] except Exception: + FetchedCUdevices = [] + FetchedCUdeviceMem = [] + FetchedCUfreeMem = [] + faileddetectvram = True pass if len(FetchedCUdevices)==0: try: # Get AMD ROCm GPU names @@ -756,18 +796,30 @@ def fetch_gpu_properties(testCL,testCU,testVK): if getamdvram: FetchedCUdeviceMem = [line.split(",")[1].strip() for line in getamdvram.splitlines()[1:] if line.strip()] except Exception: + FetchedCUdevices = [] + FetchedCUdeviceMem = [] + FetchedCUfreeMem = [] + faileddetectvram = True pass lowestcumem = 0 lowestfreecumem = 0 - for idx in range(0,4): - if(len(FetchedCUdevices)>idx): - CUDevicesNames[idx] = FetchedCUdevices[idx] - if len(FetchedCUdeviceMem)>idx: - dmem = int(FetchedCUdeviceMem[idx]) if AMDgpu else (int(FetchedCUdeviceMem[idx])*1024*1024) - lowestcumem = dmem if lowestcumem==0 
else (dmem if dmemidx: - dmem = (int(FetchedCUfreeMem[idx])*1024*1024) - lowestfreecumem = dmem if lowestfreecumem==0 else (dmem if dmemidx): + CUDevicesNames[idx] = FetchedCUdevices[idx] + if len(FetchedCUdeviceMem)>idx: + dmem = int(FetchedCUdeviceMem[idx]) if AMDgpu else (int(FetchedCUdeviceMem[idx])*1024*1024) + lowestcumem = dmem if lowestcumem==0 else (dmem if dmemidx: + dmem = (int(FetchedCUfreeMem[idx])*1024*1024) + lowestfreecumem = dmem if lowestfreecumem==0 else (dmem if dmem=0 and rawcountdata.count<50000) else 0 @@ -1738,10 +1818,11 @@ def LaunchWebbrowser(target_url, failedmsg): try: import webbrowser as wb if wb.open(target_url, autoraise=True): - return + return raise RuntimeError("Cannot open default browser") - except Exception: + except Exception as e: try: + print(f"Browser failed to launch: {e}, attempting to use xdg-open...") import webbrowser as wb if wb.get('xdg-open').open(target_url, autoraise=True): return @@ -2102,7 +2183,7 @@ Enter Prompt:
def do_GET(self): global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui - global has_multiplayer, multiplayer_turn_major, multiplayer_turn_minor, multiplayer_story_data_compressed, multiplayer_dataformat, multiplayer_lastactive, maxctx, maxhordelen, friendlymodelname, lastgeneratedcomfyimg, KcppVersion, totalgens, preloaded_story, exitcounter, currentusergenkey, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath + global has_multiplayer, multiplayer_turn_major, multiplayer_turn_minor, multiplayer_story_data_compressed, multiplayer_dataformat, multiplayer_lastactive, maxctx, maxhordelen, friendlymodelname, lastgeneratedcomfyimg, KcppVersion, totalgens, preloaded_story, exitcounter, currentusergenkey, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath self.path = self.path.rstrip('/') response_body = None content_type = 'application/json' @@ -2160,7 +2241,8 @@ Enter Prompt:
has_password = (password!="") has_whisper = (fullwhispermodelpath!="") has_search = True if args.websearch else False - response_body = (json.dumps({"result":"KoboldCpp","version":KcppVersion, "protected":has_password ,"txt2img":has_txt2img,"vision":has_vision,"transcribe":has_whisper,"multiplayer":has_multiplayer,"websearch":has_search}).encode()) + has_tts = (ttsmodelpath!="") + response_body = (json.dumps({"result":"KoboldCpp","version":KcppVersion, "protected":has_password ,"txt2img":has_txt2img,"vision":has_vision,"transcribe":has_whisper,"multiplayer":has_multiplayer,"websearch":has_search,"tts":has_tts}).encode()) elif self.path.endswith(('/api/extra/perf')): global last_req_time, start_time @@ -2521,7 +2603,7 @@ Enter Prompt:
reqblocking = False muint = int(args.multiuser) - if muint<=0 and ((args.whispermodel and args.whispermodel!="") or (args.sdmodel and args.sdmodel!="")): + if muint<=0 and ((args.whispermodel and args.whispermodel!="") or (args.sdmodel and args.sdmodel!="") or (args.ttsmodel and args.ttsmodel!="")): muint = 2 # this prevents errors when using voice/img together with text multiuserlimit = ((muint-1) if muint > 1 else 6) #backwards compatibility for up to 7 concurrent requests, use default limit of 7 if multiuser set to 1 @@ -2546,6 +2628,7 @@ Enter Prompt:
is_imggen = False is_comfyui_imggen = False is_transcribe = False + is_tts = False if self.path.endswith('/request'): api_format = 1 @@ -2588,11 +2671,14 @@ Enter Prompt:
if self.path.endswith('/api/extra/transcribe') or self.path.endswith('/v1/audio/transcriptions'): is_transcribe = True - if is_imggen or is_transcribe or api_format > 0: + if self.path.endswith('/api/extra/tts') or self.path.endswith('/v1/audio/speech'): + is_tts = True + + if is_imggen or is_transcribe or is_tts or api_format > 0: global last_req_time last_req_time = time.time() - if not is_imggen and not is_transcribe and api_format!=5: + if not is_imggen and not is_transcribe and not is_tts and api_format!=5: if not self.secure_endpoint(): return @@ -2680,6 +2766,21 @@ Enter Prompt:
print("Transcribe: The response could not be sent, maybe connection was terminated?") time.sleep(0.2) #short delay return + elif is_tts: + try: + gen = tts_generate(genparams) + wav_data = b'' + if gen: + wav_data = base64.b64decode(gen) # Decode the Base64 string into binary data + self.send_response(200) + self.send_header('content-length', str(len(wav_data))) # Set content length + self.end_headers(content_type='audio/wav') + self.wfile.write(wav_data) # Write the binary WAV data to the response + except Exception as ex: + utfprint(ex,0) + print("TTS: The response could not be sent, maybe connection was terminated?") + time.sleep(0.2) #short delay + return finally: time.sleep(0.05) @@ -2806,7 +2907,7 @@ def show_gui(): if dlfile: args.model_param = dlfile load_config_cli(args.model_param) - if not args.model_param and not args.sdmodel and not args.whispermodel and not args.nomodel: + if not args.model_param and not args.sdmodel and not args.whispermodel and not args.ttsmodel and not args.nomodel: global exitcounter exitcounter = 999 exit_with_error(2,"No ggml model or kcpps file was selected. Exiting.") @@ -3008,6 +3109,9 @@ def show_gui(): sd_quant_var = ctk.IntVar(value=0) whisper_model_var = ctk.StringVar() + tts_model_var = ctk.StringVar() + wavtokenizer_var = ctk.StringVar() + ttsgpu_var = ctk.IntVar(value=0) def tabbuttonaction(name): for t in tabcontent: @@ -3158,7 +3262,8 @@ def show_gui(): whisperfilepath = whisper_model_var.get() mmprojfilepath = mmproj_var.get() draftmodelpath = draftmodel_var.get() - extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,draftmodelpath) + ttsmodelpath = tts_model_var.get() if ttsgpu_var.get()==1 else "" + extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,draftmodelpath,ttsmodelpath) changed_gpulayers_estimate() pass @@ -3575,8 +3680,14 @@ def show_gui(): # audio tab audio_tab = tabcontent["Audio"] - makefileentry(audio_tab, "Whisper Model (Speech-To-Text):", "Select Whisper .bin Model File", whisper_model_var, 1, width=280, filetypes=[("*.bin","*.bin")], tooltiptxt="Select a Whisper .bin model file on disk to be loaded.") + makefileentry(audio_tab, "Whisper Model (Speech-To-Text):", "Select Whisper .bin Model File", whisper_model_var, 1, width=280, filetypes=[("*.bin","*.bin")], tooltiptxt="Select a Whisper .bin model file on disk to be loaded for Voice Recognition.") whisper_model_var.trace("w", gui_changed_modelfile) + makefileentry(audio_tab, "OuteTTS Model (Text-To-Speech):", "Select OuteTTS GGUF Model File", tts_model_var, 3, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a OuteTTS GGUF model file on disk to be loaded for Narration.") + tts_model_var.trace("w", gui_changed_modelfile) + makefileentry(audio_tab, "WavTokenizer Model (Text-To-Speech):", "Select WavTokenizer GGUF Model File", wavtokenizer_var, 5, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a WavTokenizer GGUF model file on disk to be loaded for Narration.") + wavtokenizer_var.trace("w", gui_changed_modelfile) + makecheckbox(audio_tab, "TTS Use GPU", ttsgpu_var, 7, 0,tooltiptxt="Uses the GPU for TTS.") + ttsgpu_var.trace("w", gui_changed_modelfile) def kcpp_export_template(): nonlocal kcpp_exporting_template @@ -3625,7 +3736,7 @@ def show_gui(): # launch def guilaunch(): - if model_var.get() == "" and sd_model_var.get() == "" and whisper_model_var.get() == "" and nomodel.get()!=1: + if model_var.get() == "" and sd_model_var.get() == "" and whisper_model_var.get() == "" and tts_model_var.get() == 
"" and nomodel.get()!=1: tmp = askopenfilename(title="Select ggml model .bin or .gguf file") model_var.set(tmp) nonlocal nextstate @@ -3792,6 +3903,11 @@ def show_gui(): if whisper_model_var.get() != "": args.whispermodel = whisper_model_var.get() + if tts_model_var.get() != "" and wavtokenizer_var.get() != "": + args.ttsmodel = tts_model_var.get() + args.ttswavtokenizer = wavtokenizer_var.get() + args.ttsgpu = (ttsgpu_var.get()==1) + def import_vars(dict): global importvars_in_progress importvars_in_progress = True @@ -3952,6 +4068,10 @@ def show_gui(): whisper_model_var.set(dict["whispermodel"] if ("whispermodel" in dict and dict["whispermodel"]) else "") + tts_model_var.set(dict["ttsmodel"] if ("ttsmodel" in dict and dict["ttsmodel"]) else "") + wavtokenizer_var.set(dict["ttswavtokenizer"] if ("ttswavtokenizer" in dict and dict["ttswavtokenizer"]) else "") + ttsgpu_var.set(dict["ttsgpu"] if ("ttsgpu" in dict) else 0) + importvars_in_progress = False gui_changed_modelfile() if "istemplate" in dict and dict["istemplate"]: @@ -4022,7 +4142,7 @@ def show_gui(): kcpp_exporting_template = False export_vars() - if not args.model_param and not args.sdmodel and not args.whispermodel and not args.nomodel: + if not args.model_param and not args.sdmodel and not args.whispermodel and not args.ttsmodel and not args.nomodel: exitcounter = 999 print("") time.sleep(0.5) @@ -4566,7 +4686,7 @@ def analyze_gguf_model_wrapper(filename=""): def main(launch_args,start_server=True): global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui - global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath + global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath args = launch_args if (args.version) and len(sys.argv) <= 2: @@ -4629,7 +4749,7 @@ def main(launch_args,start_server=True): if not args.model_param: args.model_param = args.model - if args.showgui or (not args.model_param and not args.sdmodel and not args.whispermodel and not args.nomodel): + if args.showgui or (not args.model_param and not args.sdmodel and not args.whispermodel and not args.ttsmodel and not args.nomodel): #give them a chance to pick a file print("For command line arguments, please refer to --help") print("***") @@ -4753,6 +4873,14 @@ def main(launch_args,start_server=True): dlfile = download_model_from_url(args.draftmodel,[".gguf"]) if dlfile: args.draftmodel = dlfile + if args.ttsmodel and args.ttsmodel!="": + dlfile = download_model_from_url(args.ttsmodel,[".gguf"]) + if dlfile: + args.ttsmodel = dlfile + if args.ttswavtokenizer and args.ttswavtokenizer!="": + dlfile = download_model_from_url(args.ttswavtokenizer,[".gguf"]) + if dlfile: + args.ttswavtokenizer = dlfile # sanitize and replace the default vanity name. remember me.... 
if args.model_param and args.model_param!="": @@ -4830,7 +4958,7 @@ def main(launch_args,start_server=True): pass if args.gpulayers==-1: if MaxMemory[0] > 0 and (not args.usecpu) and ((args.usecublas is not None) or (args.usevulkan is not None) or (args.useclblast is not None) or sys.platform=="darwin"): - extract_modelfile_params(args.model_param,args.sdmodel,args.whispermodel,args.mmproj,args.draftmodel) + extract_modelfile_params(args.model_param,args.sdmodel,args.whispermodel,args.mmproj,args.draftmodel,args.ttsmodel if args.ttsgpu else "") layeramt = autoset_gpu_layers(args.contextsize,args.sdquant,args.blasbatchsize) print(f"Auto Recommended GPU Layers: {layeramt}") args.gpulayers = layeramt @@ -4999,6 +5127,27 @@ def main(launch_args,start_server=True): exitcounter = 999 exit_with_error(3,"Could not load whisper model: " + whispermodel) + #handle tts model + if args.ttsmodel and args.ttsmodel!="" and args.ttswavtokenizer and args.ttswavtokenizer!="": + if not os.path.exists(args.ttsmodel) or not os.path.exists(args.ttswavtokenizer): + if args.ignoremissing: + print("Ignoring missing TTS model files!") + args.ttsmodel = None + args.ttswavtokenizer = None + else: + exitcounter = 999 + exit_with_error(2,f"Cannot find tts model files: {args.ttsmodel} or {args.ttswavtokenizer}") + else: + ttsmodelpath = args.ttsmodel + ttsmodelpath = os.path.abspath(ttsmodelpath) + wavtokpath = args.ttswavtokenizer + wavtokpath = os.path.abspath(wavtokpath) + loadok = tts_load_model(ttsmodelpath,wavtokpath) + print("Load TTS Model OK: " + str(loadok)) + if not loadok: + exitcounter = 999 + exit_with_error(3,"Could not load TTS model!") + #load embedded lite try: @@ -5296,7 +5445,12 @@ if __name__ == '__main__': sdparsergroup.add_argument("--sdnotile", help="Disables VAE tiling, may not work for large images.", action='store_true') whisperparsergroup = parser.add_argument_group('Whisper Transcription Commands') - whisperparsergroup.add_argument("--whispermodel", metavar=('[filename]'), help="Specify a Whisper bin model to enable Speech-To-Text transcription.", default="") + whisperparsergroup.add_argument("--whispermodel", metavar=('[filename]'), help="Specify a Whisper .bin model to enable Speech-To-Text transcription.", default="") + + ttsparsergroup = parser.add_argument_group('TTS Narration Commands') + ttsparsergroup.add_argument("--ttsmodel", metavar=('[filename]'), help="Specify the OuteTTS Text-To-Speech GGUF model.", default="") + ttsparsergroup.add_argument("--ttswavtokenizer", metavar=('[filename]'), help="Specify the WavTokenizer GGUF model.", default="") + ttsparsergroup.add_argument("--ttsgpu", help="Use the GPU for TTS.", action='store_true') deprecatedgroup = parser.add_argument_group('Deprecated Commands, DO NOT USE!') deprecatedgroup.add_argument("--hordeconfig", help=argparse.SUPPRESS, nargs='+') diff --git a/model_adapter.h b/model_adapter.h index a0e921cb3..2b7f566a7 100644 --- a/model_adapter.h +++ b/model_adapter.h @@ -105,6 +105,9 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs); bool whispertype_load_model(const whisper_load_model_inputs inputs); whisper_generation_outputs whispertype_generate(const whisper_generation_inputs inputs); +bool ttstype_load_model(const tts_load_model_inputs inputs); +tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs); + void timer_start(); double timer_check(); void print_tok_vec(std::vector &embd); diff --git a/otherarch/ggml_v3.h b/otherarch/ggml_v3.h index cd8ed48b1..6a354efac 100644 --- 
a/otherarch/ggml_v3.h +++ b/otherarch/ggml_v3.h @@ -188,13 +188,8 @@ #endif // TODO: support for clang -#ifdef __GNUC__ -# define GGML_V3_DEPRECATED(func, hint) func __attribute__((deprecated(hint))) -#elif defined(_MSC_VER) -# define GGML_V3_DEPRECATED(func, hint) __declspec(deprecated(hint)) func -#else # define GGML_V3_DEPRECATED(func, hint) func -#endif + #ifndef __GNUC__ # define GGML_V3_ATTRIBUTE_FORMAT(...) diff --git a/otherarch/llama_v2.cpp b/otherarch/llama_v2.cpp index a1229fa09..ccd68288c 100644 --- a/otherarch/llama_v2.cpp +++ b/otherarch/llama_v2.cpp @@ -436,19 +436,23 @@ struct llama_v2_file_loader { uint32_t magic = file.read_u32(); uint32_t version = 0; - if (magic != 'ggml') { + uint32_t magic_ggjt = 0x67676a74u; // 'ggjt' + uint32_t magic_ggmf = 0x67676d66u; // 'ggmf' + uint32_t magic_ggml = 0x67676d6cu; // 'ggml' + + if (magic != magic_ggml) { version = file.read_u32(); } - if (magic == 'ggml' && version == 0) { + if (magic == magic_ggml && version == 0) { file_version = LLAMA_V2_FILE_VERSION_GGML; - } else if (magic == 'ggmf' && version == 1) { + } else if (magic == magic_ggmf && version == 1) { file_version = LLAMA_V2_FILE_VERSION_GGMF_V1; - } else if (magic == 'ggjt' && version == 1) { + } else if (magic == magic_ggjt && version == 1) { file_version = LLAMA_V2_FILE_VERSION_GGJT_V1; - } else if (magic == 'ggjt' && version == 2) { + } else if (magic == magic_ggjt && version == 2) { file_version = LLAMA_V2_FILE_VERSION_GGJT_V2; - } else if (magic == 'ggjt' && version == 3) { + } else if (magic == magic_ggjt && version == 3) { file_version = LLAMA_V2_FILE_VERSION_GGJT_V3; } else { throw format_old("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?", @@ -553,7 +557,8 @@ struct llama_v2_file_saver { write_vocab(); } void write_magic() { - file.write_u32(LLAMA_V2_FILE_MAGIC); // magic + uint32_t magic_ggjt = 0x67676a74u; // 'ggjt' + file.write_u32(magic_ggjt); // magic file.write_u32(LLAMA_V2_FILE_VERSION); // version } void write_hparams(enum llama_v2_ftype new_ftype) { @@ -2308,7 +2313,8 @@ int llama_v2_apply_lora_from_file_internal(struct llama_v2_context * ctx, const { uint32_t magic; fin.read((char *) &magic, sizeof(magic)); - if (magic != 'ggla') { + uint32_t magic_ggla = 0x67676c61u; // 'ggla' + if (magic != magic_ggla) { fprintf(stderr, "%s: bad file magic\n", __func__); return 1; } @@ -2800,85 +2806,6 @@ size_t llama_v2_set_state_data(struct llama_v2_context * ctx, const uint8_t * sr return nread; } -bool llama_v2_load_session_file(struct llama_v2_context * ctx, const char * path_session, llama_v2_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - llama_v2_file file(path_session, "rb"); - - // sanity checks - { - const uint32_t magic = file.read_u32(); - const uint32_t version = file.read_u32(); - - if (magic != LLAMA_V2_SESSION_MAGIC || version != LLAMA_V2_SESSION_VERSION) { - fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version); - return false; - } - - llama_v2_hparams session_hparams; - file.read_raw(&session_hparams, sizeof(llama_v2_hparams)); - - if (session_hparams != ctx->model.hparams) { - fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__); - return false; - } - } - - // load the prompt - { - const uint32_t n_token_count = file.read_u32(); - - if (n_token_count > n_token_capacity) { - fprintf(stderr, "%s : token count in session file exceeded capacity! 
%u > %zu\n", __func__, n_token_count, n_token_capacity); - return false; - } - - file.read_raw(tokens_out, sizeof(llama_v2_token) * n_token_count); - *n_token_count_out = n_token_count; - } - - // restore the context state - { - const size_t n_state_size_cur = file.size - file.tell(); - const size_t n_state_size_max = llama_v2_get_state_size(ctx); - - if (n_state_size_cur > n_state_size_max) { - fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur); - return false; - } - - std::vector state_data(n_state_size_max); - file.read_raw(state_data.data(), n_state_size_cur); - - llama_v2_set_state_data(ctx, state_data.data()); - } - - return true; -} - -bool llama_v2_save_session_file(struct llama_v2_context * ctx, const char * path_session, const llama_v2_token * tokens, size_t n_token_count) { - llama_v2_file file(path_session, "wb"); - - file.write_u32(LLAMA_V2_SESSION_MAGIC); - file.write_u32(LLAMA_V2_SESSION_VERSION); - - file.write_raw(&ctx->model.hparams, sizeof(llama_v2_hparams)); - - // save the prompt - file.write_u32((uint32_t) n_token_count); - file.write_raw(tokens, sizeof(llama_v2_token) * n_token_count); - - // save the context state - { - const size_t n_state_size_max = llama_v2_get_state_size(ctx); - - std::vector state_data(n_state_size_max); - const size_t n_state_size_cur = llama_v2_copy_state_data(ctx, state_data.data()); - - file.write_raw(state_data.data(), n_state_size_cur); - } - - return true; -} - int llama_v2_eval( struct llama_v2_context * ctx, const llama_v2_token * tokens, diff --git a/otherarch/llama_v2.h b/otherarch/llama_v2.h index 2b1cfc725..cc18ed88e 100644 --- a/otherarch/llama_v2.h +++ b/otherarch/llama_v2.h @@ -140,10 +140,6 @@ extern "C" { // Returns the number of bytes read LLAMA_V2_API size_t llama_v2_set_state_data(struct llama_v2_context * ctx, const uint8_t * src); - // Save/load session file - LLAMA_V2_API bool llama_v2_load_session_file(struct llama_v2_context * ctx, const char * path_session, llama_v2_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); - LLAMA_V2_API bool llama_v2_save_session_file(struct llama_v2_context * ctx, const char * path_session, const llama_v2_token * tokens, size_t n_token_count); - // Run the llama inference to obtain the logits and probabilities for the next token. // tokens + n_tokens is the provided batch of new tokens to process // n_past is the number of tokens to use from previous eval calls @@ -167,7 +163,7 @@ extern "C" { int n_max_tokens, bool add_bos); - + std::vector legacy_llama_v2_tokenize(struct llama_v2_context * ctx, const std::string & text, bool add_bos); LLAMA_V2_API int llama_v2_n_vocab(const struct llama_v2_context * ctx); diff --git a/otherarch/rwkv_v2.cpp b/otherarch/rwkv_v2.cpp index 7d2065eaa..ffc159d62 100644 --- a/otherarch/rwkv_v2.cpp +++ b/otherarch/rwkv_v2.cpp @@ -126,7 +126,7 @@ struct rwkv_v2_model { // Finds model parameter by key and sets it into dest. // If the parameter was not found, returns false. 
-bool rwkv_v2_set_parameter(std::unordered_map * parameters, char * key, struct ggml_v2_tensor ** dest) { +bool rwkv_v2_set_parameter(std::unordered_map * parameters, const char * key, struct ggml_v2_tensor ** dest) { struct ggml_v2_tensor * parameter = (*parameters)[key]; RWKV_V2_ASSERT_FALSE(parameter != NULL, "Parameter %s not found in model file", key); *dest = parameter; @@ -135,7 +135,7 @@ bool rwkv_v2_set_parameter(std::unordered_map * parameters, int32_t block_index, char * key, struct ggml_v2_tensor ** dest) { +bool rwkv_v2_set_block_parameter(std::unordered_map * parameters, int32_t block_index, const char * key, struct ggml_v2_tensor ** dest) { char full_key[128]; sprintf(full_key, "blocks.%d.%s", block_index, key); return rwkv_v2_set_parameter(parameters, full_key, dest); diff --git a/otherarch/sdcpp/sdtype_adapter.cpp b/otherarch/sdcpp/sdtype_adapter.cpp index 5b7ab7605..687f1589d 100644 --- a/otherarch/sdcpp/sdtype_adapter.cpp +++ b/otherarch/sdcpp/sdtype_adapter.cpp @@ -112,28 +112,6 @@ static sd_ctx_t * sd_ctx = nullptr; static int sddebugmode = 0; static std::string recent_data = ""; -std::string base64_encode(const unsigned char* data, unsigned int data_length) { - const std::string base64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; - std::string encoded; - encoded.reserve(((data_length + 2) / 3) * 4); - for (unsigned int i = 0; i < data_length; i += 3) { - unsigned int triple = (data[i] << 16) + (i + 1 < data_length ? data[i + 1] << 8 : 0) + (i + 2 < data_length ? data[i + 2] : 0); - encoded.push_back(base64_chars[(triple >> 18) & 0x3F]); - encoded.push_back(base64_chars[(triple >> 12) & 0x3F]); - if (i + 1 < data_length) { - encoded.push_back(base64_chars[(triple >> 6) & 0x3F]); - } else { - encoded.push_back('='); - } - if (i + 2 < data_length) { - encoded.push_back(base64_chars[triple & 0x3F]); - } else { - encoded.push_back('='); - } - } - return encoded; -} - static std::string sdplatformenv, sddeviceenv, sdvulkandeviceenv; static bool notiling = false; bool sdtype_load_model(const sd_load_model_inputs inputs) { @@ -553,7 +531,7 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs) unsigned char * png = stbi_write_png_to_mem(results[i].data, 0, results[i].width, results[i].height, results[i].channel, &out_data_len, ""); if (png != NULL) { - recent_data = base64_encode(png,out_data_len); + recent_data = kcpp_base64_encode(png,out_data_len); free(png); } diff --git a/otherarch/tts_adapter.cpp b/otherarch/tts_adapter.cpp new file mode 100644 index 000000000..faef59d9c --- /dev/null +++ b/otherarch/tts_adapter.cpp @@ -0,0 +1,672 @@ +#include "model_adapter.h" +#include "otherarch/utils.h" + +#include "common.h" +#include "sampling.h" +#include "llama.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "src/llama-context.h" + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +struct wav_header { + char riff[4] = {'R', 'I', 'F', 'F'}; + uint32_t chunk_size; + char wave[4] = {'W', 'A', 'V', 'E'}; + char fmt[4] = {'f', 'm', 't', ' '}; + uint32_t fmt_chunk_size = 16; + uint16_t audio_format = 1; // PCM + uint16_t num_channels = 1; // Mono + uint32_t sample_rate; + uint32_t byte_rate; + uint16_t block_align; + uint16_t bits_per_sample = 16; + char data[4] = {'d', 'a', 't', 'a'}; + uint32_t data_size; +}; + +static std::string save_wav16_base64(const std::vector &data, int sample_rate) { + std::ostringstream oss; + wav_header 
header; + + // Fill header fields + header.sample_rate = sample_rate; + header.byte_rate = header.sample_rate * header.num_channels * (header.bits_per_sample / 8); + header.block_align = header.num_channels * (header.bits_per_sample / 8); + header.data_size = data.size() * (header.bits_per_sample / 8); + header.chunk_size = 36 + header.data_size; + + // Write header + oss.write(reinterpret_cast(&header), sizeof(header)); + + // Write samples + for (const auto &sample : data) { + int16_t pcm_sample = static_cast(std::clamp(sample * 32767.0, -32768.0, 32767.0)); + oss.write(reinterpret_cast(&pcm_sample), sizeof(pcm_sample)); + } + + // Get binary WAV data + std::string wav_data = oss.str(); + return kcpp_base64_encode(wav_data); //return as base64 string +} + +static void fill_hann_window(int length, bool periodic, float * output) { + int offset = -1; + if (periodic) { + offset = 0; + } + for (int i = 0; i < length; i++) { + output[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset))); + } +} + +// very poor-man fft +static void twiddle(float * real, float * imag, int k, int N) { + float angle = 2 * M_PI * k / N; + *real = cos(angle); + *imag = sin(angle); +} + +static void irfft(int n, const float * inp_cplx, float * out_real) { + int N = n / 2 + 1; + + std::vector real_input(N); + std::vector imag_input(N); + for (int i = 0; i < N; ++i) { + real_input[i] = inp_cplx[2 * i]; + imag_input[i] = inp_cplx[2 * i + 1]; + } + + std::vector real_output(n); + std::vector imag_output(n); + + for (int k = 0; k < n; ++k) { + real_output[k] = 0.0f; + imag_output[k] = 0.0f; + for (int m = 0; m < N; ++m) { + float twiddle_real; + float twiddle_imag; + + twiddle(&twiddle_real, &twiddle_imag, k * m, n); + + real_output[k] += real_input[m] * twiddle_real - imag_input[m] * twiddle_imag; + imag_output[k] += real_input[m] * twiddle_imag + imag_input[m] * twiddle_real; + } + } + + for (int i = 0; i < n; ++i) { + out_real[i] = real_output[i] / N; + } +} + + +static void fold(const std::vector & data, int64_t n_out, int64_t n_win, int64_t n_hop, int64_t n_pad, std::vector & output) { + int64_t output_height = n_out; + int64_t kernel_w = n_win; + int64_t stride_w = n_hop; + int64_t width = n_out; + + output.resize(width, 0.0f); + + int64_t col_idx = 0; + for (int64_t w_col = 0; w_col < width; ++w_col) { + int64_t start = w_col * stride_w - n_pad; + int64_t end = start + kernel_w; + + for (int64_t w_im = start; w_im < end; ++w_im) { + if (w_im >= 0 && w_im < output_height && col_idx < (int64_t) data.size()) { + output[w_im] += data[col_idx]; + } + col_idx++; + } + } + + output.resize(n_out - 2 * n_pad); +} + +// TODO: not optimized at all +static std::vector embd_to_audio( + const float * embd, + const int n_codes, + const int n_embd, + const int n_thread) { + const int n_fft = 1280; + const int n_hop = 320; + const int n_win = 1280; + const int n_pad = (n_win - n_hop)/2; + const int n_out = (n_codes - 1)*n_hop + n_win; + + std::vector hann(n_fft); + + fill_hann_window(hann.size(), true, hann.data()); + + int n_spec = n_embd*n_codes; + + std::vector E (n_spec); + std::vector S (n_spec); + std::vector ST(n_spec); + + for (int l = 0; l < n_codes; ++l) { + for (int k = 0; k < n_embd; ++k) { + E[k*n_codes + l] = embd[l*n_embd + k]; + } + } + + for (int k = 0; k < n_embd/2; ++k) { + for (int l = 0; l < n_codes; ++l) { + float mag = E[(k )*n_codes + l]; + float phi = E[(k + n_embd/2)*n_codes + l]; + + mag = exp(mag); + + if (mag > 1e2) { + mag = 1e2; + } + S[2*(k*n_codes + l) + 0] = mag*cosf(phi); + S[2*(k*n_codes 
+ l) + 1] = mag*sinf(phi); + } + } + + for (int l = 0; l < n_codes; ++l) { + for (int k = 0; k < n_embd/2; ++k) { + ST[l*n_embd + 2*k + 0] = S[2*(k*n_codes + l) + 0]; + ST[l*n_embd + 2*k + 1] = S[2*(k*n_codes + l) + 1]; + } + } + + std::vector res (n_codes*n_fft); + std::vector hann2(n_codes*n_fft); + + std::vector workers(n_thread); + for (int i = 0; i < n_thread; ++i) { + workers[i] = std::thread([&, i]() { + for (int l = i; l < n_codes; l += n_thread) { + irfft(n_fft, ST.data() + l*n_embd, res.data() + l*n_fft); + for (int j = 0; j < n_fft; ++j) { + res [l*n_fft + j] *= hann[j]; + hann2[l*n_fft + j] = hann[j] * hann[j]; + } + } + }); + } + for (int i = 0; i < n_thread; ++i) { + workers[i].join(); + } + + std::vector audio; + std::vector env; + + fold(res, n_out, n_win, n_hop, n_pad, audio); + fold(hann2, n_out, n_win, n_hop, n_pad, env); // TODO: can be done once + + for (size_t i = 0; i < audio.size(); ++i) { + audio[i] /= env[i]; + } + + return audio; +} + +static const std::map ones = { + {0, "zero"}, {1, "one"}, {2, "two"}, {3, "three"}, {4, "four"}, + {5, "five"}, {6, "six"}, {7, "seven"}, {8, "eight"}, {9, "nine"}, + {10, "ten"}, {11, "eleven"}, {12, "twelve"}, {13, "thirteen"}, {14, "fourteen"}, + {15, "fifteen"}, {16, "sixteen"}, {17, "seventeen"}, {18, "eighteen"}, {19, "nineteen"} +}; + +static const std::map tens = { + {2, "twenty"}, {3, "thirty"}, {4, "forty"}, {5, "fifty"}, + {6, "sixty"}, {7, "seventy"}, {8, "eighty"}, {9, "ninety"} +}; + +// Convert a number less than 1000 to words +static std::string convert_less_than_thousand(int num) { + std::string result; + + if (num >= 100) { + result += ones.at(num / 100) + " hundred "; + num %= 100; + } + + if (num >= 20) { + result += tens.at(num / 10); + if (num % 10 > 0) { + result += "-" + ones.at(num % 10); + } + } else if (num > 0) { + result += ones.at(num); + } + + return result; +} + +static std::string number_to_words(const std::string & number_str) { + try { + size_t decimal_pos = number_str.find('.'); + std::string integer_part = number_str.substr(0, decimal_pos); + + int int_number = std::stoi(integer_part); + std::string result; + + if (int_number == 0) { + result = "zero"; + } else { + if (int_number >= 1000000000) { + int billions = int_number / 1000000000; + result += convert_less_than_thousand(billions) + " billion "; + int_number %= 1000000000; + } + + if (int_number >= 1000000) { + int millions = int_number / 1000000; + result += convert_less_than_thousand(millions) + " million "; + int_number %= 1000000; + } + + if (int_number >= 1000) { + int thousands = int_number / 1000; + result += convert_less_than_thousand(thousands) + " thousand "; + int_number %= 1000; + } + + if (int_number > 0) { + result += convert_less_than_thousand(int_number); + } + } + + // Handle decimal part + if (decimal_pos != std::string::npos) { + result += " point"; + std::string decimal_part = number_str.substr(decimal_pos + 1); + for (char digit : decimal_part) { + result += " " + ones.at(digit - '0'); + } + } + + return result; + } catch (const std::exception& e) { + // Skip if fails + return " "; + } +} + +static std::string replace_numbers_with_words(const std::string & input_text) { + std::regex number_pattern(R"(\d+(\.\d+)?)"); + std::string result; + auto it = std::sregex_iterator(input_text.begin(), input_text.end(), number_pattern); + auto end = std::sregex_iterator(); + + size_t last_pos = 0; + for (std::sregex_iterator i = it; i != end; ++i) { + const std::smatch& match = *i; + result.append(input_text, last_pos, 
+static std::string replace_numbers_with_words(const std::string & input_text) {
+    std::regex number_pattern(R"(\d+(\.\d+)?)");
+    std::string result;
+    auto it = std::sregex_iterator(input_text.begin(), input_text.end(), number_pattern);
+    auto end = std::sregex_iterator();
+
+    size_t last_pos = 0;
+    for (std::sregex_iterator i = it; i != end; ++i) {
+        const std::smatch& match = *i;
+        result.append(input_text, last_pos, match.position() - last_pos);
+        result.append(number_to_words(match.str()));
+        last_pos = match.position() + match.length();
+    }
+    result.append(input_text, last_pos);
+
+    return result;
+}
+
+static std::string process_text(const std::string & text) {
+
+    std::string processed_text = replace_numbers_with_words(text);
+
+    std::transform(processed_text.begin(), processed_text.end(),
+                   processed_text.begin(), ::tolower);
+
+    std::regex special_chars(R"([-_/,\.\\])");
+    processed_text = std::regex_replace(processed_text, special_chars, " ");
+    std::regex non_alpha(R"([^a-z\s])");
+    processed_text = std::regex_replace(processed_text, non_alpha, "");
+    std::regex multiple_spaces(R"(\s+)");
+    processed_text = std::regex_replace(processed_text, multiple_spaces, " ");
+    processed_text = std::regex_replace(processed_text, std::regex(R"(^\s+|\s+$)"), "");
+    processed_text = std::regex_replace(processed_text, std::regex(R"(\s)"), "<|text_sep|>");
+
+    return processed_text;
+}
+
+
+static void prompt_add(llama_tokens & prompt, const llama_tokens & tokens) {
+    prompt.insert(prompt.end(), tokens.begin(), tokens.end());
+}
+static void prompt_add(llama_tokens & prompt, const llama_model * model, const std::string & txt, bool add_special, bool parse_special) {
+    auto tmp = common_tokenize(model, txt, add_special, parse_special);
+    prompt_add(prompt, tmp);
+}
+static void prompt_init(llama_tokens & prompt, const llama_model * model) {
+    prompt.clear();
+    prompt_add(prompt, model, "<|im_start|>\n", true, true);
+}
+
+static std::vector<llama_token> prepare_guide_tokens(const llama_model * model, const std::string& str)
+{
+    const std::string& delimiter = "<|text_sep|>";
+
+    std::vector<llama_token> result;
+    size_t start = 0;
+    size_t end = str.find(delimiter);
+
+    while (end != std::string::npos) {
+        std::string current_word = str.substr(start, end - start);
+        auto tmp = common_tokenize(model, current_word, false, true);
+        result.push_back(tmp[0]);
+        start = end + delimiter.length();
+        end = str.find(delimiter, start);
+    }
+
+    // Add the last part
+    std::string current_word = str.substr(start);
+    auto tmp = common_tokenize(model, current_word, false, true);
+    result.push_back(tmp[0]);
+    return result;
+}
+
+static llama_context * ttc_ctx = nullptr; //text to codes ctx
+static llama_context * cts_ctx = nullptr; //codes to speech
+
+static int ttsdebugmode = 0;
+static std::string ttsplatformenv, ttsdeviceenv, ttsvulkandeviceenv;
+static std::string last_generated_audio = "";
+
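+// Load both TTS models: the OuteTTS text-to-codes LM (ttc_ctx) and the WavTokenizer
+// codes-to-speech vocoder (cts_ctx). Returns false if either context fails to initialize.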
+bool ttstype_load_model(const tts_load_model_inputs inputs)
+{
+    //duplicated from expose.cpp
+    int cl_parseinfo = inputs.clblast_info; //first digit is whether configured, second is platform, third is devices
+    std::string usingclblast = "GGML_OPENCL_CONFIGURED="+std::to_string(cl_parseinfo>0?1:0);
+    putenv((char*)usingclblast.c_str());
+    cl_parseinfo = cl_parseinfo%100; //keep last 2 digits
+    int platform = cl_parseinfo/10;
+    int devices = cl_parseinfo%10;
+    ttsplatformenv = "GGML_OPENCL_PLATFORM="+std::to_string(platform);
+    ttsdeviceenv = "GGML_OPENCL_DEVICE="+std::to_string(devices);
+    putenv((char*)ttsplatformenv.c_str());
+    putenv((char*)ttsdeviceenv.c_str());
+    std::string vulkan_info_raw = inputs.vulkan_info;
+    std::string vulkan_info_str = "";
+    for (size_t i = 0; i < vulkan_info_raw.length(); ++i) {
+        vulkan_info_str += vulkan_info_raw[i];
+        if (i < vulkan_info_raw.length() - 1) {
+            vulkan_info_str += ",";
+        }
+    }
+    if(vulkan_info_str!="")
+    {
+        ttsvulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+vulkan_info_str;
+        putenv((char*)ttsvulkandeviceenv.c_str());
+    }
+
+    llama_backend_init();
+
+    std::string modelfile_ttc = inputs.ttc_model_filename;
+    std::string modelfile_cts = inputs.cts_model_filename;
+    printf("\nLoading TTS Model, OuteTTS: %s \nWavTokenizer: %s \n",modelfile_ttc.c_str(),modelfile_cts.c_str());
+
+    ttsdebugmode = inputs.debugmode;
+
+    // tts init
+    llama_model_params tts_model_params = llama_model_default_params();
+    llama_context_params tts_ctx_params = llama_context_default_params();
+
+    const int nthreads = 4;
+
+    tts_model_params.use_mmap = false;
+    tts_model_params.use_mlock = false;
+    tts_model_params.n_gpu_layers = inputs.gpulayers; //offload if possible
+    tts_model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER;
+    tts_ctx_params.n_ctx = 8192;
+    tts_ctx_params.logits_all = false;
+    tts_ctx_params.offload_kqv = true;
+    tts_ctx_params.n_batch = 8192;
+    tts_ctx_params.n_ubatch = 512;
+    tts_ctx_params.n_threads = nthreads;
+    tts_ctx_params.n_threads_batch = nthreads;
+    tts_ctx_params.flash_attn = false;
+
+    llama_model * ttcmodel = llama_model_load_from_file(modelfile_ttc.c_str(), tts_model_params);
+    ttc_ctx = llama_new_context_with_model(ttcmodel, tts_ctx_params);
+
+    if (ttc_ctx == nullptr) {
+        printf("\nTTS Load Error: Failed to initialize ttc context!\n");
+        return false;
+    }
+
+    llama_model * ctsmodel = llama_model_load_from_file(modelfile_cts.c_str(), tts_model_params);
+
+    tts_ctx_params.embeddings = true; //this requires embeddings instead
+    cts_ctx = llama_new_context_with_model(ctsmodel, tts_ctx_params);
+
+    if (cts_ctx == nullptr) {
+        printf("\nTTS Load Error: Failed to initialize cts context!\n");
+        return false;
+    }
+
+    std::vector<llama_token> tmp = {1, 2, 3, 4};
+    llama_kv_cache_clear(ttc_ctx);
+    auto er = llama_decode(ttc_ctx, llama_batch_get_one(tmp.data(), tmp.size()));
+    if(er!=0)
+    {
+        printf("\nTTS Eval returned nonzero: %d\n",er);
+        return false;
+    }
+
+    printf("\nTTS Load Complete.\n");
+    return true;
+}
+
+tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
+{
+    tts_generation_outputs output;
+
+    if(ttc_ctx==nullptr || cts_ctx==nullptr)
+    {
+        printf("\nWarning: KCPP TTS not initialized!\n");
+        output.data = "";
+        output.status = 0;
+        return output;
+    }
+
+    std::vector<llama_token> codes;
+    std::vector<llama_token> guide_tokens;
+    const llama_model * model_ttc = &(ttc_ctx->model);
+    const llama_model * model_cts = &(cts_ctx->model);
+    const int ttc_n_vocab = llama_n_vocab(model_ttc);
+    std::string prompt = inputs.prompt;
+
+    if(!inputs.quiet)
+    {
+        printf("\nTTS Generating... ");
+    }
+
+    // process prompt and generate voice codes
+
+    std::vector<llama_token> prompt_inp;
+    prompt_init(prompt_inp, model_ttc);
+    prompt_add(prompt_inp, model_ttc, "<|text_start|>", false, true);
+
+    int speaker_seed = inputs.speaker_seed;
+    int audio_seed = inputs.audio_seed;
+    if (speaker_seed <= 0 || speaker_seed==0xFFFFFFFF)
+    {
+        speaker_seed = (((uint32_t)time(NULL)) % 1000000u);
+        if(ttsdebugmode==1)
+        {
+            printf("\nUsing Speaker Seed: %d", speaker_seed);
+        }
+    }
+    if (audio_seed <= 0 || audio_seed==0xFFFFFFFF)
+    {
+        audio_seed = (((uint32_t)time(NULL)) % 1000000u);
+        if(ttsdebugmode==1)
+        {
+            printf("\nUsing Audio Seed: %d", audio_seed);
+        }
+    }
+
+    std::mt19937 tts_rng(audio_seed);
+    std::mt19937 speaker_rng(speaker_seed);
+
+    //add the speaker based on the seed
+    if(speaker_seed>0)
+    {
+        std::string sampletext = "but<|text_sep|>that<|text_sep|>is<|text_sep|>what<|text_sep|>it<|text_sep|>is<|text_sep|>";
+    }
+
+    // convert the input text into the necessary format expected by OuteTTS
+    std::string prompt_clean = process_text(prompt);
+
+    if(prompt_clean.size()==0)
+    {
+        //no input
+        if(!inputs.quiet)
+        {
+            printf("\nTTS sent empty input.\n");
+            output.data = "";
+            output.status = 1;
+            return output;
+        }
+    }
+
+    if(!inputs.quiet && ttsdebugmode==1)
+    {
+        printf("\nInput: %s\n", prompt_clean.c_str());
+    }
+
+    guide_tokens = prepare_guide_tokens(model_ttc,prompt_clean);
+    prompt_add(prompt_inp, model_ttc, prompt_clean, false, true);
+
+    if(!inputs.quiet)
+    {
+        printf(" (%d input words)...", guide_tokens.size());
+    }
+
+    prompt_add(prompt_inp, model_ttc, "<|text_end|>\n", false, true);
+
+    //create batch with tokens for decoding prompt processing
+    llama_kv_cache_clear(ttc_ctx);
+    llama_kv_cache_clear(cts_ctx);
+    kcpp_embd_batch tts_batch = kcpp_embd_batch(prompt_inp, 0, false, true);
+
+    auto evalok = (llama_decode(ttc_ctx, tts_batch.batch)==0);
+    if (!evalok) {
+        printf("\nError: TTS prompt batch processing failed\n");
+        output.data = "";
+        output.status = 0;
+        return output;
+    }
+
+    // main loop
+    int n_decode = 0;
+    int n_predict = 4096; //max 4096 tokens
+
+    bool next_token_uses_guide_token = true;
+
+    while (n_decode <= n_predict)
+    {
+        float * logits = llama_get_logits(ttc_ctx);
+
+        llama_token new_token_id = kcpp_quick_sample(logits,ttc_n_vocab,20,1.0,tts_rng);
+
+        //guide tokens help prevent hallucinations by forcing the TTS to use the correct word
+        if(!guide_tokens.empty() && next_token_uses_guide_token && !llama_token_is_control(model_ttc, new_token_id) && !llama_token_is_eog(model_ttc, new_token_id))
+        {
+            llama_token guide_token = guide_tokens[0];
+            guide_tokens.erase(guide_tokens.begin());
+            new_token_id = guide_token; //ensure correct word fragment is used
+        }
+
+        //this is the token id that always precedes a new word
+        next_token_uses_guide_token = (new_token_id == 198);
+
+        codes.push_back(new_token_id);
+
+        // is it an end of generation? -> mark the stream as finished
+        if (llama_token_is_eog(model_ttc, new_token_id) || n_decode >= n_predict) {
+            break;
+        }
+
+        n_decode += 1;
+        std::vector<llama_token> next = {new_token_id};
+        llama_batch batch = llama_batch_get_one(next.data(), next.size());
+
+        // evaluate the current batch with the transformer model
+        if (llama_decode(ttc_ctx, batch)) {
+            printf("\nError: TTS code generation failed!\n");
+            output.data = "";
+            output.status = 0;
+            return output;
+        }
+    }
+
+    if(!inputs.quiet && ttsdebugmode==1)
+    {
+        const std::string inp_txt = common_detokenize(ttc_ctx, codes, true);
+
+        printf("\nGenerated %d Codes: '%s'\n",codes.size(), inp_txt.c_str());
+    }
+
+    // remove all non-audio tokens (i.e. < 151672 || > 155772)
+    codes.erase(std::remove_if(codes.begin(), codes.end(), [](llama_token t) { return t < 151672 || t > 155772; }), codes.end());
+
+    for (auto & token : codes) {
+        token -= 151672;
+    }
+
+    const int n_codes = codes.size();
+    if(n_codes<=1)
+    {
+        printf("\nWarning: TTS vocoder generated nothing!\n");
+        output.data = "";
+        output.status = 0;
+        return output;
+    }
+    kcpp_embd_batch codebatch = kcpp_embd_batch(codes,0,false,true);
+
+    if (llama_decode(cts_ctx, codebatch.batch) != 0) {
+        printf("\nError: TTS vocoder generation failed!\n");
+        output.data = "";
+        output.status = 0;
+        return output;
+    }
+    else
+    {
+        // spectral operations
+        const int n_embd = llama_n_embd(model_cts);
+        const float * embd = llama_get_embeddings(cts_ctx);
+        std::vector<float> audio = embd_to_audio(embd, n_codes, n_embd, 4);
+
+        const int n_sr = 24000; // sampling rate
+
+        // zero out first 0.05 seconds
+        for (int i = 0; i < 24000/20; ++i) {
+            audio[i] = 0.0f;
+        }
+        //add some silence at the end
+        for (int i = 0; i < 24000/20; ++i) {
+            audio.push_back(0.0f);
+        }
+
+        last_generated_audio = save_wav16_base64(audio, n_sr);
+
+        if(!inputs.quiet)
+        {
+            printf("\nTTS Generated %d audio tokens.\n",(int) codes.size());
+        }
+
+        output.data = last_generated_audio.c_str();
+        output.status = 1;
+        return output;
+    }
+}
diff --git a/otherarch/utils.cpp b/otherarch/utils.cpp
index cfeec8470..f848eeabd 100644
--- a/otherarch/utils.cpp
+++ b/otherarch/utils.cpp
@@ -1,5 +1,6 @@
 #include "utils.h"
 #include "common.h"
+#include "llama.h"
 #include
 #include
@@ -303,6 +304,47 @@ std::vector<uint8_t> kcpp_base64_decode(const std::string & encoded_string)
     return ret;
 }
 
+std::string kcpp_base64_encode(const unsigned char* data, unsigned int data_length) {
+    const std::string base64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+    std::string encoded;
+    encoded.reserve(((data_length + 2) / 3) * 4);
+    for (unsigned int i = 0; i < data_length; i += 3) {
+        unsigned int triple = (data[i] << 16) + (i + 1 < data_length ? data[i + 1] << 8 : 0) + (i + 2 < data_length ? data[i + 2] : 0);
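+        // each group of 3 input bytes yields 4 base64 output characters; '=' pads the final group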
+        encoded.push_back(base64_chars[(triple >> 18) & 0x3F]);
+        encoded.push_back(base64_chars[(triple >> 12) & 0x3F]);
+        if (i + 1 < data_length) {
+            encoded.push_back(base64_chars[(triple >> 6) & 0x3F]);
+        } else {
+            encoded.push_back('=');
+        }
+        if (i + 2 < data_length) {
+            encoded.push_back(base64_chars[triple & 0x3F]);
+        } else {
+            encoded.push_back('=');
+        }
+    }
+    return encoded;
+}
+std::string kcpp_base64_encode(const std::string &data) {
+    static const char lookup[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+    std::string encoded;
+    int val = 0, valb = -6;
+    for (unsigned char c : data) {
+        val = (val << 8) + c;
+        valb += 8;
+        while (valb >= 0) {
+            encoded.push_back(lookup[(val >> valb) & 0x3F]);
+            valb -= 6;
+        }
+    }
+    if (valb > -6) {
+        encoded.push_back(lookup[((val << 8) >> (valb + 8)) & 0x3F]);
+    }
+    while (encoded.size() % 4) {
+        encoded.push_back('=');
+    }
+    return encoded;
+}
 
 std::string get_timestamp_str()
 {
@@ -314,3 +356,150 @@ std::string get_timestamp_str()
     std::string timestamp(buffer);
     return timestamp;
 }
+
+//a very rudimentary all in one sampling function which has no dependencies
+int32_t kcpp_quick_sample(float * logits, const int n_logits, int top_k, float temp, std::mt19937 & rng)
+{
+    if (temp <= 0 || top_k==1) {
+        // select the token with the highest logit directly
+        float max_logit = logits[0];
+        int32_t max_id = 0;
+        for (int i = 1; i < n_logits; ++i) {
+            if (logits[i] > max_logit) {
+                max_logit = logits[i];
+                max_id = i;
+            }
+        }
+        return max_id;
+    }
+
+    top_k = (top_k<=0 || top_k>300)?300:top_k;
+    top_k = std::min(top_k, n_logits);
+
+    std::vector<std::pair<float, int>> logits_id;
+    logits_id.reserve(n_logits);
+
+    //temperature sample
+    const float scale = 1.0f/temp;
+    for (int i = 0; i < n_logits; ++i) {
+        logits_id.push_back(std::make_pair(logits[i]*scale, i));
+    }
+
+    //sample top_k
+    std::partial_sort(
+            logits_id.begin(),
+            logits_id.begin() + top_k, logits_id.end(),
+            [](const std::pair<float, int> & a, const std::pair<float, int> & b) {
+        return a.first > b.first;
+    });
+    logits_id.resize(top_k);
+
+    // compute probs for the top k tokens
+    std::vector<float> probs;
+    probs.reserve(logits_id.size());
+    float maxl = logits_id[0].first;
+    double sum = 0.0;
+    for (const auto & kv : logits_id) {
+        const float p = expf(kv.first - maxl);
+        probs.push_back(p);
+        sum += p;
+    }
+
+    // normalize the probs
+    for (auto & p : probs) {
+        p /= sum;
+    }
+
+    std::discrete_distribution<> dist(probs.begin(), probs.end());
+    int idx = dist(rng);
+
+    return logits_id[idx].second;
+}
+
+kcpp_embd_batch::kcpp_embd_batch(float * embd, int32_t n_tokens, int32_t npast, bool use_mrope)
+{
+    int32_t seq_id = 0;
+    pos.resize(n_tokens * (use_mrope?4:1));
+    std::fill(pos.begin(), pos.end(), 0);
+    n_seq_id.resize(n_tokens);
+    seq_ids.resize(n_tokens + 1);
+    logits.resize(n_tokens);
+    seq_id_0.resize(1);
+    seq_id_0[0] = seq_id;
+    seq_ids [n_tokens] = nullptr;
+    batch = {
+        /*n_tokens =*/ n_tokens,
+        /*tokens   =*/ nullptr,
+        /*embd     =*/ embd,
+        /*pos      =*/ pos.data(),
+        /*n_seq_id =*/ n_seq_id.data(),
+        /*seq_id   =*/ seq_ids.data(),
+        /*logits   =*/ logits.data(),
+    };
+
+    if(!use_mrope)
+    {
+        for (int i = 0; i < n_tokens; i++) {
+            batch.pos     [i] = npast + i;
+            batch.n_seq_id[i] = 1;
+            batch.seq_id  [i] = seq_id_0.data();
+            batch.logits  [i] = false;
+        }
+    }
+    else
+    {
+        for (int i = 0; i < n_tokens; i++) {
+            batch.n_seq_id[i] = 1;
+            batch.seq_id  [i] = seq_id_0.data();
+            batch.logits  [i] = false;
+        }
+        for (int j = 0; j < batch.n_tokens * 3; j++) {
+            batch.pos[j] = npast + (j % batch.n_tokens);
+        }
+    }
+}
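+
+// token-id variant of the batch helper; return_all_logits requests an output for every
+// position (the TTS vocoder reads embeddings for all codes from a single decode call)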
+kcpp_embd_batch::kcpp_embd_batch(std::vector<llama_token> & tokens, int32_t npast, bool use_mrope, bool return_all_logits)
+{
+    int32_t seq_id = 0;
+    int32_t n_tokens = tokens.size();
+    pos.resize(n_tokens * (use_mrope?4:1));
+    std::fill(pos.begin(), pos.end(), 0);
+    n_seq_id.resize(n_tokens);
+    seq_ids.resize(n_tokens + 1);
+    logits.resize(n_tokens);
+    seq_id_0.resize(1);
+    seq_id_0[0] = seq_id;
+    seq_ids[n_tokens] = nullptr;
+    batch = {
+        /*n_tokens =*/ n_tokens,
+        /*tokens   =*/ tokens.data(),
+        /*embd     =*/ nullptr,
+        /*pos      =*/ pos.data(),
+        /*n_seq_id =*/ n_seq_id.data(),
+        /*seq_id   =*/ seq_ids.data(),
+        /*logits   =*/ logits.data(),
+    };
+
+    if(!use_mrope)
+    {
+        for (int i = 0; i < n_tokens; i++) {
+            batch.pos     [i] = npast + i;
+            batch.n_seq_id[i] = 1;
+            batch.seq_id  [i] = seq_id_0.data();
+            batch.logits  [i] = (return_all_logits?true:false);
+        }
+    }
+    else
+    {
+        for (int i = 0; i < n_tokens; i++) {
+            batch.n_seq_id[i] = 1;
+            batch.seq_id  [i] = seq_id_0.data();
+            batch.logits  [i] = (return_all_logits?true:false);
+        }
+        for (int j = 0; j < batch.n_tokens * 3; j++) {
+            batch.pos[j] = npast + (j % batch.n_tokens);
+        }
+    }
+    batch.logits[n_tokens - 1] = true;
+}
\ No newline at end of file
diff --git a/otherarch/utils.h b/otherarch/utils.h
index e0a60c95b..29977c40c 100644
--- a/otherarch/utils.h
+++ b/otherarch/utils.h
@@ -8,6 +8,7 @@
 #include
 #include
 #include "ggml_v3.h"
+#include "llama.h"
 //
 // CLI argument parsing
 //
@@ -52,10 +53,23 @@ void gpt_split_words(std::string str, std::vector<std::string>& words);
 //
 std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
-
 bool should_transpose_layer(std::string name);
 void kcpp_graph_compute_helper(ggml_v3_cgraph * graph, int n_threads);
 std::vector<uint8_t> kcpp_base64_decode(const std::string & encoded_string);
+std::string kcpp_base64_encode(const unsigned char* data, unsigned int data_length);
+std::string kcpp_base64_encode(const std::string &data);
 
-std::string get_timestamp_str();
\ No newline at end of file
+std::string get_timestamp_str();
+int32_t kcpp_quick_sample(float * logits, const int n_logits, int top_k, float temp, std::mt19937 & rng);
+
+struct kcpp_embd_batch { //duplicated from llava_embd_batch
+    std::vector<llama_pos> pos;
+    std::vector<int32_t> n_seq_id;
+    std::vector<llama_seq_id> seq_id_0;
+    std::vector<llama_seq_id *> seq_ids;
+    std::vector<int8_t> logits;
+    llama_batch batch;
+    kcpp_embd_batch(float * embd, int32_t n_tokens, int32_t npast, bool use_mrope);
+    kcpp_embd_batch(std::vector<llama_token> & tokens, int32_t npast, bool use_mrope, bool return_all_logits);
+};
\ No newline at end of file