Fixed some GGUFv1 loading bugs, long overdue cleanup for compiling, integrated TTS

tts is functional (+6 squashed commits)

Squashed commit:

[22396311] wip tts

[3a883027] tts not yet working

[0dcfab0e] fix silly bug

[a378d9ef] some long overdue cleanup

[fc5a6fb5] Wip tts

[39f50497] wip TTS integration
Concedo committed on 2025-01-12 16:33:02 +08:00
parent 12cdcf0abe
commit b3de1598e7
17 changed files with 1175 additions and 271 deletions


@@ -495,7 +495,9 @@ add_library(common2
 examples/llava/clip.h
 src/unicode.h
 src/unicode.cpp
-src/unicode-data.cpp)
+src/unicode-data.cpp
+otherarch/utils.cpp
+otherarch/utils.h)
 target_include_directories(common2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common)
 target_compile_features(common2 PUBLIC cxx_std_17) # don't bump
 target_link_libraries(common2 PRIVATE ggml ${LLAMA_EXTRA_LIBS})
@@ -515,11 +517,18 @@ target_compile_features(whisper_adapter PUBLIC cxx_std_17) # don't bump
 target_link_libraries(whisper_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
 set_target_properties(whisper_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
+add_library(tts_adapter
+otherarch/tts_adapter.cpp)
+target_include_directories(tts_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./examples ./common)
+target_compile_features(tts_adapter PUBLIC cxx_std_17) # don't bump
+target_link_libraries(tts_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
+set_target_properties(tts_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
 add_library(gpttype_adapter
 gpttype_adapter.cpp)
 target_include_directories(gpttype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common)
 target_compile_features(gpttype_adapter PUBLIC cxx_std_17) # don't bump
-target_link_libraries(gpttype_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
+target_link_libraries(gpttype_adapter PRIVATE common2 ggml ggml_v1 ggml_v2 ggml_v3 ${LLAMA_EXTRA_LIBS})
 set_target_properties(gpttype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)

 if (LLAMA_CUBLAS)
@@ -530,8 +539,16 @@ if (LLAMA_CUBLAS)
 set_target_properties(${TARGET} PROPERTIES PREFIX "")
 set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas")
 set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter whisper_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS})
+target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter whisper_adapter tts_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
+add_custom_command(
+    TARGET koboldcpp_cublas POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy
+    $<TARGET_FILE:koboldcpp_cublas>   # The generated DLL
+    ${CMAKE_SOURCE_DIR}/              # Destination directory
+    COMMENT "Copying DLL to parent directory"
+)
 endif()

 if (LLAMA_HIPBLAS)
@@ -542,7 +559,15 @@ if (LLAMA_HIPBLAS)
 set_target_properties(${TARGET} PROPERTIES PREFIX "")
 set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_hipblas")
 set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter whisper_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS})
+target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter whisper_adapter tts_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
+add_custom_command(
+    TARGET koboldcpp_hipblas POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy
+    $<TARGET_FILE:koboldcpp_hipblas>  # The generated DLL
+    ${CMAKE_SOURCE_DIR}/              # Destination directory
+    COMMENT "Copying DLL to parent directory"
+)
 endif()


@@ -4,7 +4,7 @@
 .PHONY: finishedmsg

 default: koboldcpp_default koboldcpp_failsafe koboldcpp_noavx2 koboldcpp_clblast koboldcpp_clblast_noavx2 koboldcpp_cublas koboldcpp_hipblas koboldcpp_vulkan koboldcpp_vulkan_noavx2 finishedmsg
-tools: quantize_gpt2 quantize_gptj quantize_gguf quantize_neox quantize_mpt quantize_clip whispermain sdmain gguf-split
+tools: quantize_gpt2 quantize_gptj quantize_gguf quantize_neox quantize_mpt quantize_clip ttsmain whispermain sdmain gguf-split

 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
@@ -90,10 +90,10 @@ endif
 CUBLASLD_FLAGS =
 CUBLAS_OBJS =

-OBJS_FULL += ggml-alloc.o ggml-cpu-traits.o ggml-quants.o ggml-cpu-quants.o ggml-cpu-aarch64.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm.o common.o sampling.o
+OBJS_FULL += ggml-alloc.o ggml-cpu-traits.o ggml-quants.o ggml-cpu-quants.o ggml-cpu-aarch64.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm.o common.o sampling.o kcpputils.o
-OBJS_SIMPLE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx2.o ggml-cpu-quants_noavx2.o ggml-cpu-aarch64_noavx2.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx2.o common.o sampling.o
+OBJS_SIMPLE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx2.o ggml-cpu-quants_noavx2.o ggml-cpu-aarch64_noavx2.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx2.o common.o sampling.o kcpputils.o
-OBJS_SIMPLER += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx1.o ggml-cpu-quants_noavx1.o ggml-cpu-aarch64_noavx1.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx1.o common.o sampling.o
+OBJS_SIMPLER += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx1.o ggml-cpu-quants_noavx1.o ggml-cpu-aarch64_noavx1.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx1.o common.o sampling.o kcpputils.o
-OBJS_FAILSAFE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_failsafe.o ggml-cpu-quants_failsafe.o ggml-cpu-aarch64_failsafe.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_failsafe.o common.o sampling.o
+OBJS_FAILSAFE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_failsafe.o ggml-cpu-quants_failsafe.o ggml-cpu-aarch64_failsafe.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_failsafe.o common.o sampling.o kcpputils.o

 # OS specific
 ifeq ($(UNAME_S),Linux)
@@ -539,6 +539,8 @@ ggml-cpu-cpp.o: ggml/src/ggml-cpu/ggml-cpu.cpp ggml/include/ggml.h ggml/src/ggml
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 gguf.o: ggml/src/gguf.cpp ggml/include/gguf.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
+kcpputils.o: otherarch/utils.cpp otherarch/utils.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@

 #these have special gpu defines
 ggml-backend_default.o: ggml/src/ggml-backend.cpp ggml/src/ggml-backend-impl.h ggml/include/ggml.h ggml/include/ggml-backend.h
@@ -639,8 +641,12 @@ whispercpp_default.o: otherarch/whispercpp/whisper_adapter.cpp
 whispercpp_cublas.o: otherarch/whispercpp/whisper_adapter.cpp
 	$(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
+
+#tts objects
+tts_default.o: otherarch/tts_adapter.cpp
+	$(CXX) $(CXXFLAGS) -c $< -o $@

 # idiotic "for easier compilation"
-GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp otherarch/llama_v3.cpp src/llama.cpp src/llama-impl.cpp src/llama-chat.cpp src/llama-mmap.cpp src/llama-context.cpp src/llama-adapter.cpp src/llama-arch.cpp src/llama-batch.cpp src/llama-vocab.cpp src/llama-grammar.cpp src/llama-sampling.cpp src/llama-kv-cache.cpp src/llama-model-loader.cpp src/llama-model.cpp src/llama-quant.cpp src/llama-hparams.cpp otherarch/utils.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml/include/ggml.h ggml/include/ggml-cpu.h ggml/include/ggml-cuda.h include/llama.h otherarch/llama-util.h
+GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp otherarch/llama_v3.cpp src/llama.cpp src/llama-impl.cpp src/llama-chat.cpp src/llama-mmap.cpp src/llama-context.cpp src/llama-adapter.cpp src/llama-arch.cpp src/llama-batch.cpp src/llama-vocab.cpp src/llama-grammar.cpp src/llama-sampling.cpp src/llama-kv-cache.cpp src/llama-model-loader.cpp src/llama-model.cpp src/llama-quant.cpp src/llama-hparams.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml/include/ggml.h ggml/include/ggml-cpu.h ggml/include/ggml-cuda.h include/llama.h otherarch/llama-util.h
 gpttype_adapter_failsafe.o: $(GPTTYPE_ADAPTER)
 	$(CXX) $(CXXFLAGS) $(FAILSAFE_FLAGS) -c $< -o $@
 gpttype_adapter.o: $(GPTTYPE_ADAPTER)
@@ -680,11 +686,11 @@ vulkan-shaders-gen: ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
 	$(shell) vulkan-shaders-gen --glslc glslc --input-dir ggml/src/ggml-vulkan/vulkan-shaders --target-hpp ggml/src/ggml-vulkan-shaders.hpp --target-cpp ggml/src/ggml-vulkan-shaders.cpp

 #generated libraries
-koboldcpp_default: ggml.o ggml-cpu.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
+koboldcpp_default: ggml.o ggml-cpu.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter.o sdcpp_default.o whispercpp_default.o tts_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
 	$(DEFAULT_BUILD)

 ifdef FAILSAFE_BUILD
-koboldcpp_failsafe: ggml_v4_failsafe.o ggml-cpu_v4_failsafe.o ggml_v3_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FAILSAFE) $(OBJS)
+koboldcpp_failsafe: ggml_v4_failsafe.o ggml-cpu_v4_failsafe.o ggml_v3_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o tts_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FAILSAFE) $(OBJS)
 	$(FAILSAFE_BUILD)
 else
 koboldcpp_failsafe:
@@ -692,7 +698,7 @@ koboldcpp_failsafe:
 endif

 ifdef NOAVX2_BUILD
-koboldcpp_noavx2: ggml_v4_noavx2.o ggml-cpu_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_SIMPLE) $(OBJS)
+koboldcpp_noavx2: ggml_v4_noavx2.o ggml-cpu_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o tts_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_SIMPLE) $(OBJS)
 	$(NOAVX2_BUILD)
 else
 koboldcpp_noavx2:
@@ -700,10 +706,10 @@ koboldcpp_noavx2:
 endif

 ifdef CLBLAST_BUILD
-koboldcpp_clblast: ggml_v4_clblast.o ggml-cpu_v4_clblast.o ggml_v3_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
+koboldcpp_clblast: ggml_v4_clblast.o ggml-cpu_v4_clblast.o ggml_v3_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o tts_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
 	$(CLBLAST_BUILD)
 ifdef NOAVX2_BUILD
-koboldcpp_clblast_noavx2: ggml_v4_clblast_noavx2.o ggml-cpu_v4_clblast_noavx2.o ggml_v3_clblast_noavx2.o ggml_v2_clblast_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_clblast_noavx2.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_SIMPLER) $(OBJS)
+koboldcpp_clblast_noavx2: ggml_v4_clblast_noavx2.o ggml-cpu_v4_clblast_noavx2.o ggml_v3_clblast_noavx2.o ggml_v2_clblast_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_clblast_noavx2.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o tts_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_SIMPLER) $(OBJS)
 	$(CLBLAST_BUILD)
 else
 koboldcpp_clblast_noavx2:
@@ -717,7 +723,7 @@ koboldcpp_clblast_noavx2:
 endif

 ifdef CUBLAS_BUILD
-koboldcpp_cublas: ggml_v4_cublas.o ggml-cpu.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o llavaclip_cublas.o llava.o ggml-backend_cublas.o ggml-backend-reg_cublas.o $(CUBLAS_OBJS) $(OBJS_FULL) $(OBJS)
+koboldcpp_cublas: ggml_v4_cublas.o ggml-cpu.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o tts_default.o llavaclip_cublas.o llava.o ggml-backend_cublas.o ggml-backend-reg_cublas.o $(CUBLAS_OBJS) $(OBJS_FULL) $(OBJS)
 	$(CUBLAS_BUILD)
 else
 koboldcpp_cublas:
@@ -725,7 +731,7 @@ koboldcpp_cublas:
 endif

 ifdef HIPBLAS_BUILD
-koboldcpp_hipblas: ggml_v4_cublas.o ggml-cpu.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o llavaclip_cublas.o llava.o ggml-backend_cublas.o ggml-backend-reg_cublas.o $(HIP_OBJS) $(OBJS_FULL) $(OBJS)
+koboldcpp_hipblas: ggml_v4_cublas.o ggml-cpu.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o tts_default.o llavaclip_cublas.o llava.o ggml-backend_cublas.o ggml-backend-reg_cublas.o $(HIP_OBJS) $(OBJS_FULL) $(OBJS)
 	$(HIPBLAS_BUILD)
 else
 koboldcpp_hipblas:
@@ -733,10 +739,10 @@ koboldcpp_hipblas:
 endif

 ifdef VULKAN_BUILD
-koboldcpp_vulkan: ggml_v4_vulkan.o ggml-cpu.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter_vulkan.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o $(OBJS_FULL) $(OBJS)
+koboldcpp_vulkan: ggml_v4_vulkan.o ggml-cpu.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter_vulkan.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o tts_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o $(OBJS_FULL) $(OBJS)
 	$(VULKAN_BUILD)
 ifdef NOAVX2_BUILD
-koboldcpp_vulkan_noavx2: ggml_v4_vulkan_noavx2.o ggml-cpu_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o $(OBJS_SIMPLE) $(OBJS)
+koboldcpp_vulkan_noavx2: ggml_v4_vulkan_noavx2.o ggml-cpu_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o tts_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o $(OBJS_SIMPLE) $(OBJS)
 	$(VULKAN_BUILD)
 else
 koboldcpp_vulkan_noavx2:


@@ -238,6 +238,15 @@ extern "C"
         return whispertype_generate(inputs);
     }

+    bool tts_load_model(const tts_load_model_inputs inputs)
+    {
+        return ttstype_load_model(inputs);
+    }
+    tts_generation_outputs tts_generate(const tts_generation_inputs inputs)
+    {
+        return ttstype_generate(inputs);
+    }
+
     const char * new_token(int idx) {
         if (generated_tokens.size() <= idx || idx < 0) return nullptr;


@@ -139,6 +139,7 @@ struct last_logprobs_outputs {
     int count = 0;
     logprob_item * logprob_items = nullptr;
 };
+
 struct sd_load_model_inputs
 {
     const char * model_filename = nullptr;
@@ -178,6 +179,7 @@ struct sd_generation_outputs
     int status = -1;
     const char * data = "";
 };
+
 struct whisper_load_model_inputs
 {
     const char * model_filename = nullptr;
@@ -201,6 +203,30 @@ struct whisper_generation_outputs
     const char * text = "";
 };
+
+struct tts_load_model_inputs
+{
+    const char * ttc_model_filename = nullptr;
+    const char * cts_model_filename = nullptr;
+    const char * executable_path = nullptr;
+    const int clblast_info = 0;
+    const int cublas_info = 0;
+    const char * vulkan_info = nullptr;
+    const int gpulayers = 0;
+    const int debugmode = 0;
+};
+struct tts_generation_inputs
+{
+    const char * prompt = nullptr;
+    const int speaker_seed = 0;
+    const int audio_seed = 0;
+    const bool quiet = false;
+};
+struct tts_generation_outputs
+{
+    int status = -1;
+    const char * data = "";
+};
 extern std::string executable_path;
 extern std::string lora_filename;
 extern std::string lora_base;


@@ -383,7 +383,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
         }

         if (ok && gr.read(n_kv_32)) {
-            n_kv_32 = n_kv_32;
+            n_kv = n_kv_32;
         } else {
             ok = false;
         }

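The hunk above is one of the GGUFv1 loading fixes named in the commit title: the legacy 32-bit KV count was read into n_kv_32 and then assigned back to itself, so the real n_kv never received the value. A minimal Python sketch of the same header handling (layout per the GGUF format; illustrative only, not part of this commit):

import struct

def read_gguf_counts(path):
    # GGUF header: 4-byte magic "GGUF", uint32 version, then the counts.
    # v1 stored tensor_count and metadata_kv_count as uint32; v2+ use 64-bit,
    # so v1 values must be read narrow and widened, which is exactly what the
    # corrected line (n_kv = n_kv_32) does on the C++ side.
    with open(path, "rb") as f:
        if f.read(4) != b"GGUF":
            raise ValueError("not a GGUF file")
        (version,) = struct.unpack("<I", f.read(4))
        if version == 1:
            n_tensors, n_kv = struct.unpack("<II", f.read(8))
        else:
            n_tensors, n_kv = struct.unpack("<QQ", f.read(16))
    return version, n_tensors, n_kv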

@@ -21,12 +21,13 @@
 #include <cctype>
 #include <locale>
+#include "utils.h"
+
 //for easier compilation
 //concat source files into one file for compilation purposes
 #include "llama_v2.cpp"
 #include "llama_v3.cpp"
 #include "src/llama.cpp"
-#include "utils.cpp"
 #include "gptj_v1.cpp"
 #include "gptj_v2.cpp"
 #include "gptj_v3.cpp"
@@ -535,99 +536,6 @@ const char * kcpp_print_system_info(void) {
     return s.c_str();
 }

-struct kcpp_embd_batch { //duplcated from llava_embd_batch
-    std::vector<int32_t> pos;
-    std::vector<int32_t> n_seq_id;
-    std::vector<int32_t> seq_id_0;
-    std::vector<int32_t *> seq_ids;
-    std::vector<int8_t> logits;
-    llama_batch batch;
-    kcpp_embd_batch(float * embd, int32_t n_tokens, int32_t npast, bool use_mrope) {
-        int32_t seq_id = 0;
-        pos.resize(n_tokens * (use_mrope?4:1));
-        std::fill(pos.begin(), pos.end(), 0);
-        n_seq_id.resize(n_tokens);
-        seq_ids.resize(n_tokens + 1);
-        logits.resize(n_tokens);
-        seq_id_0.resize(1);
-        seq_id_0[0] = seq_id;
-        seq_ids[n_tokens] = nullptr;
-        batch = {
-            /*n_tokens =*/ n_tokens,
-            /*tokens   =*/ nullptr,
-            /*embd     =*/ embd,
-            /*pos      =*/ pos.data(),
-            /*n_seq_id =*/ n_seq_id.data(),
-            /*seq_id   =*/ seq_ids.data(),
-            /*logits   =*/ logits.data(),
-        };
-        if(!use_mrope)
-        {
-            for (int i = 0; i < n_tokens; i++) {
-                batch.pos     [i] = npast + i;
-                batch.n_seq_id[i] = 1;
-                batch.seq_id  [i] = seq_id_0.data();
-                batch.logits  [i] = false;
-            }
-        }
-        else
-        {
-            for (int i = 0; i < n_tokens; i++) {
-                batch.n_seq_id[i] = 1;
-                batch.seq_id  [i] = seq_id_0.data();
-                batch.logits  [i] = false;
-            }
-            for (int j = 0; j < batch.n_tokens * 3; j++) {
-                batch.pos[j] = npast + (j % batch.n_tokens);
-            }
-        }
-    }
-    kcpp_embd_batch(std::vector<llama_token> & tokens, int32_t npast, bool use_mrope, bool return_all_logits) {
-        int32_t seq_id = 0;
-        int32_t n_tokens = tokens.size();
-        pos.resize(n_tokens * (use_mrope?4:1));
-        std::fill(pos.begin(), pos.end(), 0);
-        n_seq_id.resize(n_tokens);
-        seq_ids.resize(n_tokens + 1);
-        logits.resize(n_tokens);
-        seq_id_0.resize(1);
-        seq_id_0[0] = seq_id;
-        seq_ids[n_tokens] = nullptr;
-        batch = {
-            /*n_tokens =*/ n_tokens,
-            /*tokens   =*/ tokens.data(),
-            /*embd     =*/ nullptr,
-            /*pos      =*/ pos.data(),
-            /*n_seq_id =*/ n_seq_id.data(),
-            /*seq_id   =*/ seq_ids.data(),
-            /*logits   =*/ logits.data(),
-        };
-        if(!use_mrope)
-        {
-            for (int i = 0; i < n_tokens; i++) {
-                batch.pos     [i] = npast + i;
-                batch.n_seq_id[i] = 1;
-                batch.seq_id  [i] = seq_id_0.data();
-                batch.logits  [i] = (return_all_logits?true:false);
-            }
-        }
-        else
-        {
-            for (int i = 0; i < n_tokens; i++) {
-                batch.n_seq_id[i] = 1;
-                batch.seq_id  [i] = seq_id_0.data();
-                batch.logits  [i] = (return_all_logits?true:false);
-            }
-            for (int j = 0; j < batch.n_tokens * 3; j++) {
-                batch.pos[j] = npast + (j % batch.n_tokens);
-            }
-        }
-        batch.logits[n_tokens - 1] = true;
-    }
-};
-
 //loads a model for speculative decoding.
 static void speculative_decoding_setup(std::string spec_model_filename, const llama_model_params & base_model_params, const llama_context_params & base_ctx_params, int base_n_vocab, const float * draft_gpusplit, int draft_gpulayers)
 {
@@ -664,7 +572,7 @@ static void speculative_decoding_setup(std::string spec_model_filename, const ll
     draft_ctx_params.type_k = base_ctx_params.type_k;
     draft_ctx_params.type_v = base_ctx_params.type_v;

-    llama_model * draftmodel = llama_load_model_from_file(spec_model_filename.c_str(), draft_model_params);
+    llama_model * draftmodel = llama_model_load_from_file(spec_model_filename.c_str(), draft_model_params);
     draft_ctx = llama_new_context_with_model(draftmodel, draft_ctx_params);
     if(draft_ctx == NULL)
     {
@@ -2252,7 +2160,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         kvos.push_back(kvo);
         model_params.kv_overrides = kvos.data();
     }
-    llama_model * llamamodel = llama_load_model_from_file(kcpp_data->model_filename.c_str(), model_params);
+    llama_model * llamamodel = llama_model_load_from_file(kcpp_data->model_filename.c_str(), model_params);
     if(overwriteRope)
     {


@@ -1,3 +1,5 @@
+#pragma once
+
 #ifndef LLAMA_H
 #define LLAMA_H


@@ -53,6 +53,7 @@ fullsdmodelpath = "" #if empty, it's not initialized
 mmprojpath = "" #if empty, it's not initialized
 password = "" #if empty, no auth key required
 fullwhispermodelpath = "" #if empty, it's not initialized
+ttsmodelpath = "" #if empty, not initialized
 maxctx = 4096
 maxhordectx = 4096
 maxhordelen = 400
@@ -281,6 +282,26 @@ class whisper_generation_outputs(ctypes.Structure):
     _fields_ = [("status", ctypes.c_int),
                 ("data", ctypes.c_char_p)]

+class tts_load_model_inputs(ctypes.Structure):
+    _fields_ = [("ttc_model_filename", ctypes.c_char_p),
+                ("cts_model_filename", ctypes.c_char_p),
+                ("executable_path", ctypes.c_char_p),
+                ("clblast_info", ctypes.c_int),
+                ("cublas_info", ctypes.c_int),
+                ("vulkan_info", ctypes.c_char_p),
+                ("gpulayers", ctypes.c_int),
+                ("debugmode", ctypes.c_int)]
+
+class tts_generation_inputs(ctypes.Structure):
+    _fields_ = [("prompt", ctypes.c_char_p),
+                ("speaker_seed", ctypes.c_int),
+                ("audio_seed", ctypes.c_int),
+                ("quiet", ctypes.c_bool)]
+
+class tts_generation_outputs(ctypes.Structure):
+    _fields_ = [("status", ctypes.c_int),
+                ("data", ctypes.c_char_p)]
+
 def getdirpath():
     return os.path.dirname(os.path.realpath(__file__))
 def getabspath():
@@ -440,6 +461,10 @@ def init_library():
     handle.whisper_load_model.restype = ctypes.c_bool
     handle.whisper_generate.argtypes = [whisper_generation_inputs]
     handle.whisper_generate.restype = whisper_generation_outputs
+    handle.tts_load_model.argtypes = [tts_load_model_inputs]
+    handle.tts_load_model.restype = ctypes.c_bool
+    handle.tts_generate.argtypes = [tts_generation_inputs]
+    handle.tts_generate.restype = tts_generation_outputs
     handle.last_logprobs.restype = last_logprobs_outputs
     handle.detokenize.argtypes = [token_count_outputs]
     handle.detokenize.restype = ctypes.c_char_p
@@ -577,9 +602,13 @@ def utfprint(str, importance = 2): #0 = only debugmode, 1 = except quiet, 2 = al
         maxlen = 32000
         if args.debugmode >= 1:
             maxlen = 64000
-        strlength = len(str)
-        if strlength > maxlen: #limit max output len
-            str = str[:maxlen] + f"... (+{strlength-maxlen} chars)"
+        try:
+            strlength = len(str)
+            if strlength > maxlen: #limit max output len
+                str = str[:maxlen] + f"... (+{strlength-maxlen} chars)"
+        except Exception:
+            pass
     try:
         print(str)
     except UnicodeEncodeError:
@@ -647,13 +676,14 @@ def read_gguf_metadata(file_path):
     except Exception:
         return None

-def extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,draftmodelpath):
+def extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,draftmodelpath,ttsmodelpath):
     global modelfile_extracted_meta
     modelfile_extracted_meta = None
     sdfsize = 0
     whisperfsize = 0
     mmprojsize = 0
     draftmodelsize = 0
+    ttsmodelsize = 0
     if sdfilepath and os.path.exists(sdfilepath):
         sdfsize = os.path.getsize(sdfilepath)
     if whisperfilepath and os.path.exists(whisperfilepath):
@@ -662,12 +692,14 @@ def extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,
         mmprojsize = os.path.getsize(mmprojfilepath)
     if draftmodelpath and os.path.exists(draftmodelpath):
         draftmodelsize = os.path.getsize(draftmodelpath)
+    if ttsmodelpath and os.path.exists(ttsmodelpath):
+        ttsmodelsize = os.path.getsize(ttsmodelpath)
     if filepath and os.path.exists(filepath):
         try:
             fsize = os.path.getsize(filepath)
             if fsize>10000000: #dont bother with models < 10mb as they are probably bad
                 ggufmeta = read_gguf_metadata(filepath)
-                modelfile_extracted_meta = [ggufmeta,fsize,sdfsize,whisperfsize,mmprojsize,draftmodelsize] #extract done. note that meta may be null
+                modelfile_extracted_meta = [ggufmeta,fsize,sdfsize,whisperfsize,mmprojsize,draftmodelsize,ttsmodelsize] #extract done. note that meta may be null
         except Exception:
             modelfile_extracted_meta = None
@@ -699,6 +731,8 @@ def autoset_gpu_layers(ctxsize,sdquanted,bbs): #shitty algo to determine how man
             mem -= 350*1024*1024
         if modelfile_extracted_meta[5] > 1024*1024*10: #draft model tax
             mem -= (modelfile_extracted_meta[5] * 1.5)
+        if modelfile_extracted_meta[6] > 1024*1024*10: #tts model tax
+            mem -= max(600*1024*1024, modelfile_extracted_meta[6] * 3)
         mem = 0 if mem < 0 else mem

         csmul = 1.0
@@ -730,6 +764,8 @@ def fetch_gpu_properties(testCL,testCU,testVK):
     FetchedCUdevices = []
     FetchedCUdeviceMem = []
     FetchedCUfreeMem = []
+    faileddetectvram = False
+
     AMDgpu = None
     try: # Get NVIDIA GPU names
         output = subprocess.run(['nvidia-smi','--query-gpu=name,memory.total,memory.free','--format=csv,noheader'], capture_output=True, text=True, check=True, encoding='utf-8').stdout
@@ -737,6 +773,10 @@ def fetch_gpu_properties(testCL,testCU,testVK):
         FetchedCUdeviceMem = [line.split(",")[1].strip().split(" ")[0].strip() for line in output.splitlines()]
         FetchedCUfreeMem = [line.split(",")[2].strip().split(" ")[0].strip() for line in output.splitlines()]
     except Exception:
+        FetchedCUdevices = []
+        FetchedCUdeviceMem = []
+        FetchedCUfreeMem = []
+        faileddetectvram = True
         pass
     if len(FetchedCUdevices)==0:
         try: # Get AMD ROCm GPU names
@@ -756,18 +796,30 @@ def fetch_gpu_properties(testCL,testCU,testVK):
             if getamdvram:
                 FetchedCUdeviceMem = [line.split(",")[1].strip() for line in getamdvram.splitlines()[1:] if line.strip()]
         except Exception:
+            FetchedCUdevices = []
+            FetchedCUdeviceMem = []
+            FetchedCUfreeMem = []
+            faileddetectvram = True
             pass

     lowestcumem = 0
     lowestfreecumem = 0
-    for idx in range(0,4):
-        if(len(FetchedCUdevices)>idx):
-            CUDevicesNames[idx] = FetchedCUdevices[idx]
-            if len(FetchedCUdeviceMem)>idx:
-                dmem = int(FetchedCUdeviceMem[idx]) if AMDgpu else (int(FetchedCUdeviceMem[idx])*1024*1024)
-                lowestcumem = dmem if lowestcumem==0 else (dmem if dmem<lowestcumem else lowestcumem)
-            if len(FetchedCUfreeMem)>idx:
-                dmem = (int(FetchedCUfreeMem[idx])*1024*1024)
-                lowestfreecumem = dmem if lowestfreecumem==0 else (dmem if dmem<lowestfreecumem else lowestfreecumem)
+    try:
+        for idx in range(0,4):
+            if(len(FetchedCUdevices)>idx):
+                CUDevicesNames[idx] = FetchedCUdevices[idx]
+                if len(FetchedCUdeviceMem)>idx:
+                    dmem = int(FetchedCUdeviceMem[idx]) if AMDgpu else (int(FetchedCUdeviceMem[idx])*1024*1024)
+                    lowestcumem = dmem if lowestcumem==0 else (dmem if dmem<lowestcumem else lowestcumem)
+                if len(FetchedCUfreeMem)>idx:
+                    dmem = (int(FetchedCUfreeMem[idx])*1024*1024)
+                    lowestfreecumem = dmem if lowestfreecumem==0 else (dmem if dmem<lowestfreecumem else lowestfreecumem)
+    except Exception:
+        lowestcumem = 0
+        lowestfreecumem = 0
+        faileddetectvram = True
+
+    if faileddetectvram:
+        print("Unable to detect VRAM, please set layers manually.")

     MaxMemory[0] = max(lowestcumem,MaxMemory[0])
     MaxFreeMemory[0] = max(lowestfreecumem,MaxFreeMemory[0])
@@ -1264,6 +1316,34 @@ def whisper_generate(genparams):
         outstr = ret.data.decode("UTF-8","ignore")
     return outstr

+def tts_load_model(ttc_model_filename,cts_model_filename):
+    global args
+    inputs = tts_load_model_inputs()
+    inputs.debugmode = args.debugmode
+    inputs.executable_path = (getdirpath()+"/").encode("UTF-8")
+    inputs.ttc_model_filename = ttc_model_filename.encode("UTF-8")
+    inputs.cts_model_filename = cts_model_filename.encode("UTF-8")
+    inputs.gpulayers = (999 if args.ttsgpu else 0)
+    inputs = set_backend_props(inputs)
+    ret = handle.tts_load_model(inputs)
+    return ret
+
+def tts_generate(genparams):
+    global args
+    is_quiet = True if (args.quiet or args.debugmode == -1) else False
+    prompt = genparams.get("input", "")
+    prompt = prompt.strip()
+    inputs = tts_generation_inputs()
+    inputs.prompt = prompt.encode("UTF-8")
+    inputs.speaker_seed = 0
+    inputs.audio_seed = 0
+    inputs.quiet = is_quiet
+    ret = handle.tts_generate(inputs)
+    outstr = ""
+    if ret.status==1:
+        outstr = ret.data.decode("UTF-8","ignore")
+    return outstr
+
 def tokenize_ids(countprompt,tcaddspecial):
     rawcountdata = handle.token_count(countprompt.encode("UTF-8"),tcaddspecial)
     countlimit = rawcountdata.count if (rawcountdata.count>=0 and rawcountdata.count<50000) else 0
@@ -1738,10 +1818,11 @@ def LaunchWebbrowser(target_url, failedmsg):
     try:
         import webbrowser as wb
         if wb.open(target_url, autoraise=True):
            return
        raise RuntimeError("Cannot open default browser")
-    except Exception:
+    except Exception as e:
         try:
+            print(f"Browser failed to launch: {e}, attempting to use xdg-open...")
             import webbrowser as wb
             if wb.get('xdg-open').open(target_url, autoraise=True):
                 return
@@ -2102,7 +2183,7 @@ Enter Prompt:<br>
     def do_GET(self):
         global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui
-        global has_multiplayer, multiplayer_turn_major, multiplayer_turn_minor, multiplayer_story_data_compressed, multiplayer_dataformat, multiplayer_lastactive, maxctx, maxhordelen, friendlymodelname, lastgeneratedcomfyimg, KcppVersion, totalgens, preloaded_story, exitcounter, currentusergenkey, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath
+        global has_multiplayer, multiplayer_turn_major, multiplayer_turn_minor, multiplayer_story_data_compressed, multiplayer_dataformat, multiplayer_lastactive, maxctx, maxhordelen, friendlymodelname, lastgeneratedcomfyimg, KcppVersion, totalgens, preloaded_story, exitcounter, currentusergenkey, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath
         self.path = self.path.rstrip('/')
         response_body = None
         content_type = 'application/json'
@@ -2160,7 +2241,8 @@ Enter Prompt:<br>
             has_password = (password!="")
             has_whisper = (fullwhispermodelpath!="")
             has_search = True if args.websearch else False
-            response_body = (json.dumps({"result":"KoboldCpp","version":KcppVersion, "protected":has_password ,"txt2img":has_txt2img,"vision":has_vision,"transcribe":has_whisper,"multiplayer":has_multiplayer,"websearch":has_search}).encode())
+            has_tts = (ttsmodelpath!="")
+            response_body = (json.dumps({"result":"KoboldCpp","version":KcppVersion, "protected":has_password ,"txt2img":has_txt2img,"vision":has_vision,"transcribe":has_whisper,"multiplayer":has_multiplayer,"websearch":has_search,"tts":has_tts}).encode())

         elif self.path.endswith(('/api/extra/perf')):
             global last_req_time, start_time
@@ -2521,7 +2603,7 @@ Enter Prompt:<br>
         reqblocking = False
         muint = int(args.multiuser)
-        if muint<=0 and ((args.whispermodel and args.whispermodel!="") or (args.sdmodel and args.sdmodel!="")):
+        if muint<=0 and ((args.whispermodel and args.whispermodel!="") or (args.sdmodel and args.sdmodel!="") or (args.ttsmodel and args.ttsmodel!="")):
             muint = 2 # this prevents errors when using voice/img together with text
         multiuserlimit = ((muint-1) if muint > 1 else 6)
         #backwards compatibility for up to 7 concurrent requests, use default limit of 7 if multiuser set to 1
@@ -2546,6 +2628,7 @@ Enter Prompt:<br>
         is_imggen = False
         is_comfyui_imggen = False
         is_transcribe = False
+        is_tts = False

         if self.path.endswith('/request'):
             api_format = 1
@@ -2588,11 +2671,14 @@ Enter Prompt:<br>
         if self.path.endswith('/api/extra/transcribe') or self.path.endswith('/v1/audio/transcriptions'):
             is_transcribe = True
+        if self.path.endswith('/api/extra/tts') or self.path.endswith('/v1/audio/speech'):
+            is_tts = True

-        if is_imggen or is_transcribe or api_format > 0:
+        if is_imggen or is_transcribe or is_tts or api_format > 0:
             global last_req_time
             last_req_time = time.time()

-            if not is_imggen and not is_transcribe and api_format!=5:
+            if not is_imggen and not is_transcribe and not is_tts and api_format!=5:
                 if not self.secure_endpoint():
                     return
@@ -2680,6 +2766,21 @@ Enter Prompt:<br>
                     print("Transcribe: The response could not be sent, maybe connection was terminated?")
                     time.sleep(0.2) #short delay
                 return
+            elif is_tts:
+                try:
+                    gen = tts_generate(genparams)
+                    wav_data = b''
+                    if gen:
+                        wav_data = base64.b64decode(gen) # Decode the Base64 string into binary data
+                    self.send_response(200)
+                    self.send_header('content-length', str(len(wav_data))) # Set content length
+                    self.end_headers(content_type='audio/wav')
+                    self.wfile.write(wav_data) # Write the binary WAV data to the response
+                except Exception as ex:
+                    utfprint(ex,0)
+                    print("TTS: The response could not be sent, maybe connection was terminated?")
+                    time.sleep(0.2) #short delay
+                return
         finally:
             time.sleep(0.05)
@@ -2806,7 +2907,7 @@ def show_gui():
         if dlfile:
             args.model_param = dlfile
         load_config_cli(args.model_param)
-    if not args.model_param and not args.sdmodel and not args.whispermodel and not args.nomodel:
+    if not args.model_param and not args.sdmodel and not args.whispermodel and not args.ttsmodel and not args.nomodel:
         global exitcounter
         exitcounter = 999
         exit_with_error(2,"No ggml model or kcpps file was selected. Exiting.")
@@ -3008,6 +3109,9 @@ def show_gui():
     sd_quant_var = ctk.IntVar(value=0)

     whisper_model_var = ctk.StringVar()
+    tts_model_var = ctk.StringVar()
+    wavtokenizer_var = ctk.StringVar()
+    ttsgpu_var = ctk.IntVar(value=0)

     def tabbuttonaction(name):
         for t in tabcontent:
@@ -3158,7 +3262,8 @@ def show_gui():
         whisperfilepath = whisper_model_var.get()
         mmprojfilepath = mmproj_var.get()
         draftmodelpath = draftmodel_var.get()
-        extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,draftmodelpath)
+        ttsmodelpath = tts_model_var.get() if ttsgpu_var.get()==1 else ""
+        extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,draftmodelpath,ttsmodelpath)
         changed_gpulayers_estimate()
         pass
@@ -3575,8 +3680,14 @@ def show_gui():
     # audio tab
     audio_tab = tabcontent["Audio"]
-    makefileentry(audio_tab, "Whisper Model (Speech-To-Text):", "Select Whisper .bin Model File", whisper_model_var, 1, width=280, filetypes=[("*.bin","*.bin")], tooltiptxt="Select a Whisper .bin model file on disk to be loaded.")
+    makefileentry(audio_tab, "Whisper Model (Speech-To-Text):", "Select Whisper .bin Model File", whisper_model_var, 1, width=280, filetypes=[("*.bin","*.bin")], tooltiptxt="Select a Whisper .bin model file on disk to be loaded for Voice Recognition.")
     whisper_model_var.trace("w", gui_changed_modelfile)
+    makefileentry(audio_tab, "OuteTTS Model (Text-To-Speech):", "Select OuteTTS GGUF Model File", tts_model_var, 3, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a OuteTTS GGUF model file on disk to be loaded for Narration.")
+    tts_model_var.trace("w", gui_changed_modelfile)
+    makefileentry(audio_tab, "WavTokenizer Model (Text-To-Speech):", "Select WavTokenizer GGUF Model File", wavtokenizer_var, 5, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a WavTokenizer GGUF model file on disk to be loaded for Narration.")
+    wavtokenizer_var.trace("w", gui_changed_modelfile)
+    makecheckbox(audio_tab, "TTS Use GPU", ttsgpu_var, 7, 0,tooltiptxt="Uses the GPU for TTS.")
+    ttsgpu_var.trace("w", gui_changed_modelfile)
@@ -3625,7 +3736,7 @@ def show_gui():
     # launch
     def guilaunch():
-        if model_var.get() == "" and sd_model_var.get() == "" and whisper_model_var.get() == "" and nomodel.get()!=1:
+        if model_var.get() == "" and sd_model_var.get() == "" and whisper_model_var.get() == "" and tts_model_var.get() == "" and nomodel.get()!=1:
             tmp = askopenfilename(title="Select ggml model .bin or .gguf file")
             model_var.set(tmp)
         nonlocal nextstate
@@ -3792,6 +3903,11 @@ def show_gui():
         if whisper_model_var.get() != "":
             args.whispermodel = whisper_model_var.get()
+        if tts_model_var.get() != "" and wavtokenizer_var.get() != "":
+            args.ttsmodel = tts_model_var.get()
+            args.ttswavtokenizer = wavtokenizer_var.get()
+            args.ttsgpu = (ttsgpu_var.get()==1)

     def import_vars(dict):
         global importvars_in_progress
         importvars_in_progress = True
@@ -3952,6 +4068,10 @@ def show_gui():
         whisper_model_var.set(dict["whispermodel"] if ("whispermodel" in dict and dict["whispermodel"]) else "")
+        tts_model_var.set(dict["ttsmodel"] if ("ttsmodel" in dict and dict["ttsmodel"]) else "")
+        wavtokenizer_var.set(dict["ttswavtokenizer"] if ("ttswavtokenizer" in dict and dict["ttswavtokenizer"]) else "")
+        ttsgpu_var.set(dict["ttsgpu"] if ("ttsgpu" in dict) else 0)

         importvars_in_progress = False
         gui_changed_modelfile()
         if "istemplate" in dict and dict["istemplate"]:
@@ -4022,7 +4142,7 @@ def show_gui():
         kcpp_exporting_template = False
         export_vars()

-        if not args.model_param and not args.sdmodel and not args.whispermodel and not args.nomodel:
+        if not args.model_param and not args.sdmodel and not args.whispermodel and not args.ttsmodel and not args.nomodel:
             exitcounter = 999
             print("")
             time.sleep(0.5)
@@ -4566,7 +4686,7 @@ def analyze_gguf_model_wrapper(filename=""):
 def main(launch_args,start_server=True):
     global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui
-    global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath
+    global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath
     args = launch_args

     if (args.version) and len(sys.argv) <= 2:
@@ -4629,7 +4749,7 @@ def main(launch_args,start_server=True):
     if not args.model_param:
         args.model_param = args.model

-    if args.showgui or (not args.model_param and not args.sdmodel and not args.whispermodel and not args.nomodel):
+    if args.showgui or (not args.model_param and not args.sdmodel and not args.whispermodel and not args.ttsmodel and not args.nomodel):
         #give them a chance to pick a file
         print("For command line arguments, please refer to --help")
         print("***")
@@ -4753,6 +4873,14 @@ def main(launch_args,start_server=True):
         dlfile = download_model_from_url(args.draftmodel,[".gguf"])
         if dlfile:
             args.draftmodel = dlfile
+    if args.ttsmodel and args.ttsmodel!="":
+        dlfile = download_model_from_url(args.ttsmodel,[".gguf"])
+        if dlfile:
+            args.ttsmodel = dlfile
+    if args.ttswavtokenizer and args.ttswavtokenizer!="":
+        dlfile = download_model_from_url(args.ttswavtokenizer,[".gguf"])
+        if dlfile:
+            args.ttswavtokenizer = dlfile

     # sanitize and replace the default vanity name. remember me....
     if args.model_param and args.model_param!="":
@@ -4830,7 +4958,7 @@ def main(launch_args,start_server=True):
             pass
     if args.gpulayers==-1:
         if MaxMemory[0] > 0 and (not args.usecpu) and ((args.usecublas is not None) or (args.usevulkan is not None) or (args.useclblast is not None) or sys.platform=="darwin"):
-            extract_modelfile_params(args.model_param,args.sdmodel,args.whispermodel,args.mmproj,args.draftmodel)
+            extract_modelfile_params(args.model_param,args.sdmodel,args.whispermodel,args.mmproj,args.draftmodel,args.ttsmodel if args.ttsgpu else "")
             layeramt = autoset_gpu_layers(args.contextsize,args.sdquant,args.blasbatchsize)
             print(f"Auto Recommended GPU Layers: {layeramt}")
             args.gpulayers = layeramt
@@ -4999,6 +5127,27 @@ def main(launch_args,start_server=True):
             exitcounter = 999
             exit_with_error(3,"Could not load whisper model: " + whispermodel)

+    #handle tts model
+    if args.ttsmodel and args.ttsmodel!="" and args.ttswavtokenizer and args.ttswavtokenizer!="":
+        if not os.path.exists(args.ttsmodel) or not os.path.exists(args.ttswavtokenizer):
+            if args.ignoremissing:
+                print("Ignoring missing TTS model files!")
+                args.ttsmodel = None
+                args.ttswavtokenizer = None
+            else:
+                exitcounter = 999
+                exit_with_error(2,f"Cannot find tts model files: {args.ttsmodel} or {args.ttswavtokenizer}")
+        else:
+            ttsmodelpath = args.ttsmodel
+            ttsmodelpath = os.path.abspath(ttsmodelpath)
+            wavtokpath = args.ttswavtokenizer
+            wavtokpath = os.path.abspath(wavtokpath)
+            loadok = tts_load_model(ttsmodelpath,wavtokpath)
+            print("Load TTS Model OK: " + str(loadok))
+            if not loadok:
+                exitcounter = 999
+                exit_with_error(3,"Could not load TTS model!")

     #load embedded lite
     try:
@@ -5296,7 +5445,12 @@ if __name__ == '__main__':
     sdparsergroup.add_argument("--sdnotile", help="Disables VAE tiling, may not work for large images.", action='store_true')

     whisperparsergroup = parser.add_argument_group('Whisper Transcription Commands')
-    whisperparsergroup.add_argument("--whispermodel", metavar=('[filename]'), help="Specify a Whisper bin model to enable Speech-To-Text transcription.", default="")
+    whisperparsergroup.add_argument("--whispermodel", metavar=('[filename]'), help="Specify a Whisper .bin model to enable Speech-To-Text transcription.", default="")
+
+    ttsparsergroup = parser.add_argument_group('TTS Narration Commands')
+    ttsparsergroup.add_argument("--ttsmodel", metavar=('[filename]'), help="Specify the OuteTTS Text-To-Speech GGUF model.", default="")
+    ttsparsergroup.add_argument("--ttswavtokenizer", metavar=('[filename]'), help="Specify the WavTokenizer GGUF model.", default="")
+    ttsparsergroup.add_argument("--ttsgpu", help="Use the GPU for TTS.", action='store_true')

     deprecatedgroup = parser.add_argument_group('Deprecated Commands, DO NOT USE!')
     deprecatedgroup.add_argument("--hordeconfig", help=argparse.SUPPRESS, nargs='+')

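With the handler above wired up, the new endpoint behaves like the other KoboldCpp routes: POST a JSON body and receive raw WAV bytes back. A hypothetical client call, assuming the default server address http://localhost:5001 (the "input" field is what tts_generate() reads, and the server already base64-decodes before responding, so the body is plain audio/wav):

import json, urllib.request

req = urllib.request.Request(
    "http://localhost:5001/api/extra/tts",  # /v1/audio/speech routes here too
    data=json.dumps({"input": "Hello there, this is a test."}).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    wav_bytes = resp.read()  # raw WAV, served as content-type audio/wav
with open("out.wav", "wb") as f:
    f.write(wav_bytes)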

@@ -105,6 +105,9 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs);
 bool whispertype_load_model(const whisper_load_model_inputs inputs);
 whisper_generation_outputs whispertype_generate(const whisper_generation_inputs inputs);

+bool ttstype_load_model(const tts_load_model_inputs inputs);
+tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs);
+
 void timer_start();
 double timer_check();
 void print_tok_vec(std::vector<int> &embd);


@@ -188,13 +188,8 @@
 #endif

 // TODO: support for clang
-#ifdef __GNUC__
-#    define GGML_V3_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
-#elif defined(_MSC_VER)
-#    define GGML_V3_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
-#else
 #    define GGML_V3_DEPRECATED(func, hint) func
-#endif

 #ifndef __GNUC__
 #    define GGML_V3_ATTRIBUTE_FORMAT(...)


@@ -436,19 +436,23 @@ struct llama_v2_file_loader {
     uint32_t magic = file.read_u32();
     uint32_t version = 0;

-    if (magic != 'ggml') {
+    uint32_t magic_ggjt = 0x67676a74u; // 'ggjt'
+    uint32_t magic_ggmf = 0x67676d66u; // 'ggmf'
+    uint32_t magic_ggml = 0x67676d6cu; // 'ggml'
+
+    if (magic != magic_ggml) {
         version = file.read_u32();
     }

-    if (magic == 'ggml' && version == 0) {
+    if (magic == magic_ggml && version == 0) {
         file_version = LLAMA_V2_FILE_VERSION_GGML;
-    } else if (magic == 'ggmf' && version == 1) {
+    } else if (magic == magic_ggmf && version == 1) {
         file_version = LLAMA_V2_FILE_VERSION_GGMF_V1;
-    } else if (magic == 'ggjt' && version == 1) {
+    } else if (magic == magic_ggjt && version == 1) {
         file_version = LLAMA_V2_FILE_VERSION_GGJT_V1;
-    } else if (magic == 'ggjt' && version == 2) {
+    } else if (magic == magic_ggjt && version == 2) {
         file_version = LLAMA_V2_FILE_VERSION_GGJT_V2;
-    } else if (magic == 'ggjt' && version == 3) {
+    } else if (magic == magic_ggjt && version == 3) {
         file_version = LLAMA_V2_FILE_VERSION_GGJT_V3;
     } else {
         throw format_old("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
@@ -553,7 +557,8 @@ struct llama_v2_file_saver {
         write_vocab();
     }
     void write_magic() {
-        file.write_u32(LLAMA_V2_FILE_MAGIC); // magic
+        uint32_t magic_ggjt = 0x67676a74u; // 'ggjt'
+        file.write_u32(magic_ggjt); // magic
         file.write_u32(LLAMA_V2_FILE_VERSION); // version
     }
     void write_hparams(enum llama_v2_ftype new_ftype) {
@@ -2308,7 +2313,8 @@ int llama_v2_apply_lora_from_file_internal(struct llama_v2_context * ctx, const
     {
         uint32_t magic;
         fin.read((char *) &magic, sizeof(magic));
-        if (magic != 'ggla') {
+        uint32_t magic_ggla = 0x67676c61u; // 'ggla'
+        if (magic != magic_ggla) {
             fprintf(stderr, "%s: bad file magic\n", __func__);
             return 1;
         }
@@ -2800,85 +2806,6 @@ size_t llama_v2_set_state_data(struct llama_v2_context * ctx, const uint8_t * sr
     return nread;
 }

-bool llama_v2_load_session_file(struct llama_v2_context * ctx, const char * path_session, llama_v2_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
-    llama_v2_file file(path_session, "rb");
-
-    // sanity checks
-    {
-        const uint32_t magic = file.read_u32();
-        const uint32_t version = file.read_u32();
-
-        if (magic != LLAMA_V2_SESSION_MAGIC || version != LLAMA_V2_SESSION_VERSION) {
-            fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
-            return false;
-        }
-
-        llama_v2_hparams session_hparams;
-        file.read_raw(&session_hparams, sizeof(llama_v2_hparams));
-
-        if (session_hparams != ctx->model.hparams) {
-            fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
-            return false;
-        }
-    }
-
-    // load the prompt
-    {
-        const uint32_t n_token_count = file.read_u32();
-
-        if (n_token_count > n_token_capacity) {
-            fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
-            return false;
-        }
-
-        file.read_raw(tokens_out, sizeof(llama_v2_token) * n_token_count);
-        *n_token_count_out = n_token_count;
-    }
-
-    // restore the context state
-    {
-        const size_t n_state_size_cur = file.size - file.tell();
-        const size_t n_state_size_max = llama_v2_get_state_size(ctx);
-
-        if (n_state_size_cur > n_state_size_max) {
-            fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
-            return false;
-        }
-
-        std::vector<uint8_t> state_data(n_state_size_max);
-        file.read_raw(state_data.data(), n_state_size_cur);
-
-        llama_v2_set_state_data(ctx, state_data.data());
-    }
-
-    return true;
-}
-
-bool llama_v2_save_session_file(struct llama_v2_context * ctx, const char * path_session, const llama_v2_token * tokens, size_t n_token_count) {
-    llama_v2_file file(path_session, "wb");
-
-    file.write_u32(LLAMA_V2_SESSION_MAGIC);
-    file.write_u32(LLAMA_V2_SESSION_VERSION);
-
-    file.write_raw(&ctx->model.hparams, sizeof(llama_v2_hparams));
-
-    // save the prompt
-    file.write_u32((uint32_t) n_token_count);
-    file.write_raw(tokens, sizeof(llama_v2_token) * n_token_count);
-
-    // save the context state
-    {
-        const size_t n_state_size_max = llama_v2_get_state_size(ctx);
-
-        std::vector<uint8_t> state_data(n_state_size_max);
-        const size_t n_state_size_cur = llama_v2_copy_state_data(ctx, state_data.data());
-
-        file.write_raw(state_data.data(), n_state_size_cur);
-    }
-
-    return true;
-}
-
 int llama_v2_eval(
         struct llama_v2_context * ctx,
         const llama_v2_token * tokens,


@@ -140,10 +140,6 @@ extern "C" {
    // Returns the number of bytes read
    LLAMA_V2_API size_t llama_v2_set_state_data(struct llama_v2_context * ctx, const uint8_t * src);
-   // Save/load session file
-   LLAMA_V2_API bool llama_v2_load_session_file(struct llama_v2_context * ctx, const char * path_session, llama_v2_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
-   LLAMA_V2_API bool llama_v2_save_session_file(struct llama_v2_context * ctx, const char * path_session, const llama_v2_token * tokens, size_t n_token_count);
    // Run the llama inference to obtain the logits and probabilities for the next token.
    // tokens + n_tokens is the provided batch of new tokens to process
    // n_past is the number of tokens to use from previous eval calls
@@ -167,7 +163,7 @@ extern "C" {
        int n_max_tokens,
        bool add_bos);
std::vector<llama_v2_token> legacy_llama_v2_tokenize(struct llama_v2_context * ctx, const std::string & text, bool add_bos);
LLAMA_V2_API int llama_v2_n_vocab(const struct llama_v2_context * ctx);


@@ -126,7 +126,7 @@ struct rwkv_v2_model {
// Finds model parameter by key and sets it into dest.
// If the parameter was not found, returns false.
-bool rwkv_v2_set_parameter(std::unordered_map<std::string, struct ggml_v2_tensor *> * parameters, char * key, struct ggml_v2_tensor ** dest) {
+bool rwkv_v2_set_parameter(std::unordered_map<std::string, struct ggml_v2_tensor *> * parameters, const char * key, struct ggml_v2_tensor ** dest) {
    struct ggml_v2_tensor * parameter = (*parameters)[key];
    RWKV_V2_ASSERT_FALSE(parameter != NULL, "Parameter %s not found in model file", key);
    *dest = parameter;
@@ -135,7 +135,7 @@ bool rwkv_v2_set_parameter(std::unordered_map<std::string, struct ggml_v2_tensor
// Finds block parameter by block index and key and sets it into dest.
// If the parameter was not found, returns false.
-bool rwkv_v2_set_block_parameter(std::unordered_map<std::string, struct ggml_v2_tensor *> * parameters, int32_t block_index, char * key, struct ggml_v2_tensor ** dest) {
+bool rwkv_v2_set_block_parameter(std::unordered_map<std::string, struct ggml_v2_tensor *> * parameters, int32_t block_index, const char * key, struct ggml_v2_tensor ** dest) {
    char full_key[128];
    sprintf(full_key, "blocks.%d.%s", block_index, key);
    return rwkv_v2_set_parameter(parameters, full_key, dest);


@@ -112,28 +112,6 @@ static sd_ctx_t * sd_ctx = nullptr;
static int sddebugmode = 0;
static std::string recent_data = "";
-std::string base64_encode(const unsigned char* data, unsigned int data_length) {
-    const std::string base64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
-    std::string encoded;
-    encoded.reserve(((data_length + 2) / 3) * 4);
-    for (unsigned int i = 0; i < data_length; i += 3) {
-        unsigned int triple = (data[i] << 16) + (i + 1 < data_length ? data[i + 1] << 8 : 0) + (i + 2 < data_length ? data[i + 2] : 0);
-        encoded.push_back(base64_chars[(triple >> 18) & 0x3F]);
-        encoded.push_back(base64_chars[(triple >> 12) & 0x3F]);
-        if (i + 1 < data_length) {
-            encoded.push_back(base64_chars[(triple >> 6) & 0x3F]);
-        } else {
-            encoded.push_back('=');
-        }
-        if (i + 2 < data_length) {
-            encoded.push_back(base64_chars[triple & 0x3F]);
-        } else {
-            encoded.push_back('=');
-        }
-    }
-    return encoded;
-}
static std::string sdplatformenv, sddeviceenv, sdvulkandeviceenv;
static bool notiling = false;
bool sdtype_load_model(const sd_load_model_inputs inputs) {
@@ -553,7 +531,7 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
    unsigned char * png = stbi_write_png_to_mem(results[i].data, 0, results[i].width, results[i].height, results[i].channel, &out_data_len, "");
    if (png != NULL)
    {
-       recent_data = base64_encode(png,out_data_len);
+       recent_data = kcpp_base64_encode(png,out_data_len);
        free(png);
    }

otherarch/tts_adapter.cpp (new file, 672 lines)
@@ -0,0 +1,672 @@
#include "model_adapter.h"
#include "otherarch/utils.h"
#include "common.h"
#include "sampling.h"
#include "llama.h"
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <ctime>
#include <fstream>
#include <map>
#include <random>
#include <regex>
#include <sstream>
#include <string>
#include <thread>
#include <vector>
#include "src/llama-context.h"
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#ifndef M_PI
#define M_PI 3.14159265358979323846 // not guaranteed by <cmath> on all toolchains (e.g. MSVC)
#endif
struct wav_header {
char riff[4] = {'R', 'I', 'F', 'F'};
uint32_t chunk_size;
char wave[4] = {'W', 'A', 'V', 'E'};
char fmt[4] = {'f', 'm', 't', ' '};
uint32_t fmt_chunk_size = 16;
uint16_t audio_format = 1; // PCM
uint16_t num_channels = 1; // Mono
uint32_t sample_rate;
uint32_t byte_rate;
uint16_t block_align;
uint16_t bits_per_sample = 16;
char data[4] = {'d', 'a', 't', 'a'};
uint32_t data_size;
};
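// Editor's note: save_wav16_base64 below writes this struct byte-for-byte, which
// assumes the canonical 44-byte RIFF header with no compiler padding. Every field
// sits on its natural alignment, so common ABIs pack it to exactly 44 bytes; a
// guard for that assumption (illustrative, not part of this commit):
static_assert(sizeof(wav_header) == 44, "wav_header must serialize to the 44-byte RIFF header");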
static std::string save_wav16_base64(const std::vector<float> &data, int sample_rate) {
std::ostringstream oss;
wav_header header;
// Fill header fields
header.sample_rate = sample_rate;
header.byte_rate = header.sample_rate * header.num_channels * (header.bits_per_sample / 8);
header.block_align = header.num_channels * (header.bits_per_sample / 8);
header.data_size = data.size() * (header.bits_per_sample / 8);
header.chunk_size = 36 + header.data_size;
// Write header
oss.write(reinterpret_cast<const char*>(&header), sizeof(header));
// Write samples
for (const auto &sample : data) {
int16_t pcm_sample = static_cast<int16_t>(std::clamp(sample * 32767.0, -32768.0, 32767.0));
oss.write(reinterpret_cast<const char*>(&pcm_sample), sizeof(pcm_sample));
}
// Get binary WAV data
std::string wav_data = oss.str();
return kcpp_base64_encode(wav_data); //return as base64 string
}
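// Editor's sketch (not part of this commit): exercising the 16-bit PCM WAV +
// base64 path with a synthetic 440 Hz tone at the vocoder's 24 kHz sample rate.
#if 0 // illustrative only
static std::string make_test_tone_b64() {
    const int sr = 24000;
    std::vector<float> samples(sr); // one second of audio
    for (int i = 0; i < sr; ++i) {
        samples[i] = 0.25f * sinf(2.0f * (float)M_PI * 440.0f * i / sr);
    }
    return save_wav16_base64(samples, sr);
}
#endif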
static void fill_hann_window(int length, bool periodic, float * output) {
int offset = -1;
if (periodic) {
offset = 0;
}
for (int i = 0; i < length; i++) {
output[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
}
}
// very poor-man fft
static void twiddle(float * real, float * imag, int k, int N) {
float angle = 2 * M_PI * k / N;
*real = cos(angle);
*imag = sin(angle);
}
static void irfft(int n, const float * inp_cplx, float * out_real) {
int N = n / 2 + 1;
std::vector<float> real_input(N);
std::vector<float> imag_input(N);
for (int i = 0; i < N; ++i) {
real_input[i] = inp_cplx[2 * i];
imag_input[i] = inp_cplx[2 * i + 1];
}
std::vector<float> real_output(n);
std::vector<float> imag_output(n);
for (int k = 0; k < n; ++k) {
real_output[k] = 0.0f;
imag_output[k] = 0.0f;
for (int m = 0; m < N; ++m) {
float twiddle_real;
float twiddle_imag;
twiddle(&twiddle_real, &twiddle_imag, k * m, n);
real_output[k] += real_input[m] * twiddle_real - imag_input[m] * twiddle_imag;
imag_output[k] += real_input[m] * twiddle_imag + imag_input[m] * twiddle_real;
}
}
for (int i = 0; i < n; ++i) {
out_real[i] = real_output[i] / N;
}
}
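// Editor's note, for reference: with N = n/2 + 1 packed complex bins X_m, the
// loops above compute the naive inverse real DFT
//   out[k] = (1/N) * sum_{m=0..N-1} ( Re(X_m)*cos(2*pi*k*m/n) - Im(X_m)*sin(2*pi*k*m/n) )
// in O(n*N) time, hence the "poor-man fft" caveat; a true FFT would be O(n log n).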
static void fold(const std::vector<float> & data, int64_t n_out, int64_t n_win, int64_t n_hop, int64_t n_pad, std::vector<float> & output) {
int64_t output_height = n_out;
int64_t kernel_w = n_win;
int64_t stride_w = n_hop;
int64_t width = n_out;
output.resize(width, 0.0f);
int64_t col_idx = 0;
for (int64_t w_col = 0; w_col < width; ++w_col) {
int64_t start = w_col * stride_w - n_pad;
int64_t end = start + kernel_w;
for (int64_t w_im = start; w_im < end; ++w_im) {
if (w_im >= 0 && w_im < output_height && col_idx < (int64_t) data.size()) {
output[w_im] += data[col_idx];
}
col_idx++;
}
}
output.resize(n_out - 2 * n_pad);
}
// TODO: not optimized at all
static std::vector<float> embd_to_audio(
const float * embd,
const int n_codes,
const int n_embd,
const int n_thread) {
const int n_fft = 1280;
const int n_hop = 320;
const int n_win = 1280;
const int n_pad = (n_win - n_hop)/2;
const int n_out = (n_codes - 1)*n_hop + n_win;
std::vector<float> hann(n_fft);
fill_hann_window(hann.size(), true, hann.data());
int n_spec = n_embd*n_codes;
std::vector<float> E (n_spec);
std::vector<float> S (n_spec);
std::vector<float> ST(n_spec);
for (int l = 0; l < n_codes; ++l) {
for (int k = 0; k < n_embd; ++k) {
E[k*n_codes + l] = embd[l*n_embd + k];
}
}
for (int k = 0; k < n_embd/2; ++k) {
for (int l = 0; l < n_codes; ++l) {
float mag = E[(k )*n_codes + l];
float phi = E[(k + n_embd/2)*n_codes + l];
mag = exp(mag);
if (mag > 1e2) {
mag = 1e2;
}
S[2*(k*n_codes + l) + 0] = mag*cosf(phi);
S[2*(k*n_codes + l) + 1] = mag*sinf(phi);
}
}
for (int l = 0; l < n_codes; ++l) {
for (int k = 0; k < n_embd/2; ++k) {
ST[l*n_embd + 2*k + 0] = S[2*(k*n_codes + l) + 0];
ST[l*n_embd + 2*k + 1] = S[2*(k*n_codes + l) + 1];
}
}
std::vector<float> res (n_codes*n_fft);
std::vector<float> hann2(n_codes*n_fft);
std::vector<std::thread> workers(n_thread);
for (int i = 0; i < n_thread; ++i) {
workers[i] = std::thread([&, i]() {
for (int l = i; l < n_codes; l += n_thread) {
irfft(n_fft, ST.data() + l*n_embd, res.data() + l*n_fft);
for (int j = 0; j < n_fft; ++j) {
res [l*n_fft + j] *= hann[j];
hann2[l*n_fft + j] = hann[j] * hann[j];
}
}
});
}
for (int i = 0; i < n_thread; ++i) {
workers[i].join();
}
std::vector<float> audio;
std::vector<float> env;
fold(res, n_out, n_win, n_hop, n_pad, audio);
fold(hann2, n_out, n_win, n_hop, n_pad, env); // TODO: can be done once
for (size_t i = 0; i < audio.size(); ++i) {
audio[i] /= env[i];
}
return audio;
}
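// Editor's note: the final division is standard synthesis-window normalization
// for overlap-add resynthesis. With Hann window w, hop h and raw inverse-FFT
// frames x_l, fold(res) accumulates sum_l w*x_l and fold(hann2) accumulates
// sum_l w^2, so (padding offsets omitted)
//   audio[t] = ( sum_l w[t - l*h] * x_l[t - l*h] ) / ( sum_l w[t - l*h]^2 )
// which undoes the windowing wherever frames overlap.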
static const std::map<int, std::string> ones = {
{0, "zero"}, {1, "one"}, {2, "two"}, {3, "three"}, {4, "four"},
{5, "five"}, {6, "six"}, {7, "seven"}, {8, "eight"}, {9, "nine"},
{10, "ten"}, {11, "eleven"}, {12, "twelve"}, {13, "thirteen"}, {14, "fourteen"},
{15, "fifteen"}, {16, "sixteen"}, {17, "seventeen"}, {18, "eighteen"}, {19, "nineteen"}
};
static const std::map<int, std::string> tens = {
{2, "twenty"}, {3, "thirty"}, {4, "forty"}, {5, "fifty"},
{6, "sixty"}, {7, "seventy"}, {8, "eighty"}, {9, "ninety"}
};
// Convert a number less than 1000 to words
static std::string convert_less_than_thousand(int num) {
std::string result;
if (num >= 100) {
result += ones.at(num / 100) + " hundred ";
num %= 100;
}
if (num >= 20) {
result += tens.at(num / 10);
if (num % 10 > 0) {
result += "-" + ones.at(num % 10);
}
} else if (num > 0) {
result += ones.at(num);
}
return result;
}
static std::string number_to_words(const std::string & number_str) {
try {
size_t decimal_pos = number_str.find('.');
std::string integer_part = number_str.substr(0, decimal_pos);
int int_number = std::stoi(integer_part);
std::string result;
if (int_number == 0) {
result = "zero";
} else {
if (int_number >= 1000000000) {
int billions = int_number / 1000000000;
result += convert_less_than_thousand(billions) + " billion ";
int_number %= 1000000000;
}
if (int_number >= 1000000) {
int millions = int_number / 1000000;
result += convert_less_than_thousand(millions) + " million ";
int_number %= 1000000;
}
if (int_number >= 1000) {
int thousands = int_number / 1000;
result += convert_less_than_thousand(thousands) + " thousand ";
int_number %= 1000;
}
if (int_number > 0) {
result += convert_less_than_thousand(int_number);
}
}
// Handle decimal part
if (decimal_pos != std::string::npos) {
result += " point";
std::string decimal_part = number_str.substr(decimal_pos + 1);
for (char digit : decimal_part) {
result += " " + ones.at(digit - '0');
}
}
return result;
} catch (const std::exception& e) {
// Skip if fails
return " ";
}
}
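// Editor's sketch (not part of this commit): spot checks of the conversion.
#if 0 // illustrative only; needs <cassert>
static void test_number_to_words() {
    assert(number_to_words("0") == "zero");
    assert(number_to_words("42") == "forty-two");
    assert(number_to_words("1234.5") == "one thousand two hundred thirty-four point five");
}
#endif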
static std::string replace_numbers_with_words(const std::string & input_text) {
std::regex number_pattern(R"(\d+(\.\d+)?)");
std::string result;
auto it = std::sregex_iterator(input_text.begin(), input_text.end(), number_pattern);
auto end = std::sregex_iterator();
size_t last_pos = 0;
for (std::sregex_iterator i = it; i != end; ++i) {
const std::smatch& match = *i;
result.append(input_text, last_pos, match.position() - last_pos);
result.append(number_to_words(match.str()));
last_pos = match.position() + match.length();
}
result.append(input_text, last_pos);
return result;
}
static std::string process_text(const std::string & text) {
std::string processed_text = replace_numbers_with_words(text);
std::transform(processed_text.begin(), processed_text.end(),
processed_text.begin(), ::tolower);
std::regex special_chars(R"([-_/,\.\\])");
processed_text = std::regex_replace(processed_text, special_chars, " ");
std::regex non_alpha(R"([^a-z\s])");
processed_text = std::regex_replace(processed_text, non_alpha, "");
std::regex multiple_spaces(R"(\s+)");
processed_text = std::regex_replace(processed_text, multiple_spaces, " ");
processed_text = std::regex_replace(processed_text, std::regex(R"(^\s+|\s+$)"), "");
processed_text = std::regex_replace(processed_text, std::regex(R"(\s)"), "<|text_sep|>");
return processed_text;
}
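// Editor's note: tracing one input through the pipeline (illustrative):
//   process_text("Hello, World! 42")
//     numbers -> words : "Hello, World! forty-two"
//     lowercase        : "hello, world! forty-two"
//     punct -> space   : "hello  world! forty two"   (',' '.' '-' '_' '/' '\' become spaces)
//     strip non-alpha  : "hello  world forty two"
//     collapse + trim  : "hello world forty two"
//     word separators  : "hello<|text_sep|>world<|text_sep|>forty<|text_sep|>two"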
static void prompt_add(llama_tokens & prompt, const llama_tokens & tokens) {
prompt.insert(prompt.end(), tokens.begin(), tokens.end());
}
static void prompt_add(llama_tokens & prompt, const llama_model * model, const std::string & txt, bool add_special, bool parse_special) {
auto tmp = common_tokenize(model, txt, add_special, parse_special);
prompt_add(prompt, tmp);
}
static void prompt_init(llama_tokens & prompt, const llama_model * model) {
prompt.clear();
prompt_add(prompt, model, "<|im_start|>\n", true, true);
}
static std::vector<llama_token> prepare_guide_tokens(const llama_model * model, const std::string& str)
{
const std::string& delimiter = "<|text_sep|>";
std::vector<llama_token> result;
size_t start = 0;
size_t end = str.find(delimiter);
while (end != std::string::npos) {
std::string current_word = str.substr(start, end - start);
auto tmp = common_tokenize(model, current_word, false, true);
if (!tmp.empty()) { //guard against words that tokenize to nothing
result.push_back(tmp[0]); //only the first token of each word guides generation
}
start = end + delimiter.length();
end = str.find(delimiter, start);
}
// Add the last part
std::string current_word = str.substr(start);
auto tmp = common_tokenize(model, current_word, false, true);
if (!tmp.empty()) {
result.push_back(tmp[0]);
}
return result;
}
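// Editor's note: e.g. prepare_guide_tokens(model, "hello<|text_sep|>world")
// yields two entries: the first token of "hello" and the first token of
// "world". The generation loop below swaps one in whenever a new word starts,
// keeping the spoken output aligned with the input text.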
static llama_context * ttc_ctx = nullptr; //text to codes ctx
static llama_context * cts_ctx = nullptr; //codes to speech
static int ttsdebugmode = 0;
static std::string ttsplatformenv, ttsdeviceenv, ttsvulkandeviceenv;
static std::string last_generated_audio = "";
bool ttstype_load_model(const tts_load_model_inputs inputs)
{
//duplicated from expose.cpp
int cl_parseinfo = inputs.clblast_info; //first digit is whether configured, second is platform, third is devices
std::string usingclblast = "GGML_OPENCL_CONFIGURED="+std::to_string(cl_parseinfo>0?1:0);
putenv((char*)usingclblast.c_str());
cl_parseinfo = cl_parseinfo%100; //keep last 2 digits
int platform = cl_parseinfo/10;
int devices = cl_parseinfo%10;
ttsplatformenv = "GGML_OPENCL_PLATFORM="+std::to_string(platform);
ttsdeviceenv = "GGML_OPENCL_DEVICE="+std::to_string(devices);
putenv((char*)ttsplatformenv.c_str());
putenv((char*)ttsdeviceenv.c_str());
std::string vulkan_info_raw = inputs.vulkan_info;
std::string vulkan_info_str = "";
for (size_t i = 0; i < vulkan_info_raw.length(); ++i) {
vulkan_info_str += vulkan_info_raw[i];
if (i < vulkan_info_raw.length() - 1) {
vulkan_info_str += ",";
}
}
if(vulkan_info_str!="")
{
ttsvulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+vulkan_info_str;
putenv((char*)ttsvulkandeviceenv.c_str());
}
llama_backend_init();
std::string modelfile_ttc = inputs.ttc_model_filename;
std::string modelfile_cts = inputs.cts_model_filename;
printf("\nLoading TTS Model, OuteTTS: %s \nWavTokenizer: %s \n",modelfile_ttc.c_str(),modelfile_cts.c_str());
ttsdebugmode = inputs.debugmode;
// tts init
llama_model_params tts_model_params = llama_model_default_params();
llama_context_params tts_ctx_params = llama_context_default_params();
const int nthreads = 4;
tts_model_params.use_mmap = false;
tts_model_params.use_mlock = false;
tts_model_params.n_gpu_layers = inputs.gpulayers; //offload if possible
tts_model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER;
tts_ctx_params.n_ctx = 8192;
tts_ctx_params.logits_all = false;
tts_ctx_params.offload_kqv = true;
tts_ctx_params.n_batch = 8192;
tts_ctx_params.n_ubatch = 512;
tts_ctx_params.n_threads = nthreads;
tts_ctx_params.n_threads_batch = nthreads;
tts_ctx_params.flash_attn = false;
llama_model * ttcmodel = llama_model_load_from_file(modelfile_ttc.c_str(), tts_model_params);
if (ttcmodel == nullptr) {
printf("\nTTS Load Error: Failed to load ttc model!\n");
return false;
}
ttc_ctx = llama_new_context_with_model(ttcmodel, tts_ctx_params);
if (ttc_ctx == nullptr) {
printf("\nTTS Load Error: Failed to initialize ttc context!\n");
return false;
}
llama_model * ctsmodel = llama_model_load_from_file(modelfile_cts.c_str(), tts_model_params);
if (ctsmodel == nullptr) {
printf("\nTTS Load Error: Failed to load cts model!\n");
return false;
}
tts_ctx_params.embeddings = true; //the vocoder needs embeddings output rather than logits
cts_ctx = llama_new_context_with_model(ctsmodel, tts_ctx_params);
if (cts_ctx == nullptr) {
printf("\nTTS Load Error: Failed to initialize cts context!\n");
return false;
}
//warmup: verify the ttc context can decode a tiny batch before declaring success
std::vector<llama_token> tmp = {1, 2, 3, 4};
llama_kv_cache_clear(ttc_ctx);
auto er = llama_decode(ttc_ctx, llama_batch_get_one(tmp.data(), tmp.size()));
if(er!=0)
{
printf("\nTTS Eval returned nonzero: %d\n",er);
return false;
}
printf("\nTTS Load Complete.\n");
return true;
}
tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
{
tts_generation_outputs output;
if(ttc_ctx==nullptr || cts_ctx==nullptr)
{
printf("\nWarning: KCPP TTS not initialized!\n");
output.data = "";
output.status = 0;
return output;
}
std::vector<llama_token> codes;
std::vector<llama_token> guide_tokens;
const llama_model * model_ttc = &(ttc_ctx->model);
const llama_model * model_cts = &(cts_ctx->model);
const int ttc_n_vocab = llama_n_vocab(model_ttc);
std::string prompt = inputs.prompt;
if(!inputs.quiet)
{
printf("\nTTS Generating... ");
}
// process prompt and generate voice codes
std::vector<llama_token> prompt_inp;
prompt_init(prompt_inp, model_ttc);
prompt_add(prompt_inp, model_ttc, "<|text_start|>", false, true);
int speaker_seed = inputs.speaker_seed;
int audio_seed = inputs.audio_seed;
if (speaker_seed <= 0 || speaker_seed==0xFFFFFFFF)
{
speaker_seed = (((uint32_t)time(NULL)) % 1000000u);
if(ttsdebugmode==1)
{
printf("\nUsing Speaker Seed: %d", speaker_seed);
}
}
if (audio_seed <= 0 || audio_seed==0xFFFFFFFF)
{
audio_seed = (((uint32_t)time(NULL)) % 1000000u);
if(ttsdebugmode==1)
{
printf("\nUsing Audio Seed: %d", audio_seed);
}
}
std::mt19937 tts_rng(audio_seed);
std::mt19937 speaker_rng(speaker_seed);
//add the speaker based on the seed
if(speaker_seed>0)
{
//TODO: speaker voice conditioning is not wired up yet; sampletext is an unused placeholder
std::string sampletext = "but<|text_sep|>that<|text_sep|>is<|text_sep|>what<|text_sep|>it<|text_sep|>is<|text_sep|>";
}
// convert the input text into the necessary format expected by OuteTTS
std::string prompt_clean = process_text(prompt);
if(prompt_clean.size()==0)
{
//no input - bail out regardless of quiet mode
if(!inputs.quiet)
{
printf("\nTTS sent empty input.\n");
}
output.data = "";
output.status = 1;
return output;
}
if(!inputs.quiet && ttsdebugmode==1)
{
printf("\nInput: %s\n", prompt_clean.c_str());
}
guide_tokens = prepare_guide_tokens(model_ttc,prompt_clean);
prompt_add(prompt_inp, model_ttc, prompt_clean, false, true);
if(!inputs.quiet)
{
printf(" (%d input words)...", guide_tokens.size());
}
prompt_add(prompt_inp, model_ttc, "<|text_end|>\n", false, true);
//create batch with tokens for decoding prompt processing
llama_kv_cache_clear(ttc_ctx);
llama_kv_cache_clear(cts_ctx);
kcpp_embd_batch tts_batch = kcpp_embd_batch(prompt_inp, 0, false, true);
auto evalok = (llama_decode(ttc_ctx, tts_batch.batch)==0);
if (!evalok) {
printf("\nError: TTS prompt batch processing failed\n");
output.data = "";
output.status = 0;
return output;
}
// main loop
int n_decode = 0;
int n_predict = 4096; //max 4096 tokens
bool next_token_uses_guide_token = true;
while (n_decode <= n_predict)
{
float * logits = llama_get_logits(ttc_ctx);
llama_token new_token_id = kcpp_quick_sample(logits,ttc_n_vocab,20,1.0,tts_rng);
//guide tokens help prevent hallucinations by forcing the TTS to use the correct word
if(!guide_tokens.empty() && next_token_uses_guide_token && !llama_token_is_control(model_ttc, new_token_id) && !llama_token_is_eog(model_ttc, new_token_id))
{
llama_token guide_token = guide_tokens[0];
guide_tokens.erase(guide_tokens.begin());
new_token_id = guide_token; //ensure correct word fragment is used
}
//token id 198 ('\n' in the OuteTTS vocab) always precedes a new word
next_token_uses_guide_token = (new_token_id == 198);
codes.push_back(new_token_id);
// is it an end of generation? -> mark the stream as finished
if (llama_token_is_eog(model_ttc, new_token_id) || n_decode >= n_predict) {
break;
}
n_decode += 1;
std::vector<llama_token> next = {new_token_id};
llama_batch batch = llama_batch_get_one(next.data(), next.size());
// evaluate the current batch with the transformer model
if (llama_decode(ttc_ctx, batch)) {
printf("\nError: TTS code generation failed!\n");
output.data = "";
output.status = 0;
return output;
}
}
if(!inputs.quiet && ttsdebugmode==1)
{
const std::string inp_txt = common_detokenize(ttc_ctx, codes, true);
printf("\nGenerated %d Codes: '%s'\n",codes.size(), inp_txt.c_str());
}
// remove all non-audio tokens (i.e. < 151672 || > 155772)
codes.erase(std::remove_if(codes.begin(), codes.end(), [](llama_token t) { return t < 151672 || t > 155772; }), codes.end());
for (auto & token : codes) {
token -= 151672;
}
const int n_codes = codes.size();
if(n_codes<=1)
{
printf("\nWarning: TTS vocoder generated nothing!\n");
output.data = "";
output.status = 0;
return output;
}
kcpp_embd_batch codebatch = kcpp_embd_batch(codes,0,false,true);
if (llama_decode(cts_ctx, codebatch.batch) != 0) {
printf("\nError: TTS vocoder generation failed!\n");
output.data = "";
output.status = 0;
return output;
}
else
{
// spectral operations
const int n_embd = llama_n_embd(model_cts);
const float * embd = llama_get_embeddings(cts_ctx);
std::vector<float> audio = embd_to_audio(embd, n_codes, n_embd, 4);
const int n_sr = 24000; // sampling rate
// zero out first 0.05 seconds
for (int i = 0; i < 24000/20; ++i) {
audio[i] = 0.0f;
}
//add some silence at the end
for (int i = 0; i < 24000/20; ++i) {
audio.push_back(0.0f);
}
last_generated_audio = save_wav16_base64(audio, n_sr);
if(!inputs.quiet)
{
printf("\nTTS Generated %d audio tokens.\n",(int) codes.size());
}
output.data = last_generated_audio.c_str();
output.status = 1;
return output;
}
}


otherarch/utils.cpp
@@ -1,5 +1,6 @@
#include "utils.h"
#include "common.h"
+#include "llama.h"
#include <cmath>
#include <cstring>
@@ -303,6 +304,47 @@ std::vector<uint8_t> kcpp_base64_decode(const std::string & encoded_string)
    return ret;
}
std::string kcpp_base64_encode(const unsigned char* data, unsigned int data_length) {
const std::string base64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
std::string encoded;
encoded.reserve(((data_length + 2) / 3) * 4);
for (unsigned int i = 0; i < data_length; i += 3) {
unsigned int triple = (data[i] << 16) + (i + 1 < data_length ? data[i + 1] << 8 : 0) + (i + 2 < data_length ? data[i + 2] : 0);
encoded.push_back(base64_chars[(triple >> 18) & 0x3F]);
encoded.push_back(base64_chars[(triple >> 12) & 0x3F]);
if (i + 1 < data_length) {
encoded.push_back(base64_chars[(triple >> 6) & 0x3F]);
} else {
encoded.push_back('=');
}
if (i + 2 < data_length) {
encoded.push_back(base64_chars[triple & 0x3F]);
} else {
encoded.push_back('=');
}
}
return encoded;
}
std::string kcpp_base64_encode(const std::string &data) {
static const char lookup[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
std::string encoded;
int val = 0, valb = -6;
for (unsigned char c : data) {
val = (val << 8) + c;
valb += 8;
while (valb >= 0) {
encoded.push_back(lookup[(val >> valb) & 0x3F]);
valb -= 6;
}
}
if (valb > -6) {
encoded.push_back(lookup[((val << 8) >> (valb + 8)) & 0x3F]);
}
while (encoded.size() % 4) {
encoded.push_back('=');
}
return encoded;
}
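// Editor's sketch (not part of this commit): round-tripping the two encoders
// against the existing decoder above.
#if 0 // illustrative only
void base64_roundtrip_demo() {
    std::string s = "hello";
    std::string e1 = kcpp_base64_encode(s);                      // "aGVsbG8="
    std::string e2 = kcpp_base64_encode(
        (const unsigned char*)s.data(), (unsigned int)s.size()); // same result
    std::vector<uint8_t> raw = kcpp_base64_decode(e1);           // bytes of "hello"
}
#endif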
std::string get_timestamp_str()
{
@@ -314,3 +356,150 @@ std::string get_timestamp_str()
    std::string timestamp(buffer);
    return timestamp;
}
//a very rudimentary all-in-one sampling function with no dependencies
int32_t kcpp_quick_sample(float * logits, const int n_logits, int top_k, float temp, std::mt19937 & rng)
{
if (temp <= 0 || top_k==1) {
// select the token with the highest logit directly
float max_logit = logits[0];
int32_t max_id = 0;
for (int i = 1; i < n_logits; ++i) {
if (logits[i] > max_logit) {
max_logit = logits[i];
max_id = i;
}
}
return max_id;
}
top_k = (top_k<=0 || top_k>300)?300:top_k;
top_k = std::min(top_k, n_logits);
std::vector<std::pair<float, int32_t>> logits_id;
logits_id.reserve(n_logits);
//temperature sample
const float scale = 1.0f/temp;
for (int i = 0; i < n_logits; ++i) {
logits_id.push_back(std::make_pair(logits[i]*scale, i));
}
//sample top_k
std::partial_sort(
logits_id.begin(),
logits_id.begin() + top_k, logits_id.end(),
[](const std::pair<float, int32_t> & a, const std::pair<float, int32_t> & b) {
return a.first > b.first;
});
logits_id.resize(top_k);
// compute probs for the top k tokens
std::vector<float> probs;
probs.reserve(logits_id.size());
float maxl = logits_id[0].first;
double sum = 0.0;
for (const auto & kv : logits_id) {
const float p = expf(kv.first - maxl);
probs.push_back(p);
sum += p;
}
// normalize the probs
for (auto & p : probs) {
p /= sum;
}
std::discrete_distribution<> dist(probs.begin(), probs.end());
int idx = dist(rng);
return logits_id[idx].second;
}
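// Editor's note: for temp > 0 and top_k > 1 this is a softmax over the top-k
// temperature-scaled logits, p_i ~ exp((l_i - max_j l_j) / T); otherwise greedy
// argmax. Hypothetical call, mirroring the TTS loop (ctx assumed decoded):
#if 0 // illustrative only
std::mt19937 rng(1234);
float * logits = llama_get_logits(ctx);
llama_token tok = kcpp_quick_sample(logits, llama_n_vocab(llama_get_model(ctx)),
                                    /*top_k=*/20, /*temp=*/1.0f, rng);
#endif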
kcpp_embd_batch::kcpp_embd_batch(float * embd, int32_t n_tokens, int32_t npast, bool use_mrope)
{
int32_t seq_id = 0;
pos.resize(n_tokens * (use_mrope?4:1));
std::fill(pos.begin(), pos.end(), 0);
n_seq_id.resize(n_tokens);
seq_ids.resize(n_tokens + 1);
logits.resize(n_tokens);
seq_id_0.resize(1);
seq_id_0[0] = seq_id;
seq_ids [n_tokens] = nullptr;
batch = {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
/*logits =*/ logits.data(),
};
if(!use_mrope)
{
for (int i = 0; i < n_tokens; i++) {
batch.pos [i] = npast + i;
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;
}
}
else
{
for (int i = 0; i < n_tokens; i++) {
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;
}
for (int j = 0; j < batch.n_tokens * 3; j++) {
batch.pos[j] = npast + (j % batch.n_tokens);
}
}
}
kcpp_embd_batch::kcpp_embd_batch(std::vector<llama_token> & tokens, int32_t npast, bool use_mrope, bool return_all_logits)
{
int32_t seq_id = 0;
int32_t n_tokens = tokens.size();
pos.resize(n_tokens * (use_mrope?4:1));
std::fill(pos.begin(), pos.end(), 0);
n_seq_id.resize(n_tokens);
seq_ids.resize(n_tokens + 1);
logits.resize(n_tokens);
seq_id_0.resize(1);
seq_id_0[0] = seq_id;
seq_ids[n_tokens] = nullptr;
batch = {
/*n_tokens =*/ n_tokens,
/*tokens =*/ tokens.data(),
/*embd =*/ nullptr,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
/*logits =*/ logits.data(),
};
if(!use_mrope)
{
for (int i = 0; i < n_tokens; i++) {
batch.pos [i] = npast + i;
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = (return_all_logits?true:false);
}
}
else
{
for (int i = 0; i < n_tokens; i++) {
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = (return_all_logits?true:false);
}
for (int j = 0; j < batch.n_tokens * 3; j++) {
batch.pos[j] = npast + (j % batch.n_tokens);
}
}
batch.logits[n_tokens - 1] = true;
}
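// Editor's sketch (not part of this commit): how the token-batch helper is
// consumed, mirroring tts_adapter.cpp (model/ctx assumed already loaded).
#if 0 // illustrative only
std::vector<llama_token> tokens = common_tokenize(model, "hello world", true, true);
kcpp_embd_batch batch(tokens, /*npast=*/0, /*use_mrope=*/false, /*return_all_logits=*/true);
if (llama_decode(ctx, batch.batch) != 0) {
    printf("decode failed\n");
}
#endif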


otherarch/utils.h
@@ -8,6 +8,7 @@
#include <random>
#include <thread>
#include "ggml_v3.h"
+#include "llama.h"
//
// CLI argument parsing
@@ -52,10 +53,23 @@ void gpt_split_words(std::string str, std::vector<std::string>& words);
//
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
bool should_transpose_layer(std::string name);
void kcpp_graph_compute_helper(ggml_v3_cgraph * graph, int n_threads);
std::vector<uint8_t> kcpp_base64_decode(const std::string & encoded_string);
+std::string kcpp_base64_encode(const unsigned char* data, unsigned int data_length);
+std::string kcpp_base64_encode(const std::string &data);
std::string get_timestamp_str();
+int32_t kcpp_quick_sample(float * logits, const int n_logits, int top_k, float temp, std::mt19937 & rng);
+
+struct kcpp_embd_batch { //duplicated from llava_embd_batch
+    std::vector<int32_t> pos;
+    std::vector<int32_t> n_seq_id;
+    std::vector<int32_t> seq_id_0;
+    std::vector<int32_t *> seq_ids;
+    std::vector<int8_t> logits;
+    llama_batch batch;
+    kcpp_embd_batch(float * embd, int32_t n_tokens, int32_t npast, bool use_mrope);
+    kcpp_embd_batch(std::vector<llama_token> & tokens, int32_t npast, bool use_mrope, bool return_all_logits);
+};