diff --git a/CMakeLists.txt b/CMakeLists.txt index b052650fb..51535046c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -495,7 +495,9 @@ add_library(common2 examples/llava/clip.h src/unicode.h src/unicode.cpp - src/unicode-data.cpp + src/unicode-data.cpp + otherarch/utils.cpp + otherarch/utils.h) target_include_directories(common2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common) target_compile_features(common2 PUBLIC cxx_std_17) # don't bump target_link_libraries(common2 PRIVATE ggml ${LLAMA_EXTRA_LIBS}) @@ -515,11 +517,18 @@ target_compile_features(whisper_adapter PUBLIC cxx_std_17) # don't bump target_link_libraries(whisper_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS}) set_target_properties(whisper_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON) +add_library(tts_adapter + otherarch/tts_adapter.cpp) +target_include_directories(tts_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./examples ./common) +target_compile_features(tts_adapter PUBLIC cxx_std_17) # don't bump +target_link_libraries(tts_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS}) +set_target_properties(tts_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON) + add_library(gpttype_adapter gpttype_adapter.cpp) target_include_directories(gpttype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common) target_compile_features(gpttype_adapter PUBLIC cxx_std_17) # don't bump -target_link_libraries(gpttype_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS}) +target_link_libraries(gpttype_adapter PRIVATE common2 ggml ggml_v1 ggml_v2 ggml_v3 ${LLAMA_EXTRA_LIBS}) set_target_properties(gpttype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON) if (LLAMA_CUBLAS) @@ -530,8 +539,16 @@ if (LLAMA_CUBLAS) set_target_properties(${TARGET} PROPERTIES PREFIX "") set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas") set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter whisper_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS}) + target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter whisper_adapter tts_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + + add_custom_command( + TARGET koboldcpp_cublas POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + $<TARGET_FILE:koboldcpp_cublas> # The generated DLL + ${CMAKE_SOURCE_DIR}/ # Destination directory + COMMENT "Copying DLL to parent directory" + ) endif() if (LLAMA_HIPBLAS) @@ -542,7 +559,15 @@ if (LLAMA_HIPBLAS) set_target_properties(${TARGET} PROPERTIES PREFIX "") set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_hipblas") set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter whisper_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS}) + target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter whisper_adapter tts_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + + add_custom_command( + TARGET koboldcpp_hipblas POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + $<TARGET_FILE:koboldcpp_hipblas> # The generated 
DLL + ${CMAKE_SOURCE_DIR}/ # Destination directory + COMMENT "Copying DLL to parent directory" + ) endif() diff --git a/Makefile b/Makefile index 5ee7e24ae..bb5e74109 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ .PHONY: finishedmsg default: koboldcpp_default koboldcpp_failsafe koboldcpp_noavx2 koboldcpp_clblast koboldcpp_clblast_noavx2 koboldcpp_cublas koboldcpp_hipblas koboldcpp_vulkan koboldcpp_vulkan_noavx2 finishedmsg -tools: quantize_gpt2 quantize_gptj quantize_gguf quantize_neox quantize_mpt quantize_clip whispermain sdmain gguf-split +tools: quantize_gpt2 quantize_gptj quantize_gguf quantize_neox quantize_mpt quantize_clip ttsmain whispermain sdmain gguf-split ifndef UNAME_S UNAME_S := $(shell uname -s) @@ -90,10 +90,10 @@ endif CUBLASLD_FLAGS = CUBLAS_OBJS = -OBJS_FULL += ggml-alloc.o ggml-cpu-traits.o ggml-quants.o ggml-cpu-quants.o ggml-cpu-aarch64.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm.o common.o sampling.o -OBJS_SIMPLE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx2.o ggml-cpu-quants_noavx2.o ggml-cpu-aarch64_noavx2.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx2.o common.o sampling.o -OBJS_SIMPLER += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx1.o ggml-cpu-quants_noavx1.o ggml-cpu-aarch64_noavx1.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx1.o common.o sampling.o -OBJS_FAILSAFE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_failsafe.o ggml-cpu-quants_failsafe.o ggml-cpu-aarch64_failsafe.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_failsafe.o common.o sampling.o +OBJS_FULL += ggml-alloc.o ggml-cpu-traits.o ggml-quants.o ggml-cpu-quants.o ggml-cpu-aarch64.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm.o common.o sampling.o kcpputils.o +OBJS_SIMPLE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx2.o ggml-cpu-quants_noavx2.o ggml-cpu-aarch64_noavx2.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx2.o common.o sampling.o kcpputils.o +OBJS_SIMPLER += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx1.o ggml-cpu-quants_noavx1.o ggml-cpu-aarch64_noavx1.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx1.o common.o sampling.o kcpputils.o +OBJS_FAILSAFE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_failsafe.o ggml-cpu-quants_failsafe.o ggml-cpu-aarch64_failsafe.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_failsafe.o common.o sampling.o kcpputils.o # OS specific ifeq ($(UNAME_S),Linux) @@ -539,6 +539,8 @@ ggml-cpu-cpp.o: ggml/src/ggml-cpu/ggml-cpu.cpp ggml/include/ggml.h ggml/src/ggml $(CXX) $(CXXFLAGS) -c $< -o $@ gguf.o: ggml/src/gguf.cpp ggml/include/gguf.h $(CXX) $(CXXFLAGS) -c $< -o $@ +kcpputils.o: otherarch/utils.cpp otherarch/utils.h + $(CXX) $(CXXFLAGS) -c $< -o $@ #these have special gpu defines ggml-backend_default.o: ggml/src/ggml-backend.cpp ggml/src/ggml-backend-impl.h ggml/include/ggml.h ggml/include/ggml-backend.h @@ -639,8 +641,12 @@ whispercpp_default.o: otherarch/whispercpp/whisper_adapter.cpp whispercpp_cublas.o: otherarch/whispercpp/whisper_adapter.cpp $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@ +#tts objects +tts_default.o: otherarch/tts_adapter.cpp + $(CXX) $(CXXFLAGS) -c $< -o $@ + # idiotic "for easier compilation" -GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp otherarch/llama_v3.cpp src/llama.cpp src/llama-impl.cpp src/llama-chat.cpp src/llama-mmap.cpp src/llama-context.cpp 
src/llama-adapter.cpp src/llama-arch.cpp src/llama-batch.cpp src/llama-vocab.cpp src/llama-grammar.cpp src/llama-sampling.cpp src/llama-kv-cache.cpp src/llama-model-loader.cpp src/llama-model.cpp src/llama-quant.cpp src/llama-hparams.cpp otherarch/utils.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml/include/ggml.h ggml/include/ggml-cpu.h ggml/include/ggml-cuda.h include/llama.h otherarch/llama-util.h +GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp otherarch/llama_v3.cpp src/llama.cpp src/llama-impl.cpp src/llama-chat.cpp src/llama-mmap.cpp src/llama-context.cpp src/llama-adapter.cpp src/llama-arch.cpp src/llama-batch.cpp src/llama-vocab.cpp src/llama-grammar.cpp src/llama-sampling.cpp src/llama-kv-cache.cpp src/llama-model-loader.cpp src/llama-model.cpp src/llama-quant.cpp src/llama-hparams.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml/include/ggml.h ggml/include/ggml-cpu.h ggml/include/ggml-cuda.h include/llama.h otherarch/llama-util.h gpttype_adapter_failsafe.o: $(GPTTYPE_ADAPTER) $(CXX) $(CXXFLAGS) $(FAILSAFE_FLAGS) -c $< -o $@ gpttype_adapter.o: $(GPTTYPE_ADAPTER) @@ -680,11 +686,11 @@ vulkan-shaders-gen: ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp $(shell) vulkan-shaders-gen --glslc glslc --input-dir ggml/src/ggml-vulkan/vulkan-shaders --target-hpp ggml/src/ggml-vulkan-shaders.hpp --target-cpp ggml/src/ggml-vulkan-shaders.cpp #generated libraries -koboldcpp_default: ggml.o ggml-cpu.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS) +koboldcpp_default: ggml.o ggml-cpu.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter.o sdcpp_default.o whispercpp_default.o tts_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS) $(DEFAULT_BUILD) ifdef FAILSAFE_BUILD -koboldcpp_failsafe: ggml_v4_failsafe.o ggml-cpu_v4_failsafe.o ggml_v3_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FAILSAFE) $(OBJS) +koboldcpp_failsafe: ggml_v4_failsafe.o ggml-cpu_v4_failsafe.o ggml_v3_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o tts_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FAILSAFE) $(OBJS) $(FAILSAFE_BUILD) else koboldcpp_failsafe: @@ -692,7 +698,7 @@ koboldcpp_failsafe: endif ifdef NOAVX2_BUILD -koboldcpp_noavx2: ggml_v4_noavx2.o ggml-cpu_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_SIMPLE) $(OBJS) +koboldcpp_noavx2: ggml_v4_noavx2.o ggml-cpu_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o tts_default.o llavaclip_default.o llava.o 
ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_SIMPLE) $(OBJS) $(NOAVX2_BUILD) else koboldcpp_noavx2: @@ -700,10 +706,10 @@ koboldcpp_noavx2: endif ifdef CLBLAST_BUILD -koboldcpp_clblast: ggml_v4_clblast.o ggml-cpu_v4_clblast.o ggml_v3_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS) +koboldcpp_clblast: ggml_v4_clblast.o ggml-cpu_v4_clblast.o ggml_v3_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o tts_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS) $(CLBLAST_BUILD) ifdef NOAVX2_BUILD -koboldcpp_clblast_noavx2: ggml_v4_clblast_noavx2.o ggml-cpu_v4_clblast_noavx2.o ggml_v3_clblast_noavx2.o ggml_v2_clblast_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_clblast_noavx2.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_SIMPLER) $(OBJS) +koboldcpp_clblast_noavx2: ggml_v4_clblast_noavx2.o ggml-cpu_v4_clblast_noavx2.o ggml_v3_clblast_noavx2.o ggml_v2_clblast_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_clblast_noavx2.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o tts_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_SIMPLER) $(OBJS) $(CLBLAST_BUILD) else koboldcpp_clblast_noavx2: @@ -717,7 +723,7 @@ koboldcpp_clblast_noavx2: endif ifdef CUBLAS_BUILD -koboldcpp_cublas: ggml_v4_cublas.o ggml-cpu.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o llavaclip_cublas.o llava.o ggml-backend_cublas.o ggml-backend-reg_cublas.o $(CUBLAS_OBJS) $(OBJS_FULL) $(OBJS) +koboldcpp_cublas: ggml_v4_cublas.o ggml-cpu.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o tts_default.o llavaclip_cublas.o llava.o ggml-backend_cublas.o ggml-backend-reg_cublas.o $(CUBLAS_OBJS) $(OBJS_FULL) $(OBJS) $(CUBLAS_BUILD) else koboldcpp_cublas: @@ -725,7 +731,7 @@ koboldcpp_cublas: endif ifdef HIPBLAS_BUILD -koboldcpp_hipblas: ggml_v4_cublas.o ggml-cpu.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o llavaclip_cublas.o llava.o ggml-backend_cublas.o ggml-backend-reg_cublas.o $(HIP_OBJS) $(OBJS_FULL) $(OBJS) +koboldcpp_hipblas: ggml_v4_cublas.o ggml-cpu.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o tts_default.o llavaclip_cublas.o llava.o ggml-backend_cublas.o ggml-backend-reg_cublas.o $(HIP_OBJS) $(OBJS_FULL) $(OBJS) $(HIPBLAS_BUILD) else koboldcpp_hipblas: @@ -733,10 +739,10 @@ koboldcpp_hipblas: endif ifdef VULKAN_BUILD -koboldcpp_vulkan: ggml_v4_vulkan.o ggml-cpu.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter_vulkan.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o $(OBJS_FULL) $(OBJS) +koboldcpp_vulkan: ggml_v4_vulkan.o ggml-cpu.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter_vulkan.o 
ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o tts_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o $(OBJS_FULL) $(OBJS) $(VULKAN_BUILD) ifdef NOAVX2_BUILD -koboldcpp_vulkan_noavx2: ggml_v4_vulkan_noavx2.o ggml-cpu_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o $(OBJS_SIMPLE) $(OBJS) +koboldcpp_vulkan_noavx2: ggml_v4_vulkan_noavx2.o ggml-cpu_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o tts_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o $(OBJS_SIMPLE) $(OBJS) $(VULKAN_BUILD) else koboldcpp_vulkan_noavx2: diff --git a/expose.cpp b/expose.cpp index 369b71866..b386e3c2c 100644 --- a/expose.cpp +++ b/expose.cpp @@ -238,6 +238,15 @@ extern "C" return whispertype_generate(inputs); } + bool tts_load_model(const tts_load_model_inputs inputs) + { + return ttstype_load_model(inputs); + } + tts_generation_outputs tts_generate(const tts_generation_inputs inputs) + { + return ttstype_generate(inputs); + } + const char * new_token(int idx) { if (generated_tokens.size() <= idx || idx < 0) return nullptr; diff --git a/expose.h b/expose.h index a96bdda3b..12ad17a99 100644 --- a/expose.h +++ b/expose.h @@ -139,6 +139,7 @@ struct last_logprobs_outputs { int count = 0; logprob_item * logprob_items = nullptr; }; + struct sd_load_model_inputs { const char * model_filename = nullptr; @@ -178,6 +179,7 @@ struct sd_generation_outputs int status = -1; const char * data = ""; }; + struct whisper_load_model_inputs { const char * model_filename = nullptr; @@ -201,6 +203,30 @@ struct whisper_generation_outputs const char * text = ""; }; +struct tts_load_model_inputs +{ + const char * ttc_model_filename = nullptr; + const char * cts_model_filename = nullptr; + const char * executable_path = nullptr; + const int clblast_info = 0; + const int cublas_info = 0; + const char * vulkan_info = nullptr; + const int gpulayers = 0; + const int debugmode = 0; +}; +struct tts_generation_inputs +{ + const char * prompt = nullptr; + const int speaker_seed = 0; + const int audio_seed = 0; + const bool quiet = false; +}; +struct tts_generation_outputs +{ + int status = -1; + const char * data = ""; +}; + extern std::string executable_path; extern std::string lora_filename; extern std::string lora_base; diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index b5dc11974..b0310cd46 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -383,7 +383,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par } if (ok && gr.read(n_kv_32)) { - n_kv_32 = n_kv_32; + n_kv = n_kv_32; } else { ok = false; } diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 901200ea1..d5bae07be 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -21,12 +21,13 @@ #include #include +#include "utils.h" + //for easier compilation //concat source files into one file for compilation purposes #include "llama_v2.cpp" #include "llama_v3.cpp" #include "src/llama.cpp" -#include "utils.cpp" #include "gptj_v1.cpp" #include "gptj_v2.cpp" #include "gptj_v3.cpp" @@ -535,99 +536,6 @@ const char * kcpp_print_system_info(void) { return s.c_str(); } -struct kcpp_embd_batch { //duplcated from llava_embd_batch - std::vector pos; - std::vector n_seq_id; - std::vector 
seq_id_0; - std::vector seq_ids; - std::vector logits; - llama_batch batch; - kcpp_embd_batch(float * embd, int32_t n_tokens, int32_t npast, bool use_mrope) { - int32_t seq_id = 0; - pos.resize(n_tokens * (use_mrope?4:1)); - std::fill(pos.begin(), pos.end(), 0); - n_seq_id.resize(n_tokens); - seq_ids.resize(n_tokens + 1); - logits.resize(n_tokens); - seq_id_0.resize(1); - seq_id_0[0] = seq_id; - seq_ids [n_tokens] = nullptr; - batch = { - /*n_tokens =*/ n_tokens, - /*tokens =*/ nullptr, - /*embd =*/ embd, - /*pos =*/ pos.data(), - /*n_seq_id =*/ n_seq_id.data(), - /*seq_id =*/ seq_ids.data(), - /*logits =*/ logits.data(), - }; - - if(!use_mrope) - { - for (int i = 0; i < n_tokens; i++) { - batch.pos [i] = npast + i; - batch.n_seq_id[i] = 1; - batch.seq_id [i] = seq_id_0.data(); - batch.logits [i] = false; - } - } - else - { - for (int i = 0; i < n_tokens; i++) { - batch.n_seq_id[i] = 1; - batch.seq_id [i] = seq_id_0.data(); - batch.logits [i] = false; - } - for (int j = 0; j < batch.n_tokens * 3; j++) { - batch.pos[j] = npast + (j % batch.n_tokens); - } - } - } - kcpp_embd_batch(std::vector & tokens, int32_t npast, bool use_mrope, bool return_all_logits) { - int32_t seq_id = 0; - int32_t n_tokens = tokens.size(); - pos.resize(n_tokens * (use_mrope?4:1)); - std::fill(pos.begin(), pos.end(), 0); - n_seq_id.resize(n_tokens); - seq_ids.resize(n_tokens + 1); - logits.resize(n_tokens); - seq_id_0.resize(1); - seq_id_0[0] = seq_id; - seq_ids[n_tokens] = nullptr; - batch = { - /*n_tokens =*/ n_tokens, - /*tokens =*/ tokens.data(), - /*embd =*/ nullptr, - /*pos =*/ pos.data(), - /*n_seq_id =*/ n_seq_id.data(), - /*seq_id =*/ seq_ids.data(), - /*logits =*/ logits.data(), - }; - - if(!use_mrope) - { - for (int i = 0; i < n_tokens; i++) { - batch.pos [i] = npast + i; - batch.n_seq_id[i] = 1; - batch.seq_id [i] = seq_id_0.data(); - batch.logits [i] = (return_all_logits?true:false); - } - } - else - { - for (int i = 0; i < n_tokens; i++) { - batch.n_seq_id[i] = 1; - batch.seq_id [i] = seq_id_0.data(); - batch.logits [i] = (return_all_logits?true:false); - } - for (int j = 0; j < batch.n_tokens * 3; j++) { - batch.pos[j] = npast + (j % batch.n_tokens); - } - } - batch.logits[n_tokens - 1] = true; - } -}; - //loads a model for speculative decoding. 
static void speculative_decoding_setup(std::string spec_model_filename, const llama_model_params & base_model_params, const llama_context_params & base_ctx_params, int base_n_vocab, const float * draft_gpusplit, int draft_gpulayers) { @@ -664,7 +572,7 @@ static void speculative_decoding_setup(std::string spec_model_filename, const ll draft_ctx_params.type_k = base_ctx_params.type_k; draft_ctx_params.type_v = base_ctx_params.type_v; - llama_model * draftmodel = llama_load_model_from_file(spec_model_filename.c_str(), draft_model_params); + llama_model * draftmodel = llama_model_load_from_file(spec_model_filename.c_str(), draft_model_params); draft_ctx = llama_new_context_with_model(draftmodel, draft_ctx_params); if(draft_ctx == NULL) { @@ -2252,7 +2160,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in kvos.push_back(kvo); model_params.kv_overrides = kvos.data(); } - llama_model * llamamodel = llama_load_model_from_file(kcpp_data->model_filename.c_str(), model_params); + llama_model * llamamodel = llama_model_load_from_file(kcpp_data->model_filename.c_str(), model_params); if(overwriteRope) { diff --git a/include/llama.h b/include/llama.h index 0295a51fb..e8dd55f66 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1,3 +1,5 @@ +#pragma once + #ifndef LLAMA_H #define LLAMA_H diff --git a/koboldcpp.py b/koboldcpp.py index 3279fde60..6f878191d 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -53,6 +53,7 @@ fullsdmodelpath = "" #if empty, it's not initialized mmprojpath = "" #if empty, it's not initialized password = "" #if empty, no auth key required fullwhispermodelpath = "" #if empty, it's not initialized +ttsmodelpath = "" #if empty, not initialized maxctx = 4096 maxhordectx = 4096 maxhordelen = 400 @@ -281,6 +282,26 @@ class whisper_generation_outputs(ctypes.Structure): _fields_ = [("status", ctypes.c_int), ("data", ctypes.c_char_p)] +class tts_load_model_inputs(ctypes.Structure): + _fields_ = [("ttc_model_filename", ctypes.c_char_p), + ("cts_model_filename", ctypes.c_char_p), + ("executable_path", ctypes.c_char_p), + ("clblast_info", ctypes.c_int), + ("cublas_info", ctypes.c_int), + ("vulkan_info", ctypes.c_char_p), + ("gpulayers", ctypes.c_int), + ("debugmode", ctypes.c_int)] + +class tts_generation_inputs(ctypes.Structure): + _fields_ = [("prompt", ctypes.c_char_p), + ("speaker_seed", ctypes.c_int), + ("audio_seed", ctypes.c_int), + ("quiet", ctypes.c_bool)] + +class tts_generation_outputs(ctypes.Structure): + _fields_ = [("status", ctypes.c_int), + ("data", ctypes.c_char_p)] + def getdirpath(): return os.path.dirname(os.path.realpath(__file__)) def getabspath(): @@ -440,6 +461,10 @@ def init_library(): handle.whisper_load_model.restype = ctypes.c_bool handle.whisper_generate.argtypes = [whisper_generation_inputs] handle.whisper_generate.restype = whisper_generation_outputs + handle.tts_load_model.argtypes = [tts_load_model_inputs] + handle.tts_load_model.restype = ctypes.c_bool + handle.tts_generate.argtypes = [tts_generation_inputs] + handle.tts_generate.restype = tts_generation_outputs handle.last_logprobs.restype = last_logprobs_outputs handle.detokenize.argtypes = [token_count_outputs] handle.detokenize.restype = ctypes.c_char_p @@ -577,9 +602,13 @@ def utfprint(str, importance = 2): #0 = only debugmode, 1 = except quiet, 2 = al maxlen = 32000 if args.debugmode >= 1: maxlen = 64000 - strlength = len(str) - if strlength > maxlen: #limit max output len - str = str[:maxlen] + f"... 
(+{strlength-maxlen} chars)" + try: + strlength = len(str) + if strlength > maxlen: #limit max output len + str = str[:maxlen] + f"... (+{strlength-maxlen} chars)" + except Exception: + pass + try: print(str) except UnicodeEncodeError: @@ -647,13 +676,14 @@ def read_gguf_metadata(file_path): except Exception: return None -def extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,draftmodelpath): +def extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,draftmodelpath,ttsmodelpath): global modelfile_extracted_meta modelfile_extracted_meta = None sdfsize = 0 whisperfsize = 0 mmprojsize = 0 draftmodelsize = 0 + ttsmodelsize = 0 if sdfilepath and os.path.exists(sdfilepath): sdfsize = os.path.getsize(sdfilepath) if whisperfilepath and os.path.exists(whisperfilepath): @@ -662,12 +692,14 @@ def extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath, mmprojsize = os.path.getsize(mmprojfilepath) if draftmodelpath and os.path.exists(draftmodelpath): draftmodelsize = os.path.getsize(draftmodelpath) + if ttsmodelpath and os.path.exists(ttsmodelpath): + ttsmodelsize = os.path.getsize(ttsmodelpath) if filepath and os.path.exists(filepath): try: fsize = os.path.getsize(filepath) if fsize>10000000: #dont bother with models < 10mb as they are probably bad ggufmeta = read_gguf_metadata(filepath) - modelfile_extracted_meta = [ggufmeta,fsize,sdfsize,whisperfsize,mmprojsize,draftmodelsize] #extract done. note that meta may be null + modelfile_extracted_meta = [ggufmeta,fsize,sdfsize,whisperfsize,mmprojsize,draftmodelsize,ttsmodelsize] #extract done. note that meta may be null except Exception: modelfile_extracted_meta = None @@ -699,6 +731,8 @@ def autoset_gpu_layers(ctxsize,sdquanted,bbs): #shitty algo to determine how man mem -= 350*1024*1024 if modelfile_extracted_meta[5] > 1024*1024*10: #draft model tax mem -= (modelfile_extracted_meta[5] * 1.5) + if modelfile_extracted_meta[6] > 1024*1024*10: #tts model tax + mem -= max(600*1024*1024, modelfile_extracted_meta[6] * 3) mem = 0 if mem < 0 else mem csmul = 1.0 @@ -730,6 +764,8 @@ def fetch_gpu_properties(testCL,testCU,testVK): FetchedCUdevices = [] FetchedCUdeviceMem = [] FetchedCUfreeMem = [] + faileddetectvram = False + AMDgpu = None try: # Get NVIDIA GPU names output = subprocess.run(['nvidia-smi','--query-gpu=name,memory.total,memory.free','--format=csv,noheader'], capture_output=True, text=True, check=True, encoding='utf-8').stdout @@ -737,6 +773,10 @@ def fetch_gpu_properties(testCL,testCU,testVK): FetchedCUdeviceMem = [line.split(",")[1].strip().split(" ")[0].strip() for line in output.splitlines()] FetchedCUfreeMem = [line.split(",")[2].strip().split(" ")[0].strip() for line in output.splitlines()] except Exception: + FetchedCUdevices = [] + FetchedCUdeviceMem = [] + FetchedCUfreeMem = [] + faileddetectvram = True pass if len(FetchedCUdevices)==0: try: # Get AMD ROCm GPU names @@ -756,18 +796,30 @@ def fetch_gpu_properties(testCL,testCU,testVK): if getamdvram: FetchedCUdeviceMem = [line.split(",")[1].strip() for line in getamdvram.splitlines()[1:] if line.strip()] except Exception: + FetchedCUdevices = [] + FetchedCUdeviceMem = [] + FetchedCUfreeMem = [] + faileddetectvram = True pass lowestcumem = 0 lowestfreecumem = 0 - for idx in range(0,4): - if(len(FetchedCUdevices)>idx): - CUDevicesNames[idx] = FetchedCUdevices[idx] - if len(FetchedCUdeviceMem)>idx: - dmem = int(FetchedCUdeviceMem[idx]) if AMDgpu else (int(FetchedCUdeviceMem[idx])*1024*1024) - lowestcumem = dmem if lowestcumem==0 
else (dmem if dmemidx: - dmem = (int(FetchedCUfreeMem[idx])*1024*1024) - lowestfreecumem = dmem if lowestfreecumem==0 else (dmem if dmemidx): + CUDevicesNames[idx] = FetchedCUdevices[idx] + if len(FetchedCUdeviceMem)>idx: + dmem = int(FetchedCUdeviceMem[idx]) if AMDgpu else (int(FetchedCUdeviceMem[idx])*1024*1024) + lowestcumem = dmem if lowestcumem==0 else (dmem if dmemidx: + dmem = (int(FetchedCUfreeMem[idx])*1024*1024) + lowestfreecumem = dmem if lowestfreecumem==0 else (dmem if dmem=0 and rawcountdata.count<50000) else 0 @@ -1738,10 +1818,11 @@ def LaunchWebbrowser(target_url, failedmsg): try: import webbrowser as wb if wb.open(target_url, autoraise=True): - return + return raise RuntimeError("Cannot open default browser") - except Exception: + except Exception as e: try: + print(f"Browser failed to launch: {e}, attempting to use xdg-open...") import webbrowser as wb if wb.get('xdg-open').open(target_url, autoraise=True): return @@ -2102,7 +2183,7 @@ Enter Prompt:
def do_GET(self): global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui - global has_multiplayer, multiplayer_turn_major, multiplayer_turn_minor, multiplayer_story_data_compressed, multiplayer_dataformat, multiplayer_lastactive, maxctx, maxhordelen, friendlymodelname, lastgeneratedcomfyimg, KcppVersion, totalgens, preloaded_story, exitcounter, currentusergenkey, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath + global has_multiplayer, multiplayer_turn_major, multiplayer_turn_minor, multiplayer_story_data_compressed, multiplayer_dataformat, multiplayer_lastactive, maxctx, maxhordelen, friendlymodelname, lastgeneratedcomfyimg, KcppVersion, totalgens, preloaded_story, exitcounter, currentusergenkey, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath self.path = self.path.rstrip('/') response_body = None content_type = 'application/json' @@ -2160,7 +2241,8 @@ Enter Prompt:
has_password = (password!="") has_whisper = (fullwhispermodelpath!="") has_search = True if args.websearch else False - response_body = (json.dumps({"result":"KoboldCpp","version":KcppVersion, "protected":has_password ,"txt2img":has_txt2img,"vision":has_vision,"transcribe":has_whisper,"multiplayer":has_multiplayer,"websearch":has_search}).encode()) + has_tts = (ttsmodelpath!="") + response_body = (json.dumps({"result":"KoboldCpp","version":KcppVersion, "protected":has_password ,"txt2img":has_txt2img,"vision":has_vision,"transcribe":has_whisper,"multiplayer":has_multiplayer,"websearch":has_search,"tts":has_tts}).encode()) elif self.path.endswith(('/api/extra/perf')): global last_req_time, start_time @@ -2521,7 +2603,7 @@ Enter Prompt:
reqblocking = False muint = int(args.multiuser) - if muint<=0 and ((args.whispermodel and args.whispermodel!="") or (args.sdmodel and args.sdmodel!="")): + if muint<=0 and ((args.whispermodel and args.whispermodel!="") or (args.sdmodel and args.sdmodel!="") or (args.ttsmodel and args.ttsmodel!="")): muint = 2 # this prevents errors when using voice/img together with text multiuserlimit = ((muint-1) if muint > 1 else 6) #backwards compatibility for up to 7 concurrent requests, use default limit of 7 if multiuser set to 1 @@ -2546,6 +2628,7 @@ Enter Prompt:
is_imggen = False is_comfyui_imggen = False is_transcribe = False + is_tts = False if self.path.endswith('/request'): api_format = 1 @@ -2588,11 +2671,14 @@ Enter Prompt:
if self.path.endswith('/api/extra/transcribe') or self.path.endswith('/v1/audio/transcriptions'): is_transcribe = True - if is_imggen or is_transcribe or api_format > 0: + if self.path.endswith('/api/extra/tts') or self.path.endswith('/v1/audio/speech'): + is_tts = True + + if is_imggen or is_transcribe or is_tts or api_format > 0: global last_req_time last_req_time = time.time() - if not is_imggen and not is_transcribe and api_format!=5: + if not is_imggen and not is_transcribe and not is_tts and api_format!=5: if not self.secure_endpoint(): return @@ -2680,6 +2766,21 @@ Enter Prompt:
print("Transcribe: The response could not be sent, maybe connection was terminated?") time.sleep(0.2) #short delay return + elif is_tts: + try: + gen = tts_generate(genparams) + wav_data = b'' + if gen: + wav_data = base64.b64decode(gen) # Decode the Base64 string into binary data + self.send_response(200) + self.send_header('content-length', str(len(wav_data))) # Set content length + self.end_headers(content_type='audio/wav') + self.wfile.write(wav_data) # Write the binary WAV data to the response + except Exception as ex: + utfprint(ex,0) + print("TTS: The response could not be sent, maybe connection was terminated?") + time.sleep(0.2) #short delay + return finally: time.sleep(0.05) @@ -2806,7 +2907,7 @@ def show_gui(): if dlfile: args.model_param = dlfile load_config_cli(args.model_param) - if not args.model_param and not args.sdmodel and not args.whispermodel and not args.nomodel: + if not args.model_param and not args.sdmodel and not args.whispermodel and not args.ttsmodel and not args.nomodel: global exitcounter exitcounter = 999 exit_with_error(2,"No ggml model or kcpps file was selected. Exiting.") @@ -3008,6 +3109,9 @@ def show_gui(): sd_quant_var = ctk.IntVar(value=0) whisper_model_var = ctk.StringVar() + tts_model_var = ctk.StringVar() + wavtokenizer_var = ctk.StringVar() + ttsgpu_var = ctk.IntVar(value=0) def tabbuttonaction(name): for t in tabcontent: @@ -3158,7 +3262,8 @@ def show_gui(): whisperfilepath = whisper_model_var.get() mmprojfilepath = mmproj_var.get() draftmodelpath = draftmodel_var.get() - extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,draftmodelpath) + ttsmodelpath = tts_model_var.get() if ttsgpu_var.get()==1 else "" + extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,draftmodelpath,ttsmodelpath) changed_gpulayers_estimate() pass @@ -3575,8 +3680,14 @@ def show_gui(): # audio tab audio_tab = tabcontent["Audio"] - makefileentry(audio_tab, "Whisper Model (Speech-To-Text):", "Select Whisper .bin Model File", whisper_model_var, 1, width=280, filetypes=[("*.bin","*.bin")], tooltiptxt="Select a Whisper .bin model file on disk to be loaded.") + makefileentry(audio_tab, "Whisper Model (Speech-To-Text):", "Select Whisper .bin Model File", whisper_model_var, 1, width=280, filetypes=[("*.bin","*.bin")], tooltiptxt="Select a Whisper .bin model file on disk to be loaded for Voice Recognition.") whisper_model_var.trace("w", gui_changed_modelfile) + makefileentry(audio_tab, "OuteTTS Model (Text-To-Speech):", "Select OuteTTS GGUF Model File", tts_model_var, 3, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a OuteTTS GGUF model file on disk to be loaded for Narration.") + tts_model_var.trace("w", gui_changed_modelfile) + makefileentry(audio_tab, "WavTokenizer Model (Text-To-Speech):", "Select WavTokenizer GGUF Model File", wavtokenizer_var, 5, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a WavTokenizer GGUF model file on disk to be loaded for Narration.") + wavtokenizer_var.trace("w", gui_changed_modelfile) + makecheckbox(audio_tab, "TTS Use GPU", ttsgpu_var, 7, 0,tooltiptxt="Uses the GPU for TTS.") + ttsgpu_var.trace("w", gui_changed_modelfile) def kcpp_export_template(): nonlocal kcpp_exporting_template @@ -3625,7 +3736,7 @@ def show_gui(): # launch def guilaunch(): - if model_var.get() == "" and sd_model_var.get() == "" and whisper_model_var.get() == "" and nomodel.get()!=1: + if model_var.get() == "" and sd_model_var.get() == "" and whisper_model_var.get() == "" and tts_model_var.get() == 
"" and nomodel.get()!=1: tmp = askopenfilename(title="Select ggml model .bin or .gguf file") model_var.set(tmp) nonlocal nextstate @@ -3792,6 +3903,11 @@ def show_gui(): if whisper_model_var.get() != "": args.whispermodel = whisper_model_var.get() + if tts_model_var.get() != "" and wavtokenizer_var.get() != "": + args.ttsmodel = tts_model_var.get() + args.ttswavtokenizer = wavtokenizer_var.get() + args.ttsgpu = (ttsgpu_var.get()==1) + def import_vars(dict): global importvars_in_progress importvars_in_progress = True @@ -3952,6 +4068,10 @@ def show_gui(): whisper_model_var.set(dict["whispermodel"] if ("whispermodel" in dict and dict["whispermodel"]) else "") + tts_model_var.set(dict["ttsmodel"] if ("ttsmodel" in dict and dict["ttsmodel"]) else "") + wavtokenizer_var.set(dict["ttswavtokenizer"] if ("ttswavtokenizer" in dict and dict["ttswavtokenizer"]) else "") + ttsgpu_var.set(dict["ttsgpu"] if ("ttsgpu" in dict) else 0) + importvars_in_progress = False gui_changed_modelfile() if "istemplate" in dict and dict["istemplate"]: @@ -4022,7 +4142,7 @@ def show_gui(): kcpp_exporting_template = False export_vars() - if not args.model_param and not args.sdmodel and not args.whispermodel and not args.nomodel: + if not args.model_param and not args.sdmodel and not args.whispermodel and not args.ttsmodel and not args.nomodel: exitcounter = 999 print("") time.sleep(0.5) @@ -4566,7 +4686,7 @@ def analyze_gguf_model_wrapper(filename=""): def main(launch_args,start_server=True): global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui - global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath + global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath args = launch_args if (args.version) and len(sys.argv) <= 2: @@ -4629,7 +4749,7 @@ def main(launch_args,start_server=True): if not args.model_param: args.model_param = args.model - if args.showgui or (not args.model_param and not args.sdmodel and not args.whispermodel and not args.nomodel): + if args.showgui or (not args.model_param and not args.sdmodel and not args.whispermodel and not args.ttsmodel and not args.nomodel): #give them a chance to pick a file print("For command line arguments, please refer to --help") print("***") @@ -4753,6 +4873,14 @@ def main(launch_args,start_server=True): dlfile = download_model_from_url(args.draftmodel,[".gguf"]) if dlfile: args.draftmodel = dlfile + if args.ttsmodel and args.ttsmodel!="": + dlfile = download_model_from_url(args.ttsmodel,[".gguf"]) + if dlfile: + args.ttsmodel = dlfile + if args.ttswavtokenizer and args.ttswavtokenizer!="": + dlfile = download_model_from_url(args.ttswavtokenizer,[".gguf"]) + if dlfile: + args.ttswavtokenizer = dlfile # sanitize and replace the default vanity name. remember me.... 
if args.model_param and args.model_param!="": @@ -4830,7 +4958,7 @@ def main(launch_args,start_server=True): pass if args.gpulayers==-1: if MaxMemory[0] > 0 and (not args.usecpu) and ((args.usecublas is not None) or (args.usevulkan is not None) or (args.useclblast is not None) or sys.platform=="darwin"): - extract_modelfile_params(args.model_param,args.sdmodel,args.whispermodel,args.mmproj,args.draftmodel) + extract_modelfile_params(args.model_param,args.sdmodel,args.whispermodel,args.mmproj,args.draftmodel,args.ttsmodel if args.ttsgpu else "") layeramt = autoset_gpu_layers(args.contextsize,args.sdquant,args.blasbatchsize) print(f"Auto Recommended GPU Layers: {layeramt}") args.gpulayers = layeramt @@ -4999,6 +5127,27 @@ def main(launch_args,start_server=True): exitcounter = 999 exit_with_error(3,"Could not load whisper model: " + whispermodel) + #handle tts model + if args.ttsmodel and args.ttsmodel!="" and args.ttswavtokenizer and args.ttswavtokenizer!="": + if not os.path.exists(args.ttsmodel) or not os.path.exists(args.ttswavtokenizer): + if args.ignoremissing: + print("Ignoring missing TTS model files!") + args.ttsmodel = None + args.ttswavtokenizer = None + else: + exitcounter = 999 + exit_with_error(2,f"Cannot find tts model files: {args.ttsmodel} or {args.ttswavtokenizer}") + else: + ttsmodelpath = args.ttsmodel + ttsmodelpath = os.path.abspath(ttsmodelpath) + wavtokpath = args.ttswavtokenizer + wavtokpath = os.path.abspath(wavtokpath) + loadok = tts_load_model(ttsmodelpath,wavtokpath) + print("Load TTS Model OK: " + str(loadok)) + if not loadok: + exitcounter = 999 + exit_with_error(3,"Could not load TTS model!") + #load embedded lite try: @@ -5296,7 +5445,12 @@ if __name__ == '__main__': sdparsergroup.add_argument("--sdnotile", help="Disables VAE tiling, may not work for large images.", action='store_true') whisperparsergroup = parser.add_argument_group('Whisper Transcription Commands') - whisperparsergroup.add_argument("--whispermodel", metavar=('[filename]'), help="Specify a Whisper bin model to enable Speech-To-Text transcription.", default="") + whisperparsergroup.add_argument("--whispermodel", metavar=('[filename]'), help="Specify a Whisper .bin model to enable Speech-To-Text transcription.", default="") + + ttsparsergroup = parser.add_argument_group('TTS Narration Commands') + ttsparsergroup.add_argument("--ttsmodel", metavar=('[filename]'), help="Specify the OuteTTS Text-To-Speech GGUF model.", default="") + ttsparsergroup.add_argument("--ttswavtokenizer", metavar=('[filename]'), help="Specify the WavTokenizer GGUF model.", default="") + ttsparsergroup.add_argument("--ttsgpu", help="Use the GPU for TTS.", action='store_true') deprecatedgroup = parser.add_argument_group('Deprecated Commands, DO NOT USE!') deprecatedgroup.add_argument("--hordeconfig", help=argparse.SUPPRESS, nargs='+') diff --git a/model_adapter.h b/model_adapter.h index a0e921cb3..2b7f566a7 100644 --- a/model_adapter.h +++ b/model_adapter.h @@ -105,6 +105,9 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs); bool whispertype_load_model(const whisper_load_model_inputs inputs); whisper_generation_outputs whispertype_generate(const whisper_generation_inputs inputs); +bool ttstype_load_model(const tts_load_model_inputs inputs); +tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs); + void timer_start(); double timer_check(); void print_tok_vec(std::vector &embd); diff --git a/otherarch/ggml_v3.h b/otherarch/ggml_v3.h index cd8ed48b1..6a354efac 100644 --- 
a/otherarch/ggml_v3.h +++ b/otherarch/ggml_v3.h @@ -188,13 +188,8 @@ #endif // TODO: support for clang -#ifdef __GNUC__ -# define GGML_V3_DEPRECATED(func, hint) func __attribute__((deprecated(hint))) -#elif defined(_MSC_VER) -# define GGML_V3_DEPRECATED(func, hint) __declspec(deprecated(hint)) func -#else # define GGML_V3_DEPRECATED(func, hint) func -#endif + #ifndef __GNUC__ # define GGML_V3_ATTRIBUTE_FORMAT(...) diff --git a/otherarch/llama_v2.cpp b/otherarch/llama_v2.cpp index a1229fa09..ccd68288c 100644 --- a/otherarch/llama_v2.cpp +++ b/otherarch/llama_v2.cpp @@ -436,19 +436,23 @@ struct llama_v2_file_loader { uint32_t magic = file.read_u32(); uint32_t version = 0; - if (magic != 'ggml') { + uint32_t magic_ggjt = 0x67676a74u; // 'ggjt' + uint32_t magic_ggmf = 0x67676d66u; // 'ggmf' + uint32_t magic_ggml = 0x67676d6cu; // 'ggml' + + if (magic != magic_ggml) { version = file.read_u32(); } - if (magic == 'ggml' && version == 0) { + if (magic == magic_ggml && version == 0) { file_version = LLAMA_V2_FILE_VERSION_GGML; - } else if (magic == 'ggmf' && version == 1) { + } else if (magic == magic_ggmf && version == 1) { file_version = LLAMA_V2_FILE_VERSION_GGMF_V1; - } else if (magic == 'ggjt' && version == 1) { + } else if (magic == magic_ggjt && version == 1) { file_version = LLAMA_V2_FILE_VERSION_GGJT_V1; - } else if (magic == 'ggjt' && version == 2) { + } else if (magic == magic_ggjt && version == 2) { file_version = LLAMA_V2_FILE_VERSION_GGJT_V2; - } else if (magic == 'ggjt' && version == 3) { + } else if (magic == magic_ggjt && version == 3) { file_version = LLAMA_V2_FILE_VERSION_GGJT_V3; } else { throw format_old("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?", @@ -553,7 +557,8 @@ struct llama_v2_file_saver { write_vocab(); } void write_magic() { - file.write_u32(LLAMA_V2_FILE_MAGIC); // magic + uint32_t magic_ggjt = 0x67676a74u; // 'ggjt' + file.write_u32(magic_ggjt); // magic file.write_u32(LLAMA_V2_FILE_VERSION); // version } void write_hparams(enum llama_v2_ftype new_ftype) { @@ -2308,7 +2313,8 @@ int llama_v2_apply_lora_from_file_internal(struct llama_v2_context * ctx, const { uint32_t magic; fin.read((char *) &magic, sizeof(magic)); - if (magic != 'ggla') { + uint32_t magic_ggla = 0x67676c61u; // 'ggla' + if (magic != magic_ggla) { fprintf(stderr, "%s: bad file magic\n", __func__); return 1; } @@ -2800,85 +2806,6 @@ size_t llama_v2_set_state_data(struct llama_v2_context * ctx, const uint8_t * sr return nread; } -bool llama_v2_load_session_file(struct llama_v2_context * ctx, const char * path_session, llama_v2_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - llama_v2_file file(path_session, "rb"); - - // sanity checks - { - const uint32_t magic = file.read_u32(); - const uint32_t version = file.read_u32(); - - if (magic != LLAMA_V2_SESSION_MAGIC || version != LLAMA_V2_SESSION_VERSION) { - fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version); - return false; - } - - llama_v2_hparams session_hparams; - file.read_raw(&session_hparams, sizeof(llama_v2_hparams)); - - if (session_hparams != ctx->model.hparams) { - fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__); - return false; - } - } - - // load the prompt - { - const uint32_t n_token_count = file.read_u32(); - - if (n_token_count > n_token_capacity) { - fprintf(stderr, "%s : token count in session file exceeded capacity! 
%u > %zu\n", __func__, n_token_count, n_token_capacity); - return false; - } - - file.read_raw(tokens_out, sizeof(llama_v2_token) * n_token_count); - *n_token_count_out = n_token_count; - } - - // restore the context state - { - const size_t n_state_size_cur = file.size - file.tell(); - const size_t n_state_size_max = llama_v2_get_state_size(ctx); - - if (n_state_size_cur > n_state_size_max) { - fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur); - return false; - } - - std::vector state_data(n_state_size_max); - file.read_raw(state_data.data(), n_state_size_cur); - - llama_v2_set_state_data(ctx, state_data.data()); - } - - return true; -} - -bool llama_v2_save_session_file(struct llama_v2_context * ctx, const char * path_session, const llama_v2_token * tokens, size_t n_token_count) { - llama_v2_file file(path_session, "wb"); - - file.write_u32(LLAMA_V2_SESSION_MAGIC); - file.write_u32(LLAMA_V2_SESSION_VERSION); - - file.write_raw(&ctx->model.hparams, sizeof(llama_v2_hparams)); - - // save the prompt - file.write_u32((uint32_t) n_token_count); - file.write_raw(tokens, sizeof(llama_v2_token) * n_token_count); - - // save the context state - { - const size_t n_state_size_max = llama_v2_get_state_size(ctx); - - std::vector state_data(n_state_size_max); - const size_t n_state_size_cur = llama_v2_copy_state_data(ctx, state_data.data()); - - file.write_raw(state_data.data(), n_state_size_cur); - } - - return true; -} - int llama_v2_eval( struct llama_v2_context * ctx, const llama_v2_token * tokens, diff --git a/otherarch/llama_v2.h b/otherarch/llama_v2.h index 2b1cfc725..cc18ed88e 100644 --- a/otherarch/llama_v2.h +++ b/otherarch/llama_v2.h @@ -140,10 +140,6 @@ extern "C" { // Returns the number of bytes read LLAMA_V2_API size_t llama_v2_set_state_data(struct llama_v2_context * ctx, const uint8_t * src); - // Save/load session file - LLAMA_V2_API bool llama_v2_load_session_file(struct llama_v2_context * ctx, const char * path_session, llama_v2_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); - LLAMA_V2_API bool llama_v2_save_session_file(struct llama_v2_context * ctx, const char * path_session, const llama_v2_token * tokens, size_t n_token_count); - // Run the llama inference to obtain the logits and probabilities for the next token. // tokens + n_tokens is the provided batch of new tokens to process // n_past is the number of tokens to use from previous eval calls @@ -167,7 +163,7 @@ extern "C" { int n_max_tokens, bool add_bos); - + std::vector legacy_llama_v2_tokenize(struct llama_v2_context * ctx, const std::string & text, bool add_bos); LLAMA_V2_API int llama_v2_n_vocab(const struct llama_v2_context * ctx); diff --git a/otherarch/rwkv_v2.cpp b/otherarch/rwkv_v2.cpp index 7d2065eaa..ffc159d62 100644 --- a/otherarch/rwkv_v2.cpp +++ b/otherarch/rwkv_v2.cpp @@ -126,7 +126,7 @@ struct rwkv_v2_model { // Finds model parameter by key and sets it into dest. // If the parameter was not found, returns false. 
-bool rwkv_v2_set_parameter(std::unordered_map * parameters, char * key, struct ggml_v2_tensor ** dest) { +bool rwkv_v2_set_parameter(std::unordered_map * parameters, const char * key, struct ggml_v2_tensor ** dest) { struct ggml_v2_tensor * parameter = (*parameters)[key]; RWKV_V2_ASSERT_FALSE(parameter != NULL, "Parameter %s not found in model file", key); *dest = parameter; @@ -135,7 +135,7 @@ bool rwkv_v2_set_parameter(std::unordered_map * parameters, int32_t block_index, char * key, struct ggml_v2_tensor ** dest) { +bool rwkv_v2_set_block_parameter(std::unordered_map * parameters, int32_t block_index, const char * key, struct ggml_v2_tensor ** dest) { char full_key[128]; sprintf(full_key, "blocks.%d.%s", block_index, key); return rwkv_v2_set_parameter(parameters, full_key, dest); diff --git a/otherarch/sdcpp/sdtype_adapter.cpp b/otherarch/sdcpp/sdtype_adapter.cpp index 5b7ab7605..687f1589d 100644 --- a/otherarch/sdcpp/sdtype_adapter.cpp +++ b/otherarch/sdcpp/sdtype_adapter.cpp @@ -112,28 +112,6 @@ static sd_ctx_t * sd_ctx = nullptr; static int sddebugmode = 0; static std::string recent_data = ""; -std::string base64_encode(const unsigned char* data, unsigned int data_length) { - const std::string base64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; - std::string encoded; - encoded.reserve(((data_length + 2) / 3) * 4); - for (unsigned int i = 0; i < data_length; i += 3) { - unsigned int triple = (data[i] << 16) + (i + 1 < data_length ? data[i + 1] << 8 : 0) + (i + 2 < data_length ? data[i + 2] : 0); - encoded.push_back(base64_chars[(triple >> 18) & 0x3F]); - encoded.push_back(base64_chars[(triple >> 12) & 0x3F]); - if (i + 1 < data_length) { - encoded.push_back(base64_chars[(triple >> 6) & 0x3F]); - } else { - encoded.push_back('='); - } - if (i + 2 < data_length) { - encoded.push_back(base64_chars[triple & 0x3F]); - } else { - encoded.push_back('='); - } - } - return encoded; -} - static std::string sdplatformenv, sddeviceenv, sdvulkandeviceenv; static bool notiling = false; bool sdtype_load_model(const sd_load_model_inputs inputs) { @@ -553,7 +531,7 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs) unsigned char * png = stbi_write_png_to_mem(results[i].data, 0, results[i].width, results[i].height, results[i].channel, &out_data_len, ""); if (png != NULL) { - recent_data = base64_encode(png,out_data_len); + recent_data = kcpp_base64_encode(png,out_data_len); free(png); } diff --git a/otherarch/tts_adapter.cpp b/otherarch/tts_adapter.cpp new file mode 100644 index 000000000..faef59d9c --- /dev/null +++ b/otherarch/tts_adapter.cpp @@ -0,0 +1,672 @@ +#include "model_adapter.h" +#include "otherarch/utils.h" + +#include "common.h" +#include "sampling.h" +#include "llama.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "src/llama-context.h" + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +struct wav_header { + char riff[4] = {'R', 'I', 'F', 'F'}; + uint32_t chunk_size; + char wave[4] = {'W', 'A', 'V', 'E'}; + char fmt[4] = {'f', 'm', 't', ' '}; + uint32_t fmt_chunk_size = 16; + uint16_t audio_format = 1; // PCM + uint16_t num_channels = 1; // Mono + uint32_t sample_rate; + uint32_t byte_rate; + uint16_t block_align; + uint16_t bits_per_sample = 16; + char data[4] = {'d', 'a', 't', 'a'}; + uint32_t data_size; +}; + +static std::string save_wav16_base64(const std::vector &data, int sample_rate) { + std::ostringstream oss; + wav_header 
header; + + // Fill header fields + header.sample_rate = sample_rate; + header.byte_rate = header.sample_rate * header.num_channels * (header.bits_per_sample / 8); + header.block_align = header.num_channels * (header.bits_per_sample / 8); + header.data_size = data.size() * (header.bits_per_sample / 8); + header.chunk_size = 36 + header.data_size; + + // Write header + oss.write(reinterpret_cast(&header), sizeof(header)); + + // Write samples + for (const auto &sample : data) { + int16_t pcm_sample = static_cast(std::clamp(sample * 32767.0, -32768.0, 32767.0)); + oss.write(reinterpret_cast(&pcm_sample), sizeof(pcm_sample)); + } + + // Get binary WAV data + std::string wav_data = oss.str(); + return kcpp_base64_encode(wav_data); //return as base64 string +} + +static void fill_hann_window(int length, bool periodic, float * output) { + int offset = -1; + if (periodic) { + offset = 0; + } + for (int i = 0; i < length; i++) { + output[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset))); + } +} + +// very poor-man fft +static void twiddle(float * real, float * imag, int k, int N) { + float angle = 2 * M_PI * k / N; + *real = cos(angle); + *imag = sin(angle); +} + +static void irfft(int n, const float * inp_cplx, float * out_real) { + int N = n / 2 + 1; + + std::vector real_input(N); + std::vector imag_input(N); + for (int i = 0; i < N; ++i) { + real_input[i] = inp_cplx[2 * i]; + imag_input[i] = inp_cplx[2 * i + 1]; + } + + std::vector real_output(n); + std::vector imag_output(n); + + for (int k = 0; k < n; ++k) { + real_output[k] = 0.0f; + imag_output[k] = 0.0f; + for (int m = 0; m < N; ++m) { + float twiddle_real; + float twiddle_imag; + + twiddle(&twiddle_real, &twiddle_imag, k * m, n); + + real_output[k] += real_input[m] * twiddle_real - imag_input[m] * twiddle_imag; + imag_output[k] += real_input[m] * twiddle_imag + imag_input[m] * twiddle_real; + } + } + + for (int i = 0; i < n; ++i) { + out_real[i] = real_output[i] / N; + } +} + + +static void fold(const std::vector & data, int64_t n_out, int64_t n_win, int64_t n_hop, int64_t n_pad, std::vector & output) { + int64_t output_height = n_out; + int64_t kernel_w = n_win; + int64_t stride_w = n_hop; + int64_t width = n_out; + + output.resize(width, 0.0f); + + int64_t col_idx = 0; + for (int64_t w_col = 0; w_col < width; ++w_col) { + int64_t start = w_col * stride_w - n_pad; + int64_t end = start + kernel_w; + + for (int64_t w_im = start; w_im < end; ++w_im) { + if (w_im >= 0 && w_im < output_height && col_idx < (int64_t) data.size()) { + output[w_im] += data[col_idx]; + } + col_idx++; + } + } + + output.resize(n_out - 2 * n_pad); +} + +// TODO: not optimized at all +static std::vector embd_to_audio( + const float * embd, + const int n_codes, + const int n_embd, + const int n_thread) { + const int n_fft = 1280; + const int n_hop = 320; + const int n_win = 1280; + const int n_pad = (n_win - n_hop)/2; + const int n_out = (n_codes - 1)*n_hop + n_win; + + std::vector hann(n_fft); + + fill_hann_window(hann.size(), true, hann.data()); + + int n_spec = n_embd*n_codes; + + std::vector E (n_spec); + std::vector S (n_spec); + std::vector ST(n_spec); + + for (int l = 0; l < n_codes; ++l) { + for (int k = 0; k < n_embd; ++k) { + E[k*n_codes + l] = embd[l*n_embd + k]; + } + } + + for (int k = 0; k < n_embd/2; ++k) { + for (int l = 0; l < n_codes; ++l) { + float mag = E[(k )*n_codes + l]; + float phi = E[(k + n_embd/2)*n_codes + l]; + + mag = exp(mag); + + if (mag > 1e2) { + mag = 1e2; + } + S[2*(k*n_codes + l) + 0] = mag*cosf(phi); + S[2*(k*n_codes 
+ l) + 1] = mag*sinf(phi); + } + } + + for (int l = 0; l < n_codes; ++l) { + for (int k = 0; k < n_embd/2; ++k) { + ST[l*n_embd + 2*k + 0] = S[2*(k*n_codes + l) + 0]; + ST[l*n_embd + 2*k + 1] = S[2*(k*n_codes + l) + 1]; + } + } + + std::vector res (n_codes*n_fft); + std::vector hann2(n_codes*n_fft); + + std::vector workers(n_thread); + for (int i = 0; i < n_thread; ++i) { + workers[i] = std::thread([&, i]() { + for (int l = i; l < n_codes; l += n_thread) { + irfft(n_fft, ST.data() + l*n_embd, res.data() + l*n_fft); + for (int j = 0; j < n_fft; ++j) { + res [l*n_fft + j] *= hann[j]; + hann2[l*n_fft + j] = hann[j] * hann[j]; + } + } + }); + } + for (int i = 0; i < n_thread; ++i) { + workers[i].join(); + } + + std::vector audio; + std::vector env; + + fold(res, n_out, n_win, n_hop, n_pad, audio); + fold(hann2, n_out, n_win, n_hop, n_pad, env); // TODO: can be done once + + for (size_t i = 0; i < audio.size(); ++i) { + audio[i] /= env[i]; + } + + return audio; +} + +static const std::map ones = { + {0, "zero"}, {1, "one"}, {2, "two"}, {3, "three"}, {4, "four"}, + {5, "five"}, {6, "six"}, {7, "seven"}, {8, "eight"}, {9, "nine"}, + {10, "ten"}, {11, "eleven"}, {12, "twelve"}, {13, "thirteen"}, {14, "fourteen"}, + {15, "fifteen"}, {16, "sixteen"}, {17, "seventeen"}, {18, "eighteen"}, {19, "nineteen"} +}; + +static const std::map tens = { + {2, "twenty"}, {3, "thirty"}, {4, "forty"}, {5, "fifty"}, + {6, "sixty"}, {7, "seventy"}, {8, "eighty"}, {9, "ninety"} +}; + +// Convert a number less than 1000 to words +static std::string convert_less_than_thousand(int num) { + std::string result; + + if (num >= 100) { + result += ones.at(num / 100) + " hundred "; + num %= 100; + } + + if (num >= 20) { + result += tens.at(num / 10); + if (num % 10 > 0) { + result += "-" + ones.at(num % 10); + } + } else if (num > 0) { + result += ones.at(num); + } + + return result; +} + +static std::string number_to_words(const std::string & number_str) { + try { + size_t decimal_pos = number_str.find('.'); + std::string integer_part = number_str.substr(0, decimal_pos); + + int int_number = std::stoi(integer_part); + std::string result; + + if (int_number == 0) { + result = "zero"; + } else { + if (int_number >= 1000000000) { + int billions = int_number / 1000000000; + result += convert_less_than_thousand(billions) + " billion "; + int_number %= 1000000000; + } + + if (int_number >= 1000000) { + int millions = int_number / 1000000; + result += convert_less_than_thousand(millions) + " million "; + int_number %= 1000000; + } + + if (int_number >= 1000) { + int thousands = int_number / 1000; + result += convert_less_than_thousand(thousands) + " thousand "; + int_number %= 1000; + } + + if (int_number > 0) { + result += convert_less_than_thousand(int_number); + } + } + + // Handle decimal part + if (decimal_pos != std::string::npos) { + result += " point"; + std::string decimal_part = number_str.substr(decimal_pos + 1); + for (char digit : decimal_part) { + result += " " + ones.at(digit - '0'); + } + } + + return result; + } catch (const std::exception& e) { + // Skip if fails + return " "; + } +} + +static std::string replace_numbers_with_words(const std::string & input_text) { + std::regex number_pattern(R"(\d+(\.\d+)?)"); + std::string result; + auto it = std::sregex_iterator(input_text.begin(), input_text.end(), number_pattern); + auto end = std::sregex_iterator(); + + size_t last_pos = 0; + for (std::sregex_iterator i = it; i != end; ++i) { + const std::smatch& match = *i; + result.append(input_text, last_pos, 
+static std::string replace_numbers_with_words(const std::string & input_text) {
+    std::regex number_pattern(R"(\d+(\.\d+)?)");
+    std::string result;
+    auto it = std::sregex_iterator(input_text.begin(), input_text.end(), number_pattern);
+    auto end = std::sregex_iterator();
+
+    size_t last_pos = 0;
+    for (std::sregex_iterator i = it; i != end; ++i) {
+        const std::smatch& match = *i;
+        result.append(input_text, last_pos, match.position() - last_pos);
+        result.append(number_to_words(match.str()));
+        last_pos = match.position() + match.length();
+    }
+    result.append(input_text, last_pos);
+
+    return result;
+}
+
+static std::string process_text(const std::string & text) {
+
+    std::string processed_text = replace_numbers_with_words(text);
+
+    std::transform(processed_text.begin(), processed_text.end(),
+                   processed_text.begin(), ::tolower);
+
+    std::regex special_chars(R"([-_/,\.\\])");
+    processed_text = std::regex_replace(processed_text, special_chars, " ");
+    std::regex non_alpha(R"([^a-z\s])");
+    processed_text = std::regex_replace(processed_text, non_alpha, "");
+    std::regex multiple_spaces(R"(\s+)");
+    processed_text = std::regex_replace(processed_text, multiple_spaces, " ");
+    processed_text = std::regex_replace(processed_text, std::regex(R"(^\s+|\s+$)"), "");
+    processed_text = std::regex_replace(processed_text, std::regex(R"(\s)"), "<|text_sep|>");
+
+    return processed_text;
+}
+
+
+static void prompt_add(llama_tokens & prompt, const llama_tokens & tokens) {
+    prompt.insert(prompt.end(), tokens.begin(), tokens.end());
+}
+static void prompt_add(llama_tokens & prompt, const llama_model * model, const std::string & txt, bool add_special, bool parse_special) {
+    auto tmp = common_tokenize(model, txt, add_special, parse_special);
+    prompt_add(prompt, tmp);
+}
+static void prompt_init(llama_tokens & prompt, const llama_model * model) {
+    prompt.clear();
+    prompt_add(prompt, model, "<|im_start|>\n", true, true);
+}
+
+static std::vector<llama_token> prepare_guide_tokens(const llama_model * model, const std::string& str)
+{
+    const std::string& delimiter = "<|text_sep|>";
+
+    std::vector<llama_token> result;
+    size_t start = 0;
+    size_t end = str.find(delimiter);
+
+    while (end != std::string::npos) {
+        std::string current_word = str.substr(start, end - start);
+        auto tmp = common_tokenize(model, current_word, false, true);
+        result.push_back(tmp[0]);
+        start = end + delimiter.length();
+        end = str.find(delimiter, start);
+    }
+
+    // Add the last part
+    std::string current_word = str.substr(start);
+    auto tmp = common_tokenize(model, current_word, false, true);
+    result.push_back(tmp[0]);
+    return result;
+}
+
+static llama_context * ttc_ctx = nullptr; //text to codes ctx
+static llama_context * cts_ctx = nullptr; //codes to speech
+
+static int ttsdebugmode = 0;
+static std::string ttsplatformenv, ttsdeviceenv, ttsvulkandeviceenv;
+static std::string last_generated_audio = "";
+
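+// Load both TTS models: the OuteTTS text-to-codes LM (ttc_ctx) and the WavTokenizer
+// codes-to-speech vocoder (cts_ctx). Returns false if either context fails to initialize.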
+bool ttstype_load_model(const tts_load_model_inputs inputs)
+{
+    //duplicated from expose.cpp
+    int cl_parseinfo = inputs.clblast_info; //first digit is whether configured, second is platform, third is devices
+    std::string usingclblast = "GGML_OPENCL_CONFIGURED="+std::to_string(cl_parseinfo>0?1:0);
+    putenv((char*)usingclblast.c_str());
+    cl_parseinfo = cl_parseinfo%100; //keep last 2 digits
+    int platform = cl_parseinfo/10;
+    int devices = cl_parseinfo%10;
+    ttsplatformenv = "GGML_OPENCL_PLATFORM="+std::to_string(platform);
+    ttsdeviceenv = "GGML_OPENCL_DEVICE="+std::to_string(devices);
+    putenv((char*)ttsplatformenv.c_str());
+    putenv((char*)ttsdeviceenv.c_str());
+    std::string vulkan_info_raw = inputs.vulkan_info;
+    std::string vulkan_info_str = "";
+    for (size_t i = 0; i < vulkan_info_raw.length(); ++i) {
+        vulkan_info_str += vulkan_info_raw[i];
+        if (i < vulkan_info_raw.length() - 1) {
+            vulkan_info_str += ",";
+        }
+    }
+    if(vulkan_info_str!="")
+    {
+        ttsvulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+vulkan_info_str;
+        putenv((char*)ttsvulkandeviceenv.c_str());
+    }
+
+    llama_backend_init();
+
+    std::string modelfile_ttc = inputs.ttc_model_filename;
+    std::string modelfile_cts = inputs.cts_model_filename;
+    printf("\nLoading TTS Model, OuteTTS: %s \nWavTokenizer: %s \n",modelfile_ttc.c_str(),modelfile_cts.c_str());
+
+    ttsdebugmode = inputs.debugmode;
+
+    // tts init
+    llama_model_params tts_model_params = llama_model_default_params();
+    llama_context_params tts_ctx_params = llama_context_default_params();
+
+    const int nthreads = 4;
+
+    tts_model_params.use_mmap = false;
+    tts_model_params.use_mlock = false;
+    tts_model_params.n_gpu_layers = inputs.gpulayers; //offload if possible
+    tts_model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER;
+    tts_ctx_params.n_ctx = 8192;
+    tts_ctx_params.logits_all = false;
+    tts_ctx_params.offload_kqv = true;
+    tts_ctx_params.n_batch = 8192;
+    tts_ctx_params.n_ubatch = 512;
+    tts_ctx_params.n_threads = nthreads;
+    tts_ctx_params.n_threads_batch = nthreads;
+    tts_ctx_params.flash_attn = false;
+
+    llama_model * ttcmodel = llama_model_load_from_file(modelfile_ttc.c_str(), tts_model_params);
+    ttc_ctx = llama_new_context_with_model(ttcmodel, tts_ctx_params);
+
+    if (ttc_ctx == nullptr) {
+        printf("\nTTS Load Error: Failed to initialize ttc context!\n");
+        return false;
+    }
+
+    llama_model * ctsmodel = llama_model_load_from_file(modelfile_cts.c_str(), tts_model_params);
+
+    tts_ctx_params.embeddings = true; //this requires embeddings instead
+    cts_ctx = llama_new_context_with_model(ctsmodel, tts_ctx_params);
+
+    if (cts_ctx == nullptr) {
+        printf("\nTTS Load Error: Failed to initialize cts context!\n");
+        return false;
+    }
+
+    std::vector<llama_token> tmp = {1, 2, 3, 4};
+    llama_kv_cache_clear(ttc_ctx);
+    auto er = llama_decode(ttc_ctx, llama_batch_get_one(tmp.data(), tmp.size()));
+    if(er!=0)
+    {
+        printf("\nTTS Eval returned nonzero: %d\n",er);
+        return false;
+    }
+
+    printf("\nTTS Load Complete.\n");
+    return true;
+}
+
+tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
+{
+    tts_generation_outputs output;
+
+    if(ttc_ctx==nullptr || cts_ctx==nullptr)
+    {
+        printf("\nWarning: KCPP TTS not initialized!\n");
+        output.data = "";
+        output.status = 0;
+        return output;
+    }
+
+    std::vector<llama_token> codes;
+    std::vector<llama_token> guide_tokens;
+    const llama_model * model_ttc = &(ttc_ctx->model);
+    const llama_model * model_cts = &(cts_ctx->model);
+    const int ttc_n_vocab = llama_n_vocab(model_ttc);
+    std::string prompt = inputs.prompt;
+
+    if(!inputs.quiet)
+    {
+        printf("\nTTS Generating... ");
+    }
+
+    // process prompt and generate voice codes
+
+    std::vector<llama_token> prompt_inp;
+    prompt_init(prompt_inp, model_ttc);
+    prompt_add(prompt_inp, model_ttc, "<|text_start|>", false, true);
+
+    int speaker_seed = inputs.speaker_seed;
+    int audio_seed = inputs.audio_seed;
+    if (speaker_seed <= 0 || speaker_seed==0xFFFFFFFF)
+    {
+        speaker_seed = (((uint32_t)time(NULL)) % 1000000u);
+        if(ttsdebugmode==1)
+        {
+            printf("\nUsing Speaker Seed: %d", speaker_seed);
+        }
+    }
+    if (audio_seed <= 0 || audio_seed==0xFFFFFFFF)
+    {
+        audio_seed = (((uint32_t)time(NULL)) % 1000000u);
+        if(ttsdebugmode==1)
+        {
+            printf("\nUsing Audio Seed: %d", audio_seed);
+        }
+    }
+
+    std::mt19937 tts_rng(audio_seed);
+    std::mt19937 speaker_rng(speaker_seed);
+
+    //add the speaker based on the seed
+    if(speaker_seed>0)
+    {
+        std::string sampletext = "but<|text_sep|>that<|text_sep|>is<|text_sep|>what<|text_sep|>it<|text_sep|>is<|text_sep|>";
+    }
+
+    // convert the input text into the necessary format expected by OuteTTS
+    std::string prompt_clean = process_text(prompt);
+
+    if(prompt_clean.size()==0)
+    {
+        //no input
+        if(!inputs.quiet)
+        {
+            printf("\nTTS sent empty input.\n");
+            output.data = "";
+            output.status = 1;
+            return output;
+        }
+    }
+
+    if(!inputs.quiet && ttsdebugmode==1)
+    {
+        printf("\nInput: %s\n", prompt_clean.c_str());
+    }
+
+    guide_tokens = prepare_guide_tokens(model_ttc,prompt_clean);
+    prompt_add(prompt_inp, model_ttc, prompt_clean, false, true);
+
+    if(!inputs.quiet)
+    {
+        printf(" (%d input words)...", guide_tokens.size());
+    }
+
+    prompt_add(prompt_inp, model_ttc, "<|text_end|>\n", false, true);
+
+    //create batch with tokens for decoding prompt processing
+    llama_kv_cache_clear(ttc_ctx);
+    llama_kv_cache_clear(cts_ctx);
+    kcpp_embd_batch tts_batch = kcpp_embd_batch(prompt_inp, 0, false, true);
+
+    auto evalok = (llama_decode(ttc_ctx, tts_batch.batch)==0);
+    if (!evalok) {
+        printf("\nError: TTS prompt batch processing failed\n");
+        output.data = "";
+        output.status = 0;
+        return output;
+    }
+
+    // main loop
+    int n_decode = 0;
+    int n_predict = 4096; //max 4096 tokens
+
+    bool next_token_uses_guide_token = true;
+
+    while (n_decode <= n_predict)
+    {
+        float * logits = llama_get_logits(ttc_ctx);
+
+        llama_token new_token_id = kcpp_quick_sample(logits,ttc_n_vocab,20,1.0,tts_rng);
+
+        //guide tokens help prevent hallucinations by forcing the TTS to use the correct word
+        if(!guide_tokens.empty() && next_token_uses_guide_token && !llama_token_is_control(model_ttc, new_token_id) && !llama_token_is_eog(model_ttc, new_token_id))
+        {
+            llama_token guide_token = guide_tokens[0];
+            guide_tokens.erase(guide_tokens.begin());
+            new_token_id = guide_token; //ensure correct word fragment is used
+        }
+
+        //this is the token id that always precedes a new word
+        next_token_uses_guide_token = (new_token_id == 198);
+
+        codes.push_back(new_token_id);
+
+        // is it an end of generation? -> mark the stream as finished
+        if (llama_token_is_eog(model_ttc, new_token_id) || n_decode >= n_predict) {
+            break;
+        }
+
+        n_decode += 1;
+        std::vector<llama_token> next = {new_token_id};
+        llama_batch batch = llama_batch_get_one(next.data(), next.size());
+
+        // evaluate the current batch with the transformer model
+        if (llama_decode(ttc_ctx, batch)) {
+            printf("\nError: TTS code generation failed!\n");
+            output.data = "";
+            output.status = 0;
+            return output;
+        }
+    }
+
+    if(!inputs.quiet && ttsdebugmode==1)
+    {
+        const std::string inp_txt = common_detokenize(ttc_ctx, codes, true);
+
+        printf("\nGenerated %d Codes: '%s'\n",codes.size(), inp_txt.c_str());
+    }
+
+    // remove all non-audio tokens (i.e. < 151672 || > 155772)
+    codes.erase(std::remove_if(codes.begin(), codes.end(), [](llama_token t) { return t < 151672 || t > 155772; }), codes.end());
+
+    for (auto & token : codes) {
+        token -= 151672;
+    }
+
+    const int n_codes = codes.size();
+    if(n_codes<=1)
+    {
+        printf("\nWarning: TTS vocoder generated nothing!\n");
+        output.data = "";
+        output.status = 0;
+        return output;
+    }
+    kcpp_embd_batch codebatch = kcpp_embd_batch(codes,0,false,true);
+
+    if (llama_decode(cts_ctx, codebatch.batch) != 0) {
+        printf("\nError: TTS vocoder generation failed!\n");
+        output.data = "";
+        output.status = 0;
+        return output;
+    }
+    else
+    {
+        // spectral operations
+        const int n_embd = llama_n_embd(model_cts);
+        const float * embd = llama_get_embeddings(cts_ctx);
+        std::vector<float> audio = embd_to_audio(embd, n_codes, n_embd, 4);
+
+        const int n_sr = 24000; // sampling rate
+
+        // zero out first 0.05 seconds
+        for (int i = 0; i < 24000/20; ++i) {
+            audio[i] = 0.0f;
+        }
+        //add some silence at the end
+        for (int i = 0; i < 24000/20; ++i) {
+            audio.push_back(0.0f);
+        }
+
+        last_generated_audio = save_wav16_base64(audio, n_sr);
+
+        if(!inputs.quiet)
+        {
+            printf("\nTTS Generated %d audio tokens.\n",(int) codes.size());
+        }
+
+        output.data = last_generated_audio.c_str();
+        output.status = 1;
+        return output;
+    }
+}
diff --git a/otherarch/utils.cpp b/otherarch/utils.cpp
index cfeec8470..f848eeabd 100644
--- a/otherarch/utils.cpp
+++ b/otherarch/utils.cpp
@@ -1,5 +1,6 @@
 #include "utils.h"
 #include "common.h"
+#include "llama.h"
 #include
 #include
@@ -303,6 +304,47 @@ std::vector<uint8_t> kcpp_base64_decode(const std::string & encoded_string)
     return ret;
 }
 
+std::string kcpp_base64_encode(const unsigned char* data, unsigned int data_length) {
+    const std::string base64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+    std::string encoded;
+    encoded.reserve(((data_length + 2) / 3) * 4);
+    for (unsigned int i = 0; i < data_length; i += 3) {
+        unsigned int triple = (data[i] << 16) + (i + 1 < data_length ? data[i + 1] << 8 : 0) + (i + 2 < data_length ? data[i + 2] : 0);
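+        // each group of 3 input bytes yields 4 base64 output characters; '=' pads the final group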
+        encoded.push_back(base64_chars[(triple >> 18) & 0x3F]);
+        encoded.push_back(base64_chars[(triple >> 12) & 0x3F]);
+        if (i + 1 < data_length) {
+            encoded.push_back(base64_chars[(triple >> 6) & 0x3F]);
+        } else {
+            encoded.push_back('=');
+        }
+        if (i + 2 < data_length) {
+            encoded.push_back(base64_chars[triple & 0x3F]);
+        } else {
+            encoded.push_back('=');
+        }
+    }
+    return encoded;
+}
+std::string kcpp_base64_encode(const std::string &data) {
+    static const char lookup[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+    std::string encoded;
+    int val = 0, valb = -6;
+    for (unsigned char c : data) {
+        val = (val << 8) + c;
+        valb += 8;
+        while (valb >= 0) {
+            encoded.push_back(lookup[(val >> valb) & 0x3F]);
+            valb -= 6;
+        }
+    }
+    if (valb > -6) {
+        encoded.push_back(lookup[((val << 8) >> (valb + 8)) & 0x3F]);
+    }
+    while (encoded.size() % 4) {
+        encoded.push_back('=');
+    }
+    return encoded;
+}
 
 std::string get_timestamp_str()
 {
@@ -314,3 +356,150 @@ std::string get_timestamp_str()
     std::string timestamp(buffer);
     return timestamp;
 }
+
+//a very rudimentary all in one sampling function which has no dependencies
+int32_t kcpp_quick_sample(float * logits, const int n_logits, int top_k, float temp, std::mt19937 & rng)
+{
+    if (temp <= 0 || top_k==1) {
+        // select the token with the highest logit directly
+        float max_logit = logits[0];
+        int32_t max_id = 0;
+        for (int i = 1; i < n_logits; ++i) {
+            if (logits[i] > max_logit) {
+                max_logit = logits[i];
+                max_id = i;
+            }
+        }
+        return max_id;
+    }
+
+    top_k = (top_k<=0 || top_k>300)?300:top_k;
+    top_k = std::min(top_k, n_logits);
+
+    std::vector<std::pair<float, int>> logits_id;
+    logits_id.reserve(n_logits);
+
+    //temperature sample
+    const float scale = 1.0f/temp;
+    for (int i = 0; i < n_logits; ++i) {
+        logits_id.push_back(std::make_pair(logits[i]*scale, i));
+    }
+
+    //sample top_k
+    std::partial_sort(
+            logits_id.begin(),
+            logits_id.begin() + top_k, logits_id.end(),
+            [](const std::pair<float, int> & a, const std::pair<float, int> & b) {
+        return a.first > b.first;
+    });
+    logits_id.resize(top_k);
+
+    // compute probs for the top k tokens
+    std::vector<float> probs;
+    probs.reserve(logits_id.size());
+    float maxl = logits_id[0].first;
+    double sum = 0.0;
+    for (const auto & kv : logits_id) {
+        const float p = expf(kv.first - maxl);
+        probs.push_back(p);
+        sum += p;
+    }
+
+    // normalize the probs
+    for (auto & p : probs) {
+        p /= sum;
+    }
+
+    std::discrete_distribution<> dist(probs.begin(), probs.end());
+    int idx = dist(rng);
+
+    return logits_id[idx].second;
+}
+
+kcpp_embd_batch::kcpp_embd_batch(float * embd, int32_t n_tokens, int32_t npast, bool use_mrope)
+{
+    int32_t seq_id = 0;
+    pos.resize(n_tokens * (use_mrope?4:1));
+    std::fill(pos.begin(), pos.end(), 0);
+    n_seq_id.resize(n_tokens);
+    seq_ids.resize(n_tokens + 1);
+    logits.resize(n_tokens);
+    seq_id_0.resize(1);
+    seq_id_0[0] = seq_id;
+    seq_ids [n_tokens] = nullptr;
+    batch = {
+        /*n_tokens =*/ n_tokens,
+        /*tokens   =*/ nullptr,
+        /*embd     =*/ embd,
+        /*pos      =*/ pos.data(),
+        /*n_seq_id =*/ n_seq_id.data(),
+        /*seq_id   =*/ seq_ids.data(),
+        /*logits   =*/ logits.data(),
+    };
+
+    if(!use_mrope)
+    {
+        for (int i = 0; i < n_tokens; i++) {
+            batch.pos     [i] = npast + i;
+            batch.n_seq_id[i] = 1;
+            batch.seq_id  [i] = seq_id_0.data();
+            batch.logits  [i] = false;
+        }
+    }
+    else
+    {
+        for (int i = 0; i < n_tokens; i++) {
+            batch.n_seq_id[i] = 1;
+            batch.seq_id  [i] = seq_id_0.data();
+            batch.logits  [i] = false;
+        }
+        for (int j = 0; j < batch.n_tokens * 3; j++) {
+            batch.pos[j] = npast + (j % batch.n_tokens);
+        }
+    }
+}
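+
+// token-id variant of the batch helper; return_all_logits requests an output for every
+// position (the TTS vocoder reads embeddings for all codes from a single decode call)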
+kcpp_embd_batch::kcpp_embd_batch(std::vector<llama_token> & tokens, int32_t npast, bool use_mrope, bool return_all_logits)
+{
+    int32_t seq_id = 0;
+    int32_t n_tokens = tokens.size();
+    pos.resize(n_tokens * (use_mrope?4:1));
+    std::fill(pos.begin(), pos.end(), 0);
+    n_seq_id.resize(n_tokens);
+    seq_ids.resize(n_tokens + 1);
+    logits.resize(n_tokens);
+    seq_id_0.resize(1);
+    seq_id_0[0] = seq_id;
+    seq_ids[n_tokens] = nullptr;
+    batch = {
+        /*n_tokens =*/ n_tokens,
+        /*tokens   =*/ tokens.data(),
+        /*embd     =*/ nullptr,
+        /*pos      =*/ pos.data(),
+        /*n_seq_id =*/ n_seq_id.data(),
+        /*seq_id   =*/ seq_ids.data(),
+        /*logits   =*/ logits.data(),
+    };
+
+    if(!use_mrope)
+    {
+        for (int i = 0; i < n_tokens; i++) {
+            batch.pos     [i] = npast + i;
+            batch.n_seq_id[i] = 1;
+            batch.seq_id  [i] = seq_id_0.data();
+            batch.logits  [i] = (return_all_logits?true:false);
+        }
+    }
+    else
+    {
+        for (int i = 0; i < n_tokens; i++) {
+            batch.n_seq_id[i] = 1;
+            batch.seq_id  [i] = seq_id_0.data();
+            batch.logits  [i] = (return_all_logits?true:false);
+        }
+        for (int j = 0; j < batch.n_tokens * 3; j++) {
+            batch.pos[j] = npast + (j % batch.n_tokens);
+        }
+    }
+    batch.logits[n_tokens - 1] = true;
+}
\ No newline at end of file
diff --git a/otherarch/utils.h b/otherarch/utils.h
index e0a60c95b..29977c40c 100644
--- a/otherarch/utils.h
+++ b/otherarch/utils.h
@@ -8,6 +8,7 @@
 #include
 #include
 #include "ggml_v3.h"
+#include "llama.h"
 //
 // CLI argument parsing
 //
@@ -52,10 +53,23 @@ void gpt_split_words(std::string str, std::vector<std::string>& words);
 //
 std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
-
 bool should_transpose_layer(std::string name);
 void kcpp_graph_compute_helper(ggml_v3_cgraph * graph, int n_threads);
 std::vector<uint8_t> kcpp_base64_decode(const std::string & encoded_string);
+std::string kcpp_base64_encode(const unsigned char* data, unsigned int data_length);
+std::string kcpp_base64_encode(const std::string &data);
 
-std::string get_timestamp_str();
\ No newline at end of file
+std::string get_timestamp_str();
+int32_t kcpp_quick_sample(float * logits, const int n_logits, int top_k, float temp, std::mt19937 & rng);
+
+struct kcpp_embd_batch { //duplicated from llava_embd_batch
+    std::vector<llama_pos> pos;
+    std::vector<int32_t> n_seq_id;
+    std::vector<llama_seq_id> seq_id_0;
+    std::vector<llama_seq_id *> seq_ids;
+    std::vector<int8_t> logits;
+    llama_batch batch;
+    kcpp_embd_batch(float * embd, int32_t n_tokens, int32_t npast, bool use_mrope);
+    kcpp_embd_batch(std::vector<llama_token> & tokens, int32_t npast, bool use_mrope, bool return_all_logits);
+};
\ No newline at end of file